//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
# include "FxCheckAdult.h"
2017-10-26 06:20:04 -04:00
# include "Log.h"
# include "Conf.h"
# include "Mem.h"
# include "termid_mask.h"
# include "Phrases.h"
# include "XmlDoc.h"
2017-11-08 08:14:24 -05:00
// Global adult term check list. CheckAdult::isDocAdult() scores the document body,
// meta tag and URL tokens against it through its getScore() method.
TermCheckList g_checkAdultList ;
2017-10-26 06:20:04 -04:00
2017-11-08 08:14:24 -05:00
// Maps the "(T *)-1" sentinel pointer value to NULL and passes every other
// pointer through unchanged.
template <typename T>
static inline T *nullIfSentinel(T *ptr) {
	return (ptr == (T *)-1) ? NULL : ptr;
}

/**
 * Collect the pieces of the document that the adult check works on.
 *
 * @param xd    Document to inspect. Must not be NULL; a NULL pointer is logged
 *              and gbshutdownLogicError() is called.
 * @param debug When true, a 2000 byte debug buffer is allocated and handed to
 *              the term scoring calls. If the allocation fails the buffer
 *              stays NULL with size 0.
 */
CheckAdult::CheckAdult(XmlDoc *xd, bool debug)
	: m_debbuf(NULL), m_debbufUsed(0), m_debbufSize(0), m_docMatchScore(-1),
	  m_numUniqueMatchedWords(0), m_numUniqueMatchedPhrases(0), m_numWordsChecked(0),
	  m_emptyDocumentBody(false), m_resultValid(false), m_result(false) {

	if (xd == NULL) {
		log(LOG_ERROR, "CheckAdult::CheckAdult passed NULL-pointer");
		gbshutdownLogicError();
	}

	// Each getter may hand back (T *)-1 instead of a usable object; store NULL in that case.
	m_url = nullIfSentinel(xd->getFirstUrl());
	m_xml = nullIfSentinel(xd->getXml());
	m_tokenizerResult = nullIfSentinel(xd->getTokenizerResult());
	m_phrases = nullIfSentinel(xd->getPhrases());

	if (!debug) {
		return;
	}

	m_debbufSize = 2000;
	m_debbuf = (char *)mmalloc(m_debbufSize, "CheckAdult");
	if (m_debbuf == NULL) {
		// No buffer, so advertise no capacity either.
		m_debbufSize = 0;
		return;
	}

	// zero-terminate now as we may not need it, but may try logging it later
	m_debbuf[0] = '\0';
}
2017-11-08 08:14:24 -05:00
/**
 * Release the debug buffer, if one was allocated by the constructor.
 */
CheckAdult::~CheckAdult() {
	if (!m_debbuf) {
		// Debugging was off (or the allocation failed): nothing to free.
		return;
	}
	mfree(m_debbuf, m_debbufSize, "CheckAdult");
}
2017-11-08 08:14:24 -05:00
int32_t CheckAdult : : getScore ( ) {
return m_docMatchScore ;
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
int32_t CheckAdult : : getNumUniqueMatchedWords ( ) {
return m_numUniqueMatchedWords ;
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
int32_t CheckAdult : : getNumUniqueMatchedPhrases ( ) {
return m_numUniqueMatchedPhrases ;
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
int32_t CheckAdult : : getNumWordsChecked ( ) {
2017-10-26 09:13:50 -04:00
return m_numWordsChecked ;
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
bool CheckAdult : : hasEmptyDocumentBody ( ) {
2017-10-26 09:13:50 -04:00
return m_emptyDocumentBody ;
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
const char * CheckAdult : : getReason ( ) {
2017-10-26 06:20:04 -04:00
return m_reason . c_str ( ) ;
}
2017-11-08 08:14:24 -05:00
const char * CheckAdult : : getDebugInfo ( ) {
2017-10-26 06:20:04 -04:00
if ( m_debbuf ) {
return m_debbuf ;
}
return " " ;
}
2017-11-08 08:14:24 -05:00
bool CheckAdult : : hasAdultRatingTag ( ) {
2017-10-26 06:20:04 -04:00
if ( ! m_xml ) {
return false ;
}
int32_t mtlen ;
// https://webmasters.googleblog.com/2012/04/1000-words-about-images.html
// http://www.safelabeling.org/how.htm
// http://www.rtalabel.org/index.php?content=howto
char * mtag = m_xml - > getMetaContentPointer ( " rating " , 6 , " name " , & mtlen ) ;
if ( ! mtag | | mtlen < = 0 ) {
// http://www.billdietrich.me/Computers.html#ContentRating
// https://en.wikipedia.org/wiki/User:ArneBab/Voluntary_Content_Rating
mtag = m_xml - > getMetaContentPointer ( " voluntary content rating " , 24 , " name " , & mtlen ) ;
}
switch ( mtlen ) {
case 5 :
if ( strncasecmp ( mtag , " adult " , mtlen ) = = 0 ) {
return true ;
}
break ;
case 6 :
if ( strncasecmp ( mtag , " mature " , mtlen ) = = 0 ) {
return true ;
}
break ;
case 7 :
// non-standard, seen in the wild
if ( strncasecmp ( mtag , " adulto " , mtlen ) = = 0 ) {
return true ;
}
break ;
case 10 :
// non-standard, seen in the wild
if ( strncasecmp ( mtag , " restricted " , mtlen ) = = 0 ) {
return true ;
}
break ;
case 27 :
if ( strncasecmp ( mtag , " RTA-5042-1996-1400-1577-RTA " , mtlen ) = = 0 ) {
return true ;
}
break ;
}
if ( mtlen > 0 ) {
// non-standard, seen in the wild
if ( strncasestr ( mtag , " porn " , mtlen ) | |
strncasestr ( mtag , " porno " , mtlen ) | |
strncasestr ( mtag , " adult " , mtlen ) | |
strncasestr ( mtag , " fuck " , mtlen ) | |
strncasestr ( mtag , " sex " , mtlen ) | |
strncasestr ( mtag , " xxx " , mtlen ) ) {
return true ;
}
}
// YouTube
mtag = m_xml - > getMetaContentPointer ( " isFamilyFriendly " , 16 , " itemprop " , & mtlen ) ;
switch ( mtlen ) {
case 5 :
if ( strncasecmp ( mtag , " false " , mtlen ) = = 0 ) {
return true ;
}
break ;
}
return false ;
}
2017-11-08 08:14:24 -05:00
bool CheckAdult : : hasAdultAds ( ) {
2017-10-26 06:20:04 -04:00
if ( ! m_xml ) {
return false ;
}
int32_t mtlen ;
//
// Adult ad networks verification tags
//
char * mtag = m_xml - > getMetaContentPointer ( " ero_verify " , 10 , " name " , & mtlen ) ;
if ( mtag & & mtlen > 0 ) {
return true ;
}
mtag = m_xml - > getMetaContentPointer ( " juicyads-site-verification " , 26 , " name " , & mtlen ) ;
if ( mtag & & mtlen > 0 ) {
return true ;
}
mtag = m_xml - > getMetaContentPointer ( " trafficjunky-site-verification " , 30 , " name " , & mtlen ) ;
if ( mtag & & mtlen > 0 ) {
return true ;
}
mtag = m_xml - > getMetaContentPointer ( " adamo-site-verification " , 23 , " name " , & mtlen ) ;
if ( mtag & & mtlen > 0 ) {
return true ;
}
return false ;
}
2017-11-08 08:14:24 -05:00
bool CheckAdult : : isDocAdult ( ) {
2017-10-26 06:20:04 -04:00
// Hash table used to hold unique termIds to make sure we only count each unique word once
HashTableX uniqueTermIds ;
if ( m_resultValid ) {
return m_result ;
}
2017-11-08 08:14:24 -05:00
m_docMatchScore = 0 ;
2017-10-26 06:20:04 -04:00
//
// Check for adult TLDs
//
if ( m_url & & m_url - > isAdult ( ) ) {
m_reason = " adultTLD " ;
2017-11-08 08:14:24 -05:00
m_docMatchScore + = 1000 ;
logTrace ( g_conf . m_logTraceTermCheckList , " Adult TLD found in %s " , m_url - > getUrl ( ) ) ;
2017-10-26 06:20:04 -04:00
}
//
// Check for adult content meta tags
//
2017-11-08 08:14:24 -05:00
if ( ! m_docMatchScore ) {
2017-10-26 06:20:04 -04:00
if ( hasAdultRatingTag ( ) ) {
m_reason = " adultRatingTag " ;
2017-11-08 08:14:24 -05:00
m_docMatchScore + = 1000 ;
logTrace ( g_conf . m_logTraceTermCheckList , " Rating tag found in %s " , m_url - > getUrl ( ) ) ;
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
if ( ! m_docMatchScore & &
2017-10-26 06:20:04 -04:00
hasAdultAds ( ) ) {
m_reason = " adultAds " ;
2017-11-08 08:14:24 -05:00
m_docMatchScore + = 1000 ;
logTrace ( g_conf . m_logTraceTermCheckList , " Adult ads found in %s " , m_url - > getUrl ( ) ) ;
2017-10-26 06:20:04 -04:00
}
}
//
// If not blocked by the cheaper checks, do the hard work and check document content
//
2017-11-08 08:14:24 -05:00
if ( ! m_docMatchScore ) {
2017-10-26 06:20:04 -04:00
//
// Score words and phrases from the document body text
//
2018-03-09 10:24:39 -05:00
if ( m_tokenizerResult ) {
if ( ! uniqueTermIds . set ( sizeof ( int64_t ) , 0 , m_tokenizerResult - > size ( ) + 5000 , NULL , 0 , false , " uniquetermids " , false , 0 ) ) {
2017-10-26 09:13:50 -04:00
log ( LOG_ERROR , " isDocAdult: Could not initialize uniqueTermIds hash table " ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-09 10:24:39 -05:00
if ( m_tokenizerResult - > empty ( ) ) {
2017-10-26 06:20:04 -04:00
// No words in document body
m_emptyDocumentBody = true ;
}
else {
2018-03-09 10:24:39 -05:00
g_checkAdultList . getScore ( * m_tokenizerResult , m_phrases , & uniqueTermIds , & m_docMatchScore , & m_numUniqueMatchedWords , & m_numUniqueMatchedPhrases , m_debbuf , m_debbufUsed , m_debbufSize ) ;
m_numWordsChecked + = m_tokenizerResult - > size ( ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-09 10:24:39 -05:00
logTrace ( g_conf . m_logTraceTermCheckList , " %zu words checked (% " PRId32 " unique) in body: %s. % " PRId32 " unique matched words, % " PRId32 " unique matched phrases. Score: % " PRId32 " " ,
m_tokenizerResult - > size ( ) , uniqueTermIds . getNumUsedSlots ( ) , m_url - > getUrl ( ) , m_numUniqueMatchedWords , m_numUniqueMatchedPhrases , m_docMatchScore ) ;
2017-10-26 06:20:04 -04:00
}
else {
// No words in document body
m_emptyDocumentBody = true ;
2017-11-08 08:14:24 -05:00
logTrace ( g_conf . m_logTraceTermCheckList , " Document body is empty in %s " , m_url - > getUrl ( ) ) ;
2017-10-26 06:20:04 -04:00
}
//
// Score words and phrases from the document meta tags
//
if ( m_xml ) {
2018-03-09 10:24:39 -05:00
TokenizerResult metatr ;
2017-10-26 06:20:04 -04:00
Bits metab ;
Phrases metap ;
int32_t mtlen ;
2018-03-09 10:24:39 -05:00
const char * mtag = m_xml - > getMetaContentPointer ( " keywords " , 8 , " name " , & mtlen ) ;
2017-10-26 06:20:04 -04:00
if ( mtlen > 0 ) {
2018-03-09 10:24:39 -05:00
plain_tokenizer_phase_1 ( mtag , mtlen , & metatr ) ;
2017-10-26 06:20:04 -04:00
}
mtag = m_xml - > getMetaContentPointer ( " description " , 11 , " name " , & mtlen ) ;
if ( mtlen > 0 ) {
2018-03-09 10:24:39 -05:00
plain_tokenizer_phase_1 ( mtag , mtlen , & metatr ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-09 10:24:39 -05:00
if ( ! metatr . empty ( ) ) {
plain_tokenizer_phase_2 ( langUnknown , nullptr , & metatr ) ;
if ( ! metab . set ( & metatr ) ) {
2017-10-26 09:13:50 -04:00
log ( LOG_ERROR , " isDocAdult: Could not set bits for meta words " ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-12 11:18:17 -04:00
if ( ! metap . set ( metatr , metab ) ) {
2017-10-26 09:13:50 -04:00
log ( LOG_ERROR , " isDocAdult: Could not set phrases for meta words " ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-09 10:24:39 -05:00
g_checkAdultList . getScore ( metatr , & metap , & uniqueTermIds , & m_docMatchScore , & m_numUniqueMatchedWords , & m_numUniqueMatchedPhrases , m_debbuf , m_debbufUsed , m_debbufSize ) ;
m_numWordsChecked + = metatr . size ( ) ;
2017-10-26 09:13:50 -04:00
2018-03-09 10:24:39 -05:00
logTrace ( g_conf . m_logTraceTermCheckList , " %zu words checked (% " PRId32 " unique) in meta tags: %s. % " PRId32 " unique matched words, % " PRId32 " unique matched phrases. Score: % " PRId32 " " ,
metatr . size ( ) , uniqueTermIds . getNumUsedSlots ( ) , m_url - > getUrl ( ) , m_numUniqueMatchedWords , m_numUniqueMatchedPhrases , m_docMatchScore ) ;
2017-10-26 06:20:04 -04:00
}
}
//
// Score words and phrases from URL
//
if ( m_url ) {
2018-03-09 10:24:39 -05:00
TokenizerResult urltr ;
2017-10-26 06:20:04 -04:00
Bits urlb ;
Phrases urlp ;
2018-03-09 10:24:39 -05:00
plain_tokenizer_phase_1 ( m_url - > getUrl ( ) , m_url - > getUrlLen ( ) , & urltr ) ;
if ( ! urlb . set ( & urltr ) ) {
2017-10-26 09:13:50 -04:00
log ( LOG_ERROR , " isDocAdult: Could not set bits for URL words " ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-12 11:18:17 -04:00
if ( ! urlp . set ( urltr , urlb ) ) {
2017-10-26 09:13:50 -04:00
log ( LOG_ERROR , " isDocAdult: Could not set phrases for URL words " ) ;
2017-10-26 06:20:04 -04:00
}
2018-03-09 10:24:39 -05:00
g_checkAdultList . getScore ( urltr , & urlp , & uniqueTermIds , & m_docMatchScore , & m_numUniqueMatchedWords , & m_numUniqueMatchedPhrases , m_debbuf , m_debbufUsed , m_debbufSize ) ;
m_numWordsChecked + = urltr . size ( ) ;
2017-10-26 06:20:04 -04:00
2018-03-09 10:24:39 -05:00
logTrace ( g_conf . m_logTraceTermCheckList , " %zu words checked (% " PRId32 " unique) in URL: %s. % " PRId32 " unique matched words, % " PRId32 " unique matched phrases. Score: % " PRId32 " " ,
urltr . size ( ) , uniqueTermIds . getNumUsedSlots ( ) , m_url - > getUrl ( ) , m_numUniqueMatchedWords , m_numUniqueMatchedPhrases , m_docMatchScore ) ;
2017-10-26 06:20:04 -04:00
}
//
// Additional check for adult content compliance statement
//
// "18 U.S.C. 2257 Record-Keeping Requirements Compliance Statement"
// "18 USC. 2257 Record-Keeping Requirements Compliance Statement"
int64_t hs18 = hash64Lower_utf8_nospaces ( " 18 " , 2 ) ;
int64_t hsu = hash64Lower_utf8_nospaces ( " u " , 1 ) ;
int64_t hss = hash64Lower_utf8_nospaces ( " s " , 1 ) ;
int64_t hsc = hash64Lower_utf8_nospaces ( " c " , 1 ) ;
int64_t hsusc = hash64Lower_utf8_nospaces ( " usc " , 3 ) ;
int64_t hs2257 = hash64Lower_utf8_nospaces ( " 2257 " , 4 ) ;
int64_t hsrecord = hash64Lower_utf8_nospaces ( " record " , 6 ) ;
int64_t hskeeping = hash64Lower_utf8_nospaces ( " keeping " , 7 ) ;
int64_t hsrequirements = hash64Lower_utf8_nospaces ( " requirements " , 12 ) ;
int64_t hscompliance = hash64Lower_utf8_nospaces ( " compliance " , 10 ) ;
if ( uniqueTermIds . getSlot ( & hs18 ) > = 0 & &
uniqueTermIds . getSlot ( & hs2257 ) > = 0 & &
uniqueTermIds . getSlot ( & hsrecord ) > = 0 & &
uniqueTermIds . getSlot ( & hskeeping ) > = 0 & &
uniqueTermIds . getSlot ( & hsrequirements ) > = 0 & &
uniqueTermIds . getSlot ( & hscompliance ) > = 0 & &
( ( uniqueTermIds . getSlot ( & hsu ) > = 0 & &
uniqueTermIds . getSlot ( & hss ) > = 0 & &
uniqueTermIds . getSlot ( & hsc ) > = 0 ) | |
uniqueTermIds . getSlot ( & hsusc ) > = 0
) ) {
2017-10-26 09:13:50 -04:00
//m_reason = "USC2257Disclaimer";
// Give it a score of 10 and count it as a phrase
2017-11-08 08:14:24 -05:00
m_docMatchScore + = 10 ;
m_numUniqueMatchedPhrases + + ;
logTrace ( g_conf . m_logTraceTermCheckList , " USC 2257 compliance statement found in %s: score=% " PRId32 " " , m_url - > getUrl ( ) , m_docMatchScore ) ;
2017-10-26 06:20:04 -04:00
}
2017-10-26 07:21:27 -04:00
//TODO:
//18 U.S.C. 2257
//Title 18 U.S.C. 2257 Compliance Statement
//Compliance with 18 U.S.C. § 2257
2018-02-28 05:45:05 -05:00
// <meta http-equiv="PICS-Label" content='(pics-1.1 "http://www.icra.org/ratingsv02.html" comment "ICRAonline EN v2.0" l gen true for "" r (nb 1 nc 1 nd 1 ne 1 nh 1 ni 1 vz 1 la 1 oz 1 cz 1) "http://www.rsac.org/ratingsv01.html" l gen true for "" r (n 3 s 3 v 0 l 4))' />
2017-10-26 07:21:27 -04:00
//Beskyt dine børn mod erotiske sites med
//Protect your children against Adult Content with
//Eltern können ihre Kinder vor ungeeigneten Inhalten schützen mit
//Protégez vos enfants contre le Contenu pour adultes au moyen de
//Skydda dina barn mot innehåll som endast är avsett för vuxna med hjälp av
//Beskytt barna dine mot voksent innhold med
//Proteggete i vostri figli dal contenuto erotico di questo sito con
//Los padres, protegen a sus menores del Contenido Adulto con
//Os Pais devem usar um dos seguintes programas para salvaguardar os filhos do conteúdo erótico
//Bescherm minderjarigen tegen expliciete beelden op internet met software als Netnanny, Cyberpatrol of Cybersitter.
2017-10-26 09:13:50 -04:00
2017-11-08 08:14:24 -05:00
if ( m_docMatchScore > 0 ) {
2017-10-26 09:13:50 -04:00
m_reason = " adultTerms " ;
}
2017-10-26 06:20:04 -04:00
}
2017-11-08 08:14:24 -05:00
logTrace ( g_conf . m_logTraceTermCheckList , " Final score % " PRId32 " for: %s. % " PRId32 " unique matched words, % " PRId32 " unique matched phrases " ,
m_docMatchScore , m_url - > getUrl ( ) , m_numUniqueMatchedWords , m_numUniqueMatchedPhrases ) ;
2017-10-26 06:20:04 -04:00
2017-10-26 09:13:50 -04:00
m_result = false ;
2017-11-08 08:14:24 -05:00
if ( ( m_docMatchScore > = 30 | | m_numUniqueMatchedWords > 7 ) | |
( m_docMatchScore > = 30 | | m_numUniqueMatchedPhrases > = 3 ) ) {
2017-10-26 09:13:50 -04:00
m_result = true ;
2017-10-26 06:20:04 -04:00
}
m_resultValid = true ;
return m_result ;
}
// Check for adult TLDs
// https://tld-list.com/tld-categories/adult
//
// tld      Start of the TLD text, without a leading dot. It need not be
//          NUL-terminated at tld_len.
// tld_len  Number of characters in the TLD.
//
// Returns true if the first tld_len characters of tld equal, ignoring case,
// one of the listed adult TLDs of exactly that length.
bool isAdultTLD(const char *tld, size_t tld_len) {
	static const struct {
		const char *name;
		size_t      len;
	} adult_tlds[] = {
		{ "cam",    3 },
		{ "sex",    3 },
		{ "xxx",    3 },
		{ "porn",   4 },
		{ "sexy",   4 },
		{ "tube",   4 },
		{ "adult",  5 },
		{ "webcam", 6 },
	};
	const size_t num_tlds = sizeof(adult_tlds) / sizeof(adult_tlds[0]);

	for (size_t i = 0; i < num_tlds; i++) {
		// Only entries of the same length can match; this also keeps the
		// comparison from reading tld when no entry has that length.
		if (adult_tlds[i].len != tld_len) {
			continue;
		}
		if (strncasecmp(tld, adult_tlds[i].name, tld_len) == 0) {
			return true;
		}
	}
	return false;
}