Merge branch 'master' into sqlite

This commit is contained in:
Ivan Skytte Jørgensen
2017-10-26 15:40:58 +02:00
7 changed files with 82 additions and 97 deletions

@ -269,6 +269,7 @@ Conf::Conf ( ) {
m_logTraceUrlResultOverride = false;
m_logTraceWordSpam=false;
m_logTraceUrlClassification = false;
m_logTraceAdultCheck = false;
m_logTimingAddurl = false;
m_logTimingAdmin = false;
m_logTimingBuild = false;

1
Conf.h

@ -421,6 +421,7 @@ class Conf {
bool m_logTraceWordSpam;
bool m_logTraceUrlClassification;
bool m_logTraceTopTree;
bool m_logTraceAdultCheck;
// expensive timing messages
bool m_logTimingAddurl;

@ -24,55 +24,37 @@
#include "Phrases.h"
#include "Words.h"
#include "XmlDoc.h"
#include <stddef.h>
#include <fstream>
#include <sys/stat.h>
AdultCheck::AdultCheck(XmlDoc *xd, bool debug) {
m_docAdultScore = 0;
m_numUniqueDirtyWords = 0;
m_numUniqueDirtyPhrases = 0;
m_numWordsChecked = 0;
m_emptyDocumentBody = false;
m_result = false;
m_resultValid = false;
AdultCheck::AdultCheck(XmlDoc *xd, bool debug) :
m_debbuf(NULL), m_debbufUsed(0), m_debbufSize(0), m_docAdultScore(-1),
m_numUniqueDirtyWords(0), m_numUniqueDirtyPhrases(0), m_numWordsChecked(0),
m_emptyDocumentBody(false), m_resultValid(false), m_result(false) {
if( xd ) {
m_url = xd->getFirstUrl();
if( m_url == (Url *)-1 ) {
log(LOG_WARN, "XmlDoc::getFirstUrl() failed in AdultCheck::AdultCheck");
m_url = NULL;
}
m_xml = xd->getXml();
if( m_xml == (Xml *)-1 ) {
log(LOG_WARN, "XmlDoc::getXml() failed in AdultCheck::AdultCheck");
m_xml = NULL;
}
m_words = xd->getWords();
if( m_words == (Words *)-1 ) {
log(LOG_WARN, "XmlDoc::getWords() failed in AdultCheck::AdultCheck");
m_words = NULL;
}
m_phrases = xd->getPhrases();
if( m_phrases == (Phrases *)-1 ) {
log(LOG_WARN, "XmlDoc::getPhrases() failed in AdultCheck::AdultCheck");
m_phrases = NULL;
}
if( !xd ) {
log(LOG_ERROR, "AdultCheck::AdultCheck passed NULL-pointer");
gbshutdownLogicError();
}
else {
m_url = xd->getFirstUrl();
if( m_url == (Url *)-1 ) {
m_url = NULL;
}
m_xml = xd->getXml();
if( m_xml == (Xml *)-1 ) {
m_xml = NULL;
}
m_words = xd->getWords();
if( m_words == (Words *)-1 ) {
m_words = NULL;
}
m_phrases = xd->getPhrases();
if( m_phrases == (Phrases *)-1 ) {
m_phrases = NULL;
}
m_debbufSize = 0;
m_debbufUsed = 0;
m_debbuf = NULL;
if( debug ) {
m_debbufSize = 2000;
@ -92,38 +74,23 @@ AdultCheck::~AdultCheck() {
// Accessor for the document's accumulated adult score (m_docAdultScore).
// NOTE(review): this text appears to be a rendered diff in which the removed
// and the added function body are interleaved. As written, one of the first
// two returns always executes, so the final return statement is unreachable.
int32_t AdultCheck::getScore() {
// A result has been computed: hand back the stored score.
if( m_resultValid ) {
return m_docAdultScore;
}
// No valid result yet.
return -1;
// Unreachable as written (see NOTE above).
return m_docAdultScore;
}
// Accessor for the number of unique dirty words counted (m_numUniqueDirtyWords).
// NOTE(review): this text appears to be a rendered diff in which the removed
// and the added function body are interleaved. As written, one of the first
// two returns always executes, so the final return statement is unreachable.
int32_t AdultCheck::getNumUniqueDirtyWords() {
// A result has been computed: hand back the stored counter.
if( m_resultValid ) {
return m_numUniqueDirtyWords;
}
// No valid result yet.
return -1;
// Unreachable as written (see NOTE above).
return m_numUniqueDirtyWords;
}
// Accessor for the number of unique dirty phrases counted (m_numUniqueDirtyPhrases).
// NOTE(review): this text appears to be a rendered diff in which the removed
// and the added function body are interleaved. As written, one of the first
// two returns always executes, so the final return statement is unreachable.
int32_t AdultCheck::getNumUniqueDirtyPhrases() {
// A result has been computed: hand back the stored counter.
if( m_resultValid ) {
return m_numUniqueDirtyPhrases;
}
// No valid result yet.
return -1;
// Unreachable as written (see NOTE above).
return m_numUniqueDirtyPhrases;
}
// Accessor for the total number of words examined (m_numWordsChecked).
// NOTE(review): this text appears to be a rendered diff in which the removed
// and the added function body are interleaved. As written, one of the first
// two returns always executes, so the final return statement is unreachable.
int32_t AdultCheck::getNumWordsChecked() {
// A result has been computed: hand back the stored counter.
if( m_resultValid ) {
return m_numWordsChecked;
}
// No valid result yet.
return -1;
// Unreachable as written (see NOTE above).
return m_numWordsChecked;
}
// Accessor for the empty-document-body flag (m_emptyDocumentBody).
// NOTE(review): this text appears to be a rendered diff in which the removed
// and the added function body are interleaved. As written, one of the first
// two returns always executes, so the final return statement is unreachable.
bool AdultCheck::hasEmptyDocumentBody() {
// A result has been computed: hand back the stored flag.
if( m_resultValid ) {
return m_emptyDocumentBody;
}
// No valid result yet: report the body as not empty.
return false;
// Unreachable as written (see NOTE above).
return m_emptyDocumentBody;
}
const char *AdultCheck::getReason() {
@ -255,12 +222,14 @@ bool AdultCheck::isDocAdult() {
return m_result;
}
m_docAdultScore = 0;
//
// Check for adult TLDs
//
if( m_url && m_url->isAdult() ) {
m_reason = "adultTLD";
m_docAdultScore += 1000;
logTrace(g_conf.m_logTraceAdultCheck, "Adult TLD found in %s", m_url->getUrl());
}
//
@ -270,12 +239,14 @@ bool AdultCheck::isDocAdult() {
if( hasAdultRatingTag() ) {
m_reason = "adultRatingTag";
m_docAdultScore += 1000;
logTrace(g_conf.m_logTraceAdultCheck, "Rating tag found in %s", m_url->getUrl());
}
if( !m_docAdultScore &&
hasAdultAds() ) {
m_reason = "adultAds";
m_docAdultScore += 1000;
logTrace(g_conf.m_logTraceAdultCheck, "Adult ads found in %s", m_url->getUrl());
}
}
@ -290,7 +261,7 @@ bool AdultCheck::isDocAdult() {
if( m_words ) {
if (!uniqueTermIds.set(sizeof(int64_t), 0, m_words->getNumWords()+5000, NULL, 0, false, "uniquetermids", false, 0)) {
log(LOG_ERROR,"Could not initialize uniqueTermIds hash table");
log(LOG_ERROR,"isDocAdult: Could not initialize uniqueTermIds hash table");
}
if( !m_words->getNumWords() ) {
@ -301,10 +272,13 @@ bool AdultCheck::isDocAdult() {
g_adultCheckList.getDirtyScore(m_words, m_phrases, &uniqueTermIds, &m_docAdultScore, &m_numUniqueDirtyWords, &m_numUniqueDirtyPhrases, m_debbuf, m_debbufUsed, m_debbufSize);
m_numWordsChecked += m_words->getNumWords();
}
logTrace(g_conf.m_logTraceAdultCheck, "%" PRId32 " words checked (%" PRId32 " unique) in body: %s. %" PRId32 " unique dirty words, %" PRId32 " unique dirty phrases. Score: %" PRId32 "",
m_words->getNumWords(), uniqueTermIds.getNumUsedSlots(), m_url->getUrl(), m_numUniqueDirtyWords, m_numUniqueDirtyPhrases, m_docAdultScore);
}
else {
// No words in document body
m_emptyDocumentBody = true;
logTrace(g_conf.m_logTraceAdultCheck, "Document body is empty in %s", m_url->getUrl());
}
//
@ -322,18 +296,20 @@ bool AdultCheck::isDocAdult() {
}
mtag = m_xml->getMetaContentPointer( "description", 11, "name", &mtlen );
if( mtlen > 0 ) {
//log(LOG_ERROR, "SETTING DESCRIPTION WORDS");
metaw.addWords(mtag, mtlen, true);
}
if( metaw.getNumWords() ) {
if( !metab.set(&metaw) ) {
log(LOG_ERROR,"COULD NOT SET BITS FOR META WORDS");
log(LOG_ERROR,"isDocAdult: Could not set bits for meta words");
}
if( !metap.set(&metaw, &metab) ) {
log(LOG_ERROR,"COULD NOT SET PHRASES FOR META WORDS");
log(LOG_ERROR,"isDocAdult: Could not set phrases for meta words");
}
g_adultCheckList.getDirtyScore(&metaw, &metap, &uniqueTermIds, &m_docAdultScore, &m_numUniqueDirtyWords, &m_numUniqueDirtyPhrases, m_debbuf, m_debbufUsed, m_debbufSize);
m_numWordsChecked += metaw.getNumWords();
logTrace(g_conf.m_logTraceAdultCheck, "%" PRId32 " words checked (%" PRId32 " unique) in meta tags: %s. %" PRId32 " unique dirty words, %" PRId32 " unique dirty phrases. Score: %" PRId32 "",
metaw.getNumWords(), uniqueTermIds.getNumUsedSlots(), m_url->getUrl(), m_numUniqueDirtyWords, m_numUniqueDirtyPhrases, m_docAdultScore);
}
}
@ -347,20 +323,18 @@ bool AdultCheck::isDocAdult() {
urlw.set(m_url->getUrl(), m_url->getUrlLen(), true);
if( !urlb.set(&urlw) ) {
log(LOG_ERROR,"COULD NOT SET BITS FOR URL WORDS");
log(LOG_ERROR,"isDocAdult: Could not set bits for URL words");
}
if( !urlp.set(&urlw, &urlb) ) {
log(LOG_ERROR,"COULD NOT SET PHRASES FOR URL WORDS");
log(LOG_ERROR,"isDocAdult: Could not set phrases for URL words");
}
g_adultCheckList.getDirtyScore(&urlw, &urlp, &uniqueTermIds, &m_docAdultScore, &m_numUniqueDirtyWords, &m_numUniqueDirtyPhrases, m_debbuf, m_debbufUsed, m_debbufSize);
m_numWordsChecked += urlw.getNumWords();
}
if( m_docAdultScore > 0 ) {
m_reason = "adultTerms";
logTrace(g_conf.m_logTraceAdultCheck, "%" PRId32 " words checked (%" PRId32 " unique) in URL: %s. %" PRId32 " unique dirty words, %" PRId32 " unique dirty phrases. Score: %" PRId32 "",
urlw.getNumWords(), uniqueTermIds.getNumUsedSlots(), m_url->getUrl(), m_numUniqueDirtyWords, m_numUniqueDirtyPhrases, m_docAdultScore);
}
//
// Additional check for adult content compliance statement
//
@ -388,9 +362,12 @@ bool AdultCheck::isDocAdult() {
uniqueTermIds.getSlot(&hsc) >= 0) ||
uniqueTermIds.getSlot(&hsusc) >= 0
)) {
m_reason = "USC2257Disclaimer";
m_docAdultScore+=1000;
//log(LOG_ERROR,"@@@ USC 2257 compliance statement FOUND in %s: score=%" PRId32 "", url->getUrl(), m_docAdultScore);
//m_reason = "USC2257Disclaimer";
// Give it a score of 10 and count it as a phrase
m_docAdultScore += 10;
m_numUniqueDirtyPhrases++;
logTrace(g_conf.m_logTraceAdultCheck, "USC 2257 compliance statement found in %s: score=%" PRId32 "", m_url->getUrl(), m_docAdultScore);
}
//TODO:
@ -410,17 +387,20 @@ bool AdultCheck::isDocAdult() {
//Los padres, protegen a sus menores del Contenido Adulto con
//Os Pais devem usar um dos seguintes programas para salvaguardar os filhos do conteúdo erótico
//Bescherm minderjarigen tegen expliciete beelden op internet met software als Netnanny, Cyberpatrol of Cybersitter.
if( m_docAdultScore > 0 ) {
m_reason = "adultTerms";
}
}
logTrace(g_conf.m_logTraceAdultCheck, "Final score %" PRId32 " for: %s. %" PRId32 " unique dirty words, %" PRId32 " unique dirty phrases",
m_docAdultScore, m_url->getUrl(), m_numUniqueDirtyWords, m_numUniqueDirtyPhrases);
bool adult = false;
m_result = false;
if( ( m_docAdultScore >= 30 || m_numUniqueDirtyWords > 7) ||
( m_docAdultScore >= 30 || m_numUniqueDirtyPhrases >= 3) ) {
adult = true;
m_result = true;
}
m_result = adult;
m_resultValid = true;
return m_result;
@ -428,7 +408,6 @@ bool AdultCheck::isDocAdult() {
// Check for adult TLDs
// https://tld-list.com/tld-categories/adult
bool isAdultTLD(const char *tld, size_t tld_len) {

@ -24,7 +24,6 @@
#include <string>
#include "FxAdultCheckList.h"
class AdultCheck {
public:
AdultCheck(XmlDoc *xd, bool debug=false);

@ -17,11 +17,11 @@
// License TL;DR: If you change this file, you must publish your changes.
//
#include "FxAdultCheckList.h"
#include "Conf.h"
#include "Log.h"
#include "termid_mask.h"
#include "Phrases.h"
#include "Words.h"
#include <stddef.h>
#include <fstream>
#include <sys/stat.h>
@ -58,14 +58,14 @@ bool AdultCheckList::load() {
}
if( !loadScoredTermList(&m_dirtyTerms, "adultwords.txt") ) {
log(LOG_ERROR,"Could not load dirty word file");
log(LOG_ERROR,"Could not load 'adultwords.txt'");
}
//
// Initialize dirty phrases (bigrams) - use same hash table as words
//
if( !loadScoredTermList(&m_dirtyTerms, "adultphrases.txt") ) {
log(LOG_ERROR,"Could not load dirty phrase file");
log(LOG_ERROR,"Could not load 'adultphrases.txt'");
}
return true;
@ -95,14 +95,14 @@ bool AdultCheckList::loadScoredTermList(HashTableX *ht, const char *filename) {
size_t secondCol = line.find_first_not_of("|", firstColEnd);
if( firstColEnd == std::string::npos || secondCol == std::string::npos) {
// invalid format
log(LOG_ERROR,"Invalid line read: %.*s", (int)line.length(), line.data());
log(LOG_ERROR,"Invalid line read from %s: %.*s", filename, (int)line.length(), line.data());
continue;
}
size_t secondColEnd = line.find_first_of("|", secondCol);
size_t thirdCol = line.find_first_not_of("|", secondColEnd);
if (thirdCol == std::string::npos) {
// invalid format
log(LOG_ERROR,"Invalid line read: %.*s", (int)line.length(), line.data());
log(LOG_ERROR,"Invalid line read from %s: %.*s", filename, (int)line.length(), line.data());
continue;
}
@ -113,7 +113,7 @@ bool AdultCheckList::loadScoredTermList(HashTableX *ht, const char *filename) {
int32_t dwscore = atoi(col3.data());
if( dwscore < 1 || col2.length() < 1 || col3.length() < 1 ) {
log(LOG_ERROR,"Invalid line read: %.*s", (int)line.length(), line.data());
log(LOG_ERROR,"Invalid line read from %s: %.*s", filename, (int)line.length(), line.data());
continue;
}
@ -122,7 +122,7 @@ bool AdultCheckList::loadScoredTermList(HashTableX *ht, const char *filename) {
int64_t dwid = hash64Lower_utf8_nospaces(col2.data(), col2.length());
if( !ht->addKey(&dwid, &dwscore) ) {
log(LOG_ERROR,"COULD NOT ADD [%.*s] TO DIRTY WORD LIST", (int)col2.length(), col2.data());
log(LOG_ERROR,"Could not add [%.*s] to dirty word list", (int)col2.length(), col2.data());
return false;
}
}
@ -145,12 +145,9 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
}
int rc;
bool debug=false;
const uint64_t *wids = reinterpret_cast<const uint64_t*>(w->getWordIds());
int32_t nw = w->getNumWords();
// log(LOG_ERROR,"=== numWords=%" PRId32 ", debbuf=%p, debbuf_size=%" PRId32 " ===", nw, debbuf, debbuf_size);
for(int32_t i=0; i < nw; i++) {
if( !wids[i] ) {
continue;
@ -166,7 +163,7 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
// only process if we haven't seen it before
if ( uniqueTermIds->getSlot( &termId ) >= 0 ) {
if(debug) log(LOG_ERROR,"ALREADY SEEN WORD %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK));
logTrace(g_conf.m_logTraceAdultCheck, "Already seen word %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK));
}
else {
// add to hash table. return NULL and set g_errno on error
@ -176,7 +173,7 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
int32_t *sc = (int32_t*)m_dirtyTerms.getValue64(termId);
if( sc ) {
if(debug) log(LOG_ERROR,"DIRTY WORD %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ") score %" PRId32 ". debbuf_used=%" PRId32 ", debbuf_size=%" PRId32 "", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK), *sc, debbuf_used, debbuf_size);
logTrace(g_conf.m_logTraceAdultCheck, "Dirty word %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ") score %" PRId32 ". debbuf_used=%" PRId32 ", debbuf_size=%" PRId32 "", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK), *sc, debbuf_used, debbuf_size);
(*docAdultScore) += *sc;
(*numUniqueDirtyWords)++;
@ -198,7 +195,7 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
}
}
else {
if(debug) log(LOG_ERROR,"WORD %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK));
logTrace(g_conf.m_logTraceAdultCheck, "Word %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK));
}
}
@ -218,7 +215,7 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
int64_t phraseId = hash64Lower_utf8_nospaces( pbuf , plen );
if ( uniqueTermIds->getSlot ( &phraseId ) >= 0 ) {
if(debug) log(LOG_ERROR,"ALREADY SEEN PHRASE %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK));
logTrace(g_conf.m_logTraceAdultCheck, "Already seen phrase %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK));
continue;
}
@ -229,7 +226,7 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
int32_t *sc = (int32_t*)m_dirtyTerms.getValue64(phraseId);
if( sc ) {
if(debug) log(LOG_ERROR,"DIRTY PHRASE %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ") score %" PRId32 ". debbuf_used=%" PRId32 ", debbuf_size=%" PRId32 "", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK), *sc, debbuf_used, debbuf_size);
logTrace(g_conf.m_logTraceAdultCheck, "Dirty phrase %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ") score %" PRId32 ". debbuf_used=%" PRId32 ", debbuf_size=%" PRId32 "", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK), *sc, debbuf_used, debbuf_size);
(*docAdultScore) += *sc;
(*numUniqueDirtyPhrases)++;
@ -250,7 +247,7 @@ bool AdultCheckList::getDirtyScore(Words *w, Phrases *p, HashTableX *uniqueTermI
}
}
else {
if(debug) log(LOG_ERROR,"PHRASE %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK));
logTrace(g_conf.m_logTraceAdultCheck, "Phrase %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK));
}
}
}

@ -8623,6 +8623,13 @@ void Parms::init ( ) {
// log trace
////////////////////
m->m_title = "log trace info for AdultCheck";
m->m_cgi = "ltrc_adult";
simple_m_set(Conf,m_logTraceAdultCheck);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for BigFile";
m->m_cgi = "ltrc_bf";
simple_m_set(Conf,m_logTraceBigFile);

@ -4028,7 +4028,7 @@ static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t
AdultCheck achk(xd, true);
bool newblocked = achk.isDocAdult();
#if 0
// Sanity check.
bool gbadult = false;
char *adultbit = xd->getIsAdult();
@ -4040,6 +4040,7 @@ static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t
gbshutdownLogicError();
}
}
#endif
if( newblocked ) {
time_t idxtim = (time_t)xd->getIndexedTime();