Merge branch 'master' into diffbot

Conflicts:
	SearchInput.cpp
	XmlDoc.cpp
This commit is contained in:
Matt Wells 2013-09-18 09:23:48 -07:00
commit c77453348f
8 changed files with 73 additions and 5 deletions

1
Conf.h

@ -601,6 +601,7 @@ class Conf {
bool m_logDebugBuild ;
bool m_logDebugBuildTime ;
bool m_logDebugDb ;
bool m_logDebugDirty ;
bool m_logDebugDisk ;
bool m_logDebugDns ;
bool m_logDebugDownloads;

@ -467,7 +467,8 @@ bool Msg40::prepareToGetDocIds ( ) {
if ( m_si->m_familyFilter &&
getDirtyPoints ( m_si->m_sbuf1.getBufStart() ,
m_si->m_sbuf1.length() ,
0 ) ) {
0 ,
NULL ) ) {
// make sure the m_numDocIds gets set to 0
m_msg3a.reset();
m_queryCensored = true;

@ -7356,6 +7356,14 @@ void Parms::init ( ) {
m->m_priv = 1;
m++;
m->m_title = "log debug dirty messages";
m->m_cgi = "lddm";
m->m_off = (char *)&g_conf.m_logDebugDirty - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m++;
m->m_title = "log debug disk messages";
m->m_cgi = "lddi";
m->m_off = (char *)&g_conf.m_logDebugDisk - g;

@ -3002,6 +3002,7 @@ struct QueryField g_fields[] = {
//{"gbruleset",FIELD_GBRULESET, true,"Obsolete."},
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension."},
{"filetype", FIELD_TYPE, false,"Same as type:"},
{"gbisadult",FIELD_TYPE,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."},
{"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."},
{"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."},
{"zipcode", FIELD_ZIP, false,"Same as zip:"},

@ -975,6 +975,11 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
m_sbuf1.safeMemcpy(m_site,m_siteLen);
}
if ( m_familyFilter ) {
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
m_sbuf1.safePrintf("gbisadult:0 | ");
}
// append gblang: term
if( m_gblang > 0 ) {
//if( p > pstart ) *p++ = ' ';

@ -3149,7 +3149,7 @@ static Needle s_dirtyWords [] = {
{"blowjob" ,0,2,0,0,NULL,0,NULL},
{"blow job" ,0,2,0,0,NULL,0,NULL},
{"gangbang" ,0,2,0,0,NULL,0,NULL},
{"xxx" ,0,2,0,0,NULL,0,NULL},
{"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
{"porn" ,0,2,0,0,NULL,0,NULL},
{"felch" ,0,2,0,0,NULL,0,NULL},
{"cunt" ,0,2,0,0,NULL,0,NULL},
@ -3211,6 +3211,7 @@ static Needle s_dirtyWords [] = {
{"sexclu" ,0,-2,0,0,NULL,0,NULL},
{"sexo" ,0,-2,0,0,NULL,0,NULL},
{"sexism" ,0,-2,0,0,NULL,0,NULL},
{"sexpan" ,0,-2,0,0,NULL,0,NULL}, // buttonsexpanion
{"same-sex" ,0,-2,0,0,NULL,0,NULL},
{"opposite sex",0,-2,0,0,NULL,0,NULL},
@ -3359,7 +3360,7 @@ char *XmlDoc::getIsAdult ( ) {
// score that up
long total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
m_niceness );
m_niceness , m_firstUrl.m_url );
// then the url
//char *u = getFirstUrl()->getUrl();
@ -3383,13 +3384,18 @@ char *XmlDoc::getIsAdult ( ) {
m_isAdult2 = (bool)m_isAdult;
// validate
m_isAdultValid = true;
// note it
if ( m_isAdult2 && g_conf.m_logDebugDirty )
log("dirty: %s points = %li",m_firstUrl.m_url,total);
// no dirty words found
return &m_isAdult2;
}
long getDirtyPoints ( char *s , long slen , long niceness ) {
long getDirtyPoints ( char *s , long slen , long niceness , char *url ) {
// . use the matches function to get all the matches
// . then check each match to see if it is actually a legit word
// . actually match the dirty words, then match the clean words
@ -3415,6 +3421,14 @@ long getDirtyPoints ( char *s , long slen , long niceness ) {
// . uses +2/-2 for really dirty words
// . uses +1/-1 for borderline dirty words
points += s_dirtyWords[i].m_id;
// log debug
if ( ! g_conf.m_logDebugDirty ) continue;
// show it in the log
log("dirty: %s %li %s"
,s_dirtyWords[i].m_string
,(long)s_dirtyWords[i].m_id
,url
);
}
return points;
}
@ -15845,6 +15859,8 @@ bool XmlDoc::logIt ( ) {
sb.safePrintf("oldpubdate=%s ",tmp );
}
if ( m_isAdultValid )
sb.safePrintf("isadult=%li ",(long)m_isAdult);
// only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc &&
@ -21081,6 +21097,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashTagRec ( table ) ) return NULL;
if ( ! hashAds ( table ) ) return NULL;
if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) ) return NULL;
// hash sectionhash:xxxx terms
if ( ! hashSections ( table ) ) return NULL;
@ -22921,6 +22938,34 @@ Url *XmlDoc::getBaseUrl ( ) {
return &m_baseUrl;
}
// . hash a "gbisadult" term for this document: value "1" if getIsAdult()
//   says it is adult, value "0" otherwise
// . returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {

	setStatus ("hashing isadult");

	// the adult flag should already be available here: it should have
	// been set in prepareToMakeTitleRec() before hashAll() was called,
	// so this call should not block or return error!
	char *adultFlag = getIsAdult();
	bool  ready     = ( adultFlag && adultFlag != (void *)-1 );
	// a NULL or -1 result is treated as a logic error: crash on purpose
	// by writing through a NULL pointer
	if ( ! ready ) { char *xx=NULL;*xx=0; }

	// assume non-adult, that is gbisadult:0 ...
	char *term = "0";
	// ... and switch to gbisadult:1 if the flag is set
	if ( *adultFlag ) term = "1";

	// set up the hash parms
	HashInfo info;
	info.m_tt        = tt;
	info.m_hashGroup = HASHGROUP_INTAG;
	info.m_prefix    = "gbisadult";
	info.m_desc      = "is document adult content";

	// hash the single-character value; this returns false on failure
	// and we pass that straight back to the caller
	return hashString ( term , 1 , &info );
}
// hash destination urls for embedded gb search boxes
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {

@ -167,7 +167,7 @@ long intersectGigabits ( Msg20 **mp , // search results
class GigabitInfo *top ,
long niceness ) ;
long getDirtyPoints ( char *s , long len , long niceness ) ;
long getDirtyPoints ( char *s , long len , long niceness , char *logUrl ) ;
bool storeTerm ( char *s ,
long slen ,
@ -757,6 +757,8 @@ class XmlDoc {
bool hashAds(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
bool hashSubmitUrls ( class HashTableX *table ) ;
bool hashIsAdult ( class HashTableX *table ) ;
void set20 ( Msg20Request *req ) ;
class Msg20Reply *getMsg20Reply ( ) ;
char **getImageUrl() ;

@ -232,6 +232,8 @@ char *getMatches2 ( Needle *needles ,
QUICKPOLL(niceness);
//if ( (char *)p - (char *)haystack >= 12508 )
// log("hey");
// analytics...
// is this a possible match? (this should be VERY fast)
mask = s0[*(p+0)];
if ( ! mask ) continue;
@ -348,6 +350,9 @@ char *getMatches2 ( Needle *needles ,
}
// otherwise, just count it
needles[j].m_count++;
// see if we match another needle, fixes bug
// of matching "anal" but not "analy[tics]"
continue;
// advance to next char in the haystack
break;
}