Merge branch 'master' into diffbot
Conflicts: SearchInput.cpp XmlDoc.cpp
This commit is contained in:
commit
c77453348f
1
Conf.h
1
Conf.h
@ -601,6 +601,7 @@ class Conf {
|
||||
bool m_logDebugBuild ;
|
||||
bool m_logDebugBuildTime ;
|
||||
bool m_logDebugDb ;
|
||||
bool m_logDebugDirty ;
|
||||
bool m_logDebugDisk ;
|
||||
bool m_logDebugDns ;
|
||||
bool m_logDebugDownloads;
|
||||
|
@ -467,7 +467,8 @@ bool Msg40::prepareToGetDocIds ( ) {
|
||||
if ( m_si->m_familyFilter &&
|
||||
getDirtyPoints ( m_si->m_sbuf1.getBufStart() ,
|
||||
m_si->m_sbuf1.length() ,
|
||||
0 ) ) {
|
||||
0 ,
|
||||
NULL ) ) {
|
||||
// make sure the m_numDocIds gets set to 0
|
||||
m_msg3a.reset();
|
||||
m_queryCensored = true;
|
||||
|
@ -7356,6 +7356,14 @@ void Parms::init ( ) {
|
||||
m->m_priv = 1;
|
||||
m++;
|
||||
|
||||
m->m_title = "log debug dirty messages";
|
||||
m->m_cgi = "lddm";
|
||||
m->m_off = (char *)&g_conf.m_logDebugDirty - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m++;
|
||||
|
||||
m->m_title = "log debug disk messages";
|
||||
m->m_cgi = "lddi";
|
||||
m->m_off = (char *)&g_conf.m_logDebugDisk - g;
|
||||
|
@ -3002,6 +3002,7 @@ struct QueryField g_fields[] = {
|
||||
//{"gbruleset",FIELD_GBRULESET, true,"Obsolete."},
|
||||
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension."},
|
||||
{"filetype", FIELD_TYPE, false,"Same as type:"},
|
||||
{"gbisadult",FIELD_TYPE,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."},
|
||||
{"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."},
|
||||
{"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."},
|
||||
{"zipcode", FIELD_ZIP, false,"Same as zip:"},
|
||||
|
@ -975,6 +975,11 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
m_sbuf1.safeMemcpy(m_site,m_siteLen);
|
||||
}
|
||||
|
||||
if ( m_familyFilter ) {
|
||||
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
|
||||
m_sbuf1.safePrintf("gbisadult:0 | ");
|
||||
}
|
||||
|
||||
// append gblang: term
|
||||
if( m_gblang > 0 ) {
|
||||
//if( p > pstart ) *p++ = ' ';
|
||||
|
51
XmlDoc.cpp
51
XmlDoc.cpp
@ -3149,7 +3149,7 @@ static Needle s_dirtyWords [] = {
|
||||
{"blowjob" ,0,2,0,0,NULL,0,NULL},
|
||||
{"blow job" ,0,2,0,0,NULL,0,NULL},
|
||||
{"gangbang" ,0,2,0,0,NULL,0,NULL},
|
||||
{"xxx" ,0,2,0,0,NULL,0,NULL},
|
||||
{"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
|
||||
{"porn" ,0,2,0,0,NULL,0,NULL},
|
||||
{"felch" ,0,2,0,0,NULL,0,NULL},
|
||||
{"cunt" ,0,2,0,0,NULL,0,NULL},
|
||||
@ -3211,6 +3211,7 @@ static Needle s_dirtyWords [] = {
|
||||
{"sexclu" ,0,-2,0,0,NULL,0,NULL},
|
||||
{"sexo" ,0,-2,0,0,NULL,0,NULL},
|
||||
{"sexism" ,0,-2,0,0,NULL,0,NULL},
|
||||
{"sexpan" ,0,-2,0,0,NULL,0,NULL}, // buttonsexpanion
|
||||
{"same-sex" ,0,-2,0,0,NULL,0,NULL},
|
||||
{"opposite sex",0,-2,0,0,NULL,0,NULL},
|
||||
|
||||
@ -3359,7 +3360,7 @@ char *XmlDoc::getIsAdult ( ) {
|
||||
|
||||
// score that up
|
||||
long total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
|
||||
m_niceness );
|
||||
m_niceness , m_firstUrl.m_url );
|
||||
|
||||
// then the url
|
||||
//char *u = getFirstUrl()->getUrl();
|
||||
@ -3383,13 +3384,18 @@ char *XmlDoc::getIsAdult ( ) {
|
||||
m_isAdult2 = (bool)m_isAdult;
|
||||
// validate
|
||||
m_isAdultValid = true;
|
||||
|
||||
// note it
|
||||
if ( m_isAdult2 && g_conf.m_logDebugDirty )
|
||||
log("dirty: %s points = %li",m_firstUrl.m_url,total);
|
||||
|
||||
// no dirty words found
|
||||
return &m_isAdult2;
|
||||
}
|
||||
|
||||
|
||||
|
||||
long getDirtyPoints ( char *s , long slen , long niceness ) {
|
||||
long getDirtyPoints ( char *s , long slen , long niceness , char *url ) {
|
||||
// . use the matches function to get all the matches
|
||||
// . then check each match to see if it is actually a legit word
|
||||
// . actually match the dirty words, then match the clean words
|
||||
@ -3415,6 +3421,14 @@ long getDirtyPoints ( char *s , long slen , long niceness ) {
|
||||
// . uses +2/-2 for really dirty words
|
||||
// . uses +1/-1 for borderline dirty words
|
||||
points += s_dirtyWords[i].m_id;
|
||||
// log debug
|
||||
if ( ! g_conf.m_logDebugDirty ) continue;
|
||||
// show it in the log
|
||||
log("dirty: %s %li %s"
|
||||
,s_dirtyWords[i].m_string
|
||||
,(long)s_dirtyWords[i].m_id
|
||||
,url
|
||||
);
|
||||
}
|
||||
return points;
|
||||
}
|
||||
@ -15845,6 +15859,8 @@ bool XmlDoc::logIt ( ) {
|
||||
sb.safePrintf("oldpubdate=%s ",tmp );
|
||||
}
|
||||
|
||||
if ( m_isAdultValid )
|
||||
sb.safePrintf("isadult=%li ",(long)m_isAdult);
|
||||
|
||||
// only print if different now! good for grepping changes
|
||||
if ( m_oldDocValid && m_oldDoc &&
|
||||
@ -21081,6 +21097,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
if ( ! hashTagRec ( table ) ) return NULL;
|
||||
if ( ! hashAds ( table ) ) return NULL;
|
||||
if ( ! hashSubmitUrls ( table ) ) return NULL;
|
||||
if ( ! hashIsAdult ( table ) ) return NULL;
|
||||
|
||||
// hash sectionhash:xxxx terms
|
||||
if ( ! hashSections ( table ) ) return NULL;
|
||||
@ -22921,6 +22938,34 @@ Url *XmlDoc::getBaseUrl ( ) {
|
||||
return &m_baseUrl;
|
||||
}
|
||||
|
||||
// Adds a "gbisadult" term for this document to hash table "tt": the value
// "1" when getIsAdult() reports adult content, "0" otherwise. Invoked from
// XmlDoc::hashAll() alongside the other hashXxx() helpers.
// returns false and sets g_errno on error
|
||||
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
|
||||
|
||||
setStatus ("hashing isadult");
|
||||
|
||||
char *ia = getIsAdult();
|
||||
// this should not block or return error! should have been
|
||||
// set in prepareToMakeTitleRec() before hashAll() was called!
|
||||
if ( ! ia || ia == (void *)-1 ) {char *xx=NULL;*xx=0; } // writes through a NULL pointer, i.e. forces a crash if the assumption above is violated (presumably NULL = error, -1 = would block -- confirm against getIsAdult())
|
||||
|
||||
// index gbisadult:1 if adult or gbisadult:0 if not
|
||||
char *val;
|
||||
if ( *ia ) val = "1";
|
||||
else val = "0";
|
||||
|
||||
// update hash parms
|
||||
HashInfo hi;
|
||||
hi.m_tt = tt;
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
hi.m_prefix = "gbisadult"; // term prefix; "gbisadult:0" is the term SearchInput::setQueryBuffers() appends when the family filter is on
|
||||
hi.m_desc = "is document adult content";
|
||||
|
||||
// this returns false on failure
// NOTE(review): the literal 1 is presumably the byte length of val ("0" or "1") -- confirm against hashString()
|
||||
if ( ! hashString ( val,1,&hi ) ) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// hash destination urls for embedded gb search boxes
|
||||
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {
|
||||
|
4
XmlDoc.h
4
XmlDoc.h
@ -167,7 +167,7 @@ long intersectGigabits ( Msg20 **mp , // search results
|
||||
class GigabitInfo *top ,
|
||||
long niceness ) ;
|
||||
|
||||
long getDirtyPoints ( char *s , long len , long niceness ) ;
|
||||
long getDirtyPoints ( char *s , long len , long niceness , char *logUrl ) ;
|
||||
|
||||
bool storeTerm ( char *s ,
|
||||
long slen ,
|
||||
@ -757,6 +757,8 @@ class XmlDoc {
|
||||
bool hashAds(class HashTableX *table ) ;
|
||||
class Url *getBaseUrl ( ) ;
|
||||
bool hashSubmitUrls ( class HashTableX *table ) ;
|
||||
bool hashIsAdult ( class HashTableX *table ) ;
|
||||
|
||||
void set20 ( Msg20Request *req ) ;
|
||||
class Msg20Reply *getMsg20Reply ( ) ;
|
||||
char **getImageUrl() ;
|
||||
|
@ -232,6 +232,8 @@ char *getMatches2 ( Needle *needles ,
|
||||
QUICKPOLL(niceness);
|
||||
//if ( (char *)p - (char *)haystack >= 12508 )
|
||||
// log("hey");
|
||||
// analytics...
|
||||
|
||||
// is this a possible match? (this should be VERY fast)
|
||||
mask = s0[*(p+0)];
|
||||
if ( ! mask ) continue;
|
||||
@ -348,6 +350,9 @@ char *getMatches2 ( Needle *needles ,
|
||||
}
|
||||
// otherwise, just count it
|
||||
needles[j].m_count++;
|
||||
// see if we match another needle, fixes bug
|
||||
// of matching "anal" but not "analy[tics]"
|
||||
continue;
|
||||
// advance to next char in the haystack
|
||||
break;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user