Merge branch 'master' into diffbot

Conflicts: SearchInput.cpp XmlDoc.cpp
2013-09-18 09:23:48 -07:00 · 2013-09-18 09:23:48 -07:00 · c77453348f
commit c77453348f
parent 10fcfb6987 d6815f2c9d
8 changed files with 73 additions and 5 deletions
--- a/Conf.h
+++ b/Conf.h
@ -601,6 +601,7 @@ class Conf {
 	bool  m_logDebugBuild   ;
 	bool  m_logDebugBuildTime ;
 	bool  m_logDebugDb      ;
+	bool  m_logDebugDirty   ;
 	bool  m_logDebugDisk    ;
 	bool  m_logDebugDns     ;
 	bool  m_logDebugDownloads;
--- a/Msg40.cpp
+++ b/Msg40.cpp
@ -467,7 +467,8 @@ bool Msg40::prepareToGetDocIds ( ) {
 	if ( m_si->m_familyFilter && 
 	     getDirtyPoints ( m_si->m_sbuf1.getBufStart() , 
 			      m_si->m_sbuf1.length() , 
-			      0 ) ) {
+			      0 ,
+			      NULL ) ) {
 		// make sure the m_numDocIds gets set to 0
 		m_msg3a.reset();
 		m_queryCensored = true;
--- a/Parms.cpp
+++ b/Parms.cpp
@ -7356,6 +7356,14 @@ void Parms::init ( ) {
 	m->m_priv  = 1;
 	m++;

+	m->m_title = "log debug dirty messages";
+	m->m_cgi   = "lddm";
+	m->m_off   = (char *)&g_conf.m_logDebugDirty - g;
+	m->m_type  = TYPE_BOOL;
+	m->m_def   = "0";
+	m->m_priv  = 1;
+	m++;
+
 	m->m_title = "log debug disk messages";
 	m->m_cgi   = "lddi";
 	m->m_off   = (char *)&g_conf.m_logDebugDisk - g;
--- a/Query.cpp
+++ b/Query.cpp
@ -3002,6 +3002,7 @@ struct QueryField g_fields[] = {
 	//{"gbruleset",FIELD_GBRULESET, true,"Obsolete."},
 	{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension."},
 	{"filetype", FIELD_TYPE, false,"Same as type:"},
+	{"gbisadult",FIELD_TYPE,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."},
 	{"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."},
 	{"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."},
 	{"zipcode", FIELD_ZIP, false,"Same as zip:"},
--- a/SearchInput.cpp
+++ b/SearchInput.cpp
@ -975,6 +975,11 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
 		m_sbuf1.safeMemcpy(m_site,m_siteLen);
 	}

+	if ( m_familyFilter ) {
+		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
+		m_sbuf1.safePrintf("gbisadult:0 | ");
+	}
+
 	// append gblang: term
 	if( m_gblang > 0 ) {
 		//if( p > pstart ) *p++ =  ' ';
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -3149,7 +3149,7 @@ static Needle s_dirtyWords []  = {
 	{"blowjob"    ,0,2,0,0,NULL,0,NULL},
 	{"blow job"   ,0,2,0,0,NULL,0,NULL},
 	{"gangbang"   ,0,2,0,0,NULL,0,NULL},
-	{"xxx"        ,0,2,0,0,NULL,0,NULL},
+	{"xxx"        ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
 	{"porn"       ,0,2,0,0,NULL,0,NULL},
 	{"felch"      ,0,2,0,0,NULL,0,NULL},
 	{"cunt"       ,0,2,0,0,NULL,0,NULL},
@ -3211,6 +3211,7 @@ static Needle s_dirtyWords []  = {
 	{"sexclu"      ,0,-2,0,0,NULL,0,NULL},
 	{"sexo"        ,0,-2,0,0,NULL,0,NULL},
 	{"sexism"      ,0,-2,0,0,NULL,0,NULL},
+	{"sexpan"      ,0,-2,0,0,NULL,0,NULL}, // buttonsexpanion
 	{"same-sex"    ,0,-2,0,0,NULL,0,NULL},
 	{"opposite sex",0,-2,0,0,NULL,0,NULL},

@ -3359,7 +3360,7 @@ char *XmlDoc::getIsAdult ( ) {

 	// score that up
 	long total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
-				      m_niceness );
+				      m_niceness , m_firstUrl.m_url );

 	// then the url
 	//char *u = getFirstUrl()->getUrl();
@ -3383,13 +3384,18 @@ char *XmlDoc::getIsAdult ( ) {
 	m_isAdult2 = (bool)m_isAdult;
 	// validate
 	m_isAdultValid = true;
+
+	// note it
+	if ( m_isAdult2 && g_conf.m_logDebugDirty )
+		log("dirty: %s points = %li",m_firstUrl.m_url,total);
+
 	// no dirty words found
 	return &m_isAdult2;
 }



-long getDirtyPoints ( char *s , long slen , long niceness ) {
+long getDirtyPoints ( char *s , long slen , long niceness , char *url ) {
 	// . use the matches function to get all the matches
 	// . then check each match to see if it is actually a legit word
 	// . actually match the dirty words, then match the clean words
@ -3415,6 +3421,14 @@ long getDirtyPoints ( char *s , long slen , long niceness ) {
 		// . uses +2/-2 for really dirty words 
 		// . uses +1/-1 for borderline dirty words
 		points += s_dirtyWords[i].m_id;
+		// log debug
+		if ( ! g_conf.m_logDebugDirty ) continue;
+		// show it in the log
+		log("dirty: %s %li %s"
+		    ,s_dirtyWords[i].m_string
+		    ,(long)s_dirtyWords[i].m_id
+		    ,url
+		    );
 	}
 	return points;
 }
@ -15845,6 +15859,8 @@ bool XmlDoc::logIt ( ) {
 		sb.safePrintf("oldpubdate=%s ",tmp );
 	}

+	if ( m_isAdultValid )
+		sb.safePrintf("isadult=%li ",(long)m_isAdult);

 	// only print if different now! good for grepping changes
 	if ( m_oldDocValid && m_oldDoc && 
@ -21081,6 +21097,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
 	if ( ! hashTagRec        ( table ) ) return NULL;
 	if ( ! hashAds           ( table ) ) return NULL;
 	if ( ! hashSubmitUrls    ( table ) ) return NULL;
+	if ( ! hashIsAdult       ( table ) ) return NULL;

 	// hash sectionhash:xxxx terms
 	if ( ! hashSections   ( table ) ) return NULL;
@ -22921,6 +22938,34 @@ Url *XmlDoc::getBaseUrl ( ) {
 	return &m_baseUrl;
 }

+// index gbisadult:1 for adult documents and gbisadult:0 for all others.
+// returns false and sets g_errno on error
+bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
+
+	setStatus ("hashing isadult");
+
+	// the adult flag should already be available at this point (the
+	// original note says prepareToMakeTitleRec() sets it before hashAll()
+	// is called), so a NULL or -1 return is treated as fatal: core on it
+	char *isAdult = getIsAdult();
+	if ( isAdult == NULL || isAdult == (void *)-1 ) {
+		char *xx=NULL;*xx=0; }
+
+	// the term value is "1" when the flag is set, "0" otherwise
+	char *val = "0";
+	if ( *isAdult ) val = "1";
+
+	// fill in the hash parms for the gbisadult: term
+	HashInfo hi;
+	hi.m_prefix    = "gbisadult";
+	hi.m_desc      = "is document adult content";
+	hi.m_hashGroup = HASHGROUP_INTAG;
+	hi.m_tt        = tt;
+
+	// hashString() returns false on failure, so pass its result through
+	return hashString ( val , 1 , &hi );
+}
+

 // hash destination urls for embedded gb search boxes
 bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -167,7 +167,7 @@ long intersectGigabits ( Msg20       **mp          ,   // search results
 			 class GigabitInfo  *top   ,
 			 long          niceness    ) ;

-long getDirtyPoints ( char *s , long len , long niceness ) ;
+long getDirtyPoints ( char *s , long len , long niceness , char *logUrl ) ;

 bool storeTerm ( char             *s        ,
                 long              slen     ,
@ -757,6 +757,8 @@ class XmlDoc {
 	bool hashAds(class HashTableX *table ) ;
 	class Url *getBaseUrl ( ) ;
 	bool hashSubmitUrls ( class HashTableX *table ) ;
+	bool hashIsAdult    ( class HashTableX *table ) ;
+
 	void set20 ( Msg20Request *req ) ;
 	class Msg20Reply *getMsg20Reply ( ) ;
 	char **getImageUrl() ;
--- a/matches.cpp
+++ b/matches.cpp
@ -232,6 +232,8 @@ char *getMatches2 ( Needle *needles          ,
 		QUICKPOLL(niceness);
 		//if ( (char *)p - (char *)haystack >= 12508 )
 		//	log("hey");
+		// analytics...
+		
 		// is this a possible match? (this should be VERY fast)
 		mask  = s0[*(p+0)];
 		if ( ! mask ) continue;
@ -348,6 +350,9 @@ char *getMatches2 ( Needle *needles          ,
 			}
 			// otherwise, just count it
 			needles[j].m_count++;
+			// see if we match another needle, fixes bug
+			// of matching "anal" but not "analy[tics]"
+			continue;
 			// advance to next char in the haystack
 			break;
 		}