added X-referring-url: X-anchor-text: and

X-surrounding-text: to diffbot http request header.
2013-10-31 11:44:09 -07:00 · 2013-10-31 11:44:09 -07:00 · 21a6b070a7
commit 21a6b070a7
parent 2bdbdb8982
5 changed files with 151 additions and 17 deletions
--- a/Linkdb.cpp
+++ b/Linkdb.cpp
@ -3443,7 +3443,8 @@ LinkInfo *makeLinkInfo ( char        *coll                    ,
 			if ( g_conf.m_logDebugLinkInfo ) 
 				log("linkdb: inlink #%li is link spam: %s",
 				    i,r->ptr_note);
-			continue;
+			if ( onlyNeedGoodInlinks )
+				continue;
 		}
 		// do a quick set
 		Inlink k; k.set ( r );
@ -3508,7 +3509,7 @@ LinkInfo *makeLinkInfo ( char        *coll                    ,
 		//if ( r->m_linkTextScoreWeight <= 0 ) continue;
 		// ignore if spam
 		//if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
-		if ( r->m_isLinkSpam ) continue;
+		if ( r->m_isLinkSpam && onlyNeedGoodInlinks ) continue;
 		// are we internal?
 		bool internal = false;
 		if ( (r->m_ip&0x0000ffff) == (ip & 0x0000ffff) ) 
--- a/4
+++ b/4
@ -108,6 +108,10 @@ endif

 all: gb

+g8: gb
+	scp gb g8:/p/gb.new
+	ssh g8 'cd /p/ ; ./gb stop ; ./gb installgb ; sleep 4 ; ./gb start'
+
 utils: addtest blaster dump hashtest makeclusterdb makespiderdb membustest monitor seektest urlinfo treetest dnstest dmozparse gbtitletest

 gb: $(OBJS) main.o $(LIBFILES)
--- a/PageParser.cpp
+++ b/PageParser.cpp
@ -508,13 +508,6 @@ bool processLoop ( void *state ) {
 	// get the xmldoc
 	XmlDoc *xd = &st->m_xd;

-	// . save the ips.txt file if we are the test coll
-	// . saveTestBuf() is a function in Msge1.cpp
-	CollectionRec *cr = xd->getCollRec();
-	if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll , "test") )
-		// use same dir that XmlDoc::getTestDir() would use
-		saveTestBuf ( "test-page-parser" );
-
 	// error?
 	if ( g_errno ) return sendErrorReply ( st , g_errno );

@ -522,6 +515,12 @@ bool processLoop ( void *state ) {
 	SafeBuf *xbuf = &st->m_xbuf;

 	if ( st->m_u && st->m_u[0] ) {
+		// . save the ips.txt file if we are the test coll
+		// . saveTestBuf() is a function in Msge1.cpp
+		CollectionRec *cr = xd->getCollRec();
+		if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
+			// use same dir that XmlDoc::getTestDir() would use
+			saveTestBuf ( "test-page-parser" );
 		// now get the meta list, in the process it will print out a 
 		// bunch of junk into st->m_xbuf
 		char *metalist = xd->getMetaList ( );
--- a/Spider.cpp
+++ b/Spider.cpp
@ -8576,7 +8576,9 @@ long getUrlFilterNum2 ( SpiderRequest *sreq       ,
 			if ( isForMsg20 ) continue;
 			// . if we are not submitted from the add url api, skip
 			// . if we have '!' then val is 1
-			if ( sreq->m_isAddUrl|| sreq->m_isInjecting ) {
+			if ( sreq->m_isAddUrl    || 
+			     sreq->m_isInjecting ||
+			     sreq->m_isPageParser ) {
 				if ( val ) continue;
 			}
 			else {
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -12308,6 +12308,17 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
 		setStatus ( "calling msg25 for url" );
 		CollectionRec *cr = getCollRec();
 		if ( ! cr ) return NULL;
+
+		// we want to get all inlinks if doing a custom crawlbot crawl
+		// because we need the anchor text to pass in to diffbot
+		bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
+		bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
+		if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
+			doLinkSpamCheck     = false;
+			oneVotePerIpDom     = false;
+			onlyNeedGoodInlinks = false;
+		}
+
 		// call it
 		char *url = getFirstUrl()->getUrl();
 		if ( ! m->getLinkInfo ( mysite , 
@ -12327,8 +12338,8 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
 					//m_sitePop           ,
 					oldLinkInfo1        ,	     
 					m_niceness          ,
-					cr->m_doLinkSpamCheck ,
-					cr->m_oneVotePerIpDom ,
+					doLinkSpamCheck ,
+					oneVotePerIpDom ,
 					canBeCancelled        ,
 					lastUpdateTime ,
 					onlyNeedGoodInlinks ,
@ -12710,6 +12721,13 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	if ( m_recycleDiffbotReplyValid )
 		return &m_recycleDiffbotReply;

+	// if from pageparser.cpp re-call diffbot for debugging
+	if ( getIsPageParser() ) {
+		m_recycleDiffbotReply = false;
+		m_recycleDiffbotReplyValid = true;
+		return &m_recycleDiffbotReply;
+	}
+
 	XmlDoc **odp = getOldXmlDoc( );
 	if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp;
 	XmlDoc *od = *odp;
@ -12763,6 +12781,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 		return &m_diffbotReply;
 	}

+	// if set from title rec, do not do it. we are possibly an "old doc"
+	// and we should only call diffbot.com with new docs
+	if ( m_setFromTitleRec ) {
+		m_diffbotReplyValid = true;
+		return &m_diffbotReply;
+	}
+
 	// . check the url filters table to see if diffbot api is specified
 	// . just return "\0" if none, but NULL means error i guess
 	SafeBuf *au = getDiffbotApiUrl();
@ -12841,8 +12866,94 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 		return &m_diffbotReply;
 	}

+	// now include referring link anchor text, etc.
+	LinkInfo  *info1    = getLinkInfo1 ();
+	if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
+
+
 	setStatus("getting diffbot reply");

+
+	// set up dedup table for deduping on link text
+	HashTableX dedup;
+	char tmp[512];
+	if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") )
+		return NULL;
+
+	SafeBuf headers;
+	bool first = true;
+
+	// . make additional headers
+	// . add 2 or 3 headers (3rd is optional) per "good" (non-dup) link
+	// . do NOT end headers in \r\n since HttpServer adds that!
+	for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
+		// breathe
+		QUICKPOLL(m_niceness);
+		// sanity
+		if ( k->size_urlBuf <= 1 ) continue;
+		// skip if too long
+		if ( k->size_linkText > 1024 ) continue;
+		// or not enough! (size includes \0)
+		if ( k->size_linkText <= 1 ) continue;
+		// sanity check
+		char *txt = k->ptr_linkText;
+		long tlen = k->size_linkText;
+		if ( tlen > 0 ) tlen--;
+		// this seems to happen sometimes..
+		if ( ! verifyUtf8 ( txt , tlen ) ) continue;
+		// if anchor text has \0 skip it
+		if ( gbstrlen(txt) != tlen ) continue;
+		// or if surrounding text has \0 skip as well
+		char *surStr = k->ptr_surroundingText;
+		long  surLen = k->size_surroundingText;
+		if ( surLen > 0 ) surLen--;
+		if ( surStr && gbstrlen(surStr) != surLen ) continue;
+		// dedup on that
+		long h32 = hash32 ( txt , tlen );
+		if ( dedup.isInTable ( &h32 ) ) continue;
+		if ( ! dedup.addKey ( &h32 ) ) return NULL;
+		// separate with \r\n
+		if ( ! first && ! headers.safePrintf("\r\n" ) ) 
+			return NULL;
+		first = false;
+		// add to http header
+		if ( ! headers.safePrintf("X-referring-url: ") ) 
+			return NULL;
+		// do not include the terminating \0, so -1
+		if ( ! headers.safeMemcpy(k->ptr_urlBuf , k->size_urlBuf-1 ))
+			return NULL;
+		// and link text
+		if ( ! headers.safePrintf("\r\nX-anchor-text: ") ) 
+			return NULL;
+		// store the anchor text without any \r or \n chars
+		if ( ! headers.reserve ( tlen ) ) return NULL;
+		char *p    = txt;
+		char *pend = txt + tlen;
+		for ( ; p < pend ; p++ ) {
+			if ( *p == '\r' ) continue;
+			if ( *p == '\n' ) continue;
+			headers.pushChar(*p);
+		}
+		// only include surrounding text if non-empty and < 2000 chars
+		if ( surLen > 0 && surLen < 2000 ) {
+			if ( ! headers.safePrintf("\r\nX-surrounding-text: ") )
+				return NULL;
+			// make room for copying the surrounding text
+			if ( ! headers.reserve ( surLen ) ) return NULL;
+			// copy minus any \r or \n so its mime header safe
+			p    = surStr;
+			pend = surStr + surLen;
+			for ( ; p < pend ; p++ ) {
+				if ( *p == '\r' ) continue;
+				if ( *p == '\n' ) continue;
+				headers.pushChar(*p);
+			}
+		}
+	}
+
+	// make sure to null term the headers
+	if ( headers.length() && ! headers.nullTerm() ) return NULL;
+
 	//char *path = "api";
 	//if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 )
 	//	path = "v2";
@ -12928,14 +13039,19 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 	// null term it
 	diffbotUrl.nullTerm();

-	log("diffbot: getting %s",diffbotUrl.getBufStart());
-
 	// mark as tried
 	m_sentToDiffbot = 1;
 	
 	// count it for stats
 	cr->m_localCrawlInfo.m_pageProcessAttempts++;

+	char *additionalHeaders = NULL;
+	if ( headers.length() > 0 )
+		additionalHeaders = headers.getBufStart();
+
+	log("diffbot: getting %s headers=%s",diffbotUrl.getBufStart(),
+	    additionalHeaders);
+
 	if ( ! g_httpServer.getDoc ( diffbotUrl.getBufStart() ,
 				     0 , // ip
 				     0 , // offset
@ -12948,7 +13064,11 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 				     0,//proxyport
 				     10000,//maxtextdoclen
 				     10000,//maxotherdoclen
-				     g_conf.m_spiderUserAgent ) )
+				     g_conf.m_spiderUserAgent ,
+				     "HTTP/1.0",
+				     false, // do post?
+				     NULL, // cookie
+				     additionalHeaders ) )
 		// return -1 if blocked
 		return (SafeBuf *)-1;
 	// error?
@ -18690,7 +18810,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	// have indexed in od's diffbot reply buffer because they all
 	// were indexed with their own docids in the "m_dx" code below. so
 	// just delete them and we'll re-add from this doc's diffbot reply.
-	if ( od && od->m_diffbotJSONCount && ! *recycle ) {
+	if ( od && od->m_diffbotJSONCount && ! *recycle && 
+	     // do not remove old json objects if pageparser.cpp test
+	     // because that can not change the index, etc.
+	     ! getIsPageParser() ) {
 		// this returns false if it blocks
 		long *status = od->nukeJSONObjects();
 		if ( ! status || status == (void *)-1) return (char *)status;
@ -19022,12 +19145,17 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	SafeBuf *dbr = getDiffbotReply();
 	if ( ! dbr || dbr == (void *)-1 ) return (char *)dbr;

+	long dbrLen = dbr->length();
+
+	// do not index json items as separate docs if we are page parser
+	if ( getIsPageParser() ) dbrLen = 0;
+
 	//
 	// if we got a json object or two from diffbot, index them
 	// as their own child xmldocs.
 	// watch out for reply from diffbot of "-1" indicating error!
 	//
-	if ( dbr->length() > 3 ) {
+	if ( dbrLen > 3 ) {
 		// make sure diffbot reply is valid for sure
 		if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
 		// set status for this