added X-referring-url:, X-anchor-text:, and X-surrounding-text: headers to the diffbot HTTP request
parent 2bdbdb8982
commit 21a6b070a7
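The change below builds one X-referring-url / X-anchor-text / X-surrounding-text group per inlink (deduped on anchor text) and hands the whole block to g_httpServer.getDoc() as additional request headers. As a rough illustration only, the extra header block sent to diffbot would look something like this; the URL and text values are made up, and only the header names and the \r\n separators come from the commit itself:

X-referring-url: http://example.com/some/page.html
X-anchor-text: example anchor text
X-surrounding-text: a sentence or two of page text around the link

Per the comment in the diff, the block intentionally has no trailing \r\n; HttpServer appends the final terminator itself.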
@@ -3443,7 +3443,8 @@ LinkInfo *makeLinkInfo ( char *coll ,
if ( g_conf.m_logDebugLinkInfo )
log("linkdb: inlink #%li is link spam: %s",
i,r->ptr_note);
continue;
if ( onlyNeedGoodInlinks )
continue;
}
// do a quick set
Inlink k; k.set ( r );

@@ -3508,7 +3509,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
//if ( r->m_linkTextScoreWeight <= 0 ) continue;
// ignore if spam
//if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
if ( r->m_isLinkSpam ) continue;
if ( r->m_isLinkSpam && onlyNeedGoodInlinks ) continue;
// are we internal?
bool internal = false;
if ( (r->m_ip&0x0000ffff) == (ip & 0x0000ffff) )
Makefile (4 changed lines)
@@ -108,6 +108,10 @@ endif

all: gb

g8: gb
	scp gb g8:/p/gb.new
	ssh g8 'cd /p/ ; ./gb stop ; ./gb installgb ; sleep 4 ; ./gb start'

utils: addtest blaster dump hashtest makeclusterdb makespiderdb membustest monitor seektest urlinfo treetest dnstest dmozparse gbtitletest

gb: $(OBJS) main.o $(LIBFILES)
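Assuming the new target works as written, a deploy to the g8 machine would presumably be invoked simply as "make g8"; the g8 hostname, the /p/gb.new destination, and the stop/installgb/start sequence all come from the recipe above.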
@@ -508,13 +508,6 @@ bool processLoop ( void *state ) {
// get the xmldoc
XmlDoc *xd = &st->m_xd;

// . save the ips.txt file if we are the test coll
// . saveTestBuf() is a function in Msge1.cpp
CollectionRec *cr = xd->getCollRec();
if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll , "test") )
// use same dir that XmlDoc::getTestDir() would use
saveTestBuf ( "test-page-parser" );

// error?
if ( g_errno ) return sendErrorReply ( st , g_errno );

@@ -522,6 +515,12 @@ bool processLoop ( void *state ) {
SafeBuf *xbuf = &st->m_xbuf;

if ( st->m_u && st->m_u[0] ) {
// . save the ips.txt file if we are the test coll
// . saveTestBuf() is a function in Msge1.cpp
CollectionRec *cr = xd->getCollRec();
if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
// use same dir that XmlDoc::getTestDir() would use
saveTestBuf ( "test-page-parser" );
// now get the meta list, in the process it will print out a
// bunch of junk into st->m_xbuf
char *metalist = xd->getMetaList ( );
@@ -8576,7 +8576,9 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( isForMsg20 ) continue;
// . if we are not submitted from the add url api, skip
// . if we have '!' then val is 1
if ( sreq->m_isAddUrl|| sreq->m_isInjecting ) {
if ( sreq->m_isAddUrl ||
sreq->m_isInjecting ||
sreq->m_isPageParser ) {
if ( val ) continue;
}
else {
XmlDoc.cpp (142 changed lines)
@@ -12308,6 +12308,17 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
setStatus ( "calling msg25 for url" );
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;

// we want to get all inlinks if doing a custom crawlbot crawl
// because we need the anchor text to pass in to diffbot
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
doLinkSpamCheck = false;
oneVotePerIpDom = false;
onlyNeedGoodInlinks = false;
}

// call it
char *url = getFirstUrl()->getUrl();
if ( ! m->getLinkInfo ( mysite ,

@@ -12327,8 +12338,8 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
//m_sitePop ,
oldLinkInfo1 ,
m_niceness ,
cr->m_doLinkSpamCheck ,
cr->m_oneVotePerIpDom ,
doLinkSpamCheck ,
oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
@@ -12710,6 +12721,13 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
if ( m_recycleDiffbotReplyValid )
return &m_recycleDiffbotReply;

// if from pageparser.cpp re-call diffbot for debugging
if ( getIsPageParser() ) {
m_recycleDiffbotReply = false;
m_recycleDiffbotReplyValid = true;
return &m_recycleDiffbotReply;
}

XmlDoc **odp = getOldXmlDoc( );
if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp;
XmlDoc *od = *odp;
@@ -12763,6 +12781,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
return &m_diffbotReply;
}

// if set from title rec, do not do it. we are possibly an "old doc"
// and we should only call diffbot.com with new docs
if ( m_setFromTitleRec ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}

// . check the url filters table to see if diffbot api is specified
// . just return "\0" if none, but NULL means error i guess
SafeBuf *au = getDiffbotApiUrl();
@@ -12841,8 +12866,94 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
return &m_diffbotReply;
}

// now include referring link anchor text, etc.
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;

setStatus("getting diffbot reply");

// set up dedup table for deduping on link text
HashTableX dedup;
char tmp[512];
if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") )
return NULL;

SafeBuf headers;
bool first = true;

// . make additional headers
// . add two headers for every "good" (non-dup) link
// . do NOT end headers in \r\n since HttpServer adds that!
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// sanity
if ( k->size_urlBuf <= 1 ) continue;
// skip if too long
if ( k->size_linkText > 1024 ) continue;
// or not enough! (size includes \0)
if ( k->size_linkText <= 1 ) continue;
// sanity check
char *txt = k->ptr_linkText;
long tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// this seems to happen sometimes..
if ( ! verifyUtf8 ( txt , tlen ) ) continue;
// if anchor text has \0 skip it
if ( gbstrlen(txt) != tlen ) continue;
// or if surrounding text has \0 skip as well
char *surStr = k->ptr_surroundingText;
long surLen = k->size_surroundingText;
if ( surLen > 0 ) surLen--;
if ( surStr && gbstrlen(surStr) != surLen ) continue;
// dedup on that
long h32 = hash32 ( txt , tlen );
if ( dedup.isInTable ( &h32 ) ) continue;
if ( ! dedup.addKey ( &h32 ) ) return NULL;
// separate with \r\n
if ( ! first && ! headers.safePrintf("\r\n" ) )
return NULL;
first = false;
// add to http header
if ( ! headers.safePrintf("X-referring-url: ") )
return NULL;
// do not include the terminating \0, so -1
if ( ! headers.safeMemcpy(k->ptr_urlBuf , k->size_urlBuf-1 ))
return NULL;
// and link text
if ( ! headers.safePrintf("\r\nX-anchor-text: ") )
return NULL;
// store the anchor text without any \r or \n chars
if ( ! headers.reserve ( tlen ) ) return NULL;
char *p = txt;
char *pend = txt + tlen;
for ( ; p < pend ; p++ ) {
if ( *p == '\r' ) continue;
if ( *p == '\n' ) continue;
headers.pushChar(*p);
}
// do not include it if more than 2000 chars big
if ( surLen > 0 && surLen < 2000 ) {
if ( ! headers.safePrintf("\r\nX-surrounding-text: ") )
return NULL;
// make room for copying the surrounding text
if ( ! headers.reserve ( surLen ) ) return NULL;
// copy minus any \r or \n so its mime header safe
p = surStr;
pend = surStr + surLen;
for ( ; p < pend ; p++ ) {
if ( *p == '\r' ) continue;
if ( *p == '\n' ) continue;
headers.pushChar(*p);
}
}
}

// make sure to null term the headers
if ( headers.length() && ! headers.nullTerm() ) return NULL;

//char *path = "api";
//if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 )
// path = "v2";
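For readers without the SafeBuf/HashTableX context, here is a minimal standalone sketch of what the loop above is doing, written against the standard library instead of the gb classes. The struct and function names are hypothetical stand-ins; the real code additionally verifies the text is valid UTF-8 and checks for embedded NUL bytes, as shown in the diff.

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_set>
#include <vector>

struct InlinkLite {                  // hypothetical stand-in for gb's Inlink
	std::string url;
	std::string anchorText;
	std::string surroundingText;
};

// drop \r and \n so a value cannot break out of its header line
static std::string stripCrLf(const std::string &s) {
	std::string out;
	out.reserve(s.size());
	for (char c : s)
		if (c != '\r' && c != '\n') out.push_back(c);
	return out;
}

// build the extra header block: one group of headers per unique anchor text,
// joined with \r\n and with no trailing \r\n (the HTTP layer adds that)
std::string buildDiffbotHeaders(const std::vector<InlinkLite> &inlinks) {
	std::unordered_set<std::size_t> seen;   // dedup on a hash of the anchor text
	std::string headers;
	for (const InlinkLite &k : inlinks) {
		if (k.url.empty() || k.anchorText.empty()) continue;
		if (k.anchorText.size() > 1024) continue;        // skip overly long anchors
		std::size_t h = std::hash<std::string>{}(k.anchorText);
		if (!seen.insert(h).second) continue;            // already emitted this text
		if (!headers.empty()) headers += "\r\n";
		headers += "X-referring-url: " + k.url;
		headers += "\r\nX-anchor-text: " + stripCrLf(k.anchorText);
		if (!k.surroundingText.empty() && k.surroundingText.size() < 2000)
			headers += "\r\nX-surrounding-text: " + stripCrLf(k.surroundingText);
	}
	return headers;
}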
@@ -12928,14 +13039,19 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// null term it
diffbotUrl.nullTerm();

log("diffbot: getting %s",diffbotUrl.getBufStart());

// mark as tried
m_sentToDiffbot = 1;

// count it for stats
cr->m_localCrawlInfo.m_pageProcessAttempts++;

char *additionalHeaders = NULL;
if ( headers.length() > 0 )
additionalHeaders = headers.getBufStart();

log("diffbot: getting %s headers=%s",diffbotUrl.getBufStart(),
additionalHeaders);

if ( ! g_httpServer.getDoc ( diffbotUrl.getBufStart() ,
0 , // ip
0 , // offset

@@ -12948,7 +13064,11 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
0,//proxyport
10000,//maxtextdoclen
10000,//maxotherdoclen
g_conf.m_spiderUserAgent ) )
g_conf.m_spiderUserAgent ,
"HTTP/1.0",
false, // do post?
NULL, // cookie
additionalHeaders ) )
// return -1 if blocked
return (SafeBuf *)-1;
// error?
@@ -18690,7 +18810,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// have indexed in od's diffbot reply buffer because they all
// were indexed with their own docids in the "m_dx" code below. so
// just delete them and we'll re-add from this doc's diffbot reply.
if ( od && od->m_diffbotJSONCount && ! *recycle ) {
if ( od && od->m_diffbotJSONCount && ! *recycle &&
// do not remove old json objects if pageparser.cpp test
// because that can not change the index, etc.
! getIsPageParser() ) {
// this returns false if it blocks
long *status = od->nukeJSONObjects();
if ( ! status || status == (void *)-1) return (char *)status;
@@ -19022,12 +19145,17 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (char *)dbr;

long dbrLen = dbr->length();

// do not index json items as separate docs if we are page parser
if ( getIsPageParser() ) dbrLen = 0;

//
// if we got a json object or two from diffbot, index them
// as their own child xmldocs.
// watch out for reply from diffbot of "-1" indicating error!
//
if ( dbr->length() > 3 ) {
if ( dbrLen > 3 ) {
// make sure diffbot reply is valid for sure
if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
// set status for this