53370 lines
1.6 MiB
53370 lines
1.6 MiB
//-*- coding: utf-8 -*-
|
|
|
|
#include "gb-include.h"
|
|
|
|
#include "hash.h"
|
|
#include "XmlDoc.h"
|
|
#include "Indexdb.h" // for TERMID_MASK definition and g_indexdb.getTermId()
|
|
#include "Conf.h"
|
|
#include "Query.h" // getFieldCode()
|
|
#include "Clusterdb.h" // g_clusterdb
|
|
#include "Categories.h" // g_categories
|
|
#include "iana_charset.h"
|
|
//#include "Checksumdb.h"
|
|
//#include "Msg24.h"
|
|
#include "Stats.h"
|
|
#include "Sanity.h"
|
|
#include "Speller.h"
|
|
#include "CountryCode.h"
|
|
//#include "SiteBonus.h"
|
|
#include "linkspam.h"
|
|
#include "Tagdb.h"
|
|
//#include "Dates.h"
|
|
#include "Repair.h"
|
|
//#include "Links.h"
|
|
#include "HashTableX.h"
|
|
#include "LanguageIdentifier.h" // g_langId
|
|
#include "CountryCode.h" // g_countryCode
|
|
#include "sort.h"
|
|
#include "Wiki.h"
|
|
#include "Speller.h"
|
|
#include "SiteGetter.h"
|
|
#include "Placedb.h"
|
|
#include "Test.h"
|
|
#include "Synonyms.h"
|
|
//#include "Revdb.h"
|
|
#include "Timedb.h"
|
|
#ifdef _USETURKS_
|
|
//#include "PageTurk.h"
|
|
#endif
|
|
#include "PageInject.h"
|
|
#include "HttpServer.h"
|
|
#include "Facebook.h"
|
|
#include "Posdb.h"
|
|
#include "Highlight.h"
|
|
#include "Wiktionary.h"
|
|
#include "seo.h" // Msg99Request etc.
|
|
//#include <regex.h>
|
|
#include "PingServer.h"
|
|
#include "Parms.h"
|
|
|
|
extern int g_inMemcpy;
|
|
|
|
//#define MAXDOCLEN (1024*1024 * 5)
|
|
//#define MAXDOCLEN (1024*1024)
|
|
|
|
HashTableX *g_ct = NULL;
|
|
XmlDoc *g_doc = NULL;
|
|
char *g_ptr = NULL;
|
|
int32_t *g_int32_t = NULL;
|
|
|
|
#define SENT_UNITS 30
|
|
|
|
static int32_t getIsContacty ( Url *url ,
|
|
LinkInfo *info1 ,
|
|
int32_t hops ,
|
|
uint8_t ct ,
|
|
bool isRoot ,
|
|
int32_t niceness );
|
|
|
|
|
|
static int32_t getTopGigabits ( HashTableX *ht ,
|
|
GigabitInfo **top ,
|
|
int32_t max ,
|
|
int32_t minDocCount ) ;
|
|
|
|
static void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
|
|
int64_t wid1 ,
|
|
int64_t pid2 ,
|
|
int64_t wid2 , // post word
|
|
float *ww ,
|
|
HashTableX *tt1 ,
|
|
int32_t titleRecVersion ) ;
|
|
|
|
static bool addGigabit ( HashTableX *ht ,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int64_t docId ,
|
|
Section *sp ,
|
|
bool singleWord ,
|
|
uint8_t langId ,
|
|
// starts with word #i
|
|
int32_t i ,
|
|
int32_t ptsArg = -1 ) ;
|
|
|
|
static bool getWordPosVec ( Words *words ,
|
|
Sections *sections,
|
|
//int32_t wordStart,
|
|
//int32_t wordEnd,
|
|
int32_t startDist,
|
|
char *fragVec,
|
|
int32_t niceness ,
|
|
SafeBuf *wpos ) ;
|
|
|
|
static void getMetaListWrapper ( void *state ) ;
|
|
|
|
char *getFirstJSONObject ( char *p ,
|
|
int32_t niceness ,
|
|
bool *isProduct ,
|
|
bool *isImage ) ;
|
|
char *getJSONObjectEnd ( char *p , int32_t niceness ) ;
|
|
|
|
void doneReadingArchiveFileWrapper ( int fd, void *state );
|
|
|
|
XmlDoc::XmlDoc() {
|
|
m_readThreadOut = false;
|
|
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) m_msg7s[i] = NULL;
|
|
m_esbuf.setLabel("exputfbuf");
|
|
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
|
|
m_freed = false;
|
|
m_contentInjected = false;
|
|
m_wasContentInjected = false;
|
|
|
|
// warc parsing stuff
|
|
m_msg7 = NULL;
|
|
m_warcError = 0;
|
|
m_arcError = 0;
|
|
m_doneInjectingWarc = false;
|
|
m_numInjectionsOut = 0;
|
|
m_fptr = NULL;
|
|
m_fptrEnd = NULL;
|
|
m_fileBuf = NULL;
|
|
m_warcContentPtr = NULL;
|
|
m_calledWgetThread = false;
|
|
|
|
//m_coll = NULL;
|
|
m_ubuf = NULL;
|
|
m_pbuf = NULL;
|
|
//m_contactDoc = NULL;
|
|
m_rootDoc = NULL;
|
|
m_oldDoc = NULL;
|
|
m_dx = NULL;
|
|
m_printedMenu = false;
|
|
// reset all *valid* flags to false
|
|
void *p = &m_VALIDSTART;
|
|
void *pend = &m_VALIDEND;
|
|
memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p
|
|
m_msg22Request.m_inUse = 0;
|
|
m_msg4Waiting = false;
|
|
m_msg4Launched = false;
|
|
//m_sectiondbData = NULL;
|
|
//m_placedbData = NULL;
|
|
m_dupTrPtr = NULL;
|
|
m_oldTitleRec = NULL;
|
|
m_filteredContent = NULL;
|
|
m_filteredContentAllocSize = 0;
|
|
m_metaList = NULL;
|
|
m_metaListSize = 0;
|
|
m_metaListAllocSize = 0;
|
|
//m_titleRec = NULL;
|
|
//m_freeTitleRec = true;
|
|
m_rootTitleRec = NULL;
|
|
m_outlinkHopCountVector = NULL;
|
|
//m_gsbuf = NULL;
|
|
m_extraDoc = NULL;
|
|
m_ahrefsDoc = NULL;
|
|
m_wikiqbuf = NULL;
|
|
//m_cr = NULL;
|
|
//m_msg3aArray = NULL;
|
|
m_msg3a = NULL;
|
|
m_query3a = NULL;
|
|
//m_numMsg99Replies = 0;
|
|
m_numMsg95Replies = 0;
|
|
m_seoSocket = NULL;
|
|
m_hackSocket = NULL;
|
|
m_doingSEO = false;
|
|
//m_newxd = NULL;
|
|
//m_newxd2 = NULL;
|
|
//m_newMsg20 = NULL;
|
|
m_registeredSocketCallback = false;
|
|
//m_numMsg98Requests = 0;
|
|
//m_numMsg98Replies = 0;
|
|
m_numMsg8eReplies = 0;
|
|
m_numMsg8eRequests = 0;
|
|
m_tempMsg25Page = NULL;
|
|
m_tempMsg25Site = NULL;
|
|
m_numLinkRequestsOut = 0;
|
|
m_numLinkRequestsIn = 0;
|
|
m_numMsg3fReplies = 0;
|
|
m_numMsg3fRequests = 0;
|
|
m_numMsg4fRequests = 0;
|
|
m_numMsg4fReplies = 0;
|
|
m_sentMsg4fRequests = false;
|
|
|
|
//m_notifyBlocked = 0;
|
|
//m_mcasts = NULL;
|
|
//for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ )
|
|
// m_currentBinPtrs[i] = NULL;
|
|
m_registeredWgetReadCallback = false;
|
|
m_pipe = NULL;
|
|
reset();
|
|
};
|
|
|
|
XmlDoc::~XmlDoc() {
|
|
setStatus("freeing this xmldoc");
|
|
reset();
|
|
m_freed = true;
|
|
};
|
|
|
|
static int64_t s_lastTimeStart = 0LL;
|
|
|
|
// for debugging
|
|
class XmlDoc *g_xd;
|
|
|
|
void XmlDoc::reset ( ) {
|
|
|
|
m_oldDocExistedButHadError = false;
|
|
|
|
m_addedStatusDocId = 0;
|
|
|
|
if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
|
|
mfree ( m_diffbotProxyReply , sizeof(ProxyReply) , "dprox" );
|
|
m_diffbotProxyReply = NULL;
|
|
}
|
|
|
|
if ( m_readThreadOut )
|
|
log("build: deleting xmldoc class that has a read thread out "
|
|
"on a warc file");
|
|
|
|
if ( m_fileValid ) {
|
|
m_file.close();
|
|
m_file.unlink();
|
|
}
|
|
|
|
if ( m_fileBuf )
|
|
mfree ( m_fileBuf , m_fileBufAllocSize , "fbdd");
|
|
|
|
for ( int i = 0 ; i < MAXMSG7S ; i++ ) {
|
|
Msg7 *msg7 = m_msg7s[i];
|
|
if ( ! msg7 ) continue;
|
|
if(msg7->m_inUse) {
|
|
log("build: archive: resetting xmldoc when msg7s are outstanding");
|
|
|
|
}
|
|
mdelete ( msg7 , sizeof(Msg7) , "xdmsg7" );
|
|
delete ( msg7 );
|
|
m_msg7s[i] = NULL;
|
|
}
|
|
|
|
if ( m_msg7 ) {
|
|
mdelete ( m_msg7 , sizeof(Msg7) , "xdmsg7" );
|
|
delete ( m_msg7 );
|
|
m_msg7 = NULL;
|
|
}
|
|
m_warcContentPtr = NULL;
|
|
m_arcContentPtr = NULL;
|
|
m_anyContentPtr = NULL;
|
|
m_savedChar = '\0';
|
|
m_contentDelim = NULL;
|
|
|
|
if(m_registeredWgetReadCallback && m_pipe) {
|
|
log("build:came back from sleep callback");
|
|
g_loop.unregisterReadCallback( fileno(m_pipe), this,doneReadingArchiveFileWrapper);
|
|
m_registeredWgetReadCallback = false;
|
|
}
|
|
|
|
if(m_pipe) {
|
|
int32_t retCode = fclose(m_pipe);
|
|
log("we closed the warc pipe on reset with error %s", mstrerror(retCode));
|
|
m_pipe = NULL;
|
|
}
|
|
|
|
|
|
|
|
m_redirUrl.reset();
|
|
|
|
m_updatedMetaData = false;
|
|
|
|
m_ipStartTime = 0;
|
|
m_ipEndTime = 0;
|
|
m_diffbotReplyRetries = 0;
|
|
|
|
m_isImporting = false;
|
|
|
|
m_printedMenu = false;
|
|
|
|
// for hashing CT_STATUS docs consistently, this might be invalid
|
|
// so call it 0
|
|
m_pubDate = 0;
|
|
|
|
m_tmpBuf2.purge();
|
|
m_gotFacets = false;
|
|
|
|
m_bodyStartPos = 0;
|
|
|
|
m_mcastArray = NULL;
|
|
|
|
m_skipIframeExpansion = false;
|
|
m_indexedTime = 0;
|
|
|
|
m_didDelete = false;
|
|
|
|
m_metaList2.purge();
|
|
m_zbuf.purge();
|
|
m_kbuf.purge();
|
|
|
|
m_mySiteLinkInfoBuf.purge();
|
|
m_myPageLinkInfoBuf.purge();
|
|
m_myTempLinkInfoBuf.purge();
|
|
|
|
// reset count for nukeJSONObjects() function
|
|
m_joc = 0;
|
|
|
|
// notifications pending?
|
|
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
|
|
|
m_sentToDiffbot = 0;
|
|
m_gotDiffbotSuccessfulReply = 0;
|
|
// we need to reset this to false
|
|
m_useTimeAxis = false;
|
|
|
|
m_sentToDiffbotThisTime = false;
|
|
|
|
m_loaded = false;
|
|
|
|
m_msg4Launched = false;
|
|
|
|
m_diffbotReplyError = 0;
|
|
m_diffbotJSONCount = 0;
|
|
//m_downloadAttempted = false;
|
|
m_incrementedAttemptsCount = false;
|
|
m_incrementedDownloadCount = false;
|
|
|
|
if ( m_dx ) {
|
|
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
|
|
delete ( m_dx );
|
|
m_dx = NULL;
|
|
//log("diffbot: deleting m_dx2");
|
|
}
|
|
|
|
m_isDiffbotJSONObject = false;
|
|
|
|
m_dmozBuf.purge();
|
|
m_fakeIpBuf.purge();
|
|
m_fakeTagRecPtrBuf.purge();
|
|
|
|
m_tlbufTimer = 0LL;
|
|
m_gsbuf.reset();
|
|
|
|
//m_launchedAll = false;
|
|
|
|
m_qstringTable.reset();
|
|
|
|
//m_setForReplyPtrs = false;
|
|
//m_setForLinkPtrs = false;
|
|
|
|
// must be none outstanding
|
|
if ( m_numMsg3fReplies != m_numMsg3fRequests ) { char *xx=NULL;*xx=0;}
|
|
if ( m_numMsg4fReplies != m_numMsg4fRequests ) { char *xx=NULL;*xx=0;}
|
|
|
|
m_numMsg4fRequests = 0;
|
|
m_numMsg4fReplies = 0;
|
|
m_sentMsg4fRequests = false;
|
|
|
|
// free table's mem if used
|
|
//m_tmpDupTable.reset();
|
|
|
|
//m_newxd2Blocked = false;
|
|
|
|
m_lastPrintedDocId = 0LL;
|
|
|
|
m_loggedMsg3 = false;
|
|
|
|
m_progressBar = 0;
|
|
|
|
m_triedToAddWordPosInfoToCachedb = false;
|
|
|
|
if ( m_numLinkRequestsOut > m_numLinkRequestsIn ){char *xx=NULL;*xx=0;}
|
|
|
|
m_doConsistencyTesting = g_conf.m_doConsistencyTesting;
|
|
|
|
m_computedMetaListCheckSum = false;
|
|
|
|
m_msg3aErrno = 0;
|
|
|
|
m_hadMatchError = 0;
|
|
m_clientClosed = false;
|
|
m_lastCheckTime = 0;
|
|
|
|
m_calledMsg25ForSite = false;
|
|
m_calledMsg25ForPage = false;
|
|
m_checkedCachedbForSite = false;
|
|
m_checkedCachedbForPage = false;
|
|
m_allHashed = false;
|
|
|
|
// nuke it
|
|
if ( m_tempMsg25Page ) {
|
|
mdelete ( m_tempMsg25Page , sizeof(Msg25), "m25li" );
|
|
delete ( m_tempMsg25Page );
|
|
m_tempMsg25Page = NULL;
|
|
}
|
|
|
|
if ( m_tempMsg25Site ) {
|
|
mdelete ( m_tempMsg25Site , sizeof(Msg25), "m25li" );
|
|
delete ( m_tempMsg25Site );
|
|
m_tempMsg25Site = NULL;
|
|
}
|
|
|
|
m_numLinkRequestsOut = 0;
|
|
m_seoDebug = 0;
|
|
//m_seoInfoSetFromCache = false;
|
|
m_checkedCachedb = false;
|
|
m_processedCachedbReply = false;
|
|
m_cacheList.freeList();
|
|
|
|
for ( int32_t i = 0; m_numMsg8eReplies && i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg8eReply[i] ) continue;
|
|
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
|
|
m_msg8eReply[i] = NULL;
|
|
}
|
|
m_numMsg8eRequests = 0;
|
|
m_numMsg8eReplies = 0;
|
|
|
|
|
|
for ( int32_t i = 0; m_numMsg95Replies && i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg95ReplyPtrs[i] ) continue;
|
|
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
|
|
m_msg95ReplyPtrs[i] = NULL;
|
|
}
|
|
m_numMsg95Replies = 0;
|
|
|
|
|
|
m_numMsg3fRequests = 0;
|
|
m_numMsg3fReplies = 0;
|
|
m_qcursor = 0;
|
|
//m_binError = 0;
|
|
//m_msg98ReplyError = 0;
|
|
//m_binErrorForReplyPtrs = 0;
|
|
//m_binErrorForLinkPtrs = 0;
|
|
|
|
//m_msg17.reset();
|
|
//m_triedCache = false;
|
|
//m_cacheRec = NULL;
|
|
//m_cacheRecSize = 0;
|
|
|
|
// reset this crap
|
|
m_beginTimeAllMatch = 0LL;
|
|
m_beginTimeMatchUrl = 0LL;
|
|
m_beginTimeFullQueries = 0LL;
|
|
m_beginTimeLinks = 0LL;
|
|
//m_beginMsg98s = 0LL;
|
|
m_beginRelatedQueries = 0LL;
|
|
|
|
m_doledbKey.n0 = 0LL;
|
|
m_doledbKey.n1 = 0;
|
|
|
|
// sanity check, any outstanding?
|
|
//if( m_numMsg98Requests != m_numMsg98Replies ) { char *xx=NULL;*xx=0;}
|
|
// reset them now
|
|
//m_numMsg98Requests = 0;
|
|
//m_numMsg98Replies = 0;
|
|
|
|
//if ( m_newxd ) {
|
|
// mdelete ( m_newxd , sizeof(XmlDoc),"newxd");
|
|
// delete ( m_newxd );
|
|
// m_newxd = NULL;
|
|
//}
|
|
|
|
//if ( m_newxd2 ) {
|
|
// mdelete ( m_newxd2 , sizeof(XmlDoc),"newxd2");
|
|
// delete ( m_newxd2 );
|
|
// m_newxd2 = NULL;
|
|
//}
|
|
|
|
/*
|
|
if ( m_newMsg20 ) {
|
|
mdelete ( m_newMsg20 , sizeof(Msg20),"newmsg20");
|
|
delete ( m_newMsg20 );
|
|
m_newMsg20 = NULL;
|
|
}*/
|
|
/*
|
|
|
|
NO! we use this for clientClosedConnection() function now
|
|
|
|
if ( m_seoSocket ) {
|
|
TcpServer *tcp = m_seoSocket->m_this;
|
|
// gotta set this so it can be destroyed and closed
|
|
m_seoSocket->m_waitingOnHandler = false;
|
|
tcp->destroySocket ( m_seoSocket );
|
|
m_seoSocket = NULL;
|
|
}
|
|
*/
|
|
if ( m_registeredSocketCallback ) { char *xx=NULL; *xx=0; }
|
|
//for ( int32_t i = 0 ; i < m_numMsg99Replies ; i++ ) {
|
|
// if ( ! m_msg99ReplyPtrs[i] ) continue;
|
|
// mfree ( m_msg99ReplyPtrs [i] ,
|
|
// m_msg99ReplyAlloc[i] ,
|
|
// "m99reply" );
|
|
//}
|
|
//m_numMsg99Replies = 0;
|
|
//m_sentMsg99Requests = false;
|
|
|
|
|
|
if ( m_msg3a ) {
|
|
mdelete ( m_msg3a , sizeof(Msg3a) , "xdmsg3a" );
|
|
delete ( m_msg3a );
|
|
m_msg3a = NULL;
|
|
}
|
|
|
|
if ( m_query3a ) {
|
|
mdelete ( m_query3a , sizeof(Query),"xdqry3a");
|
|
delete ( m_query3a );
|
|
m_query3a = NULL;
|
|
}
|
|
|
|
m_surroundingTextBuf.purge();
|
|
m_rssItemBuf.purge();
|
|
//m_twbuf.purge();
|
|
m_topMatchingQueryBuf.purge();
|
|
//m_queryPtrs.purge();
|
|
m_queryOffsets.purge();
|
|
m_extraQueryBuf.purge();
|
|
//m_socketWriteBuf.purge();
|
|
m_relatedDocIdBuf.purge();
|
|
m_relatedTitleBuf.purge();
|
|
m_commonQueryNumBuf.purge();
|
|
m_queryLinkBuf.purge();
|
|
//m_relatedQueryLinksIntersected.purge();
|
|
m_queryLinkStringBuf.purge();
|
|
//m_queryRelBuf.purge();
|
|
//m_relPtrs.purge();
|
|
m_sortedPosdbListBuf.purge();
|
|
m_wpSortedPosdbListBuf.purge();
|
|
m_termListBuf.purge();
|
|
m_insertableTermsBuf.purge();
|
|
//m_iwfiBuf.purge();
|
|
m_wordPosInfoBuf.purge();
|
|
//m_msg20ReplyPtrBuf.purge();
|
|
m_recommendedLinksBuf.purge();
|
|
m_tmpMsg0Buf.purge();
|
|
m_msg20Array.purge();
|
|
m_newLinkerBuf.purge();
|
|
|
|
//m_msg99ReplyBuf.purge();
|
|
m_matchingQueryBuf.purge();
|
|
m_relatedQueryBuf.purge();
|
|
m_queryLinkBuf.purge();
|
|
m_matchingQueryStringBuf.purge();
|
|
m_relatedQueryStringBuf.purge();
|
|
m_queryLinkStringBuf.purge();
|
|
m_docIdListBuf.purge();
|
|
|
|
m_queryChangeBuf.purge();
|
|
m_queryLogBuf.purge();
|
|
//m_itStrBuf.purge();
|
|
m_debugScoreInfoBuf.purge();
|
|
m_origScoreInfoBuf.purge();
|
|
m_msg20Buf.purge();
|
|
m_topDocIdsBuf.purge();
|
|
m_missingTermBuf.purge();
|
|
m_termInfoBuf.purge();
|
|
m_newTermInfoBuf.purge();
|
|
m_matchingTermBuf.purge();
|
|
m_termId32Buf.purge();
|
|
m_storeList.freeList();
|
|
|
|
//m_queryHashTable.reset();
|
|
m_tidTable32.reset();
|
|
m_queryOffsetTable.reset();
|
|
m_tmpTable.reset();
|
|
m_fullQueryDedup.reset();
|
|
//m_dupVotes.reset();
|
|
|
|
m_wordSpamBuf.purge();
|
|
m_fragBuf.purge();
|
|
|
|
m_downloadLevel = 0;
|
|
|
|
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) {
|
|
if ( ! m_xmlDocs[i] ) continue;
|
|
mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
|
|
delete ( m_xmlDocs[i] );
|
|
m_xmlDocs[i] = NULL;
|
|
}
|
|
|
|
s_lastTimeStart = 0LL;
|
|
|
|
m_req = NULL;
|
|
|
|
m_doneWithAhrefs = false;
|
|
m_useAhrefs = false;
|
|
m_linkDedupTablePtr = NULL;
|
|
m_domDedupTablePtr = NULL;
|
|
|
|
m_storeTermListInfo = false;
|
|
m_gotDupStats = false;
|
|
//m_nextSection = (Section *)-1;
|
|
m_si = (Section *)-1;
|
|
|
|
// for limiting # of iframe tag expansions
|
|
m_numExpansions = 0;
|
|
|
|
// . are not allowed to exit if waiting for msg4 to complete
|
|
// . yes we are, it should be saved as addsinprogress.dat
|
|
if ( m_msg4Waiting ) {
|
|
log("doc: resetting xmldoc with outstanding msg4. should "
|
|
"me saved in addsinprogress.dat. docid=%" UINT64 "",m_docId);
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
m_ei = 0;
|
|
m_lastLaunch = -1;
|
|
|
|
m_pbuf = NULL;
|
|
m_wts = NULL;
|
|
|
|
m_deleteFromIndex = false;
|
|
|
|
//if ( m_contactDocValid ) nukeDoc ( m_contactDoc );
|
|
if ( m_rootDocValid ) nukeDoc ( m_rootDoc );
|
|
if ( m_oldDocValid ) nukeDoc ( m_oldDoc );
|
|
if ( m_extraDocValid ) nukeDoc ( m_extraDoc );
|
|
if ( m_ahrefsDocValid ) nukeDoc ( m_ahrefsDoc );
|
|
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 && m_freeLinkInfo1 ) {
|
|
// it now points into m_myPageLinkInfoBuf !
|
|
//mfree ( ptr_linkInfo1 , size_linkInfo1, "LinkInfo1");
|
|
ptr_linkInfo1 = NULL;
|
|
m_linkInfo1Valid = false;
|
|
}
|
|
if ( m_linkInfo2Valid && ptr_linkInfo2 && m_freeLinkInfo2 ) {
|
|
// should point into a safebuf as well
|
|
//mfree ( ptr_linkInfo2 , size_linkInfo2, "LinkInfo2");
|
|
ptr_linkInfo2 = NULL;
|
|
m_linkInfo2Valid = false;
|
|
}
|
|
if ( m_rawUtf8ContentValid && m_rawUtf8Content && !m_setFromTitleRec
|
|
// was content supplied by pageInject.cpp?
|
|
//! m_contentInjected ) {
|
|
) {
|
|
mfree ( m_rawUtf8Content, m_rawUtf8ContentAllocSize,"Xml3");
|
|
}
|
|
|
|
// reset this
|
|
m_contentInjected = false;
|
|
m_rawUtf8ContentValid = false;
|
|
m_wasContentInjected = false;
|
|
|
|
m_rootDoc = NULL;
|
|
|
|
// if this is true, then only index if new
|
|
m_newOnly = 0;
|
|
|
|
//if ( m_sectiondbData ) {
|
|
// mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" );
|
|
// m_sectiondbData = NULL;
|
|
//}
|
|
|
|
//if ( m_placedbData ) {
|
|
// mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" );
|
|
// m_placedbData = NULL;
|
|
//}
|
|
|
|
if ( m_httpReplyValid && m_httpReply ) {
|
|
mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
|
|
m_httpReply = NULL;
|
|
m_httpReplyValid = false;
|
|
}
|
|
|
|
if ( m_filteredContentAllocSize ) {
|
|
mfree (m_filteredContent,m_filteredContentAllocSize,"xdfc");
|
|
m_filteredContent = NULL;
|
|
m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
//if ( m_utf8ContentValid && ! m_setFromTitleRec && ptr_utf8Content )
|
|
// mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3");
|
|
|
|
|
|
if ( m_metaList ) { // m_metaListValid && m_metaList ) {
|
|
mfree ( m_metaList , m_metaListAllocSize , "metalist");
|
|
m_metaList = NULL;
|
|
m_metaListSize = 0;
|
|
m_metaListAllocSize = 0;
|
|
}
|
|
|
|
if ( m_ubuf ) {
|
|
// log("xmldoc: delete m_ubuf=%"PTRFMT" this=%"PTRFMT
|
|
// , (PTRTYPE) m_ubuf
|
|
// , (PTRTYPE) this
|
|
// );
|
|
mfree ( m_ubuf , m_ubufAlloc , "ubuf");
|
|
m_ubuf = NULL;
|
|
}
|
|
|
|
//if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) {
|
|
// mfree ( m_titleRec , m_titleRecAllocSize , "trec" );
|
|
//}
|
|
//m_titleRec = NULL;
|
|
m_titleRecBuf.purge();
|
|
|
|
if ( m_dupTrPtr ) {
|
|
mfree ( m_dupTrPtr , m_dupTrSize , "trecd" );
|
|
m_dupTrPtr = NULL;
|
|
}
|
|
|
|
if ( m_oldTitleRecValid && m_oldTitleRec ) {
|
|
mfree ( m_oldTitleRec , m_oldTitleRecSize , "treca" );
|
|
m_oldTitleRec = NULL;
|
|
m_oldTitleRecValid = false;
|
|
}
|
|
|
|
if ( m_rootTitleRecValid && m_rootTitleRec ) {
|
|
mfree ( m_rootTitleRec , m_rootTitleRecSize , "treca" );
|
|
m_rootTitleRec = NULL;
|
|
m_rootTitleRecValid = false;
|
|
}
|
|
|
|
|
|
if ( m_outlinkHopCountVectorValid && m_outlinkHopCountVector ) {
|
|
int32_t sz = m_outlinkHopCountVectorSize;
|
|
mfree ( m_outlinkHopCountVector,sz,"ohv");
|
|
}
|
|
m_outlinkHopCountVector = NULL;
|
|
|
|
//if ( m_gsbufValid && m_gsbuf ) {
|
|
// mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" );
|
|
//}
|
|
//m_gsbuf = NULL;
|
|
m_gsbuf.reset();
|
|
|
|
|
|
// reset all *valid* flags to false
|
|
void *p = &m_VALIDSTART;
|
|
void *pend = &m_VALIDEND;
|
|
memset ( p , 0 , (char *)pend - (char *)p );
|
|
|
|
m_hashedMetas = false;
|
|
|
|
m_mcastBuf.purge();
|
|
m_serpBuf.purge();
|
|
|
|
// Doc.cpp:
|
|
m_mime.reset();
|
|
m_words.reset();
|
|
m_phrases.reset();
|
|
m_bits.reset();
|
|
m_sections.reset();
|
|
//m_weights.reset();
|
|
m_countTable.reset();
|
|
m_dates.reset();
|
|
m_addresses.reset();
|
|
|
|
// other crap
|
|
m_xml.reset();
|
|
m_links.reset();
|
|
m_bits2.reset();
|
|
m_pos.reset();
|
|
//m_synonyms.reset();
|
|
m_synBuf.reset();
|
|
//m_nsvt.reset();
|
|
//m_osvt.reset();
|
|
m_turkVotingTable.reset();
|
|
m_turkBitsTable.reset();
|
|
m_vtr.reset();
|
|
m_rdtab.reset();
|
|
m_vctab.reset();
|
|
m_vcduptab.reset();
|
|
m_images.reset();
|
|
m_countTable.reset();
|
|
m_mime.reset();
|
|
m_tagRec.reset();
|
|
m_newTagBuf.reset();
|
|
m_catRec.reset();
|
|
//m_clockCandidatesTable.reset();
|
|
//m_cctbuf.reset();
|
|
m_dupList.reset();
|
|
//m_oldMetaList.reset();
|
|
m_msg8a.reset();
|
|
//m_siteLinkInfo.reset();
|
|
//m_msg25.reset();
|
|
//m_msgc.reset();
|
|
m_msg13.reset();
|
|
m_tmpsb1.reset();
|
|
m_tmpsb2.reset();
|
|
m_turkBuf.reset();
|
|
m_msg0b.reset();
|
|
//m_siteGetter.reset();
|
|
m_msge0.reset();
|
|
m_msge1.reset();
|
|
m_reply.reset();
|
|
// more stuff skipped
|
|
|
|
m_wtsTable.reset();
|
|
m_wbuf.reset();
|
|
m_pageLinkBuf.reset();
|
|
m_siteLinkBuf.reset();
|
|
m_esbuf.reset();
|
|
m_xbuf.reset();
|
|
m_tagRecBuf.reset();
|
|
|
|
//m_titleRec = NULL;
|
|
//m_titleRecSize = 0;
|
|
|
|
// origin of this XmlDoc
|
|
m_setFromTitleRec = false;
|
|
m_setFromUrl = false;
|
|
m_setFromDocId = false;
|
|
m_setFromSpiderRec = false;
|
|
m_freeLinkInfo1 = false;
|
|
m_freeLinkInfo2 = false;
|
|
|
|
m_checkedUrlFilters = false;
|
|
|
|
m_indexCode = 0;
|
|
m_masterLoop = NULL;
|
|
m_masterState = NULL;
|
|
|
|
//m_isAddUrl = false;
|
|
m_isInjecting = false;
|
|
m_useFakeMime = false;
|
|
m_useSiteLinkBuf = false;
|
|
m_usePageLinkBuf = false;
|
|
m_printInXml = false;
|
|
|
|
m_check1 = false;
|
|
m_check2 = false;
|
|
m_prepared = false;
|
|
|
|
// keep track of updates to the rdbs we have done, so we do not re-do
|
|
m_listAdded = false;
|
|
m_listFlushed = false;
|
|
m_updatedCounts = false;
|
|
m_updatedCounts2 = false;
|
|
//m_updatedTagdb1 = false;
|
|
//m_updatedTagdb2 = false;
|
|
//m_updatedTagdb3 = false;
|
|
//m_updatedTagdb4 = false;
|
|
//m_updatedTagdb5 = false;
|
|
m_copied1 = false;
|
|
m_updatingSiteLinkInfoTags = false;
|
|
m_addressSetCalled = false;
|
|
m_hashedTitle = false;
|
|
|
|
m_registeredSleepCallback = false;
|
|
m_addedNegativeDoledbRec = false;
|
|
|
|
m_numRedirects = 0;
|
|
m_numOutlinksAdded = 0;
|
|
// . use sameDomain and sameIp waits?
|
|
// . these may be bypassed in getContactDoc()
|
|
//m_throttleDownload = true;
|
|
m_spamCheckDisabled = false;
|
|
m_useRobotsTxt = true;
|
|
m_redirectFlag = false;
|
|
|
|
// Scraper.cpp sets this to true
|
|
//m_isScraping = false;
|
|
|
|
m_allowSimplifiedRedirs = false;
|
|
|
|
//m_calledMsg22a = false;
|
|
//m_calledMsg22b = false;
|
|
//m_calledMsg22c = false;
|
|
m_didDelay = false;
|
|
m_didDelayUnregister = false;
|
|
m_calledMsg22d = 0LL;
|
|
m_calledMsg22e = false;
|
|
m_calledMsg22f = false;
|
|
m_calledMsg25 = false;
|
|
m_calledMsg25b = false;
|
|
m_calledMsg40 = false;
|
|
m_calledSections = false;
|
|
m_calledThread = false;
|
|
m_alreadyRegistered = false;
|
|
m_loaded = false;
|
|
m_firstEntry = true;
|
|
m_firstEntry2 = true;
|
|
m_launchedSpecialMsg8a = false;
|
|
m_launchedMsg8a2 = false;
|
|
|
|
m_numSectiondbReads = 0;
|
|
m_numSectiondbNeeds = 0;
|
|
m_sectiondbRecall = 0;
|
|
|
|
//m_triedVoteCache = false;
|
|
//m_storedVoteCache = false;
|
|
|
|
m_setTr = false;
|
|
//m_checkedRobots = false;
|
|
m_triedTagRec = false;
|
|
m_didGatewayPage = false;
|
|
m_didQuickDupCheck = false;
|
|
m_calledMsg8b = false;
|
|
|
|
m_recycleContent = false;
|
|
//m_loadFromOldTitleRec = false;
|
|
m_callback1 = NULL;
|
|
m_callback2 = NULL;
|
|
m_state = NULL;
|
|
|
|
// used for getHasContactInfo()
|
|
m_processed0 = false;
|
|
m_hasContactInfo = false;
|
|
m_hasContactInfo2 = false;
|
|
|
|
|
|
//m_checkForRedir = true;
|
|
|
|
m_processedLang = false;
|
|
|
|
m_doingConsistencyCheck = false;
|
|
|
|
// used for getting contact info
|
|
//m_triedRoot = false;
|
|
//m_winner = -2;
|
|
|
|
// tell Msg13 to just call HttpServer::getDoc() and not to forward
|
|
// the download request to another host. although this does not
|
|
// exclude possible forwarding it to a compression proxy if
|
|
// g_conf.m_useCompressionProxy is set
|
|
m_forwardDownloadRequest = false;
|
|
|
|
m_isChildDoc = false;
|
|
m_parentDocPtr = NULL;
|
|
|
|
// for utf8 content functions
|
|
m_savedp = NULL;
|
|
m_oldp = NULL;
|
|
m_didExpansion = false;
|
|
|
|
// Repair.cpp now explicitly sets these to false if needs to
|
|
m_usePosdb = true;
|
|
//m_useDatedb = true;
|
|
m_useClusterdb = true;
|
|
m_useLinkdb = true;
|
|
m_useSpiderdb = true;
|
|
m_useTitledb = true;
|
|
m_useTagdb = true;
|
|
m_usePlacedb = true;
|
|
//m_useTimedb = true;
|
|
// only use for custom crawls for now to save disk space
|
|
m_useSectiondb = false;
|
|
//m_useRevdb = true;
|
|
m_useSecondaryRdbs = false;
|
|
|
|
//m_useIpsTxtFile = true;
|
|
|
|
// used by Msg13.cpp only. kinda a hack.
|
|
m_isSpiderProxy = false;
|
|
|
|
// do not cache the http reply in msg13 etc.
|
|
m_maxCacheAge = 0;
|
|
|
|
// reset these ptrs too!
|
|
void *px = &ptr_firstUrl;
|
|
void *pxend = &size_firstUrl;
|
|
memset ( px , 0 , (char *)pxend - (char *)px );
|
|
|
|
m_hasMetadata = false;
|
|
ptr_metadata = NULL;
|
|
size_metadata = 0;
|
|
}
|
|
|
|
// . set the url with the intention of adding it or deleting it from the index
|
|
// . Msg7 and Repair.cpp can also set other members of XmlDoc rather than just
|
|
// m_firstUrl. they can provide the ip, the http reply, content, filtered
|
|
// content, the forced next spider time and the forced first indexed date,
|
|
// the hop count
|
|
// . they might also want to skip deduping, or any algo deemed unnecessary
|
|
// by setting, for instance, m_isDupValid = true, or something
|
|
bool XmlDoc::set1 ( char *url ,
|
|
char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ) {
|
|
|
|
reset();
|
|
|
|
// this is true
|
|
m_setFromUrl = true;
|
|
|
|
//m_coll = coll;
|
|
m_pbuf = pbuf;
|
|
m_niceness = niceness;
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
m_versionValid = true;
|
|
|
|
// sanity check
|
|
if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// copy this in case collection gets deleted i guess...
|
|
//m_forceDelete = forceDelete;
|
|
// did we get this url from PageAddUrl?
|
|
//m_isAddUrl = isAddUrl;
|
|
// set m_indexCode so that XmlDoc::indexDoc() will delete it
|
|
//if ( forceDelete ) m_indexCode = EDOCFORCEDELETE;
|
|
|
|
// set this important member var
|
|
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
|
|
//if ( ! cr ) return false;
|
|
if ( ! setCollNum ( coll ) ) return false;
|
|
|
|
setFirstUrl ( url , false );
|
|
|
|
//setSpideredTime();
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
char *XmlDoc::getTestDir ( ) {
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// return NULL if we are not the "qatest123" collection
|
|
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
|
|
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
|
|
// then return "test-spider" otherwise...
|
|
//if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
|
|
// return "qa";//"test-spider";
|
|
// ... default to "test-parser"
|
|
//return "test-parser";
|
|
return "qa";
|
|
/*
|
|
if ( getIsPageParser() )
|
|
return "test-page-parser";
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting )
|
|
// return "test-page-inject";
|
|
else if ( g_conf.m_testParserEnabled )
|
|
return "test-parser";
|
|
else if ( g_conf.m_testSpiderEnabled )
|
|
return "test-spider";
|
|
// default to being from PageInject
|
|
return "test-page-inject";
|
|
*/
|
|
//else { char *xx=NULL;*xx=0; }
|
|
//return NULL;
|
|
}
|
|
|
|
int32_t XmlDoc::getSpideredTime ( ) {
|
|
// stop if already set
|
|
if ( m_spideredTimeValid ) return m_spideredTime;
|
|
|
|
// tmp var
|
|
int32_t date = 0;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return 0;
|
|
|
|
// if not test collection keep it simple
|
|
if ( strcmp(cr->m_coll,"qatest123") || cr->m_useTimeAxis) {
|
|
// . set spider time to current time
|
|
// . this might already be valid if we set it in
|
|
// getTestSpideredDate()
|
|
m_spideredTime = getTimeGlobal();
|
|
m_spideredTimeValid = true;
|
|
return m_spideredTime;
|
|
}
|
|
|
|
char *testDir = getTestDir();
|
|
|
|
// get url
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this returns false if not in there, in which case, add it
|
|
if ( ! getTestSpideredDate(cu,&date,testDir) ) {
|
|
m_spideredTime = getTimeGlobal();
|
|
m_spideredTimeValid = true;
|
|
addTestSpideredDate ( cu , m_spideredTime , testDir );
|
|
return m_spideredTime;
|
|
}
|
|
|
|
// if we are injecting into the test coll for the 2nd+ time
|
|
// we need to use the spidered date from the first time we
|
|
// injected the doc in order to ensure things are parsed
|
|
// exactly the same way since some things depend on the
|
|
// spideredTime, like Dates (for setting "in future"
|
|
// flags)
|
|
m_spideredTimeValid = true;
|
|
m_spideredTime = date;
|
|
// hack for test coll which has fake vals for these because
|
|
// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
|
|
//m_minPubDate = m_spideredTime - 48*3600;
|
|
//m_maxPubDate = m_spideredTime - 24*3600;
|
|
|
|
return m_spideredTime;
|
|
}
|
|
|
|
// . we need this so PageGet.cpp can get the cached web page
|
|
// . but not for Msg20::getSummary(), that uses XmlDoc::set(Msg20Request*)
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::set3 ( int64_t docId ,
|
|
char *coll ,
|
|
int32_t niceness ) {
|
|
|
|
reset();
|
|
|
|
// this is true
|
|
m_setFromDocId = true;
|
|
|
|
m_docId = docId;
|
|
m_docIdValid = true;
|
|
//m_coll = coll;
|
|
m_niceness = niceness;
|
|
|
|
// . sanity check
|
|
// . why can't we allow this??? MDW
|
|
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// set this important member var
|
|
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
|
|
//if ( ! cr ) { m_errno = ENOCOLLREC; return false; }
|
|
if ( ! setCollNum ( coll ) ) return false;
|
|
|
|
// solidify some parms
|
|
//m_eliminateMenus = cr->m_eliminateMenus;
|
|
//m_eliminateMenusValid = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
void loadFromOldTitleRecWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "loading from old title rec wrapper" );
|
|
// return if it blocked
|
|
if ( ! THIS->loadFromOldTitleRec ( ) ) return;
|
|
|
|
char *coll = "";
|
|
CollectionRec *cr = THIS->getCollRec();
|
|
if ( cr ) coll = cr->m_coll;
|
|
|
|
// error?
|
|
if ( g_errno ) log("doc: loadfromtitlerec coll=%s: %s",
|
|
coll,
|
|
mstrerror(g_errno));
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// returns false if blocked, returns true and sets g_errno on error otherwise
|
|
bool XmlDoc::loadFromOldTitleRec ( ) {
|
|
// . we are an entry point.
|
|
// . if anything blocks, this will be called when it comes back
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = loadFromOldTitleRecWrapper;
|
|
m_masterState = this;
|
|
}
|
|
// if we already loaded!
|
|
if ( m_loaded ) return true;
|
|
// if set from a docid, use msg22 for this!
|
|
char **otr = getOldTitleRec ( );
|
|
// error?
|
|
if ( ! otr ) return true;
|
|
// blocked?
|
|
if ( otr == (void *)-1 ) return false;
|
|
// this is a not found
|
|
if ( ! *otr ) {
|
|
// so we do not retry
|
|
m_loaded = true;
|
|
// make it an error
|
|
g_errno = ENOTFOUND;
|
|
return true;
|
|
}
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
// use that. decompress it! this will also set
|
|
// m_setFromTitleRec to true
|
|
if ( ! set2 ( m_oldTitleRec ,
|
|
m_oldTitleRecSize , // maxSize
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness )) {
|
|
// we are now loaded, do not re-call
|
|
m_loaded = true;
|
|
// return true with g_errno set on error uncompressing
|
|
return true;
|
|
}
|
|
// we are now loaded, do not re-call
|
|
m_loaded = true;
|
|
// sanity check
|
|
if ( ! m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
|
|
// good to go
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::setCollNum ( char *coll ) {
|
|
CollectionRec *cr;
|
|
cr = g_collectiondb.getRec ( coll , gbstrlen(coll) );
|
|
if ( ! cr ) {
|
|
g_errno = ENOCOLLREC;
|
|
return log("build: collrec not found for %s",coll);
|
|
}
|
|
// we can store this safely:
|
|
m_collnum = cr->m_collnum;
|
|
m_collnumValid = true;
|
|
// if user "resets" the collection we need to know
|
|
m_lastCollRecResetCount = cr->m_lastResetCount;
|
|
return true;
|
|
}
|
|
|
|
CollectionRec *XmlDoc::getCollRec ( ) {
|
|
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
|
|
if ( ! cr ) {
|
|
log("build: got NULL collection rec for collnum=%" INT32 ".",
|
|
(int32_t)m_collnum);
|
|
g_errno = ENOCOLLREC;
|
|
return NULL;
|
|
}
|
|
// was it reset since we started spidering this url?
|
|
// we don't do it this way, when resetting a coll when delete it and
|
|
// re-add under a different collnum to avoid getting msg4 adds to it.
|
|
//if ( cr->m_lastResetCount != m_lastCollRecResetCount ) {
|
|
// log("build: collection rec was reset. returning null.");
|
|
// g_errno = ENOCOLLREC;
|
|
// return NULL;
|
|
//}
|
|
return cr;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
|
key_t *doledbKey ,
|
|
char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ,
|
|
char *utf8ContentArg ,
|
|
bool deleteFromIndex ,
|
|
int32_t forcedIp ,
|
|
uint8_t contentType ,
|
|
uint32_t spideredTime ,
|
|
bool contentHasMimeArg ,
|
|
char *contentDelim,
|
|
char *metadata ,
|
|
uint32_t metadataLen,
|
|
int32_t payloadLen
|
|
) {
|
|
|
|
// sanity check
|
|
if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
reset();
|
|
|
|
if ( g_conf.m_logDebugSpider )
|
|
log("xmldoc: set4 uh48=%" UINT64 " parentdocid=%" UINT64 "",
|
|
sreq->getUrlHash48(),sreq->getParentDocId());
|
|
|
|
// used by PageSpiderdb.cpp
|
|
m_startTime = gettimeofdayInMilliseconds();
|
|
m_startTimeValid = true;
|
|
|
|
// this is true
|
|
m_setFromSpiderRec = true;
|
|
|
|
// did page inject (pageinject) request to delete it?
|
|
m_deleteFromIndex = deleteFromIndex;
|
|
|
|
// PageReindex.cpp will set this in the spider request
|
|
if ( sreq->m_forceDelete )
|
|
m_deleteFromIndex = true;
|
|
|
|
// if we are a container doc then we need the content delimeter,
|
|
// unless if we are a warc or arc, then we know how those delimit
|
|
// already.
|
|
m_contentDelim = contentDelim;
|
|
m_contentDelimValid = true;
|
|
|
|
bool contentHasMime = contentHasMimeArg;
|
|
// but if we are a container doc then this parm applies to each subdoc
|
|
// not to us, so turn it off for this part.
|
|
if ( isContainerDoc() ) {
|
|
contentHasMime = false;
|
|
m_subDocsHaveMime = contentHasMimeArg;
|
|
}
|
|
|
|
|
|
char *utf8Content = utf8ContentArg;
|
|
|
|
if ( contentHasMime && utf8Content ) {
|
|
// get length of it all
|
|
int32_t clen = gbstrlen(utf8Content);
|
|
// return true on error with g_errno set
|
|
if ( ! m_mime.set ( utf8ContentArg , clen , NULL ) ) {
|
|
if ( ! g_errno ) g_errno = EBADMIME;
|
|
log("xmldoc: could not set mime: %s",
|
|
mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// it's valid
|
|
m_mimeValid = true;
|
|
// advance
|
|
utf8Content = m_mime.getContent();
|
|
|
|
if(payloadLen != -1) {
|
|
payloadLen -= m_mime.getContent() - utf8ContentArg;
|
|
}
|
|
}
|
|
|
|
// use this to avoid ip lookup if it is not zero
|
|
if ( forcedIp ) {
|
|
m_ip = forcedIp;
|
|
m_ipValid = true;
|
|
}
|
|
|
|
// sometimes they supply the content they want! like when zaks'
|
|
// injects pages from PageInject.cpp
|
|
if ( utf8Content ) {
|
|
// . this is the most basic content from the http reply
|
|
// . only set this since sometimes it is facebook xml and
|
|
// contains encoded html which needs to be decoded.
|
|
// like <name>Ben & Jerry's</name> otherwise are
|
|
// sentence formation stops at the ';' in the "&" and
|
|
// we also index "amp" which is bad.
|
|
m_content = utf8Content;
|
|
if(payloadLen != -1) {
|
|
m_contentLen = payloadLen;
|
|
}
|
|
else if ( m_mimeValid && m_mime.m_contentLen > 0) {
|
|
m_contentLen = m_mime.m_contentLen;
|
|
} else {
|
|
m_contentLen = gbstrlen(utf8Content);
|
|
}
|
|
|
|
m_contentValid = true;
|
|
|
|
//m_rawUtf8Content = utf8Content;
|
|
//m_expandedUtf8Content = utf8Content;
|
|
|
|
//ptr_utf8Content = utf8Content;
|
|
//size_utf8Content = slen+1;
|
|
|
|
//m_rawUtf8ContentValid = true;
|
|
//m_expandedUtf8ContentValid = true;
|
|
//m_utf8ContentValid = true;
|
|
|
|
m_contentInjected = true;
|
|
m_wasContentInjected = true;
|
|
m_contentType = contentType;
|
|
m_contentTypeValid = true;
|
|
// use this ip as well for now to avoid ip lookup
|
|
//m_ip = atoip("127.0.0.1");
|
|
//m_ipValid = true;
|
|
// do not need robots.txt then
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
// nor mime
|
|
m_httpStatus = 200;
|
|
m_httpStatusValid = true;
|
|
// this too
|
|
m_downloadStatus = 0;
|
|
m_downloadStatusValid = true;
|
|
// assume this is the download time since the content
|
|
// was pushed/provided to us
|
|
if ( spideredTime )
|
|
m_downloadEndTime = spideredTime;
|
|
else
|
|
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
|
|
// either way, validate it
|
|
m_downloadEndTimeValid = true;
|
|
// and need a legit mime
|
|
if ( ! m_mimeValid ) {
|
|
m_mime.m_bufLen = 1;
|
|
m_mimeValid = true;
|
|
m_mime.m_contentType = contentType;
|
|
}
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// no redir
|
|
ptr_redirUrl = NULL;
|
|
size_redirUrl = 0;
|
|
m_redirUrl.reset();
|
|
m_redirUrlPtr = NULL;//&m_redirUrl;
|
|
m_redirUrlValid = true;
|
|
m_redirErrorValid = true;
|
|
m_redirError = 0;
|
|
m_crawlDelay = -1;
|
|
m_crawlDelayValid = true;
|
|
}
|
|
|
|
// override content type based on mime for application/json
|
|
if ( m_mimeValid ) {
|
|
m_contentType = m_mime.m_contentType;
|
|
m_contentTypeValid = true;
|
|
}
|
|
|
|
|
|
//m_coll = coll;
|
|
m_pbuf = pbuf;
|
|
m_niceness = niceness;
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
m_versionValid = true;
|
|
|
|
/*
|
|
// set min/max pub dates right away
|
|
m_minPubDate = -1;
|
|
m_maxPubDate = -1;
|
|
// parentPrevSpiderTime is 0 if that was the first time that the
|
|
// parent was spidered, in which case isNewOutlink will always be set
|
|
// for every outlink it had!
|
|
if ( sreq->m_isNewOutlink && sreq->m_parentPrevSpiderTime ) {
|
|
// sanity check
|
|
if ( ! sreq->m_parentPrevSpiderTime ) {char *xx=NULL;*xx=0;}
|
|
// pub date is somewhere between these two times
|
|
m_minPubDate = sreq->m_parentPrevSpiderTime;
|
|
m_maxPubDate = sreq->m_addedTime;
|
|
}
|
|
*/
|
|
|
|
// this is used to removing the rec from doledb after we spider it
|
|
m_doledbKey.setMin();
|
|
if ( doledbKey ) m_doledbKey = *doledbKey;
|
|
|
|
// . sanity check
|
|
// . we really don't want the parser holding up the query pipeline
|
|
// even if this page is being turked!
|
|
//if ( m_niceness == 0 &&
|
|
// // spider proxy uses xmldoc class to expand iframe tags and
|
|
// // sometimes the initiating msg13 class was re-niced to 0
|
|
// // in the niceness converstion logic.
|
|
// ! g_hostdb.m_myHost->m_isProxy ) {
|
|
// char *xx=NULL; *xx=0; }
|
|
|
|
if ( sreq->isCorrupt() )
|
|
return log("XmlDoc: set4() spider request is corrupt in coll "
|
|
"%s u=%s",coll,sreq->m_url);
|
|
|
|
m_sreqValid = true;
|
|
|
|
// store the whole rec, key+dataSize+data, in case it disappears.
|
|
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
|
|
|
|
// set m_collnum etc.
|
|
if ( ! setCollNum ( coll ) )
|
|
return log("XmlDoc: set4() coll %s invalid",coll);
|
|
|
|
// it should be valid since we just set it
|
|
CollectionRec *cr = getCollRec();
|
|
|
|
m_useRobotsTxt = cr->m_useRobotsTxt;
|
|
|
|
// solidify some parms
|
|
//m_eliminateMenus = cr->m_eliminateMenus;
|
|
//m_eliminateMenusValid = true;
|
|
|
|
// validate these here too
|
|
/*
|
|
m_titleWeight = cr->m_titleWeight;
|
|
m_headerWeight = cr->m_headerWeight;
|
|
m_urlPathWeight = cr->m_urlPathWeight;
|
|
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
|
|
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
|
|
m_conceptWeight = cr->m_conceptWeight;
|
|
|
|
m_titleWeightValid = true;
|
|
m_headerWeightValid = true;
|
|
m_urlPathWeightValid = true;
|
|
m_externalLinkTextWeightValid = true;
|
|
m_internalLinkTextWeightValid = true;
|
|
m_conceptWeightValid = true;
|
|
*/
|
|
|
|
// fix some corruption i've seen
|
|
if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
|
|
log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
|
|
m_sreq.m_urlIsDocId = 0;
|
|
}
|
|
|
|
// if url is a docid... we are from pagereindex.cpp
|
|
//if ( sreq->m_isPageReindex ) {
|
|
// now we can have url-based page reindex requests because
|
|
// if we have a diffbot json object fake url reindex request
|
|
// we add a spider request of the PARENT url for it as page reindex
|
|
//if ( is_digit ( sreq->m_url[0] ) ) {
|
|
// watch out for 0.r.msn.com!!
|
|
if ( m_sreq.m_urlIsDocId ) {
|
|
m_docId = atoll(m_sreq.m_url);
|
|
// assume its good
|
|
m_docIdValid = true;
|
|
// similar to set3() above
|
|
m_setFromDocId = true;
|
|
// use content and ip from old title rec to save time
|
|
// . crap this is making the query reindex not actually
|
|
// re-download the content.
|
|
// . we already check the m_deleteFromIndex flag below
|
|
// in getUtf8Content() and use the old content in that case
|
|
// so i'm not sure why we are recycling here, so take
|
|
// this out. MDW 9/25/2014.
|
|
//m_recycleContent = true;
|
|
// sanity
|
|
if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
// add www is now REQUIRED for all!
|
|
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
|
|
// www.tmblr.co has no IP
|
|
setFirstUrl ( m_sreq.m_url , false );//true ); // false );
|
|
// you can't call this from a docid based url until you
|
|
// know the uh48
|
|
//setSpideredTime();
|
|
}
|
|
|
|
// now query reindex can specify a recycle content option so it
|
|
// can replace the rebuild tool. try to recycle on global index.
|
|
if ( m_sreqValid )
|
|
m_recycleContent = m_sreq.m_recycleContent;
|
|
|
|
m_hasMetadata = (bool)metadata;
|
|
|
|
ptr_metadata = metadata;
|
|
size_metadata = metadataLen;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . set our stuff from the TitleRec (from titledb)
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::set2 ( char *titleRec ,
|
|
int32_t maxSize ,
|
|
char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ,
|
|
SpiderRequest *sreq ) {
|
|
|
|
// NO! can't do this. see below
|
|
//reset();
|
|
|
|
setStatus ( "setting xml doc from title rec");
|
|
|
|
// . it resets us, so save this
|
|
// . we only save these for set2() not the other sets()!
|
|
//void (*cb1)(void *state) = m_callback1;
|
|
//bool (*cb2)(void *state) = m_callback2;
|
|
//void *state = m_state;
|
|
|
|
// . clear it all out
|
|
// . no! this is clearing our msg20/msg22 reply...
|
|
// . ok, but repair.cpp needs it so do it there then
|
|
//reset();
|
|
|
|
// restore callbacks
|
|
//m_callback1 = cb1;
|
|
//m_callback2 = cb2;
|
|
//m_state = state;
|
|
|
|
// sanity check - since we do not reset
|
|
if ( m_contentValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this is true
|
|
m_setFromTitleRec = true;
|
|
|
|
// this is valid i guess. includes key, etc.
|
|
//m_titleRec = titleRec;
|
|
//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
|
|
//m_titleRecValid = true;
|
|
// . should we free m_cbuf on our reset/destruction?
|
|
// . no because doCOnsistencyCheck calls XmlDoc::set2 with a titleRec
|
|
// that should not be freed, besides the alloc size is not known!
|
|
//m_freeTitleRec = false;
|
|
|
|
int32_t titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
|
|
// . should we free m_cbuf on our reset/destruction?
|
|
// . no because doCOnsistencyCheck calls XmlDoc::set2 with a titleRec
|
|
// that should not be freed, besides the alloc size is not known!
|
|
m_titleRecBuf.setBuf ( titleRec ,
|
|
titleRecSize , // bufmax
|
|
titleRecSize , // bytes in use
|
|
false, // ownData?
|
|
csUTF8); // encoding
|
|
m_titleRecBufValid = true;
|
|
|
|
|
|
//m_coll = coll;
|
|
m_pbuf = pbuf;
|
|
m_niceness = niceness;
|
|
|
|
// . sanity check
|
|
// . NO! could be from XmlDoc::getMsg20Reply()!
|
|
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// it must be there!
|
|
if ( !titleRec||titleRecSize==0 ) {g_errno=ENOTFOUND; return false;}
|
|
|
|
// set our collection number
|
|
if ( ! setCollNum ( coll ) ) return false;
|
|
|
|
// store the whole rec, key+dataSize+data, in case it disappears.
|
|
if ( sreq ) {
|
|
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
|
|
m_sreqValid = true;
|
|
}
|
|
|
|
m_hashedTitle = false;
|
|
m_hashedMetas = false;
|
|
|
|
// save the compressed buffer in case we should free it when done
|
|
//m_titleRec = titleRec;
|
|
// should we free m_cbuf on our reset/destruction?
|
|
//m_freeTitleRec = true;
|
|
// our record may not occupy all of m_cbuf, careful
|
|
//m_titleRecAllocSize = maxSize;
|
|
|
|
// get a parse ptr
|
|
char *p = titleRec ;
|
|
// . this is just like a serialized RdbList key/dataSize/data of 1 rec
|
|
// . first thing is the key
|
|
// . key should have docId embedded in it
|
|
m_titleRecKey = *(key_t *) p ;
|
|
//m_titleRecKeyValid = true;
|
|
p += sizeof(key_t);
|
|
// bail on error
|
|
if ( (m_titleRecKey.n0 & 0x01) == 0x00 ) {
|
|
g_errno = EBADTITLEREC;
|
|
log("db: Titledb record is a negative key.");
|
|
char *xx=NULL; *xx=0;
|
|
return false;
|
|
}
|
|
// set m_docId from key
|
|
m_docId = g_titledb.getDocIdFromKey ( m_titleRecKey );
|
|
// validate that
|
|
m_docIdValid = true;
|
|
// then the size of the data that follows this
|
|
int32_t dataSize = *(int32_t *) p ;
|
|
p += 4;
|
|
// bail on error
|
|
if ( dataSize < 4 ) {
|
|
g_errno = EBADTITLEREC;
|
|
return log("db: Titledb record has size of %" INT32 " which "
|
|
"is less then 4. Probable disk corruption in a "
|
|
"titledb file.",
|
|
dataSize);
|
|
}
|
|
// what is the size of cbuf/titleRec in bytes?
|
|
int32_t cbufSize = dataSize + 4 + sizeof(key_t);
|
|
// . the actual data follows "dataSize"
|
|
// . what's the size of the uncompressed compressed stuff below here?
|
|
m_ubufSize = *(int32_t *) p ; p += 4;
|
|
// . because of disk/network data corruption this may be wrong!
|
|
// . we can now have absolutely huge titlerecs...
|
|
if ( m_ubufSize <= 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
|
|
g_errno = EBADTITLEREC;
|
|
return log("db: TitleRec::set: uncompress uncompressed "
|
|
"size=%" INT32 ".",m_ubufSize );
|
|
}
|
|
// trying to uncompress corrupt titlerecs sometimes results in
|
|
// a seg fault... watch out
|
|
if ( m_ubufSize > 100*1024*1024 ) {
|
|
g_errno = EBADTITLEREC;
|
|
return log("db: TitleRec::set: uncompress uncompressed "
|
|
"size=%" INT32 " > 100MB. unacceptable, probable "
|
|
"corruption.",m_ubufSize );
|
|
}
|
|
// make buf space for holding the uncompressed stuff
|
|
m_ubufAlloc = m_ubufSize;
|
|
m_ubuf = (char *) mmalloc ( m_ubufAlloc ,"TitleRecu1");
|
|
// log("xmldoc: m_ubuf=%"PTRFMT" this=%"PTRFMT
|
|
// , (PTRTYPE) m_ubuf
|
|
// , (PTRTYPE) this
|
|
// );
|
|
if ( ! m_ubuf ) {
|
|
// we had bad ubufsizes on gb6, like > 1GB print out key
|
|
// so we can manually make a titledb.dat file to delete these
|
|
// bad keys
|
|
log("build: alloc failed ubufsize=%" INT32 " key.n1=%" UINT32 " "
|
|
"n0=%" UINT64 ,
|
|
m_ubufAlloc,m_titleRecKey.n1,m_titleRecKey.n0);
|
|
return false;
|
|
}
|
|
// we need to loop since uncompress is weird, sometimes it needs more
|
|
// space then it should. see how much it actually took.
|
|
int32_t realSize = m_ubufSize;
|
|
// time it
|
|
int64_t startTime = gettimeofdayInMilliseconds();
|
|
// debug msg
|
|
|
|
setStatus( "Uncompressing title rec." );
|
|
// . uncompress the data into m_ubuf
|
|
// . m_ubufSize should remain unchanged since we stored it
|
|
int err = gbuncompress ( (unsigned char *) m_ubuf ,
|
|
(uint32_t *) &realSize ,
|
|
(unsigned char *) p ,
|
|
(uint32_t ) (dataSize - 4) );
|
|
// hmmmm...
|
|
if ( err == Z_BUF_ERROR ) {
|
|
log("db: Buffer is too small to hold uncompressed "
|
|
"document. Probable disk corruption in a titledb file.");
|
|
g_errno = EUNCOMPRESSERROR;
|
|
return false;
|
|
}
|
|
// set g_errno and return false on error
|
|
if ( err != Z_OK ) {
|
|
g_errno = EUNCOMPRESSERROR;
|
|
return log("db: Uncompress of document failed. ZG_ERRNO=%i. "
|
|
"cbufSize=%" INT32 " ubufsize=%" INT32 " realSize=%" INT32 "",
|
|
err , cbufSize , m_ubufSize , realSize );
|
|
}
|
|
if ( realSize != m_ubufSize ) {
|
|
g_errno = EBADENGINEER;
|
|
return log("db: Uncompressed document size is not what we "
|
|
"recorded it to be. Probable disk corruption in "
|
|
"a titledb file.");
|
|
}
|
|
// . add the stat
|
|
// . use white for the stat
|
|
g_stats.addStat_r ( 0 ,
|
|
startTime ,
|
|
gettimeofdayInMilliseconds(),
|
|
0x00ffffff );
|
|
|
|
// first 2 bytes in m_ubuf is the header size
|
|
int32_t headerSize = *(uint16_t *)m_ubuf;
|
|
|
|
int32_t shouldbe = (char *)&ptr_firstUrl - (char *)&m_headerSize;
|
|
|
|
if ( headerSize != shouldbe ) {
|
|
g_errno = ECORRUPTDATA;
|
|
return log("doc: bad header size in title rec");
|
|
}
|
|
|
|
// set our easy stuff
|
|
gbmemcpy ( (void *)this , m_ubuf , headerSize );
|
|
|
|
// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
|
|
// like in Msg.cpp and Msg20Reply.cpp
|
|
|
|
if ( m_pbuf ) {
|
|
int32_t crc = hash32(m_ubuf,headerSize);
|
|
m_pbuf->safePrintf("crchdr=0x%" XINT32 " sizehdr=%" INT32 ", ",
|
|
crc,headerSize);
|
|
}
|
|
|
|
|
|
// point to the string data
|
|
char *up = m_ubuf + headerSize;
|
|
|
|
// end of the rec
|
|
char *upend = m_ubuf + m_ubufSize;
|
|
|
|
// how many XmlDoc::ptr_* members do we have? set "np" to that
|
|
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
|
|
np /= sizeof(char *);
|
|
|
|
// point to the first ptr
|
|
char **pd = (char **)&ptr_firstUrl;
|
|
// point to the first size
|
|
int32_t *ps = (int32_t *)&size_firstUrl;
|
|
|
|
// loop over them
|
|
for ( int32_t i = 0 ; i < np ; i++ , pd++ , ps++ ) {
|
|
// zero out the ith ptr_ and size_ member
|
|
*pd = 0;
|
|
*ps = 0;
|
|
// make the mask
|
|
uint32_t mask = 1 << i ;
|
|
// do we have this member? skip if not.
|
|
if ( ! (m_internalFlags1 & mask) ) continue;
|
|
// watch out for corruption
|
|
if ( up > upend ) {
|
|
g_errno = ECORRUPTDATA;
|
|
return log("doc: corrupt titlerec.");
|
|
}
|
|
// get the size
|
|
*ps = *(int32_t *)up;
|
|
// this should never be 0, otherwise, why was its flag set?
|
|
if ( *ps <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip over to point to data
|
|
up += 4;
|
|
// point to the data. could be 64-bit ptr.
|
|
*pd = up;//(int32_t)up;
|
|
// debug
|
|
if ( m_pbuf ) {
|
|
int32_t crc = hash32(up,*ps);
|
|
m_pbuf->safePrintf("crc%" INT32 "=0x%" XINT32 " size%" INT32 "=%" INT32 ", ",
|
|
i,crc,i,*ps);
|
|
}
|
|
// skip over data
|
|
up += *ps;
|
|
// watch out for corruption
|
|
if ( up > upend ) {
|
|
g_errno = ECORRUPTDATA;
|
|
return log("doc: corrupt titlerec.");
|
|
}
|
|
}
|
|
// cap it
|
|
char *pend = m_ubuf + m_ubufSize;
|
|
// sanity check. must match exactly.
|
|
if ( up != pend ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set the urls i guess
|
|
m_firstUrl.set ( ptr_firstUrl );
|
|
if ( ptr_redirUrl ) {
|
|
m_redirUrl.set ( ptr_redirUrl );
|
|
m_currentUrl.set ( ptr_redirUrl );
|
|
m_currentUrlValid = true;
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
}
|
|
else {
|
|
m_currentUrl.set ( ptr_firstUrl );
|
|
m_currentUrlValid = true;
|
|
m_redirUrlPtr = NULL;
|
|
}
|
|
m_firstUrlValid = true;
|
|
m_redirUrlValid = true;
|
|
|
|
|
|
// convert 8 bit to a 32 bit
|
|
//m_numBannedOutlinks = score8to32 ( m_numBannedOutlinks8 );
|
|
|
|
// validate *shadow* members since bit flags cannot be returned
|
|
m_isRSS2 = m_isRSS;
|
|
m_isPermalink2 = m_isPermalink;
|
|
m_isAdult2 = m_isAdult;
|
|
m_spiderLinks2 = m_spiderLinks;
|
|
m_isContentTruncated2 = m_isContentTruncated;
|
|
m_isLinkSpam2 = m_isLinkSpam;
|
|
m_hasAddress2 = m_hasAddress;
|
|
m_hasTOD2 = m_hasTOD;
|
|
//m_hasSiteVenue2 = m_hasSiteVenue;
|
|
m_hasContactInfo2 = m_hasContactInfo;
|
|
//m_skipIndexingByte = m_skipIndexing;
|
|
m_isSiteRoot2 = m_isSiteRoot;
|
|
|
|
// these members are automatically validated
|
|
m_ipValid = true;
|
|
m_spideredTimeValid = true;
|
|
m_indexedTimeValid = true;
|
|
|
|
m_pubDateValid = true;
|
|
m_firstIndexedValid = true;
|
|
m_outlinksAddedDateValid = true;
|
|
m_charsetValid = true;
|
|
m_countryIdValid = true;
|
|
/*
|
|
m_titleWeightValid = true;
|
|
m_headerWeightValid = true;
|
|
m_urlPathWeightValid = true;
|
|
m_externalLinkTextWeightValid = true;
|
|
m_internalLinkTextWeightValid = true;
|
|
m_conceptWeightValid = true;
|
|
*/
|
|
|
|
// new stuff
|
|
m_siteNumInlinksValid = true;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
//m_sitePopValid = true;
|
|
m_rootLangIdValid = true;
|
|
m_hasContactInfoValid = true;
|
|
m_metaListCheckSum8Valid = true;
|
|
|
|
m_hopCountValid = true;
|
|
//m_numBannedOutlinksValid = true;
|
|
m_langIdValid = true;
|
|
m_contentTypeValid = true;
|
|
m_isRSSValid = true;
|
|
m_isPermalinkValid = true;
|
|
m_isAdultValid = true;
|
|
//m_eliminateMenusValid = true;
|
|
m_spiderLinksValid = true;
|
|
m_isContentTruncatedValid = true;
|
|
m_isLinkSpamValid = true;
|
|
m_hasAddressValid = true;
|
|
m_tagRecDataValid = true;
|
|
m_gigabitHashesValid = true;
|
|
m_contentHash32Valid = true;
|
|
//m_tagHash32Valid = true;
|
|
m_tagPairHash32Valid = true;
|
|
m_adVectorValid = true;
|
|
m_wikiDocIdsValid = true;
|
|
m_imageDataValid = true;
|
|
m_catIdsValid = true;
|
|
m_indCatIdsValid = true;
|
|
// ptr_dmozTitles/Summs/Anchors valid:
|
|
m_dmozInfoValid = true;
|
|
m_utf8ContentValid = true;
|
|
//m_sectionsReplyValid = true;
|
|
//m_sectionsVotesValid = true;
|
|
//m_addressReplyValid = true;
|
|
m_siteValid = true;
|
|
m_linkInfo1Valid = true;
|
|
m_linkInfo2Valid = true;
|
|
m_versionValid = true;
|
|
m_httpStatusValid = true;
|
|
m_crawlDelayValid = true;
|
|
//m_sectiondbDataValid = true;
|
|
//m_placedbDataValid = true;
|
|
//m_clockCandidatesDataValid = true;
|
|
//m_skipIndexingValid = true;
|
|
m_isSiteRootValid = true;
|
|
|
|
// ptr_linkInfo2 is valid. so getDiffbotTitleHashes() works.
|
|
m_diffbotTitleHashBufValid = true;
|
|
|
|
// set "m_oldTagRec" from ptr_tagRecData
|
|
//gbmemcpy ( &m_oldTagRec , ptr_tagRecData , size_tagRecData );
|
|
//m_oldTagRecValid = true;
|
|
|
|
// there was no issue indexing it...
|
|
m_indexCode = 0;
|
|
m_indexCodeValid = true;
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
|
|
// stop core when importing and calling getNewSpiderReply()
|
|
m_downloadEndTime = m_spideredTime;
|
|
m_downloadEndTimeValid = true;
|
|
|
|
// make a copy for new tag rec too, this one we modify
|
|
//gbmemcpy ( &m_newTagRec , ptr_tagRecData , size_tagRecData );
|
|
|
|
// set "m_siteNumInlinks" from m_oldTagRec
|
|
//Tag *tag = m_oldTagRec.getTag("sitenuminlinks");
|
|
// must always be there!
|
|
//if ( ! tag ) { char *xx=NULL;*xx=0; }
|
|
// must be null terminated
|
|
//if ( tag->getTagData()[tag->getTagData()Size-1] != 0 ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// grab that
|
|
//m_siteNumInlinks = atol(tag->getTagData());
|
|
//m_siteNumInlinksValid = true;
|
|
// must not be negative
|
|
if ( m_siteNumInlinks < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set m_hasContactInfo in case someone calls ::getHasContactInfo()
|
|
// which will do a bunch of parsing!!
|
|
//tag = m_oldTagRec.getTag ("hascontactinfo");
|
|
//if ( tag ) m_hasContactInfo = true;
|
|
//else m_hasContactInfo = false;
|
|
//m_hasContactInfoValid = true;
|
|
|
|
// sanity check. if m_siteValid is true, this must be there
|
|
if ( ! ptr_site ) {
|
|
log("set2: ptr_site is null for docid %" INT64 "",m_docId);
|
|
//char *xx=NULL;*xx=0; }
|
|
g_errno = ECORRUPTDATA;
|
|
return false;
|
|
}
|
|
|
|
// lookup the tagdb rec fresh if setting for a summary. that way we
|
|
// can see if it is banned or not
|
|
//if ( m_req ) m_tagRecDataValid = false;
|
|
|
|
// debug thing
|
|
ptr_sectiondbData = NULL;
|
|
size_sectiondbData = 0;
|
|
|
|
// set m_sections.m_nsvt from data. ptr_sectiondbData is the m_osvt
|
|
// serialized, which is from our read of sectiondb at the time we
|
|
// indexed it. but now that we may have nulled out our content to
|
|
// save space in titledb because m_skipIndexing is true, then we have
|
|
// to save our votes as well, BUT, only if we skipped indexing.
|
|
// and not allowed to serialize UNLESS we skipped because
|
|
// that would waste space as well
|
|
//if (! m_skipIndexing && size_sectionsVotes ) { char *xx=NULL;*xx=0; }
|
|
|
|
// success, return true then
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::setFirstUrl ( char *u , bool addWWW , Url *baseUrl ) {
|
|
|
|
m_firstUrl.reset();
|
|
m_currentUrl.reset();
|
|
|
|
m_firstUrlValid = true;
|
|
|
|
// sanity check. "u" must be normalized
|
|
//if ( strncmp(u,"http",4 ) != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// assume url is not correct format
|
|
ptr_firstUrl = NULL;
|
|
size_firstUrl = 0;
|
|
|
|
if ( ! u || ! u[0] ) {
|
|
//if ( ! m_indexCode ) m_indexCode = EBADURL;
|
|
return true;
|
|
}
|
|
|
|
//if ( gbstrlen (u) + 1 > MAX_URL_LEN )
|
|
// m_indexCode = EURLTOOLONG;
|
|
|
|
m_firstUrl.set ( baseUrl , u , gbstrlen(u) , addWWW ) ;
|
|
|
|
// it is the active url
|
|
m_currentUrl.set ( &m_firstUrl , false );
|
|
m_currentUrlValid = true;
|
|
|
|
// set this to the normalized url
|
|
ptr_firstUrl = m_firstUrl.getUrl();
|
|
size_firstUrl = m_firstUrl.getUrlLen() + 1;
|
|
|
|
// is it a link loop?
|
|
//if ( m_firstUrl.isLinkLoop() ) {
|
|
// if ( ! m_indexCode ) m_indexCode = ELINKLOOP;
|
|
// return true;
|
|
//}
|
|
// is it illegal?
|
|
//if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
|
|
// if ( ! m_indexCode ) m_indexCode = EBADURL;
|
|
// return true;
|
|
//}
|
|
|
|
// check if the url has porn words in it
|
|
//if ( cr->m_doUrlSpamCheck && m_firstUrl.isSpam() ) {
|
|
// if ( ! m_indexCode ) m_indexCode = EDOCURLSPAM;
|
|
// return true;
|
|
//}
|
|
|
|
return true;
|
|
}
|
|
|
|
//CollectionRec *XmlDoc::getCollRec ( ) {
|
|
// return g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
|
|
//}
|
|
|
|
//bool XmlDoc::setRedirUrl ( char *u , bool addWWW ) {
|
|
// m_redirUrl.set ( u , gbstrlen(u) , addWWW );
|
|
// ptr_redirUrl = m_redirUrl.getUrl();
|
|
// size_redirUrl = m_redirUrl.getUrlLen()+1;
|
|
// return true;
|
|
//}
|
|
|
|
void XmlDoc::setStatus ( char *s ) {
|
|
m_statusMsg = s;
|
|
m_statusMsgValid = true;
|
|
static char *s_last = NULL;
|
|
|
|
if ( s == s_last ) return;
|
|
|
|
bool timeIt = false;
|
|
// if ( m_sreqValid &&
|
|
// m_sreq.m_isInjecting &&
|
|
// m_sreq.m_isPageInject )
|
|
// timeIt = true;
|
|
if ( g_conf.m_logDebugBuildTime )
|
|
timeIt = true;
|
|
|
|
// log times to detect slowness
|
|
if ( timeIt ) {
|
|
int64_t now = gettimeofdayInMillisecondsLocal();
|
|
if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now;
|
|
int32_t took = now - s_lastTimeStart;
|
|
//if ( took > 100 )
|
|
log("xmldoc: %s (xd=0x%" PTRFMT " "
|
|
"u=%s) took %" INT32 "ms",
|
|
s_last,
|
|
(PTRTYPE)this,
|
|
m_firstUrl.m_url,
|
|
took);
|
|
s_lastTimeStart = now;
|
|
}
|
|
|
|
s_last = s;
|
|
|
|
bool logIt = g_conf.m_logDebugBuild;
|
|
// CollectionRec *cr = NULL;
|
|
// if ( m_collnumValid )
|
|
// cr = g_collectiondb.m_recs[m_collnum];
|
|
// if ( cr &&
|
|
// cr->m_coll &&
|
|
// cr->m_coll[0] == 'c' &&
|
|
// cr->m_coll[1] == 'r' &&
|
|
// strncmp(cr->m_coll,"crawlbottesting-",16) == 0 )
|
|
// logIt = true;
|
|
|
|
if ( ! logIt ) return;
|
|
//return;
|
|
if ( m_firstUrlValid )
|
|
logf(LOG_DEBUG,"build: status = %s for %s (this=0x%" PTRFMT ")",
|
|
s,m_firstUrl.m_url,(PTRTYPE)this);
|
|
else
|
|
logf(LOG_DEBUG,"build: status = %s for docId %" INT64 " "
|
|
"(this=0x%" PTRFMT ")",
|
|
s,m_docId, (PTRTYPE)this);
|
|
}
|
|
|
|
// caller must now call XmlDoc::setCallback()
|
|
void XmlDoc::setCallback ( void *state, void (* callback) (void *state) ) {
|
|
m_state = state;
|
|
m_callback1 = callback;
|
|
// add this additional state==this constraint to prevent core when
|
|
// doing a page parser
|
|
if ( state == this &&
|
|
// i don't remember why i added this sanity check...
|
|
callback == getMetaListWrapper ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) {
|
|
m_state = state;
|
|
m_callback2 = callback;
|
|
}
|
|
|
|
// . similar to XmlDoc::indexDoc() but just adds m_firstUrl to spiderdb
|
|
// . used by PageAddUrl.cpp
|
|
/*
|
|
bool XmlDoc::addToSpiderdb ( ) {
|
|
// set a flag
|
|
m_isAddUrl = true;
|
|
// url must be valid
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
// do not add if something wrong with url
|
|
if ( m_indexCode ) return true;
|
|
// this should just add to spiderdb because m_isAddUrl is true
|
|
return indexDoc(false,false,false,false,true,false);
|
|
}
|
|
*/
|
|
|
|
void indexDocWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in index doc wrapper" );
|
|
// return if it blocked
|
|
if ( ! THIS->indexDoc( ) ) return;
|
|
// otherwise, all done, call the caller callback
|
|
|
|
// g_statsdb.addStat ( MAX_NICENESS,
|
|
// "docs_indexed",
|
|
// 20,
|
|
// 21,
|
|
// );
|
|
|
|
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// for registerSleepCallback
|
|
void indexDocWrapper2 ( int fd , void *state ) {
|
|
indexDocWrapper ( state );
|
|
}
|
|
|
|
// . inject from http request
|
|
// . replace more of Msg7.cpp logic with this?
|
|
//bool XmlDoc::injectDoc ( HttpRequest *hr ) {
|
|
//}
|
|
|
|
// . the highest level function in here
|
|
// . user is requesting to inject this url
|
|
// . returns false if blocked and your callback will be called when done
|
|
// . returns true and sets g_errno on error
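// . a typical call, as a rough sketch only. the xd/cr/state/callback
//   names below are placeholders, and the "-1"/"0" arguments are just
//   illustrative "unknown"/default values, not required constants:
//
//   xd->injectDoc ( "http://example.com/" , cr ,
//                   content    , // utf8 content, or NULL to download
//                   NULL       , // diffbotReply
//                   false      , // contentHasMimeArg
//                   -1         , // hopCount unknown
//                   -1         , // charset unknown
//                   false      , // deleteUrl
//                   "text/html", // contentTypeStr
//                   true       , // spiderLinks
//                   0          , // newOnly
//                   state , callback ,
//                   0 , 0      , // firstIndexed , lastSpidered
//                   0          , // injectDocIp
//                   NULL , NULL , 0 , 0 ); // contentDelim, metadata,
//                                          // metadataLen, payloadLen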
|
|
bool XmlDoc::injectDoc ( char *url ,
|
|
CollectionRec *cr ,
|
|
char *content ,
|
|
char *diffbotReply, // usually null
|
|
bool contentHasMimeArg ,
|
|
int32_t hopCount,
|
|
int32_t charset,
|
|
|
|
bool deleteUrl,
|
|
char *contentTypeStr, // text/html application/json
|
|
bool spiderLinks ,
|
|
char newOnly, // index iff new
|
|
|
|
void *state,
|
|
void (*callback)(void *state) ,
|
|
|
|
uint32_t firstIndexed,
|
|
uint32_t lastSpidered ,
|
|
int32_t injectDocIp ,
|
|
char *contentDelim,
|
|
char *metadata,
|
|
uint32_t metadataLen,
|
|
int32_t payloadLen
|
|
) {
|
|
|
|
|
|
// wait until we are synced with host #0
|
|
if ( ! isClockInSync() ) {
|
|
log("xmldocl: got injection request but clock not yet "
|
|
"synced with host #0");
|
|
g_errno = ETRYAGAIN;//CLOCKNOTSYNCED;
|
|
return true;
|
|
}
|
|
|
|
// normalize url
|
|
Url uu;
|
|
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
|
|
// which has no www.tmblr.co IP!
|
|
uu.set(url,gbstrlen(url),false);//true);
|
|
|
|
// if (!strncmp(url , "http://www.focusinfo.com/products/mxprodv" ,40))
|
|
// log("hey");
|
|
|
|
|
|
// remove >'s i guess and store in st1->m_url[] buffer
|
|
char cleanUrl[MAX_URL_LEN+1];
|
|
cleanInput ( cleanUrl,
|
|
MAX_URL_LEN,
|
|
uu.getUrl(),
|
|
uu.getUrlLen() );
|
|
|
|
|
|
int32_t contentType = CT_UNKNOWN;
|
|
if ( contentTypeStr && contentTypeStr[0] )
|
|
contentType = getContentTypeFromStr(contentTypeStr);
|
|
|
|
// use CT_HTML if contentTypeStr is empty or blank. default
|
|
if ( ! contentTypeStr || ! contentTypeStr[0] )
|
|
contentType = CT_HTML;
|
|
|
|
// this can go on the stack since set4() copies it
|
|
SpiderRequest sreq;
|
|
sreq.setFromInject ( cleanUrl );
|
|
|
|
if ( lastSpidered )
|
|
sreq.m_addedTime = lastSpidered;
|
|
|
|
if ( deleteUrl )
|
|
sreq.m_forceDelete = 1;
|
|
|
|
//static char s_dummy[3];
|
|
// sometimes the content is indeed NULL...
|
|
//if ( newOnly && ! content ) {
|
|
// // don't let it be NULL because then xmldoc will
|
|
// // try to download the page!
|
|
// s_dummy[0] = '\0';
|
|
// content = s_dummy;
|
|
// //char *xx=NULL;*xx=0; }
|
|
//}
|
|
|
|
// . use the enormous power of our new XmlDoc class
|
|
// . this returns false with g_errno set on error
|
|
if ( ! set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// from PageInject.cpp:
|
|
// give it a niceness of 1, we have to be
|
|
// careful since we are a niceness of 0!!!!
|
|
1, // niceness, // 1 ,
|
|
// inject this content
|
|
content ,
|
|
deleteUrl, // false, // deleteFromIndex ,
|
|
injectDocIp, // 0,//forcedIp ,
|
|
contentType ,
|
|
lastSpidered, // lastSpidered override
|
|
contentHasMimeArg ,
|
|
contentDelim,
|
|
metadata,
|
|
metadataLen,
|
|
payloadLen
|
|
)) {
|
|
// g_errno should be set if that returned false
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// a diffbot reply? should be in json
|
|
if ( diffbotReply ) {
|
|
if ( ! m_diffbotReply.safeStrcpy(diffbotReply) )
|
|
return true;
|
|
// it was injected so assume no error
|
|
m_diffbotReplyError = 0;
|
|
m_diffbotReplyValid = true;
|
|
}
|
|
|
|
//m_doConsistencyTesting = doConsistencyTesting;
|
|
|
|
// . set xd from the old title rec if recycle is true
|
|
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
|
//if ( recycleContent ) m_recycleContent = true;
|
|
|
|
// othercrap. used for importing from titledb of another coll/cluster.
|
|
if ( firstIndexed ) {
|
|
m_firstIndexedDate = firstIndexed;
|
|
m_firstIndexedDateValid = true;
|
|
}
|
|
|
|
if ( lastSpidered ) {
|
|
m_spideredTime = lastSpidered;
|
|
m_spideredTimeValid = true;
|
|
}
|
|
|
|
if ( hopCount != -1 ) {
|
|
m_hopCount = hopCount;
|
|
m_hopCountValid = true;
|
|
}
|
|
|
|
// PageInject calls memset on gigablastrequest so add '!= 0' here
|
|
if ( charset != -1 && charset != csUnknown && charset != 0 ) {
|
|
m_charset = charset;
|
|
m_charsetValid = true;
|
|
}
|
|
|
|
// avoid looking up ip of each outlink to add "firstip" tag to tagdb
|
|
// because that can be slow!!!!!!!
|
|
m_spiderLinks = spiderLinks;
|
|
m_spiderLinks2 = spiderLinks;
|
|
m_spiderLinksValid = true;
|
|
|
|
// . newOnly is true --> do not inject if document is already indexed!
|
|
// . maybe just set indexCode
|
|
m_newOnly = newOnly;
|
|
|
|
// do not re-lookup the robots.txt
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
m_crawlDelay = -1; // unknown
|
|
m_crawlDelayValid = true;
|
|
|
|
|
|
m_isInjecting = true;
|
|
m_isInjectingValid = true;
|
|
|
|
// set this now
|
|
//g_inPageInject = true;
|
|
|
|
// log it now
|
|
//log("inject: indexing injected doc %s",cleanUrl);
|
|
|
|
// make this our callback in case something blocks
|
|
setCallback ( state , callback );
|
|
|
|
// . now tell it to index
|
|
// . this returns false if blocked
|
|
// . eventually it will call "callback" when done if it blocks
|
|
bool status = indexDoc ( );
|
|
if ( ! status ) return false;
|
|
|
|
// log it. i guess only for errors when it does not block?
|
|
// because xmldoc.cpp::indexDoc calls logIt()
|
|
if ( status ) logIt();
|
|
|
|
|
|
|
|
// undo it
|
|
//g_inPageInject = false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// XmlDoc::injectDoc uses a fake spider request so we have to add
|
|
// a real spider request into spiderdb so that the injected doc can
|
|
// be spidered again in the future by the spidering process, otherwise,
|
|
// injected docs can never be re-spidered. they would end up having
|
|
// a SpiderReply in spiderdb but no matching SpiderRequest.
|
|
void XmlDoc::getRevisedSpiderRequest ( SpiderRequest *revisedReq ) {
|
|
|
|
if ( ! m_sreqValid ) { char *xx=NULL; *xx=0; }
|
|
|
|
// we are doing this because it has a fake first ip
|
|
if ( ! m_sreq.m_fakeFirstIp ) { char *xx=NULL;*xx=0; }
|
|
|
|
// copy it over from our current spiderrequest
|
|
gbmemcpy ( revisedReq , &m_sreq , m_sreq.getRecSize() );
|
|
|
|
// this must be valid for us of course
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// wtf? it might be invalid!!! parent caller will handle it...
|
|
//if ( m_firstIp == 0 || m_firstIp == -1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store the real ip in there now
|
|
revisedReq->m_firstIp = m_firstIp;
|
|
|
|
// but turn off this flag! the whole point of all this...
|
|
revisedReq->m_fakeFirstIp = 0;
|
|
|
|
// re-make the key since it contains m_firstIp
|
|
int64_t uh48 = m_sreq.getUrlHash48();
|
|
int64_t parentDocId = m_sreq.getParentDocId();
|
|
|
|
// set the key properly to reflect the new "first ip" since
|
|
// we shard spiderdb by that.
|
|
revisedReq->m_key = g_spiderdb.makeKey ( m_firstIp,
|
|
uh48,
|
|
true, // is request?
|
|
parentDocId ,
|
|
false );// isDel );
|
|
revisedReq->setDataSize();
|
|
}
|
|
|
|
void XmlDoc::getRebuiltSpiderRequest ( SpiderRequest *sreq ) {
|
|
|
|
// memset 0
|
|
sreq->reset();
|
|
|
|
// assume not valid
|
|
sreq->m_siteNumInlinks = -1;
|
|
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// how many site inlinks?
|
|
sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
sreq->m_siteNumInlinksValid = true;
|
|
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set other fields besides key
|
|
sreq->m_firstIp = m_firstIp;
|
|
sreq->m_hostHash32 = m_hostHash32a;
|
|
//sreq->m_domHash32 = m_domHash32;
|
|
//sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
//sreq->m_pageNumInlinks = m_pageNumInlinks;
|
|
sreq->m_hopCount = m_hopCount;
|
|
|
|
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
|
|
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
|
|
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
|
|
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
|
|
|
|
Url *fu = getFirstUrl();
|
|
|
|
sreq->m_isNewOutlink = 0;
|
|
sreq->m_isAddUrl = 0;//m_isAddUrl;
|
|
sreq->m_isPingServer = fu->isPingServer();
|
|
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
|
|
|
|
// transcribe from old spider rec, stuff should be the same
|
|
sreq->m_addedTime = m_firstIndexedDate;
|
|
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
|
|
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
|
|
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
|
|
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
|
|
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
|
|
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
|
|
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
|
|
|
|
// validate the stuff so getUrlFilterNum() acks it
|
|
sreq->m_hopCountValid = 1;
|
|
|
|
// we need this now for ucp ucr upp upr new url filters that do
|
|
// substring matching on the url
|
|
if ( m_firstUrlValid )
|
|
strcpy(sreq->m_url,m_firstUrl.m_url);
|
|
|
|
// re-make the key since it contains m_firstIp
|
|
int64_t uh48 = fu->getUrlHash48();
|
|
// set the key properly to reflect the new "first ip"
|
|
// since we shard spiderdb by that.
|
|
sreq->m_key = g_spiderdb.makeKey ( m_firstIp,//ip,
|
|
uh48,
|
|
true,//is req?
|
|
0LL, // parentDocId ,
|
|
false );//isDel
|
|
sreq->setDataSize();
|
|
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////
|
|
// THIS IS THE HEART OF HOW THE PARSER ADDS TO THE RDBS
|
|
////////////////////////////////////////////////////////////////////
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error and returns true
|
|
// . this is now a WRAPPER for indexDoc2() and it will deal with
|
|
// g_errnos by adding an error spider reply so we offload the
|
|
// logic to the url filters table
|
|
bool XmlDoc::indexDoc ( ) {
|
|
|
|
// return from the msg4.addMetaList() below?
|
|
if ( m_msg4Launched ) {
|
|
// must have been waiting
|
|
if ( ! m_msg4Waiting ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// return true with g_errno set on error
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = indexDocWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// do not index if already indexed and we are importing
// (via the code in PageInject.cpp) from a foreign titledb file
|
|
if ( m_isImporting && m_isImportingValid ) {
|
|
char *isIndexed = getIsIndexed();
|
|
if ( ! isIndexed ) {
|
|
log("import: import had error: %s",mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
if ( isIndexed == (char *)-1)
|
|
return false;
|
|
if ( *isIndexed ) {
|
|
log("import: skipping import for %s. already indexed.",
|
|
m_firstUrl.getUrl());
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// . even if not using diffbot, keep track of these counts
|
|
// . even if we had something like EFAKEFIRSTIP, OOM, or whatever
|
|
// it was an attempt we made to crawl this url
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
! m_incrementedAttemptsCount ) {
|
|
// do not repeat
|
|
m_incrementedAttemptsCount = true;
|
|
// log debug
|
|
//log("build: attempted %s count=%" INT64 "",m_firstUrl.getUrl(),
|
|
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
|
|
// this is just how many urls we tried to index
|
|
//cr->m_localCrawlInfo.m_urlsConsidered++;
|
|
// avoid counting if it is a fake first ip
|
|
bool countIt = true;
|
|
// pagereindex.cpp sets this as does any add url (bulk job)
|
|
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
|
|
countIt = false;
|
|
if ( countIt ) {
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
}
|
|
// need to save collection rec now during auto save
|
|
cr->m_needsSave = true;
|
|
// update this just in case we are the last url crawled
|
|
//int64_t now = gettimeofdayInMillisecondsGlobal();
|
|
//cr->m_diffbotCrawlEndTime = now;
|
|
}
|
|
|
|
|
|
bool status = true;
|
|
|
|
if ( ! g_errno ) status = indexDoc2 ( );
|
|
|
|
// blocked?
|
|
if ( ! status ) return false;
|
|
|
|
// done with no error?
|
|
bool success = true;
|
|
if ( g_errno ) success = false;
|
|
// if we were trying to spider a fakefirstip request then
|
|
// pass through because we lookup the real firstip below and
|
|
// add a new request as well as a reply for this one
|
|
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) success = false;
|
|
|
|
if ( success ) return true;
|
|
|
|
// . ignore failed child docs like diffbot pages
|
|
// . they are getting EMALFORMEDSECTIONS
|
|
if ( m_isChildDoc ) {
|
|
log("build: done indexing child doc. error=%s. not adding "
|
|
"spider reply for %s",
|
|
mstrerror(g_errno),
|
|
m_firstUrl.m_url);
|
|
return true;
|
|
}
|
|
|
|
///
|
|
// otherwise, an internal error. we must add a SpiderReply
|
|
// to spiderdb to release the lock.
|
|
///
|
|
|
|
logErr:
|
|
|
|
if ( m_firstUrlValid && g_errno )
|
|
log("build: %s had internal error = %s. adding spider "
|
|
"error reply.",
|
|
m_firstUrl.m_url,mstrerror(g_errno));
|
|
else if ( g_errno )
|
|
log("build: docid=%" INT64 " had internal error = %s. "
|
|
"adding spider error reply.",
|
|
m_docId,mstrerror(g_errno));
|
|
|
|
// seems like this was causing a core somehow...
|
|
if ( g_errno == ENOMEM )
|
|
return true;
|
|
|
|
// and do not add spider reply if shutting down the server
|
|
if ( g_errno == ESHUTTINGDOWN )
|
|
return true;
|
|
|
|
// i saw this on shard 9, how is it happening
|
|
if ( g_errno == EBADRDBID )
|
|
return true;
|
|
|
|
// if docid not found when trying to do a query reindex...
|
|
// this really shouldn't happen but i think we were adding
|
|
// additional SpiderRequests since we were using a fake first ip.
|
|
// but i have since fixed that code. so if the titlerec was not
|
|
// found when trying to do a force delete... it's not a temporary
|
|
// error and should not be retried. if we set indexCode to
|
|
// EINTERNALERROR it seems to be retried.
|
|
if ( g_errno == ENOTFOUND ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// this should not be retried either. i am seeing it excessively
|
|
// retried from a
|
|
// "TitleRec::set: uncompress uncompressed size=-2119348471"
|
|
// error condition. it also said
|
|
// "Error spidering for doc http://www.... : Bad cached document"
|
|
if ( g_errno == EBADTITLEREC ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// i've seen Multicast got error in reply from hostId 19 (msgType=0x22
|
|
// transId=496026 nice=1 net=default): Buf too small.
|
|
// so fix that with this
|
|
if ( g_errno == EBUFTOOSMALL ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
if ( g_errno == EBADURL ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
if ( g_errno == ENOTITLEREC ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// default to internal error which will be retried forever otherwise
|
|
if ( ! m_indexCodeValid ) {
|
|
m_indexCode = EINTERNALERROR;//g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// if our spiderrequest had a fake "firstip" so that it could be
|
|
// injected quickly into spiderdb, then do the firstip lookup here
|
|
// and re-add the new spider request with that, and add the reply
|
|
// to the fake firstip request below.
|
|
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) {
|
|
// at least get this if possible
|
|
int32_t *fip = getFirstIp();
|
|
if ( fip == (void *) -1 ) return false;
|
|
// error? g_errno will be changed if this is NULL
|
|
if ( ! fip ) {
|
|
log("build: error getting real firstip: %s",
|
|
mstrerror(g_errno));
|
|
m_indexCode = EINTERNALERROR;
|
|
m_indexCodeValid = true;
|
|
goto logErr;
|
|
}
|
|
// sanity log
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
// sanity log
|
|
if ( *fip == 0 || *fip == -1 ) {
|
|
//
|
|
// now add a spider status doc for this so we know
|
|
// why a crawl might have failed to start
|
|
//
|
|
SafeBuf *ssDocMetaList = NULL;
|
|
// save this
|
|
int32_t saved = m_indexCode;
|
|
// make it the real reason for the spider status doc
|
|
m_indexCode = EDNSERROR;
|
|
// get the spiderreply ready to be added. false=del
|
|
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
|
|
// revert
|
|
m_indexCode = saved;
|
|
// error?
|
|
if ( ! ssDocMetaList ) return true;
|
|
// blocked?
|
|
if ( ssDocMetaList == (void *)-1 ) return false;
|
|
// need to alloc space for it too
|
|
char *list = ssDocMetaList->getBufStart();
|
|
int32_t len = ssDocMetaList->length();
|
|
//needx += len;
|
|
// this too
|
|
m_addedStatusDocSize = len;
|
|
m_addedStatusDocSizeValid = true;
|
|
|
|
char *url = "unknown";
|
|
if ( m_sreqValid ) url = m_sreq.m_url;
|
|
log("build: error2 getting real firstip of "
|
|
"%" INT32 " for "
|
|
"%s. Not adding new spider req. "
|
|
"spiderstatusdocsize=%" INT32 , (int32_t)*fip,url,
|
|
m_addedStatusDocSize);
|
|
// also count it as a crawl attempt
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
|
|
if ( ! m_metaList2.safeMemcpy ( list , len ) )
|
|
return true;
|
|
|
|
goto skipNewAdd1;
|
|
}
|
|
// store the new request (store reply for this below)
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
if ( ! m_metaList2.pushChar(rd) )
|
|
return true;
|
|
// store it here
|
|
SpiderRequest revisedReq;
|
|
// this fills it in
|
|
getRevisedSpiderRequest ( &revisedReq );
|
|
// and store that new request for adding
|
|
if ( ! m_metaList2.safeMemcpy (&revisedReq,
|
|
revisedReq.getRecSize()))
|
|
return true;
|
|
// make sure to log the size of the spider request
|
|
m_addedSpiderRequestSize = revisedReq.getRecSize();
|
|
m_addedSpiderRequestSizeValid = true;
|
|
}
|
|
|
|
skipNewAdd1:
|
|
|
|
SpiderReply *nsr = NULL;
|
|
|
|
// if only rebuilding posdb do not rebuild spiderdb
|
|
if ( m_useSpiderdb && ! m_addedSpiderReplySizeValid ) {
|
|
|
|
////
|
|
//
|
|
// make these fake so getNewSpiderReply() below does not block
|
|
//
|
|
////
|
|
nsr = getFakeSpiderReply ( );
|
|
// this can be NULL and g_errno set to ENOCOLLREC or something
|
|
if ( ! nsr )
|
|
return true;
|
|
|
|
//SafeBuf metaList;
|
|
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
if ( ! m_metaList2.pushChar( rd ) )
|
|
return true;
|
|
|
|
if ( ! m_metaList2.safeMemcpy ( (char *)nsr,nsr->getRecSize()))
|
|
return true;
|
|
|
|
m_addedSpiderReplySize = nsr->getRecSize();
|
|
m_addedSpiderReplySizeValid = true;
|
|
}
|
|
|
|
// for other errors like EBADTITLEREC we are not adding spider
|
|
// status docs, so add them here
|
|
/*
|
|
if ( ! m_addedStatusDocSizeValid ) {
|
|
SafeBuf *ssDocMetaList = NULL;
|
|
// if calling getSpiderStatusDocMetaList blocks then
|
|
// call addErrorStuffWrapper() to call msg4
|
|
//m_masterLoop = addErrorStuffWrapper();
|
|
//m_state = this;
|
|
// this uses m_indexCode to set it
|
|
// if this blocks it ends up calling m_masterLoop and
|
|
// re-entering this function with g_errno clear possibly
|
|
// so do we make it back here????? MDW
|
|
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);
|
|
// error?
|
|
if ( ! ssDocMetaList ) return true;
|
|
// blocked?
|
|
if ( ssDocMetaList == (void *)-1 ) return false;
|
|
// need to alloc space for it too
|
|
char *list = ssDocMetaList->getBufStart();
|
|
int32_t len = ssDocMetaList->length();
|
|
// this too
|
|
m_addedStatusDocSize = len;
|
|
m_addedStatusDocSizeValid = true;
|
|
// also count it as a crawl attempt
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
if ( ! m_metaList2.safeMemcpy ( list , len ) )
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
m_msg4Launched = true;
|
|
|
|
// display the url that had the error
|
|
logIt();
|
|
|
|
// log this for debug now
|
|
if ( nsr ) {
|
|
SafeBuf tmp;
|
|
nsr->print(&tmp);
|
|
log("xmldoc: added reply %s",tmp.getBufStart());
|
|
}
|
|
|
|
// clear g_errno
|
|
g_errno = 0;
|
|
|
|
// "cr" might have been deleted by calling indexDoc() above i think
|
|
// so use collnum here, not "cr"
|
|
if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() ,
|
|
m_metaList2.length() ,
|
|
m_collnum,//cr->m_coll ,
|
|
m_masterState , // state
|
|
m_masterLoop ,
|
|
m_niceness ) ) {
|
|
// spider hang bug
|
|
//if ( g_conf.m_testSpiderEnabled )
|
|
// logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
|
|
// "msg4=0x%" XINT32 "" ,(int32_t)&m_msg4);
|
|
m_msg4Waiting = true;
|
|
return false;
|
|
}
|
|
|
|
//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );
|
|
|
|
m_msg4Launched = false;
|
|
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error and returns true
|
|
bool XmlDoc::indexDoc2 ( ) {
|
|
|
|
if ( g_isYippy ) return true;
|
|
|
|
// if anything blocks, this will be called when it comes back
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = indexDocWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
|
|
// do this before we increment pageDownloadAttempts below so that
|
|
// john's smoke tests, which use those counts, are not affected
|
|
if ( m_sreqValid &&
|
|
m_sreq.m_fakeFirstIp &&
|
|
// only do for add url, not for injects. injects expect
|
|
// the doc to be indexed while the browser waits. add url
|
|
// is really just adding the spider request and returning
|
|
// to the browser without delay.
|
|
! m_sreq.m_isInjecting &&
|
|
// not for page reindexes either!
|
|
! m_sreq.m_isPageReindex &&
|
|
// just add url
|
|
m_sreq.m_isAddUrl &&
|
|
// diffbot requests are ok though!
|
|
! strstr(m_sreq.m_url,"-diffbotxyz") ) {
|
|
m_indexCodeValid = true;
|
|
m_indexCode = EFAKEFIRSTIP;
|
|
return true;
|
|
}
|
|
|
|
|
|
// ensure that CollectionRec::m_globalCrawlInfo (spider stats)
|
|
// is at least 1 minute in sync with counts of
|
|
// all hosts in network. this returns false if it sent out requests
|
|
// to update the counts from all the hosts in the network, and
|
|
// when it updates CollectionRec::m_crawlInfoGlobal with all the
|
|
// requests from each hosts in the network it will call the
|
|
// specified callback, m_masterLoop with m_masterState. this code
|
|
// is all in Spider.cpp.
|
|
// this is now in a sleep wrapper in spider.cpp.
|
|
//setStatus ( "updating crawl info" );
|
|
//if ( ! g_errno &&
|
|
// ! updateCrawlInfo ( cr , m_masterState , m_masterLoop ) )
|
|
// return false;
|
|
|
|
|
|
// MDW: we do this in indexDoc() above why do we need it here?
|
|
/*
|
|
// even if not using diffbot, keep track of these counts
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
! m_incrementedAttemptsCount ) {
|
|
// do not repeat
|
|
m_incrementedAttemptsCount = true;
|
|
// this is just how many urls we tried to index
|
|
//cr->m_localCrawlInfo.m_urlsConsidered++;
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
// need to save collection rec now during auto save
|
|
cr->m_needsSave = true;
|
|
// update this just in case we are the last url crawled
|
|
int64_t now = gettimeofdayInMillisecondsGlobal();
|
|
cr->m_diffbotCrawlEndTime = now;
|
|
}
|
|
*/
|
|
/*
|
|
// if we are being called from Spider.cpp and we met our max
|
|
// to crawl requirement, then bail out on this. this might
|
|
// become true when we are in the middle of processing this url...
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
// this is just for this collection, from all hosts in network
|
|
cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >=
|
|
cr->m_diffbotMaxToCrawl ) {
|
|
// set the code to badness
|
|
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
|
|
m_indexCodeValid = true;
|
|
log("diffbot: abandoning url because we hit crawl limit "
|
|
"of %" INT64 ". downloaded %" INT64 ". Disabling spiders."
|
|
,cr->m_diffbotMaxToCrawl
|
|
,cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
|
);
|
|
g_errno = m_indexCode;
|
|
// if spiders already off..
|
|
if ( ! cr->m_spideringEnabled ) return true;
|
|
// do not repeat call sendNotification()
|
|
cr->m_spideringEnabled = false;
|
|
// set this
|
|
m_emailInfo.reset();
|
|
m_emailInfo.m_finalCallback = m_masterLoop;
|
|
m_emailInfo.m_finalState = m_masterState;
|
|
m_emailInfo.m_collnum = m_collnum;
|
|
// note it
|
|
setStatus("sending notification");
|
|
// this returns false if it would block, so we ret fals
|
|
if ( ! sendNotification ( &m_emailInfo ) ) return false;
|
|
// it didn't block
|
|
g_errno = m_indexCode;
|
|
return true;
|
|
}
|
|
|
|
// likewise if we hit the max processing limit...
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >=
|
|
cr->m_diffbotMaxToProcess ) {
|
|
// set the code to badness
|
|
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
|
|
m_indexCodeValid = true;
|
|
log("diffbot: abandoning url because we hit process limit "
|
|
"of %" INT64 ". processed %" INT64 ". Disabling spiders."
|
|
, cr->m_diffbotMaxToProcess
|
|
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
|
|
);
|
|
g_errno = m_indexCode;
|
|
// if spiders already off...
|
|
if ( ! cr->m_spideringEnabled ) return true;
|
|
// turn them off and send notification (email or url)
|
|
cr->m_spideringEnabled = false;
|
|
// set this
|
|
m_emailInfo.reset();
|
|
m_emailInfo.m_finalCallback = m_masterLoop;
|
|
m_emailInfo.m_finalState = m_masterState;
|
|
m_emailInfo.m_collnum = m_collnum;
|
|
// note it
|
|
setStatus("sending notification");
|
|
// . this returns false if it would block, so we ret fals
|
|
// . this is now in PingServer.cpp
|
|
if ( ! sendNotification( &m_emailInfo ) ) return false;
|
|
// it didn't block
|
|
g_errno = m_indexCode;
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
setStatus("indexing doc");
|
|
|
|
// maybe a callback had g_errno set?
|
|
if ( g_errno ) return true;
|
|
|
|
// before indexing this doc, index its inlinks it has according
|
|
// to ahrefs?
|
|
if ( m_downloadLevel == 1 && m_useAhrefs && ! m_doneWithAhrefs ) {
|
|
// do not repeat this call!
|
|
m_doneWithAhrefs = true;
|
|
// call it
|
|
if ( ! injectAhrefsLinks () ) return false;
|
|
}
|
|
|
|
|
|
|
|
if ( m_firstUrlValid && (m_firstUrl.isArc() || m_firstUrl.isWarc())) {
|
|
// this returns false if it would block and callback will be
|
|
// called
|
|
if ( ! indexWarcOrArc ( ) )
|
|
return false;
|
|
logIt();
|
|
// all done! no need to add the parent doc.
|
|
return true;
|
|
}
|
|
|
|
if ( isContainerDoc() ) {
|
|
// m_contentDelim (the delimiter) should be set!
|
|
if ( ! indexContainerDoc () )
|
|
return false;
|
|
logIt();
|
|
// all done! no need to add the parent doc.
|
|
return true;
|
|
}
|
|
|
|
// . now get the meta list from it to add
|
|
// . returns NULL and sets g_errno on error
|
|
char *metaList = getMetaList ( );
|
|
|
|
// error?
|
|
if ( ! metaList ) {
|
|
// sanity check. g_errno must be set
|
|
if ( ! g_errno ) {
|
|
log("build: Error UNKNOWN error spidering. setting "
|
|
"to bad engineer.");
|
|
g_errno = EBADENGINEER;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
log("build: Error spidering for doc %s: %s",
|
|
m_firstUrl.m_url,mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
// did it block? return false if so, we will be recalled since
|
|
// we set m_masterLoop to indexDoc
|
|
if ( metaList == (char *) -1 ) return false;
|
|
|
|
// before we add the meta list let's updateTagdb()
|
|
//char *ret = updateTagdb();
|
|
// it returns NULL on error
|
|
//if ( ret == NULL ) return true;
|
|
// return false if it blocked
|
|
//if ( ret == (char *)-1 ) return false;
|
|
|
|
// . let's update tagdb's venue address default too
|
|
// . no. that is in getTitleRecBuf()
|
|
|
|
// must be valid
|
|
int32_t *indexCode = getIndexCode();
|
|
if ( ! indexCode ) return true;
if ( indexCode == (void *)-1 ) return false;
|
|
|
|
// . check to make sure the parser is consistent so we can cleanly
|
|
// delete the various rdb records if we need to in the future solely
|
|
// based on the titleRec.
|
|
// . force = false
|
|
// . unless we force it, the test is only done at random intervals
|
|
// for performance reasons
|
|
if ( ! *indexCode ) doConsistencyTest ( false );
|
|
// ignore errors from that
|
|
g_errno = 0;
|
|
|
|
|
|
// unregister any sleep callback
|
|
if ( m_registeredSleepCallback ) {
|
|
g_loop.unregisterSleepCallback(m_masterState,indexDocWrapper2);
|
|
m_registeredSleepCallback = false;
|
|
}
|
|
|
|
//////////
|
|
// . add the doledb negative key quickly to our tree to avoid a
|
|
// respider because the msg4 doledb negative key is buffered by msg4
|
|
// . make it negative
|
|
// . well it should not be respidered because the lock is on it!!
|
|
// -- so let's comment this out
|
|
/////////
|
|
/*
|
|
key_t negative = m_doledbKey;
|
|
// make it negative
|
|
negative.n0 &= 0xfffffffffffffffeLL;
|
|
// . store it in our tree if we can
|
|
// . returns false and sets g_errno on error
|
|
// . i.e. g_errno == ETRYAGAIN
|
|
if ( ! m_addedNegativeDoledbRec &&
|
|
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
|
|
NULL,0,m_niceness)){
|
|
log("build: error trying to add to doledb: %s",
|
|
mstrerror(g_errno));
|
|
// set sleep wrapper
|
|
g_loop.registerSleepCallback(1000,m_masterState,
|
|
indexDocWrapper2,m_niceness);
|
|
// note it
|
|
m_registeredSleepCallback = true;
|
|
// sleep and retry
|
|
return false;
|
|
}
|
|
*/
|
|
// we did that
|
|
m_addedNegativeDoledbRec = true;
|
|
|
|
// now add it
|
|
if ( ! m_listAdded && m_metaListSize ) {
|
|
// only call this once
|
|
m_listAdded = true;
|
|
// show it for now
|
|
//printMetaList(m_metaList , m_metaList + m_metaListSize,NULL);
|
|
// test it
|
|
verifyMetaList ( m_metaList ,
|
|
m_metaList + m_metaListSize ,
|
|
false );
|
|
// do it
|
|
if ( ! m_msg4.addMetaList ( m_metaList ,
|
|
m_metaListSize ,
|
|
m_collnum,//cr->m_coll ,
|
|
m_masterState , // state
|
|
m_masterLoop ,
|
|
m_niceness ) ) {
|
|
// spider hang bug
|
|
if ( g_conf.m_testSpiderEnabled )
|
|
logf(LOG_DEBUG,"build: msg4 meta add blocked"
|
|
"msg4=0x%" PTRFMT "" ,(PTRTYPE)&m_msg4);
|
|
m_msg4Waiting = true;
|
|
return false;
|
|
}
|
|
// error with msg4? bail
|
|
if ( g_errno ) return logIt();
|
|
|
|
}
|
|
|
|
// make sure our msg4 is no longer in the linked list!
|
|
if (m_msg4Waiting && isInMsg4LinkedList(&m_msg4)){char *xx=NULL;*xx=0;}
|
|
|
|
if ( m_msg4Waiting && g_conf.m_testSpiderEnabled )
|
|
logf(LOG_DEBUG,"build: msg4=0x%" PTRFMT " returned"
|
|
,(PTRTYPE)&m_msg4);
|
|
|
|
// we are not waiting for the msg4 to return
|
|
m_msg4Waiting = false;
|
|
|
|
bool flush = false;
|
|
|
|
// no longer flush injections.
|
|
// TODO: pass in a flush flag with injection and flush in that
|
|
// case, but for now disable to make things faster. profiler
|
|
// indicates too much msg4 activity.
|
|
//if ( m_contentInjected ) flush = true;
|
|
//if ( m_sreqValid && m_sreq.m_isPageInject ) flush = true;
|
|
|
|
// to keep our qa runs consistent
|
|
if ( strcmp(cr->m_coll,"qatest123") == 0 ) flush = true;
|
|
|
|
if ( ! m_listAdded ) flush = false;
|
|
if ( m_listFlushed ) flush = false;
|
|
|
|
// HACK: flush it if we are injecting it in case the next thing we
|
|
// spider is dependent on this one
|
|
if ( flush ) {
|
|
// note it
|
|
setStatus ( "flushing msg4" );
|
|
// only do it once
|
|
m_listFlushed = true;
|
|
// do it
|
|
if ( ! flushMsg4Buffers ( m_masterState , m_masterLoop ) )
|
|
return false;
|
|
}
|
|
|
|
// . all done with that. core if we block i guess.
|
|
// . but what if we were not the function that set this to begin w/?
|
|
//m_masterLoop = NULL;
|
|
|
|
return logIt();
|
|
/*
|
|
// if not doing exact quotas, we're done
|
|
if ( ! cr->m_exactQuotas ) return logIt();
|
|
|
|
char *isIndexed = getIsIndexed();
|
|
// this means it blocked
|
|
if ( isIndexed == (char *)-1) { char *xx=NULL; *xx=0; }
|
|
// returns NULL with g_errno set
|
|
if ( isIndexed ) return logIt();
|
|
|
|
// otherwise, tell Msg36 to update our quota count for this site
|
|
// so we don't have to keep merging site: termlists
|
|
m_incCount = false;
|
|
m_decCount = false;
|
|
if ( m_indexCode ) m_decCount = true;
|
|
//if ( m_forceDelete ) m_decCount = true;
|
|
|
|
// fix for the exact quota bug found on eurekster collection. bug 229
|
|
// if we're not a new doc, then don't increment the count because
|
|
// we have been already counted as the old doc. MDW: i added the
|
|
// condition that if decCount is true we need to update the count!
|
|
if ( *isIndexed && ! m_decCount ) return logIt();
|
|
|
|
// if it is new and we are not adding it to the index then no need
|
|
// to update any quota count...
|
|
if ( ! *isIndexed && m_decCount ) return logIt();
|
|
|
|
// if not decrementing the count, must be incrementing it then!
|
|
if ( ! m_decCount ) m_incCount = true;
|
|
*/
|
|
// i am not using quotas, so disable this for now
|
|
|
|
/*
|
|
log(LOG_DEBUG,"build: inc'ing quota to REMOTE table "
|
|
"for termIdHost %" UINT64 " termIdDom %" UINT64 " for %s.",
|
|
m_msg16.m_termIdHost,m_msg16.m_termIdDom,m_url.getUrl());
|
|
|
|
setStatus ( "updating quota cache" );
|
|
|
|
// sanity checks
|
|
if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; }
|
|
if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; }
|
|
|
|
// . Msg36 gets the correct count from disk and puts it in cache. It
|
|
// doesn't try to increment or decrement the quotas in cache, because
|
|
// then it would have to be done on all twins, and also the correct
|
|
// split will have to be found.
|
|
// . Actually, we should only use the cache on one host to hold the
|
|
// sum of all splits. This will be the authority cache.
|
|
if ( ! m_updatedCounts ) {
|
|
// only call this once
|
|
m_updatedCounts = true;
|
|
// do it
|
|
if ( ! m_msg36.getTermFreq ( m_coll ,
|
|
0 , // maxAge
|
|
m_msg16.m_termIdHost ,
|
|
this ,
|
|
m_masterLoop ,
|
|
m_niceness ,
|
|
m_exactQuotas ,
|
|
m_incCount ,
|
|
m_decCount ,
|
|
false ))
|
|
// we blocked
|
|
return false;
|
|
// error?
|
|
if ( g_errno ) return logIt();
|
|
}
|
|
|
|
// add the second entry for domain
|
|
if ( ! m_updatedCounts2 ) {
|
|
// only call this once
|
|
m_updateCounts2 = true;
|
|
// do it
|
|
if ( ! m_msg36.getTermFreq ( m_coll ,
|
|
0 , // maxAge
|
|
m_msg16.m_termIdDom ,
|
|
this ,
|
|
doneAddingMsg36Entry2,
|
|
m_niceness ,
|
|
m_exactQuotas ,
|
|
m_incCount ,
|
|
m_decCount ,
|
|
false ))
|
|
// we blocked
|
|
return false;
|
|
// error?
|
|
if ( g_errno ) return logIt();
|
|
}
|
|
|
|
// that is it!
|
|
return logIt();
|
|
*/
|
|
}
|
|
|
|
bool isRobotsTxtFile ( char *u , int32_t ulen ) {
	if ( ulen > 12 && ! strncmp ( u + ulen - 11 , "/robots.txt" , 11 ) )
		return true;
	return false;
}

// does this doc consist of a sequence of smaller sub-docs?
// if so we'll index the subdocs and not the container doc itself.
bool XmlDoc::isContainerDoc ( ) {
	if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
	if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
	//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
	//if ( m_contentDelim ) return true;
	if ( m_contentDelimValid && m_contentDelim ) return true;
	return false;
}
|
|
|
|
// returns false if would block, true otherwise. returns true and sets g_errno on err
|
|
bool XmlDoc::indexContainerDoc ( ) {
|
|
|
|
if ( ! m_contentDelim ) {
|
|
log("build: can not index container doc. no delimeter.");
|
|
g_errno = EBADENGINEER;
|
|
return true;
|
|
}
|
|
|
|
// int8_t *hc = getHopCount();
|
|
// if ( ! hc ) return true; // error?
|
|
// if ( hc == (void *)-1 ) return false;
|
|
// first download
|
|
// in the case of a list of delimited http server replies let's
|
|
// not convert into utf8 here but just use as-is
|
|
char **cpp = getContent();//getUtf8Content();
|
|
// return true with g_errno set on error
|
|
if ( ! cpp ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
// would block? return false then
|
|
if ( cpp == (void *)-1 )
|
|
return false;
|
|
|
|
// need this. it is almost 1MB in size, so alloc it
|
|
if ( ! m_msg7 ) {
|
|
try { m_msg7 = new ( Msg7 ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return true;
|
|
}
|
|
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
|
|
}
|
|
|
|
// inject input parms:
|
|
InjectionRequest *ir = &m_msg7->m_injectionRequest;
|
|
// the cursor for scanning the subdocs
|
|
if ( ! m_anyContentPtr ) {
|
|
// init the content cursor to point to the first subdoc
|
|
m_anyContentPtr = *cpp;
|
|
// but skip over initial separator if there. that is a
|
|
// faux pas
|
|
int32_t dlen = gbstrlen(m_contentDelim);
|
|
if ( strncmp(m_anyContentPtr,m_contentDelim,dlen) == 0 )
|
|
m_anyContentPtr += dlen;
|
|
// init the input parms
|
|
memset ( ir , 0 , sizeof(InjectionRequest) );
|
|
// reset it
|
|
ir->m_spiderLinks = false;
|
|
ir->m_injectLinks = false;
|
|
ir->m_hopCount = 0;//*hc + 1;
|
|
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
|
ir->m_collnum = m_collnum;
|
|
// will this work on a content-delimited doc?
|
|
ir->m_deleteUrl = m_deleteFromIndex;
|
|
// each subdoc will have a mime since it is an arc
|
|
ir->m_hasMime = m_subDocsHaveMime;//true;
|
|
}
|
|
|
|
subdocLoop:
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// EOF?
|
|
if ( m_anyContentPtr == (char *)-1 ) {
|
|
m_indexCode = 0;//m_warcError;
|
|
m_indexCodeValid = true;
|
|
return true;
|
|
}
|
|
|
|
// we had \0 terminated the end of the previous record, so put back
|
|
if ( m_savedChar && ! *m_anyContentPtr ) {
|
|
*m_anyContentPtr = m_savedChar;
|
|
m_anyContentPtr += gbstrlen(m_contentDelim);
|
|
}
|
|
|
|
|
|
// index this subdoc
|
|
ir->ptr_content = m_anyContentPtr;
|
|
|
|
// . should have the url as well.
|
|
// . the url, ip etc. are on a single \n terminated line for an arc!
|
|
char *separator = strstr(m_anyContentPtr,m_contentDelim);
|
|
|
|
|
|
|
|
if ( separator ) {
|
|
m_savedChar = *separator;
|
|
m_anyContentPtr = separator;
|
|
*m_anyContentPtr = '\0';
|
|
//ir->size_content = separator - ir->ptr_content;
|
|
}
|
|
|
|
// if no separator found, this is our last injection
|
|
if ( ! separator ) {
|
|
m_anyContentPtr = (char *)-1;
|
|
}
|
|
|
|
|
|
// these are not defined. will be autoset in set4() i guess.
|
|
ir->m_firstIndexed = 0;
|
|
ir->m_lastSpidered = 0;
|
|
|
|
bool setUrl = false;
|
|
|
|
// HOWEVER, if hasMime is true and an http:// follows
// the delimiter then use that as the url...
// this way we can specify our own urls.
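// so a container doc with per-record urls might look like the sketch
// below (the delimiter string, urls and content here are made up
// purely for illustration):
//
//   ======DELIM======
//   http://example.com/page1.html
//   HTTP/1.0 200 OK
//   Content-Type: text/html
//
//   <html>...</html>
//   ======DELIM======
//   http://example.com/page2.html
//   HTTP/1.0 200 OK
//   ...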
|
|
if ( ir->m_hasMime ) {
|
|
char *du = ir->ptr_content;
|
|
//du += gbstrlen(delim);
|
|
if ( du && is_wspace_a ( *du ) ) du++;
|
|
if ( du && is_wspace_a ( *du ) ) du++;
|
|
if ( du && is_wspace_a ( *du ) ) du++;
|
|
if ( ir->m_hasMime &&
|
|
(strncasecmp( du,"http://",7) == 0 ||
|
|
strncasecmp( du,"https://",8) == 0 ) ) {
|
|
// flag it
|
|
setUrl = true;
|
|
// find end of it
|
|
char *uend = du + 7;
|
|
for ( ; *uend && ! is_wspace_a(*uend) ; uend++ );
|
|
// inject that then
|
|
m_injectUrlBuf.reset();
|
|
m_injectUrlBuf.safeMemcpy ( du , uend - du );
|
|
m_injectUrlBuf.nullTerm();
|
|
// and point to the actual http mime then
|
|
// well, skip that space, right
|
|
ir->ptr_content = uend + 1;
|
|
ir->ptr_url = m_injectUrlBuf.getBufStart();
|
|
ir->size_url = m_injectUrlBuf.length()+1; // include \0
|
|
// if (!strncmp(ir->ptr_url,"http://www.focusinfo.com/"
|
|
// "products/mxprodv" ,40) )
|
|
// log("hey");
|
|
}
|
|
}
|
|
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// make the url from parent url
|
|
// use hash of the content
|
|
int64_t ch64 = hash64n ( ir->ptr_content , 0LL );
|
|
|
|
// need this for an injection
|
|
ir->size_content = gbstrlen(ir->ptr_content) + 1;// improve this?
|
|
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( ! setUrl ) {
|
|
// reset it
|
|
m_injectUrlBuf.reset();
|
|
// by default append a -<ch64> to the provided url
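// (e.g. a parent url of "http://host/feed.list" yields subdoc urls
// like "http://host/feed.list-1234567890123456789", where the suffix
// is just the 64-bit hash of this record's content; the hash value
// shown here is made up)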
|
|
m_injectUrlBuf.safePrintf("%s-%" UINT64 "",
|
|
m_firstUrl.getUrl(),ch64);
|
|
ir->ptr_url = m_injectUrlBuf.getBufStart();
|
|
ir->size_url = m_injectUrlBuf.length()+1; // include \0
|
|
}
|
|
|
|
|
|
bool status = m_msg7->sendInjectionRequestToHost ( ir ,
|
|
m_masterState ,
|
|
m_masterLoop ) ;
|
|
|
|
// it would block, callback will be called later
|
|
if ( status )
|
|
return false;
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// error?
|
|
if ( g_errno ) {
|
|
log("build: index flatfile error %s",mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
else
|
|
log("build: index flatfile did not block");
|
|
|
|
// loop it up
|
|
goto subdocLoop;
|
|
|
|
}
|
|
|
|
|
|
void doneInjectingArchiveRec ( void *state ) {
	Msg7 *THIS = (Msg7 *)state;
	THIS->m_inUse = false;
	XmlDoc *xd = THIS->m_stashxd;
	xd->m_numInjectionsOut--;
	log("build: archive: injection thread returned. %" INT32 " out now.",
	    xd->m_numInjectionsOut);
	// reset g_errno so it doesn't error out in ::indexDoc() when
	// we are injecting a ton of these msg7s and then xmldoc ends up
	// getting reset and when a msg7 reply comes back in, we core
	g_errno = 0;
	xd->m_masterLoop ( xd );
}

void doneReadingArchiveFileWrapper ( int fd, void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// . go back to the main entry function
	// . make sure g_errno is clear from a msg3a g_errno before calling
	//   this lest it abandon the loop
	THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
|
|
#define MAXWARCRECSIZE 5000000
|
|
|
|
bool XmlDoc::readMoreWarc() {
|
|
// We read everything we can off the pipe in a sleep timer.
|
|
// When we have enough to start processing, we call the
|
|
// processing function.
|
|
// If reading gets too far ahead of the processing and we can
|
|
// no longer buffer the read, then we save the offset of what
|
|
// we processed, free the readbuffer and restart the pipe and
|
|
// skip until the offset we last processed
|
|
|
|
if(!m_calledWgetThread) {
|
|
m_pipe = getUtf8ContentInFile();
|
|
}
|
|
|
|
// return true with g_errno set on error
|
|
if ( ! m_pipe ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
log("We don't have the warc pipe.");
|
|
return true;
|
|
}
|
|
|
|
int64_t leftOver = 0;
|
|
int64_t skipAhead = 0;
|
|
|
|
// How much is unprocessed
|
|
if(m_fptr != m_fptrEnd) {
|
|
leftOver = m_fptrEnd - m_fptr;
|
|
}
|
|
if(leftOver < 0) {
|
|
// Happens when we skip a record which is too big
|
|
skipAhead = - leftOver;
|
|
leftOver = 0;
|
|
m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fileBuf;
|
|
}
|
|
|
|
// We don't want to be memmoving the buffer up for every single
|
|
// document we process so only do it when we need it.
|
|
if(leftOver > MAXWARCRECSIZE) return false;
|
|
|
|
int64_t bytesRemaining = m_fileBufAllocSize - (m_fptrEnd - m_fileBuf) - 1;
|
|
// Scoot up everything we haven't processed
|
|
if(bytesRemaining < MAXWARCRECSIZE) {
|
|
//log("scooting up by left over %"INT64, leftOver);
|
|
// count everything we've processed
|
|
m_bytesStreamed += m_fptr - m_fileBuf;
|
|
memmove(m_fileBuf, m_fptr, leftOver);
|
|
m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fileBuf + leftOver;
|
|
*m_fptrEnd = '\0';
|
|
bytesRemaining += leftOver;
|
|
}
|
|
|
|
int64_t toRead = m_fileBufAllocSize - leftOver - 1;
|
|
if(toRead > bytesRemaining) toRead = bytesRemaining;
|
|
|
|
if(toRead == 0) {
|
|
//log("build: not enough room to read, lets process the buffer" );
|
|
return false;
|
|
}
|
|
|
|
|
|
g_loop.disableTimer();
|
|
errno = 0;
|
|
int bytesRead = fread(m_fptrEnd, 1, toRead, m_pipe);
|
|
g_loop.enableTimer();
|
|
|
|
// if(bytesRead > 0) {
|
|
// log("build: warc pipe read %" INT32 " more bytes of the pipe. errno = %s, buf space = %"INT64 " processed = %"INT64 " skipAhead=%"INT64,
|
|
// bytesRead, mstrerror(errno),toRead, m_bytesStreamed, skipAhead);
|
|
// }
|
|
|
|
if(bytesRead <= 0 && errno != EAGAIN) {
|
|
// if(errno == EAGAIN){
|
|
// log("build: fd is not ready, lets process the buffer" );
|
|
// return false;
|
|
// } else {
|
|
if(m_registeredWgetReadCallback) {
|
|
//log("build:came back from read callback");
|
|
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
|
|
m_registeredWgetReadCallback = false;
|
|
}
|
|
|
|
if(m_pipe) {
|
|
int32_t retCode = fclose(m_pipe);
|
|
if(retCode) {
|
|
log("we closed the pipe with error %s", mstrerror(retCode));
|
|
}
|
|
m_pipe = NULL;
|
|
}
|
|
|
|
//log("build: warc problem pipe terminated %s", mstrerror(errno));
|
|
m_hasMoreToRead = false;
|
|
return false;
|
|
// }
|
|
}
|
|
//m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fptrEnd + bytesRead;
|
|
*m_fptrEnd = '\0';
|
|
m_fptr += skipAhead;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// . returns false if would block, true otherwise.
|
|
// . returns true and sets g_errno on err
|
|
// . injectwarc
|
|
bool XmlDoc::indexWarcOrArc ( ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
if ( ! cr->m_indexWarcs ) {
|
|
g_errno = EDOCWARC;
|
|
return true;
|
|
}
|
|
|
|
// This can be a busy loop if we have max injections out but we
|
|
// are getting a read ready callback. Should we unregister
|
|
// when max injections are out and then reregister when we have room?
|
|
int32_t max = g_hostdb.m_numHosts * 2;
|
|
if ( max > MAXMSG7S ) max = MAXMSG7S;
|
|
if ( m_numInjectionsOut >= max ) return false;
|
|
|
|
char ctype;
|
|
if ( m_firstUrl.isWarc() ) {
|
|
ctype = CT_WARC;
|
|
} else {
|
|
ctype = CT_ARC;
|
|
}
|
|
|
|
int8_t *hc = getHopCount();
|
|
if ( ! hc ) return true; // error?
|
|
if ( hc == (void *)-1 ) return false;
|
|
|
|
if ( ! m_fileBuf ) {
|
|
// Do this exactly once.
|
|
m_fileBufAllocSize = (5 * MAXWARCRECSIZE) + 1;
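// i.e. with MAXWARCRECSIZE at 5MB this is a ~25MB streaming buffer,
// enough for a few max-size records plus the unprocessed tail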
|
|
m_fileBuf=(char *)mmalloc(m_fileBufAllocSize ,"sibuf");
|
|
m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fileBuf;
|
|
m_bytesStreamed = 0;
|
|
m_hasMoreToRead = true;
|
|
}
|
|
|
|
if ( ! m_fileBuf ) {
|
|
log("build: failed to alloc buf to read archive file %s",m_firstUrl.getUrl());
|
|
return true;
|
|
}
|
|
|
|
if(m_hasMoreToRead) readMoreWarc();
|
|
|
|
setStatus ("injecting archive records");
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// did an inject return?
|
|
if ( m_doneInjectingWarc ) {
|
|
warcDone:
|
|
// log("build: done parsing %" INT64 " bytes of archive file %s. left over =%"INT32 "done injecting %"INT32 " hasmoretoread %"INT32,
|
|
// m_bytesStreamed + m_fptrEnd - m_fileBuf,
|
|
// m_firstUrl.getUrl(),
|
|
// (int32_t)(m_fptrEnd - m_fptr),
|
|
// (int32_t)m_doneInjectingWarc,
|
|
// (int32_t)m_hasMoreToRead);
|
|
|
|
m_doneInjectingWarc = true;
|
|
|
|
// return if all injects have returned.
|
|
if ( m_numInjectionsOut == 0) { // && !m_hasMoreToRead
|
|
g_errno = m_warcError;
|
|
m_indexCode = m_warcError;
|
|
m_indexCodeValid = true;
|
|
|
|
return true;
|
|
}
|
|
log("build: waiting for injection threads to return.");
|
|
// we would block
|
|
return false;
|
|
}
|
|
|
|
// Dup strings into here so we don't write nulls into our buffer, sometimes we have
|
|
// to rewind over a rec and we want the buf to be the same every time.
|
|
char scratchSpace[1024*10];
|
|
SafeBuf scratch(scratchSpace, 1024*10);
|
|
loop:
|
|
scratch.reset();
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( max > MAXMSG7S ) max = MAXMSG7S;
|
|
// wait for one to come back before launching another msg7
|
|
if ( m_numInjectionsOut >= max ) {
|
|
// Don't need to read anymore so don't call us
|
|
if(m_registeredWgetReadCallback && m_pipe && m_fptr < m_fptrEnd) {
|
|
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
|
|
m_registeredWgetReadCallback = false;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
char *realStart = m_fptr;
|
|
|
|
// need at least say 100k for warc header
|
|
if ( m_fptr + 100000 > m_fptrEnd && m_hasMoreToRead ) {
|
|
//log("build need more of the record to process so sleeping.");
|
|
|
|
if(!m_registeredWgetReadCallback) {
|
|
if(!g_loop.registerReadCallback ( fileno(m_pipe),
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return true;
|
|
}
|
|
log("build: reregistered the read callback. need more");
|
|
m_registeredWgetReadCallback = true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
int64_t recTime = 0;
|
|
char *recIp = NULL;
|
|
char *recUrl = NULL;
|
|
char *recContent = NULL;
|
|
int64_t recContentLen = 0;
|
|
// what we skip over
|
|
uint64_t recSize = 0;
|
|
|
|
//
|
|
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
|
|
//
|
|
//log("buf size is %"INT64 " four chars %c%c%c%c%c%c",
|
|
//m_fptrEnd-m_fptr, m_fptr[0], m_fptr[1], m_fptr[2], m_fptr[3],m_fptr[4],m_fptr[5]);
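// for reference, a WARC "response" record header looks roughly like
// this (field values here are illustrative only); the parsing below
// keys off these exact field names:
//
//   WARC/1.0
//   WARC-Type: response
//   WARC-Target-URI: http://example.com/index.html
//   WARC-Date: 2014-03-20T05:31:08Z
//   WARC-IP-Address: 93.184.216.34
//   Content-Type: application/http; msgtype=response
//   Content-Length: 25042
//
// and the raw http reply (the record content) begins right after the
// blank "\r\n\r\n" line that ends this header.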
|
|
|
|
if ( ctype == CT_WARC ) {
|
|
// find "WARC/1.0" or whatever
|
|
char *whp = m_fptr;
|
|
if( ! whp ) {
|
|
// FIXME: shouldn't get here with a NULL
|
|
log("build: No buffer for file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
// we do terminate last warc rec with \0 so be aware of that...
|
|
int32_t maxCount = 10;
|
|
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++);
|
|
// none?
|
|
if ( ! *whp ) {
|
|
log("build: could not find WARC/1 header start for "
|
|
"file=%s", m_firstUrl.getUrl());
|
|
// we don't really need this and since we force the
|
|
// http reply to end in \0 before calling inject2() on
|
|
// it it gets messed up
|
|
goto warcDone;
|
|
}
|
|
|
|
char *warcHeader = whp;
|
|
|
|
// find end of warc mime HEADER not the content
|
|
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
|
|
if ( ! warcHeaderEnd ) {
|
|
log("build: could not find end of WARC header for "
|
|
"file=%s.",
|
|
m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
// \0 term for strstrs below
|
|
char tmp = *warcHeaderEnd;
|
|
*warcHeaderEnd = '\0';
|
|
|
|
char *warcLen = strstr(warcHeader,"Content-Length:");
|
|
char *warcUrl = strstr(warcHeader,"WARC-Target-URI:");
|
|
char *warcType = strstr(warcHeader,"WARC-Type:");
|
|
char *warcDate = strstr(warcHeader,"WARC-Date:");
|
|
char *warcIp = strstr(warcHeader,"WARC-IP-Address:");
|
|
char *warcCon = strstr(warcHeader,"Content-Type:");
|
|
|
|
|
|
// advance
|
|
if ( warcLen ) warcLen += 15;
|
|
if ( warcUrl ) warcUrl += 16;
|
|
if ( warcType ) warcType += 10;
|
|
if ( warcIp ) warcIp += 17;
|
|
if ( warcCon ) warcCon += 13;
|
|
if ( warcDate ) warcDate += 10;
|
|
|
|
// skip initial spaces spaces
|
|
for ( ; warcUrl && is_wspace_a(*warcUrl ) ; warcUrl ++ );
|
|
for ( ; warcLen && is_wspace_a(*warcLen ) ; warcLen ++ );
|
|
for ( ; warcType && is_wspace_a(*warcType) ; warcType++ );
|
|
for ( ; warcDate && is_wspace_a(*warcDate) ; warcDate++ );
|
|
for ( ; warcIp && is_wspace_a(*warcIp ) ; warcIp ++ );
|
|
for ( ; warcCon && is_wspace_a(*warcCon ) ; warcCon ++ );
|
|
|
|
// get Content-Length: of WARC header for its content
|
|
if ( ! warcLen ) {
|
|
// this is a critical stop.
|
|
log("build: warc problem: could not find WARC Content-Length:");
|
|
goto warcDone;
|
|
}
|
|
|
|
//
|
|
// advance m_fptr to point to the next warc record in case we
|
|
// end up calling 'goto loop' below
|
|
//
|
|
recContent = warcHeaderEnd + 4;
|
|
recContentLen = atoll(warcLen);
|
|
|
|
//log("build content len was %"INT64, recContentLen);
|
|
char *warcContentEnd = recContent + recContentLen;
|
|
recSize = (warcContentEnd - realStart);
|
|
|
|
recUrl = warcUrl;
|
|
|
|
// point to the next warc record
|
|
m_fptr += recSize;
|
|
*warcHeaderEnd = tmp;
|
|
|
|
//log("skipping %"UINT64, recSize);
|
|
// advance the file offset to the next record as well
|
|
|
|
// get WARC-Type:
|
|
// revisit (if url was already done before)
|
|
// request (making a GET or DNS request)
|
|
// response (response to a GET or dns request)
|
|
// warcinfo (crawling parameters, robots: obey, etc)
|
|
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
|
|
if ( ! warcType ) {
|
|
log("build: could not find WARC-Type:");
|
|
goto loop;
|
|
}
|
|
//http://www.mpaa.org/Resources/5bec4ac9-a95e-443b-987b-bff6fb5455a9.pdf
|
|
// get Content-Type:
|
|
// application/warc-fields (fetch time, hops from seed)
|
|
// application/http; msgtype=request (the GET request)
|
|
// application/http; msgtype=response (the GET reply)
|
|
if ( ! warcCon ) {
|
|
log("build: could not find Content-Type:");
|
|
goto loop;
|
|
}
|
|
|
|
if ( ! warcUrl ) {
|
|
// no URI?
|
|
log("build: could not find url");
|
|
goto loop;
|
|
}
|
|
|
|
// if WARC-Type: is not response, skip it. so if it
|
|
// is a revisit then skip it i guess.
|
|
if ( strncmp ( warcType,"response", 8 ) != 0) {
|
|
//log("build: was not type response %s *****%s*****", warcUrl, warcType);
|
|
|
|
// read another warc record
|
|
goto loop;
|
|
}
|
|
|
|
// warcConType needs to be
|
|
// application/http; msgtype=response
|
|
if ( !(strncmp(warcCon,"application/http; msgtype=response",34) == 0 ||
|
|
strncmp(warcCon,"application/http;msgtype=response",33) == 0)) {
|
|
// read another warc record
|
|
//log("build: wrong content type %s ---%s---", warcUrl, warcCon);
|
|
goto loop;
|
|
}
|
|
|
|
recTime = 0;
|
|
if ( warcDate ) recTime = atotime ( warcDate );
|
|
recIp = warcIp;
|
|
}
|
|
// END WARC SPECIFIC PARSING
|
|
|
|
//
|
|
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
|
|
//
|
|
if ( ctype == CT_ARC ) {
|
|
// find \n\nhttp://
|
|
char *whp = m_fptr;
|
|
for ( ; *whp ; whp++ ) {
|
|
if ( whp[0] != '\n' ) continue;
|
|
if ( strncmp(whp+1,"http://",7) == 0) break;
|
|
if ( strncmp(whp+1,"https://",8) == 0) break;
|
|
}
|
|
// none?
|
|
if ( ! *whp ) {
|
|
log("build: arc: could not find next \\nhttp:// in "
|
|
"arc file %s",m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
char *arcHeader = whp;
|
|
// find end of arc header not the content
|
|
char *arcHeaderEnd = strstr(arcHeader+1,"\n");
|
|
if ( ! arcHeaderEnd ) {
|
|
log("build: warc problem: could not find end of ARC header. file=%s",
|
|
m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
// \0 term for strstrs below
|
|
char tmp = *arcHeaderEnd;
|
|
*arcHeaderEnd = '\0';
|
|
char *arcContent = arcHeaderEnd + 1;
|
|
// parse arc header line
|
|
char *url = arcHeader + 1;
|
|
char *hp = url;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 1.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
url = scratch.pushStr(url, hp-url);
|
|
hp++;
|
|
|
|
char *ipStr = hp;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 2.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
ipStr = scratch.pushStr(ipStr, hp - ipStr);
|
|
hp++;
|
|
|
|
char *timeStr = hp;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 3.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
timeStr = scratch.pushStr(timeStr, hp - timeStr);
|
|
hp++;
|
|
|
|
char *arcConType = hp;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 4.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
arcConType = scratch.pushStr(arcConType, hp - arcConType);
|
|
hp++;
|
|
|
|
char *arcContentLenStr = hp;
|
|
// get arc content len
|
|
int64_t arcContentLen = atoll(arcContentLenStr);
|
|
char *arcContentEnd = arcContent + arcContentLen;
|
|
//uint64_t oldOff = s_off;
|
|
recSize = (arcContentEnd - realStart);
|
|
// point to the next arc record
|
|
m_fptr += recSize;
|
|
*arcHeaderEnd = tmp;
|
|
// advance the file offset to the next record as well
|
|
// arcConType needs to be indexable
|
|
int32_t ct = getContentTypeFromStr ( arcConType );
|
|
if ( ct != CT_HTML &&
|
|
ct != CT_TEXT &&
|
|
ct != CT_XML &&
|
|
ct != CT_PDF &&
|
|
ct != CT_XLS &&
|
|
ct != CT_PPT &&
|
|
ct != CT_PS &&
|
|
ct != CT_DOC &&
|
|
ct != CT_JSON ) {
|
|
// read another arc record
|
|
log("build: was not indexable response %s", arcConType);
|
|
goto loop;
|
|
}
|
|
// convert to timestamp
|
|
// this time structure, once filled, will help yield a time_t
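// the arc timestamp is of the form YYYYMMDDhhmmss, so parse it as
// fixed-width fields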
|
|
struct tm t;
|
|
// DAY OF MONTH
|
|
t.tm_mday = atol2 ( timeStr + 6 , 2 );
|
|
// MONTH - tm_mon is 0-11 but the arc timestamp month is 01-12
t.tm_mon = atol2 ( timeStr + 4 , 2 ) - 1;
|
|
// YEAR - # of years since 1900
|
|
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
|
|
// TIME
|
|
t.tm_hour = atol2 ( timeStr + 8 , 2 );
|
|
t.tm_min = atol2 ( timeStr + 10 , 2 );
|
|
t.tm_sec = atol2 ( timeStr + 12 , 2 );
|
|
// unknown if we're in daylight savings time
|
|
t.tm_isdst = -1;
|
|
// translate using timegm() since the timestamp is in UTC
|
|
recTime = timegm ( &t );
|
|
// set content as well
|
|
recContent = arcContent;
|
|
recContentLen = arcContentLen;
|
|
recUrl = url;
|
|
recIp = ipStr;
|
|
}
|
|
// END ARC SPECIFIC PARSING

// must be http not dns:
|
|
// url must start with http:// or https://
|
|
// it's probably like WARC-Target-URI: dns:www.xyz.com
|
|
// so it is a dns response
|
|
if ( strncmp(recUrl,"http://" ,7) != 0 &&
|
|
strncmp(recUrl,"https://",8) != 0 )
|
|
goto loop;
|
|
|
|
// get length of it, null term it
|
|
char *recUrlEnd = recUrl;
|
|
for ( ; *recUrlEnd && ! is_wspace_a(*recUrlEnd) ; recUrlEnd++ );
|
|
int32_t recUrlLen = recUrlEnd - recUrl;
|
|
//*recUrlEnd = '\0';
|
|
|
|
// skip if robots.txt
|
|
if ( isRobotsTxtFile( recUrl , recUrlLen ) )
|
|
goto loop;
|
|
|
|
// how can there be no more to read?
|
|
if ( m_fptr > m_fptrEnd && ! m_hasMoreToRead ) {
|
|
log("build: warc problem: archive file %s exceeded file length.",
|
|
m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
|
|
// if the record extends past what we read and is too big to ever
// fit in the buffer, skip it and go on to the next record
|
|
if ( m_fptr > m_fptrEnd && recSize > MAXWARCRECSIZE ) {
|
|
log("build: skipping archive file of %" INT64 " "
|
|
"bytes which is too big",recSize);
|
|
|
|
if(!m_registeredWgetReadCallback) {
|
|
if(!g_loop.registerReadCallback ( fileno(m_pipe),
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return true;
|
|
}
|
|
log("build: reregistered the read callback. skip bigrec");
|
|
m_registeredWgetReadCallback = true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// don't read the next record, read THIS one again, we can fit it
|
|
if ( m_fptr > m_fptrEnd ) {
|
|
//log("build: record end is past the end of what we read by %"INT64 " %"UINT64, m_fptrEnd - m_fptr, recSize);
|
|
m_fptr -= recSize;
|
|
|
|
if(!m_registeredWgetReadCallback) {
|
|
if(!g_loop.registerReadCallback ( fileno(m_pipe),
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return true;
|
|
}
|
|
log("build: reregistered the read callback. reread this record");
|
|
m_registeredWgetReadCallback = true;
|
|
}
return false;
|
|
}
|
|
|
|
char *httpReply = recContent;
|
|
int64_t httpReplySize = recContentLen;
|
|
|
|
// should be a mime that starts with GET or POST
|
|
HttpMime m;
|
|
if ( ! m.set ( httpReply , httpReplySize , NULL ) ) {
|
|
log("build: archive: failed to set http mime at in "
|
|
"file");
|
|
goto loop;
|
|
}
|
|
|
|
// check content type
|
|
int ct2 = m.getContentType();
|
|
if ( ct2 != CT_HTML &&
|
|
ct2 != CT_TEXT &&
|
|
ct2 != CT_XML &&
|
|
ct2 != CT_PDF &&
|
|
ct2 != CT_XLS &&
|
|
ct2 != CT_PPT &&
|
|
ct2 != CT_PS &&
|
|
ct2 != CT_DOC &&
|
|
ct2 != CT_JSON ) {
|
|
//log("build:got wrong type %"INT32, (int32_t)ct2);
|
|
goto loop;
|
|
}
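// each archived record is injected with its own Msg7; we keep a small
// pool of up to MAXMSG7S of them so several injections can be in
// flight at once, reusing any that are no longer in use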
// grab an available msg7
|
|
Msg7 *msg7 = NULL;
|
|
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) {
|
|
msg7 = m_msg7s[i];
|
|
// if we got an available one stop
|
|
if ( msg7 ) {
|
|
if( msg7->m_inUse ) continue;
|
|
break; // reuse this one.
|
|
}
|
|
// ok, create one, 1MB each about
|
|
try { msg7 = new ( Msg7 ); }
|
|
catch ( ... ) {g_errno=ENOMEM;m_warcError=g_errno;return true;}
|
|
mnew ( msg7 , sizeof(Msg7),"xdmsgs7");
|
|
|
|
// store it for re-use
|
|
m_msg7s[i] = msg7;
|
|
break;
|
|
}
|
|
|
|
if(!msg7 || msg7->m_inUse) {
|
|
// shouldn't happen, but it does... why?
|
|
log("build: archive: Ran out of msg7s to inject doc.");
|
|
return false;
|
|
}
|
|
|
|
// inject input parms:
|
|
InjectionRequest *ir = &msg7->m_injectionRequest;
|
|
// reset it
|
|
ir->m_hopCount = *hc + 1;
|
|
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
|
ir->m_collnum = m_collnum;
|
|
// will this work on a content-delimited doc?
|
|
ir->m_deleteUrl = m_deleteFromIndex;
|
|
// each subdoc will have a mime since it is a warc
|
|
ir->m_hasMime = true;
|
|
// it has a mime so we shouldn't need to set this
|
|
ir->ptr_contentTypeStr = NULL;
|
|
// we are injecting a single page, not a container file
|
|
ir->ptr_contentDelim = NULL;
|
|
// miscellaneous. faster than memsetting the whole gr class (32k)
|
|
ir->m_getSections = 0;
|
|
ir->m_gotSections = 0;
|
|
ir->m_spiderLinks = false;
|
|
ir->m_injectLinks = false;
|
|
ir->m_shortReply = false;
|
|
ir->m_newOnly = false;
|
|
ir->m_recycle = false;
|
|
ir->m_dedup = true;
|
|
ir->m_doConsistencyTesting = false;
|
|
ir->m_charset = 0;
|
|
|
|
ir->ptr_queryToScrape = NULL;
|
|
ir->ptr_contentFile = NULL;
|
|
ir->ptr_diffbotReply = NULL;
// Stick the capture date in the metadata
|
|
StackBuf(newKey);
|
|
newKey.safePrintf("\"gbcapturedate\":%" INT64 , recTime);
|
|
SafeBuf newMetadata(newKey.length() * 2 + size_metadata, "ModifiedMetadata");
|
|
|
|
newMetadata.safeMemcpy(ptr_metadata, size_metadata);
|
|
Json::prependKey(newMetadata, newKey.getBufStart());
|
|
|
|
ir->ptr_metadata = newMetadata.getBufStart();
|
|
ir->size_metadata = newMetadata.length();
|
|
|
|
newMetadata.nullTerm();
|
|
// set 'timestamp' for injection
|
|
ir->m_firstIndexed = recTime;
|
|
ir->m_lastSpidered = recTime;
// set 'ip' for injection
|
|
|
|
ir->m_injectDocIp = 0;
|
|
// get the record IP address from the warc header if there
|
|
if ( recIp ) {
|
|
// get end of ip
|
|
char *ipEnd = recIp;
|
|
// skip digits and periods
|
|
while ( *ipEnd && ! is_wspace_a(*ipEnd) ) ipEnd++;
|
|
// we now have the ip address for doing ip: searches
|
|
// this func is in ip.h
|
|
ir->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
|
|
}
|
|
|
|
// we end up repopulating m_fileBuf to read the next warc sometimes
|
|
// so do not destroy the content we are injecting from the original
|
|
// m_fileBuf. so we have to copy it.
|
|
msg7->m_contentBuf.reset();
|
|
msg7->m_contentBuf.reserve ( httpReplySize + 5 );
|
|
msg7->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
|
|
msg7->m_contentBuf.nullTerm();
|
|
|
|
// set 'content' for injection
|
|
ir->ptr_content = msg7->m_contentBuf.getBufStart();
|
|
ir->size_content = msg7->m_contentBuf.getLength() + 1;
|
// set the rest of the injection parms
|
|
ir->m_hopCount = -1;
|
|
ir->m_newOnly = 0;
|
|
// all warc records have the http mime
|
|
ir->m_hasMime = true;
|
|
|
|
ir->ptr_url = recUrl;
|
|
ir->size_url = recUrlLen+1;
|
|
|
|
// stash this
|
|
msg7->m_stashxd = this;
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// log it
|
|
*recUrlEnd = '\0';
|
|
log("build: archive: injecting archive url %s",recUrl);
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if (msg7->sendInjectionRequestToHost(ir,msg7,doneInjectingArchiveRec)){
|
|
m_numInjectionsOut++;
|
|
msg7->m_inUse = true;
|
|
goto loop;
|
|
}
|
|
|
|
log("build: index archive: msg7 inject: %s",
|
|
mstrerror(g_errno));
|
|
|
|
goto loop;
|
|
}

void getTitleRecBufWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in get title rec wrapper" );
|
|
// return if it blocked
|
|
if ( THIS->getTitleRecBuf() == (void *)-1 ) return;
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
key_t *XmlDoc::getTitleRecKey() {
|
|
if ( m_titleRecBufValid ) return &m_titleRecKey;
|
|
SafeBuf *tr = getTitleRecBuf();
|
|
if ( ! tr || tr == (void *)-1 ) return (key_t *)tr;
|
|
return &m_titleRecKey;
|
|
}

int32_t *XmlDoc::getIndexCode ( ) {
|
|
|
|
int32_t *indexCode = getIndexCode2();
|
|
if ( ! indexCode || indexCode == (void *)-1 ) return indexCode;
|
|
|
|
// if zero good!
|
|
if ( *indexCode == 0 ) return indexCode;
|
|
|
|
//
|
|
// should we neutralize it?
|
|
//
|
|
// in the case of indexing dmoz urls outputted from
|
|
// 'dmozparse urldump -s' it outputs a meta tag
|
|
// (<meta name=ignorelinksexternalerrors content=1>) that
|
|
// indicates to index the links even in the case of some errors,
|
|
// so that we can be assured to have exactly the same urls the dmoz
|
|
// has in our index. so when we do a gbcatid:xxx query we get the same
|
|
// urls in the search results that dmoz has for that category id.
|
|
if ( ! m_sreqValid || ! m_sreq.m_ignoreExternalErrors )
|
|
return indexCode;
|
|
|
|
// only neutralize certain errors
|
|
if ( * indexCode != EDNSTIMEDOUT
|
|
&& *indexCode != ETCPTIMEDOUT
|
|
&& *indexCode != EUDPTIMEDOUT
|
|
// from m_redirError
|
|
&& *indexCode != EDOCSIMPLIFIEDREDIR
|
|
&& *indexCode != EDOCNONCANONICAL
|
|
&& *indexCode != EDNSDEAD
|
|
&& *indexCode != ENETUNREACH
|
|
&& *indexCode != EHOSTUNREACH
|
|
&& *indexCode != EDOCFILTERED
|
|
&& *indexCode != EDOCREPEATSPAMMER
|
|
&& *indexCode != EDOCDUP
|
|
&& *indexCode != EDOCISERRPG
|
|
&& *indexCode != EDOCHIJACKED
|
|
&& *indexCode != EDOCBADHTTPSTATUS
|
|
&& *indexCode != EDOCDISALLOWED
|
|
&& *indexCode != EBADCHARSET
|
|
&& *indexCode != EDOCDUPWWW
|
|
&& *indexCode != EBADIP
|
|
&& *indexCode != EDOCEVILREDIRECT // fix video.google.com dmoz
|
|
&& *indexCode != EBADMIME
|
|
// index.t and .exe files are in dmoz but those
|
|
// extensions are "bad" according to Url::isBadExtension()
|
|
&& *indexCode != EDOCBADCONTENTTYPE
|
|
// repeat url path components are ok:
|
|
&& *indexCode != ELINKLOOP
|
|
&& *indexCode != ECONNREFUSED
|
|
// malformed sections:
|
|
&& *indexCode != EDOCBADSECTIONS
|
|
&& *indexCode != ECORRUPTHTTPGZIP
|
|
)
|
|
return indexCode;
|
|
|
|
// ok, neutralize it
|
|
*indexCode = 0;
|
|
|
|
// if we could not get an ip we need to make a fake one
|
|
if ( ! m_ipValid || m_ip == 0 || m_ip == -1 ) {
|
|
log("build: ip unattainable. forcing ip address of %s "
|
|
"to 10.5.123.45",m_firstUrl.m_url);
|
|
m_ip = atoip("10.5.123.45");
|
|
m_ipValid = true;
|
|
}
|
|
|
|
// make certain things valid to avoid core in getNewSpiderReply()
|
|
if ( ! m_crawlDelayValid ) {
|
|
m_crawlDelayValid = true;
|
|
m_crawlDelay = -1;
|
|
}
|
|
|
|
return indexCode;
|
|
}
|
|
|
|
|
|
// . return NULL and sets g_errno on error
|
|
// . returns -1 if blocked
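// . runs the doc through a cascade of checks (bad/too-long url, tagdb
//   ban, ip, robots.txt, download status, redirects, charset, dup and
//   canonical detection, unchanged content, sections, spider priority)
//   and records the first applicable error in m_indexCode, 0 if none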
|
|
int32_t *XmlDoc::getIndexCode2 ( ) {
|
|
|
|
// return it now if we got it already
|
|
if ( m_indexCodeValid ) return &m_indexCode;
|
|
|
|
setStatus ( "getting index code");
|
|
|
|
// page inject can set deletefromindex to true
|
|
if ( m_deleteFromIndex ) {
|
|
m_indexCode = EDOCFORCEDELETE;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block and return -1, we will be re-called from the top
|
|
//if ( ! m_masterLoop ) {
|
|
// m_masterLoop = getTitleRecWrapper;
|
|
// m_masterState = this;
|
|
//}
|
|
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( m_firstUrl.m_ulen <= 5 ) {
|
|
m_indexCode = EBADURL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
if ( m_firstUrl.m_ulen + 1 >= MAX_URL_LEN ) {
|
|
m_indexCode = EURLTOOLONG;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// "url is repeating path components" error?
|
|
if ( ! m_check1 ) {
|
|
m_check1 = true;
|
|
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
|
|
m_indexCode = ELINKLOOP;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// fix for "http://.xyz.com/...."
|
|
if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
|
|
m_indexCode = EBADURL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
|
|
m_check2 = true;
|
|
if ( m_firstUrl.isSpam() ) {
|
|
m_indexCode = EDOCURLSPAM;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// . don't spider robots.txt urls for indexing!
|
|
// . quickly see if we are a robots.txt url originally
|
|
int32_t fulen = getFirstUrl()->getUrlLen();
|
|
char *fu = getFirstUrl()->getUrl();
|
|
char *fp = fu + fulen - 11;
|
|
if ( fulen > 12 &&
|
|
fp[1] == 'r' &&
|
|
! strncmp ( fu + fulen - 11 , "/robots.txt" , 11 )) {
|
|
m_indexCode = EBADURL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// if this is an injection and "newonly" is not zero then we
|
|
// only want to do the injection if the url is "new", meaning not
|
|
// already indexed. "m_wasContentInjected" will be true if this is
|
|
// an injection. "m_newOnly" will be true if the injector only
|
|
// wants to proceed with the injection if this url is not already
|
|
// indexed.
|
|
if ( m_wasContentInjected && m_newOnly ) {
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
|
|
XmlDoc *od = *pod;
|
|
// if the old doc does exist and WAS NOT INJECTED itself
|
|
// then abandon this injection. it was spidered the old
|
|
// fashioned way and we want to preserve it and NOT overwrite
|
|
// it with this injection.
|
|
if ( od && ! od->m_wasContentInjected ) {
|
|
m_indexCode = EABANDONED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
// if it was injected itself, only abandon this injection
|
|
// in the special case that m_newOnly is "1". otherwise
|
|
// if m_newOnly is 2 then we will overwrite any existing
|
|
// titlerecs that were not injected themselves.
|
|
if ( od && od->m_wasContentInjected && m_newOnly == 1 ) {
|
|
m_indexCode = EABANDONED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
}
|
|
|
|
// need tagrec to see if banned
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// this is an automatic ban!
|
|
if ( gr->getLong("manualban",0) ) {
|
|
m_indexCode = EDOCBANNED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
|
|
// get the ip of the current url
|
|
int32_t *ip = getIp ( );
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
|
|
if ( *ip == 0 ) {
|
|
m_indexCode = EBADIP;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . check robots.txt
|
|
// . uses the current url
|
|
// . if we end in /robots.txt then this quickly returns true
|
|
// . no, we still might want to index if we got link text, so just
|
|
// check this again below
|
|
bool *isAllowed = getIsAllowed();
|
|
if ( ! isAllowed || isAllowed == (void *)-1) return (int32_t *)isAllowed;
|
|
/*
|
|
if ( ! *isAllowed ) {
|
|
m_indexCode = EDOCDISALLOWED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
*/
|
|
|
|
// . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc.
|
|
// . this will be the reply from diffbot.com if using diffbot
|
|
int32_t *dstatus = getDownloadStatus();
|
|
if ( ! dstatus || dstatus == (void *)-1 ) return (int32_t *)dstatus;
|
|
if ( *dstatus ) {
|
|
m_indexCode = *dstatus;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// check the mime
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (int32_t *)mime;
|
|
// no, now the smart compression will nuke a reply if it has
|
|
// no good date or for other reasons...
|
|
// if empty, bad mime
|
|
//if ( mime->getMimeLen() <= 0 && ! m_recycleContent ) {
|
|
// m_indexCode = EBADMIME;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
//}
|
|
|
|
// check redir url
|
|
Url **redirp = getRedirUrl();
|
|
if ( ! redirp || redirp == (void *)-1 ) return (int32_t *)redirp;
|
|
// this must be valid now
|
|
if ( ! m_redirErrorValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_redirError ) {
|
|
m_indexCode = m_redirError;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (void *)-1 ) return (int32_t *)d;
|
|
if ( *d == 0LL ) {
|
|
m_indexCode = ENODOCID;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . is the same url but with a www. present already in titledb?
|
|
// . example: if we are xyz.com and www.xyz.com is already in titledb
|
|
// then nuke ourselves by setting m_indexCode to EDOCDUPWWW
|
|
char *isWWWDup = getIsWWWDup ();
|
|
if ( ! isWWWDup || isWWWDup == (char *)-1) return (int32_t *)isWWWDup;
|
|
if ( *isWWWDup ) {
|
|
m_indexCode = EDOCDUPWWW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
|
|
uint16_t *charset = getCharset();
|
|
if ( ! charset && g_errno == EBADCHARSET ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBADCHARSET;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if ( ! charset || charset == (void *)-1) return (int32_t *)charset;
|
|
// we had a 2024 for charset come back and that had a NULL
|
|
// get_charset_str() but it was not supported
|
|
if ( ! supportedCharset(*charset) ) { //&&get_charset_str(*charset) ) {
|
|
m_indexCode = EBADCHARSET;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// get local link info
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int32_t *)info1;
|
|
// get remote link info
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (int32_t *)pinfo2;
|
|
LinkInfo *info2 = *pinfo2;
|
|
|
|
// if robots.txt said no, and if we had no link text, then give up
|
|
bool disallowed = true;
|
|
if ( *isAllowed ) disallowed = false;
|
|
if ( info1 && info1->hasLinkText() ) disallowed = false;
|
|
if ( info2 && info2->hasLinkText() ) disallowed = false;
|
|
// if we generated a new sitenuminlinks to store in tagdb, we might
|
|
// want to add this for that only reason... consider!
|
|
if ( disallowed ) {
|
|
m_indexCode = EDOCDISALLOWED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// check for bad url extension, like .jpg
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (int32_t *)cu;
|
|
|
|
// take this check out because it is hurting
|
|
// http://community.spiceworks.com/profile/show/Mr.T
|
|
// because 't' was in the list of bad extensions.
|
|
// now we use the url filters table to exclude the extensions we want.
|
|
// and we use the 'ismedia' directive to exclude common media
|
|
// extensions. having this check here is no longer needed and confusing
|
|
// BUT on the other hand stuff like .exe .rpm .deb is good to avoid!
|
|
// so i'll just edit the list to remove more ambiguous extensions
|
|
// like .f and .t
|
|
bool badExt = cu->isBadExtension ( m_version );
|
|
if ( badExt && ! info1->hasLinkText() &&
|
|
( ! info2 || ! info2->hasLinkText() ) ) {
|
|
m_indexCode = EDOCBADCONTENTTYPE;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
int16_t *hstatus = getHttpStatus();
|
|
if ( ! hstatus || hstatus == (void *)-1 ) return (int32_t *)hstatus;
|
|
if ( *hstatus != 200 ) {
|
|
m_indexCode = EDOCBADHTTPSTATUS;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// debug point
|
|
//if ( cr->m_localCrawlInfo.m_pageDownloadAttempts >= 2 ) {
|
|
// m_indexCode = ETCPTIMEDOUT;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
//}
|
|
|
|
// if this page is hijacked, toss it!
|
|
char *hj = getIsHijacked();
|
|
if ( ! hj || hj == (char *)-1 ) return (int32_t *)hj;
|
|
// if not allowed m_indexCode will be set
|
|
if ( *hj ) {
|
|
m_indexCode = EDOCHIJACKED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// check for EDOCISERRPG (custom error pages)
|
|
char *isErrorPage = getIsErrorPage();
|
|
if ( !isErrorPage||isErrorPage==(void *)-1) return (int32_t *)isErrorPage;
|
|
if ( *isErrorPage ) {
|
|
m_indexCode = EDOCISERRPG;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . i moved this up to perhaps fix problems of two dup pages being
|
|
// downloaded at about the same time
|
|
// . are we a dup of another doc from any other site already indexed?
|
|
char *isDup = getIsDup();
|
|
if ( ! isDup || isDup == (char *)-1 ) return (int32_t *)isDup;
|
|
if ( *isDup ) {
|
|
m_indexCode = EDOCDUP;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . is a non-canonical page that has <link href=xxx rel=canonical>
// . also sets m_canonicalUrl.m_url to it if we are not
|
|
// . returns NULL if we are the canonical url
|
|
// . do not do this check if the page was injected
|
|
bool checkCanonical = true;
|
|
if ( m_wasContentInjected ) checkCanonical = false;
|
|
if ( m_isInjecting && m_isInjectingValid ) checkCanonical = false;
|
|
// do not do canonical deletion if recycling content either i guess
|
|
if ( m_sreqValid && m_sreq.m_recycleContent ) checkCanonical = false;
|
|
// do not delete from being canonical if doing a query reindex
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex ) checkCanonical = false;
|
|
if ( checkCanonical ) {
|
|
Url **canon = getCanonicalRedirUrl();
|
|
if ( ! canon || canon == (void *)-1 ) return (int32_t *)canon;
|
|
// if there is one then we are its leaf; it is the primary
|
|
// page so we should not index ourselves
|
|
if ( *canon ) {
|
|
m_indexCode = EDOCNONCANONICAL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// was page unchanged since last time we downloaded it?
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
|
|
XmlDoc *od = NULL;
|
|
if ( *pod ) od = *pod;
|
|
|
|
// if recycling content is true you gotta have an old title rec.
|
|
if ( ! od && m_recycleContent ) {
|
|
m_indexCode = ENOTITLEREC;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
bool check = true;
|
|
if ( ! od ) check = false;
|
|
// do not do this logic for diffbot because it might want to get
|
|
// the diffbot reply even if page content is the same, because it
|
|
// might have an ajax call that updates the product price.
|
|
// onlyProcessIfNewUrl defaults to true, so typically even diffbot
|
|
// crawls will do this check.
|
|
if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl &&
|
|
// but allow urls like *-diffbotxyz2445187448 to be deduped,
|
|
// that is the whole point of this line
|
|
! m_isDiffbotJSONObject )
|
|
check = false;
|
|
if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError )
|
|
check = false;
|
|
// or if recycling content turn this off as well! otherwise
|
|
// it will always be 100% the same
|
|
if ( m_recycleContent )
|
|
check = false;
|
|
// never check for a bulk job
|
|
if ( cr->m_isCustomCrawl == 2 )
|
|
check = false;
|
|
|
|
if ( check ) {
|
|
// check inlinks now too!
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 )
|
|
return (int32_t *)info1;
|
|
LinkInfo *info2 = od->getLinkInfo1 ();
|
|
if ( ! info2 || info2 == (LinkInfo *)-1 )
|
|
return (int32_t *)info2;
|
|
Inlink *k1 = NULL;
|
|
Inlink *k2 = NULL;
|
|
char *s1, *s2;
|
|
int32_t len1,len2;
|
|
if ( info1->getNumGoodInlinks() !=
|
|
info2->getNumGoodInlinks() )
|
|
goto changed;
|
|
for ( ; k1=info1->getNextInlink(k1) ,
|
|
k2=info2->getNextInlink(k2); ) {
|
|
if ( ! k1 )
|
|
break;
|
|
if ( ! k2 )
|
|
break;
|
|
if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks )
|
|
goto changed;
|
|
s1 = k1->getLinkText();
|
|
len1 = k1->size_linkText - 1; // exclude \0
|
|
s2 = k2->getLinkText();
|
|
len2 = k2->size_linkText - 1; // exclude \0
|
|
if ( len1 != len2 )
|
|
goto changed;
|
|
if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
|
|
goto changed;
|
|
}
|
|
// no change in link text, look for change in page content now
|
|
int32_t *ch32 = getContentHash32();
|
|
if ( ! ch32 || ch32 == (void *)-1 ) return (int32_t *)ch32;
|
|
if ( *ch32 == od->m_contentHash32 ) {
|
|
m_indexCode = EDOCUNCHANGED;
|
|
m_indexCodeValid = true;
|
|
// hack these values on or off.
|
|
// really should be function calls.
|
|
// but it never gets set when it should if the
|
|
// doc is unchanged.
|
|
m_sentToDiffbot = od->m_sentToDiffbot;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
changed:
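// we get here when the inlink text or page content differs from the
// old indexed doc, or when that comparison was skipped (no old doc,
// recycling content, custom crawl, etc.)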
|
|
// words
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (int32_t *)words;
|
|
|
|
// we set the D_IS_IN_DATE flag for these bits
|
|
Bits *bits = getBits(); if ( ! bits ) return NULL;
|
|
|
|
// . check for date buffer overflow before setting sections
|
|
// . returns false and sets g_errno on error
|
|
/*
|
|
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits )) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: parseDates: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
if ( g_errno != EBUFOVERFLOW ) return NULL;
|
|
g_errno = 0;
|
|
m_indexCode = EDOCBADDATES;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
*/
|
|
|
|
// bad sections? fixes http://www.beerexpedition.com/northamerica.shtml
|
|
// being continuously respidered when its lock expires every
|
|
// MAX_LOCK_AGE seconds
|
|
Sections *sections = getSections();
|
|
// on EBUFOVERFLOW we will NEVER be able to parse this url
|
|
// correctly so do not retry!
|
|
if ( ! sections && g_errno == EBUFOVERFLOW ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if (!sections||sections==(Sections *)-1) return (int32_t *)sections;
|
|
if ( sections->m_numSections == 0 && words->m_numWords > 0 ) {
|
|
m_indexCode = EDOCBADSECTIONS;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// i think an oom error is not being caught by Sections.cpp properly
|
|
if ( g_errno ) { char *xx=NULL;*xx=0; }
|
|
|
|
Dates *dp = getDates();
|
|
if ( ! dp && g_errno == EBUFOVERFLOW ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
|
|
|
|
// make sure address buffers did not overflow
|
|
Addresses *aa = getAddresses ();
|
|
if ( (! aa && g_errno == EBUFOVERFLOW) ||
|
|
// it sets m_breached now if there's a problem
|
|
(aa && aa->m_breached) ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if ( ! aa || aa == (void *)-1 ) return (int32_t *)aa;
|
|
|
|
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (int32_t *)isRoot;
|
|
|
|
// get the tag rec
|
|
//TagRec *gr = getTagRec ();
|
|
//if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
|
|
bool spamCheck = true;
|
|
// if we are a root, allow repeat spam
|
|
if ( *isRoot ) spamCheck = false;
|
|
// if we are being spidered deep, allow repeat spam
|
|
if ( gr->getLong("deep",0) ) spamCheck = false;
|
|
// not for crawlbot
|
|
if ( cr->m_isCustomCrawl ) spamCheck = false;
|
|
// only html for now
|
|
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
|
|
// turn this off for now
|
|
spamCheck = false;
|
|
// otherwise, check the weights
|
|
if ( spamCheck ) {
|
|
char *ws = getWordSpamVec();
|
|
if ( ! ws || ws == (void *)-1 ) return (int32_t *)ws;
|
|
if ( m_isRepeatSpammer ) {
|
|
m_indexCode = EDOCREPEATSPAMMER;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// validate this here so getSpiderPriority(), which calls
|
|
// getUrlFilterNum(), which calls getNewSpiderReply(), which calls
|
|
// us, getIndexCode() does not repeat all this junk
|
|
//m_indexCodeValid = true;
|
|
//m_indexCode = 0;
|
|
|
|
// fix query reindex on global-index from coring because
|
|
// the spider request is null
|
|
if ( m_isDiffbotJSONObject ) {
|
|
m_indexCode = 0;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
|
|
// this needs to be last!
|
|
int32_t *priority = getSpiderPriority();
|
|
if ( ! priority || priority == (void *)-1) {
|
|
// allow this though
|
|
if ( g_errno == EBUFOVERFLOW ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
// but if it blocked, then un-validate it
|
|
m_indexCodeValid = false;
|
|
// and return to be called again i hope
|
|
return (int32_t *)priority;
|
|
}
|
|
if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED
|
|
m_indexCode = EDOCFILTERED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
// if ( *priority == SPIDER_PRIORITY_BANNED ) {
|
|
// m_indexCode = EDOCBANNED;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
// }
|
|
|
|
// . if using diffbot and the diffbot reply had a time out error
|
|
// or otherwise... diffbot failure demands a re-try always i guess.
|
|
// put this above getSpiderPriority() call otherwise we end up in
|
|
// a recursive loop with getIndexCode() and getNewSpiderReply()
|
|
// . NO, don't do this anymore, however, if there is a diffbot
|
|
// reply error then record it in the spider reply BUT only if it is
|
|
// a diffbot reply error that warrants a retry. for instance,
|
|
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
|
|
// error trying to download the page so it probably should not
|
|
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
|
|
// SafeBuf *dbr = getDiffbotReply();
|
|
// if ( ! dbr || dbr == (void *)-1 ) return (int32_t *)dbr;
|
|
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
|
|
// m_indexCode= m_diffbotReplyError;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
// }
|
|
|
|
// no error otherwise
|
|
m_indexCode = 0;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}

char *XmlDoc::prepareToMakeTitleRec ( ) {
|
|
// do not re-call this for speed
|
|
if ( m_prepared ) return (char *)1;
|
|
|
|
int32_t *indexCode = getIndexCode();
|
|
if (! indexCode || indexCode == (void *)-1) return (char *)indexCode;
|
|
if ( *indexCode ) { m_prepared = true; return (char *)1; }
|
|
|
|
//
|
|
// do all the sets here
|
|
//
|
|
|
|
// . this gets our old doc from titledb, if we got it
|
|
// . TODO: make sure this is cached in the event of a backoff, we
|
|
// will redo this again!!! IMPORTANT!!!
|
|
char *isIndexed = getIsIndexed();
|
|
if ( ! isIndexed || isIndexed == (char *)-1) return (char *)isIndexed;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if we are injecting into the "qatest123" coll, then we need to have
|
|
// m_spideredTimeValid be true before calling getIsSpam() which calls
|
|
// getSiteNumInlinks() which adds tags to tagdb using that date, but
|
|
// only for the "qatest123" coll!
|
|
// that keeps our parser output consistent across runs!
|
|
char **content = NULL;
|
|
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
|
|
content = getContent ( );
|
|
if ( ! content || content == (void *)-1 )
|
|
return (char *)content;
|
|
}
|
|
|
|
// get our site root
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (char *)mysite;
|
|
|
|
// if we are a root page, update tagdb with the root lang id
|
|
//bool *status1 = updateRootLangId();
|
|
//if ( ! status1 || status1 == (void *)-1 ) return (char *)status1;
|
|
|
|
// if we are a root page, update tagdb with the root lang id
|
|
//bool *status2 = updateSiteTitleBuf();
|
|
//if ( ! status2 || status2 == (void *)-1 ) return (char *)status2;
|
|
|
|
// if we found some default venue addresses on page, add to tagdb
|
|
//bool *status3 = updateVenueAddresses();
|
|
//if ( ! status3 || status3 == (void *)-1 ) return (char *)status3;
|
|
|
|
// add "firstip" to tag rec if we need to
|
|
//bool *status4 = updateFirstIp();
|
|
//if ( ! status4 || status4 == (void *)-1 ) return (char *)status4;
|
|
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
|
|
|
|
int32_t *datedbDate = getPubDate();
|
|
if ( ! datedbDate || datedbDate == (int32_t *)-1 )
|
|
return (char *)datedbDate;
|
|
|
|
getHostHash32a();
|
|
getContentHash32();
|
|
|
|
//Images *images = getImages();
|
|
//if ( ! images || images == (Images *)-1 ) return (char *)images;
|
|
|
|
char **id = getThumbnailData();
|
|
if ( ! id || id == (void *)-1 ) return (char *)id;
|
|
|
|
int8_t *hopCount = getHopCount();
|
|
if ( ! hopCount || hopCount == (void *)-1 ) return (char *)hopCount;
|
|
|
|
char *spiderLinks = getSpiderLinks();
|
|
if ( ! spiderLinks || spiderLinks == (char *)-1 )
|
|
return (char *)spiderLinks;
|
|
|
|
//int32_t *nextSpiderTime = getNextSpiderTime();
|
|
//if ( ! nextSpiderTime || nextSpiderTime == (int32_t *)-1 )
|
|
// return (char *)nextSpiderTime;
|
|
|
|
//int8_t *nextSpiderPriority = getNextSpiderPriority();
|
|
//if ( ! nextSpiderPriority || nextSpiderPriority == (void *)-1 )
|
|
// return (char *)nextSpiderPriority;
|
|
|
|
int32_t *firstIndexedDate = getFirstIndexedDate();
|
|
if ( ! firstIndexedDate || firstIndexedDate == (int32_t *)-1 )
|
|
return (char *)firstIndexedDate;
|
|
|
|
int32_t *outlinksAddedDate = getOutlinksAddedDate();
|
|
if ( ! outlinksAddedDate || outlinksAddedDate == (int32_t *)-1 )
|
|
return (char *)outlinksAddedDate;
|
|
|
|
uint16_t *countryId = getCountryId();
|
|
if ( ! countryId||countryId==(uint16_t *)-1) return (char *)countryId;
|
|
|
|
char *trunc = getIsContentTruncated();
|
|
if ( ! trunc || trunc == (char *)-1 ) return (char *)trunc;
|
|
|
|
char *pl = getIsPermalink();
|
|
if ( ! pl || pl == (char *)-1 ) return (char *)pl;
|
|
|
|
//int32_t *numBannedOutlinks = getNumBannedOutlinks();
|
|
// set this
|
|
//m_numBannedOutlinks8 = score32to8 ( *numBannedOutlinks );
|
|
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (char *)dp;
|
|
|
|
// . before storing this into title Rec, make sure all tags
|
|
// are valid and tagRec is up to date
|
|
// . like we might need to update the contact info, siteNumInlinks,
|
|
// or other tags because, for instance, contact info might not
|
|
// be in there because isSpam() never required it.
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
|
|
char *ict = getIsContentTruncated();
|
|
if ( ! ict || ict == (char *)-1 ) return (char *)ict;
|
|
int64_t **wd = getWikiDocIds();
|
|
if ( ! wd || wd == (void *)-1 ) return (char *)wd;
|
|
int64_t **avp = getAdVector();
|
|
if ( ! avp || avp == (void *)-1 ) return (char *)avp;
|
|
char *at = getIsAdult();
|
|
if ( ! at || at == (void *)-1 ) return (char *)at;
|
|
char *ls = getIsLinkSpam();
|
|
if ( ! ls || ls == (void *)-1 ) return (char *)ls;
|
|
uint32_t *tph = getTagPairHash32();
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
|
|
|
|
// sets the ptr_sectionsReply, that is all we need it to do
|
|
//char **sd = getSectionsReply ( ) ;
|
|
//if ( ! sd || sd == (void *)-1 ) return (char *)sd;
|
|
// sets the ptr_addressReply, that is all we need it to do
|
|
//char **ad = getAddressReply ( ) ;
|
|
//if ( ! ad || ad == (void *)-1 ) return (char *)ad;
|
|
uint8_t *rl = getRootLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (char *)rl;
|
|
int32_t **pcids = getCatIds();
|
|
if ( ! pcids || pcids == (void *)-1) return (char *)pcids;
|
|
// get dmoz ptr_dmozTitles, ptr_dmozSumms, ptr_dmozAnchors
|
|
if ( ! setDmozInfo() ) return (char *)-1;
|
|
|
|
m_prepared = true;
|
|
return (char *)1;
|
|
}

#define MAX_DMOZ_TITLES 10

int32_t *XmlDoc::getNumDmozEntries() {
|
|
// MDW: wth is this?
|
|
//int32_t **getDmozCatIds();
|
|
int32_t nc = size_catIds / 4;
|
|
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
|
|
m_numDmozEntries = nc;
|
|
return &m_numDmozEntries;
|
|
}
|
|
// list of \0 terminated titles, etc. use getNumDmozTitles() to get #
|
|
char **XmlDoc::getDmozTitles ( ) {
|
|
// returns false if blocked
|
|
if ( ! setDmozInfo() ) return (char **)-1;
|
|
if ( g_errno ) return NULL;
|
|
return &ptr_dmozTitles;
|
|
}
|
|
char **XmlDoc::getDmozSummaries ( ) {
|
|
// returns false if blocked
|
|
if ( ! setDmozInfo() ) return (char **)-1;
|
|
if ( g_errno ) return NULL;
|
|
return &ptr_dmozSumms;
|
|
}
|
|
char **XmlDoc::getDmozAnchors ( ) {
|
|
// returns false if blocked
|
|
if ( ! setDmozInfo() ) return (char **)-1;
|
|
if ( g_errno ) return NULL;
|
|
return &ptr_dmozAnchors;
|
|
}
|
|
|
|
|
|
// returns false if blocked, true otherwise. sets g_errno on error & rets true
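// m_dmozBuf is carved into three fixed regions: titles at offset 0,
// summaries at 5000 and anchors at 10000 (12000 bytes total); each
// entry written into a region is \0 terminated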
|
|
bool XmlDoc::setDmozInfo () {
|
|
|
|
if ( m_dmozInfoValid ) return true;
|
|
|
|
g_errno = 0;
|
|
|
|
// return true and set g_errno on error
|
|
if ( ! m_dmozBuf.reserve(12000) ) {
|
|
log("xmldoc: error getting dmoz info: %s",mstrerror(g_errno));
|
|
// ensure log statement does not clear g_errno
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// start here
|
|
char *dmozBuf = m_dmozBuf.getBufStart();
|
|
|
|
char *titles = dmozBuf;
|
|
char *summs = dmozBuf+5000;
|
|
char *anchors = dmozBuf+10000;
|
|
// the end of it
|
|
char *dtend = dmozBuf + 5000;
|
|
char *dsend = dmozBuf + 10000;
|
|
char *daend = dmozBuf + 12000;
|
|
// point into those bufs
|
|
char *dt = titles;
|
|
char *ds = summs;
|
|
char *da = anchors;
|
|
// MDW: i limit this to 10 to save stack space!
|
|
int32_t nc = size_catIds / 4;
|
|
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
|
|
for (int32_t i = 0; i < nc ; i++) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// temp stuff
|
|
int32_t dtlen = 0;
|
|
int32_t dslen = 0;
|
|
unsigned char dalen = 0;
|
|
|
|
// . store all dmoz info separated by \0's into titles[] buffer
|
|
// . crap, this does a disk read and blocks on that
|
|
//
|
|
// . TODO: make it non-blocking!!!!
|
|
//
|
|
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
|
|
m_firstUrl.getUrlLen(),
|
|
ptr_catIds[i],
|
|
dt,//&titles[titlesLen],
|
|
&dtlen,//&titleLens[i],
|
|
dtend-dt,
|
|
ds,//&summs[summsLen],
|
|
&dslen,//&summLens[i],
|
|
dsend-ds,
|
|
da,//&anchors[anchorsLen],
|
|
&dalen,//&anchorLens[i],
|
|
daend-da,
|
|
m_niceness);
|
|
// advance ptrs
|
|
dt += dtlen;
|
|
ds += dslen;
|
|
da += dalen;
|
|
// null terminate
|
|
*dt++ = 0;
*ds++ = 0;
*da++ = 0;
|
|
}
|
|
|
|
// if empty, make it a \0 to keep in sync with the rest
|
|
if ( dt == titles ) *dt++ = '\0';
|
|
if ( ds == summs ) *ds++ = '\0';
|
|
if ( da == anchors ) *da++ = '\0';
|
|
|
|
// set these
|
|
ptr_dmozTitles = titles;
|
|
ptr_dmozSumms = summs;
|
|
ptr_dmozAnchors = anchors;
|
|
size_dmozTitles = dt - titles;
|
|
size_dmozSumms = ds - summs;
|
|
size_dmozAnchors = da - anchors;
|
|
|
|
m_dmozInfoValid = true;
|
|
return true;
|
|
}
|
|
|
|
// . create and store the titlerec into "buf".
|
|
// . it is basically the header part of all the member vars in this XmlDoc.
|
|
// . it has a key,dataSize,compressedData so it can be a record in an Rdb
|
|
// . return true on success, false on failure
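// . on-disk layout: key_t, dataSize, uncompressed size, then the
//   zlib-compressed data; the uncompressed data is the fixed header
//   (m_headerSize bytes) followed by a 4-byte size plus the bytes of
//   every non-empty ptr_* member, whose presence is flagged by a bit
//   in m_internalFlags1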
|
|
bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
|
|
|
|
//setStatus ( "making title rec");
|
|
|
|
// assume could not make one because we were banned or something
|
|
tbuf->purge(); // m_titleRec = NULL;
|
|
|
|
// start setting members in THIS's header before compression
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
|
|
// tag rec must have "sitenuminlinks" in it
|
|
//if (! m_newTagRec.getTag("sitenuminlinks") ) { char *xx=NULL;*xx=0; }
|
|
// we often update m_oldTagRec above by calling updateRootLangId(), etc
|
|
// so update the size our of tag rec here
|
|
//size_tagRecData = m_oldTagRec.getSize();
|
|
// and sanity check this
|
|
//if( ptr_tagRecData != (char *)&m_oldTagRec ) { char *xx=NULL;*xx=0; }
|
|
|
|
// lookup dmoz title and summary for this site
|
|
//int32_t titleLens [10];
|
|
//int32_t summLens [10];
|
|
//unsigned char anchorLens [10];
|
|
//int32_t titlesLen = 0;
|
|
//int32_t summsLen = 0;
|
|
//int32_t anchorsLen = 0;
|
|
//char titles [10*1024];
|
|
//char summs [10*4096];
|
|
//char anchors [10* 256];
|
|
|
|
/*
|
|
|
|
MDW oct 12 2013 -
|
|
why is this here? we should store this info at spider time?
|
|
|
|
char *titles = m_dmozBuf;
|
|
char *summs = m_dmozBuf+5000;
|
|
char *anchors = m_dmozBuf+10000;
|
|
// the end of it
|
|
char *dtend = m_dmozBuf + 5000;
|
|
char *dsend = m_dmozBuf + 10000;
|
|
char *daend = m_dmozBuf + 12000;
|
|
// point into those bufs
|
|
char *dt = titles;
|
|
char *ds = summs;
|
|
char *da = anchors;
|
|
// MDW: i limit this to 10 to save stack space!
|
|
int32_t nc = size_catIds / 4;
|
|
if ( nc > 10 ) nc = 10;
|
|
for (int32_t i = 0; i < nc ; i++) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// temp stuff
|
|
int32_t dtlen = 0;
|
|
int32_t dslen = 0;
|
|
unsigned char dalen = 0;
|
|
|
|
// . store all dmoz info separated by \0's into titles[] buffer
|
|
// . crap, this does a disk read and blocks on that
|
|
//
|
|
// . TODO: make it non-blocking!!!!
|
|
//
|
|
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
|
|
m_firstUrl.getUrlLen(),
|
|
ptr_catIds[i],
|
|
dt,//&titles[titlesLen],
|
|
&dtlen,//&titleLens[i],
|
|
dtend-dt,
|
|
ds,//&summs[summsLen],
|
|
&dslen,//&summLens[i],
|
|
dsend-ds,
|
|
da,//&anchors[anchorsLen],
|
|
&dalen,//&anchorLens[i],
|
|
daend-da,
|
|
m_niceness);
|
|
// advance ptrs
|
|
dt += dtlen;
|
|
ds += dslen;
|
|
da += dalen;
|
|
// null terminate
|
|
if ( dtlen>0 && dt[dtlen-1]!='\0' ) { *dt++=0; dtlen++; }
|
|
if ( dslen>0 && ds[dslen-1]!='\0' ) { *ds++=0; dslen++; }
|
|
if ( dalen>0 && da[dalen-1]!='\0' ) { *da++=0; dalen++; }
|
|
// must always be something!
|
|
if ( dtlen==0 ) {*dt++=0; dtlen++;}
|
|
if ( dslen==0 ) {*ds++=0; dslen++;}
|
|
if ( dalen==0 ) {*da++=0; dalen++;}
|
|
}
|
|
|
|
// set these
|
|
ptr_dmozTitles = titles;
|
|
ptr_dmozSumms = summs;
|
|
ptr_dmozAnchors = anchors;
|
|
size_dmozTitles = dt - titles;
|
|
size_dmozSumms = ds - summs;
|
|
size_dmozAnchors = da - anchors;
|
|
*/
|
|
|
|
// set our crap that is not necessarily set
|
|
//ptr_firstUrl = m_firstUrl.getUrl();
|
|
//ptr_redirUrl = m_redirUrl.getUrl();
|
|
//ptr_tagRecData = (char *)&m_oldTagRec;
|
|
|
|
// this must be valid now
|
|
//if ( ! m_skipIndexingValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set this
|
|
m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
|
|
|
|
// add in variable length data
|
|
int32_t *ps = (int32_t *)&size_firstUrl;
|
|
// data ptr, consider a NULL to mean empty too!
|
|
char **pd = (char **)&ptr_firstUrl;
|
|
// how many XmlDoc::ptr_* members do we have? set "np" to that
|
|
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
|
|
np /= sizeof(char *);
|
|
// count up total we need to alloc
|
|
int32_t need1 = m_headerSize;
|
|
// clear these
|
|
m_internalFlags1 = 0;
|
|
// loop over em
|
|
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
|
|
// skip if empty
|
|
if ( *ps <= 0 ) continue;
|
|
// or empty string ptr
|
|
if ( ! *pd ) continue;
|
|
// skip utf8content if we should -- no events or addresses
|
|
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
|
|
// 4 bytes for the size
|
|
need1 += 4;
|
|
// add it up
|
|
need1 += *ps;
|
|
// make the mask
|
|
uint32_t mask = 1 << i ;
|
|
// add it in
|
|
m_internalFlags1 |= mask;
|
|
}
|
|
// alloc the buffer
|
|
char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
|
|
// return NULL with g_errno set on error
|
|
if ( ! ubuf ) return false;
|
|
// serialize into it
|
|
char *p = ubuf;
|
|
// copy our crap into there
|
|
gbmemcpy ( p , &m_headerSize , m_headerSize );
|
|
// skip it
|
|
p += m_headerSize;
|
|
// reset data ptrs
|
|
pd = (char **)&ptr_firstUrl;
|
|
// reset data sizes
|
|
ps = (int32_t *)&size_firstUrl;
|
|
|
|
// then variable length data
|
|
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
|
|
// skip if empty, do not serialize
|
|
if ( ! *ps ) continue;
|
|
// or empty string ptr
|
|
if ( ! *pd ) continue;
|
|
// skip utf8content if we should -- no events or addresses
|
|
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
|
|
// store size first
|
|
*(int32_t *)p = *ps;
|
|
p += 4;
|
|
// then the data
|
|
gbmemcpy ( p , *pd , *ps );
|
|
// skip *ps bytes we wrote. should include a \0
|
|
p += *ps;
|
|
}
|
|
// sanity check
|
|
if ( p != ubuf + need1 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// now restore it for other functions to use
|
|
//size_content = saved;
|
|
|
|
// . now compress our "title rec" data into a titleRec
|
|
// . cbuf should not be set
|
|
//if ( cbuf ) {
|
|
// log(LOG_LOGIC,"db: titlerec: compress: cbuf is set.");
|
|
// char *p = NULL; *p = 0; exit(-1);
|
|
//}
|
|
// should we free cbuf on our reset/destruction?
|
|
//m_owncbuf = ownCompressedData;
|
|
// . make a buf big enough to hold compressed, we'll realloc afterwards
|
|
// . according to zlib.h line 613 compress buffer must be .1% larger
|
|
// than source plus 12 bytes. (i add one for round off error)
|
|
// . now i added another extra 12 bytes cuz compress seemed to want it
|
|
int32_t need2 = ((int64_t)need1 * 1001LL) / 1000LL + 13 + 12;
|
|
// we also need to store a key then regular dataSize then
|
|
// the uncompressed size in cbuf before the compression of m_ubuf
|
|
int32_t hdrSize = sizeof(key_t) + 4 + 4;
|
|
// . now i add 12 bytes more so Msg14.cpp can also squeeze in a
|
|
// negative key to delete the old titleRec, cuz we use this cbuf
|
|
// to set our list that we add to our twins with
|
|
// . we now store the negative rec before the positive rec in Msg14.cpp
|
|
//hdrSize += sizeof(key_t) + 4;
|
|
need2 += hdrSize;
|
|
// alloc what we need
|
|
//char *cbuf = (char *) mmalloc ( need2 ,"TitleRecc");
|
|
//if ( ! cbuf ) return false;
|
|
// return false on error
|
|
if ( ! tbuf->reserve ( need2 ,"titbuf" ) ) return false;
|
|
// shortcut
|
|
char *cbuf = tbuf->getBufStart();
|
|
// set cbuf sizes, we set cbufSize below to fit exactly used buf
|
|
//int32_t cbufMaxSize = need2;
|
|
// . how big is the buf we're passing to ::compress()?
|
|
// . don't include the last 12 byte, save for del key in Msg14.cpp
|
|
int32_t size = need2 - hdrSize ;
|
|
// . compress the data in ubuf into cbuf + hdrSize
// . this will reset "size" to the compressed length
|
|
// . "size" is set to how many bytes we wrote into "cbuf + hdrSize"
|
|
int err = gbcompress ( (unsigned char *)cbuf + hdrSize,
|
|
(uint32_t *)&size,
|
|
(unsigned char *)ubuf ,
|
|
(uint32_t )need1 );
|
|
// note it
|
|
//log("test: compressed %s from %" INT32 " to %" INT32 " bytes",
|
|
// m_firstUrl.m_url,need2-hdrSize,size);
|
|
// free the buf we were trying to compress now
|
|
mfree ( ubuf , need1 , "trub" );
|
|
// we should check ourselves
|
|
if ( err == Z_OK && size > (need2 - hdrSize ) ) {
|
|
//mfree ( cbuf , need2 ,"TitleRecc" );
|
|
tbuf->purge();
|
|
g_errno = ECOMPRESSFAILED;
|
|
log("db: Failed to compress document of %" INT32 " bytes. "
|
|
"Provided buffer of %" INT32 " bytes.",
|
|
size, (need2 - hdrSize ) );
|
|
return false;
|
|
}
|
|
// check for error
|
|
if ( err != Z_OK ) {
|
|
//mfree ( cbuf , need2 ,"TitleRecc" );
|
|
tbuf->purge();
|
|
g_errno = ECOMPRESSFAILED;
|
|
log("db: Failed to compress document.");
|
|
return false;
|
|
}
|
|
// calc cbufSize, the uncompressed header + compressed stuff
|
|
//cbufSize = hdrSize + size ;
|
|
|
|
//int64_t uh48 = getFirstUrlHash48();
|
|
// . make the key from docId
|
|
// . false = delkey?
|
|
//m_titleRecKey = g_titledb.makeKey (*getDocId(),uh48,false);//delkey?
|
|
key_t tkey = g_titledb.makeKey (docId,uh48,false);//delkey?
|
|
// validate it
|
|
//m_titleRecKeyValid = true;
|
|
|
|
// get a ptr to the Rdb record at start of the header
|
|
p = cbuf;
|
|
// skip over the negative rec reserved space for Msg14.cpp
|
|
//p += 12 + 4;
|
|
// . store key in header of cbuf
|
|
// . store in our host byte ordering so we can be a rec in an RdbList
|
|
*(key_t *) p = tkey;
|
|
p += sizeof(key_t);
|
|
// store total dataSize in header (excluding itself and key only)
|
|
int32_t dataSize = size + 4;
|
|
*(int32_t *) p = dataSize ;
|
|
p += 4;
|
|
// store uncompressed size in header
|
|
*(int32_t *) p = need1 ; p += 4;
|
|
// sanity check
|
|
if ( p != cbuf + hdrSize ) { char *xx = NULL; *xx = 0; }
|
|
// sanity check
|
|
if ( need1 <= 0 ) { char *xx = NULL; *xx = 0; }
|
|
// advance over data
|
|
p += size;
|
|
|
|
// update safebuf::m_length so it is correct
|
|
tbuf->setLength ( p - cbuf );
|
|
|
|
return true;
|
|
}

// . return NULL and sets g_errno on error
// . returns -1 if blocked
SafeBuf *XmlDoc::getTitleRecBuf ( ) {
|
|
|
|
// return it now if we got it already
|
|
if ( m_titleRecBufValid ) return &m_titleRecBuf;
|
|
|
|
setStatus ( "making title rec");
|
|
|
|
// did one of our many blocking function calls have an error?
|
|
if ( g_errno ) return NULL;
|
|
|
|
// . HACK so that TitleRec::isEmpty() returns true
|
|
// . faster than calling m_titleRec.reset()
|
|
//m_titleRec.m_url.m_ulen = 0;
|
|
|
|
int32_t *indexCode = getIndexCode();
|
|
// not allowed to block here
|
|
if ( indexCode == (void *)-1) { char *xx=NULL;*xx=0; }
|
|
// return on errors with g_errno set
|
|
if ( ! indexCode ) return NULL;
|
|
// force delete? EDOCFORCEDELETE
|
|
if ( *indexCode ) { m_titleRecBufValid = true; return &m_titleRecBuf; }
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block and return -1, we will be re-called from the top
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getTitleRecBufWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
/*
|
|
// parsing knobs
|
|
if ( ! m_titleWeightValid ) {
|
|
// TODO: watchout for overruns!! these are 16-bits only!
|
|
//m_eliminateMenus = cr->m_eliminateMenus;
|
|
m_titleWeight = cr->m_titleWeight;
|
|
m_headerWeight = cr->m_headerWeight;
|
|
m_urlPathWeight = cr->m_urlPathWeight;
|
|
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
|
|
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
|
|
m_conceptWeight = cr->m_conceptWeight;
|
|
//int32_t siteNumInlinksBoost = cr->m_siteNumInlinksBoost;
|
|
// validate these
|
|
//m_eliminateMenusValid = true;
|
|
m_titleWeightValid = true;
|
|
m_headerWeightValid = true;
|
|
m_urlPathWeightValid = true;
|
|
m_externalLinkTextWeightValid = true;
|
|
m_internalLinkTextWeightValid = true;
|
|
m_conceptWeightValid = true;
|
|
}
|
|
*/
|
|
|
|
/////////
|
|
//
|
|
// IF ANY of these validation sanity checks fail then update
|
|
// prepareToMakeTitleRec() so it makes them valid!!!
|
|
//
|
|
/////////
|
|
|
|
// verify key parts
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// verify record parts
|
|
//if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_firstIndexedDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_outlinksAddedDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_charsetValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_countryIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_httpStatusValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
if ( ! m_titleWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_headerWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_urlPathWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_externalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_internalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_conceptWeightValid ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
|
|
// if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
|
|
// if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_metaListCheckSum8Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_numBannedOutlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_eliminateMenusValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_spiderLinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isContentTruncatedValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isLinkSpamValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// buffers
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_metaRedirUrlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_adVectorValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_imageDataValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_catIdsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_indCatIdsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_dmozInfoValid ) { char *xx=NULL;*xx=0; }
|
|
// if m_recycleContent is true, these are not valid
|
|
if ( ! m_recycleContent ) {
|
|
if ( ! m_rawUtf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_expandedUtf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_datesValid ) { char *xx=NULL;*xx=0; }
|
|
// why do we need valid sections for a titlerec? we no longer use
|
|
// ptr_sectiondbData...
|
|
//if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sectionsReplyValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_addressReplyValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sectiondbDataValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_placedbDataValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_clockCandidatesDataValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do we need these?
|
|
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_contentHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_tagHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_tagPairHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
// sanity checks
|
|
if ( ! m_addressesValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
setStatus ( "compressing into final title rec");
|
|
|
|
int64_t uh48 = getFirstUrlHash48();
|
|
|
|
int64_t *docId = getDocId();
|
|
|
|
// time it
|
|
int64_t startTime = gettimeofdayInMilliseconds();
|
|
|
|
//////
|
|
//
|
|
// fill in m_titleRecBuf
|
|
//
|
|
//////
|
|
|
|
// we need docid and uh48 for making the key of the titleRec
|
|
if ( ! setTitleRecBuf ( &m_titleRecBuf , *docId , uh48 ) )
|
|
return NULL;
|
|
|
|
// set this member down here because we can't set it in "xd"
|
|
// because it is too short of an xmldoc stub
|
|
m_versionValid = true;
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// . add the stat
|
|
// . use white for the stat
|
|
g_stats.addStat_r ( 0 ,
|
|
startTime ,
|
|
gettimeofdayInMilliseconds(),
|
|
0x00ffffff );
|
|
|
|
QUICKPOLL( m_niceness );
|
|
|
|
char *cbuf = m_titleRecBuf.getBufStart();
|
|
m_titleRecKey = *(key_t *)cbuf;
|
|
m_titleRecKeyValid = true;
|
|
|
|
// we are legit
|
|
//m_freeTitleRec = true;
|
|
//m_titleRec = cbuf;
|
|
// key + dataSize + ubufSize + compressedData
|
|
//m_titleRecSize = sizeof(key_t)+ 4 + 4 + size;
|
|
//m_titleRecAllocSize = need2;
|
|
|
|
// now valid. congratulations!
|
|
m_titleRecBufValid = true;
|
|
return &m_titleRecBuf;
|
|
}
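// . the function above is the template for nearly every getX() accessor
//   in this class: return NULL with g_errno set on error, return
//   (T *)-1 if we blocked and m_masterLoop will re-enter us from the
//   top, otherwise cache the result behind an m_xValid flag and return
//   a real pointer.
// . minimal standalone sketch of that convention (kept in a comment so
//   it is not compiled; "Thing" and launchThingRequest() are
//   hypothetical names, not members of XmlDoc):
/*
Thing *Example::getThing ( ) {
	// computed already? return the cached value
	if ( m_thingValid ) return &m_thing;
	// kick off any work that may need a callback; false means we
	// blocked and m_masterLoop will call us again later
	if ( ! launchThingRequest ( this , m_masterLoop ) )
		return (Thing *)-1;
	// completed synchronously; bail with g_errno set on error
	if ( g_errno ) return NULL;
	// validate so re-entry is a cheap cache hit
	m_thingValid = true;
	return &m_thing;
}
*/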
|
|
|
|
|
|
// . an "id" of 2 means very indicative of a dirty doc
|
|
// . an "id" of 1 means it must be joined with another dirty word to indicate
|
|
// . taken mostly from Url.cpp
|
|
// . see matches2.h for Needle class definition
|
|
static Needle s_dirtyWords [] = {
|
|
{"upskirt" ,0,2,0,0,NULL,0,NULL},
|
|
{"downblouse" ,0,2,0,0,NULL,0,NULL},
|
|
{"shemale" ,0,1,0,0,NULL,0,NULL},
|
|
{"spank" ,0,1,0,0,NULL,0,NULL},
|
|
{"dildo" ,0,2,0,0,NULL,0,NULL},
|
|
{"bdsm" ,0,2,0,0,NULL,0,NULL},
|
|
{"voyeur" ,0,2,0,0,NULL,0,NULL},
|
|
{"fisting" ,0,2,0,0,NULL,0,NULL},
|
|
{"vibrator" ,0,2,0,0,NULL,0,NULL},
|
|
{"ejaculat" ,0,2,0,0,NULL,0,NULL},
|
|
{"rgasm" ,0,2,0,0,NULL,0,NULL},
|
|
{"orgy" ,0,2,0,0,NULL,0,NULL},
|
|
{"orgies" ,0,2,0,0,NULL,0,NULL},
|
|
{"stripper" ,0,1,0,0,NULL,0,NULL},
|
|
{"softcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"whore" ,0,2,0,0,NULL,0,NULL},
|
|
// gary slutkin on ted.com. make this just 1 point.
|
|
{"slut" ,0,1,0,0,NULL,0,NULL},
|
|
{"smut" ,0,2,0,0,NULL,0,NULL},
|
|
{"tits" ,0,2,0,0,NULL,0,NULL},
|
|
{"lesbian" ,0,2,0,0,NULL,0,NULL},
|
|
{"swinger" ,0,2,0,0,NULL,0,NULL},
|
|
{"fetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"nude" ,0,1,0,0,NULL,0,NULL},
|
|
{"centerfold" ,0,2,0,0,NULL,0,NULL},
|
|
{"incest" ,0,2,0,0,NULL,0,NULL},
|
|
{"pedophil" ,0,2,0,0,NULL,0,NULL},
|
|
{"pedofil" ,0,2,0,0,NULL,0,NULL},
|
|
{"horny" ,0,2,0,0,NULL,0,NULL}, // horny toad
|
|
{"pussy" ,0,2,0,0,NULL,0,NULL}, // pussy willow pussy cat
|
|
{"pussies" ,0,2,0,0,NULL,0,NULL},
|
|
{"penis" ,0,2,0,0,NULL,0,NULL},
|
|
{"vagina" ,0,2,0,0,NULL,0,NULL},
|
|
{"phuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"blowjob" ,0,2,0,0,NULL,0,NULL},
|
|
{"blow job" ,0,2,0,0,NULL,0,NULL},
|
|
{"gangbang" ,0,2,0,0,NULL,0,NULL},
|
|
{"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
|
|
{"porn" ,0,2,0,0,NULL,0,NULL},
|
|
{"felch" ,0,2,0,0,NULL,0,NULL},
|
|
{"cunt" ,0,2,0,0,NULL,0,NULL},
|
|
{"bestial" ,0,2,0,0,NULL,0,NULL},
|
|
{"beastial" ,0,2,0,0,NULL,0,NULL},
|
|
{"kink" ,0,2,0,0,NULL,0,NULL},
|
|
// . "sex" is often substring in tagids.
|
|
// . too many false positives, make "1" not "2"
|
|
{"sex" ,0,1,0,0,NULL,0,NULL},
|
|
{"anal" ,0,2,0,0,NULL,0,NULL},
|
|
{"cum" ,0,2,0,0,NULL,0,NULL}, // often used for cumulative
|
|
{"clit" ,0,2,0,0,NULL,0,NULL},
|
|
{"fuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"boob" ,0,1,0,0,NULL,0,NULL},
|
|
{"wank" ,0,2,0,0,NULL,0,NULL},
|
|
{"fick" ,0,2,0,0,NULL,0,NULL},
|
|
{"eroti" ,0,2,0,0,NULL,0,NULL},
|
|
{"gay" ,0,1,0,0,NULL,0,NULL}, // make 1 pt. 'marvin gay'
|
|
// new stuff not in Url.cpp
|
|
{"thong" ,0,1,0,0,NULL,0,NULL},
|
|
{"masturbat" ,0,2,0,0,NULL,0,NULL},
|
|
{"bitch" ,0,1,0,0,NULL,0,NULL},
|
|
{"hell" ,0,1,0,0,NULL,0,NULL},
|
|
{"damn" ,0,1,0,0,NULL,0,NULL},
|
|
{"rimjob" ,0,2,0,0,NULL,0,NULL},
|
|
{"cunnilingu" ,0,2,0,0,NULL,0,NULL},
|
|
{"felatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"fellatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"dick" ,0,1,0,0,NULL,0,NULL},
|
|
{"cock" ,0,1,0,0,NULL,0,NULL},
|
|
{"rape" ,0,2,0,0,NULL,0,NULL},
|
|
{"raping" ,0,2,0,0,NULL,0,NULL},
|
|
{"bukake" ,0,2,0,0,NULL,0,NULL},
|
|
{"shit" ,0,2,0,0,NULL,0,NULL},
|
|
{"naked" ,0,1,0,0,NULL,0,NULL},
|
|
{"nympho" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcore" ,0,1,0,0,NULL,0,NULL}, // hardcore gamer, count as 1
|
|
{"sodom" ,0,2,0,0,NULL,0,NULL},
|
|
{"titties" ,0,2,0,0,NULL,0,NULL}, // re-do
|
|
{"twat" ,0,2,0,0,NULL,0,NULL},
|
|
{"bastard" ,0,1,0,0,NULL,0,NULL},
|
|
{"erotik" ,0,2,0,0,NULL,0,NULL},
|
|
|
|
// EXCEPTIONS
|
|
|
|
// smut
|
|
{"transmut" ,0,-2,0,0,NULL,0,NULL},
|
|
{"bismuth" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
// sex
|
|
{"middlesex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sussex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"essex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"deusex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexchange" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpress" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpert" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexcel" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexist" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexile" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexperi" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexual" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpose" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexclu" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexo" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexism" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpan" ,0,-1,0,0,NULL,0,NULL}, // buttonsexpanion
|
|
{"same-sex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"opposite sex",0,-1,0,0,NULL,0,NULL},
|
|
|
|
// anal
|
|
{"analog" ,0,-2,0,0,NULL,0,NULL},
|
|
{"analy" ,0,-2,0,0,NULL,0,NULL},
|
|
{"canal" ,0,-2,0,0,NULL,0,NULL},
|
|
{"kanal" ,0,-2,0,0,NULL,0,NULL},
|
|
{"banal" ,0,-2,0,0,NULL,0,NULL},
|
|
{"ianalbert" ,0,-2,0,0,NULL,0,NULL}, // ian albert
|
|
|
|
// cum
|
|
{"circum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cum laude" ,0,-2,0,0,NULL,0,NULL},
|
|
{"succum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cumber" ,0,-2,0,0,NULL,0,NULL},
|
|
{"docum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cumul" ,0,-2,0,0,NULL,0,NULL},
|
|
{"acumen" ,0,-2,0,0,NULL,0,NULL},
|
|
{"incum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"capsicum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"modicum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"locum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"accum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cumbre" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
{"swank" ,0,-2,0,0,NULL,0,NULL},
|
|
{"fickle" ,0,-2,0,0,NULL,0,NULL},
|
|
{"traffick" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scleroti" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gaylor" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gaynor" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gayner" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gayton" ,0,-2,0,0,NULL,0,NULL},
|
|
{"dipthong" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// hell
|
|
{"hellen" ,0,-1,0,0,NULL,0,NULL},
|
|
{"hellman" ,0,-1,0,0,NULL,0,NULL},
|
|
{"shell" ,0,-1,0,0,NULL,0,NULL},
|
|
{"mitchell" ,0,-1,0,0,NULL,0,NULL},
|
|
{"chelle" ,0,-1,0,0,NULL,0,NULL}, // me/michelle
|
|
{"hello" ,0,-1,0,0,NULL,0,NULL},
|
|
{"moschella" ,0,-1,0,0,NULL,0,NULL},
|
|
{"othello" ,0,-1,0,0,NULL,0,NULL},
|
|
{"schelling" ,0,-1,0,0,NULL,0,NULL},
|
|
{"seychelles" ,0,-1,0,0,NULL,0,NULL},
|
|
{"wheller" ,0,-1,0,0,NULL,0,NULL},
|
|
{"winchell" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// dick
|
|
{"dicker" ,0,-1,0,0,NULL,0,NULL},
|
|
{"dickins" ,0,-1,0,0,NULL,0,NULL},
|
|
{"dickies" ,0,-1,0,0,NULL,0,NULL},
|
|
{"dickran" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// cock
|
|
{"babcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocked" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocking" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cockpit" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cockroach" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocktail" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocky" ,0,-1,0,0,NULL,0,NULL},
|
|
{"hancock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"hitchcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"peacock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"shuttlecock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"stopcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"weathercock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"woodcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cockburn" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// kink
|
|
{"kinko" ,0,-2,0,0,NULL,0,NULL},
|
|
{"ukink" ,0,-2,0,0,NULL,0,NULL}, // ink shop in uk
|
|
|
|
// naked
|
|
{"snaked" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// rape
|
|
{"drape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"grape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scrape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"therape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"trapez" ,0,-2,0,0,NULL,0,NULL},
|
|
{"parapet" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scraping" ,0,-2,0,0,NULL,0,NULL},
|
|
{"draping" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
// twat
|
|
{"twatch" ,0,-2,0,0,NULL,0,NULL}, // courtwatch -- cspan.org
|
|
|
|
// clit
|
|
{"heraclitus" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
// boob
|
|
{"booboo" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// shit
|
|
{"shitak" ,0,-2,0,0,NULL,0,NULL}
|
|
};
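// . how the "id" weights above combine: a really dirty term scores +2,
//   a borderline term +1, and each EXCEPTION entry carries -1 or -2 so
//   a clean superstring cancels the dirty substring it contains. e.g.
//   "summa cum laude" matches {"cum",+2} and {"cum laude",-2} for a net
//   of 0, while two strong hits reach the threshold of 2 that
//   getIsAdult() uses below.
// . minimal standalone sketch of that cancellation, with strstr() as a
//   stand-in for the real matcher in matches2.cpp (not compiled):
/*
#include <string.h>
struct MiniNeedle { const char *str; int id; };
static int miniDirtyPoints ( const char *haystack ,
			     const MiniNeedle *needles , int n ) {
	int points = 0;
	for ( int i = 0 ; i < n ; i++ )
		// add this needle's id once per occurrence
		for ( const char *p = haystack ;
		      ( p = strstr ( p , needles[i].str ) ) ; p++ )
			points += needles[i].id;
	return points;
}
*/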
|
|
|
|
////
|
|
//// New stuff from sex.com adult word list
|
|
////
|
|
////
|
|
//// make it a 2nd part because of performance limits on matches2.cpp algo
|
|
////
|
|
static Needle s_dirtyWordsPart2 [] = {
|
|
{"amateurfoto" ,0,2,0,0,NULL,0,NULL},
|
|
{"amateurhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"amateurindex" ,0,2,0,0,NULL,0,NULL},
|
|
{"amateurnaked" ,0,2,0,0,NULL,0,NULL},
|
|
{"amatuerhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"ampland" ,0,2,0,0,NULL,0,NULL},
|
|
//{"animehentai" ,0,2,0,0,NULL,0,NULL}, dup
|
|
{"anitablonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"asiacarrera" ,0,2,0,0,NULL,0,NULL},
|
|
{"asshole" ,0,2,0,0,NULL,0,NULL},
|
|
{"asslick" ,0,2,0,0,NULL,0,NULL},
|
|
{"asspic" ,0,2,0,0,NULL,0,NULL},
|
|
{"assworship" ,0,2,0,0,NULL,0,NULL},
|
|
//{"badgirl" ,0,2,0,0,NULL,0,NULL}, not necessarily bad
|
|
{"bareceleb" ,0,2,0,0,NULL,0,NULL},
|
|
{"barenaked" ,0,2,0,0,NULL,0,NULL},
|
|
{"beaverboy" ,0,2,0,0,NULL,0,NULL},
|
|
{"beavershot" ,0,2,0,0,NULL,0,NULL}, // was beavershots
|
|
//{"bigball" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
|
|
{"bigbreast" ,0,2,0,0,NULL,0,NULL},
|
|
//{"bigbutt" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
|
|
{"bigcock" ,0,2,0,0,NULL,0,NULL},
|
|
{"bigdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"biggestdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"biggesttit" ,0,2,0,0,NULL,0,NULL},
|
|
{"bighairyball" ,0,2,0,0,NULL,0,NULL},
|
|
{"bighooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"bignipple" ,0,2,0,0,NULL,0,NULL},
|
|
{"bigtit" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackbooty" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackbutt" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackcock" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackonblonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"blacksonblonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"blacktit" ,0,2,0,0,NULL,0,NULL},
|
|
{"blacktwat" ,0,2,0,0,NULL,0,NULL},
|
|
{"boner" ,0,1,0,0,NULL,0,NULL}, // softcore, someone's lastname?
|
|
{"bordello" ,0,2,0,0,NULL,0,NULL},
|
|
{"braless" ,0,2,0,0,NULL,0,NULL},
|
|
{"brothel" ,0,2,0,0,NULL,0,NULL},
|
|
{"bukake" ,0,2,0,0,NULL,0,NULL},
|
|
{"bukkake" ,0,2,0,0,NULL,0,NULL},
|
|
{"bustyblonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"bustyceleb" ,0,2,0,0,NULL,0,NULL},
|
|
{"butthole" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttman" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttplug" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttthumbnails" ,0,2,0,0,NULL,0,NULL},
|
|
{"callgirl" ,0,2,0,0,NULL,0,NULL},
|
|
{"celebritiesnaked" ,0,2,0,0,NULL,0,NULL},
|
|
{"celebritybush" ,0,2,0,0,NULL,0,NULL},
|
|
{"celebritybutt" ,0,2,0,0,NULL,0,NULL},
|
|
{"chaseylain" ,0,2,0,0,NULL,0,NULL},
|
|
{"chickswithdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"christycanyon" ,0,2,0,0,NULL,0,NULL},
|
|
{"cicciolina" ,0,2,0,0,NULL,0,NULL},
|
|
//{"cunilingus" ,0,2,0,0,NULL,0,NULL},
|
|
{"cunniling" ,0,2,0,0,NULL,0,NULL}, // abbreviate
|
|
{"cyberlust" ,0,2,0,0,NULL,0,NULL},
|
|
{"danniashe" ,0,2,0,0,NULL,0,NULL},
|
|
{"dicksuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"dirtymind" ,0,2,0,0,NULL,0,NULL},
|
|
{"dirtypicture" ,0,2,0,0,NULL,0,NULL},
|
|
{"doggiestyle" ,0,2,0,0,NULL,0,NULL},
|
|
{"doggystyle" ,0,2,0,0,NULL,0,NULL},
|
|
{"domatrix" ,0,2,0,0,NULL,0,NULL},
|
|
{"dominatrix" ,0,2,0,0,NULL,0,NULL},
|
|
//{"dyke" ,0,2,0,0,NULL,0,NULL}, // dick van dyke!
|
|
{"ejaculation" ,0,2,0,0,NULL,0,NULL},
|
|
{"erosvillage" ,0,2,0,0,NULL,0,NULL},
|
|
{"facesit" ,0,2,0,0,NULL,0,NULL},
|
|
{"fatass" ,0,2,0,0,NULL,0,NULL},
|
|
{"feetfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"felatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"fellatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"femdom" ,0,2,0,0,NULL,0,NULL},
|
|
{"fetishwear" ,0,2,0,0,NULL,0,NULL},
|
|
{"fettegirl" ,0,2,0,0,NULL,0,NULL},
|
|
{"fingerbang" ,0,2,0,0,NULL,0,NULL},
|
|
{"fingering" ,0,1,0,0,NULL,0,NULL}, // fingering the keyboard? use 1
|
|
{"flesh4free" ,0,2,0,0,NULL,0,NULL},
|
|
{"footfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"footjob" ,0,2,0,0,NULL,0,NULL},
|
|
{"footlicking" ,0,2,0,0,NULL,0,NULL},
|
|
{"footworship" ,0,2,0,0,NULL,0,NULL},
|
|
{"fornication" ,0,2,0,0,NULL,0,NULL},
|
|
{"freeass" ,0,2,0,0,NULL,0,NULL},
|
|
{"freebigtit" ,0,2,0,0,NULL,0,NULL},
|
|
{"freedick" ,0,2,0,0,NULL,0,NULL},
|
|
{"freehardcore" ,0,2,0,0,NULL,0,NULL},
|
|
//{"freehentai" ,0,2,0,0,NULL,0,NULL}, dup
|
|
{"freehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"freelargehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"freenakedpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"freenakedwomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"freetit" ,0,2,0,0,NULL,0,NULL},
|
|
{"freevoyeur" ,0,2,0,0,NULL,0,NULL},
|
|
{"gratishardcoregalerie" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorecelebs" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorefree" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorejunkie" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorejunky" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcoremovie" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorepic" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorepix" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcoresample" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorestories" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorethumb" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorevideo" ,0,2,0,0,NULL,0,NULL},
|
|
{"harddick" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardnipple" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardon" ,0,2,0,0,NULL,0,NULL},
|
|
{"hentai" ,0,2,0,0,NULL,0,NULL},
|
|
{"interacialhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"intercourseposition" ,0,2,0,0,NULL,0,NULL},
|
|
{"interracialhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"ittybittytitty" ,0,2,0,0,NULL,0,NULL},
|
|
{"jackoff" ,0,2,0,0,NULL,0,NULL},
|
|
{"jennajameson" ,0,2,0,0,NULL,0,NULL},
|
|
{"jennicam" ,0,2,0,0,NULL,0,NULL},
|
|
{"jerkoff" ,0,2,0,0,NULL,0,NULL},
|
|
{"jism" ,0,2,0,0,NULL,0,NULL},
|
|
{"jiz" ,0,2,0,0,NULL,0,NULL},
|
|
{"justhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"karasamateurs" ,0,2,0,0,NULL,0,NULL},
|
|
{"kascha" ,0,2,0,0,NULL,0,NULL},
|
|
{"kaylakleevage" ,0,2,0,0,NULL,0,NULL},
|
|
{"kobetai" ,0,2,0,0,NULL,0,NULL},
|
|
{"lapdance" ,0,2,0,0,NULL,0,NULL},
|
|
{"largedick" ,0,2,0,0,NULL,0,NULL},
|
|
{"largehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"largestbreast" ,0,2,0,0,NULL,0,NULL},
|
|
{"largetit" ,0,2,0,0,NULL,0,NULL},
|
|
{"lesben" ,0,2,0,0,NULL,0,NULL},
|
|
{"lesbo" ,0,2,0,0,NULL,0,NULL},
|
|
{"lickadick" ,0,2,0,0,NULL,0,NULL},
|
|
{"lindalovelace" ,0,2,0,0,NULL,0,NULL},
|
|
{"longdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"lovedoll" ,0,2,0,0,NULL,0,NULL},
|
|
{"makinglove" ,0,2,0,0,NULL,0,NULL},
|
|
{"mangax" ,0,2,0,0,NULL,0,NULL},
|
|
{"manpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"marilynchambers" ,0,2,0,0,NULL,0,NULL},
|
|
{"massivecock" ,0,2,0,0,NULL,0,NULL},
|
|
{"masterbating" ,0,2,0,0,NULL,0,NULL},
|
|
{"mensdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"milf" ,0,2,0,0,NULL,0,NULL},
|
|
{"minka" ,0,2,0,0,NULL,0,NULL},
|
|
{"monstercock" ,0,2,0,0,NULL,0,NULL},
|
|
{"monsterdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"muffdiving" ,0,2,0,0,NULL,0,NULL},
|
|
{"nacktfoto" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedblackwomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedceleb" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedcelebrity" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedcheerleader" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedchick" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedgirl" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedguy" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedladies" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedlady" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedman" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedmen" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedness" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedphoto" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedstar" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedwife" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedwoman" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedwomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"nastychat" ,0,2,0,0,NULL,0,NULL},
|
|
{"nastythumb" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtylink" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtylinx" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtylynx" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtynurse" ,0,2,0,0,NULL,0,NULL},
|
|
{"niceass" ,0,2,0,0,NULL,0,NULL},
|
|
{"nikkinova" ,0,2,0,0,NULL,0,NULL},
|
|
{"nikkityler" ,0,2,0,0,NULL,0,NULL},
|
|
{"nylonfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"nympho" ,0,2,0,0,NULL,0,NULL},
|
|
{"openleg" ,0,2,0,0,NULL,0,NULL},
|
|
{"oral4free" ,0,2,0,0,NULL,0,NULL},
|
|
{"pantyhosefetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"peepcam" ,0,2,0,0,NULL,0,NULL},
|
|
{"persiankitty" ,0,2,0,0,NULL,0,NULL},
|
|
{"perverted" ,0,2,0,0,NULL,0,NULL},
|
|
{"pimpserver" ,0,2,0,0,NULL,0,NULL},
|
|
{"pissing" ,0,2,0,0,NULL,0,NULL},
|
|
{"poontang" ,0,2,0,0,NULL,0,NULL},
|
|
{"privatex" ,0,2,0,0,NULL,0,NULL},
|
|
{"prono" ,0,2,0,0,NULL,0,NULL},
|
|
{"publicnudity" ,0,2,0,0,NULL,0,NULL},
|
|
{"puffynipple" ,0,2,0,0,NULL,0,NULL},
|
|
{"racqueldarrian" ,0,2,0,0,NULL,0,NULL},
|
|
//{"rape" ,0,2,0,0,NULL,0,NULL}, // dup!
|
|
{"rawlink" ,0,2,0,0,NULL,0,NULL},
|
|
{"realhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"rubberfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"seka" ,0,2,0,0,NULL,0,NULL},
|
|
{"sheboy" ,0,2,0,0,NULL,0,NULL},
|
|
{"showcam" ,0,2,0,0,NULL,0,NULL},
|
|
{"showercam" ,0,2,0,0,NULL,0,NULL},
|
|
{"smallbreast" ,0,2,0,0,NULL,0,NULL},
|
|
{"smalldick" ,0,2,0,0,NULL,0,NULL},
|
|
{"spycamadult" ,0,2,0,0,NULL,0,NULL},
|
|
{"strapon" ,0,2,0,0,NULL,0,NULL},
|
|
{"stripclub" ,0,2,0,0,NULL,0,NULL},
|
|
{"stripshow" ,0,2,0,0,NULL,0,NULL},
|
|
{"striptease" ,0,2,0,0,NULL,0,NULL},
|
|
{"strokeit" ,0,2,0,0,NULL,0,NULL},
|
|
{"strokeme" ,0,2,0,0,NULL,0,NULL},
|
|
{"suckdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"sylviasaint" ,0,2,0,0,NULL,0,NULL},
|
|
{"teenhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"teenie" ,0,2,0,0,NULL,0,NULL},
|
|
{"teenpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"teensuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"tgp" ,0,2,0,0,NULL,0,NULL},
|
|
{"threesome" ,0,2,0,0,NULL,0,NULL},
|
|
{"thumblord" ,0,2,0,0,NULL,0,NULL},
|
|
{"thumbzilla" ,0,2,0,0,NULL,0,NULL},
|
|
{"tiffanytowers" ,0,2,0,0,NULL,0,NULL},
|
|
{"tinytitties" ,0,2,0,0,NULL,0,NULL},
|
|
//{"tities" ,0,2,0,0,NULL,0,NULL}, // entities
|
|
{"titman" ,0,2,0,0,NULL,0,NULL},
|
|
{"titsandass" ,0,2,0,0,NULL,0,NULL},
|
|
{"titties" ,0,2,0,0,NULL,0,NULL},
|
|
{"titts" ,0,2,0,0,NULL,0,NULL},
|
|
{"titty" ,0,2,0,0,NULL,0,NULL},
|
|
{"tokyotopless" ,0,2,0,0,NULL,0,NULL},
|
|
{"tommysbookmark" ,0,2,0,0,NULL,0,NULL},
|
|
{"toplesswomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"trannies" ,0,2,0,0,NULL,0,NULL},
|
|
{"twinks" ,0,2,0,0,NULL,0,NULL},
|
|
{"ultradonkey" ,0,2,0,0,NULL,0,NULL},
|
|
{"ultrahardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"uncutcock" ,0,2,0,0,NULL,0,NULL},
|
|
{"vividtv" ,0,2,0,0,NULL,0,NULL},
|
|
{"wendywhoppers" ,0,2,0,0,NULL,0,NULL},
|
|
{"wetdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"wetpanties" ,0,2,0,0,NULL,0,NULL},
|
|
{"wifesharing" ,0,2,0,0,NULL,0,NULL},
|
|
{"wifeswapping" ,0,2,0,0,NULL,0,NULL},
|
|
{"xrated" ,0,2,0,0,NULL,0,NULL}
|
|
};
|
|
|
|
|
|
// . store this in clusterdb rec so family filter works!
|
|
// . check content for adult words
|
|
char *XmlDoc::getIsAdult ( ) {
|
|
|
|
if ( m_isAdultValid ) return &m_isAdult2;
|
|
|
|
// call that
|
|
setStatus ("getting is adult bit");
|
|
|
|
int32_t **pici = getIndCatIds();
|
|
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
|
|
|
|
// check categories
|
|
for ( int32_t i = 0 ; i < size_indCatIds / 4 ; i++ ) {
|
|
int32_t ic = ptr_indCatIds[i];
|
|
// skip if not an adult category
|
|
if ( ! g_categories->isIdAdult ( ic ) ) continue;
|
|
// got it
|
|
m_isAdult = true;
|
|
m_isAdult2 = true;
|
|
m_isAdultValid = true;
|
|
return &m_isAdult2;
|
|
}
|
|
|
|
// . if any of the wiki docids we are in are adult.... then we are
|
|
// . we set the top bit of wiki docids to indicate if adult
|
|
//for ( int32_t i = 0 ; i < size_wikiDocIds / 8 ; i++ ) {
|
|
// int64_t d = ptr_wikiDocIds[i];
|
|
// if ( ! ( d & 0x8000000000000000 ) ) continue;
|
|
// // got it
|
|
// m_isAdult = true;
|
|
// m_isAdultValid = true;
|
|
// return &m_isAdult;
|
|
//}
|
|
|
|
// need the content
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1) return (char *)u8;
|
|
|
|
// time it
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
|
|
// score that up
|
|
int32_t total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
|
|
m_niceness , m_firstUrl.m_url );
|
|
|
|
// then the url
|
|
//char *u = getFirstUrl()->getUrl();
|
|
//total += getDirtyPoints ( u , gbstrlen(u) );
|
|
|
|
// and redir url
|
|
//char *r = getRedirUrl()->getUrl();
|
|
//total += getDirtyPoints ( r , gbstrlen(r) );
|
|
|
|
// debug msg
|
|
int64_t took = gettimeofdayInMilliseconds() - start;
|
|
if ( took > 10 )
|
|
logf(LOG_DEBUG,
|
|
"build: Took %" INT64 " ms to check doc of %" INT32 " bytes for "
|
|
"dirty words.",took,size_utf8Content-1);
|
|
|
|
m_isAdult = false;
|
|
// adult?
|
|
if ( total >= 2 ) m_isAdult = true;
|
|
// set shadow member
|
|
m_isAdult2 = (bool)m_isAdult;
|
|
// validate
|
|
m_isAdultValid = true;
|
|
|
|
// note it
|
|
if ( m_isAdult2 && g_conf.m_logDebugDirty )
|
|
log("dirty: %s points = %" INT32 "",m_firstUrl.m_url,total);
|
|
|
|
// return the adult flag (true if the dirty-word points hit the threshold)
|
|
return &m_isAdult2;
|
|
}
|
|
|
|
|
|
|
|
int32_t getDirtyPoints ( char *s , int32_t slen , int32_t niceness , char *url ) {
|
|
// . use the matches function to get all the matches
|
|
// . then check each match to see if it is actually a legit word
|
|
// . actually match the dirty words, then match the clean words
|
|
// then we can subtract counts.
|
|
int32_t numDirty = sizeof(s_dirtyWords) / sizeof(Needle);
|
|
getMatches2 ( s_dirtyWords ,
|
|
numDirty ,
|
|
s ,
|
|
slen ,
|
|
NULL , // linkPos
|
|
NULL , // needleNum
|
|
false , // stopAtFirstMatch?
|
|
NULL , // hadPreMatch ptr
|
|
true , // saveQuickTables?
|
|
niceness );
|
|
|
|
int32_t points = 0;
|
|
// each needle has an associated score
|
|
for ( int32_t i = 0 ; i < numDirty ; i++ ) {
|
|
// skip if no match
|
|
if ( s_dirtyWords[i].m_count <= 0 ) continue;
|
|
// . the "id", is positive for dirty words, - for clean
|
|
// . uses +2/-2 for really dirty words
|
|
// . uses +1/-1 for borderline dirty words
|
|
points += s_dirtyWords[i].m_id;
|
|
// log debug
|
|
if ( ! g_conf.m_logDebugDirty ) continue;
|
|
// show it in the log
|
|
log("dirty: %s %" INT32 " %s"
|
|
,s_dirtyWords[i].m_string
|
|
,(int32_t)s_dirtyWords[i].m_id
|
|
,url
|
|
);
|
|
}
|
|
|
|
////
|
|
//
|
|
// repeat for part2
|
|
//
|
|
// we have to do two separate parts otherwise the algo in
|
|
// matches2.cpp gets really slow. it was not meant to match
|
|
// so many needles in one haystack.
|
|
//
|
|
///
|
|
int32_t numDirty2 = sizeof(s_dirtyWordsPart2) / sizeof(Needle);
|
|
|
|
// . disable this for now. most of these are phrases and they
|
|
// will not be detected.
|
|
// . TODO: hash the dirty words and phrases and just lookup
|
|
// words in that table like we do for isStopWord(), but use
|
|
// isDirtyWord(). Then replace the code is Speller.cpp
|
|
// with isDirtyUrl() which will split the string into words
|
|
// and call isDirtyWord() on each one. also use bi and tri grams
|
|
// in the hash table.
|
|
numDirty2 = 0;
|
|
|
|
getMatches2 ( s_dirtyWordsPart2 ,
|
|
numDirty2 ,
|
|
s ,
|
|
slen ,
|
|
NULL , // linkPos
|
|
NULL , // needleNum
|
|
false , // stopAtFirstMatch?
|
|
NULL , // hadPreMatch ptr
|
|
true , // saveQuickTables?
|
|
niceness );
|
|
|
|
|
|
// each needle has an associated score
|
|
for ( int32_t i = 0 ; i < numDirty2 ; i++ ) {
|
|
// skip if no match
|
|
if ( s_dirtyWordsPart2[i].m_count <= 0 ) continue;
|
|
// . the "id", is positive for dirty words, - for clean
|
|
// . uses +2/-2 for really dirty words
|
|
// . uses +1/-1 for borderline dirty words
|
|
points += s_dirtyWordsPart2[i].m_id;
|
|
// log debug
|
|
if ( ! g_conf.m_logDebugDirty ) continue;
|
|
// show it in the log
|
|
log("dirty: %s %" INT32 " %s"
|
|
,s_dirtyWordsPart2[i].m_string
|
|
,(int32_t)s_dirtyWordsPart2[i].m_id
|
|
,url
|
|
);
|
|
}
|
|
|
|
|
|
return points;
|
|
}
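// . the TODO inside getDirtyPoints() suggests hashing the dirty words
//   and phrases once and looking up each word (plus bi/tri-grams) the
//   way isStopWord() works, instead of scanning every needle per doc.
// . standalone sketch of that idea (not compiled; std::unordered_map
//   and a toy FNV hash stand in for HashTableX and the engine's word
//   ids -- isDirtyWord()/isDirtyUrl() do not exist yet):
/*
#include <stdint.h>
#include <string>
#include <unordered_map>
static uint64_t toyHash ( const std::string &w ) {
	uint64_t h = 1469598103934665603ULL;            // FNV-1a offset
	for ( unsigned char c : w ) { h ^= c; h *= 1099511628211ULL; }
	return h;
}
// maps word hash -> dirty points (+2/+1, negative for exceptions)
static int isDirtyWord ( const std::string &w ,
			 const std::unordered_map<uint64_t,int> &table ) {
	auto it = table.find ( toyHash ( w ) );
	return it == table.end() ? 0 : it->second;
}
// isDirtyUrl() would then split the url into words, sum isDirtyWord()
// over unigrams/bigrams/trigrams and compare against the same
// threshold of 2 used above.
*/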
|
|
|
|
|
|
int32_t **XmlDoc::getIndCatIds ( ) {
|
|
// if XmlDoc was set from a titleRec it should validate this
|
|
if ( m_indCatIdsValid ) return &ptr_indCatIds;
|
|
// otherwise, we must compute them!
|
|
CatRec *cat = getCatRec ();
|
|
// blocked or error?
|
|
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
|
|
// set this
|
|
ptr_indCatIds = cat->m_indCatids;
|
|
size_indCatIds = cat->m_numIndCatids * 4;
|
|
m_indCatIdsValid = true;
|
|
// parse that up
|
|
return &ptr_indCatIds;
|
|
}
|
|
|
|
int32_t **XmlDoc::getCatIds ( ) {
|
|
// if XmlDoc was set from a titleRec it should validate this
|
|
if ( m_catIdsValid ) return &ptr_catIds;
|
|
// otherwise, we must compute them!
|
|
CatRec *cat = getCatRec ();
|
|
// blocked or error?
|
|
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
|
|
// set this
|
|
ptr_catIds = cat->m_catids;
|
|
size_catIds = cat->m_numCatids * 4;
|
|
m_catIdsValid = true;
|
|
// parse that up
|
|
return &ptr_catIds;
|
|
}
|
|
|
|
CatRec *XmlDoc::getCatRec ( ) {
|
|
// return what we got
|
|
if ( m_catRecValid ) return &m_catRec;
|
|
// call that
|
|
setStatus ("getting dmoz cat rec");
|
|
// callback?
|
|
if ( m_calledMsg8b ) {
|
|
// return NULL on error
|
|
if ( g_errno ) return NULL;
|
|
// otherwise, success
|
|
m_catRecValid = true;
|
|
return &m_catRec;
|
|
}
|
|
// consider it called
|
|
m_calledMsg8b = true;
|
|
// assume empty and skip the call for now
|
|
m_catRec.reset();
|
|
m_catRecValid = true;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// let's bring dmoz back
|
|
//return &m_catRec;
|
|
// compute it otherwise
|
|
if ( ! m_msg8b.getCatRec ( &m_firstUrl ,
|
|
cr->m_coll ,
|
|
gbstrlen(cr->m_coll) ,
|
|
true , // use canonical name?
|
|
m_niceness ,
|
|
&m_catRec , // store here
|
|
m_masterState , // state
|
|
m_masterLoop )) // callback
|
|
// return -1 if we blocked
|
|
return (CatRec *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// we got it somehow without blocking... local cached lookup?
|
|
m_catRecValid = true;
|
|
return &m_catRec;
|
|
}
|
|
|
|
void gotWikiResultsWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->gotWikiResults ( slot );
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . get the wiki pages that this page matches
|
|
// . use the docids of the wiki pages to represent them
|
|
// . use an independent 32-node cluster to index all of wikipedia so it is all
|
|
// in ram. do not need datedb, etc.
|
|
// . get the gigabits for this page, up to 50 of them, and use that as a rat=0
|
|
// query on the wiki cluster
|
|
// . score each wiki docid too, based on match
|
|
// . normalize scores so they range from 10% to 100%, based on # of gigabits
|
|
// that the wiki page matches
|
|
// . index these as gbwiki:<wikipagedocid> with the score given (8-bit) mapped
|
|
// to 32 bits using score8to32() so the score itself is preserved
|
|
// . WE CAN ALSO call this at QUERY TIME, using the actual query of the
|
|
// searcher instead of the string of gigabits
|
|
// . BUT i will probably just look at the wiki topics of the search results,
|
|
// that will be faster and maybe more accurate...
|
|
int64_t **XmlDoc::getWikiDocIds ( ) {
|
|
|
|
if ( m_wikiDocIdsValid ) return (int64_t **)&ptr_wikiDocIds;
|
|
|
|
setStatus ( "getting wiki docids" );
|
|
|
|
// . get our gigabit vector
|
|
// . consists of array of 32-bit hashes
|
|
// . followed by 1-1 array of 16-bit scores
|
|
// . TODO: restrict gigabits to capitalized words and phrases, and
|
|
// also to 2+ word wiki titles
|
|
char *gq = getGigabitQuery ( );
|
|
if ( ! gq || gq == (char *)-1 ) return (int64_t **)gq;
|
|
|
|
// empty? then no wiki match i guess
|
|
//logf(LOG_DEBUG,"FIX ME FIX ME - getWikiDocIds");
|
|
|
|
// MDW: for now bail here too!
|
|
if ( ! gq[0] || 1 == 1 ) {
|
|
ptr_wikiDocIds = m_wikiDocIds;
|
|
ptr_wikiScores = m_wikiScores;
|
|
size_wikiDocIds = 0;
|
|
size_wikiScores = 0;
|
|
m_wikiDocIdsValid = true;
|
|
return (int64_t **)&ptr_wikiDocIds;
|
|
}
|
|
|
|
// set our query to these gigabits
|
|
// re-enable this later
|
|
//if ( ! m_calledMsg40 ) m_wq.set ( gq );
|
|
|
|
int32_t need = 200 + gbstrlen(gq);
|
|
// make buf
|
|
m_wikiqbuf = (char *)mmalloc ( need , "wikiqbuf");
|
|
// error?
|
|
if ( ! m_wikiqbuf ) return NULL;
|
|
// save size
|
|
m_wikiqbufSize = need;
|
|
// use large single tier for speed
|
|
char *p = m_wikiqbuf;
|
|
p += sprintf ( p ,
|
|
"GET /search?raw=9&n=%" INT32 "&sc=0&dr=0&"//dio=1&"
|
|
"t0=1000000&rat=0&"
|
|
"c=wiki&q=%s", (int32_t)MAX_WIKI_DOCIDS, gq );
|
|
// terminate it
|
|
*p++ = '\0';
|
|
// then put in the ip
|
|
*(int32_t *)p = g_hostdb.m_myHost->m_ip;
|
|
// skip over ip
|
|
p += 4;
|
|
// sanity check
|
|
if ( p - m_wikiqbuf > need ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t ip = g_conf.m_wikiProxyIp;
|
|
// if not given, make it gf1 for now
|
|
if ( ! ip ) ip = atoip ( "10.5.62.11" , 10 );
|
|
|
|
int32_t port = g_conf.m_wikiProxyPort;
|
|
// port default too to gf1
|
|
if ( ! port ) port = 9002;
|
|
|
|
// send it using msg 0xfd to the wiki cluster's proxy
|
|
if ( ! g_udpServer.sendRequest ( m_wikiqbuf ,
|
|
p - m_wikiqbuf ,
|
|
0xfd ,
|
|
ip ,
|
|
port ,
|
|
-1 , // hostId
|
|
NULL , // retSlot
|
|
this , // state
|
|
gotWikiResultsWrapper ,
|
|
1000 ) )
|
|
// we had an error, g_errno should be set
|
|
return NULL;
|
|
|
|
// got without blocking? no way!
|
|
return (int64_t **)-1;
|
|
}
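// . the 0xfd request built above is just a NUL-terminated GET query
//   string with the requesting host's raw 4-byte ip appended after the
//   NUL, which is what the proxy on the wiki cluster expects.
// . standalone sketch of that wire layout (not compiled; the query
//   parameters are illustrative and buffer sizing is left to the
//   caller, as in the code above):
/*
#include <stdint.h>
#include <stdio.h>
#include <string.h>
static int32_t buildWikiRequest ( char *buf , int32_t bufSize ,
				  const char *gigabitQuery ,
				  int32_t maxDocIds , int32_t myIp ) {
	char *p = buf;
	p += snprintf ( p , bufSize - 5 ,
			"GET /search?raw=9&n=%d&sc=0&dr=0&"
			"t0=1000000&rat=0&c=wiki&q=%s" ,
			(int)maxDocIds , gigabitQuery );
	*p++ = '\0';                  // terminate the query string
	memcpy ( p , &myIp , 4 );     // then the raw 4-byte ip
	p += 4;
	return p - buf;               // bytes to pass to sendRequest()
}
*/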
|
|
|
|
void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
|
|
|
|
setStatus ( "got wiki docids" );
|
|
|
|
// do not free our request in slot
|
|
slot->m_sendBufAlloc = NULL;
|
|
|
|
// free request buf
|
|
mfree ( m_wikiqbuf , m_wikiqbufSize , "wikiqbuf" );
|
|
|
|
// error getting the wiki results?
|
|
if ( g_errno ) return;
|
|
|
|
// TODO: normalize all scores with each other somehow. i think
|
|
// they are fairly absolute, but not sure with a lot of rat=0 terms!
|
|
logf(LOG_DEBUG,"wiki: fix my scoring stuff. have a min score... "
|
|
" and somehow normalize scores to be in [0,1.0]");
|
|
|
|
// . force this reply to be NULL terminated
|
|
// . i can't fix in the code now because the reply is coming from
|
|
// a different cluster running an older version of gb
|
|
char *s = slot->m_readBuf;
|
|
char *end = s + slot->m_readBufSize - 1;
|
|
// overwrite the last '>', who cares!
|
|
*end = '\0';
|
|
// make our xml
|
|
Xml xml;
|
|
if ( ! xml.set ( s ,
|
|
end - s ,
|
|
false , // ownData?
|
|
0 ,
|
|
false ,
|
|
TITLEREC_CURRENT_VERSION ,
|
|
false , // setParents?
|
|
m_niceness ,
|
|
CT_HTML ))
|
|
// return if g_errno got set
|
|
return;
|
|
|
|
// grab docids
|
|
int32_t nd = 0;
|
|
int32_t nn = xml.getNumNodes();
|
|
XmlNode *nodes = xml.getNodes();
|
|
float score = 0.0;
|
|
int64_t docId = 0LL;
|
|
for ( int32_t i = 0 ; i + 1 < nn ; i++ ) {
|
|
if ( nodes[i].m_nodeId != 1 ) continue;
|
|
// tagname is <docid>?
|
|
if ( nodes[i].m_tagNameLen == 5 &&
|
|
nodes[i].m_tagName[0] == 'd' &&
|
|
! strncmp(nodes[i].m_tagName,"docId",5) )
|
|
// the number itself is in the text node that follows the
// <docId> tag, not in the tag name (assumes XmlNode::m_node
// points at that text)
docId = atoll ( nodes[i+1].m_node );
|
|
// is <score>? (after docid tag)
|
|
if ( nodes[i].m_tagNameLen == 8 &&
|
|
nodes[i].m_tagName[0] == 'a' &&
|
|
! strncmp(nodes[i].m_tagName,"absScore",8) ) {
|
|
// likewise, read the score from the following text node
score = atof ( nodes[i+1].m_node );
|
|
// add it
|
|
m_wikiDocIds [ nd ] = docId;
|
|
m_wikiScores [ nd ] = score;
|
|
nd++;
|
|
// do not overflow
|
|
if ( nd >= MAX_WIKI_DOCIDS ) break;
|
|
}
|
|
}
|
|
// point to them
|
|
ptr_wikiDocIds = m_wikiDocIds;
|
|
ptr_wikiScores = m_wikiScores;
|
|
size_wikiDocIds = nd * 8;
|
|
size_wikiScores = nd * sizeof(rscore_t);
|
|
|
|
log ( LOG_DEBUG , "build: got %" INT32 " wiki docids",nd);
|
|
|
|
m_wikiDocIdsValid = true;
|
|
}
|
|
|
|
int32_t *XmlDoc::getPubDate ( ) {
|
|
if ( m_pubDateValid ) return (int32_t *)&m_pubDate;
|
|
// get date parse
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
|
|
// got it
|
|
m_pubDateValid = true;
|
|
m_pubDate = dp->getPubDate();
|
|
// print it once for page parser. we now do this in XmlDoc::print()
|
|
//if ( m_pbuf ) m_dates.printPubDates ( m_pbuf );
|
|
// set m_ageInDays
|
|
if ( m_pubDate == (uint32_t)-1 ) return (int32_t *)&m_pubDate;
|
|
// for parsing date
|
|
//int32_t currentTime = getTimeGlobal();
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t spideredTime = getSpideredTime();
|
|
// get doc age
|
|
//float age = currentTime - m_pubDate;
|
|
float age = spideredTime - m_pubDate;
|
|
// convert to days (could be negative if in the future)
|
|
m_ageInDays = age / (3600*24.0);
|
|
// fix it if negative
|
|
if ( m_ageInDays < 0.0 ) m_ageInDays = 0.0;
|
|
return (int32_t *)&m_pubDate;
|
|
}
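// . the age computed above is just seconds-since-publication converted
//   to (possibly fractional) days and clamped at zero. worked example
//   (not compiled):
/*
static float ageInDays ( int32_t spideredTime , int32_t pubDate ) {
	float days = ( spideredTime - pubDate ) / (3600*24.0);
	return days < 0.0 ? 0.0 : days;   // future pub dates clamp to 0
}
// ageInDays ( 1452211200 , 1451606400 ) == 7.0   (one week apart)
*/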
|
|
|
|
Dates *XmlDoc::getDates ( ) {
|
|
if ( m_datesValid ) return &m_dates;
|
|
// skip for now
|
|
m_datesValid = true;
|
|
return &m_dates;
|
|
|
|
// set status. we can time status changes with this routine!
|
|
setStatus ( "getting dates");
|
|
|
|
Dates *dd = getSimpleDates();
|
|
// bail on error
|
|
if ( ! dd ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return NULL;
|
|
}
|
|
|
|
// need addresses
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (void *)-1 ) return (Dates *)aa;
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (Dates *)isRoot;
|
|
|
|
// . get root doc, from titlerec is ok ( TODO: make sure from titlerec)
|
|
// . TODO: make sure to save in titledb too???
|
|
// . we need this now too
|
|
// . now set DF_IN_ROOTDOC on dates that were in the same section but
|
|
// in the root doc.
|
|
// . if we are not the root, we use the root title rec to see if
|
|
// the website repeats the store hours on every page. in that case
|
|
// . TODO: a special cache just for holding "svt" for root pages.
|
|
// should be highly efficient!!!
|
|
//XmlDoc *rd = NULL;
|
|
|
|
// setPart2() needs the implied sections set, so set them
|
|
Sections *sections = getSections();
|
|
if ( !sections ||sections==(Sections *)-1) return(Dates *)sections;
|
|
|
|
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
//if ( ! osvt || osvt == (void *)-1 ) return (Dates *)osvt;
|
|
|
|
// table should be empty if we are the root!
|
|
//HashTableX *rvt = getRootVotingTable();
|
|
//if ( ! rvt || rvt == (void *)-1 ) return (Dates *)rvt;
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS || isRSS == (void *)-1 ) return (Dates *)isRSS;
|
|
|
|
uint8_t *ctype = getContentType();
|
|
if ( ! ctype || ctype == (void *)-1 ) return (Dates *)ctype;
|
|
|
|
bool isXml = false;
|
|
if ( *isRSS ) isXml = true;
|
|
if ( *ctype == CT_XML ) isXml = true;
|
|
|
|
int32_t minPubDate = -1;
|
|
int32_t maxPubDate = -1;
|
|
// parentPrevSpiderTime is 0 if that was the first time that the
|
|
// parent was spidered, in which case isNewOutlink will always be set
|
|
// for every outlink it had!
|
|
if ( m_sreqValid &&
|
|
m_sreq.m_isNewOutlink &&
|
|
m_sreq.m_parentPrevSpiderTime ) {
|
|
// pub date is somewhere between these two times
|
|
minPubDate = m_sreq.m_parentPrevSpiderTime;
|
|
//maxPubDate = m_sreq.m_addedTime;
|
|
maxPubDate = m_sreq.m_discoveryTime;
|
|
}
|
|
|
|
// now set part2 , returns false and sets g_errno on error
|
|
if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
|
|
isXml , *isRoot )) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: dates2: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
|
|
// on all other errors, return NULL
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
// debug EBADENGINEER error
|
|
if ( g_errno ) { char *xx=NULL;*xx=0; }
|
|
|
|
// overflow? does not set g_errno. at least clear all so we do not
|
|
// get a messed up partial representation.
|
|
//if ( m_dates.m_overflowed ) {
|
|
// log("doc: date overflow for %s",m_firstUrl.m_url);
|
|
// m_dates.reset();
|
|
//}
|
|
|
|
// only call it once
|
|
m_datesValid = true;
|
|
// return it
|
|
return &m_dates;
|
|
}
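// . the minPubDate/maxPubDate bracket passed to setPart2() above: if
//   this url first appeared as a brand-new outlink on a page we had
//   spidered before, its pub date must lie between the parent's
//   previous spider time and the time we discovered the link.
// . tiny sketch of applying such a bracket to a candidate date (not
//   compiled; the real constraint is enforced inside Dates, not here):
/*
static int32_t bracketPubDate ( int32_t candidate ,
				int32_t minPubDate , int32_t maxPubDate ) {
	if ( minPubDate > 0 && candidate < minPubDate ) return minPubDate;
	if ( maxPubDate > 0 && candidate > maxPubDate ) return maxPubDate;
	return candidate;
}
*/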
|
|
|
|
Dates *XmlDoc::getSimpleDates ( ) {
|
|
|
|
if ( m_simpleDatesValid ) return &m_dates;
|
|
// note that
|
|
setStatus("get dates part 1");
|
|
// try the current url
|
|
Url *u = getCurrentUrl();
|
|
// and ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (Dates *)ip;
|
|
// the docid
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Dates *)d;
|
|
// the site hash
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Dates *)sh32;
|
|
// words
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (Dates *)words;
|
|
// we set the D_IS_IN_DATE flag for these bits
|
|
Bits *bits = getBits(); if ( ! bits ) return NULL;
|
|
// sections. is it ok that these do not include implied sections?
|
|
Sections *sections = getExplicitSections();
|
|
if (!sections||sections==(Sections *)-1) return (Dates *)sections;
|
|
// link info (this is what we had the problem with)
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Dates *)info1;
|
|
//int32_t *sv = getPageSampleVector();
|
|
//if ( ! sv || sv == (int32_t *)-1 ) return (Dates *)sv;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Dates *)xml;
|
|
// this must be valid, cuz Dates.cpp uses it!
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0;}
|
|
// . get the xml doc of the previously stored title rec
|
|
// . Dates will compare the two docs to check for clocks, etc.
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (Dates *)pod;
|
|
Url **redir = getRedirUrl();
|
|
if ( ! redir || redir == (Url **)-1 ) return (Dates *)redir;
|
|
//char *ru = NULL;
|
|
//if ( *redir ) ru = (*redir)->getUrl();
|
|
|
|
// this should deserialize from its title rec data
|
|
//Dates *odp = NULL;
|
|
//if ( *pod ) odp = (*pod)->getDates ();
|
|
// the key in this table is the date tagHash and occNum, and the
|
|
// value is the timestamp of the date. this is used by the clock
|
|
// detection algorithm to compare a date in the previous version
|
|
// of this web page to see if it changed and is therefore a clock then.
|
|
// HashTableX *cct = NULL;
|
|
// if ( *pod ) cct = (*pod)->getClockCandidatesTable();
|
|
// this should be valid
|
|
uint8_t ctype = *getContentType();
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// this now returns false and sets g_errno on error, true on success
|
|
if ( ! m_dates.setPart1 ( u , //->getUrl(),
|
|
*redir, // ru ,
|
|
ctype ,
|
|
*ip ,
|
|
*d ,
|
|
*sh32 ,
|
|
xml ,
|
|
words ,
|
|
// set D_IS_IN_DATE flag so Address.cpp
|
|
// can avoid such word in addresses!
|
|
bits ,
|
|
sections ,
|
|
info1 ,
|
|
//sv ,
|
|
//odp , // old dates
|
|
NULL , // cct ,
|
|
this , // us
|
|
*pod , // old XmlDoc
|
|
cr->m_coll ,
|
|
m_niceness )) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: dates1: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
|
|
// on all other errors, return NULL
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
// only call it once
|
|
m_simpleDatesValid = true;
|
|
// return it
|
|
return &m_dates;
|
|
}
|
|
|
|
/*
|
|
// returns NULL and sets g_errno on error, returns -1 if blocked
|
|
HashTableX *XmlDoc::getClockCandidatesTable ( ) {
|
|
// return if valid
|
|
if ( m_clockCandidatesTableValid ) return &m_clockCandidatesTable;
|
|
// otherwise, deserialize?
|
|
if ( m_clockCandidatesDataValid ) {
|
|
// and table is now valid
|
|
m_clockCandidatesTableValid = true;
|
|
// return empty table if ptr is NULL. take this out then.
|
|
if(!ptr_clockCandidatesData ) return &m_clockCandidatesTable;
|
|
// otherwise, deserialize
|
|
m_clockCandidatesTable.deserialize(ptr_clockCandidatesData ,
|
|
size_clockCandidatesData,
|
|
m_niceness );
|
|
// and return that
|
|
return &m_clockCandidatesTable;
|
|
}
|
|
|
|
// no longer using this since we got ptr_metadata
|
|
return &m_clockCandidatesTable;
|
|
|
|
// otherwise, get our dates
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (HashTableX *)dp;
|
|
// reset table just in case
|
|
m_clockCandidatesTable.reset();
|
|
// if no dates, bail
|
|
if ( dp->m_numDatePtrs == 0 ) {
|
|
m_clockCandidatesTableValid = true;
|
|
m_clockCandidatesDataValid = true;
|
|
// ptr_clockCandidatesData = NULL;
|
|
// size_clockCandidatesData = 0;
|
|
return &m_clockCandidatesTable;
|
|
}
|
|
// and set size to 32 buckets to start
|
|
if ( ! m_clockCandidatesTable.set (8,4,32,NULL,0,false,m_niceness,
|
|
"clockcands") )
|
|
return NULL;
|
|
// now stock the table
|
|
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get date
|
|
Date *di = dp->m_datePtrs[i];
|
|
// skip if got nuked
|
|
if ( ! di ) continue;
|
|
// make the key
|
|
int64_t key ;
|
|
// lower 32 bits is taghash
|
|
key = di->m_tagHash;
|
|
// upper 32 bits is occNum
|
|
key |= ((int64_t)(di->m_occNum)) << 32;
|
|
// timestamp is the val
|
|
int32_t val = di->m_timestamp;
|
|
// then store it
|
|
if ( ! m_clockCandidatesTable.addKey ( &key , &val ) )
|
|
return NULL;
|
|
}
|
|
// that is now valid
|
|
m_clockCandidatesTableValid = true;
|
|
// how many bytes to serialize?
|
|
int32_t need = m_clockCandidatesTable.getStoredSize();
|
|
// now make the ptr valid
|
|
if ( ! m_cctbuf.reserve ( need ) ) return NULL;
|
|
// store it in there
|
|
m_clockCandidatesTable.serialize ( &m_cctbuf );
|
|
// point to it
|
|
// ptr_clockCandidatesData = m_cctbuf.getBufStart();
|
|
// size_clockCandidatesData = need;
|
|
// that is valid now
|
|
m_clockCandidatesDataValid = true;
|
|
return &m_clockCandidatesTable;
|
|
}
|
|
*/
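// . the disabled table above keyed each clock candidate on a 64-bit
//   value: the date's tag hash in the low 32 bits, its occurrence
//   number in the high 32 bits, with the timestamp as the value.
// . pack/unpack sketch of that key layout (not compiled):
/*
#include <stdint.h>
static int64_t makeClockKey ( int32_t tagHash , int32_t occNum ) {
	int64_t key = (uint32_t)tagHash;          // low 32 bits
	key |= ((int64_t)occNum) << 32;           // high 32 bits
	return key;
}
static int32_t clockKeyTagHash ( int64_t key ) { return (int32_t) key; }
static int32_t clockKeyOccNum  ( int64_t key ) { return (int32_t)(key>>32); }
*/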
|
|
|
|
// a date of -1 means not found or unknown
|
|
int32_t XmlDoc::getUrlPubDate ( ) {
|
|
if ( m_urlPubDateValid ) return m_urlPubDate;
|
|
// need a first url. caller should have called setFirstUrl()
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
// use Dates
|
|
//Dates dp;
|
|
// -1 means unknown
|
|
m_urlPubDate = -1;
|
|
//m_urlAge = -1;
|
|
// try the FIRST url
|
|
Url *u = getFirstUrl();
|
|
// get last url we redirected to
|
|
Url **redir = getRedirUrl();
|
|
if ( ! redir || redir == (Url **)-1 ) {char *xx=NULL;*xx=0;}
|
|
|
|
subloop:
|
|
// . try to get the date just from the url
|
|
// . this will be zero if none found
|
|
m_urlPubDate = parseDateFromUrl ( u->getUrl() );
|
|
// we are kosher
|
|
m_urlPubDateValid = true;
|
|
// if we are unknown try last/redir url, if any
|
|
if ( m_urlPubDate == 0 && *redir && u != *redir ) {
|
|
u = *redir;
|
|
goto subloop;
|
|
}
|
|
// if we got a valid pub date from the url, set "m_urlAge"
|
|
if ( m_urlPubDate == 0 ) return m_urlPubDate;
|
|
// note it
|
|
log ( LOG_DEBUG, "date: Got url pub date: %" UINT32 "",
|
|
(uint32_t)m_urlPubDate );
|
|
// set the age
|
|
//m_urlAge = getTimeGlobal() - m_urlPubDate;
|
|
//if ( m_urlAge < 0 ) m_urlAge = 0;
|
|
return m_urlPubDate;
|
|
}
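// . parseDateFromUrl() (defined elsewhere) returns 0 when the url path
//   carries no recognizable date. a minimal stand-in that only handles
//   the common /YYYY/MM/DD/ blog pattern might look like this (not
//   compiled; the real parser understands many more formats):
/*
#include <stdio.h>
#include <string.h>
#include <time.h>
static int32_t sketchParseDateFromUrl ( const char *u ) {
	int y , m , d;
	for ( const char *p = u ; *p ; p++ ) {
		if ( *p != '/' ) continue;
		if ( sscanf ( p , "/%4d/%2d/%2d/" , &y , &m , &d ) != 3 )
			continue;
		if ( y < 1990 || y > 2037 ) continue;
		if ( m < 1 || m > 12 || d < 1 || d > 31 ) continue;
		struct tm t; memset ( &t , 0 , sizeof(t) );
		t.tm_year = y - 1900; t.tm_mon = m - 1; t.tm_mday = d;
		return (int32_t)timegm ( &t );    // utc timestamp
	}
	return 0;                                 // 0 = no date found
}
*/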
|
|
|
|
// . use Dates to extract pub date from the url itself if pub date exists
|
|
// . an age of "-1" means unknown
|
|
/*
|
|
int32_t XmlDoc::getOutlinkAge ( int32_t outlinkNum ) {
|
|
// use Dates
|
|
Dates dp;
|
|
// sanity
|
|
if ( outlinkNum < 0 ) { char *xx=NULL;*xx=0; }
|
|
// get it
|
|
char *us = m_links.getLinkPtr(outlinkNum);
|
|
// for now set this, until we mod Dates to use normalized
|
|
// string urls
|
|
Url u;
|
|
u.set ( us );
|
|
// try to get the date just from the url
|
|
if ( ! dp.set ( &u ,
|
|
0 , // ip
|
|
0LL , // m_newDocId
|
|
0 , // siteHash
|
|
NULL , // Xml
|
|
NULL , // Words
|
|
NULL , // Bits
|
|
NULL , // Sections
|
|
NULL , // LinkInfo
|
|
NULL , // pageSampleVec
|
|
NULL , // old date parse2
|
|
NULL , // m_newDoc
|
|
NULL , // m_oldDoc
|
|
m_coll ,
|
|
0 , // defaultTimeZone
|
|
m_niceness )){
|
|
// should never block!
|
|
char *xx=NULL; *xx= 0; }
|
|
// this will be -1 if no date was found in the url
|
|
int32_t urlPubDate = dp.getPubDate();
|
|
// if we got a valid pub date from the url, set "m_urlAge"
|
|
if ( urlPubDate == -1 ) return -1;
|
|
// note it
|
|
//log ( LOG_DEBUG, "date: Got url pub date: %" UINT32 "", m_urlDate );
|
|
// set the age
|
|
int32_t age = getTimeGlobal() - urlPubDate;
|
|
// keep positive
|
|
if ( age < 0 ) age = 0;
|
|
// return it
|
|
return age;
|
|
}
|
|
*/
|
|
|
|
|
|
// . sets g_errno on error and returns NULL
|
|
// . now returns a ptr to it so we can return NULL to signify error, that way
|
|
// all accessors have equivalent return values
|
|
// . an accessor function returns (char *)-1 if it blocked!
|
|
char *XmlDoc::getIsPermalink ( ) {
|
|
if ( m_isPermalinkValid ) return &m_isPermalink2;
|
|
Url *url = getCurrentUrl();
|
|
if ( ! url ) return NULL;
|
|
char *isRSS = getIsRSS();
|
|
// return NULL with g_errno set, -1 if blocked
|
|
if ( ! isRSS || isRSS == (char *)-1 ) return isRSS;
|
|
Links *links = getLinks();
|
|
// return NULL with g_errno set, -1 if blocked
|
|
if ( ! links || links == (Links *)-1 ) return (char *)links;
|
|
uint8_t *ct = getContentType();
|
|
// return NULL with g_errno set, -1 if blocked
|
|
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
|
|
// GUESS if it is a permalink by the format of the url
|
|
int32_t p = ::isPermalink ( links , // Links ptr
|
|
url ,
|
|
*ct , // CT_HTML default?
|
|
NULL , // LinkInfo ptr
|
|
*isRSS );// isRSS?
|
|
m_isPermalink = p;
|
|
m_isPermalink2 = p;
|
|
m_isPermalinkValid = true;
|
|
return &m_isPermalink2;
|
|
}
|
|
|
|
// guess based on the format of the url if this is a permalink
|
|
char *XmlDoc::getIsUrlPermalinkFormat ( ) {
|
|
if ( m_isUrlPermalinkFormatValid ) return &m_isUrlPermalinkFormat;
|
|
|
|
setStatus ( "getting is url permalink format" );
|
|
|
|
Url *url = getCurrentUrl();
|
|
if ( ! url ) return NULL;
|
|
// just guess if we are rss here since we most likely do not have
|
|
// access to the url's content...
|
|
bool isRSS = false;
|
|
char *ext = url->getExtension();
|
|
if ( ext && strcasecmp(ext,"rss") == 0 ) isRSS = true;
|
|
// GUESS if it is a permalink by the format of the url
|
|
int32_t p = ::isPermalink ( NULL , // Links ptr
|
|
url ,
|
|
CT_HTML ,
|
|
NULL , // LinkInfo ptr
|
|
isRSS );// we guess this...
|
|
m_isUrlPermalinkFormat = p;
|
|
m_isUrlPermalinkFormatValid = true;
|
|
return &m_isUrlPermalinkFormat;
|
|
}
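// . ::isPermalink() (defined elsewhere) guesses from the url format
//   alone when, as here, we may not have the content. a toy version of
//   that guess -- a date in the path with a slug after it -- might look
//   like this (not compiled; the real heuristic also weighs the links,
//   rss flag and content type passed in above):
/*
#include <stdio.h>
static bool looksLikePermalinkUrl ( const char *path ) {
	int y , m , n;
	for ( const char *p = path ; *p ; p++ ) {
		if ( *p != '/' ) continue;
		n = 0;
		if ( sscanf ( p , "/%4d/%2d/%n" , &y , &m , &n ) != 2 ||
		     n == 0 ) continue;
		if ( y < 1990 || y > 2037 || m < 1 || m > 12 ) continue;
		return p[n] != '\0';    // something (a slug) follows the date
	}
	return false;
}
*/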
|
|
|
|
char *XmlDoc::getIsRSS ( ) {
|
|
if ( m_isRSSValid ) return &m_isRSS2;
|
|
// the xml tells us for sure
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
m_isRSS = xml->isRSSFeed();
|
|
m_isRSS2 = (bool)m_isRSS;
|
|
m_isRSSValid = true;
|
|
return &m_isRSS2;
|
|
}
|
|
|
|
char *XmlDoc::getIsSiteMap ( ) {
|
|
if ( m_isSiteMapValid ) return &m_isSiteMap;
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
|
|
char *uf = m_firstUrl.getFilename();
|
|
int32_t ulen = m_firstUrl.getFilenameLen();
|
|
// sitemap.xml
|
|
m_isSiteMap = false;
|
|
// must be xml to be a sitemap
|
|
if ( *ct == CT_XML &&
|
|
ulen == 11 &&
|
|
strncmp(uf,"sitemap.xml",11) == 0 )
|
|
m_isSiteMap = true;
|
|
m_isSiteMapValid = true;
|
|
return &m_isSiteMap;
|
|
}
|
|
|
|
// . this function should really be called getTagTokens() because it mostly
|
|
// works on HTML documents, not XML, and just sets an array of ptrs to
|
|
// the tags in the document, including ptrs to the text in between
|
|
// tags.
|
|
Xml *XmlDoc::getXml ( ) {
|
|
|
|
// return it if it is set
|
|
if ( m_xmlValid ) return &m_xml;
|
|
|
|
// note it
|
|
setStatus ( "parsing html");
|
|
|
|
// get the filtered content
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
|
|
int32_t u8len = size_utf8Content - 1;
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
|
|
|
|
// set it
|
|
if ( ! m_xml.set ( *u8 ,
|
|
u8len ,
|
|
false , // ownData?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
false , // setParentsArg?
|
|
m_niceness ,
|
|
*ct ) )
|
|
// return NULL on error with g_errno set
|
|
return NULL;
|
|
// set just once
|
|
m_xmlValid = true;
|
|
// all done
|
|
return &m_xml;
|
|
}
|
|
|
|
// Language support static stuff
|
|
enum {
|
|
METHOD_TAG = 0,
|
|
METHOD_DMOZ,
|
|
METHOD_URL,
|
|
METHOD_OUTLINKS,
|
|
METHOD_INLINKS,
|
|
METHOD_FREQ,
|
|
METHOD_DEFAULT,
|
|
METHOD_IP,
|
|
METHOD_ROOT,
|
|
METHOD_CAP
|
|
};
|
|
|
|
bool setLangVec ( Words *words ,
|
|
SafeBuf *langBuf ,
|
|
Sections *ss ,
|
|
int32_t niceness ) {
|
|
|
|
int64_t *wids = words->getWordIds ();
|
|
char **wptrs = words->m_words;
|
|
int32_t nw = words->getNumWords ();
|
|
|
|
// allocate
|
|
if ( ! langBuf->reserve ( nw ) ) return false;
|
|
|
|
uint8_t *langVector = (uint8_t *)langBuf->getBufStart();
|
|
|
|
// now set the langid
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// default
|
|
langVector[i] = langUnknown;
|
|
// add the word
|
|
if ( wids[i] == 0LL ) continue;
|
|
// skip if number
|
|
if ( is_digit(wptrs[i][0]) ) {
|
|
langVector[i] = langTranslingual;
|
|
continue;
|
|
}
|
|
// get the lang bits. does not include langTranslingual
|
|
// or langUnknown
|
|
int64_t bits = g_speller.getLangBits64 ( &wids[i] );
|
|
// skip if not unique
|
|
char count = getNumBitsOn64 ( bits ) ;
|
|
// if we only got one lang we could be, assume that
|
|
if ( count == 1 ) {
|
|
// get it. bit #0 is english, so add 1
|
|
char langId = getBitPosLL((uint8_t *)&bits) + 1;
|
|
//langVector[i] = g_wiktionary.getLangId(&wids[i]);
|
|
langVector[i] = langId;
|
|
continue;
|
|
}
|
|
// ambiguous? set it to unknown then
|
|
if ( count >= 2 ) {
|
|
langVector[i] = langUnknown;
|
|
continue;
|
|
}
|
|
// try setting based on script. greek. russian. etc.
|
|
// if the word was not in the wiktionary.
|
|
// this will be langUnknown if not definitive.
|
|
langVector[i] = getCharacterLanguage(wptrs[i]);
|
|
}
|
|
|
|
// . now go sentence by sentence
|
|
// . get the 64 bit vector for each word in the sentence
|
|
// . then intersect them all
|
|
// . if the result is a unique langid, assign that langid to
|
|
// all words in the sentence
|
|
|
|
// get first sentence in doc
|
|
Section *si = NULL;
|
|
if ( ss ) si = ss->m_firstSent;
|
|
// scan the sentence sections and or in the bits we should
|
|
for ( ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// reset vec
|
|
int64_t bits = LANG_BIT_MASK;
|
|
// get lang 64 bit vec for each wid in sentence
|
|
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[j] ) continue;
|
|
// skip if starts with digit
|
|
if ( is_digit(wptrs[j][0]) ) continue;
|
|
// get 64 bit lang vec. does not include
|
|
// langUnknown or langTranslingual bits
|
|
bits &= g_speller.getLangBits64 ( &wids[j] );
|
|
}
|
|
// bail if none
|
|
if ( ! bits ) continue;
|
|
// skip if more than one language in intersection
|
|
if ( getNumBitsOn64(bits) != 1 ) continue;
|
|
// get it. bit #0 is english, so add 1
|
|
char langId = getBitPosLL((uint8_t *)&bits) + 1;
|
|
// ok, must be this language i guess
|
|
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[j] ) continue;
|
|
// skip if starts with digit
|
|
if ( is_digit(wptrs[j][0]) ) continue;
|
|
// set it
|
|
langVector[j] = langId;
|
|
}
|
|
}
|
|
|
|
// try the same thing but do not use sentences. use windows of
|
|
// 5 words. this will pick up pages that have an english menu
|
|
// where each menu item is an individual sentence and only
|
|
// one word.
|
|
// http://www.topicexchange.com/
|
|
int64_t window[5];
|
|
int32_t wpos[5];
|
|
memset ( window , 0 , 8*5 );
|
|
int32_t wp = 0;
|
|
int32_t total = 0;
|
|
// now set the langid
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// must be alnum
|
|
if ( ! wids[i] ) continue;
|
|
// skip if starts with digit
|
|
if ( is_digit(wptrs[i][0]) ) continue;
|
|
// skip if lang already set to a language
|
|
//if ( langVector[i] != langUnknown &&
|
|
// langVector[i] != langTranslingual )
|
|
// continue;
|
|
// get last 5
|
|
window[wp] = g_speller.getLangBits64 ( &wids[i] );
|
|
// skip if not in dictionary!
|
|
if ( window[wp] == 0 ) continue;
|
|
// otherwise, store it
|
|
wpos [wp] = i;
|
|
if ( ++wp >= 5 ) wp = 0;
|
|
// need at least 3 samples
|
|
if ( ++total <= 2 ) continue;
|
|
// intersect them all together
|
|
int64_t bits = LANG_BIT_MASK;
|
|
for ( int32_t j = 0 ; j < 5 ; j++ ) {
|
|
// skip if uninitialized, like if we have 3
|
|
// or only 4 samples
|
|
if ( ! window[j] ) continue;
|
|
// otherwise, toss it in the intersection
|
|
bits &= window[j];
|
|
}
|
|
// skip if intersection empty
|
|
if ( ! bits ) continue;
|
|
// skip if more than one language in intersection
|
|
if ( getNumBitsOn64(bits) != 1 ) continue;
|
|
// get it. bit #0 is english, so add 1
|
|
char langId = getBitPosLL((uint8_t *)&bits) + 1;
|
|
// set all in window to this language
|
|
for ( int32_t j = 0 ; j < 5 ; j++ ) {
|
|
// skip if uninitialized
|
|
if ( ! window[j] ) continue;
|
|
// otherwise, set it
|
|
langVector[wpos[j]] = langId;
|
|
}
|
|
}
|
|
|
|
|
|
return true;
|
|
}
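// . the core trick in setLangVec(): every dictionary word carries a
//   64-bit mask of the languages it could belong to, a sentence (or
//   5-word window) ANDs those masks together, and only if exactly one
//   bit survives is that language stamped onto the words involved.
// . standalone sketch of the intersection step (not compiled; the gcc
//   popcount/ctz builtins stand in for getNumBitsOn64()/getBitPosLL(),
//   and out-of-dictionary words are skipped as in the window pass):
/*
#include <stdint.h>
// returns a langId (bit position + 1, bit #0 = english) or 0 if the
// window is empty or still ambiguous
static char intersectLangMasks ( const uint64_t *masks , int n ,
				 uint64_t allLangsMask ) {
	uint64_t bits = allLangsMask;             // like LANG_BIT_MASK
	for ( int i = 0 ; i < n ; i++ ) {
		if ( masks[i] == 0 ) continue;    // not in the dictionary
		bits &= masks[i];
	}
	if ( bits == 0 ) return 0;
	if ( __builtin_popcountll ( bits ) != 1 ) return 0;
	return (char)( __builtin_ctzll ( bits ) + 1 );
}
*/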
|
|
|
|
// 1-1 with the words!
|
|
uint8_t *XmlDoc::getLangVector ( ) {
|
|
|
|
if ( m_langVectorValid ) {
|
|
// can't return NULL, that means error!
|
|
uint8_t *v = (uint8_t *)m_langVec.getBufStart();
|
|
if ( ! v ) return (uint8_t *)0x01;
|
|
return v;
|
|
}
|
|
|
|
// words
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
|
|
|
|
// get the sections without implied sections
|
|
Sections *ss = getImpliedSections();
|
|
if ( ! ss || ss==(void *)-1) return (uint8_t *)ss;
|
|
|
|
|
|
if ( ! setLangVec ( words , &m_langVec , ss , m_niceness) )
|
|
return NULL;
|
|
|
|
m_langVectorValid = true;
|
|
// can't return NULL, that means error!
|
|
uint8_t *v = (uint8_t *)m_langVec.getBufStart();
|
|
if ( ! v ) return (uint8_t *)0x01;
|
|
return v;
|
|
}
|
|
|
|
// returns NULL and sets g_errno on error, -1 if blocked
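// . the body text is tried first via the per-word lang vector and
//   computeLangId(); if that is inconclusive we fall back to the meta
//   description, and then to the meta keywords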
|
|
uint8_t *XmlDoc::getLangId ( ) {
|
|
if ( m_langIdValid ) return &m_langId;
|
|
setStatus ( "getting lang id");
|
|
|
|
// debug hack
|
|
//m_langId = langRussian;
|
|
//m_langIdValid = true;
|
|
//return &m_langId;
|
|
|
|
// get the stuff we need
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (uint8_t *)ip;
|
|
|
|
// . if we got no ip, we can't get the page...
|
|
// . also getLinks() will call getSiteNumInlinks() which will
|
|
// call getSiteLinkInfo() and will core if ip is 0 or -1
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
m_langId = langUnknown;
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
//Xml *xml = getXml ();
|
|
//if ( ! xml || xml == (Xml *)-1 ) return (uint8_t *)xml;
|
|
Words *words = getWords ();
|
|
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
|
|
// do not get regular sections, getSections() which will call
|
|
// getImpliedSections(), because then that will need to set addresses
|
|
// and dates, etc. the addresses could return NULL with EBUFOVERFLOW
|
|
// from a static buffer overflow causing us some problems here and
|
|
// since that g_errno is only really handled well in getIndexCode()
|
|
// it will log that CRITICAL CRITICAL message. and we really only
|
|
// need the sections to avoid looking at script tag sections, etc.
|
|
// when calling Words::getLanguage()
|
|
Sections *sections = getExplicitSections();
|
|
// did it block?
|
|
if ( sections==(Sections *)-1) return(uint8_t *)sections;
|
|
// well, it still calls Dates::parseDates which can return g_errno
|
|
// set to EBUFOVERFLOW...
|
|
if ( ! sections && g_errno != EBUFOVERFLOW ) return NULL;
|
|
// if sections is still NULL - try lang id without sections then,
|
|
// reset g_errno
|
|
g_errno = 0;
|
|
//Links *links = getLinks();
|
|
//if ( ! links || links == (Links *)-1 ) return (uint8_t *)links;
|
|
//LinkInfo *info1 = getLinkInfo1();
|
|
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (uint8_t *)info1;
|
|
//CatRec *cat = getCatRec ();
|
|
//if ( ! cat || cat == (CatRec *)-1) return (uint8_t *)cat;
|
|
uint8_t *lv = getLangVector();
|
|
if ( ! lv || lv == (void *)-1 ) return (uint8_t *)lv;
|
|
|
|
setStatus ( "getting lang id");
|
|
|
|
// compute langid from vector
|
|
m_langId = computeLangId ( sections , words, (char *)lv );
|
|
if ( m_langId != langUnknown ) {
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
// . try the meta description i guess
|
|
// . 99% of the time we don't need this because the above code
|
|
// captures the language
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription( &mdlen );
|
|
Words mdw;
|
|
mdw.setx ( md , mdlen , m_niceness );
|
|
SafeBuf langBuf;
|
|
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
|
|
char *tmpLangVec = langBuf.getBufStart();
|
|
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
|
|
if ( m_langId != langUnknown ) {
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
// try meta keywords
|
|
md = getMetaKeywords( &mdlen );
|
|
mdw.setx ( md , mdlen , m_niceness );
|
|
langBuf.purge();
|
|
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
|
|
tmpLangVec = langBuf.getBufStart();
|
|
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
|
|
// lv = langVec
|
|
char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {
|
|
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
// this means null too
|
|
if ( sections && sections->m_numSections == 0 ) sp = NULL;
|
|
int32_t badFlags = SEC_SCRIPT|SEC_STYLE;//|SEC_SELECT;
|
|
|
|
int32_t counts [ MAX_LANGUAGES ];
|
|
memset ( counts , 0 , MAX_LANGUAGES * 4);
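// counts[langId] gets one vote per qualifying alnum word, using the
// per-word language vector lv[]; words inside script/style sections
// and words that look like url parts are skipped below, and the
// language with the most votes (ignoring langUnknown and
// langTranslingual) wins.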
|
|
|
|
|
|
|
|
int32_t nw = words->getNumWords ();
|
|
char **wptrs = words->m_words;
|
|
int32_t *wlens = words->m_wordLens;
|
|
|
|
|
|
// now set the langid
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if in script or style section
|
|
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
|
//
|
|
// skip if in a url
|
|
//
|
|
// blah/
|
|
if ( wptrs[i][wlens[i]] == '/' ) continue;
|
|
// blah.blah or blah?blah
|
|
if ( (wptrs[i][wlens[i]] == '.' ||
|
|
wptrs[i][wlens[i]] == '?' ) &&
|
|
is_alnum_a(wptrs[i][wlens[i]+1]) )
|
|
continue;
|
|
// /blah or ?blah
|
|
if ( (i>0 && wptrs[i][-1] == '/') ||
|
|
(i>0 && wptrs[i][-1] == '?') )
|
|
continue;
|
|
// add it up
|
|
counts[(unsigned char)lv[i]]++;
|
|
}
|
|
|
|
// get the majority count
|
|
int32_t max = 0;
|
|
int32_t maxi = 0;
|
|
// skip langUnknown by starting at 1, langEnglish
|
|
for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
|
|
// skip translingual
|
|
if ( i == langTranslingual ) continue;
|
|
if ( counts[i] <= max ) continue;
|
|
max = counts[i];
|
|
maxi = i;
|
|
}
|
|
|
|
return maxi;
|
|
//m_langId = maxi;
|
|
//m_langIdValid = true;
|
|
//return &m_langId;
|
|
|
|
/*
|
|
int32_t freqScore = 0;
|
|
int32_t lang;
|
|
if ( ! m_processedLang ) {
|
|
// do not repeat this call for this document
|
|
m_processedLang = true;
|
|
lang = words->getLanguage( sections ,
|
|
1000 , // sampleSize ,
|
|
m_niceness,
|
|
&freqScore);
|
|
// return NULL on error with g_errno set
|
|
if ( lang == -1 ) return NULL;
|
|
// we got it from words, return
|
|
if ( lang != 0 ) {
|
|
m_langId = lang;
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
}
|
|
|
|
m_langId = 0;
|
|
// try from charset
|
|
uint16_t *charset = getCharset ( );
|
|
if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset;
|
|
// do based on charset
|
|
if ( *charset == csGB18030 ) m_langId = langChineseTrad;
|
|
if ( *charset == csGBK ) m_langId = langChineseSimp;
|
|
|
|
if ( m_langId ) {
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
|
|
// this lookup here might be unnecessary
|
|
uint8_t *rl = NULL;
|
|
if ( ! *isRoot ) {
|
|
rl = getRootLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
|
|
}
|
|
|
|
//Url *u = getCurrentUrl();
|
|
Url *u = getFirstUrl();
|
|
uint8_t gs[METHOD_CAP];
|
|
// reset language method vector
|
|
memset( gs , 0, sizeof(uint8_t) * METHOD_CAP );
|
|
// Let the site tell us what language it's in
|
|
gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml );
|
|
// Guess from the FIRST URL (unredirected url)
|
|
gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() );
|
|
// Guess from the outlinks
|
|
gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links );
|
|
// Guess from the inlinks
|
|
gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip);
|
|
// root page's language, if there was one
|
|
if ( ! *isRoot ) gs [METHOD_ROOT] = *rl;
|
|
|
|
int32_t scores[MAX_LANGUAGES];
|
|
memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES );
|
|
// weights for the language id methods (one per METHOD_* slot)
char cw[] = { 8,9,4,7,6,7,8,1,2};
|
|
// add up weighted scores
|
|
for(int i = 0; i < METHOD_CAP; i++ )
|
|
scores[gs[i]] += cw[i];
|
|
|
|
// reset the "lang" to langUnknown which is 0
|
|
lang = langUnknown ;
|
|
int max, oldmax;
|
|
max = oldmax = 0;
|
|
// find best language
|
|
for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) {
|
|
if ( scores[i] < max) continue;
|
|
oldmax = max;
|
|
max = scores[i];
|
|
lang = i;
|
|
}
|
|
// give up if not too conclusive
|
|
if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) {
|
|
//log(LOG_DEBUG, "build: Language: Threshold, score "
|
|
// "(%" INT32 " - %" INT32 ") %" INT32 " vs. %" INT32 ".\n",
|
|
// (int32_t)max,
|
|
// (int32_t)oldmax,
|
|
// (int32_t)max - oldmax,
|
|
// (int32_t)3);//(int32_t)cr->m_languageThreshold);
|
|
lang = langUnknown;
|
|
}
|
|
// Make sure we're over the bailout value, this
|
|
// keeps low scoring methods like TLD from being
|
|
// the decider if it was the only successful method.
|
|
if ( max < 5 ) { // cr->m_languageBailout ) {
|
|
//log(LOG_DEBUG, "build: Language: Bailout, "
|
|
// "score %" INT32 " vs. %" INT32 ".",
|
|
// (int32_t)max, (int32_t)5);//cr->m_languageBailout);
|
|
lang = langUnknown;
|
|
}
|
|
// If the language is still not known,
|
|
// use the language detected from the frames.
|
|
//if(lang == langUnknown) lang = frameFoundLang;
|
|
// . try dmoz if still unknown
|
|
// . limit to 10 of them
|
|
// all done, do not repeat
|
|
m_langIdValid = true;
|
|
m_langId = lang;
|
|
m_langIdScore = max;
|
|
return &m_langId;
|
|
*/
|
|
}
|
|
|
|
|
|
|
|
Words *XmlDoc::getWords ( ) {
|
|
// return it if it is set
|
|
if ( m_wordsValid ) return &m_words;
|
|
// this will set it if necessary
|
|
Xml *xml = getXml();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Words *)xml;
|
|
// note it
|
|
setStatus ( "getting words");
|
|
// now set what we need
|
|
if ( ! m_words.set ( xml ,
|
|
true , // computeWordIds?
|
|
m_niceness ))
|
|
return NULL;
|
|
// we got it
|
|
m_wordsValid = true;
|
|
return &m_words;
|
|
}
|
|
|
|
Bits *XmlDoc::getBits ( ) {
|
|
// return it if it is set
|
|
if ( m_bitsValid ) return &m_bits;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Bits *)words;
|
|
// now set what we need
|
|
if ( ! m_bits.set ( words , m_version , m_niceness ) )
|
|
return NULL;
|
|
// we got it
|
|
m_bitsValid = true;
|
|
return &m_bits;
|
|
}
|
|
|
|
Bits *XmlDoc::getBitsForSummary ( ) {
|
|
// return it if it is set
|
|
if ( m_bits2Valid ) return &m_bits2;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Bits *)words;
|
|
// now set what we need
|
|
if ( ! m_bits2.setForSummary ( words ) ) return NULL;
|
|
// we got it
|
|
m_bits2Valid = true;
|
|
return &m_bits2;
|
|
}
|
|
|
|
Pos *XmlDoc::getPos ( ) {
|
|
// return it if it is set
|
|
if ( m_posValid ) return &m_pos;
|
|
// this will set it if necessary
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Pos *)ww;
|
|
//Sections *sections = getSections();
|
|
//if ( !sections ||sections==(Sections *)-1) return(Pos *)sections;
|
|
// now set what we need
|
|
//if ( ! m_pos.set ( ww , sections ) ) return NULL;
|
|
if ( ! m_pos.set ( ww , NULL ) ) return NULL;
|
|
// we got it
|
|
m_posValid = true;
|
|
return &m_pos;
|
|
}
|
|
|
|
Phrases *XmlDoc::getPhrases ( ) {
|
|
// return it if it is set
|
|
if ( m_phrasesValid ) return &m_phrases;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Phrases *)words;
|
|
// get this
|
|
Bits *bits = getBits();
|
|
// bail on error
|
|
if ( ! bits ) return NULL;
|
|
// now set what we need
|
|
if ( ! m_phrases.set ( words ,
|
|
bits ,
|
|
true , // use stop words
|
|
false , // use stems
|
|
m_version ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
// we got it
|
|
m_phrasesValid = true;
|
|
return &m_phrases;
|
|
}
|
|
|
|
/*
|
|
Synonyms *XmlDoc::getSynonyms ( ) {
|
|
// return if already set
|
|
if ( m_synonymsValid ) return &m_synonyms;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (Synonyms *)words;
|
|
Phrases *phrases = getPhrases ();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (Synonyms *)phrases;
|
|
uint8_t *lv = getLangVector();
|
|
if ( ! lv || lv == (void *)-1 ) return (Synonyms *)lv;
|
|
// primary language of the document
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (Synonyms *) langId;
|
|
// . now set what we need
|
|
// . provide a buf for which synonyms can be stored if we need to
|
|
SafeBuf *synBuf = NULL;
|
|
if ( m_pbuf || m_storeTermListInfo ) synBuf = &m_synBuf;
|
|
|
|
// force on for printing out the synonyms in the loop below
|
|
//synBuf = &m_synBuf;
|
|
|
|
if ( ! m_synonyms.set ( words,
|
|
(char *)lv,
|
|
(char)*langId,phrases,
|
|
m_niceness,synBuf) )
|
|
return NULL;
|
|
|
|
// we got it
|
|
m_synonymsValid = true;
|
|
return &m_synonyms;
|
|
}
|
|
*/
|
|
|
|
Sections *XmlDoc::getExplicitSections ( ) {
|
|
// these sections might or might not have the implied sections in them
|
|
if ( m_explicitSectionsValid ) return &m_sections;
|
|
|
|
// if json forget this it is only html
|
|
//uint8_t *ct = getContentType();
|
|
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
|
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
|
// m_sectionsValid = true;
|
|
// return &m_sections;
|
|
//}
|
|
|
|
setStatus ( "getting explicit sections" );
|
|
// use the old title rec to make sure we parse consistently!
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;
|
|
// shortcut
|
|
//XmlDoc *od = *pod;
|
|
// if the serialized section is valid, use that
|
|
//char *sd = NULL;
|
|
//bool valid = false;
|
|
//if ( od && od->m_sectionsReplyValid ) valid = true;
|
|
//if ( valid ) sd = od->ptr_sectionsReply;
|
|
// shouldn't we use the section data in ptr_sections for this???
|
|
//bool valid = m_sectionsReplyValid ;
|
|
//char *sd = NULL;
|
|
//if ( valid ) sd = ptr_sectionsReply;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
|
|
// need these too now
|
|
Phrases *phrases = getPhrases();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases;
|
|
// get this
|
|
Bits *bits = getBits();
|
|
// bail on error
|
|
if ( ! bits ) return NULL;
|
|
// the site hash
|
|
int64_t *sh64 = getSiteHash64();
|
|
// sanity check
|
|
if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; }
|
|
if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64;
|
|
// the docid
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
setStatus ( "getting sections");
|
|
|
|
//char *sv = NULL;
|
|
//if ( m_setFromTitleRec ) sv = ptr_sectionsVotes;
|
|
|
|
// debug time to find a slow url
|
|
int64_t start = gettimeofdayInMillisecondsLocal();
|
|
|
|
// this uses the sectionsReply to see which sections are "text", etc.
|
|
// rather than compute it expensively
|
|
if ( ! m_calledSections &&
|
|
// we get malformed sections error for some diffbot replies
|
|
//*ct != CT_JSON &&
|
|
! m_sections.set ( &m_words ,
|
|
&m_phrases ,
|
|
bits ,
|
|
getFirstUrl() ,
|
|
*d ,
|
|
*sh64 , // 64 bits
|
|
cr->m_coll ,
|
|
m_niceness ,
|
|
m_masterState , // state
|
|
m_masterLoop , // callback
|
|
*ct ,
|
|
&m_dates ,
|
|
NULL , // sd // sections data
|
|
true , // sections data valid?
|
|
NULL , // sv // for m_nsvt
|
|
//*tph ,
|
|
NULL , // buf
|
|
0 )) { // bufSize
|
|
m_calledSections = true;
|
|
// sanity check, this should not block, we are setting
|
|
// exclusively from the titleRec
|
|
//if ( sd ) { char *xx=NULL;*xx=0; }
|
|
// it blocked, return -1
|
|
return (Sections *) -1;
|
|
}
|
|
|
|
int64_t end = gettimeofdayInMillisecondsLocal();
|
|
|
|
if ( end - start > 1000 )
|
|
log("build: %s section set took %" INT64 " ms",
|
|
m_firstUrl.m_url,end -start);
|
|
|
|
|
|
// error? ETAGBREACH for example... or maybe ENOMEM
|
|
if ( g_errno ) return NULL;
|
|
// set inlink bits
|
|
m_bits.setInLinkBits ( &m_sections );
|
|
// we got it
|
|
m_explicitSectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
Sections *XmlDoc::getImpliedSections ( ) {
|
|
if ( m_impliedSectionsValid ) return &m_sections;
|
|
|
|
// get the sections without implied sections
|
|
Sections *sections = getExplicitSections();
|
|
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
|
|
|
|
// just use that for now if not doing events to save time! because
|
|
// adding implied sections really sucks the resources.
|
|
m_impliedSectionsValid = true;
|
|
return &m_sections;
|
|
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
|
|
// get this
|
|
Bits *bits = getBits();
|
|
// bail on error
|
|
if ( ! bits ) return NULL;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now we need basic date types to add implied sections that
|
|
// have a dow/dom header and tod brother sections
|
|
|
|
// THIS WAS in getExplicitSections() but now m_wids is NULL.
|
|
// m_wids is set in setPart1() called by XmlDoc::getSimpleDates(),
|
|
// which calls getExplicitSections().
|
|
// . This was called for the benefit of Sections::addImpliedSections()
|
|
// but now getAddresses() which we call below ends up calling
|
|
// getSimpleDates() which calls m_dates.setPart1() which calls
|
|
// m_dates.parseDates() so this is no longer needed i guess.
|
|
/*
|
|
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits,
|
|
sections, m_niceness , &m_firstUrl ,
|
|
*ct )) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: dates3: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
|
|
// on all other errors, return NULL
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
*/
|
|
|
|
// if we got no sections it was bad html. so don't go any further
|
|
// lest we core in other code..
|
|
// it might have also just been an empty doc.
|
|
// either way we'll core in getAddresses cuz it calls getSimpleDates
|
|
// which will core in Dates::setPart1() trying to use m_sectionPtrs
|
|
if ( sections->m_numSections == 0 ) {
|
|
m_impliedSectionsValid = true;
|
|
// hack to avoid core for empty docs like www.mini-polis.com
|
|
sections->m_addedImpliedSections = true;
|
|
return &m_sections;
|
|
}
|
|
// . now set addresses so we can use those to add implied sections
|
|
// . this calls getSimpleDates() which calls m_dates.setPart1()
|
|
// which calls parseDates again
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (void *)-1 ) return (Sections *)aa;
|
|
|
|
// . now add implied sections
|
|
// . return NULL with g_errno set on error
|
|
if ( ! m_sections.addImpliedSections ( aa ) ) return NULL;
|
|
|
|
// we got it
|
|
m_impliedSectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
// add in Section::m_sentFlags bits having to do with our voting tables
|
|
Sections *XmlDoc::getSections ( ) {
|
|
|
|
setStatus("getting sections");
|
|
|
|
// get the sections without implied sections
|
|
Sections *ss = getImpliedSections();
|
|
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
|
|
|
|
// hash the turk votes (each vote maps a contenthash or taghash to
|
|
// a value) and use these to set sections sentence flags, etc.
|
|
//HashTableX *tvt = getTurkVotingTable ();
|
|
//if ( ! tvt || tvt == (void *)-1 ) return (Sections *)tvt;
|
|
|
|
// returns NULL if our url is root!
|
|
//HashTableX *rvt = getRootVotingTable();
|
|
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
|
|
|
|
SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt;
|
|
|
|
uint32_t *tph = getTagPairHash32();
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph;
|
|
|
|
// need a getUseSectiondb() function...
|
|
|
|
if ( ! m_useSectiondb ) {
|
|
m_sectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
// start here
|
|
Section *si;
|
|
|
|
/*
|
|
// get first sentence in doc
|
|
si = ss->m_firstSent;
|
|
// do not bother scanning if no votes
|
|
if ( osvt->getNumVotes() <= 0 ) si = NULL;
|
|
// scan the sentence sections and or in the bits we should
|
|
for ( ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// combine section tagHash with contentHashAll to get
|
|
// the "modified tagHash"
|
|
int32_t modified = si->m_tagHash ^ si->m_contentHash;
|
|
// save this
|
|
float dups = osvt->getNumSampled (modified,SV_TAGCONTENTHASH);
|
|
// . getNumSampled() combines both m_nsvt and m_osvt so it
|
|
// includes ourselves... NO!... let's change this!
|
|
// the osvt should not include votes from us!
|
|
// it strips those out in SectionVotingTable::addListOfVotes()
|
|
// . if it is a print-friendly version of the same page then
|
|
// one of the two should have been deduped and not indexed,
|
|
// so be strict with adhering to no more than 1!
|
|
if ( dups > 0 ) si->m_flags |= SEC_DUP;
|
|
// . content hash must be unique!
|
|
// . can detect texty bios repeated throughout the site
|
|
// . this is the hash of the words directly in the section
|
|
// . HACK: the contentHash is the "tagHash" for this call
|
|
// . SectionVote::m_numSampled is how many sections over all
|
|
// docs we indexed from this site have this m_contentHash
|
|
// . note that it is not restricted to pages with the same
|
|
// tagPairHash as us (i.e. pages with similar layouts)
|
|
// therefore it is very flexible!!! it is only restricted
|
|
// to pages with our same site hash.
|
|
// . getNumSampled() combines both m_nsvt and m_osvt so it
|
|
// includes ourselves
|
|
// . if it is a print-friendly version of the same page then
|
|
// one of the two should have been deduped and not indexed,
|
|
// so be strict with adhering to no more than 1!
|
|
if ( dups > 0 ) continue;
|
|
// . must be in a unique section
|
|
// . if the section has siblings, skip it!
|
|
if ( si->m_numOccurences > 1 ) continue;
|
|
// . eliminate dynamic menus
|
|
// . like "related posts" menus
|
|
// . therefore require that we must be "texty" ...
|
|
// . i.e. be like 80% plain text and no more than 20% link text
|
|
// . vote on this since in some cases article may be mostly
|
|
// just all in anchor text on a few article pages, but on
|
|
// other pages it is well-behaved
|
|
if ( osvt->getScore ( si->m_tagHash, SV_TEXTY) < .80 )
|
|
continue;
|
|
// . check for comment sections
|
|
// . these are text and the content is unique
|
|
// . BUT the section tagHash is typically repeated at least
|
|
// once on some other pages (HOPEFULLY!!!!)
|
|
// . if we only require there be X other pages from this site
|
|
// with the same layout, we might get unlucky in that each
|
|
// page has 1 or less comments!!! how to fix???
|
|
// . anyway, we ask for the max # sampled from all of the votes
|
|
// here because if just one page has 2+ copies of this
|
|
// section enum tag hash, that is enough to be a comment
|
|
// section
|
|
// . SV_TEXTY_MAX_SAMPLED is a statistic compiled from the
|
|
// voters and does not actually exist in sectiondb per se.
|
|
// we add this statistic transparently in addVote() below
|
|
// . it just gets the num sampled from the voter that had the
|
|
// maximum m_numSampled value, because we don't want an
|
|
// average in this case
|
|
if ( osvt->getNumSampled(si->m_tagHash,SV_TEXTY_MAX_SAMPLED)>0)
|
|
continue;
|
|
// set it
|
|
si->m_flags |= SEC_ARTICLE;
|
|
// tally it up
|
|
//m_numAlnumWordsInArticle += si->m_exclusive;
|
|
// and another flag
|
|
//m_hadArticle = true;
|
|
}
|
|
*/
|
|
|
|
//
|
|
// . how many other pages from this site have our tagpairhash?
|
|
// . that is all the unique adjacent tag pair hashes xor'd together
|
|
// . kind of represents the template of the webpage, ideally
|
|
//
|
|
//int32_t numSimLayouts = osvt->getNumSampled ( *tph , SV_TAGPAIRHASH );
|
|
|
|
///////////////////////////////////////
|
|
//
|
|
// set m_dupVotes and m_notDupVotes for each section
|
|
//
|
|
// answers the question... out of all the pages with this taghash,
|
|
// from this site, how often is this content repeated?
|
|
//
|
|
// trumba.com often repeats an event on its various feeds, but
|
|
// not on EVERY page. so we should adjust the event title penalties
|
|
// based on the ratio of repeated to not-repeated from the various
|
|
// pages on the site that have the same *taghash*
|
|
//
|
|
///////////////////////////////////////
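// illustrative example (made-up counts): if 10 pages on the site have
// a sentence under this same tag hash (nt = 10) and 7 of those pages
// have the exact same sentence content (nsam = 7), then this sentence
// gets m_votesForDup = 7 and m_votesForNotDup = 3, i.e. it is mostly,
// but not universally, repeated boilerplate for this layout.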
|
|
|
|
// get first sentence in doc
|
|
si = ss->m_firstSent;
|
|
// do not bother scanning if no votes
|
|
if ( osvt->getNumVotes() <= 0 ) si = NULL;
|
|
// assume no dups
|
|
m_maxVotesForDup = 0;
|
|
// scan the sentence sections and or in the bits we should
|
|
for ( ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// sanity check
|
|
if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
|
|
// how many pages from this site have this taghash for
|
|
// a sentence
|
|
float nt;
|
|
nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH);
|
|
// skip if nobody! (except us)
|
|
if ( nt <= 0.0 ) continue;
|
|
// . get out tag content hash
|
|
// . for some reason m_contentHash is 0 for like menu-y sectns
|
|
int32_t modified = si->m_turkTagHash32 ^ si->m_sentenceContentHash64;
|
|
// . now how many pages also had same content in that tag?
|
|
// . TODO: make sure numsampled only counts a docid once!
|
|
// and this is not each time it occurs on that page.
|
|
float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH);
|
|
// cast it to a int32_t
|
|
int32_t votes1 = (int32_t)nsam;
|
|
// by default, complement
|
|
int32_t votes2 = (int32_t)nt - votes1;
|
|
// store votes
|
|
si->m_votesForDup = votes1;
|
|
si->m_votesForNotDup = votes2;
|
|
// what's the most dup votes we had...
|
|
if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1;
|
|
// set it
|
|
//if ( si->m_votesForDup > 2 * si->m_votesForNotDup &&
|
|
// si->m_votesForDup >= 1 &&
|
|
// ! (si->m_flags & SEC_HAS_NONFUZZYDATE) )
|
|
// si->m_sentFlags |= SENT_DUP_SECTION;
|
|
}
|
|
|
|
m_sectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) {
|
|
if ( m_nsvtValid ) return &m_nsvt;
|
|
// need sections
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
|
|
// and dates
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (SectionVotingTable *)dp;
|
|
// hash of all adjacent tag pairs
|
|
uint32_t *tph = getTagPairHash32 ( ) ;
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
|
|
// are we a site root url?
|
|
//char *isRoot = getIsSiteRoot();
|
|
//if ( ! isRoot || isRoot == (char *)-1 )
|
|
// return (SectionVotingTable *)isRoot;
|
|
|
|
// init table
|
|
if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL;
|
|
// . tally the section votes from the sections class
|
|
// . only add the date votes, not the taghash/contenthash keys
|
|
// from the root, since we add those from the root voting table
|
|
// into m_osvt directly!
|
|
// . we no longer have root voting table!
|
|
// . this adds keys of the hash of each tag xpath
|
|
// . and it adds keys of the hash of each tag path PLUS its innerhtml
|
|
if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL;
|
|
// tally the section votes from the dates
|
|
if ( ! dp->addVotes ( &m_nsvt ) ) return NULL;
|
|
// our new section voting table is now valid, and ready to be added
|
|
// to sectiondb by calling SectionVotingTable::hash()
|
|
m_nsvtValid = true;
|
|
return &m_nsvt;
|
|
}
|
|
|
|
|
|
// . scan every section and look up its tag and content hashes in
|
|
// sectiondb to find out how many pages and sites have the same hash
|
|
// . use the secondary sectiondb key, key2
|
|
// . then store the stats in the Sections::m_stats class
|
|
Sections *XmlDoc::getSectionsWithDupStats ( ) {
|
|
|
|
Sections *ss = getSections();
|
|
if ( !ss ||ss==(Sections *)-1) return(Sections *)ss;
|
|
|
|
if ( m_gotDupStats ) return ss;
|
|
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32;
|
|
uint32_t siteHash32 = (uint32_t)*sh32;
|
|
|
|
//int64_t *shp64 = getSiteHash64();
|
|
//if ( ! shp64 || shp64 == (void *)-1 ) return (Sections *)shp64;
|
|
//int64_t siteHash48 = *shp64 & 0x0000ffffffffffffLL;
|
|
|
|
// first time called? then init m_nextSection.
|
|
//Section *si = m_si;
|
|
|
|
// if this is -1, we are called for the first time
|
|
if ( m_si == (void *)-1 ) {
|
|
m_si = ss->m_rootSection;
|
|
m_mcastRequestsIn = 0;
|
|
m_mcastRequestsOut = 0;
|
|
m_secStatsErrno = 0;
|
|
}
|
|
|
|
|
|
//sec_t menuFlags = SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ;
|
|
|
|
for ( ; m_si ; m_si = m_si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index.
|
|
if ( ! ( m_si->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
|
|
// skip if sentence, only hash tags now i guess for diffbot
|
|
//if ( m_si->m_sentenceContentHash64 )
|
|
// continue;
|
|
|
|
// get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64;
|
|
if ( ! val32 )
|
|
continue;
|
|
|
|
// skip if menu!
|
|
//if ( m_si->m_flags & menuFlags ) continue;
|
|
|
|
// get section xpath hash combined with sitehash
|
|
uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32;
|
|
|
|
// convert this to 32 bits
|
|
uint32_t innerHash32 ;
|
|
//sentHash32 = (uint32_t)m_si->m_sentenceContentHash64;
|
|
innerHash32 = (uint32_t)m_si->m_indirectSentHash64;
|
|
|
|
// save in case we need to read more than 5MB
|
|
//m_lastSection = si;
|
|
// . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32
|
|
// . we hack the "sentContentHash32" into each posdb key
|
|
// as the "value" so we can do a facet-like histogram
|
|
// over all the possible values this xpath has for this site
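// roughly: secHash32 identifies "this xpath on this site" and
// innerHash32 identifies "this exact inner html under that xpath",
// so the returned histogram tells us how many other pages on the
// site carry the same inner html for the same xpath.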
|
|
SectionStats *stats = getSectionStats ( secHash32,
|
|
innerHash32,
|
|
false ); // cache only?
|
|
// it returns -1 if would block
|
|
if ( stats == (void *)-1 ) {
|
|
// count it as outstanding
|
|
//m_mcastRequestsOut++;
|
|
// launch more if we have room
|
|
// UdpServer.cpp has a limit of 10 on 0x39 requests
|
|
if ( m_mcastRequestsOut - m_mcastRequestsIn < 10)
|
|
continue;
|
|
// advance m_si so we do not repeat
|
|
m_si = m_si->m_next;
|
|
// otherwise, return -1 to indicate blocked
|
|
return (Sections *)-1;
|
|
}
|
|
// NULL means g_errno
|
|
if ( ! stats ) {
|
|
// ensure g_errno is set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// save it
|
|
m_secStatsErrno = g_errno;
|
|
// clear it
|
|
g_errno = 0;
|
|
// if still waiting though return -1
|
|
if ( m_mcastRequestsOut > m_mcastRequestsIn )
|
|
return (Sections *)-1;
|
|
// otherwise, all done i guess
|
|
return NULL;
|
|
}
|
|
// if already in the table, skip it!
|
|
}
|
|
|
|
// waiting for more replies to come back?
|
|
if ( m_mcastRequestsOut > m_mcastRequestsIn )
|
|
return (Sections *) -1;
|
|
|
|
// now scan the sections and copy the stats from the table
|
|
// into Section::m_stats of each sentence section.
|
|
// use the key hash as the hash of the tag/xpath and the innerhtml
|
|
// and the val instead of being site hash will be hash of the
|
|
// content. then we can get the histogram of our content hash
|
|
// for this xpath on our site.
|
|
Section *si = ss->m_rootSection;
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if no content to hash
|
|
//if ( ! si->m_sentenceContentHash64 ) continue;
|
|
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index
|
|
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
|
|
// skip if sentence, only hash tags now i guess for diffbot
|
|
//if ( si->m_sentenceContentHash64 )
|
|
// continue;
|
|
|
|
// get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
|
|
if ( ! val32 )
|
|
continue;
|
|
|
|
// skip if menu!
|
|
//if ( si->m_flags & menuFlags ) continue;
|
|
|
|
|
|
// get section xpath hash combined with sitehash
|
|
uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32;
|
|
|
|
// convert this to 32 bits
|
|
uint32_t innerHash32 ;
|
|
innerHash32 = (uint32_t)si->m_indirectSentHash64;
|
|
|
|
// the "stats" class should be in the table from
|
|
// the lookups above!!
|
|
SectionStats *stats = getSectionStats ( secHash32,
|
|
innerHash32,
|
|
true ); // cache only?
|
|
// sanity
|
|
//if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;}
|
|
// must have had a network error or something
|
|
if ( ! stats ) continue;
|
|
// copy
|
|
gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) );
|
|
}
|
|
|
|
//
|
|
// now if a section has no stats but has the same
|
|
// m_indirectSentHash64 as a kid, take his stats
|
|
//
|
|
Section *sx = ss->m_rootSection;
|
|
for ( ; sx ; sx = sx->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index
|
|
if ( ! ( sx->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
// scan up parents and set their stats to ours as long as
// they have the same indirect sent hash64
|
|
Section *p = sx->m_parent;
|
|
for ( ; p ; p = p->m_parent ) {
|
|
|
|
// if parent is like an img tag, skip it
|
|
if ( p->m_tagId == TAG_IMG )
|
|
continue;
|
|
|
|
if ( p ->m_indirectSentHash64 !=
|
|
sx->m_indirectSentHash64 )
|
|
break;
|
|
|
|
// copy it to parent with the same inner html hash
|
|
gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats));
|
|
}
|
|
}
|
|
|
|
// now free the table's mem
|
|
m_sectionStatsTable.reset();
|
|
|
|
m_gotDupStats = true;
|
|
return ss;
|
|
}
|
|
|
|
static void gotReplyWrapper39 ( void *state1 , void *state2 ) {
|
|
//XmlDoc *THIS = (XmlDoc *)state;
|
|
XmlDoc *THIS = (XmlDoc *)state1;
|
|
Multicast *mcast = (Multicast *)state2;
|
|
THIS->gotSectionFacets ( mcast );
|
|
// this will end up calling getSectionsWithDupStats() again
|
|
// which will call getSectionStats() some more on new sections
|
|
// until m_gotDupStats is set to true.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
|
|
// . launch a single msg3a::getDocIds() for a section hash, secHash32
|
|
SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 ,
|
|
uint32_t innerHash32 ,
|
|
bool cacheOnly ) {
|
|
|
|
// init cache?
|
|
if ( m_sectionStatsTable.m_numSlots == 0 &&
|
|
! m_sectionStatsTable.set(4,
|
|
sizeof(SectionStats),
|
|
32,
|
|
NULL,
|
|
0,
|
|
false,
|
|
m_niceness,
|
|
"secstatsch"))
|
|
return NULL;
|
|
|
|
// check in cache...
|
|
SectionStats *stats ;
|
|
stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 );
|
|
// if there, return it
|
|
if ( stats ) return stats;
|
|
|
|
// if cache only do not launch
|
|
if ( cacheOnly ) return NULL;
|
|
|
|
//
|
|
// TODO: shard gbxpathsitehashxxxxx by termid
|
|
// and make sure msg3a only sends to that single shard and sends
|
|
// the stats back. should make us much faster to sectionize
|
|
// a web page. but for now try without it...
|
|
//
|
|
|
|
//int32_t *sh32 = getSiteHash32();
|
|
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32;
|
|
|
|
int32_t maxOut = 32;
|
|
|
|
// . need to make new msg39Request and a new Multicast arrays
|
|
// . only need multicast since these gbfacetstr:gbxpathsitehash123456
|
|
// terms are sharded by termid, otherwise we'd have to use msg3a
|
|
if ( ! m_mcastArray ) {
|
|
// how much mem to alloc?
|
|
int32_t need = 0;
|
|
need += sizeof(Multicast);
|
|
need += sizeof(Msg39Request);
|
|
// query buf str
|
|
need += 100;
|
|
need *= maxOut;
|
|
// a single query now to be shared
|
|
//need += sizeof(Query);
|
|
// just in case we are being re-used
|
|
m_mcastBuf.reset();
|
|
// alloc space
|
|
if ( ! m_mcastBuf.reserve(need) ) return NULL;
|
|
// point to buf
|
|
char *p = m_mcastBuf.getBufStart();
|
|
// set them up
|
|
m_mcastArray = (Multicast *)p;
|
|
p += sizeof(Multicast) * maxOut;
|
|
m_msg39RequestArray = (Msg39Request *)p;
|
|
p += sizeof(Msg39Request) * maxOut;
|
|
//m_queryArray = (Query *)p;
|
|
//p += sizeof(Query) * maxOut;
|
|
//m_sharedQuery = (Query *)p;
|
|
//p += sizeof(Query);
|
|
// for holding the query string
|
|
// assume query will not exceed 100 bytes including \0
|
|
m_queryBuf = p;
|
|
p += 100 * maxOut;
|
|
// initialize all!
|
|
for ( int32_t i = 0 ; i < maxOut ; i++ ) {
|
|
m_mcastArray [i].constructor();
|
|
m_msg39RequestArray[i].reset();//constructor();
|
|
//m_queryArray [i].constructor();
|
|
m_queryBuf[100*i] = '\0';
|
|
//m_inUse[i] = 0;
|
|
}
|
|
}
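// m_mcastBuf layout, per the pointer math above:
//   [ maxOut x Multicast ][ maxOut x Msg39Request ][ maxOut x 100-byte
//   query string slots ]
// slot i's query string lives at m_queryBuf + 100*i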
|
|
|
|
// get first available
|
|
int32_t i;
|
|
for ( i = 0 ; i < maxOut ; i++ )
|
|
if ( ! m_mcastArray[i].m_inUse ) break;
|
|
|
|
// wtf?
|
|
if ( i >= maxOut ) { char *xx=NULL;*xx=0; }
|
|
|
|
// and our vehicle
|
|
Multicast *mcast = &m_mcastArray[i];
|
|
|
|
// mark as in use up here in case we quickpoll into this same code?!
|
|
// yeah, i guess set2() calls quickpoll?
|
|
//mcast->m_inUse = 1;
|
|
|
|
// save this for reply
|
|
//mcast->m_hack = this;
|
|
|
|
char *qbuf = m_queryBuf + 100 * i;
|
|
|
|
// . hash this special term (was gbsectionhash)
|
|
// . the wordbits etc will be a number though, the hash of the content
|
|
// of the xpath, the inner html hash
|
|
// . preceding this term with gbfacet: will make gigablast return
|
|
// the statistics for all the values in the posdb keys of this
|
|
// termlist, which happen to be innerHTML hashes for all pages
|
|
// with this same xpath and on this same site.
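// e.g. if secHash32 were 123456 (example value only) the buffer below
// would hold "gbfacetstr:gbxpathsitehash123456"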
|
|
sprintf(qbuf,"gbfacetstr:gbxpathsitehash%" UINT32 "",
|
|
(uint32_t)secHash32);
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// set the msg39 request
|
|
Msg39Request *r = &m_msg39RequestArray[i];
|
|
|
|
// reset all to defaults
|
|
r->reset();
|
|
|
|
//r-> ptr_coll = cr->m_coll;
|
|
//r->size_coll = gbstrlen(cr->m_coll)+1;
|
|
r->m_collnum = cr->m_collnum;
|
|
r->m_maxAge = 60; // cache timeout?
|
|
r->m_addToCache = true;
|
|
r->m_docsToGet = 0; // just calc stats
|
|
r->m_niceness = m_niceness;
|
|
r->m_debug = 0;
|
|
r->m_doSiteClustering = false;
|
|
//r->m_doIpClustering = false;
|
|
r->m_doDupContentRemoval = false;
|
|
r->m_boolFlag = 2;
|
|
r->m_familyFilter = 0;
|
|
r->m_language = 0;
|
|
r->ptr_query = qbuf;//m_sectionHashQueryBuf;
|
|
r->size_query = gbstrlen(r->ptr_query)+1;
|
|
r->m_timeout = 3600; //-1;// auto-determine based on #terms
|
|
r->m_maxQueryTerms = 10;
|
|
|
|
// how much of each termlist to read in bytes
|
|
int32_t readList = 10000;
|
|
r-> ptr_readSizes = (char *)&readList;
|
|
r->size_readSizes = 4;
|
|
|
|
// term freqs
|
|
float tfw = 1.0;
|
|
r-> ptr_termFreqWeights = (char *)&tfw;
|
|
r->size_termFreqWeights = 4;
|
|
|
|
// speed it up some with this flag
|
|
r->m_forSectionStats = true;
|
|
|
|
// only do a single read of docids... do not split up
|
|
r->m_numDocIdSplits = 1;
|
|
|
|
// 1 query term
|
|
r->m_nqt = 1;
|
|
|
|
///////////////////////
|
|
//
|
|
// this tells msg3a/msg39/posdbtable it's a hack! no need to do this
|
|
// because it's implied by the query.
|
|
// BUT REALLY let's eliminate this and just make our queries like
|
|
// gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of
|
|
// the section's xpath with the site. the values of that term in
|
|
// the posdb key will be 32-bit hashes of the innerHtml for such
|
|
// sections from all pages with the same xpath on the same site.
|
|
// so no need for this now, comment out.
|
|
//
|
|
//r->m_getFacetStats = true;
|
|
//
|
|
/////////////////////////
|
|
|
|
|
|
// we need to know what site is the base site so the section stats
|
|
// can set m_onSiteDocIds and m_offSiteDocIds correctly
|
|
//r->m_siteHash32 = *sh32;
|
|
|
|
// . now we use the hash of the innerHtml of the xpath
|
|
// . this is our value for the facet field of gbxpathsitehash12345678
|
|
// which is the hash of the innerHTML for that xpath on this site.
|
|
// 12345678 is the hash of the xpath and the site.
|
|
//r->m_myFacetVal32 = sentHash32;
|
|
|
|
|
|
//Query *qq = &m_queryArray[i];
|
|
// set query for msg3a. queryExpansion=false
|
|
//qq->set2 ( r->ptr_query , langUnknown , false );
|
|
|
|
Query qq;
|
|
qq.set2 ( r->ptr_query , langUnknown , false );
|
|
|
|
// TODO: ensure this just hits the one host since it is sharded
|
|
// by termid...
|
|
|
|
// what shard owns this termlist. we shard these
|
|
// gbfacetstr:gbxpathsitehash123456 terms by termid.
|
|
int64_t termId = qq.getTermId(0);
|
|
int32_t shardNum = getShardNumFromTermId ( termId );
|
|
|
|
// hack in our inner html content hash for this xpath
|
|
mcast->m_hack32 = innerHash32;
|
|
mcast->m_hack64 = secHash32;
|
|
|
|
// malloc and store the request. mcast will free it when done.
|
|
int32_t reqSize;
|
|
char *req = serializeMsg ( sizeof(Msg39Request),
|
|
&r->size_readSizes,
|
|
&r->size_whiteList,
|
|
&r->ptr_readSizes,
|
|
r,
|
|
&reqSize,
|
|
NULL,
|
|
0,
|
|
false);
|
|
|
|
// . send out a msg39 request to each shard
|
|
// . multicasts to a host in group "groupId"
|
|
// . we always block waiting for the reply with a multicast
|
|
// . returns false and sets g_errno on error
|
|
// . sends the request to fastest host in group "groupId"
|
|
// . if that host takes more than about 5 secs then sends to
|
|
// next host
|
|
// . key should be largest termId in group we're sending to
|
|
bool status;
|
|
status = mcast->send ( req , // m_rbufPtr ,
|
|
reqSize , // request size
|
|
0x39 , // msgType 0x39
|
|
true , // mcast owns m_request?
|
|
shardNum , // group to send to
|
|
false , // send to whole group?
|
|
0,//(int32_t)qh , // 0 // startKey.n1
|
|
this , // state1 data
|
|
mcast , // state2 data
|
|
gotReplyWrapper39 ,
|
|
30 , //timeout in secs
|
|
m_niceness,//m_r->m_niceness ,
|
|
false , // realtime?
|
|
-1, // firstHostId, // -1// bestHandlingHostId ,
|
|
NULL , // m_replyBuf ,
|
|
0 , // MSG39REPLYSIZE,
|
|
// this is true if multicast should free the
|
|
// reply, otherwise caller is responsible
|
|
// for freeing it after calling
|
|
// getBestReply().
|
|
// actually, this should always be false,
|
|
// there is a bug in Multicast.cpp.
|
|
// no, if we error out and never steal
|
|
// the buffers then they will go unfreed
|
|
// so they are freed by multicast by default
|
|
// then we steal control explicitly
|
|
true );
|
|
|
|
m_mcastRequestsOut++;
|
|
|
|
// if successfully launch, wait...
|
|
if ( status ) return (SectionStats *) -1;
|
|
|
|
// error?
|
|
if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; }
|
|
|
|
// sets &m_sectionStats and adds to the table
|
|
gotSectionFacets ( mcast );
|
|
|
|
// i guess did not block...
|
|
//return &msg3a->m_sectionStats;
|
|
return &m_sectionStats;
|
|
}
|
|
|
|
// . come here when msg39 got the ptr_faceHashList for our single
|
|
// gbfacet:gbxpathsitehash
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
|
|
//SectionStats *stats = &msg39->m_sectionStats;
|
|
|
|
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;}
|
|
|
|
// count it as returned
|
|
m_mcastRequestsIn++;
|
|
// mark it as available now
|
|
int32_t num = mcast - m_mcastArray;
|
|
// sanity
|
|
//if ( ! msg39->m_inUse ) { char *xx=NULL;*xx=0; }
|
|
|
|
// grab the xpath/site hash
|
|
uint32_t secHash32 = mcast->m_hack64;
|
|
|
|
// and our inner html for that xpath
|
|
int32_t myFacetVal32 = mcast->m_hack32;
|
|
|
|
// sanity. should only be a gbfacet:gbxpathsitehash12345567 term.
|
|
//if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// reset all counts to 0
|
|
m_sectionStats.reset();
|
|
|
|
//////
|
|
//
|
|
// compile m_sectionStats
|
|
//
|
|
///////
|
|
|
|
// set m_sectionStats from the list of facet values for this
|
|
// gbfacet:xpathsitehash term...
|
|
// Query::m_queryTerm.m_facetHashTable has the facets merged
|
|
// from all the shards. so now compute the stats from them.
|
|
// set the section stats.
|
|
//QueryTerm *qt = &msg3a->m_q->m_qterms[0];
|
|
//HashTableX *ft = &qt->m_facetHashTable;
|
|
|
|
// . get the list of facet field/value pairs.
|
|
// . see how Msg3a.cpp merges these to see how they are stored
|
|
Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply();
|
|
|
|
// this is NULL with g_errno set on error
|
|
if ( ! mr ) {
|
|
log("xmldoc: got error from sec stats mcast: %s",
|
|
mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
|
|
deserializeMsg ( sizeof(Msg39Reply) ,
|
|
&mr->size_docIds,
|
|
&mr->size_clusterRecs,
|
|
&mr->ptr_docIds,
|
|
((char*)mr) + sizeof(*mr) );
|
|
|
|
char *p = (char *)(mr->ptr_facetHashList);
|
|
//char *pfinal = p + mr->size_facetHashList;
|
|
|
|
//
|
|
// should only be one termid of facets in here, so no need to re-loop
|
|
//
|
|
int32_t nh = 0;
|
|
// "matches" is how many docids with this facet field had our facet val
|
|
int32_t matches = 0;
|
|
// "totalDocIds" is how many docids had this facet field
|
|
int32_t totalFields = 0;
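// layout of ptr_facetHashList as parsed below:
//   [int64_t termId][int32_t nh][nh x (int32_t facetVal32,int32_t docIdCount)]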
|
|
|
|
if ( p ) {
|
|
// first is the termid
|
|
//int64_t termId = *(int64_t *)p;
|
|
// skip that
|
|
p += 8;
|
|
// the # of unique 32-bit facet values
|
|
nh = *(int32_t *)p;
|
|
p += 4;
|
|
// the end point
|
|
char *pend = p + (8 * nh);
|
|
// now compile the facet hash list into there
|
|
for ( ; p < pend ; ) {
|
|
// does this facet value match ours?
|
|
// (i.e. same inner html?)
|
|
if ( *(int32_t *)p == myFacetVal32 )
|
|
matches += *(int32_t *)(p+4);
|
|
p += 4;
|
|
// now how many docids had this facet value?
|
|
totalFields += *(int32_t *)p;
|
|
p += 4;
|
|
}
|
|
}
|
|
|
|
// how many unique inner html content hashes for this xpath/site
|
|
// hash were there?
|
|
m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed;
|
|
|
|
// how many xpaths existed over all docs. a doc can have multiple.
|
|
m_sectionStats.m_totalEntries = totalFields;
|
|
|
|
// total # unique docids that had this facet
|
|
m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits;
|
|
|
|
// how many had the same inner html content hash for
|
|
// this xpath/site as we did?
|
|
m_sectionStats.m_totalMatches = matches;
|
|
|
|
////////
|
|
//
|
|
// store m_sectionStats in cache
|
|
//
|
|
////////
|
|
|
|
// cache them. this does a copy of m_sectionStats
|
|
if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) )
|
|
log("xmldoc: failed to add sections stats: %s",
|
|
mstrerror(g_errno));
|
|
|
|
// reset that msg39 to free its data
|
|
//msg39->reset();
|
|
|
|
if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . make it available again
|
|
// . do this after all in case we were in quickpoll interrupting
|
|
// the getSectionStats() function below
|
|
//mcast->m_inUse = 0;
|
|
|
|
// free query Query::m_qwords array etc. to stop mem leaks
|
|
m_mcastArray [num].reset();
|
|
m_msg39RequestArray[num].reset();
|
|
//m_queryArray [num].reset();
|
|
// now when the master loop calls getSectionsWithDupStats() it
|
|
// should find the stats class in the cache!
|
|
return true;
|
|
}
|
|
|
|
|
|
// . for all urls from this subdomain...
|
|
// . EXCEPT root url since we use msg17 to cache that, etc.
|
|
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
|
|
|
|
if ( m_osvtValid ) return &m_osvt;
|
|
|
|
// do not consult sectiondb if we are set from the title rec,
|
|
// that way we avoid parsining inconsistencies since sectiondb changes!
|
|
if ( m_setFromTitleRec ) {
|
|
char *p = ptr_sectiondbData;
|
|
m_osvtValid = true;
|
|
m_osvt.m_totalSiteVoters = 0;
|
|
if ( size_sectiondbData <= 4 ) return &m_osvt;
|
|
m_osvt.m_totalSiteVoters = *(int32_t *)p;
|
|
p += 4;
|
|
int32_t remaining = size_sectiondbData - 4;
|
|
m_osvt.m_svt.deserialize(p,remaining,m_niceness);
|
|
return &m_osvt;
|
|
}
|
|
|
|
// returns empty table if WE are the site root url!
|
|
//HashTableX *rvt = getRootVotingTable();
|
|
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
|
|
|
|
// need sections
|
|
//Sections *ss = getSections();
|
|
//if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
|
|
|
|
// hash of all adjacent tag pairs
|
|
uint32_t *tph = getTagPairHash32 ( ) ;
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
|
|
|
|
int64_t *siteHash64 = getSiteHash64();
|
|
if ( ! siteHash64 || siteHash64 == (void *)-1 )
|
|
return (SectionVotingTable *)siteHash64;
|
|
|
|
// the docid
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . for us, dates are really containers of the flags and tag hash
|
|
// . init this up here, it is re-set if we re-call getSectiondbList()
|
|
// because there were too many records in it to handle in one read
|
|
if ( m_numSectiondbReads == 0 ) {
|
|
// init table
|
|
if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL;
|
|
// use site hash as the main thing
|
|
int64_t termId = *siteHash64 & TERMID_MASK;
|
|
// . start key for reading list from sectiondb
|
|
// . read all the section votes for this site
|
|
m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff);
|
|
// how many reads we have to do...
|
|
m_numSectiondbNeeds = 1;
|
|
}
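// the readLoop below pulls up to minRecSizes bytes of votes per pass;
// if the list comes back truncated we bump m_numSectiondbNeeds,
// advance m_sectiondbStartKey just past lastKey and read again
// (recall is skipped when limitSectiondb caps the read size)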
|
|
|
|
//bool skipRecall = false;
|
|
// always read 5MB at a time from sectiondb
|
|
int32_t minRecSizes = 5000000;
|
|
|
|
// crap! host #28 is being totally slammed!!!!!
|
|
// why?????? in the meantime do this
|
|
//minRecSizes = 100000;
|
|
//skipRecall = true;
|
|
|
|
// is it facebook?
|
|
bool limitSectiondb = false;
|
|
// limit now to speed up repair rebuild
|
|
// limit now to speed up injection!
|
|
limitSectiondb = true;
|
|
// facebook lists often clog the tree, and when we read 2MB worth of
|
|
// it, it takes 100ms, so reduce to 50k so it takes 2.5ms...
|
|
// because facebook is a well structured xml feed so why read any
|
|
// really!
|
|
if ( limitSectiondb ) minRecSizes = 50000;
|
|
|
|
key128_t *lastKey = NULL;
|
|
|
|
// if msg0 blocked and came back with g_errno set, like
|
|
// in preparing to merge it got an OOM
|
|
if ( g_errno ) {
|
|
log("build: sectiondb read2: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
|
|
readLoop:
|
|
// before looking up TitleRecs using Msg20, let's first consult
|
|
// datedb to see if we got adequate data as to what sections
|
|
// are the article sections
|
|
|
|
// only get the list once
|
|
if ( m_numSectiondbReads < m_numSectiondbNeeds ) {
|
|
// only do this once
|
|
m_numSectiondbReads++;
|
|
// make the termid
|
|
uint64_t termId = *siteHash64 & TERMID_MASK;
|
|
// end key is always the same
|
|
key128_t end = g_datedb.makeEndKey ( termId , 0 );
|
|
// shortcut
|
|
Msg0 *m = &m_msg0;
|
|
// get the group this list is in (split = false)
|
|
uint32_t shardNum;
|
|
shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey);
|
|
// we need a group # from the groupId
|
|
//int32_t split = g_hostdb.getGroupNum ( gid );
|
|
// note it
|
|
//logf(LOG_DEBUG,"sections: "
|
|
// "reading list from sectiondb: "
|
|
// "sk.n1=0x%" XINT64 " sk.n0=0x%" XINT64 " "
|
|
// "ek.n1=0x%" XINT64 " ek.n0=0x%" XINT64 " "
|
|
// ,m_sectiondbStartKey.n1
|
|
// ,m_sectiondbStartKey.n0
|
|
// ,end.n1
|
|
// ,end.n0
|
|
// );
|
|
// . get the list
|
|
// . gets all votes for one particular site
|
|
if ( ! m->getList ( -1 , // hostId
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxCacheAge
|
|
false , // addToCache
|
|
RDB_SECTIONDB , // was RDB_DATEDB
|
|
cr->m_collnum ,
|
|
&m_secdbList ,
|
|
(char *)&m_sectiondbStartKey ,
|
|
(char *)&end ,
|
|
minRecSizes ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // MAX_NICENESS
|
|
// default parms follow
|
|
true , // doErrorCorrection?
|
|
true , // includeTree?
|
|
true , // doMerge?
|
|
-1 , // firstHostId
|
|
0 , // startFileNum
|
|
-1 , // numFiles
|
|
999995 , // timeout
|
|
-1 , // syncPoint
|
|
-1 , // preferLocalReads
|
|
NULL , // msg5
|
|
NULL , // msg5b
|
|
false , // isrealmerge?
|
|
true , // allowpagecache?
|
|
false , // forceLocalIndexdb?
|
|
false , // doIndexdbSplit?
|
|
shardNum ) )//split ))
|
|
// return -1 if blocks
|
|
return (SectionVotingTable *)-1;
|
|
// error?
|
|
if ( g_errno ) {
|
|
log("build: sectiondb read: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// it also returns the lastKey in the list so we can use that to
|
|
// set the startKey for a re-call if we read >= 5MB
|
|
lastKey = NULL;
|
|
|
|
//logf(LOG_DEBUG,"sections: read list of %" INT32 " bytes",
|
|
// m_secdbList.m_listSize);
|
|
|
|
bool recall = true;
|
|
|
|
if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false;
|
|
|
|
// . unless it had special byte set in Msg0.cpp HACK
|
|
// . we send back a compressed list and tack on an extra 0 byte at
|
|
// the end so that we know we had a full list!
|
|
if ( (m_secdbList.m_listSize % 2) == 1 ) {
|
|
m_secdbList.m_listSize--;
|
|
m_secdbList.m_listEnd --;
|
|
recall = true;
|
|
}
|
|
|
|
// no longer bother re-calling, because facebook is way slow...
|
|
if ( limitSectiondb ) recall = false;
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . compile the votes from sectiondb for this site into a hashtable
|
|
// . m_osvt is a SectionVotingTable and each entry in the hashtable
|
|
// is a SectionVote class.
|
|
// . the taghash is the key of the vote and is a hash of all the
|
|
// nested tags the section is in.
|
|
// . another vote uses the tag hash hashed with the hash of the
|
|
// content contained by the section
|
|
// . using these two vote counts we set Section::m_votesForDup
|
|
// or Section::m_votesForNotDup counts which let us know how the
|
|
// section is repeated or not repeated on the site
|
|
// . SectionVote::m_score is always 1.0 from what i can tell
|
|
// cuz it seems like addVote*() always uses a score of 1.0
|
|
// . SectionVote::m_numSampled is how many times that tagHash
|
|
// occurs in the document.
|
|
if ( ! m_osvt.addListOfVotes(&m_secdbList,
|
|
&lastKey,
|
|
*tph,
|
|
*d , // docid
|
|
m_niceness))
|
|
return NULL;
|
|
|
|
// why is this always zero it seems?
|
|
if ( g_conf.m_logDebugBuild )
|
|
log("xmldoc: added sectiondblist size=%" INT32 " recall=%" INT32 "",
|
|
m_secdbList.m_listSize,(int32_t)recall);
|
|
|
|
// . recall? yes if we had to truncate our list...
|
|
// . we need to be able to scan all votes for the website... that is
|
|
// why we recall here
|
|
// . limit votes by a special sectiondb key then that is a vote...
|
|
if ( recall ) {
|
|
// another debug
|
|
//logf(LOG_DEBUG,"sections: recallling read");
|
|
// just note it for now
|
|
//if ( m_sectiondbRecall > 5 )
|
|
if ( m_numSectiondbNeeds > 5 )
|
|
logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%" INT32 "",
|
|
m_sectiondbRecall++);
|
|
// we should really limit voting per site! we do now!
|
|
//if ( m_recall > 5 ) { char *xx=NULL;*xx=0; }
|
|
// update our start key
|
|
if ( lastKey ) m_sectiondbStartKey = *lastKey;
|
|
// inc by 2 since we already had this key
|
|
m_sectiondbStartKey += 2;
|
|
// unflag
|
|
m_numSectiondbNeeds++;
|
|
// and repeat
|
|
goto readLoop;
|
|
}
|
|
|
|
//
|
|
// set ptr_sectiondbData so this can be set from a title rec without
|
|
// having to lookup in sectiondb again which might have changed!
|
|
//
|
|
m_sectiondbData.purge();
|
|
// alloc
|
|
int32_t need = m_osvt.m_svt.getStoredSize() + 4;
|
|
if ( ! m_sectiondbData.reserve(need) )
|
|
// oom error?
|
|
return NULL;
|
|
// serialize this number
|
|
m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters);
|
|
// serialize the hashtablex
|
|
m_osvt.m_svt.serialize ( &m_sectiondbData );
|
|
// reference it for title rec serialization
|
|
ptr_sectiondbData = m_sectiondbData.getBufStart();
|
|
size_sectiondbData = m_sectiondbData.length();
|
|
|
|
m_osvtValid = true;
|
|
return &m_osvt;
|
|
}
|
|
|
|
int32_t *XmlDoc::getLinkSiteHashes ( ) {
|
|
if ( m_linkSiteHashesValid )
|
|
return (int32_t *)m_linkSiteHashBuf.getBufStart();
|
|
// get the outlinks
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
|
|
|
// . get the outlink tag rec vector
|
|
// . each link's tagrec may have a "site" tag that is basically
|
|
// the cached SiteGetter::getSite() computation
|
|
TagRec ***grv = NULL;
|
|
if ( ! m_setFromTitleRec ) {
|
|
grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (int32_t *)grv;
|
|
}
|
|
|
|
// how many outlinks do we have on this page?
|
|
int32_t n = links->getNumLinks();
|
|
|
|
// reserve space
|
|
m_linkSiteHashBuf.purge();
|
|
if ( ! m_linkSiteHashBuf.reserve ( n * 4 ) ) return NULL;
|
|
|
|
if ( n == 0 ) {
|
|
ptr_linkdbData = NULL;
|
|
size_linkdbData = 0;
|
|
return (int32_t *)0x1234;
|
|
}
|
|
|
|
// if set from titlerec then assume each site is the full hostname
|
|
// of the link, unless its specified explicitly in the hashtablex
|
|
// serialized in ptr_linkdbData
|
|
if ( m_setFromTitleRec ) {
|
|
// this holds the sites that are not just the hostname
|
|
int32_t *p = (int32_t *)ptr_linkdbData;
|
|
int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData);
|
|
// loop over links
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the link
|
|
char *u = links->getLinkPtr(i);
|
|
// assume site is just the host
|
|
int32_t hostLen = 0;
|
|
char *host = ::getHost ( u , &hostLen );
|
|
int32_t siteHash32 = hash32 ( host , hostLen , 0 );
|
|
// unless give as otherwise
|
|
if ( p < pend && *p == i ) {
|
|
p++;
|
|
siteHash32 = *p;
|
|
p++;
|
|
}
|
|
// store that then. should not fail since we allocated
|
|
// right above
|
|
if ( ! m_linkSiteHashBuf.pushLong(siteHash32) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
// return ptr of array, which is a safebuf
|
|
return (int32_t *)m_linkSiteHashBuf.getBufStart();
|
|
}
|
|
|
|
// ptr_linkdbData will point into this buf
|
|
m_linkdbDataBuf.purge();
|
|
|
|
// loop through them
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the link
|
|
char *u = links->getLinkPtr(i);
|
|
// get full host from link
|
|
int32_t hostLen = 0;
|
|
char *host = ::getHost ( u , &hostLen );
|
|
int32_t hostHash32 = hash32 ( host , hostLen , 0 );
|
|
// get the site
|
|
TagRec *gr = (*grv)[i];
|
|
char *site = NULL;
|
|
int32_t siteLen = 0;
|
|
if ( gr ) {
|
|
int32_t dataSize = 0;
|
|
site = gr->getString("site",NULL,&dataSize);
|
|
if ( dataSize ) siteLen = dataSize - 1;
|
|
}
|
|
// otherwise, make it the host or make it cut off at
|
|
// a "/user/" or "/~xxxx" or whatever path component
|
|
if ( ! site ) {
|
|
// GUESS link site... like /~xxx
|
|
site = host;
|
|
siteLen = hostLen;
|
|
}
|
|
int32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
|
|
// only store if different form host itself
|
|
if ( linkeeSiteHash32 != hostHash32 ) {
|
|
if ( ! m_linkdbDataBuf.pushLong(i) )
|
|
return NULL;
|
|
if ( ! m_linkdbDataBuf.pushLong(linkeeSiteHash32) )
|
|
return NULL;
|
|
}
|
|
// store it always in this buf
|
|
if ( ! m_linkSiteHashBuf.pushLong(linkeeSiteHash32) ) {
|
|
// space should have been reserved above!
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
// set ptr_linkdbData
|
|
ptr_linkdbData = m_linkdbDataBuf.getBufStart();
|
|
size_linkdbData = m_linkdbDataBuf.length();
|
|
m_linkSiteHashesValid = true;
|
|
|
|
return (int32_t *)m_linkSiteHashBuf.getBufStart();
|
|
}
|
|
|
|
Links *XmlDoc::getLinks ( bool doQuickSet ) {
|
|
if ( m_linksValid ) return &m_links;
|
|
// set status
|
|
setStatus ( "getting outlinks");
|
|
|
|
// . add links from diffbot reply
|
|
// . get the reply of json objects from diffbot
|
|
// . this will be empty if we are a json object!
|
|
// . will also be empty if not meant to be sent to diffbot
|
|
// . the TOKENIZED reply consists of \0 separated json objects that
|
|
// we create from the original diffbot reply
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( ! dbr || dbr == (void *)-1 ) return (Links *)dbr;
|
|
|
|
// this will set it if necessary
|
|
Xml *xml = getXml();
|
|
// bail on error
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml;
|
|
// can't call getIsPermalink() here without entering a dependency loop
|
|
char *pp = getIsUrlPermalinkFormat();
|
|
if ( !pp || pp == (char *)-1 ) return (Links *)pp;
|
|
// use the old xml doc
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (Links *)od;
|
|
// get Links class of the old title rec
|
|
Links *oldLinks = NULL;
|
|
// if we were set from a title rec, do not do this
|
|
if ( *od ) {
|
|
oldLinks = (*od)->getLinks();
|
|
if (!oldLinks||oldLinks==(Links *)-1) return (Links *)oldLinks;
|
|
}
|
|
Url *baseUrl = getBaseUrl();
|
|
if ( ! baseUrl || baseUrl==(Url *)-1) return (Links *)baseUrl;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (Links *)ip;
|
|
// this ensures m_contentLen is set
|
|
//char **content = getContent();
|
|
//if ( ! content || content == (char **)-1 ) return (Links *)content;
|
|
// this will set ptr_indCatIds and size_indCatIds
|
|
int32_t **pici = getIndCatIds();
|
|
if ( ! pici || pici == (void *)-1 ) return (Links *)pici;
|
|
char *ict = getIsContentTruncated();
|
|
if ( ! ict || ict == (char *)-1 ) return (Links *)ict;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (Links *)sni;
|
|
// get the latest url we are on
|
|
Url *u = getCurrentUrl();
|
|
|
|
//
|
|
// if we had a EDOCSIMPLIFIEDREDIR error, pretend it is a link
|
|
// so addOutlinkSpiderRecsToMetaList() will add it to spiderdb
|
|
//
|
|
if ( m_indexCodeValid && m_indexCode == EDOCSIMPLIFIEDREDIR ) {
|
|
m_links.set ( m_redirUrl.getUrl(),m_redirUrl.getUrlLen() );
|
|
m_linksValid = true;
|
|
return &m_links;
|
|
}
|
|
|
|
if ( m_indexCodeValid && m_indexCode == EDOCNONCANONICAL ) {
|
|
m_links.set(m_canonicalRedirUrl.getUrl(),
|
|
m_canonicalRedirUrl.getUrlLen());
|
|
m_linksValid = true;
|
|
return &m_links;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
bool useRelNoFollow = true;
|
|
if ( ! cr->m_obeyRelNoFollowLinks ) useRelNoFollow = false;
|
|
// to keep things simple, for diffbot custom crawls, if robots.txt
|
|
// is not used then do not use rel no follow
|
|
if ( ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
|
|
useRelNoFollow = false;
|
|
|
|
// . set it
|
|
// . if parent is a permalink we can avoid its suburl outlinks
|
|
// containing "comment" from being classified as permalinks
|
|
if ( ! m_links.set ( useRelNoFollow ,
|
|
xml ,
|
|
u ,
|
|
true , // setLinkHashes?
|
|
baseUrl ,
|
|
m_version ,
|
|
m_niceness ,
|
|
*pp , // parent url in permalink format?
|
|
oldLinks ,// oldLinks, might be NULL!
|
|
doQuickSet ,
|
|
dbr ) )
|
|
return NULL;
|
|
|
|
m_linksValid = true;
|
|
|
|
// do not bother setting that bit if we are being called for link
|
|
// text because that bit was already in the linkdb key, and it
|
|
// was set to zero! so if getting msg20 reply.... bail now
|
|
if ( m_req ) return &m_links;
|
|
|
|
// . apply link spam settings
|
|
// . set the "spam bits" in the Links class
|
|
setLinkSpam ( *ip ,
|
|
ptr_indCatIds ,
|
|
size_indCatIds / 4 ,
|
|
u , // linker url
|
|
*sni ,
|
|
xml ,
|
|
&m_links ,
|
|
*ict ,
|
|
m_niceness );
|
|
// we got it
|
|
return &m_links;
|
|
}
|
|
|
|
|
|
HashTableX *XmlDoc::getCountTable ( ) {
|
|
// return it if we got it
|
|
if ( m_countTableValid ) return &m_countTable;
|
|
|
|
setStatus ("getting count table");
|
|
|
|
// get the stuff we need
|
|
Xml *xml = getXml ();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (HashTableX *)xml;
|
|
Words *words = getWords ();
|
|
if ( ! words || words == (Words *)-1 ) return (HashTableX *)words;
|
|
Phrases *phrases = getPhrases ();
|
|
if ( ! phrases || phrases==(Phrases *)-1) return (HashTableX *)phrases;
|
|
Bits *bits = getBits ();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (HashTableX *)bits;
|
|
Sections *sections = getSections();
|
|
if ( !sections||sections==(Sections *)-1) return(HashTableX *)sections;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (HashTableX *)info1;
|
|
|
|
// . reduce score of words in badly repeated fragments to 0 so we do
|
|
// not count them here!
|
|
// . ff[i] will have score of 0 if in repeated frag
|
|
// . make sure this is stored for whole doc... since we only use it
|
|
// for the body
|
|
char *fv = getFragVec();
|
|
if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv;
|
|
|
|
//LinkInfo *info2 = getLinkInfo2();
|
|
//if ( ! info2 || info2 == (LinkInfo *)-1 ) return (HashTableX *)info2;
|
|
|
|
// init our count table otherwise
|
|
//if(! m_countTable.set( 8,4,1024,NULL,0,false,m_niceness,"xmlcnttbl"))
|
|
// return NULL;
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
//
|
|
// this was in Weights.cpp, but now it is here...
|
|
//
|
|
|
|
// int16_tcut
|
|
HashTableX *ct = &m_countTable;
|
|
|
|
// reset the counts, just in case set() below does not
|
|
//ct->reset();
|
|
|
|
// ez var
|
|
int64_t *wids = words->getWordIds ();
|
|
nodeid_t *tids = words->getTagIds ();
|
|
int32_t nw = words->getNumWords ();
|
|
char **wptrs = words->m_words;
|
|
int32_t *wlens = words->m_wordLens;
|
|
int64_t *pids = phrases->getPhraseIds2();
|
|
|
|
// add 5000 slots for inlink text in hashString_ct() calls below
|
|
int32_t numSlots = nw * 3 + 5000;
|
|
// only alloc for this one if not provided
|
|
if (!ct->set(8,4,numSlots,NULL,0,false,m_niceness,"xmlct"))
|
|
return (HashTableX *)NULL;
|
|
|
|
//char *ff = getFragVec ( ) ;
|
|
//if ( ! ff ) return false;
|
|
|
|
// . now hash all the phrase ids we have in order to see if the phrase
|
|
// is unique or not. if phrase is repeated a lot we punish the scores
|
|
// of the individual words in the phrase and boost the score of the
|
|
// phrase itself. We check for uniqueness down below.
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// add the word
|
|
if ( wids[i] == 0LL ) continue;
|
|
//if ( wids[i] == 708411945052722517LL )
|
|
// log("hey4 got new pid=%" INT64 " i=%" INT32 "",pids[i],i);
|
|
// . skip if in repeated fragment
|
|
// . unfortunately we truncate the frag vec to like
|
|
// the first 80,000 words for performance reasons
|
|
if ( i < MAXFRAGWORDS && fv[i] == 0 ) continue;
|
|
// accumulate the wid with a score of 1 each time it occurs
|
|
if ( ! ct->addTerm ( &wids[i] ) ) return (HashTableX *)NULL;
|
|
// skip if word #i does not start a phrase
|
|
if ( ! pids [i] ) continue;
|
|
// if phrase score is less than 100% do not consider as a
|
|
// phrase so that we do not phrase "albuquerque, NM" and stuff
|
|
// like that... in fact, we can only have a space here...
|
|
if ( wptrs[i+1][0] == ',' ) continue;
|
|
if ( wptrs[i+1][1] == ',' ) continue;
|
|
if ( wptrs[i+1][2] == ',' ) continue;
|
|
// put it in, accumulate, max score is 0x7fffffff
|
|
if ( ! ct->addTerm ( &pids[i] ) ) return (HashTableX *)NULL;
|
|
}
|
|
|
|
// now add each meta tag to the pot
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not a meta tag
|
|
if ( tids[i] != 68 ) continue;
|
|
// find the "content=" word
|
|
char *w = wptrs[i];
|
|
int32_t wlen = wlens[i];
|
|
char *wend = w + wlen;
|
|
char *p ;
|
|
p = strncasestr (w,wlen,"content=");
|
|
// skip if we did not have any content in this meta tag
|
|
if ( ! p ) continue;
|
|
// skip the "content="
|
|
p += 8;
|
|
// skip if empty meta content
|
|
if ( wend - p <= 0 ) continue;
|
|
// our ouw hash
|
|
if ( ! hashString_ct ( ct , p , wend - p ) )
|
|
return (HashTableX *)NULL;
|
|
}
|
|
// add each incoming link text
|
|
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// int16_tcuts
|
|
char *p;
|
|
int32_t plen;
|
|
// hash link text (was hashPwids())
|
|
p = k-> getLinkText();
|
|
plen = k->size_linkText - 1;
|
|
if ( ! verifyUtf8 ( p , plen ) ) {
|
|
log("xmldoc: bad link text 3 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
if ( ! hashString_ct ( ct , p , plen ) )
|
|
return (HashTableX *)NULL;
|
|
// hash this stuff (was hashPwids())
|
|
p = k->getSurroundingText();
|
|
plen = k->size_surroundingText - 1;
|
|
if ( ! hashString_ct ( ct , p , plen ) )
|
|
return (HashTableX *)NULL;
|
|
}
|
|
|
|
// we got it
|
|
m_countTableValid = true;
|
|
return &m_countTable;
|
|
}
|
|
|
|
// . a special function used by XmlDoc::getCountTable() above
|
|
// . kinda similar to XmlDoc::hashString()
|
|
bool XmlDoc::hashString_ct ( HashTableX *ct , char *s , int32_t slen ) {
|
|
|
|
Words words;
|
|
Bits bits;
|
|
Phrases phrases;
|
|
if ( ! words.set ( s , slen , m_version , true , m_niceness ) )
|
|
return false;
|
|
if ( ! bits.set ( &words , m_version , m_niceness ) )
|
|
return false;
|
|
if ( ! phrases.set(&words,&bits,true,false,m_version,m_niceness))
|
|
return false;
|
|
int32_t nw = words.getNumWords();
|
|
int64_t *wids = words.getWordIds();
|
|
int64_t *pids = phrases.m_phraseIds2;
|
|
char **wptrs = words.m_words;
|
|
int32_t *wlens = words.m_wordLens;
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// add the word
|
|
if ( wids[i] == 0LL ) continue;
|
|
// skip if in repeated fragment
|
|
// . NO, we do not use this for these int16_t strings
|
|
//if ( ww[i] == 0 ) continue;
|
|
// accumulate the wid with a score of 1 each time it occurs
|
|
if ( ! ct->addTerm ( &wids[i] ) ) return false;
|
|
// skip if word #i does not start a phrase
|
|
if ( ! pids [i] ) continue;
|
|
// if phrase score is less than 100% do not consider as a
|
|
// phrase so that we do not phrase "albuquerque, NM" and stuff
|
|
// like that... in fact, we can only have a space here...
|
|
if ( i+1<nw ) {
|
|
if ( wptrs[i+1][0] == ',' ) continue;
|
|
if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
|
|
if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
|
|
}
|
|
// put it in, accumulate, max score is 0x7fffffff
|
|
if ( ! ct->addTerm ( &pids[i] ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
uint8_t *XmlDoc::getSummaryLangId ( ) {
|
|
// return if we got it already
|
|
if ( m_summaryLangIdValid ) return &m_summaryLangId;
|
|
Summary *s = getSummary();
|
|
if ( ! s || s == (void *)-1 ) return (uint8_t *)s;
|
|
char *sum = s->getSummary();
|
|
// now set the words class
|
|
Words ww;
|
|
if ( ! ww.set9 ( sum , m_niceness ) ) return NULL;
|
|
// check it out. 0 means langUnknown. -1 means error.
|
|
int32_t ret = ww.getLanguage ( NULL , 100 , m_niceness , NULL );
|
|
// -1 means error! g_errno should be set
|
|
if ( ret < 0 ) return NULL;
|
|
// set it
|
|
m_summaryLangId = (uint8_t)ret;
|
|
// assume valid
|
|
m_summaryLangIdValid = true;
|
|
// return it
|
|
return &m_summaryLangId;
|
|
}
|
|
|
|
int cmp ( const void *h1 , const void *h2 ) ;
|
|
|
|
// vector components are 32-bit hashes
|
|
int32_t *XmlDoc::getTagPairHashVector ( ) {
|
|
|
|
if ( m_tagPairHashVecValid ) return m_tagPairHashVec;
|
|
|
|
Xml *xml = getXml ();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
|
|
|
|
// store the hashes here
|
|
uint32_t hashes [ 2000 ];
|
|
int32_t nh = 0;
|
|
// go through each node
|
|
XmlNode *nodes = xml->getNodes ();
|
|
int32_t n = xml->getNumNodes ();
|
|
|
|
// start with the ith node
|
|
int32_t i = 0;
|
|
|
|
uint32_t saved = 0;
|
|
uint32_t lastHash = 0;
|
|
// loop over the nodes
|
|
for ( ; i < n ; i++ ) {
|
|
// breathe a little
|
|
QUICKPOLL ( m_niceness );
|
|
// skip NON tags
|
|
if ( ! nodes[i].isTag() ) continue;
|
|
// use the tag id as the hash, its unique
|
|
uint32_t h = hash32h ( nodes[i].getNodeId() , 0 );
|
|
// ensure hash is not 0, that has special meaning
|
|
if ( h == 0 ) h = 1;
|
|
// store in case we have only one hash
|
|
saved = h;
|
|
|
|
// if we are the first, set this
|
|
if ( ! lastHash ) {
|
|
lastHash = h;
|
|
continue;
|
|
}
|
|
|
|
// if they were the same do not xor, they will zero out
|
|
if ( h == lastHash ) hashes[nh++] = h;
|
|
// incorporate it into the last hash
|
|
else hashes[nh++] = h ^ lastHash;
|
|
|
|
// we are the new last hash
|
|
lastHash = h;
|
|
// bust out if no room
|
|
if ( nh >= 2000 ) break;
|
|
}
|
|
// if only had one tag after, use that
|
|
if ( nh == 0 && saved ) hashes[nh++] = saved;
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness ) ;
|
|
// . TODO: remove the link text hashes here?
|
|
// . because will probably be identical..
|
|
// . now sort hashes to get the top MAX_PAIR_HASHES
|
|
gbsort ( hashes , nh , 4 , cmp );
|
|
// breathe
|
|
QUICKPOLL ( m_niceness ) ;
|
|
// uniquify them
|
|
int32_t d = 0;
|
|
for ( int32_t j = 1 ; j < nh ; j++ ) {
|
|
if ( hashes[j] == hashes[d] ) continue;
|
|
hashes[++d] = hashes[j];
|
|
}
|
|
// breathe
|
|
QUICKPOLL ( m_niceness ) ;
|
|
// how many do we got?
|
|
nh = d;
|
|
// truncate to MAX_PAIR_HASHES MINUS 1 so we can put a 0 at the end
|
|
if ( nh > MAX_TAG_PAIR_HASHES-1 ) nh = MAX_TAG_PAIR_HASHES-1;
|
|
// store the top MAX_PAIR_HASHES
|
|
gbmemcpy ( m_tagPairHashVec , hashes , nh * 4 );
|
|
// null term it. all vectors need this so computeSimilarity() works
|
|
m_tagPairHashVec [ nh++ ] = 0;
|
|
m_tagPairHashVecValid = true;
|
|
m_tagPairHashVecSize = nh * 4;
|
|
return m_tagPairHashVec;
|
|
}
|
|
|
|
// sort in descending order
|
|
int cmp ( const void *h1 , const void *h2 ) {
|
|
return *(uint32_t *)h2 - *(uint32_t *)h1;
|
|
}
|
|
|
|
// . m_tagVector.setTagPairHashes(&m_xml, niceness);
|
|
// . Sections.cpp and getIsDup() both use this hash
|
|
// . returns NULL and sets g_errno on error
|
|
// . xors all the unique adjacent tag hashes together
|
|
// . kind of represents the template the web pages uses
|
|
// . we add this to sectiondb as a vote in Sections::addVotes()
|
|
uint32_t *XmlDoc::getTagPairHash32 ( ) {
|
|
|
|
// only compute once
|
|
if ( m_tagPairHash32Valid ) return &m_tagPairHash32;
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (uint32_t *)words;
|
|
|
|
// int16_tcuts
|
|
//int64_t *wids = words->getWordIds ();
|
|
nodeid_t *tids = words->getTagIds ();
|
|
int32_t nw = words->getNumWords ();
|
|
int32_t nt = words->m_numTags;
|
|
|
|
// . get the hash of all the tag pair hashes!
|
|
// . we then combine that with our site hash to get our site specific
|
|
// html template termid
|
|
// . put all tag pairs into a hash table
|
|
// . similar to Vector::setTagPairHashes() but we do not compute a
|
|
// vector, just a single scalar/hash of 32 bits, m_termId
|
|
HashTableX tp; // T<int64_t,char> tp;
|
|
if ( ! tp.set ( 4 , 1 , nt * 4 , NULL , 0 , true,m_niceness,"xmltp"))
|
|
return 0LL;
|
|
uint32_t lastTid = 0;
|
|
char val = 1;
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// skip if not tag
|
|
if ( tids[i] == 0LL ) continue;
|
|
// skip if back tag
|
|
if ( tids[i] & BACKBIT ) continue;
|
|
// get last tid
|
|
uint32_t h = hash32h ( tids[i] , lastTid );
|
|
//logf(LOG_DEBUG,"build: tph %" INT32 " h=%" UINT64 "",i,(int64_t)h);
|
|
// . add to table (skip if 0, means empty bucket)
|
|
// . return NULL and set g_errno on error
|
|
if ( h && ! tp.addKey ( &h , &val ) ) return NULL;
|
|
// update this
|
|
lastTid = h;
|
|
}
|
|
// linear scan on hash table to get all the hash, XOR together
|
|
uint32_t hx = 0;
|
|
int32_t nb = tp.getNumSlots();
|
|
char *flags = tp.m_flags;
|
|
// get keys
|
|
uint32_t *keys = (uint32_t *)tp.m_keys;
|
|
for ( int32_t i = 0 ; i < nb ; i++ ) {
|
|
// skip if empty
|
|
if ( flags[i] == 0 ) continue;
|
|
// skip if empty
|
|
//if ( keys[i] == 0LL ) continue;
|
|
// incorporate
|
|
hx ^= keys[i];
|
|
}
|
|
// never return 0, make it 1. 0 means an error
|
|
if ( hx == 0 ) hx = 1;
|
|
// set the hash
|
|
m_tagPairHash32 = hx ;
|
|
// it is now valid
|
|
m_tagPairHash32Valid = true;
|
|
return &m_tagPairHash32;
|
|
}
|
|
|
|
// . used for deduping search results
|
|
// . also uses the title
|
|
int32_t *XmlDoc::getSummaryVector ( ) {
|
|
if ( m_summaryVecValid ) return (int32_t *)m_summaryVec;
|
|
Summary *s = getSummary();
|
|
if ( ! s || s == (Summary *)-1 ) return (int32_t *)s;
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (int32_t *)ti;
|
|
// store title and summary into "buf" so we can call words.set()
|
|
//char buf[5000];
|
|
SafeBuf sb;
|
|
//char *p = buf;
|
|
//int32_t avail = 5000;
|
|
//int32_t len;
|
|
// put title into there
|
|
int32_t tlen = ti->m_titleBytes - 1;
|
|
//if ( len > avail ) len = avail - 10;
|
|
if ( tlen < 0 ) tlen = 0;
|
|
|
|
// put summary into there
|
|
int32_t slen = s->m_summaryLen;
|
|
|
|
// allocate space
|
|
int32_t need = tlen + 1 + slen + 1;
|
|
if ( ! sb.reserve ( need ) ) return NULL;
|
|
|
|
//gbmemcpy ( p , ti->m_title , len );
|
|
//p += len;
|
|
sb.safeMemcpy ( ti->m_title , tlen );
|
|
// space separating the title from summary
|
|
if ( tlen > 0 ) sb.pushChar(' ');
|
|
|
|
//if ( len > avail ) len = avail - 10;
|
|
//gbmemcpy ( p , s->m_summary , len );
|
|
//p += len;
|
|
sb.safeMemcpy ( s->m_summary , slen );
|
|
// null terminate it
|
|
//*p = '\0';
|
|
sb.nullTerm();
|
|
// word-ify it
|
|
Words words;
|
|
if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL;
|
|
// . now set the dedup vector from big summary and title
|
|
// . store sample vector in here
|
|
// . returns size in bytes including null terminating int32_t
|
|
m_summaryVecSize = computeVector ( NULL , &words ,
|
|
(uint32_t *)m_summaryVec );
|
|
m_summaryVecValid = true;
|
|
return m_summaryVec;
|
|
}
|
|
|
|
|
|
bool getWordVector ( char *s ,
|
|
HashTableX *ht ,
|
|
uint32_t *d ,
|
|
int32_t *nd ,
|
|
int32_t ndmax ) {
|
|
// utf8 char size
|
|
char size;
|
|
// grab each word and hash it
|
|
for ( ; *s ; s += size ) {
|
|
// get size
|
|
size = getUtf8CharSize(s);
|
|
// skip if tag
|
|
if ( *s == '<' ) {
|
|
while ( *s && *s!='>' )
|
|
s += getUtf8CharSize(s);
|
|
continue;
|
|
}
|
|
// skip if other type of punct
|
|
if ( ! is_alnum_utf8(s) ) continue;
|
|
// ok, we got a word then
|
|
char *start = s;
|
|
// see how long the word is
|
|
for ( ; *s && is_alnum_utf8(s);s+=getUtf8CharSize(s));
|
|
// get wordid, a simple hash, just like Words.cpp does
|
|
uint64_t h = hash64Lower_utf8(start,s - start);
|
|
// do not inc this time
|
|
size = 0;
|
|
// breathe
|
|
//QUICKPOLL ( m_niceness );
|
|
// make 32 bit
|
|
uint32_t wid32 = (uint32_t)h;
|
|
//
|
|
// TODO: ignore if it is a day name or month name or
|
|
// number because those are like dates
|
|
//
|
|
if ( ht ) {
|
|
// do not add if we already got it
|
|
if ( ht->getSlot ( &wid32 ) >= 0 ) continue;
|
|
// add to hash table. return NULL and set g_errno onerr
|
|
if ( ! ht->addKey (&wid32 )) return false;
|
|
}
|
|
// add it to our vector
|
|
d[*nd] = (uint32_t)wid32;
|
|
// inc it
|
|
*nd = *nd + 1;
|
|
// stop after 3000 for sure
|
|
if ( *nd >= ndmax ) return true;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// used by getIsDup() and Dates.cpp for detecting dups and for
|
|
// seeing if the content changed respectively
|
|
int32_t *XmlDoc::getPageSampleVector ( ) {
|
|
if ( m_pageSampleVecValid ) return m_pageSampleVec;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
|
|
Sections *ss = NULL;
|
|
//if ( m_eliminateMenus ) {
|
|
//ss = getSections();
|
|
//if ( ! ss || ss == (Sections *)-1) return (int32_t *)ss;
|
|
//}
|
|
m_pageSampleVecSize = computeVector ( ss, ww,
|
|
(uint32_t *)m_pageSampleVec );
|
|
m_pageSampleVecValid = true;
|
|
return m_pageSampleVec;
|
|
}
|
|
|
|
// . this is the vector of the words right after the hypertext for the link
|
|
// we are voting on.
|
|
// . it is used to dedup voters in Msg25.cpp
|
|
int32_t *XmlDoc::getPostLinkTextVector ( int32_t linkNode ) {
|
|
|
|
if ( m_postVecValid ) return m_postVec;
|
|
// assume none
|
|
m_postVecSize = 0;
|
|
|
|
// set up
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
|
|
|
|
// sanity check
|
|
if ( linkNode < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// linkNode starts pointing to a <a> tag so skip over that!
|
|
linkNode++;
|
|
// limit
|
|
int32_t nn = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// and advance i to the next anchor tag thereafter, we do not
|
|
// want to include link text in this vector because it is usually
|
|
// repeated and will skew our "similarities"
|
|
for ( ; linkNode < nn ; linkNode++ ) {
|
|
// stop if we hit </a> or <a>
|
|
if ( (nodes[linkNode].m_nodeId & BACKBITCOMP) != 2 ) continue;
|
|
// advance over the </a> or <a>
|
|
linkNode++;
|
|
// then stop, we will start gathering link text here
|
|
break;
|
|
}
|
|
// if we hit end of the doc, we got not vector then
|
|
if ( linkNode >= nn ) return m_postVec;
|
|
|
|
// now convert the linkNode # to a word #, "start"
|
|
int32_t nw = ww->getNumWords ();
|
|
int64_t *wids = ww->getWordIds ();
|
|
nodeid_t *tids = ww->getTagIds ();
|
|
int32_t *wn = ww->m_nodes;
|
|
int32_t i = 0;
|
|
for ( ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// stop when we got the first word in this node #
|
|
if ( wn[i] == linkNode ) break;
|
|
}
|
|
// if none, bail now, size is 0
|
|
if ( i >= nw ) return m_postVec;
|
|
// save that
|
|
int32_t start = i;
|
|
|
|
// likewise, set the end of it
|
|
int32_t end = nw;
|
|
// count alnum words
|
|
int32_t count = 0;
|
|
// limit it
|
|
for ( i = start ; i < nw && count < 35 ; i++ ) {
|
|
// get tag id
|
|
nodeid_t tid = tids[i] & BACKBITCOMP;
|
|
// stop if certain ones
|
|
if ( tid == TAG_TABLE ) break;
|
|
if ( tid == TAG_UL ) break;
|
|
// <a>, </a> is ok
|
|
if ( tids[i] == TAG_A ) break;
|
|
// only up to 35 words allowed in the hash
|
|
if ( wids[i] ) count++;
|
|
}
|
|
// set the end of the words to hash
|
|
end = i;
|
|
// specify starting node # now
|
|
m_postVecSize = computeVector(NULL,ww,(uint32_t *)m_postVec,start,end);
|
|
// return what we got
|
|
return m_postVec;
|
|
}
|
|
|
|
// . was kinda like "m_tagVector.setTagPairHashes(&m_xml, niceness);"
|
|
// . this is used by getIsDup() (below)
|
|
// . this is used by Dates.cpp to see how much a doc has changed
|
|
// . this is also now used for getting the title/summary vector for deduping
|
|
// search results
|
|
// . if we couldn't extract a good pub date for the doc, and it has changed
|
|
// since last spidered, use the bisection method to come up with our own
|
|
// "last modified date" which we use as the pub date.
|
|
// . this replaces the clusterdb.getSimilarity() logic in Msg14.cpp used
|
|
// to do the same thing. but we call Vector::setForDates() from
|
|
// Dates.cpp. that way the logic is more contained in Dates!
|
|
// . doesn't Msg14 already do that?
|
|
// . yes, but it uses two TermTables and calls Clusterdb::getSimilarity()
|
|
// . returns false and sets g_errno on error
|
|
// . these words classes should have been set by a call to Words::set(Xml *...)
|
|
// so that we have "tids1" and "tids2"
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . TODO: if our title rec is non-empty consider getting it from that
|
|
// . we use this vector to compare two docs to see how similar they are
|
|
int32_t XmlDoc::computeVector ( Sections *sections, Words *words, uint32_t *vec ,
|
|
int32_t start , int32_t end ) {
|
|
|
|
// assume empty vector
|
|
vec[0] = 0;
|
|
|
|
// skip if no article section. then we have no vector.
|
|
if ( sections && ! sections->m_hadArticle ) return 0;
|
|
|
|
// int16_tcuts
|
|
int32_t nw = words->getNumWords();
|
|
//int32_t nt = words->m_numTags;
|
|
int64_t *wids = words->getWordIds();
|
|
|
|
// set the end to the real end if it was specified as less than zero
|
|
if ( end < 0 ) end = nw;
|
|
|
|
// # of alnum words, about... minus the tags, then the punct words
|
|
// are half of what remains...
|
|
int32_t count = words->m_numAlnumWords;
|
|
|
|
// if we got sections, how many good words?
|
|
if ( sections ) count = sections->m_numAlnumWordsInArticle;
|
|
|
|
// google seems to index SEC_MARQUEE so i took that out
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
|
|
|
|
// these Section ptrs are 1-1 with the words
|
|
Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs;
|
|
|
|
// . Get sample vector from content section only.
|
|
// . This helps remove duplicate menu/ad from vector
|
|
|
|
// 4 bytes per hash, save the last one for a NULL terminator, 0 hash
|
|
int32_t maxTerms = SAMPLE_VECTOR_SIZE / 4 - 1;
|
|
// what portion of them do we want to mask out from the rest?
|
|
int32_t ratio = count / maxTerms ;
|
|
// a mask of 0 means to get them all
|
|
unsigned char mask = 0x00;
|
|
// if we got twice as many terms as we need, then set mask to 0x01
|
|
// to filter out half of them! but actually, let's aim for twice
|
|
// as many as we need to ensure we really get as many as we need.
|
|
// so if we got 4 or more than we need then cut in half...
|
|
while ( ratio >= 4 ) {
|
|
// shift the mask down, ensure hi bit is set
|
|
mask >>= 1;
|
|
mask |= 0x80;
|
|
ratio >>= 1; // /2
|
|
}
|
|
|
|
// store vector into "d" for now. will sort below
|
|
uint32_t d [ 3000 ];
|
|
|
|
// dedup our vector using this hashtable, "ht"
|
|
char hbuf[3000*6*2];
|
|
HashTableX ht;
|
|
if ( ! ht.set(4,0,3000,hbuf,3000*6*2,false,m_niceness,"xmlvecdedup")){
|
|
char*xx=NULL;*xx=0;}
|
|
|
|
again:
|
|
// a buffer to hold the top termIds
|
|
int32_t nd = 0;
|
|
// count how many we mask out
|
|
int32_t mo = 0;
|
|
// . buffer should have at least "maxTerms" in it
|
|
// . these should all be 12 byte keys
|
|
for ( int32_t i = start ; i < end ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum word
|
|
if ( wids[i] == 0 ) continue;
|
|
// skip if mask filters it
|
|
if ( ((wids[i]>>(NUMTERMIDBITS-8)) & mask)!=0) {mo++;continue;}
|
|
// skip if in select, style, script or marquee tag section
|
|
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
|
|
// make 32 bit
|
|
uint32_t wid32 = (uint32_t)wids[i];
|
|
// do not add if we already got it
|
|
if ( ht.getSlot ( &wid32 ) >= 0 ) continue;
|
|
// add to hash table. return NULL and set g_errno on error
|
|
if ( ! ht.addKey (&wid32 )){char*xx=NULL;*xx=0; }
|
|
// add it to our vector
|
|
d[nd] = (uint32_t)wids[i];
|
|
// stop after 3000 for sure
|
|
if ( ++nd < 3000 ) continue;
|
|
// bitch and break out on error
|
|
log(LOG_INFO,"build: Sample vector overflow. Slight "
|
|
"performance hit.");
|
|
break;
|
|
}
|
|
|
|
// . if nd was too small, don't use a mask to save time
|
|
// . well just make the mask less restrictive
|
|
if ( nd < maxTerms && mask && mo ) {
|
|
// shift the mask UP, allow more termIds to pass through
|
|
mask <<= 1;
|
|
// reset hash table since we are starting over
|
|
ht.clear();
|
|
goto again;
|
|
}
|
|
|
|
// bubble sort them
|
|
bool flag = true;
|
|
while ( flag ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
flag = false;
|
|
for ( int32_t i = 1 ; i < nd ; i++ ) {
|
|
if ( d[i-1] <= d[i] ) continue;
|
|
uint32_t tmp = d[i-1];
|
|
d[i-1] = d[i];
|
|
d[i] = tmp;
|
|
flag = true;
|
|
}
|
|
}
|
|
|
|
// truncate
|
|
if ( nd > maxTerms ) nd = maxTerms;
|
|
// null terminate
|
|
d [ nd++ ] = 0;
|
|
// store in our sample vector
|
|
gbmemcpy ( vec , d , nd * 4 );
|
|
// return size in bytes
|
|
return nd * 4;
|
|
}
|
|
|
|
float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
|
|
int32_t *tv1 = getTagPairHashVector();
|
|
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
|
|
int32_t *tv2 = xd2->getTagPairHashVector();
|
|
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
|
|
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
|
|
m_niceness );
|
|
// this means error, g_errno should be set
|
|
if ( m_tagSimilarity == -1.0 ) return NULL;
|
|
return &m_tagSimilarity;
|
|
}
|
|
|
|
float *XmlDoc::getGigabitSimilarity ( XmlDoc *xd2 ) {
|
|
int32_t **gv1 = getGigabitHashes();
|
|
if ( ! gv1 || gv1 == (int32_t **)-1 ) return (float *)gv1;
|
|
int32_t **gv2 = xd2->getGigabitHashes();
|
|
if ( ! gv2 || gv2 == (int32_t **)-1 ) return (float *)gv2;
|
|
// *gv1 could be NULL if vec was empty in titlerec's ptr_gigabitHashes
|
|
m_gigabitSimilarity = computeSimilarity ( *gv1, *gv2, NULL, NULL, NULL,
|
|
m_niceness );
|
|
// this means error, g_errno should be set
|
|
if ( m_gigabitSimilarity == -1.0 ) return NULL;
|
|
return &m_gigabitSimilarity;
|
|
}
|
|
|
|
float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
|
|
int32_t *sv1 = getPageSampleVector();
|
|
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
|
|
int32_t *sv2 = xd2->getPageSampleVector();
|
|
if ( ! sv2 || sv2 == (int32_t *)-1 ) return (float *)sv2;
|
|
m_pageSimilarity = computeSimilarity ( sv1, sv2, NULL, NULL, NULL,
|
|
m_niceness );
|
|
// this means error, g_errno should be set
|
|
if ( m_pageSimilarity == -1.0 ) return NULL;
|
|
return &m_pageSimilarity;
|
|
}
|
|
|
|
// . compare old page vector with new
|
|
// . returns ptr to a float from 0.0 to 100.0
|
|
float *XmlDoc::getPercentChanged ( ) {
|
|
// if we got it
|
|
if ( m_percentChangedValid ) return &m_percentChanged;
|
|
// get the old doc
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (float *)od;
|
|
// if empty, assume 0% changed
|
|
if ( ! *od ) {
|
|
m_percentChanged = 0;
|
|
m_percentChangedValid = true;
|
|
return &m_percentChanged;
|
|
}
|
|
// get its page c
|
|
float *ps = getPageSimilarity ( *od );
|
|
if ( ! ps || ps == (float *)-1 ) return (float *)ps;
|
|
// got it
|
|
m_percentChanged = *ps;
|
|
m_percentChangedValid = true;
|
|
// just return it
|
|
return &m_percentChanged;
|
|
}
|
|
|
|
// . Address.cpp converts a place name into a vector for comparing via a
|
|
// call to computeSimilarity() below
|
|
// . returns -1 and set g_errno on error
|
|
// . "vbufSize" is in BYTES!
|
|
// . returns length of word vector in int32_ts (# components stored)
|
|
int32_t makeSimpleWordVector (char *s,int32_t *vbuf,int32_t vbufSize,int32_t niceness ) {
|
|
// nonsense?
|
|
if ( vbufSize < 4 ) { char *xx=NULL;*xx=0; }
|
|
// empty it
|
|
*vbuf = 0;
|
|
// no words, no vector
|
|
if ( ! s ) return 0;
|
|
// set them
|
|
Words w;
|
|
// return -1 with g_errno set on error
|
|
if ( ! w.set9 ( s , niceness ) ) return -1;
|
|
// skip if no words
|
|
if ( w.m_numWords == 0 ) return 0;
|
|
// int16_t cut
|
|
int64_t *wids = w.m_wordIds;
|
|
int64_t pid = 0LL;
|
|
// count insertions
|
|
int32_t count = 0;
|
|
// ptr
|
|
int32_t *vbufPtr = vbuf;
|
|
int32_t *vbufEnd = vbuf + vbufSize/4;
|
|
// put words into a vector
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// if no room stop. need room for NULL terminator
|
|
if ( vbufPtr + 2 >= vbufEnd ) return count;
|
|
// put it in
|
|
//*vbufPtr = (int32_t)wids[i];
|
|
// . use the synonym instead if it had one
|
|
// . maps "theatre" to "theater", "4th" to "fourth", etc.
|
|
// . false = is street name?
|
|
int64_t *p = getSynonymWord ( &wids[i] , &pid , false );
|
|
// set this
|
|
pid = wids[i];
|
|
//int64_t *p = (int64_t *)synTable->getValue64( wids[i] );
|
|
// 0 means to ignore it
|
|
if ( *p == 0LL ) continue;
|
|
// otherwise add into our vector
|
|
*vbufPtr = *p;
|
|
// advance
|
|
vbufPtr++;
|
|
// NULL termination
|
|
*vbufPtr = 0;
|
|
// count it
|
|
count++;
|
|
}
|
|
// all done
|
|
return count;
|
|
}
|
|
|
|
// . compare two vectors
|
|
// . components in vectors are int32_ts
|
|
// . last component is a zero, to mark EOV = end of vector
|
|
// . discount any termIds that are in the query vector, qvec, which may be NULL
|
|
// . returns -1 and sets g_errno on error
|
|
// . vector components are 32-bit hashes of the words (hash32())???
|
|
// i would say they should be the lower 32 bits of the 64-bit hashes!
|
|
// . replaces:
|
|
// g_clusterdb.getGigabitSimilarity()
|
|
// m_tagVec->getLinkBrotherProbability()
|
|
// g_clusterdb.getSampleSimilarity()
|
|
float computeSimilarity ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
int32_t *s0 , // corresponding scores vector
|
|
int32_t *s1 , // corresponding scores vector
|
|
Query *q ,
|
|
int32_t niceness ,
|
|
bool dedupVectors ) {
|
|
static int32_t s_tmp = 0;
|
|
if ( ! vec0 ) vec0 = &s_tmp;
|
|
if ( ! vec1 ) vec1 = &s_tmp;
|
|
// if both empty, assume not similar at all
|
|
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
|
|
// if either is empty, return 0 to be on the safe side
|
|
if ( *vec0 == 0 ) return 0;
|
|
if ( *vec1 == 0 ) return 0;
|
|
|
|
|
|
// flag if from query vector
|
|
HashTableX qt;
|
|
char qbuf[5000];
|
|
if ( q ) {
|
|
// init hash table
|
|
if ( ! qt.set ( 4,0,512,qbuf,5000,false,niceness,"xmlqvtbl") )
|
|
return -1;
|
|
// . stock the query term hash table
|
|
// . use the lower 32 bits of the termids to make compatible
|
|
// with the other vectors we use
|
|
//int64_t *qtids = q->getTermIds ();
|
|
int32_t nt = q->getNumTerms();
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
// get query term
|
|
QueryTerm *QT = &q->m_qterms[i];
|
|
// get the termid
|
|
int64_t termId = QT->m_termId;
|
|
// get it
|
|
uint32_t h = (uint32_t)(termId & 0xffffffff);
|
|
// hash it
|
|
if ( ! qt.addKey ( &h ) ) return -1;
|
|
}
|
|
}
|
|
|
|
// if we ignore cardinality then it only matters if both vectors
|
|
// have a particular value, and not how many times they each have it.
|
|
// so we essentially dedup each vector if dedupVectors is true.
|
|
// but we do total up the score and put it behind the one unique
|
|
// occurrence though. we do this only for
|
|
// Sections::addDateBasedImpliedSections() right now
|
|
bool allowDups = true;
|
|
if ( dedupVectors ) allowDups = false;
|
|
|
|
HashTableX ht;
|
|
char hbuf[10000];
|
|
if ( ! ht.set ( 4,4,-1,hbuf,10000,allowDups,niceness,"xmlqvtbl2"))
|
|
return -1;
|
|
|
|
bool useScores = (bool)s0;
|
|
|
|
int32_t matches = 0;
|
|
int32_t total = 0;
|
|
|
|
int32_t matchScore = 0;
|
|
int32_t totalScore = 0;
|
|
|
|
// hash first vector. accumulating score total and total count
|
|
for ( int32_t *p = vec0; *p ; p++ , s0++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip if matches a query term
|
|
if ( q && qt.getSlot ( p ) ) continue;
|
|
// count it
|
|
total++;
|
|
// get it
|
|
int32_t score = 1;
|
|
// get the score if valid
|
|
if ( useScores ) score = *s0;
|
|
// total it up
|
|
totalScore += score;
|
|
// add it
|
|
if ( dedupVectors ) {
|
|
// accumulate all the scores into this one bucket
|
|
// in the case of p being a dup
|
|
if ( ! ht.addTerm32 ( p , score ) ) return -1;
|
|
}
|
|
else {
|
|
// otherwise, add each into its own bucket since
|
|
// ht.m_allowDups should be true
|
|
if ( ! ht.addKey ( p , &score ) ) return -1;
|
|
}
|
|
}
|
|
|
|
int32_t zero = 0;
|
|
|
|
// see what components of this vector match
|
|
for ( int32_t *p = vec1; *p ; p++ , s1++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip if matches a query term
|
|
if ( q && qt.getSlot ( p ) ) continue;
|
|
// count it
|
|
total++;
|
|
// get it
|
|
int32_t score = 1;
|
|
// get the score if valid
|
|
if ( useScores ) score = *s1;
|
|
// and total scores
|
|
totalScore += score;
|
|
// is it in there?
|
|
int32_t slot = ht.getSlot ( p );
|
|
// skip if unmatched
|
|
if ( slot < 0 ) continue;
|
|
// otherwise, it is a match!
|
|
matches++;
|
|
// and scores
|
|
matchScore += score;
|
|
// and score of what we matched
|
|
uint32_t *val = (uint32_t *)ht.getValueFromSlot ( slot );
|
|
// he is hit too
|
|
matchScore += *val;
|
|
|
|
// remove it as we match it to deal with dups
|
|
if ( allowDups ) {
|
|
// once we match it once, do not match again, score was
|
|
// already accumulated
|
|
ht.setValue ( slot , &zero );
|
|
}
|
|
else {
|
|
// otherwise, remove this dup and try to match any
|
|
// remaining dups in the table
|
|
ht.removeSlot ( slot );
|
|
}
|
|
}
|
|
|
|
// if after subtracting query terms we got no hits, return 0.framesets?
|
|
if ( useScores && totalScore == 0 ) return 0;
|
|
if ( total == 0 ) return 0;
|
|
// . what is the max possible score we coulda had?
|
|
// . subtract the vector components that matched a query term
|
|
float percent = 100 * (float)matchScore / (float)totalScore;
|
|
//if ( useScores)percent = 100 * (float)matchScore / (float)totalScore;
|
|
//else percent = 100 * (float)matches / (float)total;
|
|
// sanity
|
|
//if ( percent > 100 ) percent = 100;
|
|
if ( percent > 100 ) { char *xx=NULL;*xx=0; }
|
|
|
|
return percent;
|
|
}
|
|
|
|
// this returns true if the two vecs are "percentSimilar" or more similar
|
|
bool isSimilar_sorted ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
int32_t nv0 , // how many int32_ts in vec?
|
|
int32_t nv1 , // how many int32_ts in vec?
|
|
// they must be this similar or more to return true
|
|
int32_t percentSimilar,
|
|
int32_t niceness ) {
|
|
// if both empty, assume not similar at all
|
|
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
|
|
// if either is empty, return 0 to be on the safe side
|
|
if ( *vec0 == 0 ) return 0;
|
|
if ( *vec1 == 0 ) return 0;
|
|
|
|
// do not include last 0
|
|
nv0--;
|
|
nv1--;
|
|
int32_t total = nv0 + nv1;
|
|
|
|
// so if the "noMatched" count ever EXCEEDS (not equals) this
|
|
// "brink" we can bail early because there's no chance of getting
|
|
// the similarity "percentSimilar" provided. should save some time.
|
|
int32_t brink = ((100-percentSimilar) * total) / 100;
|
|
|
|
// scan each like doing a merge
|
|
int32_t *p0 = vec0;
|
|
int32_t *p1 = vec1;
|
|
int32_t yesMatched = 0;
|
|
int32_t noMatched = 0;
|
|
|
|
mergeLoop:
|
|
|
|
// stop if both exhausted. we didn't bail on brink, so it's a match
|
|
if ( *p0 == 0 && *p1 == 0 )
|
|
return true;
|
|
|
|
if ( *p0 < *p1 || *p1 == 0 ) {
|
|
p0++;
|
|
if ( ++noMatched > brink ) return false;
|
|
goto mergeLoop;
|
|
}
|
|
|
|
if ( *p1 < *p0 || *p0 == 0 ) {
|
|
p1++;
|
|
if ( ++noMatched > brink ) return false;
|
|
goto mergeLoop;
|
|
}
|
|
|
|
yesMatched += 2;
|
|
p1++;
|
|
p0++;
|
|
goto mergeLoop;
|
|
}
|
|
|
|
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
|
|
|
|
if ( m_dupHashValid ) return &m_dupHash;
|
|
uint32_t *h1 = getTagPairHash32();
|
|
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;
|
|
|
|
uint32_t *h2 = getGigabitVectorScorelessHash ( ) ;
|
|
if ( ! h2 || h2 == (uint32_t *)-1 ) return (uint64_t *)h2;
|
|
|
|
//uint64_t h2b = (uint64_t)*h2;
|
|
|
|
m_dupHash = hash64 ( (uint64_t)*h1 , (uint64_t)*h2 );
|
|
m_dupHashValid = true;
|
|
return &m_dupHash;
|
|
}
|
|
|
|
int64_t *XmlDoc::getExactContentHash64 ( ) {
|
|
|
|
if ( m_exactContentHash64Valid )
|
|
return &m_exactContentHash64;
|
|
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1) return (int64_t *)u8;
|
|
|
|
|
|
// if (m_docId==88581116800LL)
|
|
// log("got article1 diffbot");
|
|
// if (m_docId==201689682865LL)
|
|
// log("got article11 diffbot");
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if we are diffbot, then do not quite do an exact content hash.
|
|
// there is a "url:" field in the json that changes. so we have
|
|
// to exclude that field. otherwise getDupList() spider time dedup
|
|
// detection will fail the TestDuplicateContent.testDuplicate smoketest
|
|
if ( cr->m_isCustomCrawl == 1 && m_isDiffbotJSONObject ) {
|
|
int32_t *ch32 = getContentHashJson32();
|
|
if ( ! ch32 || ch32 == (void *)-1 ) return (int64_t *)ch32;
|
|
m_exactContentHash64Valid = true;
|
|
m_exactContentHash64 = (uint64_t)(uint32_t)*ch32;
|
|
return &m_exactContentHash64;
|
|
}
|
|
|
|
|
|
unsigned char *p = (unsigned char *)*u8;
|
|
|
|
int32_t plen = size_utf8Content;
|
|
if ( plen > 0 ) plen--;
|
|
|
|
// sanity
|
|
//if ( ! p ) return 0LL;
|
|
//if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
unsigned char *pend = (unsigned char *)p + plen;
|
|
uint64_t h64 = 0LL;
|
|
unsigned char pos = 0;
|
|
bool lastWasSpace = true;
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// treat sequences of white space as a single ' ' (space)
|
|
if ( is_wspace_a(*p) ) {
|
|
if ( lastWasSpace ) continue;
|
|
lastWasSpace = true;
|
|
// treat all white space as a space
|
|
h64 ^= g_hashtab[pos][(unsigned char)' '];
|
|
pos++;
|
|
continue;
|
|
}
|
|
lastWasSpace = false;
|
|
// xor this in right
|
|
h64 ^= g_hashtab[pos][p[0]];
|
|
pos++;
|
|
}
|
|
|
|
m_exactContentHash64Valid = true;
|
|
m_exactContentHash64 = h64;
|
|
return &m_exactContentHash64;
|
|
}
|
|
|
|
|
|
RdbList *XmlDoc::getDupList ( ) {
|
|
if ( m_dupListValid ) return &m_dupList;
|
|
|
|
// until we start using posdb and not indexdb, just return an
|
|
// empty list.
|
|
// TODO: MDW fix the deduping.
|
|
//m_dupList.reset();
|
|
//m_dupListValid = true;
|
|
//return &m_dupList;
|
|
//
|
|
// end temp hack
|
|
//
|
|
|
|
//uint64_t *dh = getDupHash ( );
|
|
//if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
int64_t *ph64 = getExactContentHash64();
|
|
//int64_t *ph64 = getLooseContentHash64();
|
|
if ( ! ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64;
|
|
|
|
// must match term in XmlDoc::hashVectors()
|
|
char qbuf[256];
|
|
snprintf(qbuf, 256, "%" UINT64 "",*ph64);
|
|
int64_t pre = hash64b ( "gbcontenthash" , 0LL );
|
|
int64_t rawHash = hash64b ( qbuf , 0LL );
|
|
int64_t termId = hash64 ( rawHash , pre );
|
|
// get the startkey, endkey for termlist
|
|
key144_t sk ;
|
|
key144_t ek ;
|
|
g_posdb.makeStartKey ( &sk,termId ,0);
|
|
g_posdb.makeEndKey ( &ek,termId ,MAX_DOCID);
|
|
// note it
|
|
log(LOG_DEBUG,"build: check termid=%" UINT64 " for docid %" UINT64 ""
|
|
,(uint64_t)(termId&TERMID_MASK)
|
|
,m_docId);
|
|
// assume valid now
|
|
m_dupListValid = true;
|
|
// this is a no-split lookup by default now
|
|
if ( ! m_msg0.getList ( -1 , // hostId
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxCacheAge
|
|
false , // add to cache?
|
|
RDB_POSDB, // INDEXDB ,
|
|
cr->m_collnum,
|
|
&m_dupList ,
|
|
(char *)&sk ,
|
|
(char *)&ek ,
|
|
606006 , // minRecSizes in bytes
|
|
m_masterState , // state
|
|
m_masterLoop ,
|
|
m_niceness ,
|
|
true , // error correction?
|
|
true , // include tree?
|
|
true , // domerge?
|
|
-1 , // firsthosti
|
|
0 , // startfilenum
|
|
-1, // # files
|
|
// never timeout when spidering in case
|
|
// a host is down.
|
|
9999977 , // timeout
|
|
-1 , // syncpoint
|
|
-1 , // preferlocal reads
|
|
NULL, // msg5
|
|
NULL, // msg5b
|
|
false , // isRealMerge
|
|
true , // allow page cache
|
|
false , // forcelocalindexdb
|
|
true ) ) // shardByTermId? THIS IS DIFFERENT!!!
|
|
// return -1 if this blocks
|
|
return (RdbList *)-1;
|
|
// assume valid!
|
|
m_dupListValid = true;
|
|
return &m_dupList;
|
|
}
|
|
|
|
|
|
// moved DupDetector.cpp into here...
|
|
char *XmlDoc::getIsDup ( ) {
|
|
if ( m_isDupValid ) return &m_isDup;
|
|
// assume we are not a dup
|
|
m_isDup = false;
|
|
// get it
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// skip if we should
|
|
if ( ! cr->m_dedupingEnabled ||
|
|
// bulk jobs never dedup
|
|
cr->m_isCustomCrawl == 2 ) {
|
|
m_isDupValid = true;
|
|
return &m_isDup;
|
|
}
|
|
|
|
// if &links was given in the diffbot api url then do not do
|
|
// spider time deduping because the pages are likely rendered using
|
|
// javascript, so they'd all seem to be dups of one another.
|
|
if ( cr->m_isCustomCrawl ) {
|
|
SafeBuf *au = getDiffbotApiUrl();
|
|
if ( ! au || au == (void *)-1 ) return (char *)au;
|
|
char *linksParm = NULL;
|
|
if ( au->length() > 0 )
|
|
linksParm = strstr ( au->getBufStart() , "&links");
|
|
if ( ! linksParm && au->length() > 0 )
|
|
linksParm = strstr ( au->getBufStart() , "?links");
|
|
if ( linksParm && linksParm[6] && linksParm[6] != '&' )
|
|
linksParm = NULL;
|
|
if ( linksParm ) {
|
|
m_isDupValid = true;
|
|
m_isDup = false;
|
|
return &m_isDup;
|
|
}
|
|
}
|
|
|
|
// do not dedup seeds
|
|
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
|
if ( cr->m_isCustomCrawl && isSeed ) {
|
|
m_isDupValid = true;
|
|
m_isDup = false;
|
|
return &m_isDup;
|
|
}
|
|
|
|
|
|
setStatus ( "checking for dups" );
|
|
|
|
// BUT if we are already indexed and a a crawlbot/bulk diffbot job
|
|
// then do not kick us out just because another indexed doc is
|
|
// a dup of us because it messes up the TestOnlyProcessIfNew smoketests
|
|
// because in the 2nd round we end up deleting article1.html after
|
|
// indexing it in the first round, then we add article11.html's
|
|
// diffbot reply in the 2nd round because article1.html and its
|
|
// diffbot reply was deleted. thereby giving it a new timestamp and
|
|
// making the smoke fail.
|
|
if ( cr->m_isCustomCrawl ) {
|
|
char *isIndexed = getIsIndexed();
|
|
if ( ! isIndexed || isIndexed == (char *)-1)
|
|
return (char *)isIndexed;
|
|
if ( *isIndexed ) {
|
|
m_isDupValid = true;
|
|
return &m_isDup;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//we need both vectors to be non-empty
|
|
//uint64_t *tv = getTagPairHash();
|
|
//if ( ! tv || tv == (uint64_t *)-1) return (char *)tv;
|
|
// get our docid
|
|
int64_t *mydocid = getDocId();
|
|
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
|
|
// get the duplist!
|
|
RdbList *list = getDupList();
|
|
if ( ! list || list == (RdbList *)-1 ) return (char *)list;
|
|
|
|
// sanity. must be posdb list.
|
|
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
|
|
|
|
// so getSiteRank() does not core
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
|
|
// . see if there are any pages that seem like they are dups of us
|
|
// . they must also have a HIGHER score than us, for us to be
|
|
// considered the dup
|
|
//if ( ! m_didQuickDupCheck ) {
|
|
// // do not repeat
|
|
// m_didQuickDupCheck = true;
|
|
|
|
|
|
int32_t myRank = getSiteRank ( );
|
|
|
|
// init
|
|
//uint8_t maxScore = 0;
|
|
//uint8_t myScore = 0;
|
|
//char maxSiteRank = -1;
|
|
//int64_t maxDocId = -1LL;
|
|
// assume not a dup
|
|
m_isDup = false;
|
|
// get the docid that we are a dup of
|
|
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
//int64_t d = list->getCurrentDocId();
|
|
char *rec = list->getCurrentRec();
|
|
// get the docid
|
|
int64_t d = g_posdb.getDocId ( rec );
|
|
// get the score
|
|
//uint8_t score = list->getCurrentScore();
|
|
// just let the best site rank win i guess?
|
|
// even though one page may have more inlinks???
|
|
char sr = (char )g_posdb.getSiteRank ( rec );
|
|
// skip if us!
|
|
//if ( d == *getDocId() ) {
|
|
// // record our score
|
|
// //myScore = score;
|
|
// mySiteRank = sr;
|
|
// continue;
|
|
//}
|
|
|
|
// skip if us
|
|
if ( d == m_docId ) continue;
|
|
|
|
// for debug
|
|
//if ( d != m_docId )
|
|
//log("build: doc %s is dup of docid %" INT64 "",
|
|
// m_firstUrl.m_url,d);
|
|
|
|
// if his rank is <= ours then he was here first and we
|
|
// are the dup i guess...
|
|
if ( sr >= myRank ) {
|
|
log("build: doc %s is dup of docid %" INT64 "",
|
|
m_firstUrl.m_url,d);
|
|
m_isDup = true;
|
|
m_isDupValid = true;
|
|
m_docIdWeAreADupOf = d;
|
|
return &m_isDup;
|
|
}
|
|
|
|
// get the winner
|
|
//if ( score > maxScore ) maxScore = score;
|
|
//if ( sr > maxSiteRank || maxSiteRank == -1 ) {
|
|
// maxSiteRank = sr;
|
|
// maxDocId = d;
|
|
// continue;
|
|
//}
|
|
//if ( sr < maxSiteRank ) continue;
|
|
// fallback to docid?
|
|
// do it first come first server othereise i guess
|
|
// this will prevent dups from existing in the index at least
|
|
// if they have the same siterank...
|
|
//if ( d < maxDocId ) {
|
|
// maxDocId = d;
|
|
// continue;
|
|
//}
|
|
}
|
|
// are we the highest scoring doc with this template?
|
|
// corollary: if all dups have equal scores they will be
|
|
// removed until there is only one doc that matches the pattern
|
|
//if ( myScore >= maxScore ) {
|
|
//if ( maxDocId >= 0 && maxDocId != *mydocid && out) {
|
|
// m_isDup = true;
|
|
// m_isDupValid = true;
|
|
// return &m_isDup;
|
|
//}
|
|
|
|
m_isDup = false;
|
|
m_isDupValid = true;
|
|
return &m_isDup;
|
|
|
|
/*
|
|
we now temporarily at least, do exact dup checking...
|
|
later we will bring in the fuzzy code...
|
|
|
|
// reset its ptr for stuff below
|
|
list->resetListPtr();
|
|
|
|
loop:
|
|
// . get a title rec for the current docid
|
|
// . but if exhausted, we are not a dup!
|
|
if ( list->isExhausted() ) { m_isDupValid = true; return &m_isDup; }
|
|
// get the docid
|
|
int64_t d = list->getCurrentDocId();
|
|
// continue if us!
|
|
if ( d == *mydocid ) { list->skipCurrentRecord(); goto loop; }
|
|
// is this a dup of us?
|
|
char *dup = isDupOfUs ( d );
|
|
if ( ! dup || dup == (char *)dup ) return (char *)dup;
|
|
// if dup of us, bail out
|
|
if ( *dup ) { m_isDup = true; m_isDupValid = true; return &m_isDup; }
|
|
// prepare for next
|
|
list->skipCurrentRecord();
|
|
// loop up
|
|
goto loop;
|
|
*/
|
|
}
|
|
|
|
char *XmlDoc::isDupOfUs ( int64_t d ) {
|
|
// sanity check
|
|
if ( d <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// get our current title rec
|
|
SafeBuf *tr = getTitleRecBuf();
|
|
if ( ! tr || tr == (void *)-1 ) return (char *)tr;
|
|
// we should not be here if we know we are a dup of another doc
|
|
if ( m_isDup ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// get the title rec for this docid if we haven't yet done so
|
|
if ( m_calledMsg22d != d ) { // .m_docId != d ) {
|
|
bool s;
|
|
// note it
|
|
setStatus ( "getting possible dup title rec" );
|
|
// do not re-call
|
|
m_calledMsg22d = d;
|
|
// get the guy that might be a dup of us
|
|
s = m_msg22d.getTitleRec ( &m_msg22Request ,
|
|
NULL ,
|
|
d ,
|
|
cr->m_coll ,
|
|
&m_dupTrPtr ,
|
|
&m_dupTrSize ,
|
|
false , // just check tfndb?
|
|
false , // getAvailDocIdOnly
|
|
m_masterState, // state
|
|
m_masterLoop , // callback
|
|
m_niceness ,
|
|
false , // add to cache
|
|
60*60*24 , // maxcacheage
|
|
999999 );// timeout
|
|
// we blocked
|
|
if ( ! s ) return (char *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
// if not there do not count as an error
|
|
if ( ! m_dupTrPtr ) { g_errno = 0; return &m_isDup; }
|
|
// ignore any errors too i guess...
|
|
if ( m_msg22d.m_errno ) {
|
|
log(LOG_WARN, "build: Dup Detection error with "
|
|
"titlerec fetch: %s",mstrerror(m_msg22d.m_errno));
|
|
g_errno = 0;
|
|
return &m_isDup;
|
|
}
|
|
// we need to parse this potential dup doc
|
|
XmlDoc dd;
|
|
// . parse the possible dup title rec into another XmlDoc class
|
|
// . it returns false and sets g_errno on error
|
|
if ( ! dd.set2 ( m_dupTrPtr ,
|
|
m_dupTrSize ,
|
|
cr->m_coll ,
|
|
NULL , // m_pbuf ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
|
|
LinkInfo *info1a = dd.getLinkInfo1();
|
|
LinkInfo *info1b = getLinkInfo1();
|
|
float pageNumInlinksA = info1a->m_numGoodInlinks;//getNumInlinksExtrapolated();
|
|
float pageNumInlinksB = info1b->m_numGoodInlinks;//getNumInlinksExtrapolated();
|
|
|
|
// . if the old dup doc is of lower quality than the new doc that
|
|
// we are checking, then that one should be removed, not us!
|
|
// if they are equal, we keep the int16_ter url of the two
|
|
// . dd was set from title rec so these numInlinks should be taken
|
|
// from the TagRec in ptr_tagRecData, and therefore NOT BLOCK!
|
|
if ( *dd.getSiteNumInlinks() < *getSiteNumInlinks() )
|
|
return &m_isDup;
|
|
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
|
|
pageNumInlinksA < pageNumInlinksB )
|
|
return &m_isDup;
|
|
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
|
|
pageNumInlinksA == pageNumInlinksB &&
|
|
dd.getFirstUrl()->getUrlLen() > getFirstUrl()->getUrlLen())
|
|
return &m_isDup;
|
|
|
|
float *ts = getTagSimilarity ( &dd );
|
|
if ( ! ts || ts == (float *)-1 ) return (char *)ts;
|
|
float *gs = getGigabitSimilarity ( &dd );
|
|
if ( ! gs || gs == (float *)-1 ) return (char *)gs;
|
|
float *ps = getPageSimilarity ( &dd );
|
|
if ( ! ps || ps == (float *)-1 ) return (char *)ps;
|
|
|
|
int32_t gigabitVecSimilarity = (int32_t)*gs;
|
|
int32_t tagVecSimilarity = (int32_t)*ts;
|
|
int32_t sampleVecSimilarity = (int32_t)*ps;
|
|
|
|
int32_t notSimilarCount = 0;
|
|
if ( gigabitVecSimilarity < 80 ) {
|
|
notSimilarCount++;
|
|
if ( gigabitVecSimilarity < 50 ) return &m_isDup;
|
|
}
|
|
if ( tagVecSimilarity < 80 ) {
|
|
notSimilarCount++;
|
|
if ( tagVecSimilarity < 50 ) return &m_isDup;
|
|
}
|
|
if ( sampleVecSimilarity < 80 ) {
|
|
notSimilarCount++;
|
|
if ( sampleVecSimilarity < 50 ) return &m_isDup;
|
|
}
|
|
// if it is similar enough, we got a dup!
|
|
if ( notSimilarCount <= 0 ) { m_isDupValid = true; m_isDup = true; }
|
|
|
|
return &m_isDup;
|
|
}
|
|
|
|
// hash a gigabit hash vector without its scores, also order independent
|
|
uint32_t *XmlDoc::getGigabitVectorScorelessHash ( ) {
|
|
if ( m_gigabitVectorHashValid ) return &m_gigabitVectorHash;
|
|
int32_t **gbvec = getGigabitHashes();
|
|
if ( ! gbvec || gbvec == (int32_t **)-1 ) return (uint32_t *)gbvec;
|
|
uint32_t h = 0;
|
|
// this bad boy is NULL terminated
|
|
uint32_t *gbv = (uint32_t *)*gbvec;
|
|
// i guess zak likes the simple XOR'ing thing...
|
|
for ( int32_t i = 0; gbv && gbv[i] ; i++) h ^= gbv[i];
|
|
m_gigabitVectorHashValid = true;
|
|
m_gigabitVectorHash = h;
|
|
return &m_gigabitVectorHash;
|
|
}
|
|
|
|
// . the original vector used for deduping similar search results is just from
|
|
// a random sample of indexed terms, but the gigabit vector is
|
|
// formed using the hashes of the top-scoring gigabits of the document, and
|
|
// therefore uses the words class
|
|
// . sets g_errno and returns NULL on error
|
|
// . ptr_gigabitHashes can be NULL...
|
|
int32_t **XmlDoc::getGigabitHashes ( ) {
|
|
// if it was already set, treat this as an accessor
|
|
if ( m_gigabitHashesValid ) return &ptr_gigabitHashes;
|
|
// this also sets the vector
|
|
char *gq = getGigabitQuery();
|
|
if ( ! gq || gq == (char *)-1) return (int32_t **)gq;
|
|
// it should be valid now!
|
|
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
|
|
return &ptr_gigabitHashes;
|
|
}
|
|
|
|
// . the new function to get gigabits
|
|
// . sets and validates m_gigabitQuery[] and m_gigabitHashes[] among others
|
|
// . candidates = capitalized word, capitalized sequence of words,
|
|
// uncapitalized 2+ word wikipedia phrase.
|
|
// . candidates exclude uncapitalized query stop words.
|
|
// . calls addGigabits() which is called by each doc in search results
|
|
// when we use this at query time.
|
|
// . separates gigabits with a comma (delimiter) in m_gigabitQuery[]
|
|
// . quotes multiple word gigabits
|
|
char *XmlDoc::getGigabitQuery ( ) {
|
|
|
|
if ( m_gigabitQueryValid ) return m_gigabitQuery;
|
|
|
|
setStatus ( "getting gigabit query" );
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss == (Sections *)-1 ) return (char *)ss;
|
|
//Weights *we = getWeights();
|
|
//if ( ! we || we == (Weights *)-1 ) return (char *)we;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
|
|
|
|
HashTableX ht;
|
|
char buf [ 200000 ];
|
|
// pass in niceness in case it has to grow really big and re-hash all!!
|
|
ht.set ( 8 , 4 , -1 , buf , 200000 , false, m_niceness,"xmlgbtbl");
|
|
|
|
// . add gigabits from our body words
|
|
// . includes title and header tags so pts can work well!
|
|
if ( ! addGigabits ( ww , *d , ss , *langId ) ) return NULL;
|
|
|
|
// add gigabits from link info
|
|
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
|
// sanity check
|
|
char *txt = k->getLinkText();
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 0 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
// add those in
|
|
if (!addGigabits(txt, *d, *langId ) ) return NULL;
|
|
// add in neighborhoods
|
|
if(!addGigabits(k->getSurroundingText(),*d,*langId))
|
|
return NULL;
|
|
}
|
|
|
|
// add in gigabits for meta description
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription( &mdlen );
|
|
if ( ! addGigabits2 ( md , mdlen, *d , *langId ) ) return NULL;
|
|
|
|
// add in gigabits for meta keywords
|
|
int32_t mklen;
|
|
char *mk = getMetaKeywords( &mklen );
|
|
if ( ! addGigabits2 ( mk , mklen , *d , *langId ) ) return NULL;
|
|
|
|
// set m_gigabitQuery and m_gigabitScores
|
|
//GigabitInfo *top[100];
|
|
// fill in "top" in order of score
|
|
m_numTop = getTopGigabits ( &ht , m_top , 100 , 0 );
|
|
// error? then g_errno should be set
|
|
if ( m_numTop == -1 ) return NULL;
|
|
|
|
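// . m_gigabitQuery ends up as a comma-separated list of quoted gigabits,
//   e.g. "san francisco","giants",...
// . m_gigabitHashes[] and m_gigabitScores[] are parallel arrays holding
//   each gigabit's 32-bit hash and score, and both get NULL terminated
//   below; m_gigabitPtrs[] points at each gigabit's text in the query buf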
char *p = m_gigabitQuery;
|
|
char *pend = m_gigabitQuery + XD_GQ_MAX_SIZE - 1;
|
|
// reset count of vector components for setting gigabit vector
|
|
int32_t ng = 0;
|
|
// total score
|
|
//int32_t total = 0;
|
|
// . now set the gigabit query!
|
|
// . start with the highest scoring node first, the last node since
|
|
// nodes are ranked by lowest to highest key
|
|
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
|
|
// get the info
|
|
GigabitInfo *gi = m_top[i];
|
|
// skip this gigabit if it would overflow the query buffer
|
|
if ( p + gi->m_len + 10 >= pend ) continue;
|
|
// get 32 bit hash
|
|
uint32_t h = gi->m_hash & 0xffffffff;
|
|
// never allow 0
|
|
if ( h == 0 ) h = 1;
|
|
// add to vector
|
|
if ( ng + 1 < XD_MAX_GIGABIT_HASHES ) {
|
|
// the term hash
|
|
m_gigabitHashes[ng] = (int32_t)h ;
|
|
// and the score
|
|
m_gigabitScores[ng] = gi->m_pts;
|
|
// point into it, where we will copy it to
|
|
m_gigabitPtrs [ng] = p + 1;
|
|
// advance
|
|
ng++;
|
|
}
|
|
// quote it
|
|
*p++ = '\"';
|
|
// write into buffer
gbmemcpy ( p , gi->m_ptr , gi->m_len );
// advance past the text we just copied
p += gi->m_len;
|
|
// finish quote
|
|
*p++ = '\"';
|
|
// separate terms just in case
|
|
//gbmemcpy ( p , " , ", 4 );
|
|
//p += 4;
|
|
*p++ = ',';
|
|
}
|
|
// done
|
|
*p++ = '\0';
|
|
// NULL terminate the vector to make it a legit vector
|
|
m_gigabitHashes [ ng ] = 0;
|
|
m_gigabitScores [ ng ] = 0;
|
|
|
|
// include the terminating 0
|
|
ng++;
|
|
// validate both the query and vector
|
|
m_gigabitQueryValid = true;
|
|
m_gigabitHashesValid = true;
|
|
// set this too
|
|
ptr_gigabitHashes = m_gigabitHashes;
|
|
ptr_gigabitScores = m_gigabitScores;
|
|
size_gigabitHashes = ng * 4 ; // 4 bytes each component
|
|
size_gigabitScores = ng * 4 ; // 4 bytes each score
|
|
return m_gigabitQuery;
|
|
}
|
|
|
|
|
|
// . fill in "top" in order of score
|
|
// . returns -1 and sets g_errno on error
|
|
int32_t getTopGigabits ( HashTableX *ht ,
|
|
GigabitInfo **top ,
|
|
int32_t max ,
|
|
int32_t minDocCount ) {
|
|
|
|
|
|
// store top 100 into this tree
|
|
RdbTree tree;
|
|
if ( ! tree.set ( 4 , // fixedDataSize
|
|
max+2 , // maxNumNodes
|
|
true , // balance?
|
|
-1 , // maxMem
|
|
true , // own data?
|
|
"tree-topgbits" ))
|
|
return -1;
|
|
|
|
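// . scan every used slot in the hash table and insert each qualifying
//   gigabit into the tree keyed by its points; once the tree is full we
//   kick out the lowest-scoring node, so it acts as a bounded top-N list
//   that we read back below from highest to lowest score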
int32_t ns = ht->getNumSlots();
|
|
key_t minKey;
|
|
bool minKeyValid = false;
|
|
for ( int32_t i = 0 ; i < ns ; i++ ) {
|
|
// skip if empty
|
|
if ( ht->isEmpty(i) ) continue;
|
|
// get his info
|
|
GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i);
|
|
// must be valid
|
|
if ( gi->m_count <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// must be in this many docs minimum
|
|
if ( gi->m_numDocs < minDocCount ) continue;
|
|
// make the key
|
|
key_t key;
|
|
key.n1 = gi->m_pts;
|
|
key.n0 = gi->m_hash;
|
|
// should we add it?
|
|
if ( minKeyValid && key <= minKey ) continue;
|
|
// we should add it. use points as the key. use PTR as data
|
|
int32_t node = tree.addNode(0,key,(char *)&gi,4);
|
|
// error? g_errno should be set
|
|
if ( node < 0 ) return -1;
|
|
// if not full continue
|
|
if ( tree.getNumUsedNodes() < max ) continue;
|
|
// get the smallest node
|
|
int32_t tn = tree.getLowestNode ( ) ;
|
|
// sanity check
|
|
if ( tn < 0 ) { char *xx=NULL;*xx=0; }
|
|
// kick out smallest
|
|
tree.deleteNode3 ( tn , false );
|
|
// get new smallest
|
|
tn = tree.getLowestNode();
|
|
// set the new minkey
|
|
minKey = *(key_t *)tree.getKey ( tn );
|
|
// validate it
|
|
minKeyValid = true;
|
|
}
|
|
int32_t count = 0;
|
|
// . now set the array
|
|
// . start with the highest scoring node first, the last node since
|
|
// nodes are ranked by lowest to highest key
|
|
for ( int32_t nn=tree.getLastNode() ; nn>=0 ; nn=tree.getPrevNode(nn) ){
|
|
// get the info
|
|
GigabitInfo *gi = (GigabitInfo *)tree.getData(nn);
|
|
// store it
|
|
top[count++] = gi;
|
|
// stop if we are full
|
|
if ( count >= max ) break;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
char *XmlDoc::getMetaDescription( int32_t *mdlen ) {
|
|
if ( m_metaDescValid ) {
|
|
*mdlen = m_metaDescLen;
|
|
return m_metaDesc;
|
|
}
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
//xml->getMetaContent ( m_metaDesc, 1024, "description", 11 );
|
|
// we need to point to it in the html source so our WordPosInfo
|
|
// algo works right.
|
|
m_metaDesc = xml->getMetaContentPointer("description",
|
|
11,
|
|
"name",
|
|
&m_metaDescLen);
|
|
*mdlen = m_metaDescLen;
|
|
m_metaDescValid = true;
|
|
return m_metaDesc;
|
|
}
|
|
|
|
char *XmlDoc::getMetaSummary ( int32_t *mslen ) {
|
|
if ( m_metaSummaryValid ) {
|
|
*mslen = m_metaSummaryLen;
|
|
return m_metaSummary;
|
|
}
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
m_metaSummary = xml->getMetaContentPointer("summary",
|
|
7,
|
|
"name",
|
|
&m_metaSummaryLen);
|
|
*mslen = m_metaSummaryLen;
|
|
m_metaSummaryValid = true;
|
|
return m_metaSummary;
|
|
}
|
|
|
|
char *XmlDoc::getMetaKeywords( int32_t *mklen ) {
|
|
if ( m_metaKeywordsValid ) {
|
|
*mklen = m_metaKeywordsLen;
|
|
return m_metaKeywords;
|
|
}
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
//xml->getMetaContent ( m_metaKeywords, 1024, "keywords", 8 );
|
|
// we need to point to it in the html source so our WordPosInfo
|
|
// algo works right.
|
|
m_metaKeywords=xml->getMetaContentPointer("keywords",
|
|
8,
|
|
"name",
|
|
&m_metaKeywordsLen);
|
|
*mklen = m_metaKeywordsLen;
|
|
m_metaKeywordsValid = true;
|
|
return m_metaKeywords;
|
|
}
|
|
|
|
bool XmlDoc::addGigabits ( char *s ,
|
|
int64_t docId ,
|
|
uint8_t langId ) {
|
|
Words tmp;
|
|
// skip if none
|
|
if ( ! s ) return true;
|
|
// returns NULL with g_errno set on error
|
|
if ( ! tmp.set9 ( s , m_niceness ) ) return false;
|
|
// and weights!
|
|
//Weights we;
|
|
//if ( ! we.set ( &tmp , )
|
|
// and so does this
|
|
return addGigabits ( &tmp , docId , NULL , langId );
|
|
}
|
|
|
|
bool XmlDoc::addGigabits2 ( char *s ,
|
|
int32_t slen,
|
|
int64_t docId ,
|
|
uint8_t langId ) {
|
|
Words tmp;
|
|
// skip if none
|
|
if ( ! s ) return true;
|
|
// returns NULL with g_errno set on error
|
|
if ( ! tmp.setx ( s , slen , m_niceness ) ) return false;
|
|
// and weights!
|
|
//Weights we;
|
|
//if ( ! we.set ( &tmp , )
|
|
// and so does this
|
|
return addGigabits ( &tmp , docId , NULL , langId );
|
|
}
|
|
|
|
bool XmlDoc::addGigabits(Words *ww,int64_t docId,Sections *sections,
|
|
uint8_t langId ) {
|
|
// skip sections marked as these:
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
|
|
// get this
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
// not if we don't have any identified sections
|
|
if ( sections && sections->m_numSections <= 0 ) sp = NULL;
|
|
// shortcuts
|
|
int64_t *wids = ww->m_wordIds;
|
|
char **wptrs = ww->m_words;
|
|
int32_t *wlens = ww->m_wordLens;
|
|
nodeid_t *tids = ww->m_tagIds;
|
|
int32_t nw = ww->getNumWords();
|
|
//int32_t flags;
|
|
// initial # of slots
|
|
int32_t is = 0;
|
|
if ( m_wordsValid ) is = ww->m_numAlnumWords;
|
|
// put gigabits into this hash table
|
|
HashTableX ht;
|
|
if ( ! ht.set ( 8 , sizeof(GigabitInfo),is,NULL,0,false,m_niceness,
|
|
"gigabits") )
|
|
return false;
|
|
// scan through the words
|
|
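// . each candidate gigabit is either a capitalized single word, a
//   capitalized sequence of words, or a 2+ word wikipedia phrase, as
//   described above getGigabitQuery(); lower case query stop words are
//   skipped outright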
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe if being called by spider
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// get section
|
|
Section *sx = NULL;
|
|
// get flags
|
|
if ( sp ) sx = sp[i];//flags = sp[i]->m_flags;
|
|
//else flags = 0;
|
|
// skip if ignored. i.e. in the menu or not in the article text
|
|
//if ( flags & badFlags ) continue;
|
|
// are we capitalized?
|
|
bool cap = ww->isCapitalized(i);
|
|
// ignore lower case query stop words
|
|
if (!cap&&isQueryStopWord(wptrs[i],wlens[i],wids[i],langId))
|
|
continue;
|
|
// hash of word then the phrase
|
|
//uint32_t h = wids[i] & 0xffffffff;
|
|
//uint64_t h = wids[i];
|
|
// add the word itself. return NULL with g_errno set on error
|
|
if ( ! addGigabit (&ht,wptrs[i],wlens[i],docId,
|
|
sx,true,langId,i)) return false;
|
|
// save position
|
|
int32_t j = i + 1 ;
|
|
// check this far out
|
|
int32_t maxj = i + 12; if ( maxj > nw ) maxj = nw;
|
|
// do we got a cap phrase?
|
|
bool capPhrase = false;
|
|
// if capitalized look for sequence
|
|
for ( ; cap && j < maxj ; j++ ) {
|
|
// . stop on tags
|
|
// . tids is NULL if being set from meta tag...
|
|
if ( tids && tids[j] ) break;
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) {
|
|
// make sure it is like a single space or
|
|
// something we can "phrase across"
|
|
// TODO: can be like "capt. "
|
|
if ( wlens[j] == 1 ) continue;
|
|
// otherwise it stops the phrase
|
|
break;
|
|
}
|
|
// if not capitalized stop
|
|
if ( ! ww->isCapitalized(j) ) break;
|
|
// got one!
|
|
capPhrase = true;
|
|
// . hash it into the ongoing hash
|
|
// . Speller::getPopularity() should use this same
|
|
// method so we can get popularities of the gigabits!
|
|
//h = hash32Fast ( wids[j] & 0xffffffff , h );
|
|
//h = hash64Fast ( wids[j] , h );
|
|
}
|
|
// if we added something... skip whole phrase, if any
|
|
if ( capPhrase ) {
|
|
// get length of it
|
|
int32_t len = wptrs[j-1] + wlens[j-1] - wptrs[i];
|
|
// add that entire sequence, [i,j)
|
|
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,
|
|
false,langId,i)) return false;
|
|
// advance to end of phrase
|
|
i = j - 1;
|
|
continue;
|
|
}
|
|
// reset
|
|
j = i + 1;
|
|
// this must be true
|
|
// . ok, look for a wiki phrase then!
|
|
// . we can speed this up if too slow... using a crazy hash tbl
|
|
int32_t wikij = -1;
|
|
// init the hash for wiki lookup
|
|
uint32_t h = 0;
|
|
// loop over successive terms
|
|
for ( ; j < maxj ; j++ ) {
|
|
// . stop on tags
|
|
// . tids is NULL if being set from meta tag
|
|
if ( tids && tids[j] ) break;
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) {
|
|
// make sure it is like a single space or
|
|
// something we can "phrase across"
|
|
// TODO: can be like "capt. "
|
|
if ( wlens[j] == 1 ) continue;
|
|
// otherwise it stops the phrase
|
|
break;
|
|
}
|
|
// init it
|
|
if ( ! h ) h = hash32Fast ( wids[i] & 0xffffffff , 0 );
|
|
// hash it into the ongoing hash
|
|
h = hash32Fast ( wids[j] & 0xffffffff , h );
|
|
// is this in the wiki?
|
|
if ( ! g_wiki.isInWiki ( h ) ) continue;
|
|
// it is, mark it
|
|
wikij = j + 1;
|
|
}
|
|
|
|
// must be a 2+ word phrase in the wiki to be a gigabit
|
|
if ( wikij == -1 ) continue;
|
|
// bail if breach
|
|
if ( wikij >= nw ) continue;
|
|
// get len
|
|
int32_t len = wptrs[wikij] + wlens[wikij] - wptrs[i];
|
|
// add what we got
|
|
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,false,
|
|
langId,i) ) return false;
|
|
// advance to end of phrase
|
|
i = wikij - 1;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
char* XmlDoc::getMetadata(int32_t* retlen) {
|
|
if(!m_hasMetadata) {
|
|
*retlen = 0;
|
|
return NULL;
|
|
}
|
|
|
|
*retlen = size_metadata;
|
|
return ptr_metadata;
|
|
|
|
}
|
|
|
|
// . this is called by Msg40.cpp to intersect gigabits from multiple docs
|
|
// . returns -1 and sets g_errno on error
|
|
// . returns # of GigabitInfos stored into "top"
|
|
/*
|
|
int32_t intersectGigabits ( Msg20 **mp , // search results
|
|
int32_t n , // # of em
|
|
uint8_t langId , // searcher's langId
|
|
int32_t maxTop ,
|
|
int32_t docsToScan ,
|
|
int32_t minDocCount , // must be in this # docs
|
|
GigabitInfo *top ,
|
|
int32_t niceness ) {
|
|
|
|
// put gigabits into this hash table
|
|
HashTableX ht;
|
|
ht.set ( 8 , sizeof(GigabitInfo),0,NULL,0,false,niceness,"ginttbl");
|
|
|
|
for ( int32_t i = 0 ; i < n && i < docsToScan ; i++ ) {
|
|
// get the reply/searchResult
|
|
Msg20Reply *mr = mp[i]->m_r;
|
|
// sanity check
|
|
if ( ! mr && ! mp[i]->m_errno ) { char *xx=NULL;*xx=0; }
|
|
// this is NULL on error
|
|
if ( ! mr ) continue;
|
|
// count them
|
|
int32_t count = 0;
|
|
// add each gigabit for it
|
|
for ( char *p = mr->ptr_gigabitQuery ; p && *p ; count++ ) {
|
|
// skip the comma
|
|
p++;
|
|
// point to next
|
|
char *end = strchr ( p , ',' );
|
|
// do not allow NULLs
|
|
if ( ! end ) end = p + gbstrlen(p);
|
|
// get the score. aka GigabitInfo::m_pts
|
|
int32_t ptsArg = mr->ptr_gigabitScores[count];
|
|
// sanity check for bad scores
|
|
if ( ptsArg <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// add it in
|
|
if ( ! addGigabit ( &ht ,
|
|
p ,
|
|
end - p , // length
|
|
mr->m_docId ,
|
|
NULL ,// section ptr
|
|
false , // singleWrd? unused
|
|
langId ,
|
|
-1 , // word #i not used
|
|
ptsArg ) )
|
|
return -1;
|
|
// advance p
|
|
p = end;
|
|
// if not comma, all done
|
|
if ( *p != ',' ) break;
|
|
// skip comma
|
|
p++;
|
|
}
|
|
}
|
|
|
|
// . get up to the top 50 gigabits
|
|
GigabitInfo *array [ 50 ];
|
|
int32_t numTop = getTopGigabits ( &ht , array , 50 , minDocCount );
|
|
// error? g_errno should be set
|
|
if ( numTop == -1 ) return -1;
|
|
// sanity check
|
|
if ( numTop > maxTop ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now copy into our array
|
|
for ( int32_t i = 0 ; i < numTop ; i++ ) {
|
|
// get it
|
|
GigabitInfo *gi = array[i];
|
|
// copy it
|
|
gbmemcpy ( &top[i] , gi , sizeof(GigabitInfo) );
|
|
}
|
|
// return how many we copied
|
|
return numTop;
|
|
}
|
|
*/
|
|
|
|
// . "docId" is the document Id that "h" came from
|
|
// . if being called at query time we often get called on each search result!
|
|
// . if being called at parse/index time we are being called on a single docId
|
|
// . returns false and sets g_errno on error
|
|
bool addGigabit ( HashTableX *ht ,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int64_t docId ,
|
|
Section *sp ,
|
|
bool singleWord ,
|
|
uint8_t langId ,
|
|
// starts with word #i
|
|
int32_t i ,
|
|
int32_t ptsArg ) {
|
|
// get its hash
|
|
uint64_t h = hash64d ( s , slen );
|
|
// get the slot where its at
|
|
int32_t slot = ht->getSlot ( &h );
|
|
// info for this hash/gigabit in the doc
|
|
GigabitInfo *gi ;
|
|
// otherwise, init a new slot. set the key to h
|
|
if ( slot < 0 ) {
|
|
// . add key to a new slot, set "gi" to the value ptr
|
|
// . use NULL for the GigabitInfo ptr temporarily so it should
|
|
// not gbmemcpy into the slot
|
|
if ( ! ht->addKey ( &h , NULL , &slot ) ) return false;
|
|
// get data ptr to the bogus data
|
|
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
|
|
// . set all the stuff now. this way avoids a gbmemcpy...
|
|
// . every wiki title should have a popularity i guess...
|
|
// . "pop" is # of docs out of 10,000 that have this phrase?
|
|
int32_t pop = g_speller.getPhrasePopularity(s,h,true,langId);
|
|
gi->m_pop = pop;
|
|
gi->m_pts = 0;
|
|
gi->m_count = 0;
|
|
gi->m_numDocs = 0;
|
|
gi->m_lastDocId = 0LL;
|
|
gi->m_currentDocCount = 0; // a char
|
|
gi->m_ptr = s;
|
|
gi->m_len = slen;
|
|
gi->m_hash = h;
|
|
// sanity test
|
|
GigabitInfo *tt = (GigabitInfo *)ht->getValue ( &h );
|
|
if ( tt->m_pop != pop ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
|
|
// only allow up to 5 votes per document!
|
|
if ( gi->m_currentDocCount >= 5 ) return true;
|
|
}
|
|
// inc the count, we got one more occurrence
|
|
gi->m_count++;
|
|
// doc count. how many docs have this gigabit? count it.
|
|
if ( docId != gi->m_lastDocId ) {
|
|
gi->m_numDocs++;
|
|
gi->m_lastDocId = docId;
|
|
gi->m_currentDocCount = 1;
|
|
}
|
|
else
|
|
gi->m_currentDocCount++;
|
|
|
|
// given?
|
|
if ( ptsArg != -1 ) {
|
|
gi->m_pts += ptsArg;
|
|
return true;
|
|
}
|
|
|
|
// base points on popularity
|
|
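// . rarer phrases score higher: a popularity of essentially zero is
//   worth 1000 points, sliding down to 2 points as popularity nears 20
//   (out of 10,000 docs) and just 1 point beyond that; the section flags
//   below then multiply the score for title, header and anchor text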
float pts = 1.0;
|
|
if ( gi->m_pop < 1 ) pts = 1000;
|
|
else if ( gi->m_pop < 2 ) pts = 500;
|
|
else if ( gi->m_pop < 3 ) pts = 250;
|
|
else if ( gi->m_pop < 4 ) pts = 200;
|
|
else if ( gi->m_pop < 5 ) pts = 150;
|
|
else if ( gi->m_pop < 6 ) pts = 100;
|
|
else if ( gi->m_pop < 7 ) pts = 20;
|
|
else if ( gi->m_pop < 8 ) pts = 10;
|
|
else if ( gi->m_pop < 10 ) pts = 5;
|
|
else if ( gi->m_pop < 15 ) pts = 3;
|
|
else if ( gi->m_pop < 20 ) pts = 2;
|
|
|
|
// . special boost if in title, header or anchor tag
|
|
// . the weights class ONLY boosts the first 20 or so words in
|
|
// header tags... how can we fix that??????????????????
|
|
// . TODO: FIX THAT!!!
|
|
//if ( flags & SEC_TITLE ) pts = pts * 6.0/(float)we->m_titleWeight;
|
|
//if ( flags & SEC_HEADER) pts = pts * 4.0/(float)we->m_headerWeight;
|
|
//if ( flags & SEC_A ) pts = pts * 4.0/(float)we->m_linkTextWeight;
|
|
if ( sp ) {
|
|
if ( sp->m_flags & SEC_IN_TITLE ) pts = pts * 6.0;
|
|
if ( sp->m_flags & SEC_IN_HEADER ) pts = pts * 4.0;
|
|
if ( sp->m_tagId == TAG_A ) pts = pts * 4.0;
|
|
}
|
|
|
|
// if for the query 'recreation' you get the phrase "park bench"
|
|
// 100 times and the word "bench" 100 times. the word weight
|
|
// for "bench" should be very low! Weights.cpp also demotes repeated
|
|
// sentence fragments, etc. it is generally a really handy thing!
|
|
// and i think it already boosts scores for being in the title, etc.
|
|
// IF BEING called from meta tag, weights are NULL!
|
|
// TODO: we need to use the diversity vector here then...
|
|
//if ( we ) {
|
|
// if ( singleWord ) pts *= we->m_ww[i];
|
|
// else pts *= we->m_pw[i];
|
|
//}
|
|
|
|
// add them in
|
|
gi->m_pts += (int32_t)pts;
|
|
|
|
// good to go
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
-- this will be a url filter var like "numindexed"
|
|
int32_t *XmlDoc::getSiteSpiderQuota ( ) {
|
|
if ( m_siteSpiderQuotaValid ) return &m_siteSpiderQuota;
|
|
int32_t *siteNumInlinks = getSiteNumInlinks();
|
|
if ( ! siteNumInlinks ) return NULL;
|
|
if ( siteNumInlinks == (int32_t *)-1 ) return (int32_t *)-1;
|
|
// get this fresh each time
|
|
int32_t *rn = getRegExpNum ( -1 );
|
|
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
|
|
// bail early? this happens if we match a banned/filtered rule in
|
|
// the url filters table
|
|
if ( m_indexCode ) return NULL;
|
|
// valid at this point
|
|
m_siteSpiderQuotaValid = true;
|
|
// if no match, or filtered or banned, assume no quota
|
|
if ( *rn == -1 ) m_siteSpiderQuota = -1;
|
|
else m_siteSpiderQuota = cr->m_spiderQuotas[*rn];
|
|
// get the quota, -1 means no limit
|
|
return &m_siteSpiderQuota;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
|
|
Url *XmlDoc::getCurrentUrl ( ) {
|
|
if ( m_currentUrlValid ) return &m_currentUrl;
|
|
// otherwise, get first url
|
|
Url *fu = getFirstUrl();
|
|
if ( ! fu || fu == (void *)-1 ) return (Url *)fu;
|
|
// make that current url
|
|
m_currentUrl.set ( &m_firstUrl , false );
|
|
m_currentUrlValid = true;
|
|
return &m_currentUrl;
|
|
/*
|
|
// need a valid url
|
|
Url *u = getFirstUrl();
|
|
if ( ! u ) return NULL;
|
|
// but use redir if we got that
|
|
Url *r = getRedirUrl();
|
|
if ( r && m_redirUrlValid ) return r;
|
|
return u;
|
|
*/
|
|
}
|
|
|
|
Url *XmlDoc::getFirstUrl() {
|
|
if ( m_firstUrlValid ) return &m_firstUrl;
|
|
// we might have a title rec
|
|
if ( m_setFromTitleRec ) {
|
|
setFirstUrl ( ptr_firstUrl , false );
|
|
m_firstUrlValid = true;
|
|
return &m_firstUrl;
|
|
}
|
|
// must be this otherwise
|
|
if ( ! m_setFromDocId ) { char *xx=NULL;*xx=0; }
|
|
// this must be valid
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (Url *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// now set it
|
|
setFirstUrl ( od->ptr_firstUrl , false );
|
|
m_firstUrlValid = true;
|
|
return &m_firstUrl;
|
|
}
|
|
|
|
|
|
int64_t XmlDoc::getFirstUrlHash48() {
|
|
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
|
|
// this must work
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( getUseTimeAxis() ) {
|
|
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
|
|
m_firstUrlHash48Valid = true;
|
|
return m_firstUrlHash48;
|
|
}
|
|
|
|
m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL;
|
|
m_firstUrlHash48Valid = true;
|
|
return m_firstUrlHash48;
|
|
}
|
|
|
|
int64_t XmlDoc::getFirstUrlHash64() {
|
|
if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
|
|
// this must work
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( getUseTimeAxis() ) {
|
|
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
|
|
m_firstUrlHash64Valid = true;
|
|
return m_firstUrlHash64;
|
|
}
|
|
|
|
m_firstUrlHash64 = hash64b ( m_firstUrl.m_url );
|
|
m_firstUrlHash64Valid = true;
|
|
return m_firstUrlHash64;
|
|
}
|
|
|
|
Url **XmlDoc::getLastRedirUrl() {
|
|
|
|
Url **ru = getRedirUrl();
|
|
if ( ! ru || ru == (void *)-1 ) return ru;
|
|
|
|
// m_redirUrlPtr will be NULL in all cases, however, the
|
|
// last redir url we actually got will be set in
|
|
// m_redirUrl.m_url so return that.
|
|
m_lastRedirUrlPtr = &m_redirUrl;
|
|
return &m_lastRedirUrlPtr;
|
|
}
|
|
|
|
// . operates on the latest m_httpReply
|
|
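// . returns &m_redirUrlPtr, where m_redirUrlPtr is NULL if there is no
//   redirect to follow and points to m_redirUrl if there is one
// . returns NULL with g_errno set on error, or -1 if we blocked
// . also sets m_redirError to an EDOC* code when the redirect means we
//   should not index this url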
Url **XmlDoc::getRedirUrl() {
|
|
if ( m_redirUrlValid ) return &m_redirUrlPtr;
|
|
|
|
setStatus ( "getting redir url" );
|
|
|
|
// assume no redirect
|
|
m_redirUrlPtr = NULL;
|
|
//ptr_redirUrl = NULL;
|
|
//size_redirUrl = 0;
|
|
// bail on this
|
|
//if ( ! m_checkForRedir ) {
|
|
// m_redirError = 0;
|
|
// m_redirErrorValid = true;
|
|
// return &m_redirUrlPtr;
|
|
//}
|
|
// we might have a title rec
|
|
if ( m_setFromTitleRec ) { char *xx=NULL;*xx=0; }
|
|
|
|
// or recycling content from old title rec
|
|
if ( m_recycleContent ) {
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
m_redirUrlValid = true;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
// get the current http reply, not the final http reply necessarily
|
|
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set a mime on the stack
|
|
HttpMime mime;
|
|
// shortcut
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
// sanity check
|
|
if ( LEN > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
|
|
// empty reply, no redir
|
|
if ( LEN == 0 ) {
|
|
// bad mime, but i guess valid empty redir url
|
|
m_redirUrlValid = true;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
// return a fake thing. content length is 0.
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// set it. if 'connection refused' then LEN is -1.
|
|
if ( LEN<0 || ! mime.set ( m_httpReply, LEN, getCurrentUrl() ) ) {
|
|
// set this on mime error
|
|
//if ( ! m_indexCode ) m_indexCode = EBADMIME;
|
|
// bad mime, but i guess valid empty redir url
|
|
m_redirUrlValid = true;
|
|
// return nothing, no redirect url was there
|
|
m_redirUrlPtr = NULL;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
// return a fake thing. content length is 0.
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
int32_t httpStatus = mime.getHttpStatus() ;
|
|
|
|
|
|
Url *loc = NULL;
|
|
|
|
// quickly see if we are a robots.txt url originally
|
|
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
|
|
|
|
//
|
|
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
|
|
// if httpStatus is not a redirect
|
|
//
|
|
if ( httpStatus < 300 || httpStatus > 399 ) {
|
|
// ok, crap, i was getting the xml here to get the meta
|
|
// http-equiv refresh tag, but that added an element of
|
|
// recursion that is just too confusing to deal with. so
|
|
// let's just parse out the meta tag by hand
|
|
bool checkMeta = true;
|
|
if ( isRobotsTxt ) checkMeta = false;
|
|
// if we are a doc that consists of a sequence of sub-docs that
|
|
// we are indexing/injecting then don't do this check.
|
|
if ( isContainerDoc() ) checkMeta = false;
|
|
if ( checkMeta ) {
|
|
Url **mrup = getMetaRedirUrl();
|
|
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
|
|
// set it. might be NULL if not there.
|
|
loc = *mrup;
|
|
}
|
|
}
|
|
else
|
|
// get Location: url (the redirect url) from the http mime
|
|
loc = mime.getLocationUrl();
|
|
|
|
// get current url
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (Url **)cu;
|
|
|
|
// this call set size_catIds
|
|
int32_t **pcids = getCatIds();
|
|
if ( ! pcids || pcids == (void *)-1) return (Url **)pcids;
|
|
// get local link info
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// error or blocked
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
|
|
// get remote link info
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
// error or blocked
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (Url **)pinfo2;
|
|
// convenience
|
|
LinkInfo *info2 = *pinfo2;
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// did we send a cookie with our last request?
|
|
bool sentCookieLastTime = false;
|
|
if ( m_redirCookieBuf.length() )
|
|
sentCookieLastTime = true;
|
|
|
|
// get cookie for redirect to fix nyt.com/nytimes.com
|
|
// for gap.com it uses multiple Set-Cookie:\r\n lines so we have
|
|
// to accumulate all of them into a buffer now
|
|
m_redirCookieBuf.reset();
|
|
mime.addCookiesIntoBuffer ( &m_redirCookieBuf );
|
|
m_redirCookieBufValid = true;
|
|
|
|
/*
|
|
char *cookie = mime.getCookie();
|
|
// find end of cookie at the semicolon
|
|
char *s = cookie;
|
|
for ( ; s && *s && *s != ';' ; s++ );
|
|
if ( s && *s == ';' ) {
|
|
// do not include ;
|
|
int32_t clen = s - cookie;
|
|
m_redirCookieBuf.reset();
|
|
m_redirCookieBuf.safeMemcpy ( cookie , clen );
|
|
m_redirCookieBuf.nullTerm();
|
|
m_redirCookieBufValid = true;
|
|
}
|
|
*/
|
|
|
|
// mdw23
|
|
//log("http: reply=%s",m_httpReply);
|
|
|
|
// a hack for removing session ids already in there. for
|
|
// brilliantshopper's bs4 collection and gk0 cluster
|
|
//bool forceRedirect = false;
|
|
if ( size_catIds == 0 &&
|
|
// must not have an actual redirect url in there
|
|
! loc &&
|
|
// must be a valid http status
|
|
httpStatus == 200 &&
|
|
(gb_strcasestr( cu->getUrl(), "sessionid") ||
|
|
gb_strcasestr( cu->getUrl(), "oscsid") ) ) {
|
|
Url *tt = &m_redirUrl;
|
|
tt->set ( cu->getUrl() ,
|
|
cu->getUrlLen() ,
|
|
true , // addwww?
|
|
true ); // strip sessid?
|
|
// if it no longer has the session id, force redirect it
|
|
if ( ! gb_strcasestr( tt->getUrl(), "sessionid") &&
|
|
! gb_strcasestr( tt->getUrl(), "oscsid") ) {
|
|
m_redirUrlValid = true;
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
// TODO: log redir url in spider log output
|
|
//logf(LOG_INFO,"build: %s force redirected to %s",
|
|
// cu->getUrl(),m_redirUrl.getUrl());
|
|
m_redirUrlValid = true;
|
|
ptr_redirUrl = m_redirUrl.m_url;
|
|
size_redirUrl = m_redirUrl.m_ulen+1;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// if no location url, then there was no redirect; return a NULL redir url
|
|
if ( ! loc || loc->m_url[0] == '\0' ) {
|
|
// validate it
|
|
m_redirUrlValid = true;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
// and return an empty one
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// this is handy
|
|
//Url tmp;
|
|
|
|
// TODO: make sure we got this logic elsewhere
|
|
// if robots.txt said no, and if we had no link text, then give up
|
|
//if(! *isAllowed && !info1->hasLinkText() && !info2->hasLinkText() ) {
|
|
// m_indexCode = EDOCDISALLOWED;
|
|
|
|
// set our redir url from the mime's Location: field. addWWW=false
|
|
//if ( loc != &tmp ) tmp.set ( loc , false );
|
|
|
|
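// . "keep" means we still want to index this url even though the
//   redirect looks bad, because it is in dmoz or has inlink text; the
//   failure cases below only set m_redirError when keep is false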
bool keep = false;
|
|
if ( size_catIds > 0 ) keep = true;
|
|
if ( info1->hasLinkText() ) keep = true;
|
|
if ( info2 && info2->hasLinkText() ) keep = true;
|
|
|
|
// at this point we do not block anywhere
|
|
m_redirUrlValid = true;
|
|
|
|
// store the redir error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
|
|
// i've seen a "Location: 2010..." bogus url as well, so make sure
|
|
// we got a legit url
|
|
if ( ! loc->getDomain() || loc->getDomainLen() <= 0 ) {
|
|
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
//bool injected = false;
|
|
// get from spider request if there
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
|
|
|
// . if redirect url is nothing new, then bail (infinite loop)
|
|
// . www.xbox.com/SiteRequirements.htm redirects to itself
|
|
// until you send a cookie!!
|
|
// . www.twomileborris.com does the cookie thing, too
|
|
if ( strcmp ( cu->getUrl(), loc->getUrl() ) == 0 ) {
|
|
// try sending the cookie if we got one now and didn't have
|
|
// one for this last request
|
|
if ( ! sentCookieLastTime && m_redirCookieBuf.length() ) {
|
|
m_redirUrl.set ( loc->getUrl() );
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
if ( ! keep ) m_redirError = EDOCREDIRECTSTOSELF;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . don't allow redirects when injecting!
|
|
// . otherwise, we would mfree(m_buf) which would free our
|
|
// injected reply... yet m_injectedReplyLen would still be
|
|
// positive! can you say 'seg fault'?
|
|
// . hmmm... seems to have worked though
|
|
if ( cr->m_recycleContent || m_recycleContent ) { // || injected
|
|
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// . if we followed too many then bail
|
|
// . www.motorolamobility.com www.outlook.com ... failed when we
|
|
// had >= 4 here
|
|
if ( ++m_numRedirects >= 10 ) {
|
|
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// sometimes idiots don't supply us with a Location: mime
|
|
if ( loc->getUrlLen() == 0 ) {
|
|
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// . protocol of url must be http or https
|
|
// . we had one url redirect to an ihttp:// protocol and caused
|
|
// spider to core dump when it saw that SpiderRequest record
|
|
char *proto = loc->getScheme();
|
|
if ( strncmp(proto,"http://" ,7) && strncmp(proto,"https://",8) ) {
|
|
m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// do not allow redirects to evil-G or bing
|
|
//if ( strstr(loc->getUrl(),".google.com/") ||
|
|
// strstr(loc->getUrl(),".bing.com/") ) {
|
|
// m_redirError = EDOCEVILREDIRECT;
|
|
// return &m_redirUrlPtr;
|
|
//}
|
|
// log a msg
|
|
if ( g_conf.m_logSpideredUrls )
|
|
logf(LOG_INFO,"build: %s redirected to %s",
|
|
cu->getUrl(),loc->getUrl());
|
|
|
|
// if not same Domain, it is not a simplified redirect
|
|
bool sameDom = true;
|
|
int32_t dlen = loc->getDomainLen();
|
|
if ( cu->getDomainLen() != dlen ) sameDom=false;
|
|
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
|
|
if ( ! sameDom ) {
|
|
m_redirectFlag = true;
|
|
m_redirUrl.set ( loc , false ); // addWWW=false
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
ptr_redirUrl = m_redirUrl.m_url;
|
|
size_redirUrl = m_redirUrl.m_ulen+1;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
// if redirecting to the same domain, then do not add "www.".
|
|
// this way we can take care of slashdot.org, etc.
|
|
//bool addwww = false;
|
|
// but never modify if in dmoz, keep it pure
|
|
//if ( size_catIds > 0 ) addwww = false;
|
|
// debug msg
|
|
//if ( strcmp(m_redirUrl.getUrl(),url->getUrl())== 0 )
|
|
// log("Redirect error: same url");
|
|
//bool stripSessId = (size_catIds == 0);
|
|
// . reset m_redirUrl now (do not addWWW for slashdot.org, etc)
|
|
// . we now add "www." UNLESS it's a redirect from the same
|
|
// domain or firstUrl is in catdb
|
|
//tmp.set( loc->getUrl(),loc->getUrlLen(),addwww,stripSessId);
|
|
/*
|
|
// get this
|
|
bool sameHostLinks = false;
|
|
if ( *pi >= 0 ) sameHostLinks =cr->m_pq_spiderSameHostnameLinks[*pi];
|
|
// get first url ever
|
|
Url *f = getFirstUrl();
|
|
// . for same host links, addwww for comparing
|
|
// . so if we are doing google.com and it redirects to
|
|
// www.google.com then we will allow that... and vice versa
|
|
if ( sameHostLinks ) {
|
|
Url u1;
|
|
Url u2;
|
|
u1.set ( loc->getUrl () , loc->getUrlLen(), true ); // addwww?
|
|
u2.set ( f->getUrl() , f->getUrlLen () , true ); // addwww?
|
|
// host must match if we are restricted to a particular host
|
|
if ( u1.getHostLen() != u2.getHostLen() ||
|
|
strncmp ( u1.getHost() , u2.getHost() ,
|
|
u1.getHostLen () ) != 0 ) {
|
|
m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
}
|
|
*/
|
|
// get first url ever
|
|
Url *f = getFirstUrl();
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// set this to true if the redirect url is much preferred
|
|
bool simplifiedRedir = false;
|
|
// . if it redirected to a simpler url then stop spidering now
|
|
// and add the simpler url to the spider queue
|
|
// . by simpler, i mean one w/ fewer path components
|
|
// . or one with a www for hostname
|
|
// . or could be same as firstUrl but with a / appended
|
|
char *r = loc->getUrl();
|
|
char *u = f->getUrl();
|
|
int32_t rlen = loc->getUrlLen();
|
|
int32_t ulen = f->getUrlLen();
|
|
// simpler if new path depth is shorter
|
|
if ( loc->getPathDepth (true) < f->getPathDepth (true) )
|
|
simplifiedRedir = true;
|
|
// simpler if old has cgi and new does not
|
|
if ( f->isCgi() && ! loc->isCgi() )
|
|
simplifiedRedir = true;
|
|
// if we're a dmoz page, don't do this, unless just a / case,no
|
|
if ( size_catIds > 0 )
|
|
simplifiedRedir = false;
|
|
// simpler if new one is same as old but has a '/' at the end
|
|
if ( rlen == ulen+1 && r[rlen-1]=='/' && strncmp(r,u,ulen)==0)
|
|
simplifiedRedir = true;
|
|
// . if new url does not have semicolon but old one does
|
|
// . http://news.yahoo.com/i/738;_ylt=AoL4eFRYKEdXbfDh6W2cF
|
|
// redirected to http://news.yahoo.com/i/738
|
|
if ( strchr (u,';') && ! strchr (r,';') )
|
|
simplifiedRedir = true;
|
|
// simpler if new host is www and old is not
|
|
if ( loc->isHostWWW() && ! f->isHostWWW() )
|
|
simplifiedRedir = true;
|
|
// if redirect is to different domain, set simplified
|
|
// this helps locks from bunching on one domain
|
|
if ( loc->getDomainLen()!=f->getDomainLen() ||
|
|
strncasecmp ( loc->getDomain(),
|
|
f->getDomain(),
|
|
loc->getDomainLen() ) != 0 )
|
|
// crap, but www.hotmail.com redirects to live.msn.com
|
|
// login page ... so add this check here
|
|
if ( ! f->isRoot() )
|
|
simplifiedRedir = true;
|
|
|
|
bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
|
|
|
|
// follow redirects if injecting so we do not return
|
|
// EDOCSIMPLIFIEDREDIR
|
|
if ( getIsInjecting ( ) )
|
|
allowSimplifiedRedirs = true;
|
|
|
|
// or if disabled then follow the redirect
|
|
if ( ! cr->m_useSimplifiedRedirects )
|
|
allowSimplifiedRedirs = true;
|
|
|
|
// . if the redir url is simpler, but has no hostname we
|
|
// prepend a "www." to it
|
|
// . this should keep www.russ.ru and russ.ru from being
|
|
// in the index at the same time and causing url: collisions
|
|
/*
|
|
if ( size_catIds == 0 &&
|
|
simplifiedRedir &&
|
|
loc->getDomainLen() == loc->getHostLen () )
|
|
loc->set (loc->getUrl(),
|
|
loc->getUrlLen(),
|
|
true, //false, addwww?
|
|
stripSessId );
|
|
*/
|
|
// if not allow, do not do them... except for the two below
|
|
//if ( ! m_useSimplifiedRedirects || m_isDirColl )
|
|
// simplifiedRedir = false;
|
|
|
|
// special hack for nytimes.com. do not consider simplified redirs
|
|
// because it uses a cookie along with redirs to get to the final
|
|
// page.
|
|
char *dom2 = m_firstUrl.getDomain();
|
|
int32_t dlen2 = m_firstUrl.getDomainLen();
|
|
if ( dlen2 == 11 && strncmp(dom2,"nytimes.com",dlen2)==0 )
|
|
allowSimplifiedRedirs = true;
|
|
// same for bananarepublic.gap.com ?
|
|
// if ( dlen2 == 7 && strncmp(dom2,"gap.com",dlen2)==0 )
|
|
// allowSimplifiedRedirs = true;
|
|
|
|
// if redirect is setting cookies we have to follow the redirect
|
|
// all the way through so we can stop now.
|
|
if ( m_redirCookieBufValid && m_redirCookieBuf.getLength() )
|
|
allowSimplifiedRedirs = true;
|
|
|
|
// . don't bother indexing this url if the redir is better
|
|
// . 301 means moved PERMANENTLY...
|
|
// . many people use 301 on their root pages though, so treat
|
|
// it like a temporary redirect, like exclusivelyequine.com
|
|
if ( simplifiedRedir && ! allowSimplifiedRedirs &&
|
|
// for custom BULK clients don't like this i guess
|
|
// AND for custom crawl it was messing up the processing
|
|
// url format for a nytimes blog subsite which was redirecting
|
|
// to the proper nytimes.com site...
|
|
// ! cr->m_isCustomCrawl ) {
|
|
// no, we need this for custom crawls because otherwise we
|
|
// get too many dups in the index. so for nyt we need something
|
|
// else
|
|
cr->m_isCustomCrawl != 2 ) {
|
|
// returns false if blocked, true otherwise
|
|
//return addSimplifiedRedirect();
|
|
m_redirError = EDOCSIMPLIFIEDREDIR;
|
|
// set this because getLinks() treats this redirUrl
|
|
// as a link now, it will add a SpiderRequest for it:
|
|
m_redirUrl.set ( loc , false ); // addWWW=false
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
// mdw: let this path through so contactXmlDoc gets a proper
|
|
// redirect that we can follow. for the base xml doc at
|
|
// least the m_indexCode will be set
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// good to go
|
|
m_redirectFlag = true;
|
|
m_redirUrl.set ( loc , false ); // addWWW=false
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
ptr_redirUrl = m_redirUrl.m_url;
|
|
size_redirUrl = m_redirUrl.m_ulen+1;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
int32_t *XmlDoc::getFirstIndexedDate ( ) {
|
|
if ( m_firstIndexedDateValid ) return (int32_t *)&m_firstIndexedDate;
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
|
|
// valid
|
|
m_firstIndexedDateValid = true;
|
|
// must be downloaded
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// assume now is the first time
|
|
m_firstIndexedDate = getSpideredTime();//m_spideredTime;
|
|
// inherit from our old title rec
|
|
if ( *od ) m_firstIndexedDate = (*od)->m_firstIndexedDate;
|
|
// return it
|
|
return (int32_t *)&m_firstIndexedDate;
|
|
}
|
|
|
|
int32_t *XmlDoc::getOutlinksAddedDate ( ) {
|
|
if ( m_outlinksAddedDateValid ) return (int32_t *)&m_outlinksAddedDate;
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
|
|
// valid
|
|
m_outlinksAddedDateValid = true;
|
|
// must be downloaded
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// assume we are doing it now
|
|
m_outlinksAddedDate = getSpideredTime();//m_spideredTime;
|
|
// get that
|
|
if ( *od ) m_outlinksAddedDate = (*od)->m_outlinksAddedDate;
|
|
// return it
|
|
return (int32_t *)&m_outlinksAddedDate;
|
|
}
|
|
|
|
/*
|
|
int32_t *XmlDoc::getNumBannedOutlinks ( ) {
|
|
if ( m_numBannedOutlinksValid ) return &m_numBannedOutlinks;
|
|
|
|
setStatus ( "getting num banned outlinks" );
|
|
|
|
// get the outlinks
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
|
// count em
|
|
int32_t n = links->getNumLinks();
|
|
// reset
|
|
m_numBannedOutlinks = 0;
|
|
// one vote per domain hash table
|
|
char buf[20000];
|
|
HashTableX ht; ht.set ( 4 , 0 , -1 , buf , 20000 ,false,m_niceness);
|
|
// loop through them
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the link
|
|
char *u = links->getLinkPtr(i);
|
|
// get domain of the link
|
|
int32_t dlen; char *dom = getDomFast ( u , &dlen , false );
|
|
// skip if bad domain
|
|
if ( ! dom || dlen <= 0 ) continue;
|
|
// get domHash
|
|
int32_t h = hash32 ( dom , dlen );
|
|
// one check per domain
|
|
if ( ht.getSlot ( &h ) >= 0 ) continue;
|
|
// add it, return NULL on error, g_errno should be set
|
|
if ( ! ht.addKey ( &h ) ) return NULL;
|
|
// . loop over all regular expression in the url filters table
|
|
// . stop at first regular expression it matches
|
|
int32_t *rn = getRegExpNum2 ( i );
|
|
// need to wait for a callback at this point
|
|
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
|
|
// skip if no match in url filters table
|
|
if ( *rn == -1 ) continue;
|
|
// get spider priority
|
|
int32_t pr = cr->m_spiderPriorities[*rn];
|
|
// skip if not banned
|
|
if ( pr != -2 ) continue;
|
|
// count it
|
|
m_numBannedOutlinks++;
|
|
}
|
|
// all done
|
|
m_numBannedOutlinksValid = true;
|
|
// convert this too!
|
|
//m_numBannedOutlinks8 = score32to8 ( m_numBannedOutlinks );
|
|
// sanity check on score32to8()
|
|
//if(m_numBannedOutlinks8>0&&!m_numBannedOutlinks){char*xx=NULL;*xx=0;}
|
|
|
|
return &m_numBannedOutlinks;
|
|
}
|
|
*/
|
|
|
|
uint16_t *XmlDoc::getCountryId ( ) {
|
|
if ( m_countryIdValid ) return &m_countryId;
|
|
|
|
setStatus ( "getting country id" );
|
|
|
|
// get it
|
|
CatRec *cat = getCatRec ();
|
|
if ( ! cat || cat == (CatRec *)-1) return (uint16_t *)cat;
|
|
// MDW: i limit this to 10 to save stack space!
|
|
Url *u = getCurrentUrl();
|
|
if ( ! u || u == (void *)-1) return (uint16_t *)u;
|
|
// use the url's tld to guess the country
|
|
uint16_t country = g_langId.guessCountryTLD ( u->getUrl ( ) );
|
|
// . 0 means no country i guess. try dmoz next.
|
|
// . limit to 10 of them
|
|
int32_t nc = cat->m_numCatids;
|
|
for ( int32_t i = 0; ! country && i < nc && i < 10 ; i++) {
|
|
int32_t catid = cat->m_catids[i];
|
|
country = g_countryCode.getCountryFromDMOZ ( catid );
|
|
}
|
|
m_countryIdValid = true;
|
|
m_countryId = country;
|
|
return &m_countryId;
|
|
}
|
|
|
|
|
|
/*
|
|
XmlDoc *XmlDoc::getOldDoc ( ) {
|
|
if ( m_oldDocValid ) return &m_oldDoc;
|
|
// get current url
|
|
Url *u = getCurrentUrl();
|
|
// set its url otherwise
|
|
m_oldDoc.setFirstUrl ( u , false );
|
|
// get the old title rec
|
|
char *ret = getOldTitleRec();
|
|
if ( ! ret || ret == (char *)-1 ) return (XmlDoc *)ret;
|
|
// all done
|
|
m_oldDocValid = true;
|
|
// return it
|
|
return m_oldDoc;
|
|
}
|
|
*/
|
|
|
|
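// . returns the language id of this url's site root page
// . if we are the root we just use our own lang id; otherwise we try the
//   "rootlang" tag in tagdb first and only fall back to fetching the root
//   doc (cached up to an hour) when that tag is missing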
uint8_t *XmlDoc::getRootLangId ( ) {
|
|
|
|
// return it if we got it
|
|
if ( m_rootLangIdValid ) return &m_rootLangId;
|
|
// note it
|
|
setStatus ( "getting root lang id from tagdb");
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
|
|
// sanity check - should not be called on a root url
|
|
if ( *isRoot ) {
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 )
|
|
return (uint8_t *) langId;
|
|
m_rootLangId = *langId;
|
|
m_rootLangIdValid = true;
|
|
return &m_rootLangId;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
// get the tag rec
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
|
|
// just use one. there may be multiple ones!
|
|
Tag *tag = gr->getTag("rootlang");
|
|
// if there use that
|
|
if ( ! tag ) {
|
|
// . get the root doc
|
|
// . allow for a one hour cache of the titleRec
|
|
XmlDoc **prd = getRootXmlDoc( 3600 );
|
|
if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
|
|
// shortcut
|
|
XmlDoc *rd = *prd;
|
|
// . if no root doc, then assume language unknown
|
|
// . this happens if we are injecting because we do not want
|
|
// to download the root page for speed purposes
|
|
if ( ! rd ) {
|
|
m_rootLangId = langUnknown;
|
|
m_rootLangIdValid = true;
|
|
return &m_rootLangId;
|
|
}
|
|
// . update tagdb rec
|
|
// . on root download error use language "xx" (unknown) to
|
|
// avoid hammering the root page
|
|
//bool *status = rd->updateRootLangId ();
|
|
//if (! status || status==(void *)-1) return (uint8_t *)status;
|
|
// update our tag rec now
|
|
//Tag *tt = rd->m_newTagRec.getTag("rootlang");
|
|
// must be there
|
|
//if ( ! tt ) { char *xx=NULL;*xx=0; }
|
|
// add it for us
|
|
//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
|
|
// get it
|
|
uint8_t *rl = rd->getLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
|
|
// must be legit now!
|
|
if ( ! rd->m_langIdValid ) { char *xx=NULL;*xx=0;}
|
|
// now validate our stuff
|
|
m_rootLangIdValid = true;
|
|
//m_rootLangIdScore = rd->m_langIdScore;
|
|
m_rootLangId = rd->m_langId;
|
|
return &m_rootLangId;
|
|
}
|
|
// sanity check ( must be like "en,50\0" or could be
|
|
// "en_US,50\0" or "zh_cn,50"
|
|
if ( tag->getTagDataSize() > 6 ) { char *xx=NULL;*xx=0; }
|
|
// point to 2 character language abbreviation
|
|
char *abbr = tag->getTagData();
|
|
/*
|
|
// find comma
|
|
char *comma = strchr(abbr,',' );
|
|
// sanity check
|
|
if ( ! comma ) { char *xx=NULL;*xx=0; }
|
|
// tmp NULL
|
|
*comma = '\0';
|
|
*/
|
|
// map it to an id
|
|
uint8_t langId = getLangIdFromAbbr( abbr );
|
|
/*
|
|
// put it back
|
|
*comma = ',';
|
|
// get score
|
|
int32_t score = atol(comma+1);
|
|
// sanity check
|
|
if ( score < 0 || score > 100 ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
// set that up
|
|
m_rootLangId = langId;
|
|
//m_rootLangIdScore = score;
|
|
m_rootLangIdValid = true;
|
|
return &m_rootLangId;
|
|
}
|
|
|
|
XmlDoc **XmlDoc::getOldXmlDoc ( ) {
|
|
|
|
if ( m_oldDocValid ) return &m_oldDoc;
|
|
|
|
// note it
|
|
setStatus ( "getting old xml doc");
|
|
|
|
// if we are set from a title rec, we are the old doc
|
|
if ( m_setFromTitleRec ) {
|
|
m_oldDocValid = true;
|
|
m_oldDoc = NULL;//this;
|
|
return &m_oldDoc;
|
|
}
|
|
|
|
// . cache age is 0... super fresh
|
|
// . returns NULL w/ g_errno if not found unless isIndexed is false
|
|
// and valid, and it is not valid for pagereindexes.
|
|
char **otr = getOldTitleRec ( );
|
|
if ( ! otr || otr == (char **)-1 ) return (XmlDoc **)otr;
|
|
// if no title rec, return ptr to a null
|
|
m_oldDoc = NULL;
|
|
if ( ! *otr ) { m_oldDocValid = true; return &m_oldDoc; }
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if provided title rec matches our docid but not uh48 then there
|
|
// was a docid collision and we should null out our title rec
|
|
// and return with an error and not index this puppy!
|
|
// crap, we can't call getFirstUrl() because it might not be
|
|
// valid if we are a docid based doc and THIS function was called
|
|
// from getFirstUrl() -- we end up in a recursive loop.
|
|
if ( ! m_setFromDocId ) {
|
|
//int64_t uh48 = getFirstUrl()->getUrlHash48();
|
|
int64_t uh48 = getFirstUrlHash48();
|
|
int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr );
|
|
if ( uh48 != tuh48 ) {
|
|
log("xmldoc: docid collision uh48 mismatch. cannot "
|
|
"index "
|
|
"%s",getFirstUrl()->getUrl() );
|
|
g_errno = EDOCIDCOLLISION;
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// . if *otr is NULL that means not found
|
|
// . return a NULL old XmlDoc in that case as well?
|
|
// . make a new one
|
|
// . this will uncompress it and set ourselves!
|
|
try { m_oldDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_oldDoc , sizeof(XmlDoc),"xmldoc1");
|
|
// debug the mem leak
|
|
// log("xmldoc: xmldoc1=%"PTRFMT" u=%s"
|
|
// ,(PTRTYPE)m_oldDoc
|
|
// ,m_firstUrl.getUrl());
|
|
// if title rec is corrupted data uncompress will fail and this
|
|
// will return false!
|
|
if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
|
|
m_oldTitleRecSize , // maxSize
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness ) ) {
|
|
log("build: failed to set old doc for %s",m_firstUrl.m_url);
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
int32_t saved = g_errno;
|
|
// ok, fix the memleak here
|
|
mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
|
|
delete ( m_oldDoc );
|
|
m_oldDocExistedButHadError = true;
|
|
//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
|
|
m_oldDoc = NULL;
|
|
g_errno = saved;
|
|
// MDW: i removed this on 2/8/2016 again so the code below
|
|
// would execute.
|
|
//return NULL; //mdwmdwmdw
|
|
// if it is data corruption, just assume empty so
|
|
// we don't stop spidering a url because of this. so we'll
|
|
// think this is the first time indexing it. otherwise
|
|
// we get "Bad cached document" in the logs and the
|
|
// SpiderReply and it never gets re-spidered because it is
|
|
// not a 'temporary' error according to the url filters.
|
|
log("build: treating corrupted titlerec as not found");
|
|
g_errno = 0;
|
|
m_oldDoc = NULL;
|
|
m_oldDocValid = true;
|
|
return &m_oldDoc;
|
|
}
|
|
m_oldDocValid = true;
|
|
// share our masterloop and state!
|
|
m_oldDoc->m_masterLoop = m_masterLoop;
|
|
m_oldDoc->m_masterState = m_masterState;
|
|
return &m_oldDoc;
|
|
}
|
|
|
|
void XmlDoc::nukeDoc ( XmlDoc *nd ) {
|
|
// skip if empty
|
|
if ( ! nd ) return;
|
|
// debug the mem leak
|
|
// if ( nd == m_oldDoc )
|
|
// log("xmldoc: nuke xmldoc1=%"PTRFMT" u=%s this=%"PTRFMT""
|
|
// ,(PTRTYPE)m_oldDoc
|
|
// ,m_firstUrl.getUrl()
|
|
// ,(PTRTYPE)this
|
|
// );
|
|
// do not nuke yerself!
|
|
if ( nd == this ) return;
|
|
// or root doc!
|
|
//if ( nd == m_rootDoc ) return;
|
|
// nuke it
|
|
mdelete ( nd , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( nd );
|
|
// invalidate
|
|
if ( nd == m_extraDoc ) {
|
|
m_extraDocValid = false;
|
|
m_extraDoc = NULL;
|
|
}
|
|
if ( nd == m_rootDoc ) {
|
|
m_rootDocValid = false;
|
|
m_rootDoc = NULL;
|
|
}
|
|
if ( nd == m_oldDoc ) {
|
|
m_oldDocValid = false;
|
|
m_oldDoc = NULL;
|
|
}
|
|
if ( nd == m_ahrefsDoc ) {
|
|
m_ahrefsDocValid = false;
|
|
m_ahrefsDoc = NULL;
|
|
}
|
|
}
|
|
|
|
static LinkInfo s_dummy;
|
|
|
|
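// . an "extra" doc is a helper XmlDoc we download on behalf of this url
//   (robots.txt, root pages, etc. -- see the cache comments below); it
//   shares our master loop/state, is flagged as a child doc, and has its
//   link info and catids stubbed out so it never does those lookups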
XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
|
|
if ( m_extraDocValid ) return &m_extraDoc;
|
|
// note that
|
|
setStatus ( "getting new doc" );
|
|
// we need a valid first ip first!
|
|
//int32_t *pfip = getFirstIp();
|
|
//if ( ! pfip || pfip == (void *)-1 ) return (XmlDoc **)pfip;
|
|
// must be NULL
|
|
if ( m_extraDoc ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( ! u || ! u[0] ) { char *xx=NULL;*xx=0; }//return &m_extraDoc;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// . if *otr is NULL that means not found
|
|
// . return a NULL old XmlDoc in that case as well?
|
|
// . make a new one
|
|
// . this will uncompress it and set ourselves!
|
|
try { m_extraDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_extraDoc , sizeof(XmlDoc),"xmldoc2");
|
|
|
|
// . if we did not have it in titledb then download it!
|
|
// . or if titleRec was too old!
|
|
|
|
// a spider rec for the extra doc to use
|
|
SpiderRequest sreq;
|
|
// clear it
|
|
sreq.reset();
|
|
// spider the url "u"
|
|
strcpy ( sreq.m_url , u );
|
|
// inherit page parser
|
|
sreq.m_isPageParser = getIsPageParser();
|
|
// set the data size right
|
|
sreq.setDataSize();
|
|
// . prepare to download it, set it up
|
|
// . returns false and sets g_errno on error
|
|
if ( ! m_extraDoc->set4 ( &sreq ,
|
|
NULL , // doledbkey ptr
|
|
cr->m_coll ,
|
|
NULL , // SafeBuf
|
|
m_niceness ))
|
|
return NULL;
|
|
|
|
// share our masterloop and state!
|
|
m_extraDoc->m_masterLoop = m_masterLoop;
|
|
m_extraDoc->m_masterState = m_masterState;
|
|
|
|
// carry this forward always!
|
|
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
|
|
|
|
// disable spam check because that is not necessary for this doc!
|
|
m_extraDoc->m_spamCheckDisabled = true;
|
|
|
|
// tell msg13 to get this from its robots.txt cache if it can. it also
|
|
// keeps a separate html page cache for the root pages, etc. in case
|
|
m_extraDoc->m_maxCacheAge = maxCacheAge;
|
|
|
|
// a dummy thing
|
|
s_dummy.m_numStoredInlinks = 0;
|
|
s_dummy.m_numGoodInlinks = 0;
|
|
|
|
// we indirectly call m_extraDoc->getHttpReply() which calls
|
|
// m_extraDoc->getRedirectUrl(), which checks the linkInfo and
|
|
// dmoz catids of the original url to see if we should set m_indexCode
|
|
// to something bad or not. to avoid these unnecessary lookups we
|
|
// set these to NULL and validate them
|
|
m_extraDoc->ptr_catIds = NULL;
|
|
m_extraDoc->size_catIds = 0;
|
|
m_extraDoc->m_catIdsValid = true;
|
|
m_extraDoc->ptr_linkInfo1 = &s_dummy;
|
|
m_extraDoc->size_linkInfo1 = 0;
|
|
m_extraDoc->m_linkInfo1Valid = true;
|
|
m_extraDoc->ptr_linkInfo2 = &s_dummy;
|
|
m_extraDoc->size_linkInfo2 = 0;
|
|
m_extraDoc->m_linkInfo2Valid = true;
|
|
m_extraDoc->m_urlFilterNumValid = true;
|
|
m_extraDoc->m_urlFilterNum = 0;
|
|
// for redirects
|
|
m_extraDoc->m_allowSimplifiedRedirs = true;
|
|
// always forward the http download request so that Msg13.cpp's
|
|
// handleRequest13() can avoid this same page
|
|
// from being downloaded at the same time. also, if we are robots.txt
|
|
// this allows us to use the same cache since we select the host we
|
|
// forward to based on ip address.
|
|
m_extraDoc->m_forwardDownloadRequest = true;
|
|
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
|
|
m_extraDoc->m_isChildDoc = true;
|
|
m_extraDoc->m_parentDocPtr = this;
|
|
// debug it
|
|
//g_doc = this;
|
|
|
|
// and inherit test dir so getTestDir() doesn't core on us
|
|
bool isPageParser = getIsPageParser();
|
|
m_extraDoc->m_isPageParser = isPageParser;
|
|
m_extraDoc->m_isPageParserValid = true;
|
|
|
|
// without this we send all the msg13 requests to host #3! because
|
|
// Msg13 uses it to determine which host should handle it
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
m_extraDoc->m_firstIp = m_firstIp;
|
|
m_extraDoc->m_firstIpValid = true;
|
|
|
|
// i guess we are valid now
|
|
m_extraDocValid = true;
|
|
return &m_extraDoc;
|
|
}
|
|
|
|
bool XmlDoc::getIsPageParser ( ) {
if ( m_isPageParserValid ) return m_isPageParser;
// assume not
m_isPageParser = false;
// and set otherwise
if ( m_sreqValid && m_sreq.m_isPageParser ) m_isPageParser = true;
// and validate
m_isPageParserValid = true;
return m_isPageParser;
}
|
|
|
|
XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
|
|
if ( m_rootDocValid ) return &m_rootDoc;
|
|
// help avoid mem leaks
|
|
if ( m_rootDoc ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
setStatus ( "getting root doc");
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (XmlDoc **)isRoot;
|
|
// if we are root use us!!!!!
|
|
if ( *isRoot ) {
|
|
m_rootDoc = this;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
// get our site root
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (XmlDoc **)mysite;
|
|
// otherwise, we gotta get it!
|
|
char **rtr = getRootTitleRec ( );
|
|
if ( ! rtr || rtr == (char **)-1 ) return (XmlDoc **)rtr;
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (XmlDoc **)cu;
|
|
// if no title rec, return ptr to a null
|
|
//m_rootDoc = NULL;
|
|
//if ( ! *rtr ) {
|
|
// // damn, not in titledb, i guess download it then
|
|
// m_rootDocValid = true; return &m_rootDoc; }
|
|
// note it
|
|
setStatus ( "getting root doc");
|
|
|
|
// to keep injections fast, do not download the root page!
|
|
if ( ! *rtr && m_contentInjected ) {
|
|
// assume none
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
// likewise, if doing a rebuild
|
|
if ( ! *rtr && m_useSecondaryRdbs ) {
|
|
// assume none
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
// or recycling content like for query reindex. keep it fast.
|
|
if ( ! *rtr && m_recycleContent ) {
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . if *otr is NULL that means not found
|
|
// . return a NULL root XmlDoc in that case as well?
|
|
// . make a new one
|
|
// . this will uncompress it and set ourselves!
|
|
try { m_rootDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
|
|
// if we had the title rec, set from that
|
|
if ( *rtr ) {
|
|
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
|
|
m_rootTitleRecSize , // maxSize ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness ) ) {
|
|
// it was corrupted... delete this
|
|
// possibly printed
|
|
// " uncompress uncompressed size=..." bad uncompress
|
|
log("build: rootdoc set2 failed");
|
|
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( m_rootDoc );
|
|
// call it empty for now, we don't want to return
|
|
// NULL with g_errno set because it could stop
|
|
// the whole indexing pipeline
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
//return NULL;
|
|
}
|
|
}
|
|
// . otherwise, set the url and download it on demand
|
|
// . this junk copied from the contactDoc->* stuff below
|
|
else {
|
|
// a spider rec for the contact doc
|
|
SpiderRequest sreq;
|
|
// clear it
|
|
sreq.reset();
|
|
// spider the url "u"
|
|
char *p = sreq.m_url;
|
|
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
|
|
else p += sprintf ( p , "http://" );
|
|
strcpy ( p , mysite );
|
|
// set this
|
|
if ( m_sreqValid ) {
|
|
// this will avoid it adding to tagdb!
|
|
sreq.m_isPageParser = m_sreq.m_isPageParser;
|
|
}
|
|
// reset the data size
|
|
sreq.setDataSize ();
|
|
// . prepare to download it, set it up
|
|
// . returns false and sets g_errno on error
|
|
if ( ! m_rootDoc->set4 ( &sreq ,
|
|
NULL , // doledbkey ptr
|
|
cr->m_coll ,
|
|
NULL , // SafeBuf
|
|
m_niceness )) {
|
|
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( m_rootDoc );
|
|
m_rootDoc = NULL;
|
|
return NULL;
|
|
}
|
|
// do not throttle it!
|
|
//m_rootDoc->m_throttleDownload = false;
|
|
// . do not do robots check for it
|
|
// . no, we must, to avoid triggering a bot trap & getting banned
|
|
//m_rootDoc->m_isAllowed = m_isAllowed;
|
|
//m_rootDoc->m_isAllowedValid = true;
|
|
}
|
|
|
|
// share our masterloop and state!
|
|
m_rootDoc->m_masterLoop = m_masterLoop;
|
|
m_rootDoc->m_masterState = m_masterState;
|
|
|
|
// msg13 caches the pages it downloads
|
|
m_rootDoc->m_maxCacheAge = maxCacheAge;
|
|
|
|
// like m_contactDoc we avoid unnecessary lookups in call to
|
|
// getRedirUrl() by validating these empty members
|
|
m_rootDoc->ptr_catIds = NULL;
|
|
m_rootDoc->size_catIds = 0;
|
|
m_rootDoc->m_catIdsValid = true;
|
|
m_rootDoc->ptr_linkInfo1 = &s_dummy;
|
|
m_rootDoc->size_linkInfo1 = 0;
|
|
m_rootDoc->m_linkInfo1Valid = true;
|
|
m_rootDoc->ptr_linkInfo2 = &s_dummy;
|
|
m_rootDoc->size_linkInfo2 = 0;
|
|
m_rootDoc->m_linkInfo2Valid = true;
|
|
m_rootDoc->m_urlFilterNumValid = true;
|
|
m_rootDoc->m_urlFilterNum = 0;
|
|
// for redirects
|
|
m_rootDoc->m_allowSimplifiedRedirs = true;
|
|
// always forward the http download request so that Msg13.cpp's
|
|
// handleRequest13() can avoid the same root page or contact page
|
|
// from being downloaded at the same time. also, if we are a robots.txt
|
|
// this allows us to use the same cache since we select the host we
|
|
// forward to based on ip address.
|
|
m_rootDoc->m_forwardDownloadRequest = true;
|
|
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
|
|
m_rootDoc->m_isChildDoc = true;
|
|
m_rootDoc->m_parentDocPtr = this;
|
|
|
|
// validate it
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
/*
|
|
// no longer access Revdb to get the old metalist, now re-compute
|
|
RdbList *XmlDoc::getOldMetaList ( ) {
|
|
// if valid return that
|
|
if ( m_oldMetaListValid ) return &m_oldMetaList;
|
|
// update status msg
|
|
setStatus ( "getting old meta list");
|
|
// load the old title rec
|
|
XmlDoc **odp = getOldXmlDoc( );
|
|
if ( ! odp || odp == (XmlDoc **)-1 ) return (RdbList *)odp;
|
|
XmlDoc *od = *odp;
|
|
// empty old doc?
|
|
if ( ! od ) {
|
|
m_oldMetaList.reset();
|
|
m_oldMetaListValid = true;
|
|
return &m_oldMetaList;
|
|
}
|
|
// and use that. it has m_setFromTitleRec set to true.
|
|
char *old = od->getMetaList();
|
|
if ( ! old || old == (void *)-1 ) return (RdbList *)old;
|
|
// set it
|
|
m_oldMetaList.m_list = od->m_metaList; // old;
|
|
m_oldMetaList.m_listSize = od->m_metaListSize;
|
|
m_oldMetaList.m_ownData = false;
|
|
// assign it
|
|
m_oldMetaListValid = true;
|
|
return &m_oldMetaList;
|
|
}
|
|
*/
|
|
|
|
SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
if ( m_setFromDocId ) return &m_timeAxisUrl;
m_timeAxisUrlValid = true;
Url *fu = getFirstUrl();
m_timeAxisUrl.reset();
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
return &m_timeAxisUrl;
}
|
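// Illustrative example of the key produced above (url and hash value are
// hypothetical): with the time axis enabled, titledb lookups use the first
// url with the 32-bit content hash appended,
//
//   first url    : http://example.com/page.html
//   contentHash32: 3405691582
//   time-axis url: http://example.com/page.html.3405691582
//
// so two captures of the same url with different content resolve to
// different probable docids in Msg22.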
|
|
|
// . look up TitleRec using Msg22 if we need to
|
|
// . set our m_titleRec member from titledb
|
|
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
|
// from scratch. this loads it from titledb.
|
|
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
|
char **XmlDoc::getOldTitleRec ( ) {
|
|
// clear if we blocked
|
|
//if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
|
|
// g_errno = EBADTITLEREC;
|
|
// return NULL;
|
|
|
|
// if valid return that
|
|
if ( m_oldTitleRecValid ) return &m_oldTitleRec;
|
|
// update status msg
|
|
setStatus ( "getting old title rec");
|
|
// if we are set from a title rec, we are the old doc
|
|
if ( m_setFromTitleRec ) {
|
|
m_oldTitleRecValid = true;
|
|
m_oldTitleRec = NULL;//m_titleRec;
|
|
return &m_oldTitleRec;
|
|
}
|
|
// sanity check
|
|
if ( m_oldTitleRecValid && m_msg22a.m_outstanding ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// point to url
|
|
//char *u = getCurrentUrl()->getUrl();
|
|
//char *u = getFirstUrl()->getUrl();
|
|
|
|
// assume its valid
|
|
m_oldTitleRecValid = true;
|
|
// add it to the cache?
|
|
bool addToCache = false;
|
|
//if ( maxCacheAge > 0 ) addToCache = true;
|
|
|
|
// not if new! no we need to do this so XmlDoc::getDocId() works!
|
|
// this logic prevents us from setting g_errno to ENOTFOUND
|
|
// when m_msg22a below calls indexDocWrapper(). however,
// doing a query delete on a not-found docid will succumb to
|
|
// the g_errno because m_isIndexed is not valid i think...
|
|
if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
|
|
m_oldTitleRec = NULL;
|
|
m_oldTitleRecValid = true;
|
|
return &m_oldTitleRec;
|
|
}
|
|
// sanity check. if we have no url or docid ...
|
|
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// use docid if first url not valid
|
|
int64_t docId = 0;
|
|
if ( ! m_firstUrlValid ) docId = m_docId;
|
|
// if url not valid, use NULL
|
|
char *u = NULL;
|
|
if ( docId == 0LL && ptr_firstUrl ) u = getFirstUrl()->getUrl();
|
|
// if both are not given that is a problem
|
|
if ( docId == 0LL && ! u ) {
|
|
log("doc: no url or docid provided to get old doc");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if using the time axis then append the content hash (getTimeAxisUrl()) to the end of
|
|
// the url. this way Msg22::getAvailDocId() will return a docid
|
|
// based on that so we don't collide with other instances of this
|
|
// same url.
|
|
if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
|
|
SafeBuf *tau = getTimeAxisUrl();
|
|
u = tau->getBufStart();
|
|
}
|
|
|
|
// the title must be local since we're spidering it
|
|
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
|
|
u ,
|
|
docId , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
&m_oldTitleRec ,
|
|
&m_oldTitleRecSize ,
|
|
false , // just chk tfndb?
|
|
false , // getAvailDocIdOnly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
addToCache , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false ))// load balancing?
|
|
// return -1 if we blocked
|
|
return (char **)-1;
|
|
// not really an error
|
|
if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// got it
|
|
return &m_oldTitleRec;
|
|
}
|
|
|
|
// . look up TitleRec using Msg22 if we need to
|
|
// . set our m_titleRec member from titledb
|
|
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
|
// from scratch. this loads it from titledb.
|
|
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
|
char **XmlDoc::getRootTitleRec ( ) {
|
|
// if valid return that
|
|
if ( m_rootTitleRecValid ) return &m_rootTitleRec;
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
|
|
// if we are root use us!!!!! well, the old us...
|
|
if ( *isRoot ) {
|
|
char **otr = getOldTitleRec ( );
|
|
if ( ! otr || otr == (char **)-1 ) return (char **)otr;
|
|
m_rootTitleRec = m_oldTitleRec;
|
|
m_rootTitleRecSize = m_oldTitleRecSize;
|
|
return &m_rootTitleRec;
|
|
}
|
|
// get our site root
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// make it a url. keep it on stack since msg22 copies it into its
|
|
// url request buffer anyway! (m_msg22Request.m_url[])
|
|
Url site; site.set ( mysite );
|
|
// assume its valid
|
|
m_rootTitleRecValid = true;
|
|
// add it to the cache?
|
|
bool addToCache = false;
|
|
//if ( maxCacheAge > 0 ) addToCache = true;
|
|
// update status msg
|
|
setStatus ( "getting root title rec");
|
|
// the title must be local since we're spidering it
|
|
if ( ! m_msg22b.getTitleRec ( &m_msg22Request ,
|
|
site.getUrl() ,
|
|
0 , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
&m_rootTitleRec ,
|
|
&m_rootTitleRecSize ,
|
|
false , // just chk tfndb?
|
|
false , // getAvailDocIdOnly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
addToCache , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false ))// load balancing?
|
|
// return -1 if we blocked
|
|
return (char **)-1;
|
|
// not really an error
|
|
if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// got it
|
|
return &m_rootTitleRec;
|
|
}
|
|
|
|
/*
|
|
// . look up TitleRec using Msg22 if we need to
|
|
// . set our m_titleRec member from titledb
|
|
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
|
// from scratch. this loads it from titledb.
|
|
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
|
char **XmlDoc::getContactTitleRec ( char *u ) {
|
|
// clear if we blocked
|
|
//if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// if valid return that
|
|
if ( m_contactTitleRecValid ) return &m_contactTitleRec;
|
|
// fake
|
|
static char *s_fake = NULL;
|
|
// if no url, we got no contact title rec in titledb then!
|
|
if ( ! u || u[0] == '\0' ) return &s_fake;
|
|
// update status msg
|
|
setStatus ( "getting contact title rec");
|
|
// assume its valid
|
|
m_contactTitleRecValid = true;
|
|
// add it to the cache?
|
|
bool addToCache = false;
|
|
//if ( maxCacheAge > 0 ) addToCache = true;
|
|
// the title must be local since we're spidering it
|
|
if ( ! m_msg22c.getTitleRec ( &m_msg22Request ,
|
|
u ,
|
|
0 , // probable docid
|
|
m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
&m_contactTitleRec ,
|
|
&m_contactTitleRecSize ,
|
|
false , // just chk tfndb?
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
addToCache , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false ))// load balancing?
|
|
// return -1 if we blocked
|
|
return (char **)-1;
|
|
// not really an error
|
|
if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// got it
|
|
return &m_contactTitleRec;
|
|
}
|
|
*/
|
|
|
|
|
|
// used for indexing spider replies. we need a unique docid because it
|
|
// is treated as a different document even though its url will be the same.
|
|
// and there is never an "older" version of it because each reply is treated
|
|
// as a brand new document.
|
|
int64_t *XmlDoc::getAvailDocIdOnly ( int64_t preferredDocId ) {
|
|
if ( m_availDocIdValid && g_errno ) {
|
|
log("xmldoc: error getting availdocid: %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
if ( m_availDocIdValid )
|
|
// this is 0 or -1 if no avail docid was found
|
|
return &m_msg22c.m_availDocId;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// pre-validate it
|
|
m_availDocIdValid = true;
|
|
if ( ! m_msg22c.getAvailDocIdOnly ( &m_msg22Requestc ,
|
|
preferredDocId ,
|
|
cr->m_coll ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness ) )
|
|
return (int64_t *)-1;
|
|
// error?
|
|
log("xmldoc: error getting availdocid2: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
|
|
int64_t *XmlDoc::getDocId ( ) {
|
|
if ( m_docIdValid ) return &m_docId;
|
|
setStatus ("getting docid");
|
|
XmlDoc **od = getOldXmlDoc( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (int64_t *)od;
|
|
setStatus ("getting docid");
|
|
// . set our docid
|
|
// . *od is NULL if no title rec found with that docid in titledb
|
|
if ( *od ) {
|
|
m_docId = *(*od)->getDocId();
|
|
m_docIdValid = true;
|
|
return &m_docId;
|
|
}
|
|
|
|
m_docId = m_msg22a.getAvailDocId();
|
|
|
|
// if the titlerec was there but "od" is NULL, it had an error uncompressing
// because of the corruption bug in RdbMem.cpp when dumping to disk.
|
|
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
|
|
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
|
|
log("build: salvaged docid %" INT64 " from corrupt title rec "
|
|
"for %s",m_docId,m_firstUrl.m_url);
|
|
}
|
|
|
|
if ( m_docId == 0 ) {
|
|
log("build: docid is 0 for %s",m_firstUrl.m_url);
|
|
g_errno = ENODOCID;
|
|
return NULL;
|
|
}
|
|
|
|
// ensure it is within probable range
|
|
if ( ! getUseTimeAxis () ) {
|
|
char *u = getFirstUrl()->getUrl();
|
|
int64_t pd = g_titledb.getProbableDocId(u);
|
|
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
|
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
|
if ( m_docId < d1 || m_docId > d2 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
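// The probable-docid range check above, as a standalone sketch (same
// g_titledb calls as above; "url" is whatever getFirstUrl() returned):
//
//   int64_t pd = g_titledb.getProbableDocId      ( url ); // hash-derived
//   int64_t d1 = g_titledb.getFirstProbableDocId ( pd  ); // low end of range
//   int64_t d2 = g_titledb.getLastProbableDocId  ( pd  ); // high end of range
//   // any docid we accept for "url" must satisfy d1 <= m_docId <= d2,
//   // otherwise the stored title rec disagrees with the url hash.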
|
|
|
// if docid is zero, none is available!!!
|
|
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
|
|
m_docIdValid = true;
|
|
return &m_docId;
|
|
}
|
|
|
|
// . is our docid on disk? i.e. do we exist in the index already?
|
|
// . TODO: just check tfndb?
|
|
char *XmlDoc::getIsIndexed ( ) {
|
|
if ( m_isIndexedValid ) return &m_isIndexed;
|
|
|
|
setStatus ( "getting is indexed" );
|
|
|
|
// we must be old if this is true
|
|
//if ( m_setFromTitleRec ) {
|
|
// m_isNew = false;
|
|
// m_isNewValid = true;
|
|
// return &m_isNew;
|
|
//}
|
|
// get the url
|
|
//char *u = getFirstUrl()->getUrl();
|
|
|
|
if ( m_oldDocValid ) {
|
|
m_isIndexedValid = true;
|
|
if ( m_oldDoc ) m_isIndexed = true;
|
|
else m_isIndexed = false;
|
|
return &m_isIndexed;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// sanity check. if we have no url or docid ...
|
|
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// use docid if first url not valid
|
|
int64_t docId = 0;
|
|
char *url = NULL;
|
|
// use docid if its valid, otherwise use url
|
|
if ( m_docIdValid ) docId = m_docId;
|
|
else url = ptr_firstUrl;
|
|
|
|
// note it
|
|
if ( ! m_calledMsg22e )
|
|
setStatus ( "checking titledb for old title rec");
|
|
else
|
|
setStatus ( "back from msg22e call");
|
|
|
|
// . consult the title rec tree!
|
|
// . "justCheckTfndb" is set to true here!
|
|
if ( ! m_calledMsg22e &&
|
|
! m_msg22e.getTitleRec ( &m_msg22Request ,
|
|
url ,
|
|
docId , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
NULL , // tr ptr
|
|
NULL , // tr size ptr
|
|
true , // just chk tfndb?
|
|
false, // getavaildocidonly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false )){//load balancing?
|
|
// validate
|
|
m_calledMsg22e = true;
|
|
// return -1 if we blocked
|
|
return (char *)-1;
|
|
}
|
|
// got it
|
|
m_calledMsg22e = true;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// get it
|
|
if ( m_msg22e.m_found ) m_isIndexed = true;
|
|
else m_isIndexed = false;
|
|
|
|
// validate
|
|
m_isIndexedValid = true;
|
|
return &m_isIndexed;
|
|
}
|
|
|
|
void gotTagRecWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// note it
THIS->setStatus ( "in got tag rec wrapper" );
// set these
if ( ! g_errno ) {
THIS->m_tagRec.serialize ( THIS->m_tagRecBuf );
THIS->ptr_tagRecData = THIS->m_tagRecBuf.getBufStart();
THIS->size_tagRecData = THIS->m_tagRecBuf.length();
// validate
THIS->m_tagRecValid = true;
}
// continue
THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
|
|
// if tagrec changed enough so that it would affect what we would index
|
|
// since last time we indexed this doc, we need to know that!
|
|
/*
|
|
int32_t *XmlDoc::getTagHash32 ( ) {
|
|
// make it valid
|
|
if ( m_tagHash32Valid ) return &m_tagHash32;
|
|
// compute it
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// init it
|
|
m_tagHash32 = 0;
|
|
// hash the values of all tags
|
|
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get data
|
|
uint32_t h = hash32(tag->getTagData(),tag->getTagDataSize(),0);
|
|
// skip if 0
|
|
if ( ! h ) continue;
|
|
// xor it up
|
|
m_tagHash32 = hash32h ( h , m_tagHash32 );
|
|
}
|
|
// validate
|
|
m_tagHash32Valid = true;
|
|
return &m_tagHash32;
|
|
}
|
|
*/
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
TagRec *XmlDoc::getTagRec ( ) {
|
|
// if we got it give it
|
|
if ( m_tagRecValid ) return &m_tagRec;
|
|
// do we got a title rec?
|
|
if ( m_setFromTitleRec && m_version >= 118 &&
|
|
// lookup up fresh from tagdb when doing a rebuild so we get
|
|
// the latest sitenuminlinks! nah, we set m_tagRecValid and
|
|
// m_tagRecDataValid to false in Repair.cpp iff rebuilding
|
|
// titledb!! otherwise, we have to use what is in titlerec
|
|
// to avoid parsing inconsistencies that would result in
|
|
// undeletable posdb data.
|
|
//! m_useSecondaryRdbs &&
|
|
// lookup the tagdb rec fresh if setting for a summary. that way
|
|
// we can see if it is banned or not
|
|
m_tagRecDataValid ) {
|
|
// all done
|
|
m_tagRecValid = true;
|
|
// assume null if old version
|
|
//if ( m_version <= 115 ) return &m_tagRec;
|
|
// just return empty otherwise
|
|
m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData );
|
|
return &m_tagRec;
|
|
}
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// get our site, usually the hostname, but can be like
|
|
// "www.last.fm/user/breendaxx/"
|
|
// we can't call this because it CALLS getTagRec()!!!
|
|
//char *mysite = getSite();
|
|
//if ( ! mysite || mysite == (char *)-1 ) return (TagRec *)mysite;
|
|
// update status msg
|
|
setStatus ( "getting tagdb record" );
|
|
// get the final redirected url
|
|
//Url *u = getCurrentUrl();
|
|
// nah, try this
|
|
Url *u = getFirstUrl();
|
|
// if we are docid based url this might block!
|
|
//if ( ! u || u == (void *)-1 ) return (TagRec *)u;
|
|
// good to go
|
|
//m_oldTagRecValid = true;
|
|
// get it, user our collection for lookups, not m_tagdbColl[] yet!
|
|
if ( ! m_msg8a.getTagRec ( u ,
|
|
// we have to guess the site because
|
|
// we can't hit tagdb to get it at this
|
|
// point!!!
|
|
NULL, // guess it! // mysite ,
|
|
cr->m_collnum ,
|
|
false, // skip domain lookup? // true
|
|
m_niceness ,
|
|
this ,
|
|
gotTagRecWrapper ,
|
|
&m_tagRec ) )
|
|
// we blocked, return -1
|
|
return (TagRec *)-1;
|
|
// error? ENOCOLLREC?
|
|
if ( g_errno ) return NULL;
|
|
// assign it
|
|
m_tagRec.serialize ( m_tagRecBuf );
|
|
ptr_tagRecData = m_tagRecBuf.getBufStart();
|
|
size_tagRecData = m_tagRecBuf.length();
|
|
// validate
|
|
m_tagRecValid = true;
|
|
// our tag rec should be all valid now
|
|
return &m_tagRec;
|
|
}
|
|
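// The serialize/deserialize round trip used by getTagRec() and
// gotTagRecWrapper() above, as a sketch:
//
//   m_tagRec.serialize ( m_tagRecBuf );                  // TagRec -> flat buf
//   ptr_tagRecData  = m_tagRecBuf.getBufStart();         // stored w/ titlerec
//   size_tagRecData = m_tagRecBuf.length();
//   ...
//   m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData ); // flat->TagRec
//
// which is why a doc set from a title rec (m_setFromTitleRec) can rebuild
// its TagRec without another tagdb lookup.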
|
|
|
|
|
|
|
|
// this is only for purposes of setting the site's TagRec
|
|
char *XmlDoc::getHasContactInfo ( ) {
|
|
|
|
if ( m_hasContactInfoValid ) return &m_hasContactInfo2;
|
|
|
|
setStatus ( "getting has contact info" );
|
|
|
|
// get it from the tag rec if we can
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
|
|
|
|
char *ic = getIsThisDocContacty ( );
|
|
if ( ! ic || ic == (void *)-1 ) return (char *)ic;
|
|
|
|
// the current top ip address
|
|
//int32_t *ip = getIp();
|
|
//if ( ! ip || ip == (int32_t *)-1) return (char *)ip;
|
|
//int32_t top = *ip & 0x00ffffff;
|
|
|
|
// and should have a contact page tag
|
|
Tag *tag = gr->getTag ("hascontactinfo");
|
|
|
|
if ( tag ) m_hasContactInfo = true;
|
|
else m_hasContactInfo = false;
|
|
|
|
m_hasContactInfo2 = m_hasContactInfo;
|
|
|
|
// are we a "contact" link? i.e. about us, etc. that would contain
|
|
// the physical address of the entity responsible for this website
|
|
//bool isContacty = getIsContacty( fu ,
|
|
// info1 ,
|
|
// hops ,
|
|
// *ct ,
|
|
// *isRoot ,
|
|
// m_niceness );
|
|
|
|
// bail early if not a candidate for contact info
|
|
if ( ! *ic ) { // check ) {
|
|
m_hasContactInfoValid = true;
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
//
|
|
// TODO: did IP change?? invalidate it???
|
|
//
|
|
|
|
// set status. we can time status changes with this routine!
|
|
setStatus ( "getting contact info on just this page" );
|
|
|
|
int32_t *nca = getNumContactAddresses();
|
|
if ( ! nca || nca == (void *)-1 ) return (char *)nca;
|
|
|
|
// did we have a contact address?
|
|
if ( *nca ) {
|
|
m_hasContactInfo = true;
|
|
m_hasContactInfo2 = true;
|
|
m_hasContactInfoValid = true;
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
// get the email addresses
|
|
int32_t *numOfficial = getNumOfficialEmails ( );
|
|
if ( ! numOfficial || numOfficial == (void *)-1)
|
|
return (char *)numOfficial;
|
|
|
|
// did we get some?
|
|
if ( *numOfficial > 0 ) {
|
|
m_hasContactInfo = true;
|
|
m_hasContactInfo2 = true;
|
|
m_hasContactInfoValid = true;
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
// this should set m_hasContactInfo as well as m_contact*[] arrays
|
|
//TagRec *pcitr = getContactInfoTagRec ();
|
|
//if ( ! pcitr || pcitr == (void *)-1 ) return (char *)pcitr;
|
|
|
|
// do not re-peat the above now
|
|
m_hasContactInfoValid = true;
|
|
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
// returns "type" of contact link, > 0
|
|
int32_t getIsContacty ( Url *url ,
|
|
LinkInfo *info1 ,
|
|
int32_t hops ,
|
|
uint8_t ct ,
|
|
bool isRoot ,
|
|
int32_t niceness ) {
|
|
|
|
static int64_t h_home ;
|
|
static int64_t h_site ;
|
|
static int64_t h_map ;
|
|
static int64_t h_sitemap ;
|
|
static int64_t h_contact ;
|
|
static int64_t h_about ;
|
|
static int64_t h_privacy ;
|
|
static int64_t h_policy ;
|
|
static int64_t h_statement ;
|
|
static int64_t h_terms ;
|
|
static int64_t h_of ;
|
|
static int64_t h_and ;
|
|
static int64_t h_service ;
|
|
static int64_t h_conditions ;
|
|
static int64_t h_use ;
|
|
static int64_t h_us ;
|
|
static int64_t h_help ;
|
|
static int64_t h_location ;
|
|
static int64_t h_faq ;
|
|
static int64_t h_faqs ;
|
|
static int64_t h_customer ;
|
|
static int64_t h_support ;
|
|
static int64_t h_advertise ;
|
|
static int64_t h_inquiry ;
|
|
static int64_t h_inquiries ;
|
|
static int64_t h_feedback ;
|
|
static int64_t h_company ;
|
|
static int64_t h_corporate ;
|
|
|
|
static bool s_inith = false;
|
|
if ( ! s_inith ) {
|
|
s_inith = true;
|
|
h_home = hash64n ("home");
|
|
h_site = hash64n ("site");
|
|
h_map = hash64n ("map");
|
|
h_sitemap = hash64n ("sitemap");
|
|
h_contact = hash64n ("contact");
|
|
h_about = hash64n ("about");
|
|
h_privacy = hash64n ("privacy");
|
|
h_policy = hash64n ("policy");
|
|
h_statement = hash64n ("statement");
|
|
h_terms = hash64n ("terms");
|
|
h_of = hash64n ("of");
|
|
h_and = hash64n ("and");
|
|
h_service = hash64n ("service");
|
|
h_conditions = hash64n ("conditions");
|
|
h_use = hash64n ("use");
|
|
h_us = hash64n ("us");
|
|
h_help = hash64n ("help");
|
|
h_location = hash64n ("location");
|
|
h_faq = hash64n ("faq");
|
|
h_faqs = hash64n ("faqs");
|
|
h_customer = hash64n ("customer");
|
|
h_support = hash64n ("support");
|
|
h_advertise = hash64n ("advertise");
|
|
h_inquiry = hash64n ("inquiry");
|
|
h_inquiries = hash64n ("inquiries");
|
|
h_feedback = hash64n ("feedback");
|
|
h_company = hash64n ("company");
|
|
h_corporate = hash64n ("corporate");
|
|
}
|
|
|
|
int32_t check = 0;
|
|
// loop over the link texts we got
|
|
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
|
|
// never do anything if hop count >= 3
|
|
if ( hops >= 3 ) break;
|
|
// javascript must be hopcount 1 only
|
|
if ( ct == CT_JS && hops != 1 ) break;
|
|
// is this inlinker internal?
|
|
//bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// skip if not local to site
|
|
//if ( ! internal ) continue;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// assume utf-8. so do a utf-8 sanity check so it doesn't
|
|
// break Words::countWords() by thinking a character is
|
|
// 2+ bytes and breaching the buffer
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 1 from url=%s for %s",
|
|
k->getUrl(),url->m_url);
|
|
continue;
|
|
}
|
|
// convert into words i guess
|
|
Words ww;
|
|
// . TODO: use alt text if only an image in the link!!!!!
|
|
// . return -1 if it fails with g_errno set
|
|
if ( ! ww.setx ( txt , tlen , niceness) ) return (char)-1;
|
|
// shortcut
|
|
int32_t nw = ww.getNumWords();
|
|
// skip if too big
|
|
if ( nw >= 30 ) continue;
|
|
// shortcut
|
|
int64_t *wids = ww.getWordIds();
|
|
// reset alnumcount
|
|
int32_t count = 0;
|
|
// loop over its words
|
|
for ( int32_t j = 0 ; j < nw && ! check ; j++ ) {
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) continue;
|
|
// keep track of alnum word position
|
|
count++;
|
|
// "contact..." only good from root or root kid
|
|
if ( wids[j] == h_contact && hops >= 1 && count == 1 )
|
|
check = 1;
|
|
// "about..." only good from root or root kid
|
|
if ( wids[j] == h_about && hops >= 1 && count == 1 )
|
|
check = 2;
|
|
// "...privacy policy..."
|
|
if ( wids[j ] == h_privacy && j+2<nw &&
|
|
wids[j+2] == h_policy )
|
|
check = 3;
|
|
// "...privacy statement..."
|
|
if ( wids[j ] == h_privacy && j+2<nw &&
|
|
wids[j+2] == h_statement )
|
|
check = 4;
|
|
// "...terms of service..."
|
|
if ( wids[j ] == h_terms && j+4<nw &&
|
|
wids[j+2] == h_of &&
|
|
wids[j+4] == h_service )
|
|
check = 5;
|
|
// "...terms of use..."
|
|
if ( wids[j ] == h_terms && j+4<nw &&
|
|
wids[j+2] == h_of &&
|
|
wids[j+4] == h_use )
|
|
check = 6;
|
|
// "... terms & conditions ..."
|
|
if ( wids[j ] == h_terms && j+2<nw &&
|
|
wids[j+2] == h_conditions )
|
|
check = 7;
|
|
// "... terms and conditions ..."
|
|
if ( wids[j ] == h_terms && j+4<nw &&
|
|
wids[j+2] == h_and &&
|
|
wids[j+4] == h_conditions )
|
|
check = 8;
|
|
// "...site map ..."
|
|
if ( wids[j] == h_site && j+2<nw &&
|
|
wids[j+2] == h_map )
|
|
check = 9;
|
|
// "...about us..."
|
|
if ( wids[j] == h_about && j+2<nw &&
|
|
wids[j+2] == h_us )
|
|
check = 10;
|
|
// "...contact us..."
|
|
if ( wids[j] == h_contact && j+2<nw &&
|
|
wids[j+2] == h_us)
|
|
check = 11;
|
|
// "help..."
|
|
if ( wids[j] == h_help && count == 1 )
|
|
check = 12;
|
|
// "faq..."
|
|
if ( wids[j] == h_faq && count == 1 )
|
|
check = 13;
|
|
// "faqs..."
|
|
if ( wids[j] == h_faqs && count == 1 )
|
|
check = 14;
|
|
// "...customer support..."
|
|
if ( wids[j] == h_customer && j+2<nw &&
|
|
wids[j+2] == h_support )
|
|
check = 15;
|
|
// "advertise..."
|
|
if ( wids[j] == h_advertise && count == 1)
|
|
check = 16;
|
|
// "...inquiry..."
|
|
if ( wids[j] == h_inquiry )
|
|
check = 17;
|
|
// "...inquiries..."
|
|
if ( wids[j] == h_inquiries )
|
|
check = 18;
|
|
// one word only below here
|
|
if ( ww.getNumAlnumWords() != 1 ) continue;
|
|
if ( wids[j] == h_about ) check = 2;
|
|
if ( wids[j] == h_home ) check = 19;
|
|
if ( wids[j] == h_support ) check = 20;
|
|
if ( wids[j] == h_advertise ) check = 21;
|
|
if ( wids[j] == h_help ) check = 22;
|
|
if ( wids[j] == h_faq ) check = 23;
|
|
if ( wids[j] == h_faqs ) check = 24;
|
|
if ( wids[j] == h_contact ) check = 25;
|
|
if ( wids[j] == h_feedback ) check = 26;
|
|
if ( wids[j] == h_sitemap ) check = 27;
|
|
if ( wids[j] == h_company ) check = 28;
|
|
if ( wids[j] == h_corporate ) check = 29;
|
|
if ( wids[j] == h_privacy ) check = 30;
|
|
if ( wids[j] == h_terms ) check = 31;
|
|
// "location" fixes guildcinema.com
|
|
if ( wids[j] == h_location && isRoot ) check = 32;
|
|
}
|
|
}
|
|
|
|
|
|
// check for certain things in the url path that would indicate that
|
|
// this is a contact info page
|
|
//char *path = m_firstUrl.getPath();
|
|
char *path = url->getPath();
|
|
if ( gb_strcasestr(path,"contact" ) ) { check += 33; check *= 90; }
|
|
if ( gb_strcasestr(path,"/about" ) ) { check += 34; check *= 91; }
|
|
if ( gb_strcasestr(path,"/feedback") ) { check += 35; check *= 92; }
|
|
if ( gb_strcasestr(path,"/help" ) ) { check += 36; check *= 93; }
|
|
if ( gb_strcasestr(path,"/faq" ) ) { check += 37; check *= 94; }
|
|
if ( gb_strcasestr(path,"advertise") ) { check += 38; check *= 95; }
|
|
if ( gb_strcasestr(path,"inquir" ) ) { check += 39; check *= 96; }
|
|
|
|
return check;
|
|
}
|
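// Worked example of the "check" encoding above (the page is hypothetical):
// for a root-kid page (hops == 1) whose single inlink text is "Contact Us"
// (check = 11) and whose url path contains "contact",
//
//   check = (11 + 33) * 90 = 3960
//
// so any non-zero return means "contacty", and the magnitude loosely encodes
// which link-text and url-path signals fired.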
|
|
|
char *XmlDoc::getIsThisDocContacty() {
if ( m_isContactyValid ) return &m_isContacty;
setStatus ( "getting is contacty" );
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
int8_t *hc = getHopCount();
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
// get the first url
Url *fu = getFirstUrl();
// shortcut
int32_t hops = *hc;
// check it
m_isContacty = getIsContacty ( fu ,
info1 ,
hops ,
*ct ,
*isRoot ,
m_niceness );
m_isContactyValid = true;
return &m_isContacty;
}
|
|
|
|
|
|
int32_t *XmlDoc::getNumContactAddresses ( ) {
// process
Address **ca = getContactAddresses();
if ( ! ca || ca == (void *)-1 ) return (int32_t *)ca;
// now we are valid
return &m_numContactAddresses;
}
|
|
|
|
|
|
Address **XmlDoc::getContactAddresses ( ) {
|
|
// assume none
|
|
if ( m_contactAddressesValid ) return m_contactAddresses;
|
|
// need this of course
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (void *)-1 ) return (Address **)aa;
|
|
// assume none
|
|
m_contactAddressesValid = true;
|
|
m_numContactAddresses = 0;
|
|
// not if not contacty. we gotta be a url like ".../contact.asp"
|
|
char *ic = getIsThisDocContacty ( );
|
|
if ( ! ic || ic == (void *)-1 ) return (Address **)ic;
|
|
// if not of a contact-url form, return none
|
|
if ( ! *ic )
|
|
return m_contactAddresses;
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (Address **)isRoot;
|
|
// do not do this for root if multiple addresses. this
|
|
// fixes http://obits.abqjournal.com/
|
|
if ( *isRoot && aa->m_uniqueStreetHashes > 1 )
|
|
return m_contactAddresses;
|
|
// reset count
|
|
int32_t nca = 0;
|
|
// number of addresses in this doc
|
|
int32_t na = aa->m_am.getNumPtrs();
|
|
// add all addresses then???
|
|
for ( int32_t i = 0 ; i < na ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *ai = (Address *)aa->m_am.getPtr(i);
|
|
// do not add this to tagdb if not inlined!
|
|
if ( ! ( ai->m_flags & AF_INLINED ) ) continue;
|
|
// store it
|
|
m_contactAddresses[nca++] = ai;
|
|
// stop before breach
|
|
if ( nca >= MAX_CONTACT_ADDRESSES ) break;
|
|
}
|
|
// update count
|
|
m_numContactAddresses = nca;
|
|
return m_contactAddresses;
|
|
}
|
|
|
|
int32_t *XmlDoc::getNumOfficialEmails ( ) {
char *eb = getEmailBuf();
if ( ! eb || eb == (void *)-1 ) return (int32_t *)eb;
return &m_numOfficialEmails;
}
|
|
|
|
// . add email addresses to tag rec
|
|
// . add up to 3 of same domain and different domain addresses
|
|
// . return # of *official* contact infos added to tag rec
|
|
// . this now includes submission forms!
|
|
// . returns -1 and sets g_errno on error
|
|
char *XmlDoc::getEmailBuf ( ) {
|
|
|
|
if ( m_emailBufValid ) return m_emailBuf;
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
|
|
|
|
// count # of official contacts we got
|
|
int32_t official = 0;
|
|
|
|
// shortcuts
|
|
int64_t *wids = ww->m_wordIds;
|
|
char **wptrs = ww->m_words;
|
|
int32_t *wlens = ww->m_wordLens;
|
|
nodeid_t *tids = ww->m_tagIds;
|
|
int32_t nw = ww->getNumWords();
|
|
|
|
// get our url
|
|
Url *f = getFirstUrl();
|
|
// get its domain len
|
|
char *myDom = f->getMidDomain();
|
|
int32_t myDomLen = f->getMidDomainLen();
|
|
|
|
|
|
// point here
|
|
char *eptr = m_emailBuf;
|
|
char *emax = m_emailBuf + EMAILBUFSIZE;
|
|
|
|
m_emailBufValid = true;
|
|
|
|
// reset
|
|
*eptr = '\0';
|
|
|
|
//
|
|
// ADD EMAIL ADDRESSES
|
|
//
|
|
|
|
// count how many we find
|
|
int32_t ne = 0;
|
|
// loop over all the words
|
|
for ( int32_t i = 1 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// . email address? look for the '@'
|
|
// . might also have <img src="at.gif"> (bot proof)
|
|
if ( wptrs[i][0] != '@' && tids[i] != TAG_IMG ) continue;
|
|
// . make sure any image has an "/at." in it!
|
|
// . "mail<img src="/common/images/at.gif">pipl.com"
|
|
if(tids[i]==TAG_IMG&&!gb_strncasestr(wptrs[i],wlens[i],"/at."))
|
|
continue;
|
|
// must be a single char
|
|
if ( ! tids[i] && wlens[i] != 1 ) continue;
|
|
// if i was the last word, give up!
|
|
if ( i + 1 >= nw ) break;
|
|
// back up i until we hit a non-email char
|
|
int32_t a ;
|
|
for ( a = i ; a - 1 > 0 ; a-- ) {
|
|
if (wids [a-1] ) continue;
|
|
if (wptrs[a-1][0]=='.'&&wlens[a-1]==1)continue;
|
|
if (wptrs[a-1][0]=='-'&&wlens[a-1]==1)continue;
|
|
break;
|
|
}
|
|
// must not start with '.'
|
|
if ( wptrs[a][0]=='.' ) a++;
|
|
// now get the end of it
|
|
int32_t b;
|
|
int32_t periodCount = 0;
|
|
for ( b = i ; b+1 < nw ; b++ ) {
|
|
if (wids[b+1]) continue;
|
|
// only punct we allow is a single period
|
|
if ( wptrs[b+1][0]!='.' ) break;
|
|
if ( wlens[b+1] != 1 ) break;
|
|
periodCount++;
|
|
}
|
|
// must have at least one!
|
|
if ( ! periodCount ) continue;
|
|
// must not end on '.'
|
|
if ( wptrs[b][0]=='.') b--;
|
|
// hostname must have a valid tld
|
|
char *host = wptrs[i+1];
|
|
char *hend = wptrs[b]+wlens[b];
|
|
// temp null term
|
|
char c = *hend;
|
|
*hend = '\0';
|
|
int32_t tldLen ; char *tld = getTLDFast ( host, &tldLen , false );
|
|
// ignore the rest of this line for addresses even
|
|
// if tld is bogus
|
|
//ignoreLine = true;
|
|
// must have a legit tld!
|
|
if ( ! tld ) { *hend = c; continue; }
|
|
// if not from our same domain, use "emailaddressoffsite"
|
|
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
|
|
// use mid domain. subtract '.'
|
|
//int32_t midlen = tld - dom - 1;
|
|
// undo the temp NULL thing
|
|
*hend = c;
|
|
if ( ! dom ) continue;
|
|
|
|
// include last word
|
|
b++;
|
|
// normal buffer
|
|
char buf[100];
|
|
char *p = buf;
|
|
char *pend = buf + 100;
|
|
// normalize it
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// include the at sign
|
|
if ( j == i ) {*p++ = '@'; continue;}
|
|
// skip tags
|
|
if ( tids[j] ) continue;
|
|
// skip punct
|
|
if ( ! wids[j] ) {*p++ ='.'; continue;}
|
|
// ensure minimal space
|
|
if ( p + wlens[j] + 1 >= pend ) break;
|
|
// write out wids
|
|
gbmemcpy ( p , wptrs[j] , wlens[j] );
|
|
p += wlens[j];
|
|
}
|
|
// NULL term it
|
|
*p = '\0';
|
|
|
|
// do we match domains?
|
|
//char *tn = "emailaddressoffsite";
|
|
// use this if we match domains
|
|
//if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
|
|
// tn = "emailaddressonsite";
|
|
// // this is an official contact method
|
|
// //official++;
|
|
//}
|
|
// we now count even offsite email addresses as official
|
|
// for addresses like @gmail.com etc. because we are now
|
|
// only checking "contact us" and "about us" and root pages,
|
|
// so they should never be email addresses of commenters.
|
|
// and often bloggers have external email addresses.
|
|
// http://www.christinesaari.com/html/about.php?psi=44
|
|
official++;
|
|
// store it
|
|
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
|
|
// return -1;
|
|
int32_t blen = gbstrlen(buf);
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// limit it
|
|
if ( ++ne >= 3 ) break;
|
|
}
|
|
|
|
//
|
|
// ADD BOT-PROOF EMAIL ADDRESSES (bot proof)
|
|
//
|
|
// super dot john at xyz dot com
|
|
//
|
|
|
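// Illustrative input for this pass (the text is hypothetical): the words
//
//   "john dot smith at example dot com"
//
// are normalized below into "john.smith@example.com" by mapping the "at"
// word to '@' and each "dot" word to '.', and the address is kept only if
// the trailing word ("com") passes isTLD().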
|
int64_t h_at = hash64Lower_utf8("at");
|
|
int64_t h_dot = hash64Lower_utf8("dot");
|
|
// loop over all the words
|
|
for ( int32_t i = 1 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// email address? look for the " at "
|
|
if ( wids[i] != h_at ) continue;
|
|
// front name word count
|
|
int32_t nameCount = 0;
|
|
// back up i until we hit a non-email word
|
|
int32_t a ;
|
|
// do a loop
|
|
for ( a = i - 1 ; a > 0 ; ) {
|
|
// need a space/punct word
|
|
if ( wids[a] ) break;
|
|
if ( tids[a] ) break;
|
|
// skip it
|
|
a--;
|
|
// then need the "john" part
|
|
if ( ! wids[a] ) break;
|
|
if ( tids[a] ) break;
|
|
if ( wids[a] == h_dot ) break; // "dot" is bad
|
|
// count account name part
|
|
nameCount++;
|
|
// go back if like "mike dot smith"
|
|
if ( a - 4 >= 0 &&
|
|
! tids[a-1] &&
|
|
wids [a-2] == h_dot &&
|
|
! tids[a-3] &&
|
|
wids [a-4] != h_dot &&
|
|
wids [a-4] != h_at )
|
|
a -= 4;
|
|
// that is good enough
|
|
break;
|
|
}
|
|
// need a name at least one
|
|
if ( nameCount <= 0 ) continue;
|
|
// skip over that space/punct word
|
|
//a--;
|
|
// now must be regular word before that
|
|
//if ( tids[a-1] ) continue;
|
|
//if ( ! wids[a-1] ) continue;
|
|
// we got it
|
|
//a--;
|
|
// now get the end of it
|
|
int32_t b ;
|
|
// count the dots
|
|
int32_t dotCount = 0;
|
|
// make sure last word is a legit tld
|
|
int32_t tldLen = 0; char *tld = NULL;
|
|
// do a loop
|
|
for ( b = i + 1 ; b + 3 < nw ; b++ ) {
|
|
// need a space/punct word
|
|
if ( wids[b] ) break;
|
|
if ( tids[b] ) break;
|
|
// skip it
|
|
b++;
|
|
// then need the "xyz" part
|
|
if ( ! wids[b] ) break;
|
|
if ( tids[b] ) break;
|
|
if ( wids[b] == h_dot ) break; // "dot" is bad
|
|
// remember it for tld detection
|
|
tld = wptrs[b];
|
|
tldLen = wlens[b];
|
|
// skip it
|
|
b++;
|
|
// need another space/punct word
|
|
if ( wids[b] ) break;
|
|
if ( tids[b] ) break;
|
|
// skip it
|
|
b++;
|
|
// now we need a "dot"
|
|
if ( wids[b] != h_dot ) break;
|
|
// count the dots
|
|
dotCount++;
|
|
}
|
|
// need at least one "dot"
|
|
if ( dotCount < 1 ) continue;
|
|
// not too many!
|
|
if ( dotCount > 5 ) continue;
|
|
// must have legit tld
|
|
if ( tld && ! isTLD ( tld , tldLen ) ) continue;
|
|
// normal buffer
|
|
char buf[100];
|
|
char *p = buf;
|
|
char *pend = buf + 100;
|
|
// normalize it
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// skip tags
|
|
if ( tids[j] ) continue;
|
|
// skip punct
|
|
if ( ! wids[j] ) continue;
|
|
// ensure minimal space
|
|
if ( p + wlens[j] + 1 >= pend ) break;
|
|
// write out wids
|
|
if ( wids[j] == h_at ) {*p++ = '@'; continue;}
|
|
if ( wids[j] == h_dot ) {*p++ = '.'; continue;}
|
|
gbmemcpy ( p , wptrs[j] , wlens[j] );
|
|
p += wlens[j];
|
|
}
|
|
// NULL term it
|
|
*p = '\0';
|
|
// get the host
|
|
char *host = buf ; // wptrs[i+1]; ?? is this right?
|
|
// if not from our same domain, use "emailaddressoffsite"
|
|
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
|
|
if ( ! dom ) continue;
|
|
// use mid domain
|
|
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
|
|
// limit domain by that. subtract '.'
|
|
int32_t midlen = tld3 - dom - 1;
|
|
// do we match domains?
|
|
char *tn = "emailaddressoffsite";
|
|
// use this if we match domains
|
|
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
|
|
tn = "emailaddressonsite";
|
|
// this is an official contact method
|
|
//official++;
|
|
}
|
|
// we now count even offsite email addresses as official
|
|
// for addresses like @gmail.com etc. because we are now
|
|
// only checking "contact us" and "about us" and root pages,
|
|
// so they should never be email addresses of commenters
|
|
// and often bloggers have external email addresses.
|
|
// http://www.christinesaari.com/html/about.php?psi=44
|
|
official++;
|
|
// store that
|
|
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
|
|
// return -1;
|
|
int32_t blen = gbstrlen(buf);
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// limit it
|
|
if ( ++ne >= 3 ) break;
|
|
}
|
|
|
|
//
|
|
// ADD EMAIL ADDRESSES IN MAILTO TAGS
|
|
//
|
|
// <a href=mailto:steve@xyz.com>
|
|
// <a href=mailto:"steve at xyz dot com">
|
|
// now we check char by char since a website had it in the javascript:
|
|
// http://www.botanique.com/bincgi/stateprov.CFM?state=NM
|
|
//
|
|
char *m = xml->m_xml;
|
|
char *mend = m + xml->m_xmlLen - 4;
|
|
// empty?
|
|
if ( ! m ) mend = m;
|
|
// scan
|
|
for ( ; ; m++ ) {
|
|
// breach?
|
|
if ( m >= mend ) break;
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not possible mailto:
|
|
if ( *m != 'm' && *m !='M' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'a' && *m !='A' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'i' && *m !='I' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'l' && *m !='L' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 't' && *m !='T' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'o' && *m !='O' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != ':' ) continue;
|
|
// skip
|
|
m++;
|
|
// set end
|
|
char *mend = m + 100;
|
|
// skip over the mailto:
|
|
//m += 7;
|
|
// that is the start of the email address then
|
|
char *start = m;
|
|
// skip til '@'
|
|
for ( ; *m && m < mend && *m != '@' ; m++ ) {
|
|
// but give up if we hit a non-email name char
|
|
if ( is_alnum_a(*m) ) continue;
|
|
if ( *m == '.' ) continue;
|
|
if ( *m == '-' ) continue;
|
|
break;
|
|
}
|
|
// bad if no @
|
|
if ( *m != '@' ) continue;
|
|
// skip the @
|
|
m++;
|
|
// . skip until alnum
|
|
// . fix parsing of "dsquires@ unimelb.edu.au" for
|
|
// http://www.marcom1.unimelb.edu.au/public/contact.html
|
|
for (;*m && is_wspace_utf8(m); m+=getUtf8CharSize(m) );
|
|
// get the host
|
|
char *host = m;
|
|
// skip till end of hostname
|
|
for (;*m && m<mend && (is_alnum_a(*m)||*m=='.'||*m=='-');m++ );
|
|
// null term
|
|
char c = *m; *m = '\0';
|
|
// if not from our same domain, use "emailaddressoffsite"
|
|
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
|
|
// skip if no valid domain
|
|
if ( ! dom ) { *m = c; continue; }
|
|
// use mid domain
|
|
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
|
|
// limit domain by that. subtract '.'
|
|
int32_t midlen = tld3 - dom - 1;
|
|
// put it back
|
|
*m = c;
|
|
// point "end" to end of the email address
|
|
char *end = dom + dlen;
|
|
// do we match domains?
|
|
char *tn = "emailaddressoffsite";
|
|
// use this if we match domains
|
|
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
|
|
tn = "emailaddressonsite";
|
|
// this is an official contact method
|
|
//official++;
|
|
}
|
|
// we now count even offsite email addresses as official
|
|
// for addresses like @gmail.com etc. because we are now
|
|
// only checking "contact us" and "about us" and root pages,
|
|
// so they should never be email addresses of commenters
|
|
// and often bloggers have external email addresses.
|
|
// http://www.christinesaari.com/html/about.php?psi=44
|
|
official++;
|
|
// store that
|
|
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,start,end-start) )
|
|
// return -1;
|
|
// cast it
|
|
char *buf = start;
|
|
int32_t blen = end - start;
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// limit it
|
|
if ( ++ne >= 3 ) break;
|
|
}
|
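// Example of what the mailto scan above captures (address taken from the
// comment at the top of this block): for markup like
//
//   <a href="mailto:steve@xyz.com">email us</a>
//
// "start" points at the 's' in "steve", the '@' is located, the host
// "xyz.com" yields a valid domain, and "steve@xyz.com" is appended
// comma-separated to m_emailBuf, subject to the shared "ne" limit.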
|
|
|
|
|
//
|
|
// ADD CONTACT FORM
|
|
//
|
|
|
|
bool gotEmailBox = false;
|
|
bool storedForm = false;
|
|
int32_t emailPos = -1;
|
|
int32_t alnumCount = 0;
|
|
// quick compares
|
|
int64_t he1 = hash64Lower_utf8 ( "email");
|
|
int64_t he2 = hash64Lower_utf8 ( "mail");
|
|
// loop over all words again
|
|
for ( int32_t i = 1 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get tag id if any
|
|
int32_t tid = tids[i] & BACKBITCOMP;
|
|
// . do we have a submit form?
|
|
// . first, do we have a text box for the sender's email?
|
|
if ( tid == TAG_INPUT ) {
|
|
int32_t ttlen;
|
|
// careful: i is not a node #, it is a word #, so map it to a node
|
|
int32_t nn = ww->m_nodes[i];
|
|
// must be valid
|
|
char *tt = xml->getString(nn,"type",&ttlen);
|
|
if ( ! tt || ttlen <= 0 ) continue;
|
|
// must be of type text
|
|
if ( strncasecmp(tt,"text",4) ) continue;
|
|
// might have "email" or "e-mail" in the value
|
|
int32_t vlen;
|
|
char *val = xml->getString(nn,"value",&vlen);
|
|
// check that
|
|
if ( val ) {
|
|
if ( gb_strncasestr(val,vlen,"email") ||
|
|
gb_strncasestr(val,vlen,"e-mail") )
|
|
// flag it good
|
|
gotEmailBox = true;
|
|
}
|
|
// must have the word "email" or "e-mail" within
|
|
// a few words right before it!
|
|
if ( emailPos == -1 ) continue;
|
|
//if ( i - emailPos >= 7 ) continue;
|
|
if ( alnumCount > 7 ) continue;
|
|
// flag it
|
|
gotEmailBox = true;
|
|
}
|
|
// text area? must happen AFTER the email address box
|
|
if ( tid == TAG_TEXTAREA && gotEmailBox ) {
|
|
// must have had the form before us
|
|
// do not double store into tagdb rec
|
|
if ( storedForm ) continue;
|
|
// store this bad boy into the tagdb rec
|
|
//if ( ! gr->addTag("hascontactform",
|
|
// timestamp,
|
|
// "xmldoc",
|
|
// ip,
|
|
// "1" ,
|
|
// 1 ) )
|
|
// return -1;
|
|
// copy it
|
|
char *buf = "hascontactform";
|
|
int32_t blen = gbstrlen(buf);
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// do not double store
|
|
storedForm = true;
|
|
// this is an official contact method
|
|
official++;
|
|
// another contact method
|
|
ne++;
|
|
// that's enough!
|
|
break;
|
|
}
|
|
// alnum counter
|
|
if ( wids[i] ) alnumCount++;
|
|
// special counter
|
|
if ( wids[i] == he1 || wids[i] == he2 ) {
|
|
// mark it
|
|
emailPos = i;
|
|
// reset counter
|
|
alnumCount = 0;
|
|
}
|
|
}
|
|
|
|
// null term
|
|
*eptr = '\0';
|
|
|
|
m_numOfficialEmails = official;
|
|
|
|
// i guess that is it
|
|
return m_emailBuf;
|
|
}
|
|
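// Example of the contact-form heuristic in getEmailBuf() above (markup is
// hypothetical):
//
//   Email: <input type="text" name="from">
//   <textarea name="message"></textarea>
//
// the word "email" shortly before an <input type=text> sets gotEmailBox,
// and a later <textarea> then appends the literal token "hascontactform"
// to m_emailBuf and counts as one official contact method.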
|
|
// returns vector 1-1 with Words.m_words[] array
|
|
/*
|
|
Spam *XmlDoc::getSpam ( ) {
|
|
if ( m_spamValid ) return &m_spam;
|
|
// set it
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Spam *)ww;
|
|
Bits *bits = getBits ();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (Spam *)bits;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (Spam *)sni;
|
|
// if more than X% ("thresh") of words are spammed to some degree,
|
|
// index all words with a minimum score
|
|
int32_t thresh = 6;
|
|
if ( *sni > 10 ) thresh = 8;
|
|
if ( *sni > 30 ) thresh = 10;
|
|
if ( *sni > 100 ) thresh = 20;
|
|
if ( *sni > 500 ) thresh = 30;
|
|
//int64_t x[] = {30,40,50,70,90};
|
|
//int64_t y[] = {6,8,10,20,30};
|
|
//int32_t spamThresh = getY ( m_docQuality , x , y , 5 );
|
|
if ( ! m_spam.set ( ww ,
|
|
bits ,
|
|
m_version ,
|
|
thresh ,
|
|
20 ,
|
|
m_niceness ))
|
|
return NULL;
|
|
m_spamValid = true;
|
|
return &m_spam;
|
|
}
|
|
*/
|
|
|
|
// this means any time-of-day (TOD) now
bool *XmlDoc::getHasTOD ( ) {
if ( m_hasTODValid ) return &m_hasTOD2;
// scan the dates
Dates *dp = getDates() ;
if ( ! dp || dp == (Dates *)-1 ) return (bool *)dp;
// assume not
m_hasTOD2 = false;
m_hasTOD = false;
// scan the dates
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get date
Date *di = dp->m_datePtrs[i];
// skip if got nuked
if ( ! di ) continue;
// tod?
if ( !(di->m_hasType & DT_TOD) ) continue;
// got one
m_hasTOD2 = true;
m_hasTOD = true;
}
// it is now valid
m_hasTODValid = true;
return &m_hasTOD2;
}
|
|
|
|
/*
|
|
bool *XmlDoc::getHasSiteVenue ( ) {
|
|
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
|
|
// get the tag rec
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (bool *)gr;
|
|
// get tag from it
|
|
Tag *sv = gr->getTag("venueaddress") ;
|
|
// from that
|
|
m_hasSiteVenue2 = (bool)sv;
|
|
m_hasSiteVenue = (bool)sv;
|
|
m_hasSiteVenueValid = true;
|
|
return &m_hasSiteVenue2;
|
|
}
|
|
*/
|
|
|
|
|
|
// do not include addresses that are always in the header/footer of every page!
|
|
bool *XmlDoc::getHasAddress ( ) {
if ( m_hasAddressValid ) return &m_hasAddress2;
// get the addresses
Addresses *aa = getAddresses();
if ( ! aa || aa == (void *)-1 ) return (bool *)aa;
// from that
m_hasAddress2 = (aa->getNumNonDupAddresses() > 0);
m_hasAddress = (aa->getNumNonDupAddresses() > 0);
m_hasAddressValid = true;
return &m_hasAddress2;
}
|
|
|
|
Addresses *XmlDoc::getAddresses ( ) {
|
|
if ( m_addressesValid ) {
|
|
// return error if buf was breached
|
|
//if ( m_addresses.m_breached ) {
|
|
// g_errno = EBUFOVERFLOW;
|
|
// return NULL;
|
|
//}
|
|
// otherwise, return it
|
|
return &m_addresses;
|
|
}
|
|
// skip for now
|
|
m_addressesValid = true;
|
|
return &m_addresses;
|
|
// note it
|
|
setStatus ( "getting addresses");
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Addresses *)ww;
|
|
// we make sure that D_IS_IN_DATE is set by doing this
|
|
//Dates *dp = getDates();
|
|
//if ( ! dp || dp == (Dates *)-1) return (Addresses *)dp;
|
|
// we set the D_IS_IN_DATE flag for these bits
|
|
Bits *bits = getBits(); if ( ! bits ) return NULL;
|
|
Sections *sections = getExplicitSections();
|
|
if ( !sections||sections==(Sections *)-1) return (Addresses *)sections;
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (Addresses *)gr;
|
|
// the site hash
|
|
//int32_t *sh32 = getSiteHash32();
|
|
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Addresses *)sh32;
|
|
int32_t dh = getDomHash32();
|
|
// hash of all adjacent tag pairs
|
|
//uint32_t *tph = getTagPairHash32 ( ) ;
|
|
//if ( ! tph || tph == (void *)-1 ) return (Addresses *)tph;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Addresses *)d;
|
|
// get our ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1) return (Addresses *)ip;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
//char **stb = getSiteTitleBuf();
|
|
//if ( ! stb || stb == (void *)-1 ) return (Addresses *)stb;
|
|
// sanity check
|
|
//if ( ! m_siteTitleBufValid ) { char *xx=NULL;*xx=0; }
|
|
char **fbuf = getFilteredRootTitleBuf();
|
|
if ( ! fbuf || fbuf == (void *)-1 ) return (Addresses *)fbuf;
|
|
|
|
// this will set D_IS_IN_DATE in the Bits::m_bits[] array which
|
|
// Addresses::set() uses to avoid having addresses that are really
|
|
// just dates!
|
|
Dates *dd = getSimpleDates();
|
|
// return NULL on error
|
|
if ( ! dd ) return (Addresses *)NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if the serialized section is valid, use that
|
|
//char *sd = NULL;
|
|
//bool valid = false;
|
|
//if ( od && od->m_sectionsReplyValid ) valid = true;
|
|
//if ( valid ) sd = od->ptr_sectionsReply;
|
|
// assume valid, really only when it returns in case it blocked...
|
|
//m_addressesValid = true;
|
|
// this should not be outstanding!
|
|
if ( m_addressSetCalled ) { char *xx=NULL;*xx=0; }
|
|
// assume valid, really only when it returns in case it blocked...
|
|
m_addressesValid = true;
|
|
// set it
|
|
m_addressSetCalled = true;
|
|
// make a copy of the tag rec here in case it gets mangled later
|
|
// because the m_addresses class may reference its buffer
|
|
//m_savedTagRec1.copy ( gr );
|
|
// . this returns false if blocked
|
|
// . it uses the "venueaddress" from the tagrec, "gr", BUT if this
|
|
// page is the one that sets the venue address, it won't be able
|
|
// to use it as a default city/state thingy until next time it is
|
|
// spidered, since that info is in the tagrec
|
|
// . PROBLEM: if the venue address is on this page, we can't take
|
|
// advantage of it by usings its city/state as a default for the
|
|
// other addresses on this page
|
|
if ( ! m_addresses.set ( sections ,
|
|
ww ,
|
|
bits ,
|
|
&m_tagRec , // &m_savedTagRec1 , // gr
|
|
&m_firstUrl ,
|
|
*d ,
|
|
cr->m_collnum ,
|
|
dh , // *sh32
|
|
*ip ,
|
|
//(int32_t)*tph ,
|
|
m_niceness ,
|
|
m_pbuf ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
*ct ,
|
|
//ptr_addressReply ,
|
|
//size_addressReply ,
|
|
//m_addressReplyValid ,
|
|
m_filteredRootTitleBuf ,
|
|
m_filteredRootTitleBufSize ,
|
|
this ))
|
|
return (Addresses *)-1;
|
|
// sanity check
|
|
if ( m_addresses.m_msg2c &&
|
|
m_addresses.m_msg2c->m_requests !=
|
|
m_addresses.m_msg2c->m_replies) {
|
|
char *xx=NULL;*xx=0; }
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// return it if not breached
|
|
//if ( ! m_addresses.m_breached ) return &m_addresses;
|
|
// return that error otherwise
|
|
//g_errno = EBUFOVERFLOW;
|
|
//return NULL;
|
|
return &m_addresses;
|
|
}
|
|
|
|
/*
|
|
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
|
|
if ( m_siteNumInlinksUniqueIpValid )
|
|
return &m_siteNumInlinksUniqueIp;
|
|
// get our companion number
|
|
int32_t *ni = getSiteNumInlinks();
|
|
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
|
|
// sanity check
|
|
if ( ! m_siteNumInlinksUniqueIp ) { char *xx=NULL;*xx=0; }
|
|
// ok we must be valid
|
|
return &m_siteNumInlinksUniqueIp;
|
|
}
|
|
|
|
int32_t *XmlDoc::getSiteNumInlinksUniqueCBlock ( ) {
|
|
if ( m_siteNumInlinksUniqueCBlockValid )
|
|
return &m_siteNumInlinksUniqueCBlock;
|
|
// get our companion number
|
|
int32_t *ni = getSiteNumInlinks();
|
|
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
|
|
// sanity check
|
|
if ( ! m_siteNumInlinksUniqueCBlock ) { char *xx=NULL;*xx=0; }
|
|
// ok we must be valid
|
|
return &m_siteNumInlinksUniqueCBlock;
|
|
}
|
|
|
|
int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
|
|
if ( m_siteNumInlinksTotalValid )
|
|
return &m_siteNumInlinksTotal;
|
|
// get our companion number
|
|
int32_t *ni = getSiteNumInlinks();
|
|
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
|
|
// sanity check
|
|
if ( ! m_siteNumInlinksTotal ) { char *xx=NULL;*xx=0; }
|
|
// ok we must be valid
|
|
return &m_siteNumInlinksTotal;
|
|
}
|
|
*/
|
|
|
|
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
int32_t *XmlDoc::getFirstIp ( ) {
	// return it if we got it
	if ( m_firstIpValid ) return &m_firstIp;
	// note it
	setStatus ( "getting first ip");
	// get tag rec
	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
	// got it
	Tag *tag = gr->getTag ( "firstip" );
	// get from tag
	m_firstIp = 0;
	if ( tag ) m_firstIp = atoip(tag->getTagData());
	// if no tag, or is bogus in tag... set from ip
	if ( m_firstIp == 0 || m_firstIp == -1 ) {
		// need ip then!
		int32_t *ip = getIp();
		if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
		// set that
		m_firstIp = *ip;
	}
	m_firstIpValid = true;
	return &m_firstIp;
	// must be 4 bytes - no now its a string
	//if ( tag->getTagDataSize() != 4 ) { char *xx=NULL;*xx=0; }
}
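
// A minimal illustration of the precedence above (the tag contents are
// hypothetical, not from any real tagdb record):
//
//   tagdb "firstip" tag        live dns ip (getIp)    m_firstIp used
//   ----------------------     -------------------    --------------
//   "1.2.3.4"                  (not consulted)        1.2.3.4
//   missing / "0" / "-1"       5.6.7.8                5.6.7.8
//
// i.e. the stored firstip wins when present and sane; otherwise we fall
// back to whatever the resolver gives us right now.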
|
|
|
|
uint8_t *XmlDoc::getSiteNumInlinks8 () {
	if ( m_siteNumInlinks8Valid ) return &m_siteNumInlinks8;
	// get the full count
	int32_t *si = getSiteNumInlinks();
	if ( ! si || si == (int32_t *)-1 ) return (uint8_t *)si;
	// convert to 8
	m_siteNumInlinks8 = score32to8 ( *si );
	// validate
	m_siteNumInlinks8Valid = true;
	return &m_siteNumInlinks8;
}
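
// m_siteNumInlinks8 is the one-byte form of the good-inlink count, used
// where only a byte is available. score32to8() is presumably a lossy,
// roughly logarithmic squeeze of the 32-bit count into 0-255, so small
// differences between large counts are not preserved; treat the 8-bit
// value as a rank bucket, not an exact count.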
|
|
|
|
// this is the # of GOOD INLINKS to the site. so it is no more than
|
|
// 1 per c block, and it has to pass link spam detection. this is the
|
|
// highest-level count of inlinks to the site. use it a lot.
|
|
int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
|
|
|
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
|
|
|
|
// sanity check
|
|
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
	// hack for speed. computeSiteNumInlinks is true by default
	// but if the user turns it off then just use sitelinks.txt
	if ( cr && ! cr->m_computeSiteNumInlinks ) {
		int32_t hostHash32 = getHostHash32a();
		int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
		// try with www if not there
		if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
			int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
			min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
		}
		// fix core by setting these
		// m_siteNumInlinksUniqueIp = 0;
		// m_siteNumInlinksUniqueCBlock = 0;
		// m_siteNumInlinksTotal = 0;
		// m_siteNumInlinksUniqueIpValid = true;
		// m_siteNumInlinksUniqueCBlockValid = true;
		// m_siteNumInlinksTotalValid = true;
		// and this
		m_siteNumInlinksValid = true;
		m_siteNumInlinks = 0;
		// if still not in sitelinks.txt, just use 0
		if ( min < 0 ) {
			return &m_siteNumInlinks;
		}
		m_siteNumInlinks = min;
		return &m_siteNumInlinks;
	}
|
|
|
|
setStatus ( "getting site num inlinks");
|
|
|
|
// get it from the tag rec if we can
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (void *)-1 ) return (int32_t *)gr;
|
|
|
|
// the current top ip address
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
|
|
//int32_t top = *ip & 0x00ffffff;
|
|
|
|
// this happens when its NXDOMAIN reply from dns so assume
|
|
// no site inlinks
|
|
if ( *ip == 0 ) {
|
|
m_siteNumInlinks = 0;
|
|
// m_siteNumInlinksUniqueIp = 0;
|
|
// m_siteNumInlinksUniqueCBlock = 0;
|
|
// m_siteNumInlinksTotal = 0;
|
|
m_siteNumInlinksValid = true;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
return &m_siteNumInlinks;
|
|
}
|
|
|
|
if ( *ip == -1 ) {
|
|
log("xmldoc: ip is %" INT32 ", can not get site inlinks",*ip);
|
|
g_errno = EBADIP;
|
|
return NULL;
|
|
}
|
|
|
|
// wait for clock to sync before calling getTimeGlobal
|
|
int32_t wfts = waitForTimeSync();
|
|
// 0 means error, i guess g_errno should be set, -1 means blocked
|
|
if ( ! wfts ) return NULL;
|
|
if ( wfts == -1 ) return (int32_t *)-1;
|
|
|
|
setStatus ( "getting site num inlinks");
|
|
// check the tag first
|
|
Tag *tag = gr->getTag ("sitenuminlinks");
|
|
// is it valid?
|
|
bool valid = true;
|
|
// current time
|
|
int32_t now = getTimeGlobal();
|
|
// use the spidered time for the test collection for consistency
|
|
if ( !strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
now = getSpideredTime();//m_spideredTime;
|
|
}
|
|
	// get tag age (in seconds)
	int32_t age = 0; if ( tag ) age = (now - tag->m_timestamp) ;
	// add in some flutter to avoid having all hosts in the network
	// calling msg25 for this site at the same time.
	// a 10,000 second jitter, roughly 3 hours.
	int32_t flutter = rand() % 10000;
	// add it in
	age += flutter;
	// . if site changes ip then toss the contact info out the window,
	//   but give it a two week grace period
	// . well now we use the "ownershipchanged" tag to indicate that
	//if (tag && age>14*3600*24) valid=false;
	// . we also expire it periodically to keep the info up to date
	// . the higher quality the site, the longer the expiration date
	int32_t ns     = 0;
	int32_t maxAge = 0;
	int32_t sni    = -1;
	if ( tag ) {
		// how many site inlinks?
		ns = atol(tag->getTagData());
		// for less popular sites use smaller maxAges
		maxAge = 90;
		if      ( ns < 10  ) maxAge = 10;
		else if ( ns < 30  ) maxAge = 15;
		else if ( ns < 50  ) maxAge = 30;
		else if ( ns < 100 ) maxAge = 60;
		// if index size is tiny then maybe we are just starting to
		// build something massive, so reduce the cached max age
		int64_t nt = g_titledb.m_rdb.getCollNumTotalRecs(m_collnum);
		if ( nt < 100000000 ) //100M
			maxAge = 3;
		if ( nt < 10000000 ) //10M
			maxAge = 1;
		// for every 100 inlinks you already got, add a day!
		sni = atol(tag->getTagData());
		// double if repairing
		//if ( m_useSecondaryRdbs ) maxAge = (maxAge+1) * 2;
		// fix bug for rebuild. rebuild any tag from before now because
		// the MAX_LINKERS_IN_TERMLIST was too small in Linkdb.cpp
		// and i raised it from 1M to 3M. it was hurting mahalo.com.
		if ( m_useSecondaryRdbs && tag->m_timestamp < 1345819704 )
			valid = false;
		// force another rebuild of siterank because i fixed
		// the 'beds' query a little to use firstip, so recompute
		// siterank for those spammers.
		if ( m_useSecondaryRdbs && tag->m_timestamp < 1348257346 &&
		     // leave really big guys intact
		     sni < 300 )
			valid = false;
		// convert into seconds
		maxAge *= 3600*24;
		// so youtube which has 2997 links will add an extra 29 days
		maxAge += (sni / 100) * 86400;
		// hack for global index. never affect siteinlinks i imported
		if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0;
		// invalidate for that as well
		if ( age > maxAge ) valid = false;
	}
|
|
// our companion tags, sitePop and fresh inlinks
|
|
// Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
|
|
// Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
|
|
// Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
|
|
// if we are missing either of those, invalidate as well
|
|
// if ( ! tag2 ) valid = false;
|
|
// if ( ! tag3 ) valid = false;
|
|
// if ( ! tag4 ) valid = false;
|
|
// if we have already been through this
|
|
if ( m_updatingSiteLinkInfoTags ) valid = false;
|
|
// if rebuilding linkdb assume we have no links to sample from!
|
|
if ( tag && m_useSecondaryRdbs && g_repair.m_rebuildLinkdb )
|
|
valid = true;
|
|
|
|
// debug log
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: valid=%" INT32 " "
|
|
"age=%" INT32 " ns=%" INT32 " sni=%" INT32 " "
|
|
"maxage=%" INT32 " "
|
|
"tag=%" PTRFMT " "
|
|
// "tag2=%"PTRFMT" "
|
|
// "tag3=%"PTRFMT" "
|
|
"url=%s",
|
|
(int32_t)valid,age,ns,sni,
|
|
maxAge,
|
|
(PTRTYPE)tag,
|
|
// (PTRTYPE)tag2,
|
|
// (PTRTYPE)tag3,
|
|
m_firstUrl.m_url);
|
|
|
|
LinkInfo *sinfo = NULL;
|
|
char *mysite = NULL;
|
|
|
|
// if we are good return it
|
|
if ( tag && valid ) {
|
|
// set it
|
|
m_siteNumInlinks = atol(tag->getTagData());
|
|
m_siteNumInlinksValid = true;
|
|
|
|
// companion tags
|
|
// if ( tag2 ) {
|
|
// m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// }
|
|
// if ( tag3 ) {
|
|
// m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// }
|
|
// if ( tag4 ) {
|
|
// m_siteNumInlinksTotal =atol(tag4->getTagData());
|
|
// m_siteNumInlinksTotalValid = true;
|
|
// }
|
|
|
|
// . consult our sitelinks.txt file
|
|
// . returns -1 if not found
|
|
goto updateToMin;
|
|
}
|
|
|
|
// set status. we can time status changes with this routine!
|
|
//setStatus ( "getting site link info");
|
|
|
|
|
|
// if ip is bad we can't do this. we need to have a legit ip
|
|
// so we know if a linker is internal or not
|
|
/*
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
log("gb: bad ip so we can't get site num inlinks right");
|
|
m_siteNumInlinks = 0;
|
|
m_sitePop = 0;
|
|
m_siteNumInlinksFresh = 0;
|
|
m_siteNumInlinksValid = true;
|
|
m_siteNumInlinksFreshValid = true;
|
|
m_sitePopValid = true;
|
|
return &m_siteNumInlinks;
|
|
}
|
|
*/
|
|
|
|
// set this flag so when we are re-called, "valid" will be set to false
|
|
// so we can come down here and continue this. "flutter" might
|
|
// otherwise cause us to not make it down here.
|
|
m_updatingSiteLinkInfoTags = true;
|
|
|
|
// we need to re-get both if either is NULL
|
|
sinfo = getSiteLinkInfo();
|
|
// block or error?
|
|
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
|
|
|
|
//
|
|
// now update tagdb!
|
|
//
|
|
|
|
// ok, get the sites of the external outlinks and they must
|
|
// also be NEW outlinks, added to the page since the last time
|
|
// we spidered it...
|
|
//Links *links = getLinks ();
|
|
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
|
|
|
mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
|
|
|
|
setStatus ( "adding site info tags to tagdb 1");
|
|
|
|
// why are we adding tag again! should already be in tagdb!!!
|
|
if ( m_doingConsistencyCheck ) {char*xx=NULL;*xx=0;}
|
|
|
|
// do not re-call at this point
|
|
//m_siteNumInlinks = sinfo->m_numInlinksExtrapolated;
|
|
m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks;
|
|
//m_siteNumInlinksFresh = sinfo->m_numInlinksFresh;
|
|
//m_sitePop = sinfo->m_pagePop;
|
|
// m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
|
|
// m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
|
|
// m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
|
|
|
|
m_siteNumInlinksValid = true;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
|
|
|
|
updateToMin:
|
|
|
|
// . consult our sitelinks.txt file
|
|
// . returns -1 if not found
|
|
int32_t hostHash32 = getHostHash32a();
|
|
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
|
|
|
// try with www if not there
|
|
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
|
|
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
|
|
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
|
}
|
|
|
|
if ( min >= 0 ) {
|
|
if ( m_siteNumInlinks < min ||
|
|
! m_siteNumInlinksValid ) {
|
|
m_siteNumInlinks = min;
|
|
m_siteNumInlinksValid = true;
|
|
}
|
|
// if ( ! m_siteNumInlinksUniqueIpValid ||
|
|
// m_siteNumInlinksUniqueIp < min ) {
|
|
// m_siteNumInlinksUniqueIp = min;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// }
|
|
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
|
|
// m_siteNumInlinksUniqueCBlock < min ) {
|
|
// m_siteNumInlinksUniqueCBlock = min;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// }
|
|
// if ( ! m_siteNumInlinksTotalValid ||
|
|
// m_siteNumInlinksTotal < min ) {
|
|
// m_siteNumInlinksTotal = min;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
// }
|
|
}
|
|
|
|
|
|
|
|
// deal with it
|
|
return &m_siteNumInlinks;
|
|
}
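
// Cache-freshness tiers used above when deciding whether the tagdb
// "sitenuminlinks" tag is still trustworthy (before the small-index
// overrides and the per-inlink bonus):
//
//   good inlinks (ns)    maxAge
//   -----------------    -------
//        < 10            10 days
//        < 30            15 days
//        < 50            30 days
//        < 100           60 days
//        >= 100          90 days
//
// maxAge then drops to 3 days if titledb has under 100M recs (1 day under
// 10M), and grows by one day per 100 inlinks, e.g. ~2997 inlinks adds
// about 29 days. The tag age itself gets up to 10,000 seconds of random
// flutter so hosts don't all recompute the same site at once.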
|
|
|
|
// . do a 'site:xyz.com | gbnuminlinks' query to get the top docs
|
|
// from a site and get the gigabits from that query!
|
|
// . then store the resulting gigabits into tagdb for efficiency
|
|
// . recompute once per month or so ... or if ip changes i guess
|
|
// . we need the root title as a source for city and adm1's for
|
|
// Addresses::set() function
|
|
//char **XmlDoc::getSiteGigabits ( ) {
|
|
//}
|
|
|
|
// TODO: can we have a NULL LinkInfo without having had an error?
|
|
LinkInfo *XmlDoc::getSiteLinkInfo() {
|
|
// lookup problem?
|
|
if ( g_errno ) {
|
|
log("build: error getting link info: %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
setStatus ( "getting site link info" );
|
|
|
|
if ( m_siteLinkInfoValid )
|
|
//return msg25.m_linkInfo;
|
|
return (LinkInfo *)m_mySiteLinkInfoBuf.getBufStart();
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
|
|
int32_t *fip = getFirstIp();
|
|
if ( ! fip || fip == (int32_t *)-1) return (LinkInfo *)fip;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// can we be cancelled?
|
|
bool canBeCancelled = true;
|
|
// not if pageparser though
|
|
if ( m_pbuf ) canBeCancelled = false;
|
|
// not if injecting
|
|
if ( ! m_sreqValid ) canBeCancelled = false;
|
|
// assume valid when it returns
|
|
m_siteLinkInfoValid = true;
|
|
// use this buffer so XmlDoc::print() can display it where it wants
|
|
SafeBuf *sb = NULL;
|
|
if ( m_pbuf ) sb = &m_siteLinkBuf;
|
|
// only do this for showing them!!!
|
|
if ( m_useSiteLinkBuf ) sb = &m_siteLinkBuf;
|
|
//bool onlyGetGoodInlinks = true;
|
|
//if ( m_useSiteLinkBuf ) onlyGetGoodInlinks = false;
|
|
// get this
|
|
int32_t lastUpdateTime = getTimeGlobal();
|
|
// get from spider request if there
|
|
//bool injected = false;
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
|
// but be consistent if doing the "qatest123" collection
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
|
|
lastUpdateTime = getSpideredTime();//m_spideredTime;
|
|
}
|
|
|
|
bool onlyNeedGoodInlinks = true;
|
|
// so if steve wants to display all links then set this
|
|
// to false so we get titles of bad inlinks
|
|
// seems like pageparser.cpp just sets m_pbuf and not
|
|
// m_usePageLinkBuf any more
|
|
if ( sb ) onlyNeedGoodInlinks = false;
|
|
|
|
	// shortcut
|
|
//Msg25 *m = &m_msg25;
|
|
if ( ! getLinkInfo ( &m_tmpBuf11,
|
|
&m_mcast11,
|
|
mysite , // site
|
|
mysite , // url
|
|
true , // isSiteLinkInfo?
|
|
*fip ,
|
|
0 , // docId
|
|
cr->m_collnum , //linkInfoColl
|
|
NULL , // qbuf
|
|
0 , // qbufSize
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_contentInjected ,// isInjecting?
|
|
sb ,
|
|
m_printInXml ,
|
|
0 , // sitenuminlinks -- dunno!
|
|
//0 , // sitePop
|
|
NULL , // oldLinkInfo1 ,
|
|
m_niceness ,
|
|
cr->m_doLinkSpamCheck ,
|
|
cr->m_oneVotePerIpDom ,
|
|
canBeCancelled ,
|
|
lastUpdateTime ,
|
|
onlyNeedGoodInlinks ,
|
|
false,
|
|
0,
|
|
0,
|
|
// it will store the linkinfo into this safebuf
|
|
&m_mySiteLinkInfoBuf) )
|
|
// return -1 if it blocked
|
|
return (LinkInfo *)-1;
|
|
// sanity check
|
|
//if ( ! m_msg25.m_linkInfo ) {
|
|
// log("build: error making link info: %s",mstrerror(g_errno));
|
|
// return NULL;
|
|
//}
|
|
// we got it
|
|
//return m_msg25.m_linkInfo;
|
|
// getLinkInfo() now calls multicast so it returns true on errors only
|
|
log("build: error making link info: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
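
// Note on the tail of getSiteLinkInfo(): getLinkInfo() is expected to
// block (it multicasts the request), so the normal path returns -1 above
// and the serialized LinkInfo later lands in m_mySiteLinkInfoBuf. If the
// call returns without blocking that is treated as an error here, which
// is why the fall-through unconditionally logs and returns NULL.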
|
|
|
|
static void gotIpWrapper ( void *state , int32_t ip ) ;

static void delayWrapper ( int fd , void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
int32_t *XmlDoc::getIp ( ) {
|
|
// return if we got it
|
|
if ( m_ipValid ) return &m_ip;
|
|
// update status msg
|
|
setStatus ( "getting ip" );
|
|
|
|
m_ipStartTime = 0;
|
|
// assume the same in case we get it right away
|
|
m_ipEndTime = 0;
|
|
|
|
// if set from docid and recycling
|
|
if ( m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (int32_t *)pod;
|
|
		// shortcut
|
|
XmlDoc *od = *pod;
|
|
// set it
|
|
if ( od ) {
|
|
m_ip = od->m_ip;
|
|
m_ipValid = true;
|
|
return &m_ip;
|
|
}
|
|
}
|
|
|
|
|
|
// fakeit for now
|
|
//log("FAKING IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
|
|
//m_ip = atoip("74.201.80.152",13);
|
|
//m_ipValid = true;
|
|
//return &m_ip;
|
|
|
|
// get the best url
|
|
Url *u = getCurrentUrl();
|
|
if ( ! u || u == (void *)-1 ) return (int32_t *)u;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
bool useTestCache = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
|
// unless its the pagesubmit.cpp event submission tool
|
|
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
|
|
|
|
|
// when building the "qatest123" collection try to get the ip from
|
|
// "./test/ips.txt" so our injections are consistent every time
|
|
// Test.cpp runs its injection loop into the "qatest123" collection
|
|
if ( useTestCache ) { // && m_useIpsTxtFile ) {
|
|
// stolen from msgc.cpp:
|
|
// if url is already in a.b.c.d format return that
|
|
int32_t ip2 = 0;
|
|
char *host = u->getHost();
|
|
if ( host ) ip2 = atoip ( host,u->getHostLen() );
|
|
if ( ip2 != 0 ) {
|
|
m_ip = ip2;
|
|
m_ipValid = true;
|
|
return &m_ip;
|
|
}
|
|
// assume not found in our file
|
|
bool found = false;
|
|
// get test dir
|
|
char *testDir = getTestDir();
|
|
// get it from "./test/ips.txt"
|
|
getTestIp ( u->getUrl() , &m_ip , &found , m_niceness,testDir);
|
|
// if we found a match...
|
|
if ( found ) { // m_ip != 0 ) {
|
|
// we are valid now
|
|
return gotIp ( false );
|
|
//m_ipValid = true;
|
|
// return it
|
|
//return &m_ip;
|
|
}
|
|
}
|
|
|
|
// we need the ip before we download the page, but before we get
|
|
// the IP and download the page, wait for this many milliseconds.
|
|
// this basically slows the spider down.
|
|
int32_t delay = cr->m_spiderDelayInMilliseconds;
|
|
// ignore for testing
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
|
|
// injected?
|
|
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
|
|
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
|
|
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
|
|
// . don't do the delay when downloading extra doc, robots.txt etc.
|
|
// . this also reports a status msg of "getting new doc" when it
|
|
// really means "delaying spider"
|
|
if ( m_isChildDoc ) delay = 0;
|
|
|
|
if ( delay > 0 && ! m_didDelay ) {
|
|
// we did it
|
|
m_didDelay = true;
|
|
m_statusMsg = "delaying spider";
|
|
// random fuzz so we don't get everyone being unleashed at once
|
|
int32_t radius = (int32_t)(.20 * (double)delay);
|
|
int32_t fuzz = (rand() % (radius * 2)) - radius;
|
|
delay += fuzz;
|
|
// make a callback wrapper.
|
|
// this returns false and sets g_errno on error
|
|
if ( g_loop.registerSleepCallback ( delay ,
|
|
m_masterState ,
|
|
delayWrapper,//m_masterLoop
|
|
m_niceness ))
|
|
// wait for it, return -1 since we blocked
|
|
return (int32_t *)-1;
|
|
// if was not able to register, ignore delay
|
|
}
|
|
|
|
if ( m_didDelay && ! m_didDelayUnregister ) {
|
|
g_loop.unregisterSleepCallback(m_masterState,delayWrapper);
|
|
m_didDelayUnregister = true;
|
|
}
|
|
|
|
// update status msg
|
|
setStatus ( "getting ip" );
|
|
|
|
m_ipStartTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
// assume valid! if reply handler gets g_errno set then m_masterLoop
|
|
// should see that and call the final callback
|
|
//m_ipValid = true;
|
|
// get it
|
|
if ( ! m_msgc.getIp ( u->getHost () ,
|
|
u->getHostLen() ,
|
|
&m_ip ,
|
|
this ,
|
|
gotIpWrapper ))
|
|
// we blocked
|
|
return (int32_t *)-1;
|
|
// wrap it up
|
|
return gotIp ( true );
|
|
}
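
// Rough sketch of the spider-delay jitter applied above (the numbers are
// just an example, not from any config): with
// cr->m_spiderDelayInMilliseconds = 1000, radius = 0.20 * 1000 = 200, so
// fuzz is drawn from [-200,200) and the sleep callback fires after
// roughly 800-1200 ms. The jitter only spreads out simultaneous wakeups;
// injections, page parser runs, scrapes, child docs and the qatest123
// collection skip the delay entirely.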
|
|
|
|
void gotIpWrapper ( void *state , int32_t ip ) {
|
|
// point to us
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
// wrap it up
|
|
THIS->gotIp ( true );
|
|
// . call the master callback
|
|
// . m_masterState usually equals THIS, unless THIS is the
|
|
// Xml::m_contactDoc or something...
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
int32_t *XmlDoc::gotIp ( bool save ) {
	// return NULL on error
	if ( g_errno ) return NULL;
	// this is bad too
	//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
	//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());

	setStatus ("got ip");

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// note it for crawlbot
	if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
		log("db: got ip %" INT32 " for %s",
		    m_ip,getCurrentUrl()->getUrl());

	bool useTestCache = false;
	if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
	// unless its the pagesubmit.cpp event submission tool
	//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;

	// when building the "qatest123" collection try to get the ip from
	// "./test/ips.txt" so our injections are consistent every time
	// Test.cpp runs its injection loop into the "qatest123" collection
	if ( save && useTestCache ) {
		// ip of 0 means NXDOMAIN i think (-1 means error)
		//if ( m_ip == 0 ) {
		//	log("waiting for debug break");
		//	sleep(3600);
		//}
		// get the best url
		Url *u = getCurrentUrl();
		if ( !u || u == (void *)-1 ) { char *xx=NULL;*xx=0; }
		// . add it to "./test/ips.txt"
		// . this function is in Msge1.cpp
		addTestIp ( u->getHost() , u->getHostLen() , m_ip );
		// get test dir
		char *testDir = getTestDir();
		// save it
		saveTestBuf ( testDir );
	}

	// we got it
	m_ipValid = true;
	// give it to them
	return &m_ip;
}
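
// For the "qatest123" collection, the ip we just resolved is appended to
// "./test/ips.txt" (addTestIp/saveTestBuf; addTestIp lives in Msge1.cpp)
// so that later test runs resolve the same host to the same address
// without hitting dns: getIp() consults that file via getTestIp() before
// ever issuing a real lookup.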
|
|
|
|
#include "Mime.h"
|
|
|
|
// taken from Robotdb.cpp
|
|
bool isAllowed2 ( Url *url ,
|
|
char *userAgent ,
|
|
char *file ,
|
|
int32_t fileLen ,
|
|
bool *userAgentFound ,
|
|
bool substringMatch ,
|
|
int32_t *crawlDelay ,
|
|
char **cacheStart ,
|
|
int32_t *cacheLen ,
|
|
bool *hadAllowOrDisallow ) {
|
|
// assume nothing to cache yet
|
|
*cacheLen = 0;
|
|
*cacheStart = file;
|
|
// assume user agent is not in the file
|
|
*userAgentFound = false;
|
|
*hadAllowOrDisallow = false;
|
|
// assume no crawl delay (-1)
|
|
// *crawlDelay = -1;
|
|
// if fileLen is 0 it is allowed
|
|
if ( fileLen <= 0 ) return true;
|
|
// get path from url, include cgi stuff
|
|
char *path = url->getPath();
|
|
int32_t pathLen = url->getPathLenWithCgi();
|
|
// set the Mime class to this Mime file
|
|
Mime mime;
|
|
mime.set ( file , fileLen );
|
|
// get a line of Mime
|
|
char *f , *v;
|
|
int32_t flen, vlen;
|
|
// user agent length
|
|
int32_t uaLen = gbstrlen (userAgent);
|
|
// ptr into "file"
|
|
char *p = file;
|
|
char flag;
|
|
bool allowed = true;
|
|
loop:
|
|
// if p is NULL now we're done
|
|
if ( ! p ) return allowed;
|
|
// get the next Mime line
|
|
p = mime.getLine ( p , &f , &flen , &v , &vlen );
|
|
// if this field is NOT "user-agent" skip it
|
|
if ( flen != 10 ) goto loop;
|
|
if ( strncasecmp ( f , "user-agent" , 10 ) != 0 ) goto loop;
|
|
gotAgent:
|
|
//some webmasters put comments at the end of their lines,
|
|
//because they think this is a shell script or something.
|
|
char* vv = v;
|
|
while(vv - v < vlen && *vv != '#') vv++;
|
|
vlen = vv - v;
|
|
// decrement vlen to hack off spaces after the user-agent so that vlen
|
|
// is really the length of the user agent
|
|
while ( vlen > 0 && is_wspace_a(v[vlen-1]) ) vlen--;
|
|
// now match the user agent
|
|
if ( ! substringMatch && vlen != uaLen ) goto loop;
|
|
// otherwise take the min of the lengths
|
|
if ( uaLen < vlen ) vlen = uaLen;
|
|
// is it the right user-agent?
|
|
if ( strncasecmp ( v , userAgent , vlen ) != 0 ) goto loop;
|
|
// we got it, if first instance start our cache here
|
|
if ( !*userAgentFound ) *cacheStart = f;
|
|
*userAgentFound = true;
|
|
flag = 0;
|
|
urlLoop:
|
|
// if p is NULL now there is no more lines
|
|
if ( ! p ) {
|
|
// set our cache stop to the end of the file
|
|
*cacheLen = (file + fileLen) - *cacheStart;
|
|
return allowed;
|
|
}
|
|
// now loop over lines until we hit another user-agent line
|
|
p = mime.getLine ( p , &f , &flen , &v , &vlen );
|
|
// if it's another user-agent line ... ignore it unless we already
|
|
	// have seen a disallow line, in which case we got another set of rules
|
|
if ( flag && flen==10 && strncasecmp(f,"user-agent",10)==0) {
|
|
// set our cache stop here
|
|
*cacheLen = f - *cacheStart;
|
|
goto gotAgent;
|
|
}
|
|
// if a crawl delay, get the delay
|
|
if ( flen == 11 && strncasecmp ( f , "crawl-delay", 11 ) == 0 ) {
|
|
// set flag
|
|
flag = 1;
|
|
// skip if invalid. it could be ".5" seconds
|
|
if ( ! is_digit ( *v ) && *v != '.' ) goto urlLoop;
|
|
// get this. multiply crawl delay by x1000 to be in
|
|
// milliseconds/ms
|
|
int64_t vv = (int64_t)(atof(v) * 1000LL);
|
|
// truncate to 0x7fffffff
|
|
if ( vv > 0x7fffffff ) *crawlDelay = 0x7fffffff;
|
|
else if ( vv < 0 ) *crawlDelay = -1;
|
|
else *crawlDelay = (int32_t)vv;
|
|
// get the delay
|
|
//*crawlDelay = atol(v) * 1000;
|
|
goto urlLoop;
|
|
}
|
|
// if already disallowed, just goto the next line
|
|
if ( !allowed ) goto urlLoop;
|
|
// if we have an allow line or sitemap: line, then set flag to 1
|
|
// so we can go to another user-agent line.
|
|
// fixes romwebermarketplace.com/robots.txt
|
|
// (doc.156447320458030317.txt)
|
|
if ( flen==5 && strncasecmp(f,"allow" ,5)==0 ) {
|
|
*hadAllowOrDisallow = true;
|
|
flag = 1;
|
|
}
|
|
if ( flen==7 && strncasecmp(f,"sitemap",7)==0 ) {
|
|
flag = 1;
|
|
}
|
|
// if not disallow go to loop at top
|
|
if ( flen != 8 ) goto urlLoop;
|
|
if ( strncasecmp ( f , "disallow" , 8 ) != 0 ) {
|
|
goto urlLoop;
|
|
}
|
|
// we had a disallow
|
|
*hadAllowOrDisallow = true;
|
|
// set flag
|
|
flag = 1;
|
|
// . take off trailing chars from the banned path name
|
|
// . this is now done below
|
|
//while ( vlen > 0 && is_space(v[vlen-1]) ) vlen--;
|
|
// . skip leading spaces
|
|
// . this should be done in mime class
|
|
// while ( vlen > 0 && is_space(v[0]) ) { v++; vlen--; }
|
|
// now stop at first space after url or end of line
|
|
char *s = v;
|
|
char *send = v + vlen;
|
|
// skip all non-space chars
|
|
while ( s < send && ! is_wspace_a(*s) ) s++;
|
|
// stop there
|
|
vlen = s - v;
|
|
// check for match
|
|
char *tmpPath = path;
|
|
int32_t tmpPathLen = pathLen;
|
|
// assume path begins with /
|
|
if ( vlen > 0 && v[0] != '/'){tmpPath++;tmpPathLen--;}
|
|
if ( vlen > tmpPathLen ) goto urlLoop;
|
|
if ( strncasecmp(tmpPath,v,vlen) != 0 ) goto urlLoop;
|
|
// an exact match
|
|
if ( vlen == tmpPathLen ) {
|
|
//return false;
|
|
allowed = false;
|
|
goto urlLoop;
|
|
}
|
|
// must be something
|
|
if ( vlen <= 0 ) goto urlLoop;
|
|
// "v" may or may not end in a /, it really should end in a / though
|
|
if ( v[vlen-1] == '/' && tmpPath[vlen-1] == '/' ) {
|
|
//return false;
|
|
allowed = false;
|
|
goto urlLoop;
|
|
}
|
|
if ( v[vlen-1] != '/' && tmpPath[vlen ] == '/' ) {
|
|
//return false;
|
|
allowed = false;
|
|
goto urlLoop;
|
|
}
|
|
// let's be stronger. just do the substring match. if the webmaster
|
|
// does not want us splitting path or file names then they should end
|
|
// all of their robots.txt entries in a '/'. this also fixes the
|
|
// problem of the "Disallow: index.htm?" line.
|
|
//return false;
|
|
allowed = false;
|
|
// get another url path
|
|
goto urlLoop;
|
|
}
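
// Illustrative walk-through of isAllowed2() (the robots.txt below is made
// up, not taken from any real site). For a user agent like "Gigabot/1.0"
// with substringMatch=true:
//
//   User-Agent: Gigabot        <- matches by left-anchored substring
//   Crawl-Delay: 1.5           <- *crawlDelay becomes 1500 (milliseconds)
//   Disallow: /private/        <- any path starting with /private/ denied
//   Allow: /private/ok.html    <- only recorded; Allow is not honored as
//                                  a whitelist here
//
//   User-Agent: *              <- second group; only consulted by the
//   Disallow: /                   caller if our agent was not found or
//                                 had no allow/disallow lines
//
// Matching is prefix-based on the path, so "Disallow: /priv" would also
// block "/private/" and "/privacy.html".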
|
|
|
|
// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
int32_t *XmlDoc::getFinalCrawlDelay() {

	if ( m_finalCrawlDelayValid )
		return &m_finalCrawlDelay;

	bool *isAllowed = getIsAllowed();
	if ( ! isAllowed || isAllowed == (void *)-1 ) return (int32_t *)isAllowed;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	m_finalCrawlDelayValid = true;

	// getIsAllowed already sets m_crawlDelayValid to true
	if ( ! cr->m_isCustomCrawl ) {
		m_finalCrawlDelay = m_crawlDelay;
		// default to 250ms i guess if none specified in robots
		// just to be somewhat nice by default
		if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
		return &m_finalCrawlDelay;
	}

	// get manually specified crawl delay in seconds. convert to ms.
	int32_t manual = (int32_t)(cr->m_collectiveCrawlDelay * 1000.0);
	// negative means unknown or not specified, so normalize to -1
	if ( manual < 0 ) manual = -1;

	// if both are unknown...
	if ( m_crawlDelay == -1 && manual == -1 ) {
		m_finalCrawlDelay = -1;
		return &m_finalCrawlDelay;
	}

	// if not in robots.txt use manual
	if ( m_crawlDelay == -1 ) {
		m_finalCrawlDelay = manual;
		return &m_finalCrawlDelay;
	}

	// if manually provided crawldelay is -1, use robots.txt then
	if ( manual == -1 ) {
		m_finalCrawlDelay = m_crawlDelay;
		return &m_finalCrawlDelay;
	}

	// let robots.txt dictate if both are >= 0
	if ( m_useRobotsTxt ) {
		m_finalCrawlDelay = m_crawlDelay;
		return &m_finalCrawlDelay;
	}

	// if not using robots.txt, pick the smallest
	if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
	else                         m_finalCrawlDelay = manual;

	return &m_finalCrawlDelay;
}
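
// Summary of the custom-crawl decision above (delays in ms; -1 = unknown;
// the example numbers are arbitrary, only the precedence matters):
//
//   robots.txt delay   manual delay   m_useRobotsTxt   final delay
//   ----------------   ------------   --------------   -----------
//         -1               -1              n/a             -1
//         -1               800             n/a             800
//        1500               -1             n/a            1500
//        1500               800            true           1500
//        1500               800            false           800 (min)
//
// Non-custom crawls never reach this table: they take the robots.txt
// delay directly, defaulting to 250ms when none is specified.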
|
|
|
|
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
	if ( m_isRobotsTxtUrlValid )
		return m_isRobotsTxtUrl;
	Url *fu = getFirstUrl();
	m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
	m_isRobotsTxtUrlValid = true;
	return m_isRobotsTxtUrl;
}
|
|
|
|
// . get the Robots.txt and see if we are allowed
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
// . getting a robots.txt is not trivial since we need to follow redirects,
|
|
// so we make use of the powerful XmlDoc class for this
|
|
bool *XmlDoc::getIsAllowed ( ) {
|
|
// return if we got it
|
|
if ( m_isAllowedValid ) return &m_isAllowed;
|
|
// could be turned off for everyone
|
|
if ( ! m_useRobotsTxt ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
m_crawlDelayValid = true;
|
|
m_crawlDelay = -1;
|
|
//log("xmldoc: skipping robots.txt lookup for %s",
|
|
// m_firstUrl.m_url);
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// . if setting from a title rec, assume allowed
|
|
// . this avoids doConsistencyCheck() from blocking and coring
|
|
if ( m_setFromTitleRec ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
if ( m_recycleContent ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// HACK: so we can spider archive.org warcs and arcs internally
|
|
if ( m_firstUrlValid &&
|
|
m_firstUrl.getDomainLen() == 11 &&
|
|
strncmp ( m_firstUrl.getDomain() , "archive.org" , 11 ) == 0 ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
|
|
// double get?
|
|
if ( m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// bulk jobs don't need this
|
|
CollectionRec *cr = getCollRec();
|
|
if ( cr && cr->m_isCustomCrawl == 2 ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// . if WE are robots.txt that is always allowed!!!
|
|
// . check the *first* url since these often redirect to weird things
|
|
if ( isFirstUrlRobotsTxt() ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
m_crawlDelayValid = true;
|
|
// make it super fast...
|
|
m_crawlDelay = 0;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// or if using the "qatest123" collection, assume yes!
|
|
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
|
|
// m_isAllowed = true;
|
|
// m_isAllowedValid = true;
|
|
// return &m_isAllowed;
|
|
//}
|
|
|
|
// update status msg
|
|
setStatus ( "getting robots.txt" );
|
|
// sanity
|
|
int32_t *ip = getIp ();
|
|
// error? or blocked?
|
|
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
|
|
Url *fu = getFirstUrl();
|
|
// if ip does not exist on the dns, do not try to download robots.txt
|
|
// it is pointless... this can happen in the dir coll and we basically
|
|
// have "m_siteInCatdb" set to true
|
|
if ( *ip == 1 || *ip == 0 || *ip == -1 ) {
|
|
// note this
|
|
log("build: robots.txt ip is %s for url=%s. allowing for now.",
|
|
fu->getUrl(),iptoa(*ip));
|
|
// just core for now
|
|
//char *xx=NULL;*xx=0;
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
// since ENOMIME is no longer causing the indexCode
|
|
// to be set, we are getting a core because crawlDelay
|
|
// is invalid in getNewSpiderReply()
|
|
m_crawlDelayValid = true;
|
|
m_crawlDelay = -1;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// we need this so getExtraDoc does not core
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (bool *)pfip;
|
|
|
|
// get the current url after redirects
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (bool *)cu;
|
|
|
|
// set m_extraUrl to the robots.txt url
|
|
char buf[MAX_URL_LEN+2];
|
|
char *p = buf;
|
|
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
|
|
else p += sprintf ( p , "http://" );
|
|
// sanity
|
|
if ( ! cu->getHost() ) { char *xx=NULL;*xx=0; }
|
|
gbmemcpy ( p , cu->getHost() , cu->getHostLen() );
|
|
p += cu->getHostLen();
|
|
int32_t port = cu->getPort();
|
|
// 80 is the default port
|
|
int32_t defPort = 80;
|
|
// is it https://?
|
|
if ( cu->m_url[4] == 's' ) defPort = 443;
|
|
if ( port != defPort ) p += sprintf ( p , ":%" INT32 "",port );
|
|
p += sprintf ( p , "/robots.txt" );
|
|
m_extraUrl.set ( buf );
|
|
|
|
// . maxCacheAge = 3600 seconds = 1 hour for robots.txt
|
|
// . if this is non-zero then msg13 should store it as well!
|
|
// . for robots.txt it should only cache the portion of the doc
|
|
// relevant to our user agent!
|
|
// . getHttpReply() should use msg13 to get cached reply!
|
|
XmlDoc **ped = getExtraDoc ( m_extraUrl.getUrl() , 3600 );
|
|
if ( ! ped || ped == (void *)-1 ) return (bool *)ped;
|
|
// assign it
|
|
XmlDoc *ed = *ped;
|
|
// return NULL on error with g_errno set
|
|
if ( ! ed ) {
|
|
// sanity check, g_errno must be set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// log it -- should be rare?
|
|
log("doc: had error getting robots.txt: %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
// inherit this
|
|
//if ( ! m_useIpsTxtFile ) ed->m_useIpsTxtFile = false;
|
|
// . steal m_firstIp from us to avoid tag rec lookup
|
|
// . why was this commented out?
|
|
// . maybe because if we redirect, this is not the same!!!
|
|
//ed->m_firstIp = m_firstIp;
|
|
//ed->m_firstIpValid = m_firstIpValid;//true;
|
|
// also, steal our ip! neither is this!
|
|
//ed->m_ip = m_ip;
|
|
//ed->m_ipValid = m_ipValid;
|
|
// . now try the content
|
|
// . should call getHttpReply
|
|
char **pcontent = ed->getContent();
|
|
if ( ! pcontent || pcontent == (void *)-1 ) return (bool *)pcontent;
|
|
// get the mime
|
|
HttpMime *mime = ed->getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (bool *)mime;
|
|
// get this
|
|
int32_t contentLen = ed->m_contentLen;
|
|
// save this
|
|
m_robotsTxtLen = contentLen;
|
|
m_robotsTxtLenValid = true;
|
|
// get content
|
|
char *content = *pcontent;
|
|
// sanity check
|
|
if ( content && contentLen>0 && content[contentLen] != '\0'){
|
|
char*xx=NULL;*xx=0;}
|
|
|
|
// reset this. -1 means unknown or none found.
|
|
m_crawlDelay = -1;
|
|
m_crawlDelayValid = true;
|
|
|
|
// assume valid and ok to spider
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
|
|
// put in a crawldelay test for diffbot
|
|
/*
|
|
SafeBuf tmp;
|
|
if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) {
|
|
tmp.safePrintf("User-Agent: *\n"
|
|
"Crawl-Delay: 10.1\n"
|
|
);
|
|
content = tmp.getBufStart();
|
|
contentLen = tmp.getLength();
|
|
}
|
|
|
|
// if not success, assume no robots.txt
|
|
else*/
|
|
|
|
if ( mime->getHttpStatus() != 200 ) {
|
|
// nuke it to save mem
|
|
nukeDoc ( ed );
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// get the url we lookup
|
|
//Url *cu = getCurrentUrl();
|
|
|
|
// this is set to true if our userAgent was found explicitly
|
|
bool uaFound;
|
|
bool allowed;
|
|
char *cacheStart;
|
|
int32_t cacheLen;
|
|
bool hadAllowOrDisallow;
|
|
int32_t savedCrawlDelay = -1;
|
|
// now use left-anchored substring match so we can match Gigabot/1.0
|
|
allowed = isAllowed2 ( cu ,
|
|
g_conf.m_spiderUserAgent ,
|
|
content ,
|
|
contentLen ,
|
|
&uaFound ,
|
|
true , // substrmatch?
|
|
&m_crawlDelay ,
|
|
&cacheStart ,
|
|
&cacheLen ,
|
|
&hadAllowOrDisallow );
|
|
// save it
|
|
savedCrawlDelay = m_crawlDelay;
|
|
// . if didn't find our user agent so check for * as a user-agent
|
|
// . www.wikihow.com/robots.txt just has "Gigabot: crawl-delay:10\n"
|
|
// and then a "User-Agent: *" after that with the disallows, so
|
|
// i added the hadAllowDisallow parm
|
|
if ( ! uaFound || ! hadAllowOrDisallow )
|
|
allowed = isAllowed2 ( cu ,
|
|
"*" ,
|
|
content ,
|
|
contentLen ,
|
|
&uaFound ,
|
|
false , // substrmatch?
|
|
&m_crawlDelay ,
|
|
&cacheStart ,
|
|
&cacheLen ,
|
|
&hadAllowOrDisallow );
|
|
// bring back?
|
|
if ( savedCrawlDelay != -1 ) m_crawlDelay = savedCrawlDelay;
|
|
// nuke it to save mem
|
|
nukeDoc ( ed );
|
|
// we are legit
|
|
m_isAllowed = allowed;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
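
// Recap of the lookup above: we first scan robots.txt for our own agent
// using a left-anchored substring match (so a "User-Agent: Gigabot" group
// applies to "Gigabot/1.0"). Only if that agent was not found, or its
// group had no allow/disallow lines at all, do we fall back to the
// "User-Agent: *" group, this time requiring an exact agent match. A
// crawl-delay picked up in the first pass survives the fallback because
// it is stashed in savedCrawlDelay and restored afterwards.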
// . lookup the title rec with the "www." if we do not have that in the url
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
char *XmlDoc::getIsWWWDup ( ) {
|
|
// this is not a real error really
|
|
//if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// return if we got it
|
|
if ( m_isWWWDupValid ) return &m_isWWWDup;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// could be turned off for everyone
|
|
if ( ! cr->m_dupCheckWWW ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
// get the FIRST URL... (no longer current url after redirects)
|
|
Url *u = getFirstUrl(); // CurrentUrl();
|
|
// if we are NOT a DOMAIN-ONLY url, then no need to do this dup check
|
|
if ( u->getDomainLen() != u->getHostLen() ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
|
|
// must NOT have a www
|
|
if ( ! u->isHostWWW() ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
|
|
// watch out for idiot urls like www.gov.uk and www.gov.za
|
|
// treat them as though the TLD is uk/za and the domain
|
|
// is gov.uk and gov.za
|
|
if ( u->getDomain() &&
|
|
strncmp ( u->getDomain() , "www." , 4 ) == 0 ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
|
|
// make it without the www
|
|
char withoutWWW[MAX_URL_LEN+1];
|
|
char *proto = "http";
|
|
if ( u->isHttps() ) proto = "https";
|
|
sprintf(withoutWWW,"%s://%s",proto,u->getDomain());
|
|
|
|
// assume yes
|
|
m_isWWWDup = true;
|
|
|
|
if ( ! m_calledMsg22f )
|
|
setStatus ( "getting possible www dup title rec" );
|
|
|
|
// . does this title rec exist in titledb?
|
|
// . "justCheckTfndb" is set to true here!
|
|
if ( ! m_calledMsg22f &&
|
|
! m_msg22f.getTitleRec ( &m_msg22Request ,
|
|
withoutWWW ,
|
|
0 , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
NULL , // tr ptr
|
|
NULL , // tr size ptr
|
|
true , // just chk tfndb?
|
|
false, // getavaildocidonly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false )){//load balancing?
|
|
// validate
|
|
m_calledMsg22f = true;
|
|
// return -1 if we blocked
|
|
return (char *)-1;
|
|
}
|
|
// got it
|
|
m_calledMsg22f = true;
|
|
// valid now
|
|
m_isWWWDupValid = true;
|
|
// found?
|
|
if ( ! g_errno && m_msg22f.m_found ) {
|
|
// crap we are a dup
|
|
m_isWWWDup = true;
|
|
// set the index code
|
|
//m_indexCode = EDOCDUPWWW;
|
|
}
|
|
// return us
|
|
return &m_isWWWDup;
|
|
}
LinkInfo s_dummy2;
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
|
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 )
|
|
return ptr_linkInfo1;
|
|
|
|
// at least get our firstip so if cr->m_getLinkInfo is false
|
|
// then getRevisedSpiderReq() will not core because it is invalid
|
|
int32_t *ip = getFirstIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;
|
|
|
|
|
|
// just return nothing if not doing link voting
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// to keep things fast we avoid getting link info for some collections
|
|
if ( ! m_linkInfo1Valid && ! cr->m_getLinkInfo ) {
|
|
ptr_linkInfo1 = NULL;
|
|
m_linkInfo1Valid = true;
|
|
}
|
|
|
|
// sometimes it is NULL in title rec when setting from title rec
|
|
if ( m_linkInfo1Valid && ! ptr_linkInfo1 ) {
|
|
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
|
s_dummy2.m_lisize = sizeof(LinkInfo);
|
|
ptr_linkInfo1 = &s_dummy2;
|
|
size_linkInfo1 = sizeof(LinkInfo);
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
// return if we got it
|
|
if ( m_linkInfo1Valid )
|
|
return ptr_linkInfo1;
|
|
|
|
// change status
|
|
setStatus ( "getting local inlinkers" );
|
|
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo *)od;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni;
|
|
//int32_t *fip = getFirstIp();
|
|
//if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d;
|
|
// sanity check. error?
|
|
	if ( *d == 0LL ) {
		log("xmldoc: got docid of 0 but no g_errno set");
		g_errno = EBADENGINEER;
		return NULL;
	}
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
|
|
|
|
// no linkinfo for diffbot custom crawls to speed up
|
|
if ( cr->m_isCustomCrawl ) {
|
|
m_linkInfo1Valid = true;
|
|
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
|
s_dummy2.m_lisize = sizeof(LinkInfo);
|
|
ptr_linkInfo1 = &s_dummy2;
|
|
size_linkInfo1 = sizeof(LinkInfo);
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
// grab a ptr to the LinkInfo contained in our Doc class
|
|
LinkInfo *oldLinkInfo1 = NULL;
|
|
if ( *od ) oldLinkInfo1 = (*od)->getLinkInfo1();
|
|
|
|
// if ip does not exist, make it 0
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
m_linkInfo1Valid = true;
|
|
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
|
s_dummy2.m_lisize = sizeof(LinkInfo);
|
|
ptr_linkInfo1 = &s_dummy2;
|
|
size_linkInfo1 = sizeof(LinkInfo);
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
//link info generation requires an IP for internal/external computation
|
|
// UNLESS we are from getSpiderStatusDocMetaList2() ... so handle
|
|
// -1 above!
|
|
//if ( *ip == -1 || *ip == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . error getting linkers?
|
|
// . on udp timeout we were coring below because msg25.m_linkInfo
|
|
// was NULL
|
|
if ( g_errno && m_calledMsg25 ) return NULL;
|
|
// prevent core as well
|
|
//if ( m_calledMsg25 && ! size_linkInfo1 ) { // m_msg25.m_linkInfo ) {
|
|
// log("xmldoc: msg25 had null link info");
|
|
// g_errno = EBADENGINEER;
|
|
// return NULL;
|
|
//}
|
|
|
|
// . now search for some link info for this url/doc
|
|
// . this queries the search engine to get linking docIds along
|
|
// with their termIds/scores from anchor text and then compiles
|
|
// it all into one IndexList
|
|
// . if we have no linkers to this url then we set siteHash, etc.
|
|
// for this linkInfo class
|
|
// . this is my google algorithm
|
|
// . let's use the first url (before redirects) for this
|
|
// . m_newDocId is used for classifying doc under predefined news topic
|
|
// . catSiteRec is used for classifying pages under a predefined
|
|
// newstopic. this is currently for news search only.
|
|
// . use the rootTitleRecPtr if there and we are doing our link info
|
|
// stuff in this collection, but if doing it in another collection
|
|
// the msg25 will look up the root in that collection...
|
|
if ( ! m_calledMsg25 ) {
|
|
// get this
|
|
int32_t lastUpdateTime = getTimeGlobal();
|
|
// but be consistent if doing the "qatest123" collection
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
|
|
lastUpdateTime = getSpideredTime();//m_spideredTime;
|
|
}
|
|
// do not redo it
|
|
m_calledMsg25 = true;
|
|
		// shortcut
|
|
//Msg25 *m = &m_msg25;
|
|
// can we be cancelled?
|
|
bool canBeCancelled = true;
|
|
// not if pageparser though
|
|
if ( m_pbuf ) canBeCancelled = false;
|
|
// not if injecting
|
|
if ( ! m_sreqValid ) canBeCancelled = false;
|
|
// use this buffer so XmlDoc::print() can display wherever
|
|
SafeBuf *sb = NULL;
|
|
if ( m_pbuf ) sb = &m_pageLinkBuf;
|
|
// only do this for showing them!!!
|
|
if ( m_usePageLinkBuf ) sb = &m_pageLinkBuf;
|
|
// get from spider request if there
|
|
//bool injected = false;
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
|
// we do not want to waste time computing the page title
|
|
// of bad inlinks if we only want the good inlinks, because
|
|
// as of oct 25, 2012 we only store the "good" inlinks
|
|
// in the titlerec
|
|
bool onlyNeedGoodInlinks = true;
|
|
// so if steve wants to display all links then set this
|
|
// to false so we get titles of bad inlinks
|
|
if ( m_usePageLinkBuf ) onlyNeedGoodInlinks = false;
|
|
// seems like pageparser.cpp just sets m_pbuf and not
|
|
// m_usePageLinkBuf any more
|
|
if ( m_pbuf ) onlyNeedGoodInlinks = false;
|
|
// status update
|
|
setStatus ( "calling msg25 for url" );
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// we want to get all inlinks if doing a custom crawlbot crawl
|
|
// because we need the anchor text to pass in to diffbot
|
|
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
|
|
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
|
|
// this seems to overdo it when we have a ton of linktext
|
|
// perhaps, so take this out...
|
|
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
|
|
// doLinkSpamCheck = false;
|
|
// oneVotePerIpDom = false;
|
|
// onlyNeedGoodInlinks = false;
|
|
//}
|
|
|
|
// call it. this is defined in Linkdb.cpp
|
|
char *url = getFirstUrl()->getUrl();
|
|
if ( ! getLinkInfo ( &m_tmpBuf12,
|
|
&m_mcast12,
|
|
mysite ,
|
|
url ,
|
|
false , // isSiteLinkInfo?
|
|
*ip ,
|
|
*d ,
|
|
cr->m_collnum , //linkInfoColl
|
|
NULL , // qbuf
|
|
0 , // qbufSize
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_contentInjected ,//m_injectedReply ,
|
|
sb ,
|
|
m_printInXml ,
|
|
*sni ,
|
|
//m_sitePop ,
|
|
oldLinkInfo1 ,
|
|
m_niceness ,
|
|
doLinkSpamCheck ,
|
|
oneVotePerIpDom ,
|
|
canBeCancelled ,
|
|
lastUpdateTime ,
|
|
onlyNeedGoodInlinks ,
|
|
false, // getlinkertitles
|
|
0, // ourhosthash32 (special)
|
|
0, // ourdomhash32 (special)
|
|
&m_myPageLinkInfoBuf
|
|
) )
|
|
// blocked
|
|
return (LinkInfo *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// panic! what the fuck? why did it return true and then
|
|
// call our callback???
|
|
//if ( g_conf.m_logDebugBuild ) {
|
|
log("build: xmldoc call to msg25 did not block");
|
|
// must now block since it uses multicast now to
|
|
// send the request onto the network
|
|
char *xx=NULL;*xx=0;
|
|
//}
|
|
}
|
|
|
|
// at this point assume its valid
|
|
m_linkInfo1Valid = true;
|
|
// . get the link info we got set
|
|
// . this ptr references into m_myPageLinkInfoBuf safebuf
|
|
//ptr_linkInfo1 = m_msg25.m_linkInfo;
|
|
//size_linkInfo1 = m_msg25.m_linkInfo->getSize();
|
|
ptr_linkInfo1 = (LinkInfo *)m_myPageLinkInfoBuf.getBufStart();
|
|
size_linkInfo1 = m_myPageLinkInfoBuf.length();
|
|
// we should free it
|
|
m_freeLinkInfo1 = true;
|
|
// this can not be NULL!
|
|
if ( ! ptr_linkInfo1 || size_linkInfo1 <= 0 ) {
|
|
log("build: error getting linkinfo1: %s",mstrerror(g_errno));
|
|
char *xx=NULL;*xx=0;
|
|
return NULL;
|
|
}
|
|
// take it from msg25 permanently
|
|
//m_msg25.m_linkInfo = NULL;
|
|
// set flag
|
|
m_linkInfo1Valid = true;
|
|
// . validate the hop count thing too
|
|
// . i took hopcount out of linkdb to put in lower ip byte for steve
|
|
//m_minInlinkerHopCount = -1;//m_msg25.getMinInlinkerHopCount();
|
|
// return it
|
|
return ptr_linkInfo1;
|
|
}
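
// Cases above that skip the msg25 lookup entirely and hand back the
// static empty s_dummy2 LinkInfo: link voting disabled for the collection
// (cr->m_getLinkInfo false), diffbot custom crawls, a titlerec that was
// stored with a NULL ptr_linkInfo1, and docs whose ip is 0 or -1.
// Everything else goes through getLinkInfo() in Linkdb.cpp and ends up
// pointing into m_myPageLinkInfoBuf.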
static void *s_null = NULL;
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
LinkInfo **XmlDoc::getLinkInfo2 ( ) {
|
|
|
|
// this can now be title hashes for XmlDoc::m_diffbotTitleHashes
|
|
// but otherwise, we don't use it for link info from another cluster
|
|
// any more.
|
|
m_linkInfo2Valid = true;
|
|
return (LinkInfo **)&s_null;
|
|
|
|
// return if we got it
|
|
if ( m_linkInfo2Valid ) return &ptr_linkInfo2;
|
|
|
|
m_linkInfo2Valid = true;
|
|
ptr_linkInfo2 = NULL;
|
|
return &ptr_linkInfo2;
|
|
|
|
/*
|
|
if ( ! cr->m_importFromHosts2Conf ) {
|
|
m_linkInfo2Valid = true;
|
|
ptr_linkInfo2 = NULL;
|
|
return &ptr_linkInfo2;
|
|
}
|
|
|
|
// change status
|
|
setStatus ( "getting remote hosts2.conf inlinkers" );
|
|
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo **)od;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo **)sni;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo **)ip;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo **)d;
|
|
// grab a ptr to the LinkInfo contained in our Doc class
|
|
LinkInfo *oldLinkInfo2 = NULL;
|
|
if ( *od ) oldLinkInfo2 = *(*od)->getLinkInfo2();
|
|
|
|
// . now search for some link info for this url/doc
|
|
// . this queries the search engine to get linking docIds along
|
|
// with their termIds/scores from anchor text and then compiles
|
|
// it all into one IndexList
|
|
// . if we have no linkers to this url then we set siteHash, etc.
|
|
// for this linkInfo class
|
|
// . this is my google algorithm
|
|
// . let's use the first url (before redirects) for this
|
|
// . m_newDocId is used for classifying doc under predefined news topic
|
|
// . catSiteRec is used for classifying pages under a predefined
|
|
// newstopic. this is currently for news search only.
|
|
// . use the rootTitleRecPtr if there and we are doing our link info
|
|
// stuff in this collection, but if doing it in another collection
|
|
// the msg25 will look up the root in that collection...
|
|
if ( ! m_calledMsg25b ) {
|
|
// do not redo it
|
|
m_calledMsg25b = true;
|
|
		// shortcut
|
|
Msg25 *m = &m_msg25;
|
|
// can we be cancelled?
|
|
bool canBeCancelled = true;
|
|
// not if pageparser though
|
|
if ( m_pbuf ) canBeCancelled = false;
|
|
// not if injecting
|
|
if ( ! m_sreqValid ) canBeCancelled = false;
|
|
// use this buffer so XmlDoc::print() can display wherever
|
|
//SafeBuf *sb = NULL;
|
|
//if ( m_pbuf ) sb = &m_pageLinkBuf2;
|
|
// call it
|
|
if ( ! m->getPageLinkInfo2 ( getFirstUrl() ,
|
|
m_coll ,
|
|
cr->m_externalColl ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
cr->m_doLinkSpamCheck ,
|
|
cr->m_oneVotePerIpDom ,
|
|
canBeCancelled ) )
|
|
// blocked
|
|
return (LinkInfo **)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
|
|
// at this point assume its valid
|
|
m_linkInfo2Valid = true;
|
|
// get the link info we got set
|
|
ptr_linkInfo2 = m_msg25.m_linkInfo;
|
|
// we should free it
|
|
m_freeLinkInfo2 = true;
|
|
// take it from msg25 permanently
|
|
m_msg25.m_linkInfo = NULL;
|
|
// set flag
|
|
m_linkInfo2Valid = true;
|
|
// validate the hop count thing too
|
|
//m_minInlinkerHopCount = m_msg25.getMinInlinkerHopCount();
|
|
// return it
|
|
return &ptr_linkInfo2;
|
|
*/
|
|
}
static void gotSiteWrapper ( void *state ) ;
|
|
|
|
// . we should store the site in the title rec because site getter might
|
|
// change what it thinks the site is!
|
|
char *XmlDoc::getSite ( ) {
|
|
// was there a problem getting site?
|
|
if ( m_siteValid && m_siteGetter.m_errno ) {
|
|
g_errno = m_siteGetter.m_errno;
|
|
return NULL;
|
|
}
|
|
// ok, return it
|
|
if ( m_siteValid ) return ptr_site;//m_siteGetter.m_site;
|
|
// note it
|
|
setStatus ( "getting site");
|
|
// need this
|
|
TagRec *gr = getTagRec();
|
|
// sanity check
|
|
if ( ! gr && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// blocked or error?
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// get url
|
|
Url *f = getFirstUrl();
|
|
// bogus first url? prevent core in getIsSiteRoot().
|
|
if ( f->getUrlLen() <= 1 ) {
|
|
log("xmldoc: getSite: got bogus first url.");
|
|
g_errno = EBADURL;
|
|
return NULL;
|
|
}
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t timestamp = getSpideredTime();//m_spideredTime;
|
|
// add tags to tagdb?
|
|
//bool addTags = true;
|
|
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
|
|
//if ( getIsPageParser() ) addTags = false;
|
|
// do it
|
|
if ( ! m_siteGetter.getSite ( f->getUrl() ,
|
|
gr ,
|
|
timestamp ,
|
|
cr->m_collnum ,
|
|
m_niceness ,
|
|
//addTags ,
|
|
this , // state
|
|
gotSiteWrapper ))
|
|
// return -1 if we blocked
|
|
return (char *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// set these then
|
|
gotSite();
|
|
return ptr_site;//m_siteGetter.m_site;
|
|
}
|
|
|
|
// set it
void gotSiteWrapper ( void *state ) {
	// point to us
	XmlDoc *THIS = (XmlDoc *)state;
	THIS->gotSite ();
	// resume. this checks g_errno for being set.
	THIS->m_masterLoop ( THIS->m_masterState );
}

void XmlDoc::gotSite ( ) {
	// sanity check
	if ( ! m_siteGetter.m_allDone && ! g_errno ) { char *xx=NULL;*xx=0; }
	// this sets g_errno on error
	ptr_site  = m_siteGetter.m_site;
	size_site = m_siteGetter.m_siteLen+1; // include \0
	// sanity check -- must have a site
	if ( ! g_errno && size_site <= 1 ) { char *xx=NULL;*xx=0; }
	// sitegetter.m_errno might be set!
	m_siteValid = true;
	// must be valid
	if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
	// add the sitepathdepth tag to our tagrec
	//Tag *a = m_siteGetter.m_addedTag.getFirstTag();
	//if ( a ) m_newTagRec.addTag ( a );
}
|
|
|
|
int64_t *XmlDoc::getSiteHash64 ( ) {
|
|
if ( m_siteHash64Valid ) return &m_siteHash64;
|
|
char *site = getSite();
|
|
// sanity check
|
|
if ( ! site && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
if ( ! site || site == (void *)-1) return (int64_t *)site;
|
|
m_siteHash64 = hash64 ( site , gbstrlen(site) );
|
|
m_siteHash64Valid = true;
|
|
return &m_siteHash64;
|
|
}
|
|
|
|
|
|
int32_t *XmlDoc::getSiteHash32 ( ) {
|
|
if ( m_siteHash32Valid ) return &m_siteHash32;
|
|
char *site = getSite();
|
|
if ( ! site || site == (void *)-1) return (int32_t *)site;
|
|
m_siteHash32 = hash32 ( site , gbstrlen(site) );
|
|
m_siteHash32Valid = true;
|
|
return &m_siteHash32;
|
|
}
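
// Like most getters in this class, getSiteHash64()/getSiteHash32() follow the
// tri-state return convention used throughout XmlDoc: NULL means g_errno is
// set, (T *)-1 means the call blocked and the registered callback will
// re-enter m_masterLoop later, and any other pointer is a cached value
// guarded by the corresponding m_xxxValid flag. A minimal hypothetical
// caller looks like:
//
//   int32_t *sh32 = xd->getSiteHash32();
//   if ( ! sh32                ) return true;  // error, g_errno is set
//   if ( sh32 == (int32_t *)-1 ) return false; // blocked, wait for callback
//   int32_t h = *sh32;                         // cached in m_siteHash32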
|
|
|
|
|
|
|
|
|
|
void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
|
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
bool hadError = false;
|
|
|
|
THIS->setStatus("got diffbot reply");
|
|
|
|
// wha?
|
|
if ( g_errno ) {
|
|
log("diffbot: http error2 %s",mstrerror(g_errno));
|
|
THIS->m_diffbotReplyError = g_errno;
|
|
hadError = true;
|
|
}
|
|
|
|
// just retry if connection got reset by peer!
|
|
if ( g_errno == ECONNRESET ||
|
|
g_errno == ETIMEDOUT ) {
|
|
retry:
|
|
// reset error in case was set below before our retry.
|
|
// getDiffbotReply() will retry because we never set
|
|
// m_diffbotReplyValid to true, below.
|
|
THIS->m_diffbotReplyError = 0;
|
|
log("build: retrying diffbot reply");
|
|
THIS->m_diffbotReplyRetries++;
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
return;
|
|
}
|
|
|
|
THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
//char *buf = s->m_readBuf;
|
|
// do not allow TcpServer.cpp to free it since m_diffbotReply
|
|
// is now responsible for that
|
|
//s->m_readBuf = NULL;
|
|
|
|
// set the mime
|
|
HttpMime mime;
|
|
if ( ! hadError && s && s->m_readOffset>0 &&
|
|
// set location url to "null"
|
|
! mime.set ( s->m_readBuf , s->m_readOffset , NULL ) ) {
|
|
// g_errno should be set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("build: error setting diffbot mime");
|
|
THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR;
|
|
hadError = true;
|
|
}
|
|
|
|
bool retryUrl = false;
|
|
|
|
// check the status
|
|
if ( ! hadError && mime.getHttpStatus() != 200 ) {
|
|
THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS;
|
|
log("xmldoc: diffbot reply mime was %" INT32 "",
|
|
mime.getHttpStatus());
|
|
hadError = true;
|
|
// gateway timed out? then retry.
|
|
if ( mime.getHttpStatus() == 504 )
|
|
retryUrl = true;
|
|
}
|
|
|
|
if ( hadError )
|
|
log("build: diffbot error for url %s",
|
|
THIS->m_diffbotUrl.getBufStart());
|
|
|
|
|
|
CollectionRec *cr = THIS->getCollRec();
|
|
|
|
if ( cr && strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
|
|
log("build: diffbot reply for url %s = %s",
|
|
THIS->m_diffbotUrl.getBufStart(),
|
|
s->m_readBuf);
|
|
}
|
|
|
|
|
|
if ( retryUrl )
|
|
goto retry;
|
|
|
|
// get page content
|
|
char *page = NULL;
|
|
int32_t pageLen = 0;
|
|
if ( ! hadError && mime.getMimeLen() >= 0 ) {
|
|
page = s->m_readBuf + mime.getMimeLen();
|
|
char *end = s->m_readBuf + s->m_readOffset;
|
|
pageLen = end - page;
|
|
}
|
|
|
|
// "-1" means diffbot had an error
|
|
if ( page &&
|
|
page[0] == '-' &&
|
|
page[1] == '1' ) {
|
|
log("xmldoc: diffbot reply was -1");
|
|
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
|
|
}
|
|
|
|
|
|
// . verify that it contains legit json and has the last field
|
|
// b/c we saw a case where the diffbot reply was truncated
|
|
// somehow
|
|
// . check to make sure it has the "url": field as all diffbot
|
|
// json replies must
|
|
if ( ! THIS->m_diffbotReplyError ) {
|
|
char *ttt = strstr ( page , "\"url\":\"");
|
|
if ( ! ttt ) ttt = strstr ( page , "\"pageUrl\":\"");
|
|
if ( ! ttt ) {
|
|
log("xmldoc: diffbot reply for %s using %s is missing "
|
|
"the url: field in the json reply. reply=%s",
|
|
THIS->m_firstUrl.m_url,
|
|
THIS->m_diffbotUrl.getBufStart(),
|
|
page
|
|
);
|
|
// try to get the right error code
|
|
char *err = strstr(page,"\"error\":\"");
|
|
if ( err ) err += 9;
|
|
int32_t code = EDIFFBOTUNKNOWNERROR;
|
|
if ( ! err &&
|
|
page[0]=='{' &&
|
|
page[1]=='}' )
|
|
code = EDIFFBOTCURLYREPLY;
|
|
if ( err && !strncmp(err,"Unable to apply rules",21))
|
|
code = EDIFFBOTUNABLETOAPPLYRULES;
|
|
// like .pdf pages get this error
|
|
if ( err && !strncmp(err,"Could not parse page",20))
|
|
code = EDIFFBOTCOULDNOTPARSE;
|
|
// if it is 404... 502, etc. any http status code
|
|
if ( err && !strncmp(err,"Could not download page",23))
|
|
code = EDIFFBOTCOULDNOTDOWNLOAD;
|
|
// custom api does not apply to the url
|
|
if ( err && !strncmp(err,"Invalid API",11))
|
|
code = EDIFFBOTINVALIDAPI;
|
|
if ( err && !strncmp(err,"Version required",16))
|
|
code = EDIFFBOTVERSIONREQ;
|
|
if ( err && !strncmp(err,"Empty content",13))
|
|
code = EDIFFBOTEMPTYCONTENT;
|
|
if ( err && !strncmp(err,"The selected pages contains too many TextNodes",46))
|
|
code = EDIFFBOTTOOMANYTEXTNODES;
|
|
if ( err && !strncmp(err,"No content received",19))
|
|
code = EDIFFBOTEMPTYCONTENT;
|
|
if ( err && !strncmp(err,"Request timed",13))
|
|
code = EDIFFBOTREQUESTTIMEDOUT;
|
|
if ( err &&!strncmp(err,"Request of third-party c",24))
|
|
code = EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY;
|
|
// error processing url
|
|
if ( err && !strncmp(err,"Error processing",16))
|
|
code = EDIFFBOTURLPROCESSERROR;
|
|
if ( err && !strncmp(err,"Your token has exp",18))
|
|
code = EDIFFBOTTOKENEXPIRED;
|
|
if ( err && !strncmp(err,"Not authorized API tok",22))
|
|
code = EDIFFBOTTOKENUNAUTHORIZED;
|
|
if ( err && !strncmp(err,"Error.",6) )
|
|
code = EDIFFBOTPLAINERROR;
|
|
THIS->m_diffbotReplyError = code;
|
|
}
|
|
// a hack for detecting if token is expired
|
|
if ( THIS->m_diffbotReplyError == EDIFFBOTTOKENEXPIRED ) {
|
|
// note it
|
|
log("xmldoc: pausing crawl %s (%" INT32 ") because "
|
|
"token is expired",cr->m_coll,
|
|
(int32_t)cr->m_collnum);
|
|
// pause the crawl
|
|
SafeBuf parmList;
|
|
// spidering enabled is the "cse" cgi parm in Parms.cpp
|
|
g_parms.addNewParmToList1 ( &parmList ,
|
|
cr->m_collnum,
|
|
"0", // val
|
|
-1 ,
|
|
"cse");
|
|
// this uses msg4 so parm ordering is guaranteed
|
|
g_parms.broadcastParmList ( &parmList , NULL , NULL );
|
|
}
|
|
}
|
|
|
|
// reply is now valid but might be empty
|
|
THIS->m_diffbotReplyValid = true;
|
|
|
|
// if json reply was truncated, that is an error as well.
|
|
// likewise we have to check if such bad json is in the serps
|
|
// when doing an icc=1 and print 'bad json' in json instead.
|
|
if ( ! THIS->m_diffbotReplyError && s->m_readOffset > 1 &&
|
|
// json must end with '}' (ignores trailing whitespace)
|
|
! endsInCurly ( s->m_readBuf , s->m_readOffset ) ) {
|
|
// hopefully this can be re-tried later.
|
|
THIS->m_diffbotReplyError = EJSONMISSINGLASTCURLY;
|
|
// make a note of it
|
|
log("build: got diffbot reply missing curly for %s",
|
|
THIS->m_firstUrl.m_url);
|
|
}
|
|
|
|
//if ( ! cr ) return;
|
|
|
|
bool countIt = true;
|
|
if ( ! cr ) countIt = false;
|
|
if ( THIS->m_diffbotReplyError ) countIt = false;
|
|
|
|
/*
|
|
|
|
// solution for bug #2092 but probably not really needed so
|
|
// commented out.
|
|
|
|
	// if doing /vxxx/analyze?mode=xxxx then ensure matches
|
|
bool isAnalyze = false;
|
|
if ( countIt &&
|
|
THIS->m_diffbotApiUrlValid &&
|
|
strstr ( THIS->m_diffbotApiUrl.getBufStart(), "/analyze?") )
|
|
isAnalyze = true;
|
|
|
|
char *mode = NULL;
|
|
if ( isAnalyze ) {
|
|
mode = strstr (THIS->m_diffbotApiUrl.getBufStart(), "mode=");
|
|
if ( mode ) mode += 5;
|
|
// find end of it
|
|
}
|
|
|
|
char *pageType = NULL;
|
|
int32_t pageTypeLen;
|
|
if ( mode &&
|
|
THIS->m_diffbotReplyValid &&
|
|
THIS->m_diffbotReply.length() > 5 ) {
|
|
char *reply = THIS->m_diffbotReply.getBufStart();
|
|
pageType = strstr ( reply , "\"type\":\"" );
|
|
if ( pageType ) pageType += 8;
|
|
char *e = pageType;
|
|
for ( ; *e && *e != '\"' ; e++ );
|
|
pageTypeLen = e - pageType;
|
|
}
|
|
|
|
// if it does not match, do not count it
|
|
if ( mode && pageType && strncmp ( mode , pageType , pageTypeLen ) )
|
|
countIt = false;
|
|
*/
|
|
|
|
// increment this counter on a successful reply from diffbot
|
|
if ( countIt ) { // ! THIS->m_diffbotReplyError && cr ) {
|
|
// mark this flag
|
|
THIS->m_gotDiffbotSuccessfulReply = 1;
|
|
// count it for stats
|
|
cr->m_localCrawlInfo.m_pageProcessSuccesses++;
|
|
cr->m_globalCrawlInfo.m_pageProcessSuccesses++;
|
|
// per round as well
|
|
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound++;
|
|
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound++;
|
|
// log it
|
|
log(LOG_INFO,
|
|
"build: processed page %s (pageLen=%" INT32 ")",
|
|
THIS->m_firstUrl.m_url,
|
|
pageLen);
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
// sanity!
|
|
// crap, this can happen if we try to get the metalist
|
|
// of an old page for purposes of incremental indexing or
|
|
// deletion. we do not re-download it, but it seems we try
|
|
// to re-process it...
|
|
//if ( cr->m_localCrawlInfo.m_pageProcessAttempts >
|
|
// cr->m_localCrawlInfo.m_pageDownloadAttempts ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// need to save collection rec now during auto save
|
|
cr->m_needsSave = true;
|
|
// the diffbot api url we used
|
|
//SafeBuf *au = THIS->getDiffbotApiUrl();
|
|
//if ( ! au || au == (void *)-1 ) {char *xx=NULL;*xx=0;}
|
|
// set the reply properly
|
|
int32_t need = pageLen + 1;// + au->length() + 1;
|
|
if ( ! THIS->m_diffbotReply.reserve ( need ) )
|
|
goto skip;
|
|
// first store the url we used on first line
|
|
//THIS->m_diffbotReply.safeMemcpy ( au->getBufStart(),
|
|
// au->length() );
|
|
//THIS->m_diffbotReply.pushChar('\n');
|
|
// convert the \u1f23 to utf8 (\n and \r as well)
|
|
// crap, this decodes \\\\\" to \\" which is causing
|
|
// the json parser to believe it is an encoded \ then
|
|
// a REAL quote... but quote is contained...
|
|
//THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen ,
|
|
// THIS->m_niceness );
|
|
|
|
// do not do that any more then, jsonparse can call it
|
|
// on a per string basis
|
|
THIS->m_diffbotReply.safeMemcpy ( page , pageLen );
|
|
|
|
// convert embedded \0 to space
|
|
//char *p = THIS->m_diffbotReply.getBufStart();
|
|
//char *pend = p + THIS->m_diffbotReply.getLength();
|
|
// tack on a \0 but don't increment m_length
|
|
THIS->m_diffbotReply.nullTerm();
|
|
|
|
// any embedded \0's in the utf8?
|
|
int32_t testLen1 = THIS->m_diffbotReply.length();
|
|
int32_t testLen2 = gbstrlen(THIS->m_diffbotReply.getBufStart());
|
|
if ( testLen1 != testLen2 ) { char *xx=NULL;*xx=0; }
|
|
// convert the \u1f23 to utf8 (\n and \r as well)
|
|
//THIS->m_diffbotReply.decodeJSONToUtf8 ( THIS->m_niceness );
|
|
//THIS->m_diffbotReply.nullTerm();
|
|
}
|
|
|
|
skip:
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {

	if ( m_diffbotApiUrlValid )
		return &m_diffbotApiUrl;

	// if we are a diffbot json object, do not re-send to diffbot!
	if ( m_isDiffbotJSONObject ) {
		//m_diffbotApiNum = DBA_NONE;
		m_diffbotApiUrlValid = true;
		return &m_diffbotApiUrl;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	m_diffbotApiUrl.safeMemcpy ( &cr->m_diffbotApiUrl );
	m_diffbotApiUrl.nullTerm();
	m_diffbotApiUrlValid = true;

	// this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid
	// in case the url filters table changes while spidering this!!!
	// gotta be careful of that.
	//int32_t *ufn = getUrlFilterNum();
	//if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;

	// ensure it does set it!
	//if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }

	//m_diffbotApiNum = cr->m_spiderDiffbotApiNum[*ufn];

	// sanity check
	//if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }

	//m_diffbotApiNumValid = true;
	return &m_diffbotApiUrl;
}
|
|
|
|
// if only processing NEW URLs is enabled, then do not get diffbot reply
// if we already got one before
bool *XmlDoc::getRecycleDiffbotReply ( ) {

	if ( m_recycleDiffbotReplyValid )
		return &m_recycleDiffbotReply;

	// if from pageparser.cpp re-call diffbot for debugging
	if ( getIsPageParser() ) {
		m_recycleDiffbotReply = false;
		m_recycleDiffbotReplyValid = true;
		return &m_recycleDiffbotReply;
	}

	XmlDoc **odp = getOldXmlDoc( );
	if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp;
	XmlDoc *od = *odp;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// if doc has been successfully processed in the past then
	// ***RECYCLE*** the diffbot reply!
	m_recycleDiffbotReply = false;

	if ( cr->m_diffbotOnlyProcessIfNewUrl &&
	     od && od->m_gotDiffbotSuccessfulReply )
		m_recycleDiffbotReply = true;

	// to fight off corrupted title recs, just assume that even though
	// we could not uncompress the title rec, it had a successful reply
	// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
	//      m_oldDocExistedButHadError )
	//	m_recycleDiffbotReply = true;

	// don't recycle if specifically asked to reindex though
	if ( m_sreqValid && m_sreq.m_isPageReindex )
		m_recycleDiffbotReply = false;

	// unless the 'recycle content' checkbox was checked when doing
	// the query (page) reindex...
	if ( m_sreqValid && m_sreq.m_recycleContent )
		m_recycleDiffbotReply = true;

	m_recycleDiffbotReplyValid = true;

	return &m_recycleDiffbotReply;
}
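
// In short, the decision above works out to:
//   recycle = onlyProcessIfNewUrl is on AND an old doc exists AND that old
//             doc already got a successful diffbot reply
//   forced off by an explicit page reindex request
//   forced on  when the reindex was submitted with 'recycle content' checked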
|
|
|
|
// get hashes of the json objects in the diffbotreply
int32_t *XmlDoc::getDiffbotTitleHashes ( int32_t *numHashes ) {

	*numHashes = size_linkInfo2 / 4;

	if ( ! ptr_linkInfo2 ) *numHashes = 0;

	// hack: use linkdbdata2 field
	if ( m_diffbotTitleHashBufValid ) {
		// do not return NULL without g_errno set
		if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
		return (int32_t *)ptr_linkInfo2;
	}

	SafeBuf *tdbr = getTokenizedDiffbotReply();
	if ( ! tdbr || tdbr == (void *)-1 ) return (int32_t *)tdbr;

	HashTableX dedup;
	if ( ! dedup.set ( 4,0,1024,NULL,0,false,m_niceness,"ddthbuf") )
		return NULL;

	// parse out the json items in the reply
	char *p    = tdbr->getBufStart();
	char *pend = p + tdbr->length();

	int32_t plen;

	for ( ; p < pend ; p += plen + 1 ) {
		// breathe some in case diffbot reply is 250MB
		QUICKPOLL(m_niceness);
		// set this
		plen = gbstrlen(p);
		// get title from it
		int32_t valLen;
		char *val = getJSONFieldValue ( p , "title", &valLen );
		int32_t th32 = 0;
		// hash the title
		if ( val && valLen ) {
			th32 = hash32 ( val , valLen );
			// avoid 0
			if ( th32 == 0 ) th32 = 1;
		}
		// if no title, use hash of body
		if ( th32 == 0 ) {
			th32 = hash32 ( p , plen );
			// avoid 0
			if ( th32 == 0 ) th32 = 2;
		}
		// if our hash is duplicated then increment until unique
		while ( dedup.isInTable ( &th32 ) ) th32++;
		// store it for deduping
		dedup.addKey ( &th32 );
		// store it
		m_diffbotTitleHashBuf.pushLong(th32);
	}

	ptr_linkInfo2  = (LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
	size_linkInfo2 = m_diffbotTitleHashBuf.length();
	*numHashes     = size_linkInfo2 / 4;
	m_diffbotTitleHashBufValid = true;

	// if no hashes return 0x01 because NULL means g_errno
	if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;

	return (int32_t *)ptr_linkInfo2;
}
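
// The while()/addKey() pair above guarantees distinct 32-bit hashes even for
// duplicate titles: e.g. three items all titled "Widget" (hypothetical) would
// be stored as h, h+1, h+2 rather than colliding, so every JSON subdocument
// keeps a stable, unique identifier in m_diffbotTitleHashBuf.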
|
|
|
|
// . we now get the TOKENIZED diffbot reply.
// . that converts a single diffbot reply into multiple \0-separated
//   json objects.
// . for instance, the diffbot product api returns an array like
//   "products":[{...},{...}],"url":... that consists of multiple
//   json product items, but the json elements that are not in
//   this array are descriptions of the page itself, like url and title.
//   so we need to carry over these outer json elements to each
//   inner json object we tokenize.
// . in this fashion we'll have separate objects that can each be indexed
//   as a single page, which is what we want for searching.
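// . a hypothetical before/after for the product api case makes that concrete:
//     input : {"url":"http://example.com/p","products":[{"title":"A"},{"title":"B"}]}
//     output: {"url":"http://example.com/p","product":{"title":"A"}}\0
//             {"url":"http://example.com/p","product":{"title":"B"}}\0
//   the "products" array is split and the outer fields are copied into each
//   tokenized object by the array-splitting loop below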
SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
|
|
|
|
if ( m_tokenizedDiffbotReplyValid )
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( ! dbr || dbr == (void *)-1 ) return dbr;
|
|
|
|
// empty? that's easy. might be just "{}\n" i guess
|
|
if ( dbr->length() <= 3 ) return dbr;
|
|
|
|
char *text = dbr->getBufStart();
|
|
|
|
Json jp;
|
|
if ( ! jp.parseJsonStringIntoJsonItems ( text , m_niceness ) ) {
|
|
g_errno = EBADJSONPARSER;
|
|
return NULL;
|
|
}
|
|
|
|
JsonItem *jsonItem = jp.getItem("objects");
|
|
char *array = NULL;
|
|
int32_t arrayLen = 0;
|
|
if ( jsonItem ) {
|
|
array = jsonItem->getArrayStart();
|
|
arrayLen = jsonItem->getArrayLen();
|
|
}
|
|
if ( array && arrayLen > 0 ) {
|
|
m_v3buf.safeMemcpy( array , arrayLen );
|
|
m_v3buf.nullTerm();
|
|
// trim off the enclosing []'s
|
|
char *p = m_v3buf.getBufStart();
|
|
for ( ; *p && is_wspace_a(*p) ; p++ );
|
|
if ( *p == '[') *p = ' ';
|
|
char *e = m_v3buf.getBuf()-1;
|
|
for ( ; e>p && is_wspace_a(*e) ;e--);
|
|
if ( *e ==']') *e=' ';
|
|
// replace top level commas with \0's
|
|
int32_t curlies = 0;
|
|
char *x = p;
|
|
bool inQuotes = false;
|
|
// scan now
|
|
for ( ; *x ; x++ ) {
|
|
// escaping a backslash?
|
|
if ( *x == '\\' && x[1] == '\\' ) {
|
|
// skip two bytes then..
|
|
x++;
|
|
continue;
|
|
}
|
|
// escaping a quote? ignore quote then.
|
|
if ( *x == '\\' && x[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
x++;
|
|
continue;
|
|
}
|
|
if ( *x == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
continue;
|
|
}
|
|
// if in a quote, ignore {} in there
|
|
if ( inQuotes ) continue;
|
|
if ( *x== '{' ) {
|
|
curlies++;
|
|
continue;
|
|
}
|
|
if ( *x == '}' ) {
|
|
curlies--;
|
|
continue;
|
|
}
|
|
if ( curlies != 0 ) continue;
|
|
if ( *x == ',' ) *x = '\0';
|
|
}
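		// at this point m_v3buf holds the array items back to back,
		// e.g. (hypothetical input) [{"a":1},{"b":2}] has become
		// ' {"a":1}\0{"b":2} ' -- the enclosing brackets were blanked
		// to spaces above and each top-level comma was replaced with
		// a \0 separator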
|
|
m_tokenizedDiffbotReplyPtr = &m_v3buf;
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
|
|
// it must have \"type\":\"product or \"type\":\"image
|
|
// in order for us to do the array separation logic below.
|
|
// we don't want to do this logic for articles because they
|
|
// contain an image array!!!
|
|
|
|
// this must be on the FIRST level of the json object, otherwise
|
|
// we get errors because we got type:article and it
|
|
// contains an images array!
|
|
|
|
int32_t valLen;
|
|
char *val = getJSONFieldValue ( text , "type", &valLen );
|
|
|
|
bool isProduct = false;
|
|
bool isImage = false;
|
|
|
|
if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 )
|
|
isProduct = true;
|
|
|
|
if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 )
|
|
isImage = true;
|
|
|
|
if ( ! isProduct && ! isImage ) {
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
|
|
char *needle;
|
|
char *newTerm;
|
|
if ( isProduct ) {
|
|
needle = ",\"products\":[";
|
|
newTerm = "product";
|
|
}
|
|
else {
|
|
needle = ",\"images\":[";
|
|
newTerm = "image";
|
|
}
|
|
|
|
char *parray = strstr ( text , needle );
|
|
|
|
// if not found, no need to do anything...
|
|
if ( ! parray ) {
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
|
|
// point to [
|
|
char *pstart = parray + gbstrlen(needle) - 1;
|
|
|
|
//
|
|
	// ok, now we have to do some json ju jitsu to fix it
|
|
//
|
|
|
|
// point to array. starting at the '['
|
|
char *p = pstart;
|
|
int32_t brackets = 0;
|
|
bool inQuotes = false;
|
|
for ( ; *p ; p++ ) {
|
|
// escaping a quote? ignore quote then.
|
|
if ( *p == '\\' && p[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
p++;
|
|
continue;
|
|
}
|
|
if ( *p == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
continue;
|
|
}
|
|
// if in a quote, ignore {} in there
|
|
if ( inQuotes ) continue;
|
|
if ( *p == '[' ) brackets++;
|
|
if ( *p != ']' ) continue;
|
|
brackets--;
|
|
// stop if array is done. p points to ']'
|
|
if ( brackets == 0 ) break;
|
|
}
|
|
|
|
	// now point to outer items to the left of the ",\"products\":[...
|
|
char *left1 = dbr->getBufStart();
|
|
char *left2 = parray;
|
|
// then to the right. skip over the ending ']'
|
|
char *right1 = p + 1;
|
|
char *right2 = dbr->getBuf(); // end of the buffer
|
|
|
|
|
|
SafeBuf *tbuf = &m_tokenizedDiffbotReply;
|
|
|
|
// now scan the json products or images in the array
|
|
char *x = pstart;
|
|
// skip over [
|
|
x++;
|
|
// each product item in array is enclosed in {}'s
|
|
if ( *x != '{' ) {
|
|
log("build: something is wrong with diffbot reply");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
// reset CURLY bracket count
|
|
int32_t curlies = 0;
|
|
char *xstart = NULL;
|
|
inQuotes = false;
|
|
// scan now
|
|
for ( ; x < right1 ; x++ ) {
|
|
// escaping a quote? ignore quote then.
|
|
if ( *x == '\\' && x[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
x++;
|
|
continue;
|
|
}
|
|
if ( *x == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
continue;
|
|
}
|
|
// if in a quote, ignore {} in there
|
|
if ( inQuotes ) continue;
|
|
if ( *x== '{' ) {
|
|
if ( curlies == 0 ) xstart = x;
|
|
curlies++;
|
|
continue;
|
|
}
|
|
if ( *x == '}' ) {
|
|
curlies--;
|
|
if ( curlies != 0 ) continue;
|
|
// unreciprocated '{'? wtf???
|
|
if ( ! xstart ) continue;
|
|
// skip empty curlies
|
|
if ( x[-1] == '{' ) continue;
|
|
//
|
|
// ok, we got an item!
|
|
//
|
|
|
|
// left top items
|
|
if ( ! tbuf->safeMemcpy ( left1 , left2-left1 ) )
|
|
return NULL;
|
|
// use "product":
|
|
|
|
if ( ! tbuf->safePrintf(",\"%s\":" , newTerm ) )
|
|
return NULL;
|
|
			// the item itself, include its curlies.
|
|
if ( ! tbuf->safeMemcpy ( xstart , x - xstart+1 ) )
|
|
return NULL;
|
|
// right top items
|
|
if ( ! tbuf->safeMemcpy ( right1 , right2-right1 ) )
|
|
return NULL;
|
|
// then a \0
|
|
if ( ! tbuf->pushChar('\0') )
|
|
return NULL;
|
|
// reset this!
|
|
xstart = NULL;
|
|
}
|
|
}
|
|
|
|
// now show the items. debug!
|
|
//p = tbuf->getBufStart();
|
|
//for ( ; p < tbuf->getBuf() ; p += gbstrlen(p) + 1 )
|
|
// fprintf(stderr,"ITEM\n%s\n\n",p);
|
|
|
|
|
|
m_tokenizedDiffbotReplyPtr = tbuf;
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
void gotDiffbotProxyReplyWrapper ( void *state , UdpSlot *slot ) {
	XmlDoc *THIS = (XmlDoc *)state;
	THIS->m_diffbotProxyReply = NULL;
	// if a valid reply, then point to it
	if ( slot->m_readBufSize == sizeof(ProxyReply) ) {
		THIS->m_diffbotProxyReply = (ProxyReply *)slot->m_readBuf;
		// steal it, we will free it in XmlDoc::reset()
		slot->m_readBuf = NULL;
	}
	// resume. this checks g_errno for being set.
	THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
// . convert document into json representing multiple documents
|
|
// if it makes sense. sometimes a single url contains multiple
|
|
// subdocuments that each should have their own url, but do not,
|
|
// so we fix that here.
|
|
// . the diffbot reply will be a list of json objects we want to index
|
|
SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
|
|
|
// got reply of malformed json missing final '}'
|
|
if ( m_diffbotReplyValid &&
|
|
m_diffbotReplyError == EJSONMISSINGLASTCURLY ) {
|
|
// hopefully spider will retry later
|
|
g_errno = m_diffbotReplyError;
|
|
return NULL;
|
|
}
|
|
|
|
if ( m_diffbotReplyValid )
|
|
return &m_diffbotReply;
|
|
|
|
// . check the url filters table to see if diffbot api is specified
|
|
// . just return "\0" if none, but NULL means error i guess
|
|
SafeBuf *au = getDiffbotApiUrl();
|
|
if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
|
|
|
|
// if no url, assume do not access diffbot
|
|
if ( au->length() <= 0 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// if we are json do not send that to diffbot, like an injected
|
|
// json diffbot object. should fix json injections into global index
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
|
|
if ( *ct == CT_JSON ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// we make a "fake" url for the diffbot reply when indexing it
|
|
// by appending -diffbotxyz%" UINT32 ". see "fakeUrl" below.
|
|
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
|
|
if ( m_firstUrlValid )
|
|
log("build: diffbot url would be too long for "
|
|
"%s", m_firstUrl.getUrl() );
|
|
else
|
|
log("build: diffbot url would be too long for "
|
|
"%" INT64 "", m_docId );
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// getIndexCode() calls getDiffbotReply(), so avoid a loop!
|
|
//if ( *getIndexCode() )
|
|
// return &m_diffbotReply;
|
|
if ( m_indexCodeValid && m_indexCode )
|
|
return &m_diffbotReply;
|
|
|
|
|
|
if ( m_isDiffbotJSONObject ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// if this is a robots.txt or a root page we are downloading
|
|
	// separately to get the title to compare to this page's title,
|
|
// or whatever, do not pass to diffbot
|
|
if ( m_isChildDoc ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// get list of substring patterns
|
|
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
|
|
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
|
|
if ( upp && ! upp[0] ) upp = NULL;
|
|
if ( ucp && ! ucp[0] ) ucp = NULL;
|
|
// do we match the url process pattern or regex?
|
|
// get the compiled regular expressions
|
|
//regex_t *ucr = &cr->m_ucr;
|
|
regex_t *upr = &cr->m_upr;
|
|
//if ( ! cr->m_hasucr ) ucr = NULL;
|
|
if ( ! cr->m_hasupr ) upr = NULL;
|
|
// get the url
|
|
Url *f = getFirstUrl();
|
|
char *url = f->getUrl();
|
|
// . "upp" is a ||-separated list of substrings
|
|
// . "upr" is a regex
|
|
// . regexec returns 0 for a match
|
|
if ( upr && regexec(upr,url,0,NULL,0) ) {
|
|
// return empty reply
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
if ( upp && !upr &&!doesStringContainPattern(url,upp)) {
|
|
// return empty reply
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
|
|
|
|
// if already processed and onlyprocessifnewurl is enabled then
|
|
// we recycle and do not bother with this, we also do not nuke
|
|
// the diffbot json objects we have already indexed by calling
|
|
// nukeJSONObjects()
|
|
bool *recycle = getRecycleDiffbotReply();
|
|
if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle;
|
|
if ( *recycle ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// if set from title rec, do not do it. we are possibly an "old doc"
|
|
// and we should only call diffbot.com with new docs
|
|
if ( m_setFromTitleRec ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// "none" means none too! Parms.cpp doesn't like &dapi1=& because
|
|
// it does not call setParm() on such things even though it probably
|
|
// should, it doesn't like no values, so i put "none" in there.
|
|
if ( strncasecmp(au->getBufStart(),"none",4) == 0 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
if ( strncasecmp(au->getBufStart(),"donotprocess",12) == 0 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// invalid url?
|
|
Url apiUrl; apiUrl.set ( au->getBufStart() );
|
|
if (apiUrl.getUrlLen() <= 0 ||
|
|
apiUrl.getHostLen() <= 0 ||
|
|
apiUrl.getDomainLen() <= 0 ) {
|
|
log("build: invalid diffbot api url of \"%s\".",
|
|
au->getBufStart() );
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// when respidering an "old" doc, never call this. we already
|
|
// have the diffbot replies xyz.com/-diffbot-0 and xyz.com/-diffbot-1
|
|
// etc.
|
|
//if ( m_setFromTitleRec ) { char *xx = NULL; *xx = 0; }
|
|
|
|
// sanity check. no! barfs on legit url with -diffbot- in it
|
|
//if ( strstr(m_firstUrl.m_url,"-diffbot-") ) {
|
|
// char *xx=NULL; *xx = 0; }
|
|
|
|
// we should not "process" (i.e. send to diffbot) urls that do
|
|
// not match the supplied CollectionRec::m_diffbotUrlProcessPattern
|
|
// let's just put a checkbox in the url filters box for this!
|
|
// i.e. Send to Diffbot? [X]
|
|
//if ( m_useDiffbot && ! doesUrlMatchDiffbotProcessPattern() ) {
|
|
// m_diffbotReplyValid = true;
|
|
// return &m_diffbotReply;
|
|
//}
|
|
|
|
// empty content, do not send to diffbot then
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
|
|
if ( ! *u8 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// do not send to diffbot if its binary!
|
|
char *ib = getIsBinary();
|
|
if ( ! ib || ib == (void *)-1 ) return (SafeBuf *)ib;
|
|
if ( *ib ) {
|
|
m_diffbotReplyValid = true;
|
|
log("diffbot: skipping binary page %s",m_firstUrl.m_url);
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// or if original page content matches the page regex dont hit diffbot
|
|
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// now include referring link anchor text, etc.
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
|
|
|
|
|
|
setStatus("getting diffbot reply");
|
|
|
|
|
|
// set up dedup table for deduping on link text
|
|
HashTableX dedup;
|
|
char tmp[512];
|
|
if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") )
|
|
return NULL;
|
|
|
|
SafeBuf headers;
|
|
bool first = true;
|
|
|
|
// . make additional headers
|
|
// . add two headers for every "good" (non-dup) link
|
|
// . do NOT end headers in \r\n since HttpServer adds that!
|
|
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// sanity
|
|
if ( k->size_urlBuf <= 1 ) continue;
|
|
// skip if too long
|
|
if ( k->size_linkText > 1024 ) continue;
|
|
// or not enough! (size includes \0)
|
|
if ( k->size_linkText <= 1 ) continue;
|
|
// sanity check
|
|
char *txt = k->getLinkText();
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// this seems to happen sometimes..
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) continue;
|
|
// if anchor text has \0 skip it
|
|
if ( gbstrlen(txt) != tlen ) continue;
|
|
// or if surrounding text has \0 skip as well
|
|
char *surStr = k->getSurroundingText();
|
|
int32_t surLen = k->size_surroundingText;
|
|
if ( surLen > 0 ) surLen--;
|
|
if ( surStr && gbstrlen(surStr) != surLen ) continue;
|
|
// dedup on that
|
|
int32_t h32 = hash32 ( txt , tlen );
|
|
if ( dedup.isInTable ( &h32 ) ) continue;
|
|
if ( ! dedup.addKey ( &h32 ) ) return NULL;
|
|
// separate with \r\n
|
|
if ( ! first && ! headers.safePrintf("\r\n" ) )
|
|
return NULL;
|
|
first = false;
|
|
// add to http header
|
|
if ( ! headers.safePrintf("X-referring-url: ") )
|
|
return NULL;
|
|
// do not include the terminating \0, so -1
|
|
if ( ! headers.safeMemcpy(k->getUrl() , k->size_urlBuf-1 ))
|
|
return NULL;
|
|
// and link text
|
|
if ( ! headers.safePrintf("\r\nX-anchor-text: ") )
|
|
return NULL;
|
|
// store the anchor text without any \r or \n chars
|
|
if ( ! headers.reserve ( tlen ) ) return NULL;
|
|
char *p = txt;
|
|
char *pend = txt + tlen;
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( *p == '\r' ) continue;
|
|
if ( *p == '\n' ) continue;
|
|
headers.pushChar(*p);
|
|
}
|
|
// do not include it if more than 2000 chars big
|
|
if ( surLen > 0 && surLen < 2000 ) {
|
|
if ( ! headers.safePrintf("\r\nX-surrounding-text: ") )
|
|
return NULL;
|
|
// make room for copying the surrounding text
|
|
if ( ! headers.reserve ( surLen ) ) return NULL;
|
|
// copy minus any \r or \n so its mime header safe
|
|
p = surStr;
|
|
pend = surStr + surLen;
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( *p == '\r' ) continue;
|
|
if ( *p == '\n' ) continue;
|
|
headers.pushChar(*p);
|
|
}
|
|
}
|
|
}
|
|
|
|
// make sure to null term the headers
|
|
if ( headers.length() && ! headers.nullTerm() ) return NULL;
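
	// the additional headers built above end up looking roughly like
	// this (hypothetical link values), one X-referring-url/X-anchor-text
	// pair per unique inlink, entries separated by \r\n:
	//   X-referring-url: http://example.com/some/page.html
	//   X-anchor-text: widgets on sale
	//   X-surrounding-text: ...we found cheap widgets on sale here...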
|
|
|
|
//char *path = "api";
|
|
//if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 )
|
|
// path = "v2";
|
|
|
|
//
|
|
// DIFFBOT injection interface TODO
|
|
//
|
|
// if we are intercepting a direct injection diffbot request
|
|
// then we will probably take the exact same parms provided and
|
|
// just relay them to diffbot here. maybe Diffbot.cpp can set
|
|
// the original diffbot.com request url in this xmldoc class that
|
|
// is being inject using the url encoded in that request.
|
|
//
|
|
|
|
// url can be on the stack since httpserver.cpp makes an http mime
|
|
// from this url
|
|
//SafeBuf diffbotUrl;
|
|
|
|
// TODO: make sure "api" works as hostname for not just product...
|
|
//diffbotUrl.safePrintf("http://www.diffbot.com/");
|
|
// skip extra '/'?
|
|
//char *api = au->getBufStart();
|
|
//int32_t apiLen = au->length();
|
|
//if ( api && api[0] == '/' ) { api++; apiLen--; }
|
|
// append the custom url. i.e. /api/analyze?mode=auto&u=
|
|
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
|
|
|
|
// reset it in case we are a re-call from gotDiffbotReplyWrapper()
|
|
// if g_errno == ECONNRESET
|
|
m_diffbotUrl.reset();
|
|
// store the api url into here
|
|
m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() );
|
|
|
|
// . m_diffbotApi Is like "article" or "product" etc.
|
|
// . if classify is true we always return the classification
|
|
// of the page in the JSON. like "type":"frontpage" regardless
|
|
// of the "api" specified.
|
|
// . otherwise, if classify is false empty json will be returned
|
|
// if there is no json objects of the specified page type, "api"
|
|
// . BUT if api is "all" return all types of json objects
|
|
// . SHOULD we return "type" in the json output?
|
|
/*
|
|
if ( *an == DBA_ALL )
|
|
diffbotUrl.safePrintf("analyze?mode=auto&" );
|
|
else if ( *an == DBA_ARTICLE_FORCE )
|
|
diffbotUrl.safePrintf("article?");
|
|
else if ( *an == DBA_ARTICLE_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=article&");
|
|
else if ( *an == DBA_PRODUCT_FORCE )
|
|
diffbotUrl.safePrintf("product?");
|
|
else if ( *an == DBA_PRODUCT_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=product&");
|
|
else if ( *an == DBA_IMAGE_FORCE )
|
|
diffbotUrl.safePrintf("image?");
|
|
else if ( *an == DBA_IMAGE_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=image&");
|
|
else if ( *an == DBA_FRONTPAGE_FORCE )
|
|
diffbotUrl.safePrintf("frontpage?");
|
|
else if ( *an == DBA_FRONTPAGE_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=frontpage&");
|
|
else {
|
|
log("build: unknown diffbot api num = %" INT32 ". assuming all",*an );
|
|
diffbotUrl.safePrintf("analyze?mode=auto&" );
|
|
}
|
|
*/
|
|
|
|
//CollectionRec *cr = getCollRec();
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// add a '?' if none
|
|
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
|
|
m_diffbotUrl.pushChar('?');
|
|
else
|
|
m_diffbotUrl.pushChar('&');
|
|
|
|
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
|
|
// only print token if we have one, because if user provides their
|
|
// own diffbot url (apiUrl in Parms.cpp) then they might include
|
|
// the token in that for their non-custom crawl. m_customCrawl=0.
|
|
if ( cr->m_diffbotToken.length())
|
|
m_diffbotUrl.safePrintf("token=%s",
|
|
cr->m_diffbotToken.getBufStart());
|
|
|
|
bool useProxies = true;
|
|
// user can turn off proxy use with this switch
|
|
if ( ! g_conf.m_useProxyIps ) useProxies = false;
|
|
// did collection override?
|
|
if ( cr->m_forceUseFloaters ) useProxies = true;
|
|
// we gotta have some proxy ips that we can use
|
|
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
|
|
|
|
// until we fix https CONNECT support for https urls diffbot can't
|
|
// go through gb. we should fix that by downloading the whole page
|
|
// ourselves and sending it back, and tell diffbot's phantomjs not
|
|
// to do the certificate check.
|
|
//
|
|
// for now, allow http and NOT https urls through though.
|
|
// TODO: if the url redirects to an https url will this mess us up?
|
|
// if ( ! m_firstUrlValid )
|
|
// useProxies = false;
|
|
// if ( m_firstUrlValid && m_firstUrl.isHttps() )
|
|
// useProxies = false;
|
|
|
|
// turn off for now always
|
|
//useProxies = false;
|
|
|
|
if ( useProxies && ! m_diffbotProxyReplyValid && m_ipValid ) {
|
|
// a special opcode used in SpiderProxy.cpp
|
|
Msg13Request *r = &m_diffbotProxyRequest;
|
|
r->m_opCode = OP_GETPROXYFORDIFFBOT;
|
|
r->m_banProxyIp = 0;
|
|
r->m_urlIp = m_ip;
|
|
m_diffbotProxyReplyValid = true;
|
|
// get first alive host, usually host #0 but if he is dead then
|
|
// host #1 must take over! if all are dead, it returns host #0.
|
|
		// so we are guaranteed "h" will be non-null
|
|
Host *h = g_hostdb.getFirstAliveHost();
|
|
// now ask that host for the best spider proxy to send to
|
|
if ( ! g_udpServer.sendRequest ( (char *)r,
|
|
// just the top part of the
|
|
// Msg13Request is sent to
|
|
// handleRequest54() now
|
|
r->getProxyRequestSize() ,
|
|
0x54 , // msgType 0x54
|
|
h->m_ip ,
|
|
h->m_port ,
|
|
-1 , // h->m_hostId ,
|
|
NULL ,
|
|
this , // state data
|
|
gotDiffbotProxyReplyWrapper,
|
|
9999999 )){// 99999sectimeout
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// report it
|
|
log("spider: msg54 request3: %s %s",
|
|
mstrerror(g_errno),r->ptr_url);
|
|
return NULL;
|
|
}
|
|
// wait for reply
|
|
return (SafeBuf *)-1;
|
|
}
|
|
|
|
|
|
// if we used a proxy to download the doc, then diffbot should too
|
|
// BUT tell diffbot to go through host #0 so we can send it to the
|
|
// correct proxy using our load balancing & backoff algos.
|
|
if ( useProxies ) {
|
|
//Host *h0 = g_hostdb.getHost(0);
|
|
// use a random host now to avoid host #0 running
|
|
// out of sockets from diffbot trying to connect
|
|
// for downloading hundreds of urls from the same
|
|
// high crawl delay site.
|
|
// round robin over the hosts just to be more evenly
|
|
// distributed. it will likely get several http requests
|
|
// from diffbot.
|
|
// static int32_t s_lastHostId = -1;
|
|
// if ( s_lastHostId == -1 )
|
|
// s_lastHostId = g_hostdb.m_myHost->m_hostId;
|
|
// int32_t r = s_lastHostId;//rand() % g_hostdb.m_numHosts;
|
|
// if ( ++s_lastHostId >= g_hostdb.m_numHosts )
|
|
// s_lastHostId = 0;
|
|
// Host *h0 = g_hostdb.getHost(r);
|
|
// m_diffbotUrl.safePrintf("&proxy=%s:%" INT32 "",
|
|
// iptoa(h0->m_ip),
|
|
// (int32_t)h0->m_httpPort);
|
|
ProxyReply *prep = m_diffbotProxyReply;
|
|
m_diffbotUrl.safePrintf("&proxy=%s:%" UINT32 "",
|
|
iptoa(prep->m_proxyIp),
|
|
(uint32_t)prep->m_proxyPort);
|
|
m_diffbotUrl.safePrintf("&proxyAuth=");
|
|
m_diffbotUrl.urlEncode(prep->m_usernamePwd);
|
|
}
|
|
// char *p = g_conf.m_proxyAuth.getBufStart();
|
|
// if ( useProxies && p ) {
|
|
// char *p1 = p;
|
|
// for ( ; *p1 && is_wspace_a(*p1) ; p1++ );
|
|
// char *p2 = p1;
|
|
// for ( ; *p2 && ! is_wspace_a(*p2) ; p2++ );
|
|
// char c = *p2;
|
|
// *p2 = '\0';
|
|
// m_diffbotUrl.safePrintf("&proxyAuth=");
|
|
// m_diffbotUrl.urlEncode(p1);
|
|
// *p2 = c;
|
|
// }
|
|
|
|
// now so it works just give it a proxy directly, so it doesn't
|
|
// have to go through gb.
|
|
// if ( useProxies ) {
|
|
// // msg13 typically uses this to get an unbanned proxy
|
|
// getProxiesToUse();
|
|
// }
|
|
|
|
// if we use proxies then increase the timeout since proxies
|
|
// increase the crawl delay in hopes of backing off to discover
|
|
// the website's policy so we don't hit it too hard and get banned.
|
|
// so to avoid diffbot timing out tell it to wait up to a minute
|
|
// because the crawl delay can be as high as that, even higher
|
|
if ( useProxies )
|
|
m_diffbotUrl.safePrintf("&timeout=%" INT32 "",
|
|
(int32_t)MAX_PROXYCRAWLDELAYMS+10000);
|
|
|
|
m_diffbotUrl.safePrintf("&url=");
|
|
// give diffbot the url to process
|
|
m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
|
|
// append this just in case the next thing doesn't have it.
|
|
//if ( cr->m_diffbotApiQueryString.length() &&
|
|
// cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
|
|
// diffbotUrl.pushChar('&');
|
|
// then user provided parms that are dependent on if it is an
|
|
// article, product, etc. like "&dontstripads=1" or whatever
|
|
//diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart());
|
|
|
|
// for analyze requests without mode=, make sure that diffbot expands all objects
|
|
// "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze
|
|
// null term it so that we can use strstr (shouldn't be necessary since safePrintf appears to do this already and is called above)
|
|
if (m_diffbotUrl.nullTerm()) {
|
|
char *u = m_diffbotUrl.getBufStart();
|
|
if (strstr(u, "/analyze") && !strstr(u, "mode=")) {
|
|
m_diffbotUrl.safePrintf("&expand");
|
|
}
|
|
}
|
|
|
|
// null term it
|
|
m_diffbotUrl.nullTerm();
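
	// at this point the request url is fully assembled, roughly of the
	// form (endpoint and token values are hypothetical):
	//   <apiUrl>?token=XXX[&proxy=IP:PORT&proxyAuth=...][&timeout=...]
	//           &url=<url-encoded page url>[&expand]
	// where the proxy and timeout parameters were only added when
	// useProxies is true, and &expand only for mode-less /analyze requests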
|
|
|
|
// mark as tried
|
|
if ( m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
m_sentToDiffbotThisTime = true;
|
|
|
|
// might have been a recall if gotDiffbotReplyWrapper() sensed
|
|
// g_errno == ECONNRESET and it will retry
|
|
if ( ! m_sentToDiffbot ) {
|
|
|
|
m_sentToDiffbot = 1;
|
|
|
|
// count it for stats
|
|
cr->m_localCrawlInfo.m_pageProcessAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageProcessAttempts++;
|
|
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
|
|
cr->m_needsSave = true;
|
|
}
|
|
|
|
char *additionalHeaders = NULL;
|
|
if ( headers.length() > 0 )
|
|
additionalHeaders = headers.getBufStart();
|
|
|
|
// if did not get the web page first and we are crawling, not
|
|
// doing a bulk, then core. we need the webpage to harvest links
|
|
// and sometimes to check the pageprocesspattern to see if we should
|
|
// process.
|
|
if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
log(LOG_INFO,
|
|
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
|
|
additionalHeaders);
|
|
|
|
m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
|
|
0 , // ip
|
|
0 , // offset
|
|
-1 , // size
|
|
0 , // ifmodifiedsince
|
|
this , // state
|
|
gotDiffbotReplyWrapper ,
|
|
// MDW: boost timeout from 180 to 18000
|
|
// seconds so we can figure out why
|
|
// diffbot times out, etc. what is
|
|
// going on.
|
|
				     18000*1000, // 18000 sec timeout
|
|
0,//proxyip
|
|
0,//proxyport
|
|
// unlimited replies i guess
|
|
-1,//maxtextdoclen unlimited
|
|
-1,//maxotherdoclen unlimited
|
|
g_conf.m_spiderUserAgent ,
|
|
"HTTP/1.0",
|
|
false, // do post?
|
|
NULL, // cookie
|
|
additionalHeaders ) )
|
|
// return -1 if blocked
|
|
return (SafeBuf *)-1;
|
|
// error?
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// wha?
|
|
log("diffbot: http error %s",mstrerror(g_errno));
|
|
// had an error!
|
|
return NULL;
|
|
}
|
|
|
|
char **XmlDoc::getHttpReply ( ) {
	// both must be valid now
	if ( m_redirUrlValid && m_httpReplyValid ) {
		// might have been a download error of ECORRUPTDATA
		if ( m_downloadStatus == ECORRUPTDATA ) {
			// set g_errno so caller knows
			g_errno = m_downloadStatus;
			// null means error
			return NULL;
		}
		// otherwise, assume reply is valid
		return &m_httpReply;
	}

	setStatus("getting http reply");

	// come back up here if a redirect invalidates it
 loop:
	// sanity test -- only if not the test collection (NO, might be EBADIP)
	//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
	// get the http reply
	char **replyPtr = getHttpReply2();
	if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
	// . now if the reply was a redirect we should set m_redirUrl to it
	//   and re-do all this code
	// . this often sets m_indexCode to stuff like ESIMPLIFIEDREDIR, etc.
	Url **redirp = getRedirUrl();
	// we often lookup the associated linkInfo on the original url to
	// see if it is worth keeping and indexing just to take advantage of
	// the incoming link text it has, so we may block on that!
	// but in the case of a contactDoc, getContactDoc() sets these things
	// to NULL to avoid unnecessary lookups.
	if ( ! redirp || redirp == (void *)-1 ) return (char **)redirp;
	// sanity check
	if ( *redirp && ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
	// if NULL, we are done
	if ( ! *redirp ) return &m_httpReply;
	// . also, hang it up if we got a simplified redir url now
	// . we set m_redirUrl so that getLinks() can add a spiderRequest
	//   for it, but we do not want to actually redirect to it to get
	//   the content for THIS document
	if ( m_redirError ) return &m_httpReply;
	// and invalidate the redir url because we do not know if the
	// current url will redirect or not (mdwmdw)
	m_redirUrlValid = false;
	m_metaRedirUrlValid = false;
	// free it
	mfree ( m_httpReply , m_httpReplyAllocSize, "freehr" );
	// always nullify if we free so we do not re-use freed mem
	m_httpReply = NULL;
	// otherwise, we had a redirect, so invalidate what we had set
	m_httpReplyValid = false;
	// do not invalidate this any more, now it is when we STARTED spidering
	// the document
	//m_spideredTimeValid = false;
	m_isContentTruncatedValid = false;
	// do not redo robots.txt lookup if the redir url just changed from
	// http to https or vice versa
	Url *ru = *redirp;
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1) return (char **)cu;
	if ( strcmp ( ru->getUrl() + ru->getSchemeLen() ,
		      cu->getUrl() + cu->getSchemeLen() ) ) {
		// redo robots.txt lookup. might be cached.
		m_isAllowedValid  = false;
		m_crawlDelayValid = false;
	}
	// keep the same ip if hostname is unchanged
	if ( ru->getHostLen() != cu->getHostLen() ||
	     strncmp ( ru->getHost() , cu->getHost(), cu->getHostLen() ) )
		// ip is supposed to be that of the current url, which changed
		m_ipValid = false;
	// we set our m_xml to the http reply to check for meta redirects
	// in the html sometimes in getRedirUrl() so since we are redirecting,
	// invalidate that xml
	m_xmlValid                 = false;
	m_wordsValid               = false;
	m_rawUtf8ContentValid      = false;
	m_expandedUtf8ContentValid = false;
	m_utf8ContentValid         = false;
	m_filteredContentValid     = false;
	m_contentValid             = false;
	m_mimeValid                = false;
	// update our current url now to be the redirected url
	m_currentUrl.set ( *redirp , false );
	m_currentUrlValid = true;
	// loop it
	goto loop;
}
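
// The loop above keeps at most one http reply buffer around at a time: every
// time getRedirUrl() yields a real redirect, the old reply is freed, the
// parse-related validity flags are cleared, m_currentUrl is pointed at the
// redirect target, and the getHttpReply2()/getRedirUrl() sequence runs again
// until a reply with no further redirect (or a redirect error such as
// ESIMPLIFIEDREDIR) is handed back to the caller.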
|
|
|
|
void gotHttpReplyWrapper ( void *state ) {
	// point to us
	XmlDoc *THIS = (XmlDoc *)state;
	// this sets g_errno on error
	THIS->gotHttpReply ( );
	// resume. this checks g_errno for being set.
	THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
// "NULL" can be a valid http reply (empty page) so we need to use "char **"
|
|
char **XmlDoc::getHttpReply2 ( ) {
|
|
if ( m_httpReplyValid ) return &m_httpReply;
|
|
|
|
setStatus("getting http reply2");
|
|
|
|
|
|
// if recycle is set then NEVER download if doing query reindex
|
|
// but if doing an injection then i guess we can download.
|
|
// do not even do ip lookup if no old titlerec, which is how we
|
|
// ended up here...
|
|
if ( m_recycleContent && m_sreqValid && m_sreq.m_isPageReindex ) {
|
|
g_errno = ENOTITLEREC;
|
|
return NULL;
|
|
}
|
|
|
|
// doing a query reindex on diffbot objects does not have a
|
|
// valid spider request, only sets m_recycleContent to true
|
|
// in reindexJSONObjects()/redoJSONObjects()
|
|
if ( m_recycleContent && m_isDiffbotJSONObject ) {
|
|
g_errno = ENOTITLEREC;
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// get ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (char **)ip;
|
|
|
|
// reset
|
|
m_httpReplySize = 0;
|
|
m_httpReply = NULL;
|
|
|
|
// if ip is bogus, we are done
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
log("xmldoc: ip is bogus 0 or -1 for %s. skipping download",
|
|
m_firstUrl.getUrl());
|
|
m_httpReplyValid = true;
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// need this now too. but don't hurt a nonzero val if we have
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
return &m_httpReply;
|
|
//return gotHttpReply ( );
|
|
}
|
|
|
|
// get this. should operate on current url (i.e. redir url if there)
|
|
bool *isAllowed = getIsAllowed();
|
|
// error or blocked
|
|
if ( ! isAllowed || isAllowed == (void *)-1) return (char **)isAllowed;
|
|
// this must be valid, since we share m_msg13 with it
|
|
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *cd = getFinalCrawlDelay();
|
|
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
|
|
|
|
// we might bail
|
|
if ( ! *isAllowed ) {
|
|
m_httpReplyValid = true;
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// need this now too. but don't hurt a nonzero val if we have
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
m_downloadStatusValid = true;
|
|
// forbidden? assume we downloaded it and it was empty
|
|
m_downloadStatus = 0; // EDOCDISALLOWED;//403;
|
|
return &m_httpReply;
|
|
//return gotHttpReply ( );
|
|
}
|
|
|
|
// are we site root page?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
|
|
//int8_t *hc = getHopCount();
|
|
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
|
|
|
|
XmlDoc *od = NULL;
|
|
if ( ! m_isSpiderProxy &&
|
|
// don't lookup xyz.com/robots.txt in titledb
|
|
! isFirstUrlRobotsTxt() ) {
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
|
|
// get ptr to old xml doc, could be NULL if non exists
|
|
od = *pod;
|
|
}
|
|
|
|
// sanity check
|
|
if ( od && m_recycleContent ) {char *xx=NULL;*xx=0; }
|
|
|
|
// validate m_firstIpValid
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// robots.txt and css files etc should have m_isChildDoc as true
|
|
//if ( ! m_downloadAttempted && ! m_isChildDoc )
|
|
// // keep track of spider stats
|
|
// cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
|
|
// we made an attempt to download, so mark it
|
|
//m_downloadAttempted = true;
|
|
|
|
// if we didn't block getting the lock, keep going
|
|
setStatus ( "getting web page" );
|
|
|
|
|
|
// sanity check
|
|
if ( ! m_masterLoop ) { char *xx=NULL;*xx=0; }
|
|
|
|
	// shortcut. this will return the redirUrl if it is non-empty.
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
|
|
|
|
/*
|
|
// if on google, make it empty so we do not hit them
|
|
if ( strstr(cu->getUrl(),".google.com/") ) {
|
|
log("spider: encountered google.com url. emptying.");
|
|
m_httpReplyValid = true;
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// need this now too. but don't hurt a nonzero val if we have
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
return &m_httpReply;
|
|
}
|
|
*/
|
|
|
|
// no ip found means empty page i guess
|
|
//if ( *ip == 0 || *ip == -1 )
|
|
// return gotHttpReply ( );
|
|
|
|
bool useTestCache = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
|
// unless its the pagesubmit.cpp event submission tool
|
|
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
|
|
|
// sanity check
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set parms
|
|
Msg13Request *r = &m_msg13Request;
|
|
// clear it first
|
|
r->reset();
|
|
// and set the url
|
|
//strcpy ( r->m_url , cu->getUrl() );
|
|
r->ptr_url = cu->getUrl();
|
|
r->size_url = cu->getUrlLen()+1;
|
|
|
|
// caution: m_sreq.m_hopCountValid is false sometimes for page parser
|
|
// this is used for Msg13.cpp's ipWasBanned()
|
|
// we use hopcount now instead of isInSeedBuf(cr,r->ptr_url)
|
|
bool isInjecting = getIsInjecting();
|
|
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
|
|
r->m_isRootSeedUrl = 1;
|
|
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
|
|
r->m_isRootSeedUrl = 1;
|
|
|
|
// sanity check
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// r->m_maxTextDocLen = maxDownload;
|
|
// r->m_maxOtherDocLen = maxDownload;
|
|
r->m_maxTextDocLen = cr->m_maxTextDocLen;
|
|
r->m_maxOtherDocLen = cr->m_maxOtherDocLen;
|
|
|
|
// max to download in bytes. currently 1MB.
|
|
//int32_t maxDownload = (int32_t)MAXDOCLEN;
|
|
// but if url is http://127.0.0.1.... or local then
|
|
if ( m_ipValid ) {
|
|
// make into a string
|
|
char *ipStr = iptoa(m_ip);
|
|
// is it local?
|
|
bool isLocal = false;
|
|
if ( strncmp(ipStr,"192.168.",8) == 0) isLocal = true;
|
|
if ( strncmp(ipStr,"10." ,3) == 0) isLocal = true;
|
|
if ( m_ip == 16777343 ) isLocal = true; // 127.0.0.1 ?
|
|
// . if local then make web page download max size unlimited
|
|
// . this is for adding the gbdmoz.urls.txt.* files to
|
|
// populate dmoz. those files are about 25MB each.
|
|
if ( isLocal ) {
|
|
//maxDownload = -1;
|
|
r->m_maxTextDocLen = -1;
|
|
r->m_maxOtherDocLen = -1;
|
|
}
|
|
}
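	// (in the check above, 16777343 == 0x0100007F, i.e. the four bytes
	//  127,0,0,1 in network order read back as a little-endian int32,
	//  so it really is 127.0.0.1)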
|
|
// m_maxCacheAge is set for getting contact or root docs in
|
|
// getContactDoc() and getRootDoc() and it only applies to
|
|
// titleRecs in titledb i guess... but still... for Msg13 it applies
|
|
// to its cache ... for robots.txt files too
|
|
r->m_maxCacheAge = m_maxCacheAge;
|
|
r->m_urlIp = *ip;
|
|
r->m_firstIp = m_firstIp;
|
|
r->m_urlHash48 = getFirstUrlHash48();
|
|
if ( r->m_maxTextDocLen < 100000 ) r->m_maxTextDocLen = 100000;
|
|
if ( r->m_maxOtherDocLen < 200000 ) r->m_maxOtherDocLen = 200000;
|
|
r->m_forwardDownloadRequest = (bool)m_forwardDownloadRequest;
|
|
r->m_useTestCache = (bool)useTestCache;
|
|
r->m_spideredTime = getSpideredTime();//m_spideredTime;
|
|
r->m_ifModifiedSince = 0;
|
|
r->m_skipHammerCheck = 0;
|
|
|
|
//if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
|
|
//else r->m_addToTestCache = false;
|
|
r->m_addToTestCache = (bool)useTestCache;
|
|
|
|
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
|
|
r->ptr_cookie = m_redirCookieBuf.getBufStart();
|
|
r->size_cookie = m_redirCookieBuf.length() + 1;
|
|
// . only do once per redirect
|
|
// . do not invalidate because we might have to carry it
|
|
// through to the next redir... unless we change domain
|
|
// . this fixes the nyt.com/nytimes.com bug some more
|
|
//m_redirCookieBufValid = false;
|
|
}
|
|
|
|
// . this is -1 if unknown. none found in robots.txt or provided
|
|
// in the custom crawl parms.
|
|
// . it should also be 0 for the robots.txt file itself
|
|
r->m_crawlDelayMS = *cd;
|
|
|
|
// let's time our crawl delay from the initiation of the download
|
|
// not from the end of the download. this will make things a little
|
|
// faster but could slam servers more.
|
|
r->m_crawlDelayFromEnd = false;
|
|
|
|
// need this in order to get all languages, etc. and avoid having
|
|
// to set words class at the spider compression proxy level
|
|
r->m_forEvents = 0;
|
|
// new stuff
|
|
r->m_contentHash32 = 0;
|
|
// if valid in SpiderRequest, use it. if spider compression proxy
|
|
// sees the content is unchanged it will not send it back! it will
|
|
// send back g_errno = EDOCUNCHANGED or something
|
|
if ( m_sreqValid )
|
|
r->m_contentHash32 = m_sreq.m_contentHash32;
|
|
|
|
// if we have the old doc already set use that
|
|
if ( od )
|
|
r->m_contentHash32 = od->m_contentHash32;
|
|
|
|
// force floater usage on even if "use spider proxies" parms is off
|
|
// if we're a diffbot crawl and use robots is off.
|
|
//if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
|
|
// r->m_forceUseFloaters = true;
|
|
|
|
// for beta testing, make it a collection specific parm for diffbot
|
|
// so we can turn on manually
|
|
if ( cr->m_forceUseFloaters )
|
|
r->m_forceUseFloaters = true;
|
|
|
|
// eventgurubot is the max
|
|
//char *userAgent = g_conf.m_spiderUserAgent;
|
|
// hardcode it
|
|
//char *userAgent = "EventGuruBot";
|
|
//int32_t uaLen = gbstrlen(userAgent);
|
|
//if ( uaLen > 12 ) {
|
|
// log("spider: user agent string too long");
|
|
// uaLen = 12;
|
|
//}
|
|
//strncpy(r->m_userAgent,userAgent,uaLen);
|
|
//r->m_userAgent[uaLen] = '\0';
|
|
|
|
// turn this off too
|
|
r->m_attemptedIframeExpansion = false;
|
|
|
|
r->m_collnum = (collnum_t)-1;
|
|
if ( m_collnumValid )r->m_collnum = m_collnum;
|
|
|
|
// turn off
|
|
r->m_useCompressionProxy = false;
|
|
r->m_compressReply = false;
|
|
r->m_isCustomCrawl = cr->m_isCustomCrawl;
|
|
|
|
// set it for this too
|
|
if ( g_conf.m_useCompressionProxy &&
|
|
// do not use for the test collection ever, that is qa'ing
|
|
strcmp(cr->m_coll,"qatest123") ) {
|
|
r->m_useCompressionProxy = true;
|
|
r->m_compressReply = true;
|
|
}
|
|
|
|
// are we a robots.txt file?
|
|
//bool isRobotsTxt = isRobotsTxtFile ( cu->getUrl() , cu->getUrlLen());
|
|
|
|
char *td = getTestDir();
|
|
if ( td ) strncpy ( r->m_testDir, td, 31);
|
|
|
|
//r->m_isPageParser = getIsPageParser();
|
|
//r->m_isPageInject = ( m_sreqValid && m_sreq.m_isInjecting );
|
|
|
|
// if current url IS NOT EQUAL to first url then set redir flag
|
|
if ( strcmp(cu->m_url,m_firstUrl.m_url) )
|
|
r->m_skipHammerCheck = 1;
|
|
// or if this is an m_extraDoc or m_rootDoc for another url then
|
|
// do not bother printing the hammer ip msg in msg13.cpp either
|
|
if ( m_isChildDoc )
|
|
r->m_skipHammerCheck = 1;
|
|
|
|
if ( m_contentInjected ) // oldsrValid && m_sreq.m_isInjecting )
|
|
r->m_skipHammerCheck = 1;
|
|
|
|
// or if ahrefs
|
|
if ( strncmp(cu->m_url,"http://api.ahrefs.com/",22) == 0 )
|
|
r->m_skipHammerCheck = 1;
|
|
|
|
if ( r->m_skipHammerCheck )
|
|
log(LOG_DEBUG,"build: skipping hammer check");
|
|
|
|
// if we had already spidered it... try to save bandwidth and time
|
|
if ( od ) {
|
|
// sanity check
|
|
if ( ! od->m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// only get it if modified since last spider time
|
|
r->m_ifModifiedSince = od->m_spideredTime;
|
|
}
|
|
|
|
// tell msg13 he is scraping...
|
|
if ( m_sreqValid && m_sreq.m_isScraping )
|
|
r->m_isScraping = 1;
|
|
|
|
// if doing frame expansion on a doc we just downloaded as the
|
|
// spider proxy, we are asking ourselves now to download the url
|
|
// from an <iframe src=...> tag. so definitely use msg13 again
|
|
// so it can use the robots.txt cache, and regular html page cache.
|
|
if ( m_isSpiderProxy ) {
|
|
r->m_useCompressionProxy = false;
|
|
r->m_compressReply = false;
|
|
r->m_skipHammerCheck = 1;
|
|
//r->m_requireGoodDate = false;
|
|
// no frames within frames
|
|
r->m_attemptedIframeExpansion = 1;
|
|
log(LOG_DEBUG,"build: skipping hammer check 2");
|
|
|
|
}
|
|
|
|
// . use msg13 to download the file, robots.txt
|
|
// . msg13 will ensure only one download of that url w/ locks
|
|
// . msg13 can use the compress the http reply before
|
|
// sending it back to you via udp (compression proxy)
|
|
// . msg13 uses XmlDoc::getHttpReply() function to handle
|
|
// redirects, etc.? no...
|
|
bool isTestColl = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
|
|
|
|
//if ( isTestColl && m_contentType == CT_IMAGE )
|
|
// isTestColl = false;
|
|
|
|
// sanity check. keep injections fast. no downloading!
|
|
if ( m_wasContentInjected ) {
|
|
log("xmldoc: url injection failed! error!");
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// sanity check
|
|
if ( m_deleteFromIndex ) {
|
|
log("xmldoc: trying to download page to delete");
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
m_downloadStartTimeValid = true;
|
|
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
|
|
// return -1 if blocked
|
|
return (char **)-1;
|
|
return gotHttpReply ( );
|
|
}
|
|
// . returns NULL with g_errno set on error
// . otherwise returns a ptr to m_httpReply
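// . takes ownership of msg13's reply buffer and records the download
//   status and download end time for this url
// . corrupt or oversized replies (ECORRUPTHTTPGZIP, EMSGTOOBIG, or a reply
//   that does not end in \0) are freed and treated as an empty page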
|
|
char **XmlDoc::gotHttpReply ( ) {
|
|
// save it
|
|
int32_t saved = g_errno;
|
|
// note it
|
|
setStatus ( "got web page" );
|
|
|
|
// sanity check. are we already valid?
|
|
if ( m_httpReply && m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do not re-call
|
|
m_httpReplyValid = true;
|
|
|
|
// assume none
|
|
m_httpReply = NULL;
|
|
|
|
// . get the HTTP reply
|
|
// . TODO: free it on reset/destruction, we own it now
|
|
// . this is now NULL terminated thanks to changes in
|
|
// Msg13.cpp, but watch the buf size, need to subtract 1
|
|
// . therefore, we can set the Xml class with it
|
|
m_httpReply = m_msg13.m_replyBuf;
|
|
m_httpReplySize = m_msg13.m_replyBufSize;
|
|
// how much to free?
|
|
m_httpReplyAllocSize = m_msg13.m_replyBufAllocSize;
|
|
|
|
// sanity check
|
|
if ( m_httpReplySize > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
|
|
// what is this for? that makes it into a length not a size!
|
|
//if ( m_httpReplySize > 0 ) m_httpReplySize--;
|
|
// . save entire reply length we read from the net so
|
|
// SpiderCache
|
|
// can use it for its m_avgReplyLen for throttling
|
|
// . m_bufLen may change due to filtering
|
|
//m_replyLen = m_bufLen;
|
|
// . don't let UdpServer free m_buf when socket is
|
|
// recycled/closed
|
|
// . we own it now and are responsible for freeing it
|
|
//slot->m_readBuf = NULL;
|
|
m_msg13.m_replyBuf = NULL;
|
|
// relabel mem so we know where it came from
|
|
relabel( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . sanity test -- only if not the test collection
|
|
// . i.e. what are you doing downloading the page if there was
|
|
// a problem with the page we already know about
|
|
if ( m_indexCode && m_indexCodeValid &&
|
|
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
|
|
|
|
// fix this
|
|
if ( saved == EDOCUNCHANGED ) {
|
|
// assign content from it since unchanged
|
|
m_recycleContent = true;
|
|
// clear the error
|
|
saved = 0;
|
|
g_errno = 0;
|
|
}
|
|
|
|
// . save the error in download status
|
|
// . could now be EDOCUNCHANGED or EDOCNOGOODDATE (w/ tod)
|
|
m_downloadStatus = saved; // g_errno;
|
|
// validate
|
|
m_downloadStatusValid = true;
|
|
|
|
// update m_downloadEndTime if we should, used for sameIpWait
|
|
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
|
|
m_downloadEndTimeValid = true;
|
|
|
|
// make it so
|
|
g_errno = saved;
|
|
|
|
bool doIncrement = true;
|
|
if ( m_isChildDoc ) doIncrement = false;
|
|
if ( m_incrementedDownloadCount ) doIncrement = false;
|
|
|
|
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
|
|
|
// if it doesn't match the crawl pattern, just the process pattern
|
|
// then do not increment download successes
|
|
if ( doIncrement &&
|
|
cr->m_isCustomCrawl == 1 &&
|
|
// allow seeds to be counted
|
|
! isSeed &&
|
|
//! sreq->m_isPageReindex &&
|
|
//! sreq->m_isInjecting &&
|
|
! doesUrlMatchDiffbotCrawlPattern() )
|
|
doIncrement = false;
|
|
|
|
|
|
|
|
// . do not count bad http status in mime as failure i guess
|
|
// . do not inc this count for robots.txt and root page downloads, etc.
|
|
if ( doIncrement ) {
|
|
cr->m_localCrawlInfo.m_pageDownloadSuccesses++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadSuccesses++;
|
|
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound++;
|
|
m_incrementedDownloadCount = true;
|
|
cr->m_needsSave = true;
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
}
|
|
|
|
// this means the spider compression proxy's reply got corrupted
|
|
// over roadrunner's crappy wireless internet connection
|
|
if ( saved == ECORRUPTDATA ) return NULL;
|
|
// this one happens too! for the same reason...
|
|
if ( saved == EBADREPLYSIZE ) return NULL;
|
|
// might as well check this too while we're at it
|
|
if ( saved == ENOMEM ) return NULL;
|
|
|
|
// sanity check -- check after bailing on corruption because
|
|
// corrupted replies do not end in NULLs
|
|
if ( m_httpReplySize > 0 && m_httpReply[m_httpReplySize-1] ) {
|
|
log("http: httpReplySize=%" INT32 " http reply does not end in \\0 "
|
|
"for %s in collnum=%" INT32 ". blanking out reply."
|
|
,m_httpReplySize
|
|
,m_firstUrl.m_url
|
|
,(int32_t)m_collnum
|
|
);
|
|
// free it i guess
|
|
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
|
// and reset it
|
|
m_httpReplySize = 0;
|
|
m_httpReply = NULL;
|
|
m_httpReplyAllocSize = 0;
|
|
// call it data corruption i guess for now
|
|
g_errno = ECORRUPTDATA;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// if its a bad gzip reply, a compressed http reply, then
|
|
// make the whole thing empty? some websites return compressed replies
|
|
// even though we do not ask for them. and then the compression
|
|
// is corrupt.
|
|
if ( saved == ECORRUPTHTTPGZIP ||
|
|
// if somehow we got a page too big for MAX_DGRAMS... treat
|
|
// it like an empty page...
|
|
saved == EMSGTOOBIG ) {
|
|
// free it i guess
|
|
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
|
// and reset it
|
|
m_httpReplySize = 0;
|
|
m_httpReply = NULL;
|
|
m_httpReplyAllocSize = 0;
|
|
}
|
|
|
|
// if errors were not local, reset g_errno and set m_indexCode
|
|
//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
|
|
//if ( g_errno == EBADMIME ) m_indexCode = EBADMIME;
|
|
// clear g_errno
|
|
//if ( m_indexCode ) g_errno = 0;
|
|
// return if cancelled, etc.
|
|
//if ( g_errno ) return NULL;
|
|
|
|
// clear this i guess
|
|
g_errno = 0;
|
|
|
|
/*
|
|
MDW: 2/8/16 this logic now below in getIsContentTruncated() function
|
|
|
|
// shortcut - convert size to length
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
|
|
m_isContentTruncated = false;
|
|
// was the content truncated? these might label a doc is truncated
|
|
// when it really is not... but we only use this for link spam stuff,
|
|
// so it should not matter too much. it should only happen rarely.
|
|
//if ( LEN >= cr->m_maxTextDocLen-1 ) m_isContentTruncated = true;
|
|
//if ( LEN >= cr->m_maxOtherDocLen-1 ) m_isContentTruncated = true;
|
|
if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
|
|
// set this
|
|
m_isContentTruncated2 = (bool)m_isContentTruncated;
|
|
// validate it
|
|
m_isContentTruncatedValid = true;
|
|
*/
|
|
|
|
return &m_httpReply;
|
|
}
|
|
|
|
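// . getIsContentTruncated() returns a ptr to a char flag that is true if the
//   downloaded reply length hit the collection's maxTextDocLen (html docs) or
//   maxOtherDocLen (non-html docs) limit
// . a negative limit means unlimited, so those docs are never flagged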
char *XmlDoc::getIsContentTruncated ( ) {
|
|
if ( m_isContentTruncatedValid ) return &m_isContentTruncated2;
|
|
|
|
setStatus ( "getting is content truncated" );
|
|
|
|
// if recycling content use its download end time
|
|
if ( m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (char *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
m_isContentTruncated = od->m_isContentTruncated;
|
|
m_isContentTruncated2 = (bool)m_isContentTruncated;
|
|
m_isContentTruncatedValid = true;
|
|
return &m_isContentTruncated2;
|
|
}
|
|
}
|
|
|
|
// need a valid reply
|
|
char **replyPtr = getHttpReply ();
|
|
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char *)replyPtr;
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (char *)ct;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// shortcut - convert size to length
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
|
|
m_isContentTruncated = false;
|
|
// was the content truncated? these might label a doc is truncated
|
|
// when it really is not... but we only use this for link spam stuff,
|
|
// so it should not matter too much. it should only happen rarely.
|
|
if ( cr->m_maxTextDocLen >= 0 &&
|
|
LEN >= cr->m_maxTextDocLen-1 &&
|
|
*ct == CT_HTML )
|
|
m_isContentTruncated = true;
|
|
|
|
if ( cr->m_maxOtherDocLen >= 0 &&
|
|
LEN >= cr->m_maxOtherDocLen-1 &&
|
|
*ct != CT_HTML )
|
|
m_isContentTruncated = true;
|
|
|
|
//if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
|
|
// set this
|
|
m_isContentTruncated2 = (bool)m_isContentTruncated;
|
|
// validate it
|
|
m_isContentTruncatedValid = true;
|
|
|
|
return &m_isContentTruncated2;
|
|
}
|
|
|
|
int32_t *XmlDoc::getDownloadStatus ( ) {
|
|
if ( m_downloadStatusValid ) return &m_downloadStatus;
|
|
// log it
|
|
setStatus ( "getting download status");
|
|
// if recycling content, we're 200!
|
|
if ( m_recycleContent ) {
|
|
m_downloadStatus = 0;
|
|
m_downloadStatusValid = true;
|
|
return &m_downloadStatus;
|
|
}
|
|
// get ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
|
|
// . first try ip
|
|
// . this means the dns lookup timed out
|
|
if ( *ip == -1 ) {
|
|
m_downloadStatus = EDNSTIMEDOUT;
|
|
m_downloadStatusValid = true;
|
|
return &m_downloadStatus;
|
|
}
|
|
// this means ip does not exist
|
|
if ( *ip == 0 ) {
|
|
m_downloadStatus = EBADIP;
|
|
m_downloadStatusValid = true;
|
|
return &m_downloadStatus;
|
|
}
|
|
// need a valid reply
|
|
char **reply = getHttpReply ();
|
|
if ( ! reply || reply == (void *)-1 ) return (int32_t *)reply;
|
|
// must be valid now
|
|
if ( ! m_downloadStatusValid ) { char *xx=NULL;*xx=0; }
|
|
// return it
|
|
return &m_downloadStatus;
|
|
}
|
|
|
|
int64_t *XmlDoc::getDownloadEndTime ( ) {
|
|
if ( m_downloadEndTimeValid ) return &m_downloadEndTime;
|
|
// log it
|
|
setStatus ( "getting download end time");
|
|
|
|
// do not cause us to core in getHttpReply2() because m_deleteFromIndex
|
|
// is set to true...
|
|
if ( m_deleteFromIndex ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
return &m_downloadEndTime;
|
|
}
|
|
|
|
// if recycling content use its download end time
|
|
if ( m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (int64_t *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
m_downloadEndTime = od->m_downloadEndTime;
|
|
m_downloadEndTimeValid = true;
|
|
return &m_downloadEndTime;
|
|
}
|
|
}
|
|
|
|
// need a valid reply
|
|
char **reply = getHttpReply ();
|
|
if ( ! reply || reply == (void *)-1 ) return (int64_t *)reply;
|
|
// must be valid now
|
|
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0;}
|
|
// return it
|
|
return &m_downloadEndTime;
|
|
}
|
|
|
|
|
|
int16_t *XmlDoc::getHttpStatus ( ) {
|
|
// if we got a title rec then return that
|
|
if ( m_httpStatusValid ) return &m_httpStatus;
|
|
// get mime otherwise
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (int16_t *)mime;
|
|
// get from that
|
|
m_httpStatus = mime->getHttpStatus();
|
|
m_httpStatusValid = true;
|
|
return &m_httpStatus;
|
|
}
|
|
|
|
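// . parses the http mime header out of m_httpReply
// . injected docs (m_useFakeMime) and recycled content get a fake 200
//   text/html mime instead
// . if the mime fails to parse we return the fake mime with content length 0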
HttpMime *XmlDoc::getMime () {
|
|
if ( m_mimeValid ) return &m_mime;
|
|
|
|
// log debug
|
|
setStatus("getting http mime");
|
|
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1) return (HttpMime *)cu;
|
|
|
|
// injection from SpiderLoop.cpp sets this to true
|
|
if ( m_useFakeMime ) {
|
|
usefake:
|
|
m_mime.set ( NULL , 0 , cu );
|
|
m_mime.setHttpStatus ( 200 );
|
|
m_mime.setContentType ( CT_HTML );
|
|
m_mimeValid = true;
|
|
return &m_mime;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if recycling content, fake this mime
|
|
if ( cr->m_recycleContent || m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (HttpMime *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// . this is non-NULL if it existed
|
|
// . fake it for now
|
|
if ( od ) goto usefake;
|
|
}
|
|
|
|
// need a valid reply
|
|
char **reply = getHttpReply ();
|
|
if ( ! reply || reply == (void *)-1 ) return (HttpMime *)reply;
|
|
|
|
// fake it for now
|
|
m_mime.set ( NULL , 0 , cu );
|
|
m_mime.setHttpStatus ( 200 );
|
|
m_mime.setContentType ( CT_HTML );
|
|
|
|
// shortcut
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
|
|
// validate it
|
|
m_mimeValid = true;
|
|
|
|
// TODO: try again on failures because server may have been overloaded
|
|
// and closed the connection w/o sending anything
|
|
if ( LEN>0 && ! m_mime.set ( m_httpReply , LEN , cu ) ) {
|
|
// set this on mime error
|
|
//m_indexCode = EBADMIME;
|
|
// return a fake thing. content length is 0.
|
|
return &m_mime;
|
|
}
|
|
|
|
// . check the mime status, should be in the 200's for success
|
|
// . spider should redirect on 3xx codes
|
|
// . 404 means not found, etc.
|
|
// . 304 is not modified since
|
|
// . >= 300 should only happen if redirect chain was too long to follow
|
|
//int32_t httpStatus = m_mime.getHttpStatus();
|
|
// sanity check, these must be reserved! no longer, we have
|
|
// a separate m_httpStatus in the SpiderReply class now
|
|
//if ( mstrerror(httpStatus) ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
//if ( m_indexCode ) { char *xx=NULL;*xx=0; }
|
|
// set it
|
|
//m_indexCode = httpStatus;
|
|
// clear if it was ok though
|
|
//if ( m_indexCode == 200 ) m_indexCode = 0;
|
|
// bail out now
|
|
return &m_mime;
|
|
}
|
|
|
|
// need to use "char **" since content might be NULL itself, if none
|
|
char **XmlDoc::getContent ( ) {
|
|
if ( m_contentValid ) return &m_content;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// recycle?
|
|
if ( cr->m_recycleContent || m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
m_content = od->ptr_utf8Content;
|
|
m_contentLen = od->size_utf8Content - 1;
|
|
m_contentValid = true;
|
|
return &m_content;
|
|
}
|
|
if ( m_recycleContent )
|
|
log("xmldoc: failed to load old title rec "
|
|
"when recycle content was true and url = "
|
|
"%s",ptr_firstUrl);
|
|
// if could not find title rec and we are docid-based then
|
|
// we can't go any further!!
|
|
if ( m_setFromDocId ) {
|
|
log("xmldoc: null content for docid-based titlerec "
|
|
"lookup which was not found");
|
|
m_content = NULL;
|
|
m_contentLen = 0;
|
|
m_contentValid = true;
|
|
return &m_content;
|
|
}
|
|
}
|
|
|
|
if ( m_recycleContent ) {
|
|
if ( m_firstUrlValid )
|
|
log("xmldoc: failed to recycle content for %s. could "
|
|
"not load title rec",m_firstUrl.m_url);
|
|
else if ( m_docIdValid )
|
|
log("xmldoc: failed to recycle content for %" UINT64 ". "
|
|
"could "
|
|
"not load title rec",m_docId );
|
|
else
|
|
log("xmldoc: failed to recycle content. "
|
|
"could not load title rec" );
|
|
// let's let it pass and just download i guess, then
|
|
// we can get page stats for urls not in the index
|
|
//g_errno = EBADENGINEER;
|
|
//return NULL;
|
|
}
|
|
|
|
|
|
// if we were set from a title rec then we do not have the original
// content, and caller should be calling getUtf8Content() anyway!!
|
|
if ( m_setFromTitleRec ) { char *xx=NULL; *xx=0; }
|
|
|
|
// query reindex has m_setFromDocId to true and we WANT to re-download
|
|
// the content... so why did i have this here? MDW 9/25/2014
|
|
//if ( m_setFromDocId ) { char *xx=NULL; *xx=0; }
|
|
|
|
// recycle?
|
|
//if ( m_recycleContent ) { char *xx=NULL; *xx=0; }
|
|
|
|
// get the mime first
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (char **)mime;
|
|
|
|
// http reply must be valid
|
|
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make it valid
|
|
m_contentValid = true;
|
|
|
|
// assume none
|
|
m_content = NULL;
|
|
m_contentLen = 0;
|
|
|
|
// all done if no reply
|
|
if ( ! m_httpReply ) return &m_content;
|
|
|
|
// set the content, account for mime header
|
|
m_content = m_httpReply + mime->getMimeLen() ;
|
|
m_contentLen = m_httpReplySize - mime->getMimeLen() ;
|
|
|
|
// watch out for this!
|
|
if ( m_useFakeMime ) {
|
|
m_content = m_httpReply;
|
|
m_contentLen = m_httpReplySize;
|
|
}
|
|
|
|
// why is this not really the size???
|
|
m_contentLen--;
|
|
|
|
// sanity check
|
|
if ( m_contentLen < 0 ) { char *xx = NULL; *xx = 0; }
|
|
return &m_content;
|
|
}
|
|
|
|
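// . quick content-type sniffing helper: scans roughly the first 100 bytes for
//   a <!doctype ...> declaration and maps it to CT_HTML or CT_XML
//   e.g. "<!DOCTYPE html PUBLIC ..." => CT_HTML
// . returns 0 if nothing is found so the caller keeps the mime's content type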
char getContentTypeFromContent ( char *p , int32_t niceness ) {
|
|
char ctype = 0;
|
|
// max
|
|
char *pmax = p + 100;
|
|
// check that out
|
|
for ( ; p && *p && p < pmax ; p++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( p[0] != '<' ) continue;
|
|
if ( p[1] != '!' ) continue;
|
|
if ( to_lower_a(p[2]) != 'd' ) continue;
|
|
if ( strncasecmp(p,"<!doctype ",10) ) continue;
|
|
char *dt = p + 10;
|
|
// skip spaces
|
|
for ( ; *dt ; dt++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( ! is_wspace_a ( *dt ) ) break;
|
|
}
|
|
// point to that
|
|
if ( ! strncasecmp(dt,"html" ,4) ) ctype = CT_HTML;
|
|
if ( ! strncasecmp(dt,"xml" ,3) ) ctype = CT_XML;
|
|
if ( ! strncasecmp(dt,"text/html",9) ) ctype = CT_HTML;
|
|
if ( ! strncasecmp(dt,"text/xml" ,8) ) ctype = CT_XML;
|
|
break;
|
|
}
|
|
return ctype;
|
|
}
|
|
|
|
uint8_t *XmlDoc::getContentType ( ) {
|
|
if ( m_contentTypeValid ) return &m_contentType;
|
|
// log debug
|
|
setStatus("getting content type");
|
|
// get the mime first
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (uint8_t *)mime;
|
|
// then get mime
|
|
m_contentType = mime->getContentType();
|
|
// but if they specify <!DOCTYPE html> in the document that overrides
|
|
// the content type in the mime! fixes planet.mozilla.org
|
|
char **pp = getContent();
|
|
if ( ! pp || pp == (void *)-1 ) return (uint8_t *)pp;
|
|
char *p = *pp;
|
|
// scan content for content type. returns 0 if none found.
|
|
char ctype2 = getContentTypeFromContent ( p , m_niceness );
|
|
// valid?
|
|
if ( ctype2 != 0 ) m_contentType = ctype2;
|
|
// it is valid now
|
|
m_contentTypeValid = true;
|
|
// give it to them
|
|
return &m_contentType;
|
|
}
|
|
|
|
|
|
// . similar to getMetaRedirUrl but look for different strings
|
|
// . rel="canonical" or rel=canonical in a link tag.
|
|
Url **XmlDoc::getCanonicalRedirUrl ( ) {
|
|
// return if we got it
|
|
if ( m_canonicalRedirUrlValid ) return &m_canonicalRedirUrlPtr;
|
|
|
|
//if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// assume none in doc
|
|
m_canonicalRedirUrlPtr = NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// disable for crawlbot, not good really for deduping
|
|
if ( cr->m_isCustomCrawl ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
if ( ! cr->m_useCanonicalRedirects ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
|
|
// are we site root page? don't follow canonical url then.
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (Url **)isRoot;
|
|
if ( *isRoot ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
// if this page has an inlink, then let it stand
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
|
|
if ( info1->getNumGoodInlinks() > 0 ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
// these canonical links only supported in xml/html i think
|
|
if ( *ct != CT_HTML && *ct != CT_XML ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Url **)xml;
|
|
|
|
// scan nodes looking for a <link> node. like getBaseUrl()
|
|
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
|
|
// breathe some
|
|
QUICKPOLL(m_niceness);
|
|
// only look at <link> tags
|
|
if ( xml->getNodeId ( i ) != TAG_LINK ) continue;
|
|
// get the href field of this base tag
|
|
int32_t linkLen;
|
|
char *link = (char *) xml->getString ( i, "href", &linkLen );
|
|
// skip if not valid
|
|
if ( ! link || linkLen == 0 ) continue;
|
|
// must also have rel=canonical
|
|
int32_t relLen;
|
|
char *rel = xml->getString(i,"rel",&relLen);
|
|
if ( ! rel ) continue;
|
|
// skip if does not match "canonical"
|
|
if ( strncasecmp(rel,"canonical",relLen) ) continue;
|
|
// allow for relative urls
|
|
Url *cu = getCurrentUrl();
|
|
// set base to it. addWWW=false
|
|
m_canonicalRedirUrl.set(cu,link,linkLen,false);//true
|
|
// assume it is not our url
|
|
bool isMe = false;
|
|
// if it is us, then skip!
|
|
if(strcmp(m_canonicalRedirUrl.getUrl(),m_firstUrl.getUrl())==0)
|
|
isMe = true;
|
|
// might also be our redir url i guess
|
|
if(strcmp(m_canonicalRedirUrl.getUrl(),m_redirUrl.getUrl())==0)
|
|
isMe = true;
|
|
// if it is us, keep it NULL, it's not a redirect. we are
|
|
// the canonical url.
|
|
if ( isMe ) break;
|
|
// ignore if in an expanded iframe (<gbframe>) tag
|
|
char *pstart = xml->m_xml;
|
|
char *p = link;
|
|
// scan backwards
|
|
if ( ! m_didExpansion ) p = pstart;
|
|
bool skip = false;
|
|
for ( ; p > pstart ; p-- ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( p[0] != '<' )
|
|
continue;
|
|
if ( p[1] == '/' &&
|
|
p[2] == 'g' &&
|
|
p[3] == 'b' &&
|
|
p[4] == 'f' &&
|
|
p[5] == 'r' &&
|
|
p[6] == 'a' &&
|
|
p[7] == 'm' &&
|
|
p[8] == 'e' &&
|
|
p[9] == '>' )
|
|
break;
|
|
if ( p[1] == 'g' &&
|
|
p[2] == 'b' &&
|
|
p[3] == 'f' &&
|
|
p[4] == 'r' &&
|
|
p[5] == 'a' &&
|
|
p[6] == 'm' &&
|
|
p[7] == 'e' &&
|
|
p[8] == '>' ) {
|
|
skip = true;
|
|
break;
|
|
}
|
|
}
|
|
if ( skip ) continue;
|
|
// otherwise, it is not us, we are NOT the canonical url
|
|
// and we should not be indexed, but just add the canonical
|
|
// url as a spiderrequest into spiderdb, just like
|
|
// simplified meta redirect does.
|
|
m_canonicalRedirUrlPtr = &m_canonicalRedirUrl;
|
|
break;
|
|
}
|
|
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
// returns false if none found
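// . "p" should point just past content= in something like
//   <meta http-equiv="refresh" content="0; URL=http://www.example.com/dst">
//   (example.com is just an illustrative url)
// . delays of 10 seconds or more are ignored; a url that starts with '?'
//   just replaces the query string of the current url "cu"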
|
|
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
|
|
Url *cu ) {
|
|
// limit scan
|
|
char *limit = p + 30;
|
|
// skip whitespace
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// must be a num
|
|
if ( ! is_digit(*p) ) return false;
|
|
// init delay
|
|
int32_t delay = atol ( p );
|
|
// ignore long delays
|
|
if ( delay >= 10 ) return false;
|
|
// now find the semicolon, if any
|
|
for ( ; *p && p < limit && *p != ';' ; p++ );
|
|
// must have semicolon
|
|
if ( *p != ';' ) return false;
|
|
// skip it
|
|
p++;
|
|
// skip whitespace some more
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// must have URL
|
|
if ( strncasecmp(p,"URL",3) ) return false;
|
|
// skip that
|
|
p += 3;
|
|
// skip white space
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// then an equal sign
|
|
if ( *p != '=' ) return false;
|
|
// skip equal sign
|
|
p++;
|
|
// them maybe more whitespace
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// an optional quote
|
|
if ( *p == '\"' ) p++;
|
|
// can also be a single quote!
|
|
if ( *p == '\'' ) p++;
|
|
// set the url start
|
|
char *url = p;
|
|
// now advance to next quote or space or >
|
|
for ( ; *p && !is_wspace_a(*p) &&
|
|
*p !='\'' &&
|
|
*p !='\"' &&
|
|
*p !='>' ;
|
|
p++);
|
|
// that is the end
|
|
char *urlEnd = p;
|
|
// get size
|
|
int32_t usize = urlEnd - url;
|
|
// skip if too big
|
|
if ( usize > 1024 ) {
|
|
log("build: meta redirurl of %" INT32 " bytes too big",usize);
|
|
return false;
|
|
}
|
|
// get our current url
|
|
//Url *cu = getCurrentUrl();
|
|
// decode what we got
|
|
char decoded[MAX_URL_LEN];
|
|
// convert & to "&"
|
|
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
|
|
decoded[decBytes]='\0';
|
|
// . then the url
|
|
// . set the url to the one in the redirect tag
|
|
// . but if the http-equiv meta redirect url starts with a '?'
|
|
// then just replace our cgi with that one
|
|
if ( *url == '?' ) {
|
|
char foob[MAX_URL_LEN*2];
|
|
char *pf = foob;
|
|
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
|
|
gbmemcpy(foob,cu->getUrl(),cuBytes);
|
|
pf += cuBytes;
|
|
gbmemcpy ( pf , decoded , decBytes );
|
|
pf += decBytes;
|
|
*pf = '\0';
|
|
metaRedirUrl->set(foob);
|
|
}
|
|
// . otherwise, append it right on
|
|
// . use "url" as the base Url
|
|
// . it may be the original url or the one we redirected to
|
|
// . redirUrl is set to the original at the top
|
|
else
|
|
// addWWW = false, stripSessId=true
|
|
metaRedirUrl->set(cu,decoded,decBytes,false,true);
|
|
return true;
|
|
}
|
|
|
|
|
|
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
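// . done in two passes: a fast raw string scan first, then, if that finds
//   one, the Xml class is set and only a genuine <meta> node is accepted so
//   that refresh tags inside document.write() script strings are not followed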
|
|
Url **XmlDoc::getMetaRedirUrl ( ) {
|
|
if ( m_metaRedirUrlValid ) return &m_metaRedirUrlPtr;
|
|
// get ptr to utf8 content
|
|
//char **u8 = getHttpReply();
|
|
//if ( ! u8 || u8 == (void *)-1 ) return (Url **)u8;
|
|
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
char *p = m_httpReply;
|
|
// subtract one since this is a size not a length
|
|
char *pend = p + m_httpReplySize - 1;//size_utf8Content;
|
|
|
|
// assume no meta refresh url
|
|
m_metaRedirUrlPtr = NULL;
|
|
// make it valid regardless i guess
|
|
m_metaRedirUrlValid = true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if we are recycling or injecting, do not consider meta redirects
|
|
if ( cr->m_recycleContent || m_recycleContent )
|
|
return &m_metaRedirUrlPtr;
|
|
|
|
// will this work in here?
|
|
//uint8_t *ct = getContentType();
|
|
//if ( ! ct ) return NULL;
|
|
|
|
Url *cu = getCurrentUrl();
|
|
|
|
bool gotOne = false;
|
|
|
|
// advance a bit, we are initially looking for the 'v' char
|
|
p += 10;
|
|
// begin the string matching loop
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// fix <!--[if lte IE 6]>
|
|
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
|
|
if ( *p == '!' &&
|
|
p[-1]=='<' &&
|
|
p[1] == '-' &&
|
|
p[2] == '-' ) {
|
|
// find end of comment
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( p[0] == '-' &&
|
|
p[1] == '-' &&
|
|
p[2] == '>' )
|
|
break;
|
|
}
|
|
// if found no end of comment, then stop
|
|
if ( p >= pend )
|
|
break;
|
|
// resume looking for meta redirect tags
|
|
continue;
|
|
}
|
|
// base everything off the equal sign
|
|
if ( *p != '=' ) continue;
|
|
// did we match "http-equiv="?
|
|
if ( to_lower_a(p[-1]) != 'v' ) continue;
|
|
if ( to_lower_a(p[-2]) != 'i' ) continue;
|
|
if ( to_lower_a(p[-3]) != 'u' ) continue;
|
|
if ( to_lower_a(p[-4]) != 'q' ) continue;
|
|
if ( to_lower_a(p[-5]) != 'e' ) continue;
|
|
if ( p[-6] != '-' ) continue;
|
|
if ( to_lower_a(p[-7]) != 'p' ) continue;
|
|
if ( to_lower_a(p[-8]) != 't' ) continue;
|
|
if ( to_lower_a(p[-9]) != 't' ) continue;
|
|
if ( to_lower_a(p[-10])!= 'h' ) continue;
|
|
// skip the equal sign
|
|
p++;
|
|
// skip quote if there
|
|
if ( *p == '\"' ) p++;
|
|
// must be "refresh", continue if not
|
|
if ( strncasecmp(p,"refresh",7) ) continue;
|
|
// skip that
|
|
p += 7;
|
|
// skip another quote if there
|
|
if ( *p == '\"' ) p++;
|
|
// limit the # of white spaces
|
|
char *limit = p + 20;
|
|
// skip white spaces
|
|
while ( *p && p < limit && is_wspace_a(*p) ) p++;
|
|
// must be content now
|
|
if ( strncasecmp(p,"content=",8) ) continue;
|
|
// skip that
|
|
p += 8;
|
|
// skip possible quote
|
|
if ( *p == '\"' ) p++;
|
|
// PARSE OUT THE URL
|
|
Url dummy;
|
|
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
|
|
continue;
|
|
gotOne = true;
|
|
break;
|
|
}
|
|
|
|
if ( ! gotOne )
|
|
return &m_metaRedirUrlPtr;
|
|
|
|
// to fix issue with scripts containing
|
|
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
|
|
// we have to get the Xml. we can't call getXml() because of
|
|
// recursion bugs so just do it directly here
|
|
|
|
Xml xml;
|
|
if ( ! xml.set ( m_httpReply ,
|
|
m_httpReplySize - 1, // make it a length
|
|
false , // ownData?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
false , // setParentsArg?
|
|
m_niceness ,
|
|
// assume html since getContentType() is recursive
|
|
// on us.
|
|
CT_HTML ) ) // *ct ) )
|
|
// return NULL on error with g_errno set
|
|
return NULL;
|
|
|
|
XmlNode *nodes = xml.getNodes();
|
|
int32_t n = xml.getNumNodes();
|
|
// find the first meta summary node
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != 68 ) continue;
|
|
// only get content for <meta http-equiv=..>
|
|
int32_t tagLen;
|
|
char *tag ;
|
|
tag = xml.getString ( i , "http-equiv" , &tagLen );
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// if not a refresh, skip it
|
|
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
|
|
// get the content
|
|
tag = xml.getString ( i ,"content", &tagLen );
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// PARSE OUT THE URL
|
|
if (!setMetaRedirUrlFromTag(tag,&m_metaRedirUrl,m_niceness,cu) )
|
|
continue;
|
|
// set it
|
|
m_metaRedirUrlPtr = &m_metaRedirUrl;
|
|
// return it
|
|
return &m_metaRedirUrlPtr;
|
|
}
|
|
|
|
// nothing found
|
|
return &m_metaRedirUrlPtr;
|
|
}
|
|
|
|
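// . getCharsetFast() picks the charset in this order: http mime charset,
//   unicode BOM, then a charset=/encoding= attribute inside a <meta>, <xml>
//   or <?xml tag found by raw string matching
// . a utf-8 claim is verified byte-by-byte and downgraded if it doesn't hold,
//   gb2312-family and euc-kr charsets are aliased so iconv understands them,
//   and anything still unknown defaults to utf-8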
uint16_t getCharsetFast ( HttpMime *mime,
|
|
char *url,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int32_t niceness ){
|
|
|
|
int16_t charset = csUnknown;
|
|
|
|
if ( slen < 0 ) slen = 0;
|
|
|
|
char *pstart = s;
|
|
char *pend = s + slen;
|
|
|
|
char *cs = mime->getCharset();
|
|
int32_t cslen = mime->getCharsetLen();
|
|
if ( cslen > 31 ) cslen = 31;
|
|
if ( cs && cslen > 0 ) {
|
|
char *p2 = cs + cslen ; char c = *p2; *p2 = '\0';
|
|
// get it
|
|
charset = get_iana_charset ( cs , gbstrlen(cs) );
|
|
// restore
|
|
*p2 = c;
|
|
}
|
|
|
|
// look for Unicode BOM first though
|
|
cs = ucDetectBOM ( pstart , pend - pstart );
|
|
if ( cs && charset == csUnknown ) {
|
|
log(LOG_DEBUG, "build: Unicode BOM signature detected: %s",cs);
|
|
int32_t len = gbstrlen(cs); if ( len > 31 ) len = 31;
|
|
charset = get_iana_charset ( cs , len );
|
|
}
|
|
|
|
// prepare to scan doc
|
|
char *p = pstart;
|
|
|
|
// if the doc claims it is utf-8 let's double check because
|
|
// newmexicomusic.org says its utf-8 in the mime header and it says
|
|
// it is another charset in a meta content tag, and it is NOT in
|
|
// utf-8, so don't trust that!
|
|
if ( charset == csUTF8 ) {
|
|
// loop over every char
|
|
for ( char *s = pstart ; s < pend ; s += getUtf8CharSize(s) ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// sanity check
|
|
if ( ! isFirstUtf8Char ( s ) ) {
|
|
// note it
|
|
log(LOG_DEBUG,
|
|
"build: mime says UTF8 but does not "
|
|
"seem to be for url %s",url);
|
|
// reset it back to unknown then
|
|
charset = csUnknown;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// do not scan the doc if we already got it set
|
|
if ( charset != csUnknown ) p = pend;
|
|
|
|
//
|
|
// it is inefficient to set xml just to get the charset.
|
|
// so let's put in some quick string matching for this!
|
|
//
|
|
|
|
// . how big is one char? usually this is 1 unless we are in utf16...
|
|
// . if we are in utf16 natively then this code needs to know that and
|
|
// set oneChar to 2! TODO!!
|
|
//char oneChar = 1;
|
|
// advance a bit, we are initially looking for the = sign
|
|
if ( p ) p += 10;
|
|
// begin the string matching loop
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// base everything off the equal sign
|
|
if ( *p != '=' ) continue;
|
|
// must have a 't' or 'g' before the equal sign
|
|
char c = to_lower_a(p[-1]);
|
|
// did we match "charset="?
|
|
if ( c == 't' ) {
|
|
if ( to_lower_a(p[-2]) != 'e' ) continue;
|
|
if ( to_lower_a(p[-3]) != 's' ) continue;
|
|
if ( to_lower_a(p[-4]) != 'r' ) continue;
|
|
if ( to_lower_a(p[-5]) != 'a' ) continue;
|
|
if ( to_lower_a(p[-6]) != 'h' ) continue;
|
|
if ( to_lower_a(p[-7]) != 'c' ) continue;
|
|
}
|
|
// did we match "encoding="?
|
|
else if ( c == 'g' ) {
|
|
if ( to_lower_a(p[-2]) != 'n' ) continue;
|
|
if ( to_lower_a(p[-3]) != 'i' ) continue;
|
|
if ( to_lower_a(p[-4]) != 'd' ) continue;
|
|
if ( to_lower_a(p[-5]) != 'o' ) continue;
|
|
if ( to_lower_a(p[-6]) != 'c' ) continue;
|
|
if ( to_lower_a(p[-7]) != 'n' ) continue;
|
|
if ( to_lower_a(p[-8]) != 'e' ) continue;
|
|
}
|
|
// if not either, go to next char
|
|
else
|
|
continue;
|
|
// . make sure a <xml or a <meta precedes us
|
|
// . do not look back more than 500 chars
|
|
char *limit = p - 500;
|
|
// assume charset= or encoding= did NOT occur in a tag
|
|
bool inTag = false;
|
|
// check crazy wrap if m_content was close to a NULL ptr...
|
|
if ( limit >= pend ) limit = pstart;
|
|
if ( limit < pstart ) limit = pstart;
|
|
for ( char *s = p ; s >= limit ; s -= 1 ) { // oneChar ) {
|
|
// break at > or <
|
|
if ( *s == '>' ) break;
|
|
if ( *s != '<' ) continue;
|
|
// . TODO: this could be in a quoted string too! fix!!
|
|
// . is it in a <meta> tag?
|
|
if ( to_lower_a(s[1]) == 'm' &&
|
|
to_lower_a(s[2]) == 'e' &&
|
|
to_lower_a(s[3]) == 't' &&
|
|
to_lower_a(s[4]) == 'a' ) {
|
|
inTag = true;
|
|
break;
|
|
}
|
|
// is it in an <xml> tag?
|
|
if ( to_lower_a(s[1]) == 'x' &&
|
|
to_lower_a(s[2]) == 'm' &&
|
|
to_lower_a(s[3]) == 'l' ) {
|
|
inTag = true;
|
|
break;
|
|
}
|
|
// is it in an <?xml> tag?
|
|
if ( to_lower_a(s[1]) == '?' &&
|
|
to_lower_a(s[2]) == 'x' &&
|
|
to_lower_a(s[3]) == 'm' &&
|
|
to_lower_a(s[4]) == 'l' ) {
|
|
inTag = true;
|
|
break;
|
|
}
|
|
}
|
|
// if not in a tag proper, it is useless
|
|
if ( ! inTag ) continue;
|
|
// skip over equal sign
|
|
p += 1;//oneChar;
|
|
// skip over ' or "
|
|
if ( *p == '\'' ) p += 1;//oneChar;
|
|
if ( *p == '\"' ) p += 1;//oneChar;
|
|
// keep start ptr
|
|
char *csString = p;
|
|
// set a limit
|
|
limit = p + 50;
|
|
if ( limit > pend ) limit = pend;
|
|
if ( limit < p ) limit = pend;
|
|
// stop at first special character
|
|
while ( p < limit &&
|
|
*p &&
|
|
*p !='\"' &&
|
|
*p !='\'' &&
|
|
! is_wspace_a(*p) &&
|
|
*p !='>' &&
|
|
*p != '<' &&
|
|
*p !='?' &&
|
|
*p !='/' &&
|
|
// fix yaya.pro-street.us which has
|
|
// charset=windows-1251;charset=windows-1"
|
|
*p !=';' &&
|
|
*p !='\\' )
|
|
p += 1;//oneChar;
|
|
// save it
|
|
char d = *p;
|
|
// do the actual NULL termination
|
|
*p = 0;
|
|
// get the character set
|
|
int16_t metaCs = get_iana_charset(csString, gbstrlen(csString));
|
|
// put it back
|
|
*p = d;
|
|
// update "charset" to "metaCs" if known, it overrides all
|
|
if (metaCs != csUnknown ) charset = metaCs;
|
|
// all done, only if we got a known char set though!
|
|
if ( charset != csUnknown ) break;
|
|
}
|
|
|
|
// alias these charsets so iconv understands
|
|
if ( charset == csISO58GB231280 ||
|
|
charset == csHZGB2312 ||
|
|
charset == csGB2312 )
|
|
charset = csGB18030;
|
|
|
|
if ( charset == csEUCKR )
|
|
charset = csKSC56011987; //x-windows-949
|
|
|
|
// use utf8 if still unknown
|
|
if ( charset == csUnknown ) {
|
|
if ( g_conf.m_logDebugSpider )
|
|
logf(LOG_DEBUG,"doc: forcing utf8 charset");
|
|
charset = csUTF8;
|
|
}
|
|
|
|
// once again, if the doc is claiming utf8 let's double check it!
|
|
if ( charset == csUTF8 ) {
|
|
// use this for iterating
|
|
char size;
|
|
// loop over every char
|
|
for ( char *s = pstart ; s < pend ; s += size ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// set
|
|
size = getUtf8CharSize(s);
|
|
// sanity check
|
|
if ( ! isFirstUtf8Char ( s ) ) {
|
|
// but let 0x80 slide? it is for the
|
|
// 0x80 0x99 apostrophe i've seen for
|
|
// eventvibe.com. it did have a first byte,
|
|
// 0xe2 that led that sequence but it was
|
|
// converted into â by something that
|
|
// thought it was a latin1 byte.
|
|
if ( s[0] == (char)0x80 &&
|
|
s[1] == (char)0x99 ) {
|
|
s += 2;
|
|
size = 0;
|
|
continue;
|
|
}
|
|
// note it
|
|
log(LOG_DEBUG,
|
|
"build: says UTF8 (2) but does not "
|
|
"seem to be for url %s"
|
|
" Resetting to ISOLatin1.",url);
|
|
// reset it to ISO then! that's pretty common
|
|
// no! was causing problems for
|
|
// eventvibe.com/...Yacht because it had
|
|
// some messed up utf8 in it but it really
|
|
// was utf8. CRAP, but really messes up
|
|
// sunsetpromotions.com and washingtonia
|
|
// if we do not have this here
|
|
charset = csISOLatin1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
|
|
//char *csName = get_charset_str(charset);
|
|
|
|
// breathe
|
|
//QUICKPOLL ( m_niceness );
|
|
|
|
// if we are not supported, set m_indexCode
|
|
//if ( csName && ! supportedCharset(charset) ) {
|
|
// log("build: xml: Unsupported charset: %s", csName);
|
|
// g_errno = EBADCHARSET;
|
|
// return NULL;
|
|
// //charset = csUnknown;
|
|
// // i guess do not risk it
|
|
// //m_indexCode = EBADCHARSET;
|
|
//}
|
|
|
|
// all done
|
|
return charset;
|
|
}
|
|
|
|
|
|
uint16_t *XmlDoc::getCharset ( ) {
|
|
if ( m_charsetValid ) return &m_charset;
|
|
|
|
// . get ptr to filtered content
|
|
// . we can't get utf8 content yet until we know what charset this
|
|
// junk is so we can convert it!
|
|
char **fc = getFilteredContent();
|
|
if ( ! fc || fc == (void *)-1 ) return (uint16_t *)fc;
|
|
|
|
// scan document for two things:
|
|
// 1. charset= (in a <meta> tag)
|
|
// 2. encoding= (in an <?xml> tag)
|
|
char *pstart = *fc;
|
|
//char *pend = *fc + m_filteredContentLen;
|
|
|
|
// assume known charset
|
|
m_charset = csUnknown;
|
|
// make it valid regardless i guess
|
|
m_charsetValid = true;
|
|
|
|
// check in http mime for charset
|
|
HttpMime *mime = getMime();
|
|
|
|
m_charset = getCharsetFast ( mime ,
|
|
m_firstUrl.getUrl(),
|
|
pstart ,
|
|
m_filteredContentLen,
|
|
m_niceness );
|
|
m_charsetValid = true;
|
|
return &m_charset;
|
|
}
|
|
|
|
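// . heuristic binary-content check on ptr_utf8Content: counts single-byte
//   chars that is_binary_a() flags and how many distinct such bytes occur
// . if the thresholds below trip, the utf8 content is blanked but the doc is
//   kept around for site: queries and inlink text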
char *XmlDoc::getIsBinary ( ) {
|
|
if ( m_isBinaryValid ) return &m_isBinary;
|
|
|
|
// get the content
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1 ) return (char *)u8;
|
|
|
|
//char *ctype = getContentType();
|
|
//if ( ! ctype || ctype == (void *)-1 ) return (char *)ctype;
|
|
//bool doBinaryCheck = false;
|
|
// the "abq-g" query gives a lot of binary content, use that
|
|
// as a testbed to make sure we filter it out!
|
|
//if ( *ctype == CT_TEXT ) doBinaryCheck = true;
|
|
//if ( *ctype == CT_UNKNOWN ) doBinaryCheck = true;
|
|
//if ( *ctype == CT_XML ) doBinaryCheck = true;
|
|
//if ( *ctype == CT_HTML ) doBinaryCheck = true;
|
|
//if ( csEnum == csUnknown ) doBinaryCheck = true;
|
|
//if ( csEnum == csASCII ) doBinaryCheck = true;
|
|
//if ( csEnum == csISOLatin1 ) doBinaryCheck = true;
|
|
//if ( slen <= 0 ) doBinaryCheck = false;
|
|
// why shouldn't we binary check everything? now that we are utf8...
|
|
//doBinaryCheck = true;
|
|
|
|
// assume not
|
|
m_isBinary = false;
|
|
m_isBinaryValid = true;
|
|
|
|
// if content is not identified as a type known to us, then check it
|
|
// for binary characters. yes, this can be utf8 or utf16 and then
|
|
// detected as binary i think, but it should really be identified as
|
|
// being html or txt or something...
|
|
//if ( ! doBinaryCheck ) return &m_isBinary;
|
|
|
|
// use a table
|
|
char table[256];
|
|
memset ( table , 0 , 256 );
|
|
// see if we had deceitful binary content
|
|
char *s = ptr_utf8Content;
|
|
char *send = s + size_utf8Content - 1;
|
|
// for now just count the binary chars
|
|
int32_t count = 0;
|
|
|
|
// no content?
|
|
if ( ! s ) return &m_isBinary;
|
|
|
|
for ( ; s < send ; s += getUtf8CharSize(s) ) {
|
|
// yield
|
|
QUICKPOLL(m_niceness);
|
|
// skip valid utf8 characters
|
|
if ( getUtf8CharSize(s) > 1 ) continue;
|
|
// . do not count \0's
|
|
// . the fctypes.cpp isBinary array takes into account
|
|
// that people mix windows 1254 characters into
|
|
// latin-1. windows 1254 is a superset of latin-1.
|
|
// so the more common quotes and dashes are no longer
|
|
// counted as binary characters, but some of the
|
|
// rarer ones are! however, the "diff" count
|
|
// constraint helps us make up for that.
|
|
// . the first char of a utf8 character sequence always has
|
|
// the high bit off, so just test that...
|
|
if ( ! is_binary_a(*s) || ! *s ) continue;
|
|
// count it up
|
|
count++;
|
|
table[(unsigned char)*s]++;
|
|
}
|
|
// how many DIFFERENT bin chars do we have?
|
|
int32_t diff = 0;
|
|
for ( int32_t i = 0 ; i < 256 ; i++ )
|
|
if ( table[i] ) diff++;
|
|
// . is binary if more than 10 binary chars and at least 5
// DIFFERENT binary chars
// . is binary if more than 6% of the chars are binary
|
|
if ( (count > 10 && diff>=5) || ( 100 * count ) / size_utf8Content>6) {
|
|
// note it for now
|
|
logf(LOG_DEBUG,"build: Got binary content for %s. "
|
|
"Zeroing out content. (diff=%" INT32 " count=%" INT32 " "
|
|
"len=%" INT32 ")",
|
|
m_firstUrl.getUrl(),diff,count,size_utf8Content-1);
|
|
// do not try to index binary content, but keep it
|
|
// around for site: queries or in case we have
|
|
// inlink text for it!
|
|
ptr_utf8Content = NULL;
|
|
size_utf8Content = 0;
|
|
m_isBinary = true;
|
|
}
|
|
return &m_isBinary;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// declare these two routines for using threads
|
|
static void filterDoneWrapper ( void *state , ThreadEntry *te ) ;
|
|
static void *filterStartWrapper_r ( void *state , ThreadEntry *te ) ;
|
|
|
|
// filters m_content if its pdf, word doc, etc.
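// . html/text/xml/js/json/arc/warc content passes through untouched
// . pdf/doc/xls/ppt/ps gets converted to text by an external tool
//   (pdftohtml, antiword, xlhtml, ppthtml, pstotext) run from a filter
//   thread, or synchronously if the thread cannot be spawned
// . anything else (images, css, ...) comes back as empty filtered content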
|
|
char **XmlDoc::getFilteredContent ( ) {
|
|
// return it if we got it already
|
|
if ( m_filteredContentValid ) return &m_filteredContent;
|
|
|
|
// this must be valid
|
|
char **content = getContent();
|
|
if ( ! content || content == (void *)-1 ) return content;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
// it needs this
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
|
|
|
|
// make sure NULL terminated always
|
|
// Why? pdfs can have nulls embedded
|
|
// if ( m_content &&
|
|
// m_contentValid &&
|
|
// m_content[m_contentLen] ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
|
|
int32_t max , max2;
|
|
CollectionRec *cr;
|
|
bool filterable = false;
|
|
|
|
if ( m_calledThread ) goto skip;
|
|
|
|
// assume we do not need filtering by default
|
|
m_filteredContent = m_content;
|
|
m_filteredContentLen = m_contentLen;
|
|
m_filteredContentValid = true;
|
|
m_filteredContentAllocSize = 0;
|
|
|
|
// empty content?
|
|
if ( ! m_content ) return &m_filteredContent;
|
|
|
|
if ( *ct == CT_HTML ) return &m_filteredContent;
|
|
if ( *ct == CT_TEXT ) return &m_filteredContent;
|
|
if ( *ct == CT_XML ) return &m_filteredContent;
|
|
// javascript - sometimes has address information in it, so keep it!
|
|
if ( *ct == CT_JS ) return &m_filteredContent;
|
|
if ( m_contentLen == 0 ) return &m_filteredContent;
|
|
|
|
// we now support JSON for diffbot
|
|
if ( *ct == CT_JSON ) return &m_filteredContent;
|
|
|
|
if ( *ct == CT_ARC ) return &m_filteredContent;
|
|
if ( *ct == CT_WARC ) return &m_filteredContent;
|
|
|
|
// unknown content types are 0 since it is probably binary... and
|
|
// we do not want to parse it!!
|
|
if ( *ct == CT_PDF ) filterable = true;
|
|
if ( *ct == CT_DOC ) filterable = true;
|
|
if ( *ct == CT_XLS ) filterable = true;
|
|
if ( *ct == CT_PPT ) filterable = true;
|
|
if ( *ct == CT_PS ) filterable = true;
|
|
|
|
// if its a jpeg, gif, text/css etc. bail now
|
|
if ( ! filterable ) {
|
|
m_filteredContent = NULL;
|
|
m_filteredContentLen = 0;
|
|
m_filteredContentValid = true;
|
|
return &m_filteredContent;
|
|
}
|
|
|
|
// invalidate
|
|
m_filteredContentValid = false;
|
|
|
|
cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . if we have no filter specified...
|
|
// . usually "gbfilter" and it is a script in the working directory
|
|
//if ( ! cr->m_filter[0] ) {
|
|
// m_indexCode = EDOCBADCONTENTTYPE;
|
|
// return &m_filteredContent;
|
|
//}
|
|
|
|
// if not text/html or text/plain, use the other max
|
|
//max = MAXDOCLEN; // cr->m_maxOtherDocLen;
|
|
max = cr->m_maxOtherDocLen;
|
|
// now we base this on the pre-filtered length to save memory because
|
|
// our maxOtherDocLen can be 30M and when we have a lot of injections
|
|
// at the same time we lose all our memory quickly
|
|
max2 = 5 * m_contentLen + 10*1024;
|
|
if ( max > max2 ) max = max2;
|
|
// user uses -1 to specify no maxTextDocLen or maxOtherDocLen
|
|
if ( max < 0 ) max = max2;
|
|
// make a buf to hold filtered reply
|
|
m_filteredContentAllocSize = max;
|
|
m_filteredContent = (char *)mmalloc(m_filteredContentAllocSize,"xdfc");
|
|
if ( ! m_filteredContent ) {
|
|
log("build: Could not allocate %" INT32 " bytes for call to "
|
|
"content filter.",m_filteredContentMaxSize);
|
|
return NULL;
|
|
}
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// reset this here in case thread gets killed by the kill() call below
|
|
m_filteredContentLen = 0;
|
|
// update status msg so its visible in the spider gui
|
|
setStatus ( "filtering content" );
|
|
// reset this... why?
|
|
g_errno = 0;
|
|
// . call thread to call popen
|
|
// . callThread returns true on success, in which case we block
|
|
// . do not repeat
|
|
m_calledThread = true;
|
|
// reset this since filterStart_r() will set it on error
|
|
m_errno = 0;
|
|
|
|
// how can this be? don't core like this in thread, because it
|
|
// does not save our files!!
|
|
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do it
|
|
if ( g_threads.call ( FILTER_THREAD ,
|
|
MAX_NICENESS ,
|
|
this ,
|
|
filterDoneWrapper ,
|
|
filterStartWrapper_r ) )
|
|
// return -1 if blocked
|
|
return (char **)-1;
|
|
// clear error!
|
|
g_errno = 0;
|
|
// note it
|
|
log("build: Could not spawn thread for call to "
|
|
"content filter.");
|
|
// get the data
|
|
filterStart_r ( false ); // am thread?
|
|
|
|
// skip down here if thread has returned and we got re-called
|
|
skip:
|
|
|
|
// if size is 0, free the buf
|
|
if ( m_filteredContentLen <= 0 ) {
|
|
mfree ( m_filteredContent ,
|
|
m_filteredContentAllocSize,"fcas");
|
|
m_filteredContent = NULL;
|
|
m_filteredContentLen = 0;
|
|
m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
// did we have an error from the thread?
|
|
if ( m_errno ) g_errno = m_errno;
|
|
// but bail out if it set g_errno
|
|
if ( g_errno ) return NULL;
|
|
// must be valid now - sanity check
|
|
if ( ! m_filteredContentValid ) { char *xx=NULL;*xx=0; }
|
|
// return it
|
|
return &m_filteredContent;
|
|
}
|
|
|
|
// come back here
|
|
void filterDoneWrapper ( void *state , ThreadEntry *te ) {
|
|
// jump back into the brawl
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
// if size is 0, free the buf. have to do this outside the thread
|
|
// since malloc/free cannot be called in thread
|
|
if ( THIS->m_filteredContentLen <= 0 ) {
|
|
mfree ( THIS->m_filteredContent ,
|
|
THIS->m_filteredContentAllocSize,"fcas");
|
|
THIS->m_filteredContent = NULL;
|
|
THIS->m_filteredContentLen = 0;
|
|
THIS->m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
// . call the master callback
|
|
// . it will ultimately re-call getFilteredContent()
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// thread starts here
|
|
void *filterStartWrapper_r ( void *state , ThreadEntry *te ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->filterStart_r ( true ); // am thread?
|
|
return NULL;
|
|
}
|
|
|
|
//int my_system_r ( char *cmd , int32_t timeout ) ;
|
|
|
|
// sets m_errno on error
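// . runs in a thread: writes m_content to <workdir>/in.<tid>, shells out to
//   the matching converter with ulimit/timeout guards, then reads the text
//   back from <workdir>/out.<tid> into the pre-allocated m_filteredContent
// . e.g. for a pdf the command looks roughly like
//   ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 \
//     <workdir>/pdftohtml -q -i -noframes -stdout <in> > <out>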
|
|
void XmlDoc::filterStart_r ( bool amThread ) {
|
|
// get thread id
|
|
pthread_t id = getpidtid();
|
|
// sanity check
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
// shortcut
|
|
int32_t ctype = m_contentType;
|
|
|
|
// assume none
|
|
m_filteredContentLen = 0;
|
|
|
|
//if ( amThread ) id = pthread_self();
|
|
//else id = getpid();
|
|
// pass the input to the program through this file
|
|
// rather than a pipe, since popen() seems broken
|
|
char in[1024];
|
|
snprintf(in,1023,"%sin.%" INT64 "", g_hostdb.m_dir , (int64_t)id );
|
|
unlink ( in );
|
|
// collect the output from the filter from this file
|
|
char out[1024];
|
|
snprintf ( out , 1023,"%sout.%" INT64 "", g_hostdb.m_dir, (int64_t)id );
|
|
unlink ( out );
|
|
// ignore errno from those unlinks
|
|
errno = 0;
|
|
// open the input file
|
|
retry11:
|
|
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry11;
|
|
m_errno = errno;
|
|
log("build: Could not open file %s for writing: %s.",
|
|
in,mstrerror(m_errno));
|
|
return;
|
|
}
|
|
// we are in a thread, this must be valid!
|
|
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0;}
|
|
|
|
retry12:
|
|
// write the content into the input file
|
|
int32_t w = write ( fd , m_content , m_contentLen );
|
|
// valgrind
|
|
if ( w < 0 && errno == EINTR ) goto retry12;
|
|
// did we get an error
|
|
if ( w != m_contentLen ) {
|
|
//int32_t w = fwrite ( m_buf , 1 , m_bufLen , pd );
|
|
//if ( w != m_bufLen ) {
|
|
m_errno = errno;
|
|
log("build: Error writing to %s: %s.",in,
|
|
mstrerror(m_errno));
|
|
close(fd);
|
|
return;
|
|
}
|
|
// close the file
|
|
close ( fd );
|
|
|
|
// shortcut
|
|
char *wdir = g_hostdb.m_dir;
|
|
|
|
// . open a pipe to pdf2html program
|
|
// . the output will go to stdout
|
|
char cmd[3072];
|
|
// different commands to filter different ctypes
|
|
// -i : ignore images
|
|
// -stdout: send output to stdout
|
|
// -c : generate complex document
|
|
// Google generates complex docs, but the large ones are horribly slow
|
|
// in the browser, but docs with 2 cols don't display right w/o -c.
|
|
// damn, -stdout doesn't work when -c is specified.
|
|
// These ulimit sizes are max virtual memory in kilobytes. let's
|
|
// keep them to 25 Megabytes
|
|
if ( ctype == CT_PDF )
|
|
snprintf(cmd,3071 ,"ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
|
|
else if ( ctype == CT_DOC )
|
|
// "wdir" include trailing '/'? not sure
|
|
snprintf(cmd,3071, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; timeout 30s nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
|
|
else if ( ctype == CT_XLS )
|
|
snprintf(cmd,3071, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
|
|
// this is too buggy for now... causes hanging threads because it
|
|
// hangs, so i added 'timeout 10s' but that only works on newer
|
|
// linux version, so it'll just error out otherwise.
|
|
else if ( ctype == CT_PPT )
|
|
snprintf(cmd,3071, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/ppthtml %s > %s" , wdir , in , out );
|
|
else if ( ctype == CT_PS )
|
|
snprintf(cmd,3071, "ulimit -v 25000 ; ulimit -t 30; timeout 10s nice -n 19 %s/pstotext %s > %s" , wdir , in , out );
|
|
else { char *xx=NULL;*xx=0; }
|
|
|
|
// breach sanity check
|
|
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// execute it
|
|
int retVal = gbsystem ( cmd );
|
|
if ( retVal == -1 )
|
|
log("gb: system(%s) : %s",
|
|
cmd,mstrerror(g_errno));
|
|
|
|
// all done with input file
|
|
// clean up the binary input file from disk
|
|
if ( unlink ( in ) != 0 ) {
|
|
// log error
|
|
log("gbfilter: unlink (%s): %s\n",in, strerror(errno));
|
|
// ignore it, since it was not a processing error per se
|
|
errno = 0;
|
|
}
|
|
|
|
// don't use too much memory, i think xhtml uses so much that it
|
|
// swaps out all the gb processes?
|
|
//struct rlimit lim;
|
|
//lim.rlim_cur = lim.rlim_max = 24 * 1024 * 1024 ;
|
|
//if ( setrlimit ( RLIMIT_AS , &lim ) )
|
|
// fprintf (stderr,"gbfilter:setrlimit: %s", strerror(errno) );
|
|
|
|
retry13:
|
|
fd = open ( out , O_RDONLY );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry13;
|
|
m_errno = errno;
|
|
log("gbfilter: Could not open file %s for reading: %s.",
|
|
out,mstrerror(m_errno));
|
|
return;
|
|
}
|
|
// sanity -- need room to store a \0
|
|
if ( m_filteredContentAllocSize < 2 ) { char *xx=NULL;*xx=0; }
|
|
// to read - leave room for \0
|
|
int32_t toRead = m_filteredContentAllocSize - 1;
|
|
retry14:
|
|
// read right from pipe descriptor
|
|
int32_t r = read (fd, m_filteredContent,toRead);
|
|
// note errors
|
|
if ( r < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry14;
|
|
log("gbfilter: reading output: %s",mstrerror(errno));
|
|
// this is often bad fd from an oom error, so ignore it
|
|
//m_errno = errno;
|
|
errno = 0;
|
|
r = 0;
|
|
}
|
|
// clean up shop
|
|
close ( fd );
|
|
// delete output file
|
|
unlink ( out );
|
|
|
|
// validate now
|
|
m_filteredContentValid = 1;
|
|
// save the new buf len
|
|
m_filteredContentLen = r;
|
|
// ensure enough room for null term
|
|
if ( r >= m_filteredContentAllocSize ) { char *xx=NULL;*xx=0; }
|
|
// ensure filtered stuff is NULL terminated so we can set the Xml class
|
|
m_filteredContent [ m_filteredContentLen ] = '\0';
|
|
// it is good
|
|
m_filteredContentValid = true;
|
|
|
|
// . at this point we got the filtered content
|
|
// . bitch if we didn't allocate enough space
|
|
if ( r > 0 && r == toRead )
|
|
log(LOG_LOGIC,"build: Had to truncate document to %" INT32 " bytes "
|
|
"because did not allocate enough space for filter. "
|
|
"This should never happen. It is a hack that should be "
|
|
"fixed right.", toRead );
|
|
|
|
// if we got something, then we're done
|
|
//if ( r > 0 ) return;
|
|
// otherwise, free it up
|
|
// . NO! not in a thread!!
|
|
//mfree ( m_filteredContent , m_filteredContentAllocSize, "fcas" );
|
|
//m_filteredContent = NULL;
|
|
//m_filteredContentLen = 0;
|
|
//m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
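// these globals are shared with Threads.cpp: g_pid is the pid of the
// external filter child (see the disabled my_system_r() below) so the
// sleep wrapper in Threads.cpp can kill(9,g_pid) it if it stalls for
// longer than g_filterTimeout; g_ticker is the stall counter that
// wrapper increments.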
pid_t g_pid = -1;
|
|
int32_t g_ticker = 0;
|
|
int32_t g_filterTimeout = -1;
|
|
|
|
/*
|
|
static int startUp ( void *cmd ) ;
|
|
#include <sys/types.h> // waitpid()
|
|
#include <sys/wait.h> // waitpid()
|
|
#include <sched.h> // clone()
|
|
|
|
static char cloneStack[250000];
|
|
|
|
int my_system_r ( char *cmd , int32_t timeout ) {
|
|
// bail if cmd empty
|
|
if ( ! cmd ) {
|
|
log(LOG_LOGIC,"build: empty command.");
|
|
return -1;
|
|
}
|
|
errno = 0;
|
|
// this gives out of memory on newer kernels, was that causing our
|
|
// older kernels to crash, too, in addition to the e1000 driver?
|
|
//pid_t pid = fork();
|
|
// let's use clone() instead now
|
|
// error forking?
|
|
pid_t pid = clone ( startUp ,
|
|
cloneStack + 250000 ,
|
|
CLONE_FS | CLONE_FILES | CLONE_VM | SIGCHLD ,
|
|
cmd );
|
|
if (pid == -1) {
|
|
log("build: fork: %s.",mstrerror(errno));
|
|
return -1;
|
|
}
|
|
// sanity check
|
|
if ( g_pid != -1 ) { char *xx = NULL; *xx = 0; }
|
|
// set the process group id of this guy to itself, so he becomes
|
|
// the process leader, so any processes he spawns should all receive
|
|
// the same HUP or kill signals he receives. uhhhh probably not...
|
|
//setpgid ( pid , pid );
|
|
// save the pid globally so Threads.cpp can kill(9,g_pid) it if it
|
|
// stalls too long. but to measure how long it is out for, keep a
|
|
// ticker count. this ticker count is incremented in the sleep wrapper
|
|
// in Threads.cpp.
|
|
g_ticker = 0;
|
|
g_pid = pid;
|
|
g_filterTimeout = timeout;
|
|
loop:
|
|
int status;
|
|
if ( waitpid ( pid , &status , 0 ) == -1 ) {
|
|
// reset g_pid so Threads.cpp's kill wrapper chills out
|
|
if ( errno != EINTR ) {
|
|
log("build: waitpid pid=%" INT32 ": %s.",
|
|
(int32_t)g_pid,mstrerror(errno));
|
|
g_pid = -1;
|
|
return -1;
|
|
}
|
|
// if we got interrupted by a different signal keep waiting
|
|
goto loop;
|
|
}
|
|
// reset g_pid so Threads.cpp's kill wrapper chills out
|
|
g_pid = -1;
|
|
if ( status < 0 ) log("build: Got bad status from child.");
|
|
// we got the signal
|
|
return status;
|
|
}
|
|
|
|
int startUp ( void *cmd ) {
|
|
char *argv[4];
|
|
argv[0] = "sh";
|
|
argv[1] = "-c";
|
|
argv[2] = (char *)cmd;
|
|
argv[3] = 0;
|
|
char *envp[2];
|
|
char buf[1024];
|
|
// antiword needs this environment var so it can find
|
|
// the .antiword/ dir , we should put it in gb's working dir
|
|
snprintf(buf,1023,"HOME=%s", g_hostdb.m_dir );
|
|
envp[0] = buf;
|
|
envp[1] = 0;
|
|
execve("/bin/sh", argv, envp );
|
|
//exit(127);
|
|
return 1;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
|
|
// return downloaded content as utf8
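// . the rough flow here: detect the charset of the filtered content,
//   bail out with an empty doc if the charset is unsupported, convert
//   to utf8 with ucToUtf8() (iconv) into a malloc'd buffer roughly
//   twice the filtered size, then scrub embedded \0's and verify the
//   utf8 byte sequences so later parsing can't overrun the buffer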
|
|
char **XmlDoc::getRawUtf8Content ( ) {
|
|
// if we already computed it, return that
|
|
if ( m_rawUtf8ContentValid ) return &m_rawUtf8Content;
|
|
|
|
// . get our characterset
|
|
// . crap! this can be recursive. it calls getXml() which calls
|
|
// getUtf8Content() which is us!
|
|
uint16_t *charset = getCharset ( );
|
|
if ( ! charset || charset == (uint16_t *)-1 ) return (char **)charset;
|
|
|
|
char *csName = get_charset_str(*charset);
|
|
|
|
// . if not supported fix that!
|
|
// . m_indexCode should be set to EBADCHARSET ultimately, but not here
|
|
if ( ! supportedCharset(*charset) && csName ) {
|
|
m_rawUtf8Content = NULL;
|
|
m_rawUtf8ContentSize = 0;
|
|
m_rawUtf8ContentAllocSize = 0;
|
|
m_rawUtf8ContentValid = true;
|
|
return &m_rawUtf8Content;
|
|
}
|
|
|
|
// get ptr to filtered content
|
|
char **fc = getFilteredContent();
|
|
if ( ! fc || fc == (void *)-1 ) return (char **)fc;
|
|
|
|
// make sure NULL terminated always
|
|
if ( m_filteredContent &&
|
|
m_filteredContentValid &&
|
|
m_filteredContent[m_filteredContentLen] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// NULL out if no content
|
|
if ( ! m_filteredContent ) {
|
|
m_rawUtf8Content = NULL;
|
|
m_rawUtf8ContentSize = 0;
|
|
m_rawUtf8ContentAllocSize = 0;
|
|
m_rawUtf8ContentValid = true;
|
|
return &m_rawUtf8Content;
|
|
}
|
|
|
|
// assume already utf8
|
|
m_rawUtf8Content = m_filteredContent;
|
|
m_rawUtf8ContentSize = m_filteredContentLen + 1;
|
|
m_rawUtf8ContentAllocSize = 0;
|
|
|
|
// if we are not ascii or utf8 already, encode it into utf8
|
|
if ( m_rawUtf8ContentSize > 1 &&
|
|
csName &&
|
|
*charset != csASCII &&
|
|
*charset != csUTF8 ) {
|
|
// ok, no-go
|
|
//ptr_utf8Content = NULL;
|
|
m_rawUtf8Content = NULL;
|
|
// assume utf8 will be twice the size ... then add a little
|
|
int32_t need = (m_filteredContentLen * 2) + 4096;
|
|
char *buf = (char *) mmalloc(need, "Xml3");
|
|
// log oom error
|
|
if ( ! buf ) {
|
|
log("build: xml: not enough memory for utf8 buffer");
|
|
return NULL;
|
|
}
|
|
// sanity check
|
|
if ( ! csName ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
setStatus ( "converting doc to utf8" );
|
|
// returns # of bytes i guess
|
|
int32_t used = ucToUtf8 ( buf ,
|
|
// fix core dump by subtracting 10!
|
|
need - 10,
|
|
m_filteredContent ,
|
|
m_filteredContentLen ,
|
|
csName ,
|
|
-1 ,//allowBadChars
|
|
m_niceness );
|
|
// clear this if successful, otherwise, it sets errno
|
|
if ( used > 0 ) g_errno = 0;
|
|
// unrecoverable error? bad charset is g_errno == 7
|
|
// which is like argument list too long or something
|
|
// error from Unicode.cpp's call to iconv()
|
|
if ( g_errno )
|
|
log(LOG_INFO, "build: xml: failed parsing buffer: %s "
|
|
"(cs=%d)", mstrerror(g_errno), *charset);
|
|
if ( g_errno && g_errno != 7 ) {
|
|
mfree ( buf, need, "Xml3");
|
|
// do not index this doc, delete from spiderdb/tfndb
|
|
//if ( g_errno != ENOMEM ) m_indexCode = g_errno;
|
|
// if conversion failed NOT because of bad charset
|
|
// then return NULL now and bail out. probably ENOMEM
|
|
return NULL;
|
|
}
|
|
// if bad charset... just make doc empty as a utf8 doc
|
|
if ( g_errno == 7 ) {
|
|
used = 0;
|
|
buf[0] = '\0';
|
|
buf[1] = '\0';
|
|
// clear g_errno
|
|
g_errno = 0;
|
|
// and make a note for getIndexCode() so it will not
|
|
// bother indexing the doc! nah, just index it
|
|
// but with no content...
|
|
}
|
|
// crazy? this is pretty important...
|
|
if ( used + 10 >= need )
|
|
log("build: utf8 using too much buf space!!! u=%s",
|
|
getFirstUrl()->getUrl());
|
|
// re-assign
|
|
//ptr_utf8Content = buf;
|
|
//size_utf8Content = used + 1;
|
|
//m_utf8ContentAllocSize = need;
|
|
m_rawUtf8Content = buf;
|
|
m_rawUtf8ContentSize = used + 1;
|
|
m_rawUtf8ContentAllocSize = need;
|
|
}
|
|
|
|
// convert \0's to spaces. why do we see these in some pages?
|
|
// http://www.golflink.com/golf-courses/ has one in the middle after
|
|
// about 32k of content.
|
|
char *p = m_rawUtf8Content;
|
|
char *pend = p + m_rawUtf8ContentSize - 1;
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ! *p ) *p = ' ';
|
|
}
|
|
|
|
|
|
//
|
|
// VALIDATE the UTF-8
|
|
//
|
|
|
|
// . make a buffer to hold the decoded content now
|
|
// . we were just using the m_expandedUtf8Content buf itself, but "n"
|
|
// ended up equalling m_expandedUtf8ContentSize one time for a
|
|
// doc, http://ediso.net/, which probably had corrupt utf8 in it,
|
|
// and that breached our buffer! so verify that this is good
|
|
// utf8, and that we can parse it without breaching our buffer!
|
|
p = m_rawUtf8Content;
|
|
// make sure NULL terminated always
|
|
if ( p[m_rawUtf8ContentSize-1]) { char *xx=NULL;*xx=0;}
|
|
// make sure we don't breach the buffer when parsing it
|
|
char size;
|
|
char *lastp = NULL;
|
|
for ( ; ; p += size ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( p >= pend ) break;
|
|
lastp = p;
|
|
size = getUtf8CharSize(p);
|
|
}
|
|
// overflow?
|
|
if ( p > pend && lastp ) {
|
|
// back up to the bad utf8 char that made us overshoot
|
|
p = lastp;
|
|
// space it out
|
|
for ( ; p < pend ; p++ ) *p = ' ';
|
|
// log it maybe due to us not being keep alive http server?
|
|
log("doc: fix bad utf8 overflow (because we are not "
|
|
"keepalive?) in doc %s",m_firstUrl.m_url);
|
|
}
|
|
// overflow?
|
|
if ( p != pend ) { char *xx=NULL;*xx=0; }
|
|
// sanity check for breach. or underrun in case we encountered a
|
|
// premature \0
|
|
if (p-m_rawUtf8Content!=m_rawUtf8ContentSize-1) {char*xx=NULL;*xx=0;}
|
|
|
|
// sanity -- must be \0 terminated
|
|
if ( m_rawUtf8Content[m_rawUtf8ContentSize-1] ) {char *xx=NULL;*xx=0; }
|
|
|
|
// it might have shrunk us
|
|
//m_rawUtf8ContentSize = n + 1;
|
|
// we are good to go
|
|
m_rawUtf8ContentValid = true;
|
|
|
|
//return &ptr_utf8Content;
|
|
return &m_rawUtf8Content;
|
|
}
|
|
|
|
// this is so Msg13.cpp can call getExpandedUtf8Content() to do its
|
|
// iframe expansion logic
|
|
void getExpandedUtf8ContentWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
char **retVal = THIS->getExpandedUtf8Content();
|
|
// return if blocked again
|
|
if ( retVal == (void *)-1 ) return;
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// now if there are any <iframe> tags let's substitute them for
|
|
// the html source they represent here. that way we will get all the
|
|
// information you see on the page. this is somewhat critical since
|
|
// a lot of pages have their content in the frame.
|
|
char **XmlDoc::getExpandedUtf8Content ( ) {
|
|
// if we already computed it, return that
|
|
if ( m_expandedUtf8ContentValid ) return &m_expandedUtf8Content;
|
|
|
|
// if called from spider compression proxy we need to set
|
|
// masterLoop here now
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getExpandedUtf8ContentWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// get the unexpanded content first
|
|
char **up = getRawUtf8Content ();
|
|
if ( ! up || up == (void *)-1 ) return up;
|
|
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
|
|
|
|
// NULL out if no content
|
|
if ( ! *up ) {
|
|
m_expandedUtf8Content = NULL;
|
|
m_expandedUtf8ContentSize = 0;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
// do not do iframe expansion in order to keep injections fast
|
|
if ( m_wasContentInjected ) {
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
bool skip = m_skipIframeExpansion;
|
|
|
|
// if we are a warc, arc or doc that consists of a sequence of
|
|
// sub-docs that we are indexing/injecting then skip iframe expansion
|
|
if ( isContainerDoc() )
|
|
skip = true;
|
|
|
|
// or if this is set to true
|
|
if ( skip ) {
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
|
|
|
|
// if we have a json reply, leave it alone... do not expand iframes
|
|
// in json, it will mess up the json
|
|
if ( *ct == CT_JSON ) {
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
// we need this so getExtraDoc does not core
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
|
|
|
|
// point to it
|
|
char *p = *up;
|
|
char *pend = *up + m_rawUtf8ContentSize; // includes \0
|
|
// declare crap up here so we can jump into the for loop
|
|
int32_t urlLen;
|
|
char *url;
|
|
char *fend;
|
|
Url furl;
|
|
XmlDoc **ped;
|
|
XmlDoc *ed;
|
|
bool inScript = false;
|
|
bool match;
|
|
// assign saved value if we got that
|
|
if ( m_savedp ) {
|
|
// restore "p"
|
|
p = m_savedp;
|
|
// update this
|
|
ed = m_extraDoc;
|
|
// and see if we got the mime now
|
|
goto gotMime;
|
|
}
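// . the loop below scans for <frame>/<iframe> tags that are not inside
//   a <script> section, resolves their src against the current url,
//   downloads each one as an "extra doc" (5 second max cache age) and
//   splices the child's utf8 between <gbframe>...</gbframe> markers in
//   m_esbuf, so the expanded buffer looks roughly like:
//   ...<iframe src="http://x/y/"><gbframe>...child utf8...</gbframe>...
//   (javascript children additionally get wrapped in <script> tags)
// . we give up after 5 expansions (m_numExpansions)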
|
|
// now loop for frame and iframe tags
|
|
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// if never found a frame tag, just keep on chugging
|
|
if ( *p != '<' ) continue;
|
|
// <script>?
|
|
if ( to_lower_a(p[1]) == 's' &&
|
|
to_lower_a(p[2]) == 'c' &&
|
|
to_lower_a(p[3]) == 'r' &&
|
|
to_lower_a(p[4]) == 'i' &&
|
|
to_lower_a(p[5]) == 'p' &&
|
|
to_lower_a(p[6]) == 't' )
|
|
inScript = 1;
|
|
// </script>?
|
|
if ( p[1]=='/' &&
|
|
to_lower_a(p[2]) == 's' &&
|
|
to_lower_a(p[3]) == 'c' &&
|
|
to_lower_a(p[4]) == 'r' &&
|
|
to_lower_a(p[5]) == 'i' &&
|
|
to_lower_a(p[6]) == 'p' &&
|
|
to_lower_a(p[7]) == 't' )
|
|
inScript = 0;
|
|
// . skip if in script
|
|
// . fixes guysndollsllc.com which has an iframe tag in
|
|
// a script section, "document.write ('<iframe..."
|
|
if ( inScript ) continue;
|
|
// iframe or frame?
|
|
match = false;
|
|
if ( to_lower_a(p[1]) == 'f' &&
|
|
to_lower_a(p[2]) == 'r' &&
|
|
to_lower_a(p[3]) == 'a' &&
|
|
to_lower_a(p[4]) == 'm' &&
|
|
to_lower_a(p[5]) == 'e' )
|
|
match = true;
|
|
if ( to_lower_a(p[1]) == 'i' &&
|
|
to_lower_a(p[2]) == 'f' &&
|
|
to_lower_a(p[3]) == 'r' &&
|
|
to_lower_a(p[4]) == 'a' &&
|
|
to_lower_a(p[5]) == 'm' &&
|
|
to_lower_a(p[6]) == 'e' )
|
|
match = true;
|
|
// skip tag if not iframe or frame
|
|
if ( ! match ) continue;
|
|
// check for frame or iframe
|
|
//if ( strncasecmp(p+1,"frame " , 6) &&
|
|
// strncasecmp(p+1,"iframe ", 7) )
|
|
// continue;
|
|
// get src tag (function in Words.h)
|
|
url = getFieldValue ( p , pend - p ,"src" , &urlLen );
|
|
// needs a src field
|
|
if ( ! url ) continue;
|
|
// "" is not acceptable either. techcrunch.com has
|
|
// <iframe src=""> which ends up embedding the root url.
|
|
if ( urlLen == 0 )
|
|
continue;
|
|
// skip if "about:blank"
|
|
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
|
|
continue;
|
|
// get our current url
|
|
//cu = getCurrentUrl();
|
|
// set our frame url
|
|
furl.set ( cu , url , urlLen );
|
|
// no recursion
|
|
if ( strcmp(furl.getUrl(),m_firstUrl.getUrl()) == 0 )
|
|
continue;
|
|
// must be http or https, not ftp! ftp was causing us to
|
|
// core in Msg22.cpp where it checks the url's protocol
|
|
// when trying to lookup the old title rec.
|
|
// http://sweetaub.ipower.com/ had an iframe with a ftp url.
|
|
if ( ! furl.isHttp() && ! furl.isHttps() ) continue;
|
|
// ignore google.com/ assholes for now
|
|
if ( strstr(furl.getUrl(),"google.com/" ) ) continue;
|
|
// and bing just to be safe
|
|
if ( strstr(furl.getUrl(),"bing.com/" ) ) continue;
|
|
// save it in case we have to return and come back later
|
|
m_savedp = p;
|
|
// break here
|
|
//log("mdw: breakpoing here");
|
|
// . download that. get as a doc. use 0 for max cache time
|
|
// . no, use 5 seconds since we often have the same iframe
|
|
// in the root doc that we have in the main doc, like a
|
|
// facebook iframe or something.
|
|
// . use a m_maxCacheAge of 5 seconds now!
|
|
ped = getExtraDoc ( furl.m_url , 5 );
|
|
// should never block
|
|
if ( ! ped ) {
|
|
log("xmldoc: getExpandedutf8content = %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
// . return -1 if it blocked???
|
|
// . no, this is not supported right now
|
|
// . it will mess up our for loop
|
|
if ( ped == (void *)-1 ) {char *xx=NULL;*xx=0;}
|
|
// cast it
|
|
ed = *ped;
|
|
// sanity
|
|
if ( ! ed ) { char *xx=NULL;*xx=0; }
|
|
// jump in here from above
|
|
gotMime:
|
|
// make it not use the ips.txt cache
|
|
//ed->m_useIpsTxtFile = false;
|
|
//ed->m_readFromTestCache = false;
|
|
// get the mime
|
|
HttpMime *mime = ed->getMime();
|
|
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
|
|
// if not success, do not expand it i guess...
|
|
if ( mime->getHttpStatus() != 200 ) {
|
|
// free it
|
|
nukeDoc ( ed );
|
|
// and continue
|
|
continue;
|
|
}
|
|
// update m_downloadEndTime if we should
|
|
if ( ed->m_downloadEndTimeValid ) {
|
|
// we must already be valid
|
|
if ( ! m_downloadEndTimeValid ) {char *xx=NULL;*xx=0;}
|
|
// only replace it if it had ip and robots.txt allowed
|
|
if ( ed->m_downloadEndTime )
|
|
m_downloadEndTime = ed->m_downloadEndTime;
|
|
}
|
|
|
|
// re-write that extra doc into the content
|
|
char **puc = ed->getRawUtf8Content();
|
|
// this should not block
|
|
//if ( puc == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// it blocked before! because the charset was not known!
|
|
if ( puc == (void *)-1 ) return (char **)puc;
|
|
// error?
|
|
if ( ! puc ) return (char **)puc;
|
|
// cast it
|
|
char *uc = *puc;
|
|
// or if no content, and no mime (like if robots.txt disallows)
|
|
if ( ! uc || ed->m_rawUtf8ContentSize == 1 ) {
|
|
// free it
|
|
nukeDoc ( ed );
|
|
// and continue
|
|
continue;
|
|
}
|
|
// size includes terminating \0
|
|
if ( uc[ed->m_rawUtf8ContentSize-1] ) { char *xx=NULL;*xx=0;}
|
|
|
|
// if first time we are expanding, set this
|
|
if ( ! m_oldp ) m_oldp = *up;
|
|
|
|
// find end of frame tag
|
|
fend = p;
|
|
for ( ; fend < pend ; fend += getUtf8CharSize(fend) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// if never found a frame tag, just keep on chugging
|
|
if ( *fend == '>' ) break;
|
|
}
|
|
// if no end to the iframe tag was found, bail then...
|
|
if ( fend >= pend ) continue;
|
|
// skip the >
|
|
fend++;
|
|
|
|
// insert the non-frame crap first AND the frame/iframe tag
|
|
m_esbuf.safeMemcpy ( m_oldp , fend - m_oldp );
|
|
// end the frame
|
|
//m_esbuf.safeMemcpy ( "</iframe>", 9 );
|
|
// use our own special tag so Sections.cpp can set
|
|
// Section::m_gbFrameNum which it uses internally
|
|
m_esbuf.safePrintf("<gbframe>"); // gbiframe
|
|
// identify javascript
|
|
bool javascript = false;
|
|
if ( *ed->getContentType() == CT_JS ) javascript = true;
|
|
// so we do not mine javascript for cities and states etc.
|
|
// in Address.cpp
|
|
if ( javascript ) m_esbuf.safePrintf("<script>");
|
|
// store that
|
|
m_esbuf.safeMemcpy ( uc , ed->m_rawUtf8ContentSize - 1 );
|
|
// our special tag has an end tag as well
|
|
if ( javascript ) m_esbuf.safePrintf("</script>");
|
|
m_esbuf.safePrintf("</gbframe>");
|
|
// free up ed
|
|
nukeDoc ( ed );
|
|
|
|
// end of frame tag, skip over whole thing
|
|
m_oldp = fend ;
|
|
// sanity check
|
|
if ( m_oldp > pend ) { char *xx=NULL;*xx=0; }
|
|
// another flag
|
|
m_didExpansion = true;
|
|
// count how many we did
|
|
if ( ++m_numExpansions >= 5 ) break;
|
|
}
|
|
// default
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
// point to expansion buffer if we did any expanding
|
|
if ( m_didExpansion ) {
|
|
// copy over the rest
|
|
m_esbuf.safeMemcpy ( m_oldp , pend - m_oldp );
|
|
// null term it
|
|
m_esbuf.pushChar('\0');
|
|
// and point to that buffer
|
|
m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf;
|
|
// include the \0 as part of the size
|
|
m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
|
|
}
|
|
// sanity -- must be \0 terminated
|
|
if ( m_expandedUtf8Content[m_expandedUtf8ContentSize-1] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
static SafeBuf s_cookieBuf;
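// holds the archive.org cookie value, e.g.
// "logged-in-user=...; logged-in-sig=...", loaded once from
// auth/internetarchive.yml by getUtf8ContentInFile() below and passed
// to wget via a Cookie: header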
|
|
|
|
|
|
|
|
|
|
|
|
void *systemStartWrapper_r ( void *state , ThreadEntry *t ) {
|
|
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
char filename[2048];
|
|
snprintf(filename,2048,"%sgbarchivefile%" UINT32 ".gz",
|
|
g_hostdb.m_dir,
|
|
(int32_t)(int64_t)THIS);
|
|
|
|
char cmd[MAX_URL_LEN+256];
|
|
snprintf( cmd,
|
|
MAX_URL_LEN+256,
|
|
"wget -q --header=\"Cookie: %s\" \"%s\" -O %s" ,
|
|
s_cookieBuf.getBufStart() ,
|
|
THIS->m_firstUrl.getUrl() ,
|
|
filename );
|
|
|
|
log("build: wget: %s",cmd );
|
|
|
|
int ret;
|
|
|
|
ret = system(cmd);
|
|
if ( ret == -1 )
|
|
log("build: wget system failed: %s",mstrerror(errno));
|
|
else
|
|
log("build: wget system returned %" INT32 "",ret);
|
|
|
|
// unzip it now
|
|
snprintf ( cmd , MAX_URL_LEN+256, "gunzip -f %s" , filename );
|
|
|
|
log("build: wget begin: %s",cmd );
|
|
|
|
ret = system(cmd);
|
|
if ( ret == -1 )
|
|
log("build: gunzip system failed: %s",mstrerror(errno));
|
|
else
|
|
log("build: gunzip system returned %" INT32 "",ret);
|
|
|
|
|
|
log("build: done with gunzip");
|
|
|
|
return NULL;
|
|
}
|
|
|
|
// come back here
|
|
void systemDoneWrapper ( void *state , ThreadEntry *t ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// we download large files to a file on disk, like warcs and arcs
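// . rather than buffering the whole (possibly huge, gzipped) file in
//   memory we popen() a shell pipeline, make its read fd non-blocking
//   and register doneReadingArchiveFileWrapper() with g_loop so we get
//   called back as data streams in
// . the pipeline looks roughly like (when the archive.org cookie is
//   loaded):
//   set -o pipefail|wget --limit-rate=10M -O- --header="Cookie: <cookie>" "<url>"|zcat|mbuffer -t -m 10M -o-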
|
|
FILE *XmlDoc::getUtf8ContentInFile () {
|
|
|
|
setStatus ("wgetting archive file");
|
|
|
|
// if ( m_calledWgetThread ) {
|
|
|
|
// char filename[2048];
|
|
// snprintf ( filename,
|
|
// 2048,
|
|
// "gbarchivefile%" UINT32 "",
|
|
// (int32_t)(int64_t)this);
|
|
|
|
// m_file.set ( g_hostdb.m_dir , filename );
|
|
// m_fileSize = m_file.getFileSize();
|
|
// m_fileValid = true;
|
|
// *fileSizeArg = m_fileSize;
|
|
// m_file.open(O_RDONLY);
|
|
// // explicitly set it to false now to make it harder for
|
|
// // it not to be true because that messes things up
|
|
// m_file.m_usePartFiles = false;
|
|
// return &m_file;
|
|
// }
|
|
|
|
// before calling the system wget thread we gotta set the cookiebuf
|
|
// HACK: for archive.org
|
|
// if getting a page from archive.org then append the cookie
|
|
// so we have the proper permissions
|
|
static bool s_triedToLoadCookie = false;
|
|
char *x = m_firstUrl.getUrl();
|
|
// only go out about 25 chars looking for the start of .archive.org/
|
|
char *xend = x + 25;
|
|
bool isArchiveOrg = false;
|
|
for ( ; x < xend && *x ; x++ ) {
|
|
if ( x[ 0] != '.' && x[0] != '/' ) continue; // /archive.org?
|
|
if ( x[ 1] != 'a' ) continue;
|
|
if ( x[ 2] != 'r' ) continue;
|
|
if ( x[ 3] != 'c' ) continue;
|
|
if ( x[ 4] != 'h' ) continue;
|
|
if ( x[ 5] != 'i' ) continue;
|
|
if ( x[ 6] != 'v' ) continue;
|
|
if ( x[ 7] != 'e' ) continue;
|
|
if ( x[ 8] != '.' ) continue;
|
|
if ( x[ 9] != 'o' ) continue;
|
|
if ( x[10] != 'r' ) continue;
|
|
if ( x[11] != 'g' ) continue;
|
|
if ( x[12] != '/' ) continue;
|
|
isArchiveOrg = true;
|
|
break;
|
|
}
|
|
|
|
if ( isArchiveOrg && ! s_triedToLoadCookie ) {
|
|
// try to load it up if haven't tried yet
|
|
s_triedToLoadCookie = true;
|
|
SafeBuf tmp;
|
|
//int32_t loaded = tmp.load ( "/home/mwells/.config/internetarchive.yml");
|
|
int32_t loaded = tmp.load ( "auth/internetarchive.yml");
|
|
if(loaded <= 0) {
|
|
if ( ! g_errno ) g_errno = EDOCTOOBIG;
|
|
log("gb: failed to load auth/internetarchive.yml: "
|
|
"%s",mstrerror(g_errno));
|
|
// do not restart gb in a loop, so return 0 to shell
|
|
exit(0);
|
|
//return NULL;
|
|
// FIXME
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
char *s = tmp.getBufStart();
|
|
char *line;
|
|
char *lineEnd;
|
|
line = strstr ( s , "logged-in-user: " );
|
|
if ( line ) lineEnd = strstr(line,"\n");
|
|
if ( lineEnd ) {
|
|
s_cookieBuf.safePrintf("logged-in-user=");
|
|
line += 16;
|
|
s_cookieBuf.safeMemcpy(line,lineEnd-line);
|
|
s_cookieBuf.pushChar(';');
|
|
s_cookieBuf.pushChar(' ');
|
|
s_cookieBuf.nullTerm();
|
|
}
|
|
line = strstr ( s , "logged-in-sig: " );
|
|
if ( line ) lineEnd = strstr(line,"\n");
|
|
if ( lineEnd ) {
|
|
s_cookieBuf.safePrintf("logged-in-sig=");
|
|
line += 15;
|
|
s_cookieBuf.safeMemcpy(line,lineEnd-line);
|
|
//s_cookieBuf.pushChar(';');
|
|
//s_cookieBuf.pushChar(' ');
|
|
s_cookieBuf.nullTerm();
|
|
}
|
|
}
|
|
|
|
// if we loaded something use it
|
|
if ( isArchiveOrg && s_cookieBuf.length() ) {
|
|
//cookie = s_cookieBuf.getBufStart();
|
|
log("http: using archive cookie %s",s_cookieBuf.getBufStart());
|
|
// and set user-agent too
|
|
// userAgent = "python-requests/2.3.0 "
|
|
// "CPython/2.7.3 Linux/3.5.0-32-generic";
|
|
}
|
|
|
|
char cmd[MAX_URL_LEN+256];
|
|
snprintf( cmd,
|
|
MAX_URL_LEN+256,
|
|
"set -o pipefail|"
|
|
"wget --limit-rate=10M -O- --header=\"Cookie: %s\" \"%s\"|" //
|
|
"zcat|"
|
|
"mbuffer -t -m 10M -o-", //this is useful but we need a new version of mbuffer -W 30
|
|
s_cookieBuf.getBufStart() ,
|
|
m_firstUrl.getUrl());
|
|
|
|
log("build: wget: %s",cmd );
|
|
|
|
FILE* fh = gbpopen(cmd);
|
|
|
|
int fd = fileno(fh);
|
|
int flags = fcntl(fd, F_GETFL, 0);
|
|
if(fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
|
|
log("build: could not set wget stream to nonblocking %s",
|
|
m_firstUrl.getUrl());
|
|
//error
|
|
}
|
|
|
|
if(!g_loop.registerReadCallback ( fd,
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return NULL;
|
|
}
|
|
m_registeredWgetReadCallback = true;
|
|
|
|
|
|
log("build: called popen");
|
|
|
|
m_calledWgetThread = true;
|
|
m_hasMoreToRead = true;
|
|
|
|
return fh;
|
|
|
|
// return getUtf8ContentInFile ( fileSizeArg );
|
|
|
|
|
|
// . callThread returns true on success, in which case we block
|
|
// if ( g_threads.call ( FILTER_THREAD ,
|
|
// MAX_NICENESS ,
|
|
// (void *)this , // this
|
|
// systemDoneWrapper ,
|
|
// systemStartWrapper_r ) )
|
|
// // would block, wait for thread
|
|
// return (BigFile *)-1;
|
|
// // failed?
|
|
// log("build: failed to launch wget thread");
|
|
// If we run it in this thread then if we are fetching
|
|
// a local url it will block forever.
|
|
// systemStartWrapper_r(this,NULL);
|
|
// return getUtf8ContentInFile ( fileSizeArg );
|
|
//g_errno = ETHREADSDISABLED;
|
|
|
|
//return NULL;
|
|
}
|
|
|
|
// . get the final utf8 content of the document
|
|
// . all html entities are replaced with utf8 chars
|
|
// . all iframes are expanded
|
|
// . if we are using diffbot then getting the utf8 content should return
|
|
// the json which is the output from the diffbot api. UNLESS we are getting
|
|
// the webpage itself for harvesting outlinks to spider later.
|
|
char **XmlDoc::getUtf8Content ( ) {
|
|
|
|
// if we already computed it, return that
|
|
if ( m_utf8ContentValid ) return &ptr_utf8Content;
|
|
|
|
if ( m_setFromTitleRec ) {
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
setStatus("getting utf8 content");
|
|
|
|
// recycle?
|
|
if ( cr->m_recycleContent || m_recycleContent ||
|
|
// if trying to delete from index, load from old titlerec
|
|
m_deleteFromIndex ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
ptr_utf8Content = od-> ptr_utf8Content;
|
|
size_utf8Content = od->size_utf8Content;
|
|
m_utf8ContentValid = true;
|
|
m_contentType = od->m_contentType;
|
|
m_contentTypeValid = true;
|
|
// sanity check
|
|
if ( ptr_utf8Content &&
|
|
ptr_utf8Content[size_utf8Content-1] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
return &ptr_utf8Content;
|
|
}
|
|
// if could not find title rec and we are docid-based then
|
|
// we can't go any further!!
|
|
if ( m_setFromDocId ||
|
|
// it should be there if trying to delete as well!
|
|
m_deleteFromIndex ) {
|
|
log("xmldoc: null utf8 content for docid-based "
|
|
"titlerec (d=%" INT64 ") lookup which was not found",
|
|
m_docId);
|
|
ptr_utf8Content = NULL;
|
|
size_utf8Content = 0;
|
|
m_utf8ContentValid = true;
|
|
m_contentType = CT_HTML;
|
|
m_contentTypeValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
char **ep = getExpandedUtf8Content();
|
|
if ( ! ep || ep == (void *)-1 ) return ep;
|
|
|
|
// NULL out if no content
|
|
if ( ! *ep ) {
|
|
ptr_utf8Content = NULL;
|
|
size_utf8Content = 0;
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
|
|
|
|
// if we have a json reply, leave it alone... expanding a &quot;
|
|
// into a double quote will mess up the JSON!
|
|
if ( *ct == CT_JSON ) {
|
|
ptr_utf8Content = (char *)m_expandedUtf8Content;
|
|
size_utf8Content = m_expandedUtf8ContentSize;
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
// why would the spider proxy, which uses Msg13.cpp to call
|
|
// XmlDoc::getExpandedUtf8Content() want to call this??? it seems
|
|
// to destroy expandedutf8content with a call to htmldecode
|
|
if ( m_isSpiderProxy ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// not if rss file extension
|
|
//bool isRSSExt = false;
|
|
//char *ext = m_firstUrl.getExtension();
|
|
//if ( ext && strcasecmp(ext,"rss") == 0 ) isRSSExt = true;
|
|
//if ( ext && strcasecmp(ext,"xml") == 0 ) isRSSExt = true;
|
|
//if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
|
|
|
|
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( m_contentTypeValid && m_contentType == CT_XML ) isRSSExt = true;
|
|
|
|
// convert < to <gb and > to gb/> ???? and & to utf32 char
|
|
// for a double wide ampersand?
|
|
//bool doSpecial = true;
|
|
// convert to what it should be if we are an .rss file extension
|
|
//if ( isRSSExt ) doSpecial = false;
|
|
|
|
// sanity check
|
|
if ( m_xmlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_wordsValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
//
|
|
// convert illegal utf8 characters into spaces
|
|
//
|
|
// fixes santaclarachorale.vbotickets.com/tickets/g.f._handels_israel_in_egypt/1062
|
|
// which has a 228,0x80,& sequence (3 chars, last is ascii)
|
|
uint8_t *x = (uint8_t *)m_expandedUtf8Content;
|
|
char size;
|
|
for ( ; *x ; x += size ) {
|
|
QUICKPOLL(m_niceness);
|
|
size = getUtf8CharSize(x);
|
|
// ok, make it a space i guess if it is a bad utf8 char
|
|
if ( ! isSaneUtf8Char(x) ) {
|
|
*x = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
// skip if only one byte
|
|
if ( size == 1 ) continue;
|
|
// now each byte in the sequence must have 0x80 set...
|
|
if ( ! (x[1] & 0x80) ) {
|
|
x[0] = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
if ( size == 2 ) continue;
|
|
if ( ! (x[2] & 0x80) ) {
|
|
x[0] = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
if ( size == 3 ) continue;
|
|
if ( ! (x[3] & 0x80) ) {
|
|
x[0] = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// sanity
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if we are an xml doc, then before we call htmlDecode translate
|
|
// all tags like <title> or <link> to <gbtitle> or <gblink> so we
|
|
// know they are xml tags. because stuff like &lt;br&gt; will
|
|
// become <br> and will be within its xml tag like <gbdescription>
|
|
// or <gbtitle>.
|
|
// MDW: 9/28/2014. no longer do this since i added hashXmlFields().
|
|
/*
|
|
if ( m_contentType == CT_XML ) {
|
|
// count the xml tags
|
|
char *p = m_expandedUtf8Content;
|
|
char *pend = p + m_expandedUtf8ContentSize - 1;
|
|
int32_t need = m_expandedUtf8ContentSize;
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( *p == '<' ) need += 5; // for adding "gbxml"
|
|
}
|
|
if ( ! m_xbuf.reserve(need) ) return NULL;
|
|
// reset ptr
|
|
p = m_expandedUtf8Content;
|
|
// point to dst
|
|
char *dst = m_xbuf.getBufStart();
|
|
// do the copy
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// copy it over
|
|
*dst++ = *p;
|
|
if ( *p != '<' ) continue;
|
|
// if <?xml> copy over as is, do not insert 'gb'
|
|
if ( p[1] == '?' ) continue;
|
|
// same for comments <!--...-->
|
|
if ( p[1] == '!' ) continue;
|
|
// point to tagname
|
|
char *tagName = p+1;
|
|
if ( p[1] == '/' ) tagName++;
|
|
// also get the full node now
|
|
NodeType *nt; getTagId ( tagName , &nt );
|
|
// if it is not an html tag, do not fuss with it!
|
|
if ( ! nt ) continue;
|
|
// if its in the list but is xml, let it go too
|
|
if ( nt->m_isXmlTag ) continue;
|
|
// . otherwise, its an html tag being used as an xml
|
|
// tag and we need to encode (append gbxml to it)
|
|
// . insert / first if there
|
|
if ( p[1] == '/' ) {p++;*dst++ = *p;}
|
|
// then "gb"
|
|
*dst++ = 'g';
|
|
*dst++ = 'b';
|
|
*dst++ = 'x';
|
|
*dst++ = 'm';
|
|
*dst++ = 'l';
|
|
}
|
|
// update
|
|
m_xbuf.m_length = dst - m_xbuf.getBufStart();
|
|
// final \0
|
|
*dst = '\0';
|
|
// re-assign these
|
|
m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf;
|
|
m_expandedUtf8ContentSize = m_xbuf.m_length + 1;
|
|
// free esbuf if we were referencing that to save mem
|
|
m_esbuf.purge();
|
|
}
|
|
*/
|
|
|
|
// richmondspca.org has &quot; in some tags and we do not like
// expanding that to " because it messes up XmlNode::getTagLen()
// and creates big problems. same for www.first-avenue.com. so
// by setting doSpecial to true we change &lt; &gt; and &quot; to
// [ ] and ' which have no meaning in html per se.
|
|
bool doSpecial = true;
|
|
if ( m_contentType == CT_XML ) doSpecial = false;
|
|
|
|
// . now decode those html entities into utf8 so that we never have to
|
|
// check for html entities anywhere else in the code. a big win!!
|
|
// . doSpecial = true, so that &lt;, &gt;, &amp; and &quot; are
|
|
// encoded into high value
|
|
// utf8 chars so that Xml::set(), etc. still work properly and don't
|
|
// add any more html tags than it should
|
|
// . this will decode in place
|
|
// . MDW: 9/28/2014. no longer do for xml docs since i added
|
|
// hashXmlFields()
|
|
int32_t n = m_expandedUtf8ContentSize - 1;
|
|
if ( m_contentType != CT_XML )
|
|
n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
|
|
m_expandedUtf8Content,//ptr_utf8Content,
|
|
m_expandedUtf8ContentSize-1,//size_utf8Con
|
|
doSpecial,
|
|
m_niceness);
|
|
|
|
// can't exceed this! n does not include the final \0 even though
|
|
// we do write it out.
|
|
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
|
|
// sanity
|
|
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now rss has crap in it like "&nbsp;" so we have to do another
|
|
// decoding pass
|
|
// . MDW: 9/28/2014. no longer do for xml docs since i added
|
|
// hashXmlFields()
|
|
// if ( m_contentType == CT_XML ) // isRSSExt )
|
|
// n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
|
|
// m_expandedUtf8Content,//ptr_utf8Content,
|
|
// n,
|
|
// false,//doSpecial,
|
|
// m_niceness);
|
|
// sanity
|
|
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
|
|
// sanity
|
|
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
|
|
// finally transform utf8 apostrophes into regular apostrophes
|
|
// to make parsing easier
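// . besides apostrophes the loop below also rewrites ordinal markup
//   like "1<sup>st</sup>" to "1st", curly quotes to " (not for json
//   docs), soft hyphens and en/em dashes to '-', and utf8 C1 control
//   chars and utf8 whitespace to plain spaces, copying everything
//   else through unchanged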
|
|
uint8_t *p = (uint8_t *)m_expandedUtf8Content;
|
|
uint8_t *dst = (uint8_t *)m_expandedUtf8Content;
|
|
uint8_t *pend = p + n;
|
|
for ( ; *p ; p += size ) {
|
|
QUICKPOLL(m_niceness);
|
|
size = getUtf8CharSize(p);
|
|
// quick copy
|
|
if ( size == 1 && p[0] != '<' ) { *dst++ = *p; continue; }
|
|
// make "1<super>st</super>" into "1st" so Dates.cpp can
|
|
// have an easier time
|
|
if ( p[0] == '<' &&
|
|
to_lower_a(p[1]) == 's' &&
|
|
to_lower_a(p[2]) == 'u' &&
|
|
to_lower_a(p[3]) == 'p' ) {
|
|
// assume no go!
|
|
*dst++ = '<';
|
|
// use this
|
|
char *s = (char *)p;
|
|
// did number preceed?
|
|
char *pn = s - 1;
|
|
for (;pn>=m_expandedUtf8Content&&is_wspace_a(*pn);pn--)
|
|
QUICKPOLL(m_niceness);
|
|
// must be like "1st" or "32nd"
|
|
if ( ! is_digit(*pn) ) continue;
|
|
// skip the "<sup"
|
|
s += 4;
|
|
// skip until >
|
|
for (; *s && *s != '>' ; s++ )
|
|
QUICKPOLL(m_niceness);
|
|
// crazy?
|
|
if ( ! *s ) continue;
|
|
// skip the '>'
|
|
s++;
|
|
// skip spaces after the "<sup>" tag
|
|
for (; *s && is_wspace_a(*s) ; s++ )
|
|
QUICKPOLL(m_niceness);
|
|
// crazy?
|
|
if ( ! *s ) continue;
|
|
// check for "st" etc
|
|
bool gotIt = false;
|
|
char *suffix = s;
|
|
if ( (to_lower_a(s[0])=='s'&&to_lower_a(s[1]) == 't')||
|
|
(to_lower_a(s[0])=='n'&&to_lower_a(s[1]) == 'd')||
|
|
(to_lower_a(s[0])=='r'&&to_lower_a(s[1]) == 'd')||
|
|
(to_lower_a(s[0])=='t'&&to_lower_a(s[1]) == 'h'))
|
|
gotIt = true;
|
|
if ( ! gotIt ) continue;
|
|
// skip that
|
|
s += 2;
|
|
// skip more spaces
|
|
for (; *s && is_wspace_a(*s) ; s++ )
|
|
QUICKPOLL(m_niceness);
|
|
// crazy?
|
|
if ( ! *s ) continue;
|
|
// find </sup> tag
|
|
if ( s[0] != '<' ) continue;
|
|
if ( s[1] != '/' ) continue;
|
|
if ( to_lower_a(s[2]) != 's' ) continue;
|
|
if ( to_lower_a(s[3]) != 'u' ) continue;
|
|
if ( to_lower_a(s[4]) != 'p' ) continue;
|
|
if ( s[5] != '>' ) continue;
|
|
// skip it, point to >
|
|
s += 5;
|
|
// assign p to that
|
|
p = (unsigned char *)s;
|
|
// back up over the no-go
|
|
dst--;
|
|
// rewrite it
|
|
*dst++ = to_lower_a(suffix[0]);
|
|
*dst++ = to_lower_a(suffix[1]);
|
|
// do next round
|
|
continue;
|
|
}
|
|
|
|
|
|
// check for crazy apostrophes
|
|
if ( p[0]==0xe2 &&
|
|
p[1]==0x80 &&
|
|
(p[2]==0x99 ||
|
|
p[2]==0x98 ||
|
|
p[2]==0x9b ) ) {
|
|
*dst++ = '\'';
|
|
continue;
|
|
}
|
|
// utf8 control character?
|
|
if ( p[0] == 0xc2 &&
|
|
p[1] >= 0x80 &&
|
|
p[1] <= 0x9f ) {
|
|
*dst++ = ' ';
|
|
continue;
|
|
}
|
|
// double quotes in utf8
|
|
// DO NOT do this if type JSON!! json uses quotes as
|
|
// control characters
|
|
if ( p[0] == 0xe2 &&
|
|
p[1] == 0x80 &&
|
|
m_contentType != CT_JSON ) {
|
|
if (p[2] == 0x9c ) {
|
|
*dst++ = '\"';
|
|
continue;
|
|
}
|
|
if (p[2] == 0x9d ) {
|
|
*dst++ = '\"';
|
|
continue;
|
|
}
|
|
}
|
|
// and crazy hyphens (8 - 10pm)
|
|
if ( p[0]==0xc2 &&
|
|
p[1]==0xad ) {
|
|
*dst++ = '-';
|
|
continue;
|
|
}
|
|
if ( p[0]==0xe2 &&
|
|
p[1]==0x80 &&
|
|
p[2]==0x93 ) {
|
|
*dst++ = '-';
|
|
continue;
|
|
}
|
|
if ( p[0]==0xe2 &&
|
|
p[1]==0x80 &&
|
|
p[2]==0x94 ) {
|
|
*dst++ = '-';
|
|
continue;
|
|
}
|
|
// . convert all utf8 white space to ascii white space
|
|
// . should benefit the string matching algo in
|
|
// XmlDoc::getEventSummary() which needs to skip spaces
|
|
if ( ! g_map_is_ascii[(unsigned char)*p] &&
|
|
is_wspace_utf8(p) ) {
|
|
*dst++ = ' ';
|
|
continue;
|
|
}
|
|
// otherwise, just copy it
|
|
gbmemcpy(dst,p,size);
|
|
dst += size;
|
|
}
|
|
// null term
|
|
*dst++ = '\0';
|
|
|
|
// now set it up
|
|
ptr_utf8Content = (char *)m_expandedUtf8Content;
|
|
//size_utf8Content = n+1;//m_expandedUtf8ContentSize;
|
|
size_utf8Content = (char *)dst - m_expandedUtf8Content;
|
|
|
|
// sanity -- skipped over the \0???
|
|
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity check
|
|
if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
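// . getContentHash32Fast() below computes a position-salted xor hash
//   over just the alnum chars of the content: runs of digits collapse
//   to a single '1' and month/weekday names (see s_skips) hash as a
//   single 'X', so e.g. "updated Mon Jan 5" and "updated Tue Feb 17"
//   contribute the same thing and a page's clock/date widgets do not
//   make the doc look changed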
|
|
|
|
// *pend should be \0
|
|
int32_t getContentHash32Fast ( unsigned char *p ,
|
|
int32_t plen ,
|
|
int32_t niceness ) {
|
|
// sanity
|
|
if ( ! p ) return 0;
|
|
if ( plen <= 0 ) return 0;
|
|
if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
unsigned char *pend = p + plen;
|
|
|
|
static bool s_init = false;
|
|
static char s_qtab0[256];
|
|
static char s_qtab1[256];
|
|
static char s_qtab2[256];
|
|
static char *s_skips[] = {
|
|
"jan",
|
|
"feb",
|
|
"mar",
|
|
"apr",
|
|
"may",
|
|
"jun",
|
|
"jul",
|
|
"aug",
|
|
"sep",
|
|
"oct",
|
|
"nov",
|
|
"dec",
|
|
"sun",
|
|
"mon",
|
|
"tue",
|
|
"wed",
|
|
"thu",
|
|
"fri",
|
|
"sat" };
|
|
if ( ! s_init ) {
|
|
// only call this crap once
|
|
s_init = true;
|
|
// clear up
|
|
memset(s_qtab0,0,256);
|
|
memset(s_qtab1,0,256);
|
|
memset(s_qtab2,0,256);
|
|
for ( int32_t i = 0 ; i < 19 ; i++ ) {
|
|
unsigned char *s = (unsigned char *)s_skips[i];
|
|
s_qtab0[(unsigned char)to_lower_a(s[0])] = 1;
|
|
s_qtab0[(unsigned char)to_upper_a(s[0])] = 1;
|
|
// do the quick hash
|
|
unsigned char qh = to_lower_a(s[0]);
|
|
qh ^= to_lower_a(s[1]);
|
|
qh <<= 1;
|
|
qh ^= to_lower_a(s[2]);
|
|
s_qtab1[qh] = 1;
|
|
// try another hash, the swift hash
|
|
unsigned char sh = to_lower_a(s[0]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(s[1]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(s[2]);
|
|
s_qtab2[sh] = 1;
|
|
}
|
|
}
|
|
|
|
bool lastWasDigit = false;
|
|
bool lastWasPunct = true;
|
|
uint32_t h = 0LL;
|
|
//char size = 0;
|
|
unsigned char pos = 0;
|
|
for ( ; p < pend ; p++ ) { // += size ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// get size
|
|
// this might not be utf8!!!
|
|
//size = getUtf8CharSize(p);
|
|
// skip if not alnum
|
|
// this might not be utf8!!!
|
|
//if ( ! is_alnum_utf8 ( (char *)p ) ) {
|
|
if ( ! is_alnum_a ( *p ) ) {
|
|
lastWasDigit = false;
|
|
lastWasPunct = true;
|
|
continue;
|
|
}
|
|
// if its a digit, call it 1
|
|
if ( is_digit(*p) ) {
|
|
// skip consecutive digits
|
|
if ( lastWasDigit ) continue;
|
|
// xor in a '1'
|
|
h ^= g_hashtab[pos][(unsigned char)'1'];
|
|
pos++;
|
|
lastWasDigit = true;
|
|
continue;
|
|
}
|
|
// reset
|
|
lastWasDigit = false;
|
|
|
|
// exclude days of the month or week so clocks do
|
|
// not affect this hash
|
|
if ( s_qtab0[p[0]] && lastWasPunct && p[1] && p[2] ) {
|
|
// quick hash
|
|
unsigned char qh = to_lower_a(p[0]);
|
|
qh ^= to_lower_a(p[1]);
|
|
qh <<= 1;
|
|
qh ^= to_lower_a(p[2]);
|
|
// look that up
|
|
if ( ! s_qtab1[qh] ) goto skip;
|
|
// try another hash, the swift hash
|
|
unsigned char sh = to_lower_a(p[0]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(p[1]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(p[2]);
|
|
if ( ! s_qtab2[sh] ) goto skip;
|
|
// ok, probably a match..
|
|
unsigned char *s = p + 3;
|
|
// skip to end of word
|
|
//char size2;
|
|
//for ( ; s < pend ; s += size2 ) {
|
|
for ( ; s < pend ; s++ ) {
|
|
//size2 = getUtf8CharSize(s);
|
|
//if ( ! is_alnum_utf8 ((char *)s) )
|
|
if ( ! is_alnum_a ( *s ) )
|
|
break;
|
|
}
|
|
// it already points to the next char, so clear this
|
|
//size = 0;
|
|
// advance p now
|
|
p = s;
|
|
// hash as one type of thing...
|
|
h ^= g_hashtab[pos][(unsigned char)'X'];
|
|
pos++;
|
|
continue;
|
|
}
|
|
|
|
skip:
|
|
// reset this
|
|
lastWasPunct = false;
|
|
// xor this in right
|
|
h ^= g_hashtab[pos][p[0]];
|
|
pos++;
|
|
// assume ascii or latin1
|
|
continue;
|
|
/*
|
|
// one more?
|
|
if ( size == 1 ) continue;
|
|
// do that
|
|
h ^= g_hashtab[pos][p[1]];
|
|
pos++;
|
|
// one more?
|
|
if ( size == 2 ) continue;
|
|
// do that
|
|
h ^= g_hashtab[pos][p[2]];
|
|
pos++;
|
|
// one more?
|
|
if ( size == 3 ) continue;
|
|
// do that
|
|
h ^= g_hashtab[pos][p[3]];
|
|
pos++;
|
|
// that should do it!
|
|
continue;
|
|
*/
|
|
}
|
|
return h;
|
|
}
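// . hash of the raw downloaded content ("pure", pre-filter content so
//   it matches what Msg13.cpp computes right after the download); used
//   for things like detecting EDOCUNCHANGED. json docs get the
//   field-aware hash from getContentHashJson32() instead.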
|
|
|
|
int32_t *XmlDoc::getContentHash32 ( ) {
|
|
// return it if we got it
|
|
if ( m_contentHash32Valid ) return &m_contentHash32;
|
|
setStatus ( "getting contenthash32" );
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (int32_t *)ct;
|
|
|
|
// we do not hash the url/resolved_url/html fields in diffbot json
|
|
// because the url field is a mirror of the url and the html field
|
|
// is redundant and would slow us down
|
|
if ( *ct == CT_JSON )
|
|
return getContentHashJson32();
|
|
|
|
// if we are a diffbot json object, fake this for now, it will
|
|
// be set for real in hashJSON()
|
|
// no, because we call this before hashJSON() for to set
|
|
// EDOCUNCHANGED above... so just hash the json normally for now
|
|
//if ( m_isDiffbotJSONObject ) {
|
|
// m_contentHash32 = 0;
|
|
// return &m_contentHash32;
|
|
//}
|
|
|
|
// . get the content. get the pure untouched content!!!
|
|
// . gotta be pure since that is what Msg13.cpp computes right
|
|
// after it downloads the doc...
|
|
// . if iframes are present, msg13 gives up
|
|
char **pure = getContent();
|
|
if ( ! pure || pure == (char **)-1 ) return (int32_t *)pure;
|
|
// size
|
|
//int32_t n = size_utf8Content - 1;
|
|
// hash up to first 10,000 chars
|
|
//if ( n > 10000 ) n = 10000;
|
|
// do it
|
|
//m_contentHash32 = hash32 ( ptr_utf8Content , n );
|
|
unsigned char *p = (unsigned char *)(*pure);
|
|
int32_t plen = m_contentLen;//size_utf8Content - 1;
|
|
|
|
// no content means no hash32
|
|
if ( plen <= 0 ) {//ptr_utf8Content ) {
|
|
m_contentHash32 = 0;
|
|
m_contentHash32Valid = true;
|
|
return &m_contentHash32;
|
|
}
|
|
|
|
// we set m_contentHash32 in ::hashJSON() below because it is special
|
|
// for diffbot since it ignores certain json fields like url: and the
|
|
// fields are independent, and numbers matter, like prices
|
|
//if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; }
|
|
|
|
// *pend should be \0
|
|
m_contentHash32 = getContentHash32Fast ( p , plen , m_niceness );
|
|
// validate
|
|
m_contentHash32Valid = true;
|
|
return &m_contentHash32;
|
|
}
|
|
|
|
// we do not hash the url/resolved_url/html fields in diffbot json
|
|
// because the url field is a mirror of the url and the html field
|
|
// is redundant and would slow us down
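// . the loop below walks the parsed JsonItem list, keeps only number
//   and string items, skips volatile/redundant fields (html, url,
//   pageUrl, resolved_url and the stats/queryString/nextPages/
//   textAnalysis/links subtrees), hashes each item's compound field
//   name with hash32() and combines it with the hash of its value,
//   then xors the field/value hashes together so field order does not
//   matter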
|
|
int32_t *XmlDoc::getContentHashJson32 ( ) {
|
|
|
|
if ( m_contentHash32Valid ) return &m_contentHash32;
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (int32_t *)jp;
|
|
|
|
JsonItem *ji = jp->getFirstItem();
|
|
int32_t totalHash32 = 0;
|
|
|
|
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
|
|
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not number or string
|
|
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
|
continue;
|
|
|
|
char *topName = NULL;
|
|
|
|
// what name level are we?
|
|
int32_t numNames = 1;
|
|
JsonItem *pi = ji->m_parent;
|
|
for ( ; pi ; pi = pi->m_parent ) {
|
|
// empty name?
|
|
if ( ! pi->m_name ) continue;
|
|
if ( ! pi->m_name[0] ) continue;
|
|
topName = pi->m_name;
|
|
numNames++;
|
|
}
|
|
|
|
// if we are the diffbot reply "html" field do not hash this
|
|
// because it is redundant and it hashes html tags etc.!
|
|
// plus it slows us down a lot and bloats the index.
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"html") == 0 )
|
|
continue;
|
|
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"url") == 0 )
|
|
continue;
|
|
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"pageUrl") == 0 )
|
|
continue;
|
|
|
|
// mike will track down how the hash works in article|3|123456
|
|
//if ( ji->m_name && numNames==1 &&
|
|
// strcmp(ji->m_name,"diffbotUri") == 0 )
|
|
// continue;
|
|
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"resolved_url") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"stats") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"queryString") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"nextPages") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"textAnalysis") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"links") == 0 )
|
|
continue;
|
|
|
|
|
|
// hash the fully compound name
|
|
int32_t nameHash32 = 0;
|
|
JsonItem *p = ji;
|
|
char *lastName = NULL;
|
|
for ( ; p ; p = p->m_parent ) {
|
|
// empty name?
|
|
if ( ! p->m_name ) continue;
|
|
if ( ! p->m_name[0] ) continue;
|
|
// dup? can happen with arrays. the parent of a string
// in an object can have the same name as its own parent,
// the name of the array. "dupname":[{"a":"b"},{"c":"d"}]
|
|
if ( p->m_name == lastName ) continue;
|
|
// update
|
|
lastName = p->m_name;
|
|
// hash it up
|
|
nameHash32 = hash32(p->m_name,p->m_nameLen,nameHash32);
|
|
}
|
|
|
|
//
|
|
// now Json.cpp decodes and stores the value into
|
|
// a buffer, so ji->getValue() should be decoded completely
|
|
//
|
|
|
|
// . get the value of the json field
|
|
// . if it's a number or bool it converts into a string
|
|
int32_t vlen;
|
|
char *val = ji->getValueAsString( &vlen );
|
|
|
|
//
|
|
// for deduping search results we set m_contentHash32 here for
|
|
// diffbot json objects.
|
|
//
|
|
// we use this hash for setting EDOCUNCHANGED when reindexing
|
|
// a diffbot reply. we also use to see if the diffbot reply
|
|
// is a dup with another page in the index. thirdly, we use
|
|
// to dedup search results, which could be redundant because
|
|
// of our spider-time deduping.
|
|
//
|
|
// make the content hash so we can set m_contentHash32
|
|
// for deduping. do an exact hash for now...
|
|
int32_t vh32 = hash32 ( val , vlen , m_niceness );
|
|
// combine
|
|
int32_t combined32 = hash32h ( nameHash32 , vh32 );
|
|
// accumulate field/val pairs order independently
|
|
totalHash32 ^= combined32;
|
|
// debug note
|
|
//logf(LOG_DEBUG,"ch32: field=%s nh32=%" UINT32 " vallen=%" INT32 "",
|
|
// ji->m_name,
|
|
// nameHash32,
|
|
// vlen);
|
|
}
|
|
|
|
m_contentHash32 = totalHash32;
|
|
m_contentHash32Valid = true;
|
|
return &m_contentHash32;
|
|
}
|
|
|
|
// do not consider tags except frame, iframe and img... make all months
// and days of weeks and digits basically the same
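// each qualifying tag node is hashed with getContentHash32Fast() and
// the node hashes are folded together with hash64h()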
|
|
int64_t *XmlDoc::getLooseContentHash64 ( ) {
|
|
|
|
if ( m_looseContentHash64Valid )
|
|
return &m_looseContentHash64;
|
|
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int64_t *)xml;
|
|
|
|
int64_t h64 = 0LL;
|
|
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes ();
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// skip if not the right kinda tag
|
|
if ( nodes[i].isTag() &&
|
|
nodes[i].getNodeId() != TAG_FRAME &&
|
|
nodes[i].getNodeId() != TAG_IFRAME &&
|
|
nodes[i].getNodeId() != TAG_IMG )
|
|
continue;
|
|
|
|
// hash that node up
|
|
int64_t ch64;
|
|
|
|
// this is really a 32-bit hash
|
|
ch64=getContentHash32Fast((unsigned char *)nodes[i].getNode() ,
|
|
nodes[i].getNodeLen() ,
|
|
m_niceness );
|
|
|
|
// incorporate hash from that node
|
|
h64 = hash64h ( ch64 , h64 );
|
|
}
|
|
|
|
m_looseContentHash64Valid = true;
|
|
m_looseContentHash64 = h64;
|
|
return &m_looseContentHash64;
|
|
}
|
|
|
|
int32_t XmlDoc::getHostHash32a ( ) {
|
|
if ( m_hostHash32aValid ) return m_hostHash32a;
|
|
m_hostHash32aValid = true;
|
|
Url *f = getFirstUrl();
|
|
m_hostHash32a = f->getHostHash32();
|
|
return m_hostHash32a;
|
|
}
|
|
|
|
int32_t XmlDoc::getHostHash32b ( ) {
|
|
if ( m_hostHash32bValid ) return m_hostHash32b;
|
|
m_hostHash32bValid = true;
|
|
Url *c = getCurrentUrl();
|
|
m_hostHash32b = c->getHostHash32();
|
|
return m_hostHash32b;
|
|
}
|
|
|
|
int32_t XmlDoc::getDomHash32( ) {
|
|
if ( m_domHash32Valid ) return m_domHash32;
|
|
m_domHash32Valid = true;
|
|
Url *f = getFirstUrl();
|
|
m_domHash32 = hash32 ( f->getDomain(), f->getDomainLen() );
|
|
return m_domHash32;
|
|
}
|
|
|
|
// . this will be the actual pnm data of the image thumbnail
|
|
// . you can inline it in an image tag like
|
|
// <img src="data:image/png;base64,iVBORw0...."/>
|
|
// background-image:url(data:image/png;base64,iVBORw0...);
|
|
// . FORMAT of ptr_imageData:
|
|
// <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg>
|
|
char **XmlDoc::getThumbnailData ( ) {
|
|
if ( m_imageDataValid ) return &ptr_imageData;
|
|
Images *images = getImages();
|
|
if ( ! images || images == (Images *)-1 ) return (char **)images;
|
|
ptr_imageData = NULL;
|
|
size_imageData = 0;
|
|
m_imageDataValid = true;
|
|
if ( ! images || ! images->m_imageBufValid ) return &ptr_imageData;
|
|
if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData;
|
|
// this buffer is a ThumbnailArray
|
|
ptr_imageData = images->m_imageBuf.getBufStart();
|
|
size_imageData = images->m_imageBuf.length();
|
|
return &ptr_imageData;
|
|
}
|
|
|
|
Images *XmlDoc::getImages ( ) {
|
|
if ( m_imagesValid ) return &m_images;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
if ( ! cr->m_makeImageThumbnails ) {
|
|
m_images.reset();
|
|
m_imagesValid = true;
|
|
return &m_images;
|
|
}
|
|
|
|
if ( cr->m_isCustomCrawl ) {
|
|
m_images.reset();
|
|
m_imagesValid = true;
|
|
return &m_images;
|
|
}
|
|
|
|
setStatus ( "getting thumbnail" );
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (Images *)words;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Images *)xml;
|
|
Sections *sections = getSections();
|
|
if ( ! sections || sections==(Sections *)-1) return (Images *)sections;
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (Images *)site;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Images *)d;
|
|
int8_t *hc = getHopCount();
|
|
if ( ! hc || hc == (void *)-1 ) return (Images *)hc;
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (Images *)cu;
|
|
|
|
// . this does not block or anything
|
|
// . if we are a diffbot json reply it should just use the primary
|
|
// image, if any, as the only candidate
|
|
m_images.setCandidates ( cu , words , xml , sections , this );
|
|
|
|
setStatus ("getting thumbnail");
|
|
|
|
// assume valid
|
|
m_imagesValid = true;
|
|
|
|
// now get the thumbnail
|
|
if ( ! m_images.getThumbnail ( site ,
|
|
gbstrlen(site) ,
|
|
*d ,
|
|
this ,
|
|
cr->m_collnum ,
|
|
//NULL , // statusPtr ptr
|
|
*hc ,
|
|
m_masterState,
|
|
m_masterLoop ) )
|
|
return (Images *)-1;
|
|
|
|
return &m_images;
|
|
}
|
|
|
|
|
|
// . get different attributes of the Links as vectors
|
|
// . these are 1-1 with the Links::m_linkPtrs[] array
|
|
TagRec ***XmlDoc::getOutlinkTagRecVector () {
|
|
|
|
// if page has a <meta name=usefakeips content=1> tag
|
|
// then use the hash of the links host as the firstip.
|
|
// this will speed things up when adding a gbdmoz.urls.txt.*
|
|
// file to index every url in dmoz.
|
|
char *useFakeIps = hasFakeIpsMetaTag();
|
|
if ( ! useFakeIps || useFakeIps == (void *)-1 )
|
|
return (TagRec ***)useFakeIps;
|
|
|
|
// no error and valid, return quick
|
|
if ( m_outlinkTagRecVectorValid && *useFakeIps )
|
|
return &m_outlinkTagRecVector;
|
|
|
|
// error?
|
|
if ( m_outlinkTagRecVectorValid && m_msge0.m_errno ) {
|
|
g_errno = m_msge0.m_errno;
|
|
return NULL;
|
|
}
|
|
|
|
// if not using fake ips, give them the real tag rec vector
|
|
if ( m_outlinkTagRecVectorValid )
|
|
return &m_msge0.m_tagRecPtrs;
|
|
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (void *) -1 ) return (TagRec ***)links;
|
|
|
|
if ( *useFakeIps ) {
|
|
// set to those
|
|
m_fakeTagRec.reset();
|
|
// just make a bunch ptr to empty tag rec
|
|
int32_t need = links->m_numLinks * sizeof(TagRec *);
|
|
if ( ! m_fakeTagRecPtrBuf.reserve ( need ) ) return NULL;
|
|
// make them all point to the fake empty tag rec
|
|
TagRec **grv = (TagRec **)m_fakeTagRecPtrBuf.getBufStart();
|
|
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ )
|
|
grv[i] = &m_fakeTagRec;
|
|
// set it
|
|
m_outlinkTagRecVector = grv;
|
|
m_outlinkTagRecVectorValid = true;
|
|
return &m_outlinkTagRecVector;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
|
|
// update status msg
|
|
setStatus ( "getting outlink tag rec vector" );
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (TagRec ***)gr;
|
|
// assume valid
|
|
m_outlinkTagRecVectorValid = true;
|
|
// go get it
|
|
if ( ! m_msge0.getTagRecs ( links->m_linkPtrs ,
|
|
links->m_linkFlags ,
|
|
links->m_numLinks ,
|
|
false , // skip old?
|
|
// make it point to this basetagrec if
|
|
// the LF_SAMEHOST flag is set for the link
|
|
gr ,
|
|
cr->m_collnum ,
|
|
m_niceness ,
|
|
m_masterState ,
|
|
m_masterLoop )) {
|
|
// sanity check
|
|
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
|
|
// we blocked
|
|
return (TagRec ***)-1;
|
|
}
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// or this?
|
|
if ( m_msge0.m_errno ) {
|
|
g_errno = m_msge0.m_errno;
|
|
return NULL;
|
|
}
|
|
// set it
|
|
//m_outlinkTagRecVector = m_msge0.m_tagRecPtrs;
|
|
// ptr to a list of ptrs to tag recs
|
|
return &m_msge0.m_tagRecPtrs;
|
|
}
|
|
|
|
char *XmlDoc::hasNoIndexMetaTag() {
|
|
if ( m_hasNoIndexMetaTagValid )
|
|
return &m_hasNoIndexMetaTag;
|
|
// assume none
|
|
m_hasNoIndexMetaTag = false;
|
|
// store value/content of meta tag in here
|
|
char mbuf[16];
|
|
mbuf[0] = '\0';
|
|
char *tag = "noindex";
|
|
int32_t tlen = gbstrlen(tag);
|
|
// check the xml for a meta tag
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
if ( mbuf[0] == '1' ) m_hasNoIndexMetaTag = true;
|
|
m_hasNoIndexMetaTagValid = true;
|
|
return &m_hasNoIndexMetaTag;
|
|
}
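// a page can opt out of real dns lookups for its outlinks with a
// <meta name=usefakeips content=1> tag; like hasNoIndexMetaTag() above
// this just reads the meta tag's content with Xml::getMetaContent()
// and caches whether it was "1"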
|
|
|
|
|
|
char *XmlDoc::hasFakeIpsMetaTag ( ) {
|
|
if ( m_hasUseFakeIpsMetaTagValid ) return &m_hasUseFakeIpsMetaTag;
|
|
|
|
char mbuf[16];
|
|
mbuf[0] = '\0';
|
|
char *tag = "usefakeips";
|
|
int32_t tlen = gbstrlen(tag);
|
|
|
|
// check the xml for a meta tag
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
|
|
m_hasUseFakeIpsMetaTag = false;
|
|
if ( mbuf[0] == '1' ) m_hasUseFakeIpsMetaTag = true;
|
|
m_hasUseFakeIpsMetaTagValid = true;
|
|
return &m_hasUseFakeIpsMetaTag;
|
|
}
|
|
|
|
|
|
int32_t **XmlDoc::getOutlinkFirstIpVector () {
|
|
|
|
Links *links = getLinks();
|
|
if ( ! links ) return NULL;
|
|
|
|
// if page has a <meta name=usefakeips content=1> tag
|
|
// then use the hash of the links host as the firstip.
|
|
// this will speed things up when adding a gbdmoz.urls.txt.*
|
|
// file to index every url in dmoz.
|
|
char *useFakeIps = hasFakeIpsMetaTag();
|
|
if ( ! useFakeIps || useFakeIps == (void *)-1 )
|
|
return (int32_t **)useFakeIps;
|
|
|
|
if ( *useFakeIps && m_outlinkIpVectorValid )
|
|
return &m_outlinkIpVector;
|
|
|
|
if ( *useFakeIps ) {
|
|
int32_t need = links->m_numLinks * 4;
|
|
// check the reserve like the fake tag rec ptr buf above does
if ( ! m_fakeIpBuf.reserve ( need ) ) return NULL;
|
|
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) {
|
|
uint64_t h64 = links->getHostHash64(i);
|
|
int32_t ip = h64 & 0xffffffff;
|
|
m_fakeIpBuf.pushLong(ip);
|
|
}
|
|
int32_t *ipBuf = (int32_t *)m_fakeIpBuf.getBufStart();
|
|
m_outlinkIpVector = ipBuf;
|
|
m_outlinkIpVectorValid = true;
|
|
return &m_outlinkIpVector;
|
|
}
|
|
|
|
// return msge1's buf otherwise
|
|
if ( m_outlinkIpVectorValid )
|
|
return &m_msge1.m_ipBuf;
|
|
|
|
// should we have some kinda error for msge1?
|
|
//if ( m_outlinkIpVectorValid && m_msge1.m_errno ) {
|
|
// g_errno = m_msge1.m_errno;
|
|
// return NULL;
|
|
//}
|
|
|
|
// . we now scrounge them from TagRec's "firstip" tag if there!
|
|
// . that way even if a domain changes its ip we still use the
|
|
// original ip, because the only reason we need this ip is for
|
|
// deciding which group of hosts will store this SpiderRequest and
|
|
// we use that for throttling, so we have to be consistent!!!
|
|
// . we never add -1 or 0 ips to tagdb though.... (NXDOMAIN,error...)
|
|
// . uses m_msgeForTagRecs for this one
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (int32_t **)grv;
|
|
// note it
|
|
setStatus ( "getting outlink first ip vector" );
|
|
// assume valid
|
|
m_outlinkIpVectorValid = true;
|
|
// sanity check
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// use this
|
|
int32_t nowGlobal = getSpideredTime();//m_spideredTime;
|
|
// add tags to tagdb?
|
|
bool addTags = true;
|
|
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
|
|
if ( getIsPageParser() ) addTags = false;
|
|
// get this
|
|
char *testDir = getTestDir();
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . go get it
|
|
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
|
|
// see if the ip is in there for the given url hostname
|
|
// . this will now update Tagdb with the "firstip" tags if it should!!
|
|
// . this just dns looks up the DOMAINS of each outlink because these
|
|
// are *first* ips and ONLY used by Spider.cpp for throttling!!!
|
|
if ( ! m_msge1.getFirstIps ( *grv ,
|
|
links->m_linkPtrs ,
|
|
links->m_linkFlags ,
|
|
links->m_numLinks ,
|
|
false , // skip old?
|
|
cr->m_coll ,
|
|
m_niceness ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
nowGlobal ,
|
|
addTags ,
|
|
testDir )) {
|
|
// sanity check
|
|
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
|
|
// we blocked
|
|
return (int32_t **)-1;
|
|
}
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// . ptr to a list of ptrs to tag recs
|
|
// . ip will be -1 on error
|
|
return &m_msge1.m_ipBuf;
|
|
}
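// Minimal sketch (assumed standalone helper, not called anywhere in the
// build) of the fake-firstip shortcut used above: when the usefakeips meta
// tag is present, each outlink's "first ip" is just the low 32 bits of its
// host hash, so no DNS lookup is needed per outlink.
static int32_t fakeFirstIpSketch ( uint64_t hostHash64 ) {
	// the low 32 bits of the 64-bit host hash stand in for the real ip
	return (int32_t)(hostHash64 & 0xffffffff);
}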
|
|
|
|
/*
|
|
// really this could just check titledb in memory tree and tfndb and should
|
|
// be really fast!!
|
|
char **XmlDoc::getOutlinkIsIndexedVector () {
|
|
if ( m_outlinkIsIndexedVectorValid ) return &m_msge2.m_isIndexedBuf;
|
|
setStatus ( "getting outlink is indexed vector" );
|
|
Links *links = getLinks();
|
|
if ( ! links ) return NULL;
|
|
// assume valid
|
|
m_outlinkIsIndexedVectorValid = true;
|
|
// go get it
|
|
bool status = m_msge2.getIsIndexed ( links->m_linkPtrs ,
|
|
links->m_linkFlags ,
|
|
links->m_numLinks ,
|
|
false , // skip old?
|
|
m_coll ,
|
|
m_niceness ,
|
|
m_masterState ,
|
|
m_masterLoop );
|
|
// set it
|
|
//m_outlinkIsIndexedVector = m_msge2.m_isIndexedBuf;
|
|
// we blocked
|
|
if ( ! status ) return (char **)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// ptr to a list of ptrs to tag recs
|
|
return &m_msge2.m_isIndexedBuf;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
char *XmlDoc::getIsVisible ( ) {
|
|
if ( m_isVisibleValid ) return &m_isVisible;
|
|
setStatus ( "getting is visible" );
|
|
// to get a live reading, invalidate tag rec from title rec
|
|
m_oldTagRecValid = false;
|
|
// . loop over all regular expression in the url filters table
|
|
// . stop at first regular expression it matches
|
|
int32_t *rn = getRegExpNum2 ( -1 );
|
|
// need to wait for a callback at this point (or we had critical error)
|
|
if ( ! rn || rn == (int32_t *)-1 ) return (char *)rn;
|
|
// assume yes
|
|
m_isVisible = true;
|
|
// and valid
|
|
m_isVisibleValid = true;
|
|
// no match
|
|
if ( *rn == -1 ) return &m_isVisible;
|
|
// get spider priority
|
|
int32_t pr = m_cr->m_spiderPriorities[*rn];
|
|
// test it
|
|
if ( pr == -2 ) m_isVisible = false;
|
|
if ( pr == -3 ) m_isVisible = false;
|
|
return &m_isVisible;
|
|
}
|
|
*/
|
|
|
|
int32_t *XmlDoc::getUrlFilterNum ( ) {
|
|
// return it if already set
|
|
if ( m_urlFilterNumValid ) return &m_urlFilterNum;
|
|
// note that
|
|
setStatus ( "getting url filter row num");
|
|
|
|
// . make the partial new spider rec
|
|
// . we need this for matching filters like lang==zh_cn
|
|
// . crap, but then it matches "hasReply" when it should not
|
|
// . PROBLEM! this is the new reply not the OLD reply, so it may
|
|
// end up matching a DIFFERENT url filter num then what it did
|
|
// before we started spidering it...
|
|
//SpiderReply *newsr = getNewSpiderReply ( );
|
|
// note it
|
|
//if ( ! newsr )
|
|
// log("doc: getNewSpiderReply: %s",mstrerror(g_errno));
|
|
//if ( ! newsr || newsr == (void *)-1 ) return (int32_t *)newsr;
|
|
|
|
// need language i guess
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (int32_t *)langId;
|
|
|
|
|
|
// make a fake one for now
|
|
// SpiderReply fakeReply;
|
|
// // fix errors
|
|
// fakeReply.reset();
|
|
// fakeReply.m_isIndexedINValid = true;
|
|
// // just language for now, so we can FILTER by language
|
|
// if ( m_langIdValid ) fakeReply.m_langId = m_langId;
|
|
|
|
int32_t langIdArg = -1;
|
|
if ( m_langIdValid ) langIdArg = m_langId;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t spideredTime = getSpideredTime();
|
|
// get the spider request
|
|
SpiderRequest *oldsr = &m_sreq;
|
|
// null it out if invalid...
|
|
if ( ! m_sreqValid ) oldsr = NULL;
|
|
// do not set the spideredTime in the spiderReply to 0
|
|
// so we do not trigger the lastSpiderTime
|
|
//int32_t saved = newsr->m_spideredTime;
|
|
//newsr->m_spideredTime = 0;
|
|
//
|
|
// PROBLEM: we end up matching "isIndexed" in the url filters
|
|
// even if this is a NEW document because we pass it in the spider
|
|
// reply that we generate now even though another spider reply
|
|
// may not exist.
|
|
//
|
|
// SOLUTION: just do not supply a spider reply, we only seem to
|
|
// use the urlfilternum to get a diffbot api url OR to see if the
|
|
// document is banned/filtered so we should delete it. otherwise
|
|
// we were supplying "newsr" above...
|
|
|
|
// . look it up
|
|
// . use the old spidered date for "nowGlobal" so we can be consistent
|
|
// for injecting into the "qatest123" coll
|
|
int32_t ufn = ::getUrlFilterNum ( oldsr,
|
|
NULL,//&fakeReply,
|
|
spideredTime,false,
|
|
m_niceness,cr,
|
|
false, // isOutlink?
|
|
NULL,
|
|
langIdArg);
|
|
|
|
// put it back
|
|
//newsr->m_spideredTime = saved;
|
|
|
|
// bad news?
|
|
if ( ufn < 0 ) {
|
|
log("build: failed to get url filter for xmldoc %s",
|
|
m_firstUrl.m_url);
|
|
//g_errno = EBADENGINEER;
|
|
//return NULL;
|
|
}
|
|
|
|
|
|
// store it
|
|
m_urlFilterNum = ufn;
|
|
m_urlFilterNumValid = true;
|
|
|
|
// set this too in case the url filters table changes while
|
|
// we are spidering this and a row is inserted or deleted or something
|
|
//SafeBuf *yy = &cr->m_spiderDiffbotApiUrl[ufn];
|
|
// copy to ours
|
|
//m_diffbotApiUrl.safeMemcpy ( yy );
|
|
// ensure null term
|
|
//m_diffbotApiUrl.nullTerm();
|
|
//m_diffbotApiUrlValid = true;
|
|
|
|
|
|
return &m_urlFilterNum;
|
|
}
|
|
|
|
// . "u" and "site" need not start with http:// or https://; any protocol prefix is stripped below
|
|
bool isSiteRootFunc ( char *u , char *site ) {
|
|
// get length of each
|
|
int32_t slen = gbstrlen(site);//m_siteLen;
|
|
int32_t ulen = gbstrlen(u);
|
|
// "site" may or may not end in /, so remove that
|
|
if ( site[slen-1] == '/' ) slen--;
|
|
// same for url
|
|
if ( u[ulen-1] == '/' ) ulen--;
|
|
// skip http:// or https://
|
|
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
|
|
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
|
|
if ( strncmp(site,"http://" ,7)==0 ) { site += 7; slen -= 7; }
|
|
if ( strncmp(site,"https://",8)==0 ) { site += 8; slen -= 8; }
|
|
// subtract default.asp etc. from "u"
|
|
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.asp",11)==0 )
|
|
// ulen -= 11;
|
|
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.html",12)==0 )
|
|
// ulen -= 12;
|
|
//if ( ulen > 15 && strncasecmp(u+ulen-11,"index.html",10)==0 )
|
|
// ulen -= 10;
|
|
// now they must match exactly
|
|
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
|
|
// all done
|
|
return false;
|
|
}
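// Quick illustration of the comparison above with hypothetical inputs
// (trailing slashes and protocol prefixes are stripped before comparing):
//   isSiteRootFunc ( "http://example.com/"      , "example.com"  ) -> true
//   isSiteRootFunc ( "https://example.com"      , "example.com/" ) -> true
//   isSiteRootFunc ( "http://example.com/blog/" , "example.com"  ) -> false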
|
|
|
|
bool isSiteRootFunc3 ( char *u , int32_t siteRootHash32 ) {
|
|
// get length of each
|
|
int32_t ulen = gbstrlen(u);
|
|
// remove trailing /
|
|
if ( u[ulen-1] == '/' ) ulen--;
|
|
// skip http:// or https://
|
|
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
|
|
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
|
|
// now they must match exactly
|
|
int32_t sh32 = hash32(u,ulen);
|
|
return ( sh32 == siteRootHash32 );
|
|
}
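// The hash-based variant above does the same normalization but compares a
// 32-bit hash instead of the strings, e.g. (hypothetical values):
//   isSiteRootFunc3 ( "http://example.com/" , hash32("example.com",11) ) -> true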
|
|
|
|
char *XmlDoc::getIsSiteRoot ( ) {
|
|
if ( m_isSiteRootValid ) return &m_isSiteRoot2;
|
|
// get our site
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (char *)site;
|
|
// get our url without the http:// or https://
|
|
char *u = getFirstUrl()->getHost();
|
|
if ( ! u ) {
|
|
g_errno = EBADURL;
|
|
return NULL;
|
|
}
|
|
// assume valid now
|
|
m_isSiteRootValid = true;
|
|
// get it
|
|
bool isRoot = isSiteRootFunc ( u , site );
|
|
// seems like https://twitter.com/ is not getting set to root
|
|
if ( m_firstUrl.getPathDepth(true) == 0 && ! m_firstUrl.isCgi() )
|
|
isRoot = true;
|
|
m_isSiteRoot2 = m_isSiteRoot = isRoot;
|
|
return &m_isSiteRoot2;
|
|
}
|
|
|
|
/*
|
|
bool XmlDoc::getIsOutlinkSiteRoot ( char *u , TagRec *gr ) {
|
|
// get our site
|
|
Tag *tag = gr->getTag("site");
|
|
// make "host" point to u's hostname
|
|
int32_t hostLen; char *host = getHostFast ( u , &hostLen );
|
|
// use hostname?
|
|
char *site;
|
|
int32_t slen;
|
|
if ( tag ) {
|
|
site = tag->getTagData();
|
|
slen = tag->getTagDataSize() - 1;
|
|
}
|
|
// otherwise, use hostname as site
|
|
else {
|
|
// must be end, or could be '/'
|
|
if ( ! host[hostLen] || ! host[hostLen+1] ) return true;
|
|
// i guess we were more than just a hostname, so not site root
|
|
return false;
|
|
}
|
|
// get length of each
|
|
int32_t ulen = gbstrlen(u);
|
|
// "site" may or may not end in /, so remove that
|
|
if ( site[slen-1] == '/' ) slen--;
|
|
// same for url
|
|
if ( u[ulen-1] == '/' ) ulen--;
|
|
// now they must match exactly
|
|
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
|
|
// all done
|
|
return false;
|
|
}
|
|
*/
|
|
|
|
|
|
int8_t *XmlDoc::getHopCount ( ) {
|
|
// return now if valid
|
|
if ( m_hopCountValid ) return &m_hopCount;
|
|
|
|
setStatus ( "getting hop count" );
|
|
|
|
CollectionRec *cr = this->getCollRec();
|
|
if(cr && cr->m_isCustomCrawl ) {
|
|
// for diffbot collections, compute hopcount without casting
|
|
// site/rss to 0 hopcount -- copied from below
|
|
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if (!info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
|
|
int32_t origHopCount = -1;
|
|
if ( m_sreqValid ) {
|
|
origHopCount = m_sreq.m_hopCount;
|
|
}
|
|
int32_t hc = -1;
|
|
// if(m_minInlinkerHopCount+1 < hc && m_minInlinkerHopCount>=0)
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
if ( origHopCount < hc && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
if ( hc == -1 && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
if ( hc == -1 )
|
|
hc = 1;
|
|
if ( hc > 0x7f ) hc = 0x7f;
|
|
m_hopCountValid = true;
|
|
m_hopCount = hc;
|
|
|
|
//printf("Custom hopcount: %d for url: %s",
|
|
//m_hopCount, this->ptr_firstUrl);
|
|
return &m_hopCount;
|
|
}
|
|
|
|
// the unredirected url
|
|
Url *f = getFirstUrl();
|
|
// get url as string, skip "http://" or "https://"
|
|
//char *u = f->getHost();
|
|
// if we match site, we are a site root, so hop count is 0
|
|
//char *isr = getIsSiteRoot();
|
|
//if ( ! isr || isr == (char *)-1 ) return (int8_t *)isr;
|
|
//if ( *isr ) {
|
|
// m_hopCount = 0;
|
|
// m_hopCountValid = true;
|
|
// return &m_hopCount;
|
|
//}
|
|
// ping servers have 0 hop counts
|
|
if ( f->isPingServer() ) {
|
|
// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
|
|
m_hopCount = 0;
|
|
m_hopCountValid = true;
|
|
return &m_hopCount;
|
|
}
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
|
|
// check for site root
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int8_t *)gr;
|
|
// and site roots
|
|
char *isSiteRoot = getIsSiteRoot();
|
|
if (!isSiteRoot ||isSiteRoot==(char *)-1) return (int8_t *)isSiteRoot;
|
|
if ( *isSiteRoot ) {
|
|
// log("xmldoc: hc1 is 0 (siteroot) %s",m_firstUrl.m_url);
|
|
m_hopCount = 0;
|
|
m_hopCountValid = true;
|
|
return &m_hopCount;
|
|
}
|
|
// make sure m_minInlinkerHopCount is valid
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
|
|
// . fix bad original hop counts
|
|
// . assign this hop count from the spider rec
|
|
int32_t origHopCount = -1;
|
|
if ( m_sreqValid ) origHopCount = m_sreq.m_hopCount;
|
|
// derive our hop count from our parent hop count
|
|
int32_t hc = -1;
|
|
// . BUT use inlinker if better
|
|
// . if m_linkInfo1Valid is true, then m_minInlinkerHopCount is valid
|
|
// if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 )
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
// or if parent is unknown, but we have a known inlinker with a
|
|
// valid hop count, use the inlinker hop count then
|
|
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
// if ( origHopCount == 0 )
|
|
// log("xmldoc: hc3 is 0 (spiderreq) %s",m_firstUrl.m_url);
|
|
// or use our hop count from the spider rec if better
|
|
if ( origHopCount < hc && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
// or if neither parent or inlinker was valid hop count
|
|
if ( hc == -1 && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
// if we have no hop count at this point, i guess just pick 1!
|
|
if ( hc == -1 )
|
|
hc = 1;
|
|
// truncate, hop count is only one byte in the TitleRec.h::m_hopCount
|
|
if ( hc > 0x7f ) hc = 0x7f;
|
|
|
|
// and now so do rss urls.
|
|
if ( *isRSS && hc > 1 ) {
|
|
// force it to one, not zero, otherwise it gets pounded
|
|
// too hard on the aggregator sites. spider priority
|
|
// is too high
|
|
m_hopCount = 1;
|
|
m_hopCountValid = true;
|
|
return &m_hopCount;
|
|
}
|
|
|
|
// unknown hop counts (-1) are propagated, except for root urls
|
|
m_hopCountValid = true;
|
|
m_hopCount = hc;
|
|
return &m_hopCount;
|
|
}
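// Minimal sketch (assumed helper, not part of the build) of the hop count
// precedence implemented above: site roots and ping servers get 0, otherwise
// fall back to the spider request's hop count (or 1 if unknown), clamp RSS
// pages to 1, and cap at 0x7f since TitleRec stores hop count in one byte.
static int8_t hopCountSketch ( bool isSiteRoot , bool isPingServer ,
			       bool isRSS , int32_t origHopCount ) {
	if ( isSiteRoot || isPingServer ) return 0;
	int32_t hc = ( origHopCount >= 0 ) ? origHopCount : 1;
	if ( hc > 0x7f ) hc = 0x7f;
	if ( isRSS && hc > 1 ) hc = 1;
	return (int8_t)hc;
}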
|
|
|
|
/*
|
|
int8_t *XmlDoc::getOutlinkHopCountVector ( ) {
|
|
if ( m_outlinkHopCountVectorValid ) return m_outlinkHopCountVector;
|
|
// need these of course
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (int8_t *)links;
|
|
// and these for seeing if outlink is a site root
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (int8_t *)grv;
|
|
// hop count of parent
|
|
int8_t *ph = getHopCount();
|
|
if ( ! ph || ph == (void *)-1 ) return (int8_t *)ph;
|
|
// shortcut
|
|
int32_t n = links->getNumLinks();
|
|
// sanity check
|
|
if ( m_outlinkHopCountVector ) { char *xx=NULL;*xx=0; }
|
|
// make some space
|
|
m_outlinkHopCountVector = (int8_t *)mmalloc ( n * 4 ,"xdhc");
|
|
// return NULL on error with g_errno set
|
|
if ( ! m_outlinkHopCountVector ) return NULL;
|
|
// save size
|
|
m_outlinkHopCountVectorSize = n * 4;
|
|
// stock it
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get it
|
|
char *u = links->getLinkPtr(i);
|
|
// and this
|
|
TagRec *gr = (*grv)[i];
|
|
// flags
|
|
linkflags_t flags = links->m_linkFlags[i];
|
|
// hop count. default to 1.
|
|
int32_t hc = 1;
|
|
if ( getIsOutlinkSiteRoot ( u , gr ) ) hc = 0;
|
|
else if ( isPingServer ( u ) ) hc = 0;
|
|
else if ( flags & LF_RSS ) hc = 0;
|
|
else hc = *ph + 1;
|
|
// assign it
|
|
m_outlinkHopCountVector[i] = hc;
|
|
}
|
|
m_outlinkHopCountVectorValid = true;
|
|
return m_outlinkHopCountVector;
|
|
}
|
|
*/
|
|
|
|
// set to false for injecting and validate it... if &spiderlinks=0
|
|
// should we spider links?
|
|
char *XmlDoc::getSpiderLinks ( ) {
|
|
// set it to false on issues
|
|
//if ( m_indexCode ) {
|
|
// m_spiderLinks = false;
|
|
// m_spiderLinks2 = false;
|
|
// m_spiderLinksValid = true ; }
|
|
|
|
// this slows importing down because we end up doing ip lookups
|
|
// for every outlink if "firstip" not in tagdb.
|
|
// shoot. set2() already sets m_spiderLinksValid to true so we
|
|
// have to override if importing.
|
|
if ( m_isImporting && m_isImportingValid ) {
|
|
m_spiderLinks = false;
|
|
m_spiderLinks2 = false;
|
|
m_spiderLinksValid = true;
|
|
return &m_spiderLinks2;
|
|
}
|
|
|
|
// return the valid value
|
|
if ( m_spiderLinksValid ) return &m_spiderLinks2;
|
|
|
|
setStatus ( "getting spider links flag");
|
|
|
|
// do not add links now if doing the parser test
|
|
if ( g_conf.m_testParserEnabled ||
|
|
m_isDiffbotJSONObject ) {
|
|
m_spiderLinks = false;
|
|
m_spiderLinks2 = false;
|
|
m_spiderLinksValid = true;
|
|
return &m_spiderLinks2;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return (char *)cr;
|
|
|
|
int32_t *ufn = getUrlFilterNum();
|
|
if ( ! ufn || ufn == (void *)-1 ) return (char *)ufn;
|
|
|
|
// if url filters forbids it
|
|
if ( ! cr->m_harvestLinks[*ufn] ) {
|
|
m_spiderLinksValid = true;
|
|
m_spiderLinks2 = false;
|
|
m_spiderLinks = false;
|
|
return &m_spiderLinks2;
|
|
}
|
|
|
|
// hack for bulk job detection. never spider links
|
|
//if ( cr->m_isCustomCrawl == 2 ) {
|
|
// m_spiderLinks = false;
|
|
// m_spiderLinks2 = false;
|
|
// m_spiderLinksValid = true;
|
|
// return &m_spiderLinks2;
|
|
//}
|
|
|
|
// check the xml for a meta robots tag
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
|
|
// assume true
|
|
m_spiderLinks = true;
|
|
|
|
// or if meta tag says not to
|
|
char buf1 [256];
|
|
char buf2 [256];
|
|
buf1[0] = '\0';
|
|
buf2[0] = '\0';
|
|
xml->getMetaContent ( buf1, 255 , "robots" , 6 );
|
|
xml->getMetaContent ( buf2, 255 , "gigabot", 7 );
|
|
|
|
if ( strstr ( buf1 , "nofollow" ) ||
|
|
strstr ( buf2 , "nofollow" ) ||
|
|
strstr ( buf1 , "none" ) ||
|
|
strstr ( buf2 , "none" ) )
|
|
m_spiderLinks = false;
|
|
|
|
// spider links if doing custom crawl or not using robots.txt
|
|
if ( ! m_useRobotsTxt || cr->m_isCustomCrawl )
|
|
m_spiderLinks = true;
|
|
|
|
// spider request forbade it? diffbot.cpp crawlbot api when
|
|
// specifying urldata (list of urls to add to spiderdb) usually
|
|
// they do not want the links crawled i'd imagine.
|
|
if ( m_sreqValid && m_sreq.m_avoidSpiderLinks )
|
|
m_spiderLinks = false;
|
|
|
|
|
|
// also check in url filters now too
|
|
|
|
|
|
// set shadow member
|
|
m_spiderLinks2 = m_spiderLinks;
|
|
// validate
|
|
m_spiderLinksValid = true;
|
|
return &m_spiderLinks2;
|
|
}
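// Illustrative meta tags (markup examples only) that the robots/gigabot check
// above treats as "do not harvest links from this page"; note it is a plain
// substring match on the content attribute:
//   <meta name="robots"  content="nofollow">
//   <meta name="gigabot" content="none">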
|
|
|
|
//
|
|
// . DELETE ALL SPAM FROM THE INDEX!!!
|
|
//
|
|
// . for a page to be spam these must ALL be true, with the current ip:
|
|
// . site is not in google
|
|
// . site has no "stars" in google's dir
|
|
// . site has no authorityinlink tag
|
|
// . site has less than 10 fresh inlinks
|
|
// . site has less than 500 total inlinks
|
|
// . ip is not from ultra dns
|
|
// . TODO: site is not linked to by wikipedia.com
|
|
// . TODO: site is not linked to by about.com
|
|
// . TODO: site is not linked to by a .gov site
|
|
// . the page IP address changed significantly since the last
|
|
// time we indexed it when it was not spam (if applicable)
|
|
//
|
|
// . if the page was indexed at one time and then we decided it was spam,
|
|
// and its ip changed significantly since last time, we just
|
|
// reschedule the spider rec for 15 days later and do not touch anything
|
|
// else. that way we keep the index somewhat stable.
|
|
//
|
|
|
|
/*
|
|
char *XmlDoc::getIsSpam() {
|
|
// return it if valid
|
|
if ( m_isSpamValid ) return &m_isSpam;
|
|
|
|
setStatus ("getting is spam");
|
|
|
|
// assume it is not spam
|
|
m_isSpam = false;
|
|
|
|
// debug
|
|
//logf(LOG_DEBUG,"doc: NOT SPAM!!");
|
|
//m_isSpamValid = true; return &m_isSpam;
|
|
|
|
// we disable this check for the contact doc
|
|
if ( m_spamCheckDisabled ) { m_isSpamValid = true; return &m_isSpam; }
|
|
|
|
// . i put this here for debugging purposes
|
|
// . some big sites have no easy to find contact info
|
|
// . get our domain
|
|
Url *fu = getFirstUrl();
|
|
char *dom = fu->getDomain ();
|
|
int32_t dlen = fu->getDomainLen();
|
|
if ( dlen == 12 && !strncmp(dom,"facebook.com",dlen) ) {
|
|
m_isSpamValid = true; return &m_isSpam; }
|
|
if ( dlen == 9 && !strncmp(dom,"yahoo.com",dlen) ) {
|
|
m_isSpamValid = true; return &m_isSpam; }
|
|
|
|
// get our site's tag rec
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
|
|
|
|
// are we already in the index?
|
|
//char *isIndexed = getIsIndexed();
|
|
//if (!isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
|
|
|
|
// this will update m_oldTagRec with the latest info if its stale
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
|
|
|
|
//int32_t *ip = getIp();
|
|
//if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
|
|
|
|
//XmlDoc **od = getOldXmlDoc ( );
|
|
//if ( ! od || od == (void *)-1 ) return (char *)od;
|
|
|
|
//int32_t oldIp = 0 ;
|
|
//if ( *od ) {
|
|
// int32_t *ip2 = (*od)->getIp();
|
|
// if ( ! ip2 || ip2 == (int32_t *)-1 ) return (char *)ip2;
|
|
// oldIp = *ip2;
|
|
//}
|
|
|
|
// i am guessing that most sites that use ultra dns will have a lot
|
|
// of site inlinks! so comment this our for now
|
|
//char *ultra = getIpIsUltraDns();
|
|
//if ( ultra || ultra==(char *)-1 ) return (char *)ultra;
|
|
// spammers do not use ultradns
|
|
//if ( *ultra ) return false;
|
|
|
|
Url *f = getFirstUrl();
|
|
char *u = f->getUrl();
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// this will be valid
|
|
m_isSpamValid = true;
|
|
|
|
// use this routine
|
|
m_isSpam = isSpam ( u,
|
|
gr,
|
|
now,
|
|
// *isIndexed,
|
|
//oldIp ,
|
|
// *ip ,
|
|
*hci );
|
|
|
|
// we are doomed! delete in its entirety
|
|
if ( m_isSpam ) m_indexCode = EDOCSPAM;
|
|
|
|
return &m_isSpam;
|
|
}
|
|
|
|
// . "u" must be NORMALIZED. i.e. start with http:// or https:// etc.
|
|
// . we call this on outlinks as well
|
|
// . we no longer look at the old and newip to determine ownership change,
|
|
// because that is not reliable enough
|
|
// . we now maybe rely on a major change to the site root page...
|
|
bool XmlDoc::isSpam ( char *u ,
|
|
TagRec *gr ,
|
|
int32_t now ,
|
|
char isIndexed ,
|
|
int32_t oldIp ,
|
|
int32_t newIp ,
|
|
bool hasContactInfo ) {
|
|
|
|
// we need to mine that same database that firefox does...
|
|
Tag *tag = gr->getTag ( "malware" );
|
|
if ( tag && tag->getTagData()[0] != '0' ) return true;
|
|
|
|
// if they have contact info, that is a really good sign
|
|
if ( hasContactInfo ) return false;
|
|
|
|
// .edu and .gov sites are always fine
|
|
int32_t tlen; char *tld = getTLDFast(u,&tlen);
|
|
if ( tlen == 3 && ! strncmp(tld,"edu",3) ) return false;
|
|
if ( tlen == 3 && ! strncmp(tld,"gov",3) ) return false;
|
|
|
|
// the current top ip address
|
|
//int32_t top = newIp & 0x00ffffff;
|
|
|
|
// TODO: in the case of multiple ips on one domain, ensure we select
|
|
// the same IP every time we do a lookup in MsgC.
|
|
|
|
// ok if in google
|
|
if ( gr->getTag ( "ingoogle" ) ) return false;
|
|
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
|
|
|
|
// can also be in google's dmoz dir. must have a decent page rank.
|
|
if ( gr->getTag ( "pagerank" ) ) return false;
|
|
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
|
|
|
|
// . if was linked to by a high quality root as a new external outlink
|
|
// . TODO: include about.com and wikipedia.com i guess (TODO)
|
|
if ( gr->getTag ( "authorityinlink" ) ) return false;
|
|
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
|
|
|
|
tag = gr->getTag("sitenuminlinks");
|
|
// i guess if it has no entry for this, assume the best
|
|
if ( ! tag ) return false;
|
|
// or just a massive amount of any-age inlinks
|
|
if ( atol(tag->getTagData()) >= 500 ) return false;
|
|
|
|
tag = gr->getTag("sitenuminlinksfresh");
|
|
// i guess if it has no entry for this, assume the best
|
|
if ( ! tag ) return false;
|
|
// if site has enough good FRESH inlinks from the last 3 mos, no spam
|
|
if( atol(tag->getTagData()) >= 10 ) return false;
|
|
|
|
// if we are old and the top 3 bytes of the ip is the same as the last
|
|
// time we were indexed and thereby not identified as spam...
|
|
// then assume we are still not spam! because it was unlikely that
|
|
// the domain ownership changed...
|
|
//if ( isIndexed (oldIp & 0x00ffffff) == top ) return false;
|
|
|
|
// if they have contact info, that is a really good sign
|
|
//if ( hasContactInfo && (oldIp & 0x00ffffff) == top ) return false;
|
|
|
|
// if first time... accept them if they got contact info
|
|
//if ( ! oldIp && hasContactInfo ) return false;
|
|
|
|
// . if it has had the same ip for the last 365 days, let it in
|
|
// . getTagRec() updates this tag immediately if the ip changes
|
|
// . so we can't really use this tag for outlinks, because they might
|
|
// never get thrown into spiderdb to where we can add this tag to
|
|
// their tag rec... UNLESS msgc/msge were to update their tag rec...
|
|
// . i've seen quite a few old spam sites/pages. they just kinda stay
|
|
// there. so let's not do this...
|
|
//tag = gr->get("iptimestamp");
|
|
//int32_t now;
|
|
//if ( tag ) now = getTimeGlobal();
|
|
//if(tag&&now-atol(tag->getTagData())>365*24*3600&&
|
|
// ((tag->m_ip&0x00ffffff)==top))
|
|
// return false;
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// should we index the doc? if already indexed, and is filtered, we delete it
|
|
char *XmlDoc::getIsFiltered ( ) {
|
|
if ( m_isFilteredValid ) return &m_isFiltered;
|
|
if ( m_isDiffbotJSONObject ) {
|
|
m_isFiltered = false;
|
|
m_isFilteredValid = true;
|
|
return &m_isFiltered;
|
|
}
|
|
int32_t *priority = getSpiderPriority();
|
|
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
|
|
m_isFiltered = false;
|
|
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
|
|
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
|
|
if ( *priority == -3 ) m_isFiltered = true;
|
|
m_isFilteredValid = true;
|
|
return &m_isFiltered;
|
|
}
|
|
|
|
int32_t *XmlDoc::getSpiderPriority ( ) {
|
|
if ( m_priorityValid ) return &m_priority;
|
|
setStatus ("getting spider priority");
|
|
// need tagrec to see if banned
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// this is an automatic ban!
|
|
if ( gr->getLong("manualban",0) ) {
|
|
m_priority = -3;//SPIDER_PRIORITY_BANNED;
|
|
m_priorityValid = true;
|
|
return &m_priority;
|
|
}
|
|
int32_t *ufn = getUrlFilterNum();
|
|
if ( ! ufn || ufn == (void *)-1 ) return (int32_t *)ufn;
|
|
// sanity check
|
|
if ( *ufn < 0 ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
m_priority = cr->m_spiderPriorities[*ufn];
|
|
|
|
// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
|
|
if ( cr->m_forceDelete[*ufn] ) m_priority = -3;
|
|
|
|
m_priorityValid = true;
|
|
return &m_priority;
|
|
}
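// Minimal sketch (assumed helper, not part of the build) of the decision
// above: manualBan comes from the "manualban" tag, basePriority and
// forceDelete from the matching url filter row; -3 doubles as the old
// SPIDER_PRIORITY_BANNED/FILTERED sentinel.
static int32_t spiderPrioritySketch ( bool manualBan ,
				      int32_t basePriority ,
				      bool forceDelete ) {
	if ( manualBan   ) return -3;
	if ( forceDelete ) return -3;
	return basePriority;
}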
|
|
|
|
bool XmlDoc::logIt ( SafeBuf *bb ) {
|
|
|
|
// set errCode
|
|
int32_t errCode = m_indexCode;
|
|
if ( ! errCode && g_errno ) errCode = g_errno;
|
|
|
|
// were we new?
|
|
//char isIndexed = -1;
|
|
//if ( m_isIndexedValid ) isIndexed = m_isIndexed;
|
|
bool isNew = true;
|
|
if ( m_sreqValid && m_sreq.m_hadReply ) isNew = false;
|
|
|
|
// keep track of stats
|
|
g_stats.addSpiderPoint ( errCode, isNew ); // !isIndexed );
|
|
|
|
// do not log if we should not, saves some time
|
|
//if ( ! g_conf.m_logSpideredUrls && ! m_forceDelete ) return true;
|
|
if ( ! g_conf.m_logSpideredUrls ) return true;
|
|
|
|
// patch the ip
|
|
int32_t ip = m_ip;
|
|
// invalid?
|
|
if ( ! m_ipValid ) ip = 0;
|
|
|
|
char *coll = "nuked";
|
|
CollectionRec *cr = getCollRec();
|
|
if ( cr ) coll = cr->m_coll;
|
|
|
|
SafeBuf tmpsb;
|
|
|
|
// print into this now
|
|
SafeBuf *sb = &tmpsb;
|
|
// log into provided safebuf if not null
|
|
if ( bb ) sb = bb;
|
|
|
|
//
|
|
// coll
|
|
//
|
|
sb->safePrintf("coll=%s ",coll);
|
|
sb->safePrintf("collnum=%" INT32 " ",(int32_t)m_collnum);
|
|
|
|
//
|
|
// print ip
|
|
//
|
|
if ( m_ipValid )
|
|
sb->safePrintf("ip=%s ",iptoa(m_ip) );
|
|
|
|
if ( m_firstIpValid )
|
|
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
|
|
|
|
// . first ip from spider req if it is fake
|
|
// . we end up spidering the same url twice because it will have
|
|
// different "firstips" in the SpiderRequest key. maybe just
|
|
// use domain hash instead of firstip, and then let msg13
|
|
// make queues in the case of hammering an ip, which i think
|
|
// it already does...
|
|
if ( m_sreqValid && m_sreq.m_firstIp != m_firstIp )
|
|
sb->safePrintf("fakesreqfirstip=%s ",iptoa(m_sreq.m_firstIp) );
|
|
|
|
//
|
|
// print when this spider request was added
|
|
//
|
|
//if ( m_sreqValid && m_sreq.m_addedTime ) {
|
|
// struct tm *timeStruct = gmtime ( &m_sreq.m_addedTime );
|
|
// char tmp[64];
|
|
// strftime(tmp,64,"requestadded=%b-%d-%Y(%H:%M:%S)", timeStruct);
|
|
// sb->safePrintf("%s(%" UINT32 ") ",tmp,m_sreq.m_addedTime);
|
|
//}
|
|
|
|
//
|
|
// print spidered time
|
|
//
|
|
//if ( m_spideredTimeValid ) {
|
|
time_t spideredTime = (time_t)getSpideredTime();
|
|
struct tm *timeStruct = gmtime ( &spideredTime );
|
|
char tmp[64];
|
|
strftime(tmp,64,"spidered=%b-%d-%Y(%H:%M:%S)", timeStruct );
|
|
sb->safePrintf("%s(%" UINT32 ") ",tmp,(uint32_t)spideredTime);
|
|
|
|
// when it was scheduled to be spidered
|
|
if ( m_sreqValid && m_sreq.m_addedTime ) {
|
|
time_t ts = m_sreq.m_addedTime;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
char tmp[64];
|
|
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("scheduledtime=%s(%" UINT32 ") ",
|
|
tmp,(uint32_t)m_sreq.m_addedTime);
|
|
}
|
|
|
|
// discovery date, first time spiderrequest was added to spiderdb
|
|
if ( m_sreqValid && m_sreq.m_discoveryTime ) {
|
|
time_t ts = m_sreq.m_discoveryTime;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
char tmp[64];
|
|
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("discoverydate=%s(%" UINT32 ") ",
|
|
tmp,(uint32_t)m_sreq.m_discoveryTime);
|
|
}
|
|
|
|
// print first indexed time
|
|
if ( m_firstIndexedDateValid ) {
|
|
time_t ts = m_firstIndexedDate;
|
|
timeStruct = gmtime ( &ts );//m_firstIndexedDate );
|
|
strftime(tmp,64,"firstindexed=%b-%d-%Y(%H:%M:%S)", timeStruct);
|
|
sb->safePrintf("%s(%" UINT32 ") ",tmp,
|
|
(uint32_t)m_firstIndexedDate);
|
|
}
|
|
|
|
|
|
//if ( ! m_isIndexedValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// just use the oldurlfilternum for grepping i guess
|
|
//if ( m_oldDocValid && m_oldDoc )
|
|
|
|
// when injecting a request we have no idea if it had a reply or not
|
|
if ( m_sreqValid && m_sreq.m_isInjecting )
|
|
sb->safePrintf("firsttime=? ");
|
|
else if ( m_sreqValid && m_sreq.m_hadReply )
|
|
sb->safePrintf("firsttime=0 ");
|
|
else if ( m_sreqValid )
|
|
sb->safePrintf("firsttime=1 ");
|
|
else
|
|
sb->safePrintf("firsttime=? ");
|
|
|
|
//
|
|
// print # of link texts
|
|
//
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 ) {
|
|
LinkInfo *info = ptr_linkInfo1;
|
|
int32_t nt = info->getNumLinkTexts();
|
|
sb->safePrintf("goodinlinks=%" INT32 " ",nt );
|
|
// new stuff. includes ourselves i think.
|
|
//sb->safePrintf("ipinlinks=%" INT32 " ",info->m_numUniqueIps);
|
|
//sb->safePrintf("cblockinlinks=%" INT32 " ",
|
|
//info->m_numUniqueCBlocks);
|
|
}
|
|
|
|
//
|
|
// print # of link texts from 2nd coll
|
|
//
|
|
// this is no longer used for what it was originally used for.
|
|
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
|
|
// LinkInfo *info = ptr_linkInfo2;
|
|
// int32_t nt = 0;
|
|
// if ( info ) nt = info->getNumLinkTexts();
|
|
// if ( nt ) sb->safePrintf("goodinlinks2=%" INT32 " ",nt );
|
|
// }
|
|
|
|
if ( m_docIdValid )
|
|
sb->safePrintf("docid=%" UINT64 " ",m_docId);
|
|
|
|
char *u = getFirstUrl()->getUrl();
|
|
int64_t pd = g_titledb.getProbableDocId(u);
|
|
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
|
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
|
sb->safePrintf("probdocid=%" UINT64 " ",pd);
|
|
sb->safePrintf("probdocidmin=%" UINT64 " ",d1);
|
|
sb->safePrintf("probdocidmax=%" UINT64 " ",d2);
|
|
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
|
|
|
|
|
|
if ( m_siteNumInlinksValid ) {
|
|
sb->safePrintf("siteinlinks=%04" INT32 " ",m_siteNumInlinks );
|
|
// sb->safePrintf("siteipinlinks=%" INT32 " ",
|
|
// m_siteNumInlinksUniqueIp);
|
|
// sb->safePrintf("sitecblockinlinks=%" INT32 " ",
|
|
// m_siteNumInlinksUniqueCBlock);
|
|
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
|
|
sb->safePrintf("siterank=%" INT32 " ", sr );
|
|
}
|
|
|
|
if ( m_sreqValid )
|
|
sb->safePrintf("pageinlinks=%04" INT32 " ",
|
|
m_sreq.m_pageNumInlinks);
|
|
|
|
// shortcut
|
|
int64_t uh48 = hash64b ( m_firstUrl.m_url );
|
|
// mask it
|
|
uh48 &= 0x0000ffffffffffffLL;
|
|
sb->safePrintf ("uh48=%" UINT64 " ",uh48 );
|
|
|
|
|
|
if ( m_charsetValid )
|
|
sb->safePrintf("charset=%s ",get_charset_str(m_charset));
|
|
|
|
if ( m_contentTypeValid )
|
|
sb->safePrintf("ctype=%s ",
|
|
g_contentTypeStrings [m_contentType]);
|
|
|
|
if ( m_sreqValid )
|
|
sb->safePrintf("parentlang=%02" INT32 "(%s) ",
|
|
(int32_t)m_sreq.m_parentLangId,
|
|
getLanguageAbbr(m_sreq.m_parentLangId));
|
|
|
|
if ( m_langIdValid )
|
|
sb->safePrintf("lang=%02" INT32 "(%s) ",(int32_t)m_langId,
|
|
getLanguageAbbr(m_langId));
|
|
|
|
if ( m_countryIdValid )
|
|
sb->safePrintf("country=%02" INT32 "(%s) ",(int32_t)m_countryId,
|
|
g_countryCode.getAbbr(m_countryId));
|
|
|
|
if ( m_hopCountValid )
|
|
sb->safePrintf("hopcount=%02" INT32 " ",(int32_t)m_hopCount);
|
|
|
|
|
|
if ( m_contentValid )
|
|
sb->safePrintf("contentlen=%06" INT32 " ",m_contentLen);
|
|
|
|
if ( m_isContentTruncatedValid )
|
|
sb->safePrintf("contenttruncated=%" INT32 " ",
|
|
(int32_t)m_isContentTruncated);
|
|
|
|
if ( m_robotsTxtLenValid )
|
|
sb->safePrintf("robotstxtlen=%04" INT32 " ",m_robotsTxtLen );
|
|
|
|
if ( m_isAllowedValid )
|
|
sb->safePrintf("robotsallowed=%i ", (int)m_isAllowed);
|
|
else
|
|
sb->safePrintf("robotsallowed=? " );
|
|
|
|
if ( m_contentHash32Valid )
|
|
sb->safePrintf("ch32=%010" UINT32 " ",m_contentHash32);
|
|
|
|
if ( m_domHash32Valid )
|
|
sb->safePrintf("dh32=%010" UINT32 " ",m_domHash32);
|
|
|
|
if ( m_siteHash32Valid )
|
|
sb->safePrintf("sh32=%010" UINT32 " ",m_siteHash32);
|
|
|
|
if ( m_isPermalinkValid )
|
|
sb->safePrintf("ispermalink=%" INT32 " ",(int32_t)m_isPermalink);
|
|
|
|
if ( m_isRSSValid )
|
|
sb->safePrintf("isrss=%" INT32 " ",(int32_t)m_isRSS);
|
|
|
|
if ( m_linksValid )
|
|
sb->safePrintf("hasrssoutlink=%" INT32 " ",
|
|
(int32_t)m_links.hasRSSOutlink() );
|
|
|
|
if ( m_numOutlinksAddedValid ) {
|
|
sb->safePrintf("outlinksadded=%04" INT32 " ",
|
|
(int32_t)m_numOutlinksAdded);
|
|
sb->safePrintf("outlinksaddedfromsamedomain=%04" INT32 " ",
|
|
(int32_t)m_numOutlinksAddedFromSameDomain);
|
|
}
|
|
|
|
if ( m_metaListValid )
|
|
sb->safePrintf("addlistsize=%05" INT32 " ",
|
|
(int32_t)m_metaListSize);
|
|
else
|
|
sb->safePrintf("addlistsize=%05" INT32 " ",(int32_t)0);
|
|
|
|
if ( m_addedSpiderRequestSizeValid )
|
|
sb->safePrintf("addspiderreqsize=%05" INT32 " ",
|
|
m_addedSpiderRequestSize);
|
|
else
|
|
sb->safePrintf("addspiderreqsize=%05" INT32 " ",0);
|
|
|
|
|
|
if ( m_addedSpiderReplySizeValid )
|
|
sb->safePrintf("addspiderrepsize=%05" INT32 " ",
|
|
m_addedSpiderReplySize);
|
|
else
|
|
sb->safePrintf("addspiderrepsize=%05" INT32 " ",0);
|
|
|
|
|
|
if ( m_addedStatusDocSizeValid ) {
|
|
sb->safePrintf("addstatusdocsize=%05" INT32 " ",
|
|
m_addedStatusDocSize);
|
|
sb->safePrintf("addstatusdocid=%" UINT64 " ",
|
|
m_addedStatusDocId);
|
|
}
|
|
else {
|
|
sb->safePrintf("addstatusdocsize=%05" INT32 " ",0);
|
|
sb->safePrintf("addstatusdocid=0 ");
|
|
}
|
|
|
|
|
|
if ( m_useSecondaryRdbs ) {
|
|
sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
|
|
sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
|
|
sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
|
|
sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
|
|
sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
|
|
if ( cr )
|
|
sb->safePrintf("indexspiderreplies=%i ",(int)
|
|
cr->m_indexSpiderReplies);
|
|
}
|
|
|
|
if ( size_imageData && m_imageDataValid ) {
|
|
// url is in data now
|
|
ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
|
|
int32_t nt = ta->getNumThumbnails();
|
|
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
|
|
sb->safePrintf("thumbnail=%s,%" INT32 "bytes,%" INT32 "x%" INT32 ",(%" INT32 ") ",
|
|
ti->getUrl(),
|
|
ti->m_dataSize,
|
|
ti->m_dx,
|
|
ti->m_dy,
|
|
nt);
|
|
}
|
|
else
|
|
sb->safePrintf("thumbnail=none ");
|
|
|
|
|
|
/*
|
|
if ( m_hasAddressValid && m_addressesValid )
|
|
sb->safePrintf("numaddr=%" INT32 " ",(int32_t)m_addresses.m_numValid);
|
|
|
|
//if ( m_skipIndexingValid )
|
|
// sb->safePrintf("skipindexing=%" INT32 " ",(int32_t)m_skipIndexing);
|
|
|
|
if ( m_hasTODValid )
|
|
sb->safePrintf("hastod=%" INT32 " ",(int32_t)m_hasTOD);
|
|
*/
|
|
|
|
// get the content type
|
|
uint8_t ct = CT_UNKNOWN;
|
|
if ( m_contentTypeValid ) ct = m_contentType;
|
|
|
|
bool isRoot = false;
|
|
if ( m_isSiteRootValid ) isRoot = m_isSiteRoot;
|
|
|
|
// make sure m_minInlinkerHopCount is valid
|
|
LinkInfo *info1 = NULL;
|
|
if ( m_linkInfo1Valid ) info1 = ptr_linkInfo1;
|
|
|
|
//bool isContacty = getIsContacty(&m_firstUrl,
|
|
// info1,
|
|
// m_hopCount ,
|
|
// ct , // contentType
|
|
// isRoot ,
|
|
// m_niceness );
|
|
/*
|
|
// just use this now
|
|
if ( m_hasContactInfoValid )
|
|
sb->safePrintf("iscontacty=%" INT32 " ",(int32_t)m_hasContactInfo);
|
|
|
|
if ( m_hasSiteVenueValid )
|
|
sb->safePrintf("hassitevenue=%" INT32 " ",(int32_t)m_hasSiteVenue);
|
|
*/
|
|
|
|
// hack this kinda
|
|
// . in PageInject.cpp we do not have a valid priority without
|
|
// blocking because we did a direct injection!
|
|
// so ignore this!!
|
|
// . a diffbot json object, an xmldoc we set from a json object
|
|
// in a diffbot reply, is a child doc (m_isChildDoc is true)
|
|
// and does not have a spider priority. only the parent doc
|
|
// that we used to get the diffbot reply (array of json objects)
|
|
// will have the spider priority
|
|
if ( ! getIsInjecting() && ! m_isDiffbotJSONObject ) {
|
|
//int32_t *priority = getSpiderPriority();
|
|
//if ( ! priority ||priority==(void *)-1){char *xx=NULL;*xx=0;}
|
|
if ( m_priorityValid )
|
|
sb->safePrintf("priority=%" INT32 " ",
|
|
(int32_t)m_priority);
|
|
}
|
|
|
|
// should be valid since we call getSpiderPriority()
|
|
if ( m_urlFilterNumValid )
|
|
sb->safePrintf("urlfilternum=%" INT32 " ",(int32_t)m_urlFilterNum);
|
|
|
|
|
|
if ( m_diffbotApiUrlValid &&
|
|
m_diffbotApiUrl.getBufStart() &&
|
|
m_diffbotApiUrl.getBufStart()[0] )
|
|
sb->safePrintf("diffbotjsonobjects=%" INT32 " ",
|
|
(int32_t)m_diffbotJSONCount);
|
|
|
|
if ( m_diffbotReplyValid )
|
|
sb->safePrintf("diffboterror=%" INT32 " ",m_diffbotReplyError);
|
|
|
|
if ( m_siteValid )
|
|
sb->safePrintf("site=%s ",ptr_site);
|
|
|
|
if ( m_isSiteRootValid )
|
|
sb->safePrintf("siteroot=%" INT32 " ",m_isSiteRoot );
|
|
else
|
|
sb->safePrintf("siteroot=? ");
|
|
|
|
// like how we index it, do not include the filename. so we can
|
|
// have a bunch of pathdepth 0 urls with filenames like xyz.com/abc.htm
|
|
if ( m_firstUrlValid ) {
|
|
int32_t pd = -1;
|
|
// fix core
|
|
if ( m_firstUrl.m_url &&
|
|
m_firstUrl.m_ulen > 0 &&
|
|
m_firstUrl.m_path )
|
|
pd = m_firstUrl.getPathDepth(false);
|
|
sb->safePrintf("pathdepth=%" INT32 " ",pd);
|
|
}
|
|
else {
|
|
sb->safePrintf("pathdepth=? ");
|
|
}
|
|
|
|
//
|
|
// . sometimes we print these, sometimes we do not
|
|
// . put this at the end so we can awk out the above fields reliably
|
|
//
|
|
|
|
// print when it was last spidered
|
|
if ( m_oldDocValid && m_oldDoc ) {
|
|
time_t spideredTime = m_oldDoc->getSpideredTime();
|
|
struct tm *timeStruct = gmtime ( &spideredTime );
|
|
char tmp[64];
|
|
strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
|
|
sb->safePrintf("%s(%" UINT32 ") ", tmp,(uint32_t)spideredTime);
|
|
}
|
|
|
|
// print new pubdate
|
|
if ( m_pubDateValid && m_pubDate!=(uint32_t)-1 && m_pubDate!=0 ) {
|
|
char tmp[64];
|
|
time_t ts = (time_t)m_pubDate;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("pubdate=%s ", tmp );
|
|
}
|
|
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 && ptr_linkInfo1->hasRSSItem())
|
|
sb->safePrintf("hasrssitem=1 ");
|
|
|
|
// was the content itself injected?
|
|
if ( m_wasContentInjected )
|
|
sb->safePrintf("contentinjected=1 ");
|
|
else
|
|
sb->safePrintf("contentinjected=0 ");
|
|
|
|
// might have just injected the url and downloaded the content?
|
|
if ( (m_sreqValid && m_sreq.m_isInjecting) ||
|
|
(m_isInjecting && m_isInjectingValid) )
|
|
sb->safePrintf("urlinjected=1 ");
|
|
else
|
|
sb->safePrintf("urlinjected=0 ");
|
|
|
|
if ( m_sreqValid && m_sreq.m_isAddUrl )
|
|
sb->safePrintf("isaddurl=1 ");
|
|
else
|
|
sb->safePrintf("isaddurl=0 ");
|
|
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex )
|
|
sb->safePrintf("pagereindex=1 ");
|
|
|
|
if ( m_spiderLinksValid && m_spiderLinks )
|
|
sb->safePrintf("spiderlinks=1 ");
|
|
if ( m_spiderLinksValid && ! m_spiderLinks )
|
|
sb->safePrintf("spiderlinks=0 ");
|
|
|
|
|
|
if ( m_crawlDelayValid && m_crawlDelay != -1 )
|
|
sb->safePrintf("crawldelayms=%" INT32 " ",(int32_t)m_crawlDelay);
|
|
|
|
if ( m_recycleContent )
|
|
sb->safePrintf("recycleContent=1 ");
|
|
|
|
if ( m_exactContentHash64Valid )
|
|
sb->safePrintf("exactcontenthash=%" UINT64 " ",
|
|
m_exactContentHash64 );
|
|
|
|
// . print percent changed
|
|
// . only print if non-zero!
|
|
if ( m_percentChangedValid && m_oldDocValid && m_oldDoc &&
|
|
m_percentChanged )
|
|
sb->safePrintf("changed=%.00f%% ",m_percentChanged);
|
|
|
|
// only print if different now! good for grepping changes
|
|
if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_docId != m_docId )
|
|
sb->safePrintf("olddocid=%" UINT64 " ",m_oldDoc->m_docId);
|
|
|
|
// only print if different now! good for grepping changes
|
|
if ( m_sreqValid && m_sreq.m_ufn >= 0 &&
|
|
m_sreq.m_ufn != m_urlFilterNum )
|
|
sb->safePrintf("oldurlfilternum=%" INT32 " ",
|
|
(int32_t)m_sreq.m_ufn);
|
|
|
|
if ( m_sreqValid && m_sreq.m_priority >= 0 &&
|
|
m_sreq.m_priority != m_priority )
|
|
sb->safePrintf("oldpriority=%" INT32 " ",
|
|
(int32_t)m_sreq.m_priority);
|
|
|
|
if ( m_oldDoc && m_oldDoc->m_langIdValid &&
|
|
m_oldDoc->m_langId != m_langId )
|
|
sb->safePrintf("oldlang=%02" INT32 "(%s) ",(int32_t)m_oldDoc->m_langId,
|
|
getLanguageAbbr(m_oldDoc->m_langId));
|
|
|
|
if ( m_useSecondaryRdbs &&
|
|
m_useTitledb &&
|
|
m_logLangId != m_langId )
|
|
sb->safePrintf("oldlang=%02" INT32 "(%s) ",(int32_t)m_logLangId,
|
|
getLanguageAbbr(m_logLangId));
|
|
|
|
if ( m_useSecondaryRdbs &&
|
|
m_useTitledb &&
|
|
m_logSiteNumInlinks != m_siteNumInlinks )
|
|
sb->safePrintf("oldsiteinlinks=%04" INT32 " ",m_logSiteNumInlinks);
|
|
|
|
if ( m_useSecondaryRdbs &&
|
|
m_useTitledb &&
|
|
m_oldDocValid &&
|
|
m_oldDoc &&
|
|
strcmp(ptr_site,m_oldDoc->ptr_site) )
|
|
sb->safePrintf("oldsite=%s ",m_oldDoc->ptr_site);
|
|
|
|
// . print old pubdate
|
|
// . -1 means unsupported, 0 means could not find one
|
|
// . only print if different now! good for grepping changes
|
|
if ( m_oldDocValid && m_oldDoc &&
|
|
m_oldDoc->m_pubDate!= (uint32_t)-1 &&
|
|
m_oldDoc->m_pubDate !=0 &&
|
|
m_oldDoc->m_pubDate != m_pubDate ) {
|
|
char tmp[64];
|
|
time_t ts = m_oldDoc->m_pubDate;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("oldpubdate=%s ",tmp );
|
|
}
|
|
|
|
if ( m_isAdultValid )
|
|
sb->safePrintf("isadult=%" INT32 " ",(int32_t)m_isAdult);
|
|
|
|
// only print if different now! good for grepping changes
|
|
if ( m_oldDocValid && m_oldDoc &&
|
|
m_oldDoc->m_siteNumInlinks >= 0 &&
|
|
m_oldDoc->m_siteNumInlinks != m_siteNumInlinks ) {
|
|
int32_t sni = -1;
|
|
if ( m_oldDoc ) sni = m_oldDoc->m_siteNumInlinks;
|
|
sb->safePrintf("oldsiteinlinks=%04" INT32 " ",sni);
|
|
}
|
|
|
|
|
|
// Spider.cpp sets m_sreq.m_errCount before adding it to doledb
|
|
if ( m_sreqValid ) // && m_sreq.m_errCount )
|
|
sb->safePrintf("errcnt=%" INT32 " ",(int32_t)m_sreq.m_errCount );
|
|
else
|
|
sb->safePrintf("errcnt=? ");
|
|
|
|
if ( ptr_redirUrl ) { // m_redirUrlValid && m_redirUrlPtr ) {
|
|
sb->safePrintf("redir=%s ",ptr_redirUrl);//m_redirUrl.getUrl());
|
|
if ( m_numRedirects > 2 )
|
|
sb->safePrintf("numredirs=%" INT32 " ",m_numRedirects);
|
|
}
|
|
|
|
if ( m_canonicalRedirUrlValid && m_canonicalRedirUrlPtr )
|
|
sb->safePrintf("canonredir=%s ",
|
|
m_canonicalRedirUrlPtr->getUrl());
|
|
|
|
if ( m_httpStatusValid && m_httpStatus != 200 )
|
|
sb->safePrintf("httpstatus=%" INT32 " ",(int32_t)m_httpStatus);
|
|
|
|
if ( m_updatedMetaData )
|
|
sb->safePrintf("updatedmetadata=1 ");
|
|
|
|
if ( m_isDupValid && m_isDup )
|
|
sb->safePrintf("dupofdocid=%" INT64 " ",m_docIdWeAreADupOf);
|
|
|
|
if ( m_firstUrlValid )
|
|
sb->safePrintf("url=%s ",m_firstUrl.m_url);
|
|
else
|
|
sb->safePrintf("urldocid=%" INT64 " ",m_docId);
|
|
|
|
//
|
|
// print error/status
|
|
//
|
|
sb->safePrintf(": %s",mstrerror(m_indexCode));
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// if safebuf provided, do not log to log
|
|
if ( bb ) return true;
|
|
|
|
// log it out
|
|
logf ( LOG_INFO ,
|
|
"build: %s",
|
|
//getFirstUrl()->getUrl(),
|
|
sb->getBufStart() );
|
|
|
|
return true;
|
|
}
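// A line logged above looks roughly like this (fields abbreviated and values
// invented for illustration; the exact set of fields depends on which members
// were valid at log time):
//   build: coll=main collnum=0 ip=10.5.66.8 firstip=10.5.66.8
//     spidered=Feb-03-2014(18:10:02)(1391451002) firsttime=1 goodinlinks=2
//     docid=123456789 ... url=http://example.com/ : Success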
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . make sure that the title rec we generated creates the exact same
|
|
// meta list as what we got
|
|
bool XmlDoc::doConsistencyTest ( bool forceTest ) {
|
|
|
|
// skip for now, it was coring on a json doc test
|
|
return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr )
|
|
return true;
|
|
|
|
if ( ! m_doConsistencyTesting && strcmp(cr->m_coll,"qatest123") != 0 )
|
|
return true;
|
|
|
|
// if we had an old doc then our meta list will have removed
|
|
// stuff already in the database from indexing the old doc.
|
|
// so it will fail the parsing consistency check... because of
|
|
// the 'incremental indexing' algo above
|
|
// disable for now... just a second for testing cheatcc.com
|
|
if ( m_oldDoc && m_oldDocValid && g_conf.m_doIncrementalUpdating )
|
|
return true;
|
|
|
|
// if not test coll skip this
|
|
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
|
|
|
// title rec is null if we are reindexing an old doc
|
|
// and "unchanged" was true.
|
|
if ( m_unchangedValid && m_unchanged ) {
|
|
if ( ! m_titleRecBufValid ) return true;
|
|
if ( m_titleRecBuf.length()==0 ) return true;
|
|
}
|
|
|
|
// leave this uncommented so we can see if we are doing it
|
|
setStatus ( "doing consistency check" );
|
|
|
|
// log debug
|
|
log("spider: doing consistency check for %s",ptr_firstUrl);
|
|
|
|
// . set another doc from that title rec
|
|
// . do not keep on stack since so huge!
|
|
XmlDoc *doc ;
|
|
try { doc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return false;
|
|
}
|
|
mnew ( doc , sizeof(XmlDoc),"xmldcs");
|
|
|
|
|
|
if ( ! doc->set2 ( m_titleRecBuf.getBufStart() ,
|
|
-1 , cr->m_coll , NULL , m_niceness ,
|
|
// no we provide the same SpiderRequest so that
|
|
// it can add the same SpiderReply to the metaList
|
|
&m_sreq ) ) {
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
return false;
|
|
}
|
|
|
|
// . some hacks
|
|
// . do not look up title rec in titledb, assume it is new
|
|
doc->m_isIndexed = false;
|
|
doc->m_isIndexedValid = true;
|
|
|
|
// so we don't core in getRevisedSpiderRequest()
|
|
doc->m_firstIp = m_firstIp;
|
|
doc->m_firstIpValid = true;
|
|
|
|
// inherit this doc's tag rec since it has not called updateTagdb() yet
|
|
//doc->ptr_tagRecData = ptr_tagRecData;
|
|
//doc->size_tagRecData = size_tagRecData;
|
|
|
|
// getNewSpiderReply() calls getDownloadEndTime() which is not valid
|
|
// and causes the page to be re-downloaded, so stop that..!
|
|
doc->m_downloadEndTime = m_downloadEndTime;
|
|
doc->m_downloadEndTimeValid = true;
|
|
|
|
// inherit doledb key as well to avoid a core there
|
|
doc->m_doledbKey = m_doledbKey;
|
|
|
|
// skip the robots.txt lookup! that was causing this to block!
|
|
//doc->m_isAllowed = true;
|
|
//doc->m_isAllowedValid = true;
|
|
|
|
// do not get outlink info for this, that stuff is for adding outlinks
|
|
// to spiderdb, and tagdb may have changed. so we can't really compare
|
|
// spider recs! if this is false then the call to doc->getMetaList()
|
|
// blocks to lookup the tagdb and titledb recs for each outlink!
|
|
// therefore, set it to true!
|
|
//doc->m_isInjecting = true;
|
|
// mdw: shouldn't this have the same effect?
|
|
//doc->m_spiderLinks2 = false;
|
|
//doc->m_spiderLinksValid = true;
|
|
|
|
// flag it
|
|
doc->m_doingConsistencyCheck = true;
|
|
|
|
// go get its metalist. rv = return value
|
|
char *rv = doc->getMetaList ( );
|
|
|
|
// sanity check - compare urls
|
|
if ( doc->m_firstUrl.m_ulen != m_firstUrl.m_ulen){char *xx=NULL;*xx=0;}
|
|
|
|
// error setting it?
|
|
if ( ! rv ) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// free it
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
// error
|
|
return false;
|
|
}
|
|
// blocked? that is not allowed
|
|
if ( rv == (void *)-1 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// compare with the old list
|
|
char *list1 = m_metaList;
|
|
int32_t listSize1 = m_metaListSize;
|
|
|
|
char *list2 = doc->m_metaList;
|
|
int32_t listSize2 = doc->m_metaListSize;
|
|
|
|
|
|
// show it for now
|
|
//log("build: printing meta list 1");
|
|
//printMetaList(list1,list1+listSize1,NULL);
|
|
//log("build: printing meta list 2");
|
|
//printMetaList(list2,list2+listSize2,NULL);
|
|
|
|
|
|
// do a compare
|
|
HashTableX ht1;
|
|
HashTableX ht2;
|
|
|
|
ht1.set ( sizeof(key224_t),sizeof(char *),
|
|
262144,NULL,0,false,m_niceness,"xmlht1");
|
|
ht2.set ( sizeof(key224_t),sizeof(char *),
|
|
262144,NULL,0,false,m_niceness,"xmlht2");
|
|
|
|
// format of a metalist... see XmlDoc::addTable() where it adds keys
|
|
// from a table into the metalist
|
|
// <nosplitflag|rdbId><key><dataSize><data>
|
|
// where nosplitflag is 0x80
|
|
char *p1 = list1;
|
|
char *p2 = list2;
|
|
char *pend1 = list1 + listSize1;
|
|
char *pend2 = list2 + listSize2;
|
|
|
|
// see if each key in list1 is in list2
|
|
if ( ! hashMetaList ( &ht1 , p1 , pend1 , false ) ) {
|
|
char *xx=NULL;*xx=0;
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
return log("doc: failed consistency test for %s",ptr_firstUrl);
|
|
}
|
|
if ( ! hashMetaList ( &ht2 , p2 , pend2 , false ) ) {
|
|
char *xx=NULL;*xx=0;
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
return log("doc: failed consistency test for %s",ptr_firstUrl);
|
|
}
|
|
|
|
// . now make sure each list matches the other
|
|
// . first scan the guys in "p1" and make sure in "ht2"
|
|
hashMetaList ( &ht2 , p1 , pend1 , true );
|
|
// . second scan the guys in "p2" and make sure in "ht1"
|
|
hashMetaList ( &ht1 , p2 , pend2 , true );
|
|
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
|
|
log ("spider: passed consistency test for %s",ptr_firstUrl );
|
|
|
|
// no serious error, although there might be an inconsistency
|
|
return true;
|
|
}
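// Layout of one metalist record, as described by the
// "<nosplitflag|rdbId><key><dataSize><data>" note above and walked by
// printMetaList() below:
//   byte 0        : flags|rdbId (low 7 bits = rdbId, 0x80 was the nosplit flag)
//   next ks bytes : the key, where ks = getKeySizeFromRdbId(rdbId)
//   next 4 bytes  : dataSize, present only when the rdb's data size is variable
//   remainder     : the data payload (absent for negative/delete keys)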
|
|
|
|
int32_t XmlDoc::printMetaList ( ) {
|
|
|
|
SafeBuf sb;
|
|
printMetaList ( m_metaList ,
|
|
m_metaList + m_metaListSize ,
|
|
&sb );
|
|
fprintf(stderr,"%s\n",sb.getBufStart());
|
|
return 0;
|
|
}
|
|
|
|
|
|
#define TABLE_ROWS 25
|
|
|
|
// print this also for page parser output!
|
|
void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
|
|
|
|
verifyMetaList ( p , pend , false );
|
|
|
|
SafeBuf tmp;
|
|
if ( ! sb ) sb = &tmp;
|
|
|
|
char *hdr =
|
|
"<table border=1>\n"
|
|
"<tr>"
|
|
"<td><b>rdb</b></td>"
|
|
"<td><b>del?</b></td>"
|
|
"<td><b>shardByTermId?</b></td>"
|
|
// illustrates key size
|
|
"<td><b>key</b></td>"
|
|
// break it down. based on rdb, of course.
|
|
"<td><b>desc</b></td>"
|
|
"</tr>\n" ;
|
|
|
|
sb->safePrintf("%s",hdr);
|
|
|
|
int32_t recSize = 0;
|
|
int32_t rcount = 0;
|
|
for ( ; p < pend ; p += recSize ) {
|
|
// get rdbid
|
|
uint8_t rdbId = *p & 0x7f;
|
|
// skip
|
|
p++;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// point to it
|
|
char *rec = p;
|
|
// init this
|
|
int32_t recSize = ks;
|
|
// convert into a key128_t, the biggest possible key
|
|
//key224_t k ;
|
|
char k[MAX_KEY_BYTES];
|
|
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
|
|
//k.setMin();
|
|
gbmemcpy ( &k , p , ks );
|
|
// is it a negative key?
|
|
char neg = false;
|
|
if ( ! ( p[0] & 0x01 ) ) neg = true;
|
|
// this is now a bit in the posdb key so we can rebalance
|
|
char shardByTermId = false;
|
|
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
|
|
shardByTermId = true;
|
|
// skip it
|
|
p += ks;
|
|
// get datasize
|
|
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// . always zero if key is negative
|
|
// . this is not the case unfortunately...
|
|
if ( neg ) dataSize = 0;
|
|
// if -1, read it in
|
|
if ( dataSize == -1 ) {
|
|
dataSize = *(int32_t *)p;
|
|
// inc this
|
|
recSize += 4;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
}
|
|
// point to it
|
|
char *data = p;
|
|
// skip the data
|
|
p += dataSize;
|
|
// inc it
|
|
recSize += dataSize;
|
|
// NULL it for negative keys
|
|
if ( dataSize == 0 ) data = NULL;
|
|
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS) == 0 )
|
|
sb->safePrintf("<!--ignore--></table>%s",hdr);
|
|
|
|
|
|
//if ( rdbId != RDB_LINKDB ) continue;
|
|
|
|
// print dbname
|
|
sb->safePrintf("<tr>");
|
|
char *dn = getDbnameFromId ( rdbId );
|
|
sb->safePrintf("<td>%s</td>",dn);
|
|
|
|
if ( neg ) sb->safePrintf("<td>D</td>");
|
|
else sb->safePrintf("<td> </td>");
|
|
|
|
if ( shardByTermId ) sb->safePrintf("<td>shardByTermId</td>");
|
|
else sb->safePrintf("<td> </td>");
|
|
|
|
sb->safePrintf("<td><nobr>%s</nobr></td>", KEYSTR(k,ks));
|
|
|
|
|
|
|
|
if ( rdbId == RDB_POSDB ) {
|
|
// get termid et al
|
|
key144_t *k2 = (key144_t *)k;
|
|
int64_t tid = g_posdb.getTermId(k2);
|
|
//uint8_t score8 = g_posdb.getScore ( *k2 );
|
|
//uint32_t score32 = score8to32 ( score8 );
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
"termId=%020" UINT64 " "
|
|
//"score8=%03" UINT32 " "
|
|
//"score32=%010" UINT32 ""
|
|
"</td>"
|
|
,(uint64_t)tid
|
|
//(int32_t)score8,
|
|
//(int32_t)score32
|
|
);
|
|
}
|
|
else if ( rdbId == RDB_DATEDB ) {
|
|
// get termid et al
|
|
key128_t *k2 = (key128_t *)k;
|
|
int64_t tid = g_datedb.getTermId(k2);
|
|
// use indexdb's function for this. should be the same
|
|
uint8_t score8 = g_indexdb.getScore ( (char *)k );
|
|
int32_t date = g_datedb.getDate ( k2 );
|
|
uint32_t score32 = score8to32 ( score8 );
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
"termId=%020" UINT64 " "
|
|
"date=%010" UINT32 " "
|
|
"score8=%03" UINT32 " "
|
|
"score32=%010" UINT32 ""
|
|
"</td>",
|
|
tid,
|
|
date,
|
|
(int32_t)score8,
|
|
(int32_t)score32);
|
|
}
|
|
// key parsing logic from Sections.cpp::gotSectiondbList()
|
|
else if ( rdbId == RDB_SECTIONDB ) {
|
|
key128_t *k2 = (key128_t *)k;
|
|
int32_t secType = g_indexdb.getScore ( (char *)k2);
|
|
int32_t tagHash = g_datedb.getDate ( k2 );
|
|
int64_t tid = g_datedb.getTermId(k2);
|
|
int64_t siteHash = tid; // not quite 64 bits
|
|
SectionVote *sv = (SectionVote *)data;
|
|
char *dd = "tagHash32";
|
|
if ( secType == SV_TAGCONTENTHASH )
|
|
dd ="tagcontentHash32";
|
|
if ( secType == SV_TAGPAIRHASH )
|
|
dd = "tagPairHash32";
|
|
// sanity check
|
|
int32_t ds = sizeof(SectionVote);
|
|
if (!neg&&dataSize!=ds){char*xx=NULL;*xx=0;}
|
|
if ( neg&&dataSize!=0 ){char*xx=NULL;*xx=0;}
|
|
float score = 0.0;
|
|
float numSampled = 0.0;
|
|
if ( data ) {
|
|
score = sv->m_score;
|
|
numSampled = sv->m_numSampled;
|
|
}
|
|
sb->safePrintf("<td>"
|
|
"<nobr>"
|
|
"siteHash48=0x%016" XINT64 " "
|
|
"%s=0x%08" XINT32 " "
|
|
"secType=%s "
|
|
"score=%.02f "
|
|
"numSampled=%.02f"
|
|
"</nobr>"
|
|
"</td>",
|
|
siteHash,
|
|
dd,tagHash,
|
|
getSectionTypeAsStr(secType),
|
|
score,
|
|
numSampled);
|
|
}
|
|
else if ( rdbId == RDB_LINKDB ) {
|
|
key224_t *k2 = (key224_t *)k;
|
|
int64_t linkHash=g_linkdb.getLinkeeUrlHash64_uk(k2);
|
|
int32_t linkeeSiteHash = g_linkdb.getLinkeeSiteHash32_uk(k2);
|
|
int32_t linkerSiteHash = g_linkdb.getLinkerSiteHash32_uk(k2);
|
|
char linkSpam = g_linkdb.isLinkSpam_uk (k2);
|
|
int32_t siteRank = g_linkdb.getLinkerSiteRank_uk (k2);
|
|
//int32_t hopCount = g_linkdb.getLinkerHopCount_uk (k2);
|
|
//int32_t ip24 = g_linkdb.getLinkerIp24_uk (k2);
|
|
int32_t ip32 = g_linkdb.getLinkerIp_uk (k2);
|
|
int64_t docId = g_linkdb.getLinkerDocId_uk (k2);
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
"<nobr>"
|
|
"linkeeSiteHash32=0x%08" XINT32 " "
|
|
"linkeeUrlHash=0x%016" XINT64 " "
|
|
"linkSpam=%" INT32 " "
|
|
"siteRank=%" INT32 " "
|
|
//"hopCount=%03" INT32 " "
|
|
"sitehash32=0x%" XINT32 " "
|
|
"IP32=%s "
|
|
"docId=%" UINT64 ""
|
|
"</nobr>"
|
|
"</td>",
|
|
linkeeSiteHash,
|
|
linkHash,
|
|
(int32_t)linkSpam,
|
|
siteRank,
|
|
//hopCount,
|
|
linkerSiteHash,
|
|
iptoa(ip32),
|
|
docId);
|
|
|
|
}
|
|
else if ( rdbId == RDB_CLUSTERDB ) {
|
|
key128_t *k2 = (key128_t *)k;
|
|
char *r = (char *)k2;
|
|
int32_t siteHash26 = g_clusterdb.getSiteHash26 ( r );
|
|
char lang = g_clusterdb.getLanguage ( r );
|
|
int64_t docId = g_clusterdb.getDocId ( r );
|
|
char ff = g_clusterdb.getFamilyFilter ( r );
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
// 26 bit site hash
|
|
"siteHash26=0x%08" XINT32 " "
|
|
"family=%" INT32 " "
|
|
"lang=%03" INT32 " "
|
|
"docId=%" UINT64 ""
|
|
"</td>",
|
|
siteHash26 ,
|
|
(int32_t)ff,
|
|
(int32_t)lang,
|
|
docId );
|
|
}
|
|
// key parsing logic taken from Address::makePlacedbKey
|
|
else if ( rdbId == RDB_PLACEDB ) {
|
|
key128_t *k2 = (key128_t *)k;
|
|
int64_t bigHash = g_placedb.getBigHash ( k2 );
|
|
int64_t docId = g_placedb.getDocId ( k2 );
|
|
int32_t snh = g_placedb.getStreetNumHash ( k2 );
|
|
//int32_t smallHash = g_placedb.getSmallHash ( k2 );
|
|
// sanity check
|
|
if(!neg &&dataSize<=0){char*xx=NULL;*xx=0;}
|
|
if( neg &&dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td><nobr>"
|
|
"bigHash64=0x%016" XINT64 " "
|
|
"docId=%" UINT64 " "
|
|
"streetNumHash25=0x%08" XINT32 " "
|
|
"dataSize=%010" INT32 " "
|
|
"address=%s"
|
|
"</nobr>"
|
|
"</td>",
|
|
bigHash,
|
|
docId,
|
|
snh,
|
|
dataSize ,
|
|
data );
|
|
}
|
|
// key parsing logic taken from Address::makePlacedbKey
|
|
else if ( rdbId == RDB_SPIDERDB ) {
|
|
sb->safePrintf("<td><nobr>");
|
|
key128_t *k2 = (key128_t *)k;
|
|
if ( g_spiderdb.isSpiderRequest(k2) ) {
|
|
SpiderRequest *sreq = (SpiderRequest *)rec;
|
|
sreq->print ( sb );
|
|
}
|
|
else {
|
|
SpiderReply *srep = (SpiderReply *)rec;
|
|
srep->print ( sb );
|
|
}
|
|
sb->safePrintf("</nobr></td>");
|
|
}
|
|
else if ( rdbId == RDB_DOLEDB ) {
|
|
key_t *k2 = (key_t *)k;
|
|
sb->safePrintf("<td><nobr>");
|
|
sb->safePrintf("priority=%" INT32 " "
|
|
"spidertime=%" UINT32 " "
|
|
"uh48=%" XINT64 " "
|
|
"isdel=%" INT32 "",
|
|
g_doledb.getPriority(k2),
|
|
(uint32_t)g_doledb.getSpiderTime(k2),
|
|
g_doledb.getUrlHash48(k2),
|
|
g_doledb.getIsDel(k2));
|
|
sb->safePrintf("</nobr></td>");
|
|
}
|
|
else if ( rdbId == RDB_TITLEDB ) {
|
|
//XmlDoc tr;
|
|
//SafeBuf tmp;
|
|
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
|
|
// print each offset and size for the variable crap
|
|
sb->safePrintf("<td><nobr>titlerec datasize=%" INT32 " "
|
|
//"sizeofxmldoc=%" INT32 " "
|
|
//"hdrSize=%" INT32 " "
|
|
//"version=%" INT32 " "
|
|
//"%s"
|
|
"</nobr></td>",
|
|
dataSize
|
|
//(int32_t)sizeof(XmlDoc),
|
|
//(int32_t)tr.m_headerSize,
|
|
//(int32_t)tr.m_version,
|
|
//tmp.getBufStart());
|
|
);
|
|
}
|
|
//else if ( rdbId == RDB_REVDB ) {
|
|
// sb->safePrintf("<td><nobr>revdb datasize=%" INT32 " ",
|
|
// dataSize);
|
|
//}
|
|
else if ( rdbId == RDB_TAGDB ) {
|
|
Tag *tag = (Tag *)rec;
|
|
sb->safePrintf("<td><nobr>");
|
|
if ( rec[0] & 0x01 ) tag->printToBuf(sb);
|
|
else sb->safePrintf("negativeTagKey");
|
|
sb->safePrintf("</nobr></td>");
|
|
}
|
|
else {
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// close it up
|
|
sb->safePrintf("</tr>\n");
|
|
|
|
/*
|
|
// hash the data into a int32_t for hash table
|
|
char *ns = "no";
|
|
if ( noSplit ) ns = "yes";
|
|
char *del = "";
|
|
if ( neg ) del = " (delete)";
|
|
|
|
if ( ks==12 ) {
|
|
key_t *k2 = (key_t *)k;
|
|
int64_t tid = g_indexdb.getTermId(k2);
|
|
uint8_t score8 = g_indexdb.getScore ( *k2 );
|
|
uint32_t score32 = score8to32 ( score8 );
|
|
log("build: key #%" INT32 " rdb=%s ks=%" INT32 " ds=%" INT32 " "
|
|
"tid=%" UINT64 " score8=%" UINT32 " score32=%" UINT32 " nosplit=%s%s",
|
|
count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,tid ,(int32_t)score8,(int32_t)score32,
|
|
ns,del);
|
|
}
|
|
else {
|
|
log("build: key #%" INT32 " rdb=%s ks=%" INT32 " ds=%" INT32 " "
|
|
"nosplit=%s%s",
|
|
count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,ns,del);
|
|
}
|
|
*/
|
|
|
|
}
|
|
sb->safePrintf("</table>\n");
|
|
|
|
if ( sb == &tmp )
|
|
sb->print();
|
|
}
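// . quick sketch of the record layout the loop above decodes (and that
//   verifyMetaList() below re-walks): each meta list entry is roughly
//     [1 byte rdbId] [key of getKeySizeFromRdbId(rdbId) bytes]
//     [optional 4 byte dataSize if the rdb has variable-size data]
//     [dataSize bytes of data]
//   negative (delete) keys have the low bit of the first key byte clear
//   and carry no data.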
|
|
|
|
|
|
bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// do not do this if not test collection for now
|
|
if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
|
|
|
log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");
|
|
|
|
// store each record in the list into the send buffers
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// first is rdbId
|
|
//char rdbId = -1; // m_rdbId;
|
|
//if ( rdbId < 0 ) rdbId = *p++;
|
|
uint8_t rdbId = *p++;
|
|
// mask off rdbId
|
|
rdbId &= 0x7f;
|
|
// get the key of the current record
|
|
//char *key = p;
|
|
// negative key?
|
|
bool del ;
|
|
if ( *p & 0x01 ) del = false;
|
|
else del = true;
|
|
// must always be negative if deleting
|
|
// spiderdb is exempt because we add a spiderreply that is
|
|
// positive and a spiderdoc
|
|
// no, this is no longer the case because we add spider
|
|
// replies to the index when deleting or rejecting a doc.
|
|
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// get the key size. a table lookup in Rdb.cpp.
|
|
int32_t ks ;
|
|
if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 ) {
|
|
ks = 18;
|
|
// no compress bits set!
|
|
if ( p[0] & 0x06 ) { char*xx=NULL;*xx=0; }
|
|
// alignment bit set or cleared
|
|
if ( ! ( p[1] & 0x02 ) ) { char *xx=NULL;*xx=0; }
|
|
if ( ( p[7] & 0x02 ) ) { char *xx=NULL;*xx=0; }
|
|
int64_t docId = g_posdb.getDocId(p);
|
|
if ( docId != m_docId && !cr->m_indexSpiderReplies) {
|
|
log("xmldoc: %" INT64 " != %" INT64 ""
|
|
, docId
|
|
, m_docId );
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// else
|
|
// log("xmldoc: %" INT64 " == %" INT64 ""
|
|
// , docId
|
|
// , m_docId );
|
|
|
|
// uint64_t termId = g_posdb.getTermId(p);
|
|
// if ( termId == 59194288760543LL ) {
|
|
// log("xmldoc: debug");
|
|
// //char *xx=NULL;*xx=0;
|
|
// }
|
|
}
|
|
else if ( rdbId == RDB_DATEDB ) ks = 16;
|
|
else ks = getKeySizeFromRdbId ( rdbId );
|
|
// sanity
|
|
if ( ks < 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
|
|
// another check
|
|
Rdb *rdb = getRdbFromId(rdbId);
|
|
if ( ! rdb ) { char *xx=NULL;*xx=0; }
|
|
if ( rdb->m_ks < 12 || rdb->m_ks > MAX_KEY_BYTES ) {
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
// special linkdb check
|
|
//if ( rdbId == RDB_LINKDB ) {
|
|
// // parse it up
|
|
// key192_t *k = (key192_t *)p;
|
|
// unsigned char hc = g_linkdb.getLinkerHopCount_uk(k);
|
|
// if ( hc != 0 ){ char *xx=NULL;*xx=0; }
|
|
//}
|
|
|
|
char *rec = p;
|
|
|
|
// set this
|
|
//bool split = true;
|
|
//if(rdbId == RDB_POSDB && g_posdb.isShardedByTermId(p) )
|
|
// split =false;
|
|
// skip key
|
|
p += ks;
|
|
// . if key belongs to same group as firstKey then continue
|
|
// . titledb now uses last bits of docId to determine groupId
|
|
// . but uses the top 32 bits of key still
|
|
// . spiderdb uses last 64 bits to determine groupId
|
|
// . tfndb now is like titledb(top 32 bits are top 32 of docId)
|
|
//uint32_t gid = getGroupId ( rdbId , key , split );
|
|
// get the record, is -1 if variable. a table lookup.
|
|
int32_t dataSize;
|
|
if ( rdbId == RDB_POSDB || rdbId==RDB2_POSDB2)dataSize=0;
|
|
else if ( rdbId == RDB_DATEDB ) dataSize = 0;
|
|
//else if ( rdbId == RDB_REVDB ) dataSize = -1;
|
|
else if ( rdbId == RDB2_POSDB2 ) dataSize = 0;
|
|
else if ( rdbId == RDB2_DATEDB2 ) dataSize = 0;
|
|
//else if ( rdbId == RDB2_REVDB2 ) dataSize = -1;
|
|
else dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// . for delete never stores the data
|
|
// . you can have positive keys without any dataSize member
|
|
// when they normally should have one, like titledb
|
|
if ( forDelete ) dataSize = 0;
|
|
// . negative keys have no data
|
|
// . this is not the case unfortunately
|
|
if ( del ) dataSize = 0;
|
|
|
|
// ensure spiderdb request recs have data/url in them
|
|
if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
|
|
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)rec ) &&
|
|
! forDelete &&
|
|
! del &&
|
|
dataSize == 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// if variable read that in
|
|
if ( dataSize == -1 ) {
|
|
// -1 means to read it in
|
|
dataSize = *(int32_t *)p;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip dataSize
|
|
p += 4;
|
|
}
|
|
// skip over the data, if any
|
|
p += dataSize;
|
|
// breach us?
|
|
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// must be exactly equal to end
|
|
if ( p != pend ) return false;
|
|
return true;
|
|
|
|
/*
|
|
int32_t recSize = 0;
|
|
int32_t count = 0;
|
|
for ( ; p < pend ; p += recSize , count++ ) {
|
|
// get rdbid
|
|
char rdbId = *p & 0x7f;
|
|
// get nosplit flag
|
|
char noSplit = *p & 0x80;
|
|
// skip
|
|
p++;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// sanity
|
|
if ( ks > 16 ) { char *xx=NULL;*xx=0;}
|
|
// negative key?
|
|
bool del;
|
|
if ( *p & 0x01 ) del = false;
|
|
else del = true;
|
|
// convert into a key128_t, the biggest possible key
|
|
char k[16];
|
|
gbmemcpy ( &k , p , ks );
|
|
// skip it
|
|
p += ks;
|
|
// flip this
|
|
char split = ! noSplit;
|
|
// test it
|
|
g_hostdb.getGroupId(rdbId,k,split);
|
|
// if negative, no data size allowed
|
|
if ( ( k[0] & 0x01 ) == 0x00 ) continue;
|
|
// get datasize
|
|
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// no negative key has data
|
|
if ( del ) dataSize = 0;
|
|
// if -1, read it in
|
|
if ( dataSize == -1 ) {
|
|
dataSize = *(int32_t *)p;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
}
|
|
// skip the data
|
|
p += dataSize;
|
|
}
|
|
*/
|
|
}
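// . hashMetaList() below is used by the parsing consistency check: a first
//   call with checkList=false stores a ptr to each rec in "ht" keyed by its
//   rdb key, a second call with checkList=true then verifies every rec in
//   the new list is present and byte-identical in that table, and cores on
//   any mismatch.
// . spiderdb and tagdb recs are skipped since they legitimately differ
//   between parses.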
|
|
|
|
bool XmlDoc::hashMetaList ( HashTableX *ht ,
|
|
char *p ,
|
|
char *pend ,
|
|
bool checkList ) {
|
|
int32_t recSize = 0;
|
|
int32_t count = 0;
|
|
for ( ; p < pend ; p += recSize , count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get rdbid
|
|
char rdbId = *p & 0x7f;
|
|
// skip rdb id
|
|
p++;
|
|
// save that
|
|
char *rec = p;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// sanity check
|
|
if ( ks > 28 ) { char *xx=NULL;*xx=0; }
|
|
// is it a delete key?
|
|
char del ;
|
|
if ( ( p[0] & 0x01 ) == 0x00 ) del = true;
|
|
else del = false;
|
|
// convert into a key128_t, the biggest possible key
|
|
char k[MAX_KEY_BYTES];//key128_t k ;
|
|
// zero out
|
|
KEYMIN(k,MAX_KEY_BYTES);
|
|
//k.setMin();
|
|
gbmemcpy ( k , p , ks );
|
|
// skip it
|
|
p += ks;
|
|
// if negative, no data size allowed -- no
|
|
if ( del ) continue;
|
|
// get datasize
|
|
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// if -1, read it in
|
|
if ( dataSize == -1 ) {
|
|
dataSize = *(int32_t *)p;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
}
|
|
// hash the data into a int32_t for hash table
|
|
//int32_t h32 = 0;
|
|
//h32 = hash32 ( p , dataSize );
|
|
// do not allow 0
|
|
//if ( h32 == 0 ) h32 = 1;
|
|
// skip the data
|
|
p += dataSize;
|
|
// ignore spiderdb recs for parsing consistency check
|
|
if ( rdbId == RDB_SPIDERDB ) continue;
|
|
if ( rdbId == RDB2_SPIDERDB2 ) continue;
|
|
// ignore tagdb as well!
|
|
if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;
|
|
// skip revdb for now too
|
|
//if ( rdbId == RDB_REVDB ) continue;
|
|
// set our rec size, includes key/dataSize/data
|
|
int32_t recSize = p - rec;
|
|
// debug point
|
|
//if ( *(uint64_t *)k == 4828936067112479745LL )
|
|
// log("hey");
|
|
// if just adding, do it
|
|
if ( ! checkList ) {
|
|
// we now store ptr to the rec, not hash!
|
|
if ( ! ht->addKey ( k , &rec ) ) return false;
|
|
continue;
|
|
}
|
|
// check to see if this rec is in the provided hash table
|
|
int32_t slot = ht->getSlot ( k );
|
|
// bitch if not found
|
|
if ( slot < 0 && ks==12 ) {
|
|
key144_t *k2 = (key144_t *)k;
|
|
int64_t tid = g_posdb.getTermId(k2);
|
|
char shardByTermId = g_posdb.isShardedByTermId(k2);
|
|
//uint8_t score8 = g_indexdb.getScore ( *k2 );
|
|
//uint32_t score32 = score8to32 ( score8 );
|
|
log("build: missing key #%" INT32 " rdb=%s ks=%" INT32 " ds=%" INT32 " "
|
|
"tid=%" UINT64 " "
|
|
"key=%s "
|
|
//"score8=%" UINT32 " score32=%" UINT32 " "
|
|
"shardByTermId=%" INT32 "",
|
|
count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,tid ,
|
|
//(int32_t)score8,(int32_t)score32,
|
|
KEYSTR(k2,ks),
|
|
(int32_t)shardByTermId);
|
|
// look it up
|
|
|
|
|
|
// shortcut
|
|
HashTableX *wt = m_wts;
|
|
|
|
// point to keys, termids?
|
|
//TermInfo **tp = (TermInfo **)wt->m_keys;
|
|
|
|
// now print the table we stored all we hashed into
|
|
for ( int32_t i = 0 ; i < wt->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( wt->m_flags[i] == 0 ) continue;
|
|
// get the TermInfo
|
|
TermDebugInfo *ti;
|
|
ti = (TermDebugInfo *)wt->getValueFromSlot(i);
|
|
// skip if not us
|
|
if((ti->m_termId & TERMID_MASK)!=tid)continue;
|
|
// got us
|
|
char *start = m_wbuf.getBufStart();
|
|
char *term = start + ti->m_termOff;
|
|
char *prefix = "";
|
|
if ( ti->m_prefixOff >= 0 ) {
|
|
prefix = start + ti->m_prefixOff;
|
|
//prefix[ti->m_prefixLen] = '\0';
|
|
}
|
|
// NULL term it
|
|
term[ti->m_termLen] = '\0';
|
|
// print it
|
|
log("parser: term=%s prefix=%s",//score32=%" INT32 "",
|
|
term,prefix);//,(int32_t)ti->m_score32);
|
|
}
|
|
|
|
char *xx=NULL; *xx=0;
|
|
// count it for PageStats.cpp
|
|
g_stats.m_parsingInconsistencies++;
|
|
continue;
|
|
}
|
|
if ( slot < 0 && ks != 12 ) {
|
|
// if it is sectiondb and the orig doc did not
|
|
// add sectiondb recs because m_totalSiteVoters >=
|
|
// MAX_SITE_VOTERS, then that is ok!
|
|
if ( (rdbId == RDB_SECTIONDB ||
|
|
rdbId == RDB2_SECTIONDB2 ) &&
|
|
m_sectionsValid &&
|
|
m_sections.m_totalSiteVoters >= MAX_SITE_VOTERS )
|
|
continue;
|
|
log("build: missing key #%" INT32 " rdb=%s ks=%" INT32 " ds=%" INT32 " "
|
|
"ks=%s "
|
|
,count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,KEYSTR(k,ks));
|
|
char *xx=NULL; *xx=0;
|
|
// count it for PageStats.cpp
|
|
g_stats.m_parsingInconsistencies++;
|
|
continue;
|
|
}
|
|
// if in there, check the hashes
|
|
//int32_t h2 = *(int32_t *)ht->getValueFromSlot ( slot );
|
|
char *rec2 = *(char **)ht->getValueFromSlot ( slot );
|
|
// get his dataSize
|
|
int32_t dataSize2 = getDataSizeFromRdbId(rdbId);
|
|
// his keysize
|
|
int32_t ks2 = getKeySizeFromRdbId(rdbId);
|
|
// get his recsize
|
|
int32_t recSize2 = ks2 ;
|
|
// if -1 that is variable
|
|
if ( dataSize2 == -1 ) {
|
|
dataSize2 = *(int32_t *)(rec2+ks2);
|
|
recSize2 += 4;
|
|
}
|
|
// add it up
|
|
recSize2 += dataSize2;
|
|
// keep on chugging if they match
|
|
if ( recSize2==recSize && !memcmp(rec,rec2,recSize) ) continue;
|
|
// otherwise, bitch
|
|
char shardByTermId = false;
|
|
if ( rdbId == RDB_POSDB )
|
|
shardByTermId = g_posdb.isShardedByTermId(rec2);
|
|
log("build: data not equal for key=%s "
|
|
"rdb=%s splitbytermid=%" INT32 " dataSize=%" INT32 "",
|
|
KEYSTR(k,ks2),
|
|
getDbnameFromId(rdbId),(int32_t)shardByTermId,dataSize);
|
|
|
|
// print into here
|
|
SafeBuf sb1;
|
|
SafeBuf sb2;
|
|
|
|
// print it out
|
|
if ( rdbId == RDB_SPIDERDB ) {
|
|
// get rec
|
|
if ( g_spiderdb.isSpiderRequest((key128_t *)rec) ) {
|
|
SpiderRequest *sreq1 = (SpiderRequest *)rec;
|
|
SpiderRequest *sreq2 = (SpiderRequest *)rec2;
|
|
sreq1->print(&sb1);
|
|
sreq2->print(&sb2);
|
|
}
|
|
else {
|
|
SpiderReply *srep1 = (SpiderReply *)rec;
|
|
SpiderReply *srep2 = (SpiderReply *)rec2;
|
|
srep1->print(&sb1);
|
|
srep2->print(&sb2);
|
|
}
|
|
log("build: rec1=%s",sb1.getBufStart());
|
|
log("build: rec2=%s",sb2.getBufStart());
|
|
|
|
}
|
|
char *xx=NULL; *xx=0;
|
|
// count it for PageStats.cpp
|
|
g_stats.m_parsingInconsistencies++;
|
|
}
|
|
return true;
|
|
}
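// . illustrative sketch only (not from the original source) of how the
//   consistency test might drive hashMetaList(); the real call site lives
//   elsewhere in this file and its table setup may differ:
//
//     HashTableX ht;
//     ht.set ( MAX_KEY_BYTES , sizeof(char *) , 1024 ,
//              NULL , 0 , false , m_niceness , "mtlchk" );
//     // pass 1: stash every rec from the old meta list
//     hashMetaList ( &ht , oldList , oldList + oldListSize , false );
//     // pass 2: verify the freshly built meta list against it
//     hashMetaList ( &ht , newList , newList + newListSize , true  );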
|
|
|
|
/*
|
|
bool checkRegex ( SafeBuf *regex ,
|
|
char *target ,
|
|
bool *boolVal ,
|
|
bool *boolValValid ,
|
|
int32_t *compileError ,
|
|
CollectionRec *cr ) {
|
|
|
|
if ( compileError ) *compileError = false;
|
|
|
|
if ( *boolValValid )
|
|
return *boolVal;
|
|
|
|
// if not using diffbot or there is no regex, it is ok
|
|
if ( regex->length() <= 0 ) {
|
|
*boolVal = true;
|
|
*boolValValid = true;
|
|
return boolVal;
|
|
}
|
|
|
|
// whip out the regex shit i guess...
|
|
regex_t buf;
|
|
// this will store the compiled regular expression into "buf"
|
|
int32_t ret = regcomp ( &buf ,
|
|
// the actual regular expression to compile
|
|
regex->getBufStart() ,
|
|
// some flags
|
|
REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB);
|
|
|
|
if ( ret ) {
|
|
//g_errno = ret;
|
|
if ( compileError ) *compileError = errno;
|
|
log("xmldoc: diffbot regcomp failed: %s. This should have "
|
|
"been tested before crawl was started. Ignoring.",
|
|
mstrerror(errno));
|
|
return true;
|
|
}
|
|
|
|
// now see if it is a match
|
|
if ( regexec(&buf,target,0,NULL,0) ) *boolVal = true;
|
|
else *boolVal = false;
|
|
|
|
*boolValValid = true;
|
|
return boolVal;
|
|
}
|
|
*/
|
|
|
|
// . should we send this url off to diffbot for processing?
// . if the url's downloaded content does not match the provided regex
//   in m_diffbotPageProcessPattern, then we do not send the url to diffbot
//   for processing
// . make sure this regex is pre-tested before starting the crawl
//   so we know it compiles
bool XmlDoc::doesUrlMatchDiffbotCrawlPattern() {

	if ( m_matchesCrawlPatternValid )
		return m_matchesCrawlPattern;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return true;

	// get the compiled regular expressions
	regex_t *ucr = &cr->m_ucr;
	if ( ! cr->m_hasucr ) ucr = NULL;

	if ( ! m_firstUrlValid ) return false;


	m_matchesCrawlPatternValid = true;
	m_matchesCrawlPattern = false;

	Url *furl = getFirstUrl();
	char *url = furl->getUrl();

	// if we had a url crawl regex then regexec will return non-zero
	// if our url does NOT match i guess
	if ( ucr && regexec(ucr,url,0,NULL,0) )
		return false;

	// shortcut
	char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;

	// do not require a match on ucp if ucr is given
	if ( ucp && ! ucr && ! doesStringContainPattern(url,ucp) )
		return false;

	m_matchesCrawlPattern = true;

	return true;
}
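// . note on the regexec() call above: regexec() returns 0 on a match and
//   non-zero (REG_NOMATCH) when the string does not match, so
//   "if ( ucr && regexec(...) ) return false;" means "bail if a crawl
//   regex was supplied and the url fails it".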
|
|
|
|
/*
|
|
bool XmlDoc::doesUrlMatchDiffbotProcessPattern() {
|
|
return checkRegex ( &cr->m_diffbotUrlProcessPattern ,
|
|
m_firstUrl.m_url ,
|
|
&m_diffbotUrlProcessPatternMatch,
|
|
&m_diffbotUrlProcessPatternMatchValid,
|
|
NULL,
|
|
cr);
|
|
}
|
|
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
return checkRegex ( &cr->m_diffbotPageProcessPattern ,
|
|
ptr_utf8Content,
|
|
&m_diffbotPageProcessPatternMatch,
|
|
&m_diffbotPageProcessPatternMatchValid,
|
|
NULL,
|
|
cr);
|
|
}
|
|
*/
|
|
|
|
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
	if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return false;
	char *p = cr->m_diffbotPageProcessPattern.getBufStart();
	// empty? no pattern matches everything.
	if ( ! p ) return true;
	if ( ! m_content ) return false;
	// how many did we have?
	return doesStringContainPattern ( m_content , p );
}
|
|
|
|
int32_t *XmlDoc::reindexJSONObjects ( int32_t *newTitleHashes,
				      int32_t numNewHashes ) {
	return redoJSONObjects (newTitleHashes,numNewHashes,false );
}

int32_t *XmlDoc::nukeJSONObjects ( int32_t *newTitleHashes ,
				   int32_t numNewHashes ) {
	return redoJSONObjects (newTitleHashes,numNewHashes,true );
}
|
|
|
|
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
//   a url. each json object is basically its own url. a json object
//   url is the parent page's url with a -diffbotxyz%" UINT32 " appended to it
//   where the number is the json item's 32-bit title hash (see below).
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
int32_t *XmlDoc::redoJSONObjects ( int32_t *newTitleHashes ,
				   int32_t numNewHashes ,
				   bool deleteFromIndex ) {
|
|
// use this
|
|
static int32_t s_return = 1;
|
|
// if none, we are done
|
|
if ( m_diffbotJSONCount <= 0 ) return &s_return;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// i was trying to re-index some diffbot json docs in the global
|
|
// index but it wasn't set as custom crawl
|
|
//if ( ! cr->m_isCustomCrawl ) return &s_return;
|
|
|
|
// already did it?
|
|
if ( m_joc >= m_diffbotJSONCount ) return &s_return;
|
|
|
|
// new guy here
|
|
if ( ! m_dx ) {
|
|
try { m_dx = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("xmldoc: failed to alloc m_dx");
|
|
return NULL;
|
|
}
|
|
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
|
|
}
|
|
|
|
//
|
|
// index the hashes of the latest diffbot json items for this parent
|
|
//
|
|
HashTableX dedup;
|
|
if ( ! dedup.set(4,0,numNewHashes*4,NULL,0,false,m_niceness,"njodt") )
|
|
return NULL;
|
|
for ( int32_t i = 0 ; i < numNewHashes ; i++ )
|
|
dedup.addKey ( &newTitleHashes[i] );
|
|
|
|
// get this old doc's current title hashes
|
|
int32_t numOldHashes;
|
|
int32_t *oldTitleHashes = getDiffbotTitleHashes ( &numOldHashes );
|
|
// sanity. should return right away without having to block
|
|
if ( oldTitleHashes == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
//int32_t count = m_diffbotJSONCount;
|
|
// sanity again
|
|
if ( numOldHashes != m_diffbotJSONCount ) {
|
|
log("build: can't remove json objects. "
|
|
"jsoncount mismatch %" INT32 " != %" INT32
|
|
,numOldHashes
|
|
,m_diffbotJSONCount
|
|
);
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
//count = 0;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// scan down each
|
|
for ( ; m_joc < m_diffbotJSONCount ; ) {
|
|
// only NUKE the json items for which title hashes we lost
|
|
int32_t th32 = oldTitleHashes[m_joc];
|
|
// . if still in the new diffbot reply, do not DELETE!!!
|
|
// . if there was no title, it uses hash of entire object
|
|
if ( deleteFromIndex && dedup.isInTable(&th32) ) {
|
|
m_joc++;
|
|
continue;
|
|
}
|
|
// if m_dx has no url set, call set4 i guess
|
|
if ( ! m_dx->m_firstUrlValid ) {
|
|
// make the fake url for this json object for indexing
|
|
SafeBuf fakeUrl;
|
|
fakeUrl.set ( m_firstUrl.getUrl() );
|
|
// get his title hash32
|
|
//int32_t jsonTitleHash32 = titleHashes[m_joc];
|
|
// append -diffbotxyz%" UINT32 " for fake url
|
|
fakeUrl.safePrintf("-diffbotxyz%" UINT32 "",
|
|
(uint32_t)th32);
|
|
// set url of new xmldoc
|
|
if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness ) )
|
|
// g_errno should be set!
|
|
return NULL;
|
|
// we are indexing json objects, don't use all these
|
|
m_dx->m_useClusterdb = false;
|
|
m_dx->m_useSpiderdb = false;
|
|
m_dx->m_useTagdb = false;
|
|
m_dx->m_usePlacedb = false;
|
|
m_dx->m_useLinkdb = false;
|
|
m_dx->m_isChildDoc = true;
|
|
m_dx->m_parentDocPtr = this;
|
|
// are we doing a query reindex or a nuke?
|
|
m_dx->m_deleteFromIndex = deleteFromIndex;//true;
|
|
// do not try to download this url
|
|
if ( ! deleteFromIndex )
|
|
m_dx->m_recycleContent = true;
|
|
// we need this because only m_dx->m_oldDoc will
|
|
// load from titledb and have it set
|
|
m_dx->m_isDiffbotJSONObject = true;
|
|
// for debug
|
|
char *str = "reindexing";
|
|
if ( deleteFromIndex ) str = "nuking";
|
|
log("xmldoc: %s %s",str,fakeUrl.getBufStart());
|
|
}
|
|
|
|
// when the indexdoc completes, or if it blocks, call us!
|
|
// we should just pass through here
|
|
m_dx->setCallback ( m_masterState , m_masterLoop );
|
|
|
|
// . this should ultimately load from titledb and not
|
|
// try to download the page since m_deleteFromIndex is
|
|
// set to true
|
|
// . if m_dx got its msg4 reply it ends up here, in which
|
|
// case do NOT re-call indexDoc() so check for
|
|
// m_listAdded.
|
|
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
|
|
return (int32_t *)-1;
|
|
// critical error on our part trying to index it?
|
|
// does not include timeouts or 404s, etc. mostly just
|
|
// OOM errors.
|
|
if ( g_errno ) return NULL;
|
|
// count as deleted
|
|
cr->m_localCrawlInfo.m_objectsDeleted++;
|
|
cr->m_globalCrawlInfo.m_objectsDeleted++;
|
|
cr->m_needsSave = true;
|
|
// but gotta set this crap back
|
|
//log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
|
|
// clear for next guy if there is one. clears
|
|
// m_dx->m_contentValid so the set4() can be called again above
|
|
m_dx->reset();
|
|
// try to do more json objects indexed from this parent doc
|
|
m_joc++;
|
|
}
|
|
|
|
// nuke it
|
|
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
|
|
delete ( m_dx );
|
|
m_dx = NULL;
|
|
|
|
return &s_return;
|
|
}
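// . the fake child url built above is just the parent url with
//   "-diffbotxyz" plus the json item's 32-bit title hash appended, e.g.
//   (hypothetical) http://example.com/page becomes
//   http://example.com/page-diffbotxyz1234567890, so reindexing or nuking
//   a child only needs the parent url and that title hash.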
|
|
|
|
|
|
void getMetaListWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
	// note it
	THIS->setStatus ( "in get meta list wrapper" );
	// get it
	char *ml = THIS->getMetaList ( );
	// sanity check
	if ( ! ml && ! g_errno ) {
		log("doc: getMetaList() returned NULL without g_errno");
		sleep(5);
		char *xx=NULL;*xx=0;
	}
	// return if it blocked
	if ( ml == (void *)-1 ) return;
	// sanity check
	if ( THIS->m_callback1 == getMetaListWrapper ) { char *xx=NULL;*xx=0;}
	// otherwise, all done, call the caller callback
	if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
	else THIS->m_callback2 ( THIS->m_state );
}
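// . convention used throughout: getMetaList() returns (char *)-1 when it
//   blocks, NULL (with g_errno set) on error, and a valid ptr when done;
//   this wrapper just resumes the original caller via m_callback1/2 once
//   the blocking call finally completes.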
|
|
|
|
XmlDoc *g_od = NULL;
|
|
|
|
// . returns NULL and sets g_errno on error
// . make a meta list to call Msg4::addMetaList() with
// . called by Msg14.cpp
// . a meta list is just a buffer of Rdb records of the following format:
//   rdbid | rdbRecord
// . meta list does not include title rec since Msg14 adds that using Msg1
// . returns false and sets g_errno on error
// . sets m_metaList ptr and m_metaListSize
// . if "deleteIt" is true, we are a delete op on "old"
// . returns (char *)-1 if it blocks and will call your callback when done
// . generally only Repair.cpp changes these use* args to false
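// . illustrative sketch (not in the original) of appending one rec to a
//   meta list buffer, mirroring what this function does with m_p/m_zbuf:
//
//     SafeBuf list;
//     char rdbId = RDB_SPIDERDB;                      // which rdb
//     list.pushChar ( rdbId );                        // 1 byte rdbid
//     list.safeMemcpy ( &srep , srep.getRecSize() );  // key (+data if any)
//
//   fixed-size rdbs (posdb, linkdb, ...) store just the key after the
//   rdbid byte; variable-size rdbs (titledb, spiderdb, ...) store the key,
//   a 4 byte dataSize and then the data.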
char *XmlDoc::getMetaList ( bool forDelete ) {
|
|
|
|
if ( m_metaListValid ) return m_metaList;
|
|
|
|
setStatus ( "getting meta list" );
|
|
|
|
// force it true?
|
|
// "forDelete" means we want the metalist to consist of "negative"
|
|
// keys that will annihilate with the positive keys in the index,
|
|
// posdb and the other rdbs, in order to delete them. "deleteFromIndex"
|
|
// means to just call getMetaList(true) on the m_oldDoc (old XmlDoc)
|
|
// which is built from the titlerec in Titledb. so don't confuse
|
|
// these two things. otherwise when i add this we were not adding
|
|
// the spiderreply of "Doc Force Deleted" from doing a query reindex
|
|
// and it kept repeating every time we started gb up.
|
|
//if ( m_deleteFromIndex ) forDelete = true;
|
|
|
|
// assume valid
|
|
m_metaList = "";
|
|
m_metaListSize = 0;
|
|
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block, this callback will be called
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getMetaListWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// returning from a handler that had an error?
|
|
if ( g_errno ) return NULL;
|
|
|
|
// if we are a spider status doc/titlerec and we are doing a rebuild
|
|
// operation, then keep it simple
|
|
if ( m_setFromTitleRec &&
|
|
m_useSecondaryRdbs &&
|
|
m_contentTypeValid &&
|
|
m_contentType == CT_STATUS ) {
|
|
// if not rebuilding posdb then done, list is empty since
|
|
// spider status docs do not contribute to linkdb, clusterdb,..
|
|
if ( ! m_usePosdb && ! m_useTitledb ) {
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
/////////////
|
|
//
|
|
// if user disabled spider status docs then delete the titlerec
|
|
// AND the posdb index list from our dbs for this ss doc
|
|
//
|
|
/////////////
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
if ( ! cr->m_indexSpiderReplies ) {
|
|
int64_t uh48 = m_firstUrl.getUrlHash48();
|
|
// delete title rec. true = delete?
|
|
key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
|
|
// shortcut
|
|
SafeBuf *ssb = &m_spiderStatusDocMetaList;
|
|
// add to list. and we do not add the spider status
|
|
// doc to posdb since we deleted its titlerec.
|
|
ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
|
|
ssb->safeMemcpy ( &tkey , sizeof(key_t) );
|
|
m_metaList = ssb->getBufStart();
|
|
m_metaListSize = ssb->getLength ();
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
// set safebuf to the json of the spider status doc
|
|
SafeBuf jd;
|
|
if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
|
|
return NULL;
|
|
// set m_spiderStatusDocMetaList from the json
|
|
if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
|
|
return NULL;
|
|
// TODO: support titledb rebuild as well
|
|
m_metaList = m_spiderStatusDocMetaList.getBufStart();
|
|
m_metaListSize = m_spiderStatusDocMetaList.getLength();
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
|
|
// any other indexing issue? hey! g_errno might not be set here
|
|
//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }
|
|
|
|
// a hacky thing
|
|
//XmlDoc *od = (XmlDoc *)1;
|
|
|
|
//bool diffbotEmptyReply = false;
|
|
|
|
/*
|
|
// fake this for diffbot?
|
|
if ( m_useDiffbot &&
|
|
! m_isDiffbotJSONObject &&
|
|
! doesUrlMatchDiffbotCrawlPattern() ) {
|
|
// flag it so we only add the SpiderReply to spiderdb and bail
|
|
//diffbotEmptyReply = true;
|
|
// we should not delete the json objects for this url
|
|
// from the index just because the user decided to remove
|
|
// it from her crawl
|
|
m_isIndexedValid = true;
|
|
m_isIndexed = false;
|
|
m_oldDocValid = true;
|
|
m_oldDoc = NULL;
|
|
}
|
|
*/
|
|
|
|
// if "rejecting" from index fake all this stuff
|
|
if ( m_deleteFromIndex ) {
|
|
// if we are using diffbot api and diffbot found no json objects
|
|
// or we never even processed the url, we really just want to
|
|
// add the SpiderReply for this url to spiderdb and nothing more.
|
|
// NO! we still want to store the page content in titledb
|
|
// so we can see if it has changed i guess
|
|
//diffbotEmptyReply ) {
|
|
// set these things to bogus values since we don't need them
|
|
m_contentHash32Valid = true;
|
|
m_contentHash32 = 0;
|
|
m_httpStatusValid = true;
|
|
m_httpStatus = 200;
|
|
m_siteValid = true;
|
|
ptr_site = "www.poopoo.com";
|
|
size_site = gbstrlen(ptr_site)+1;
|
|
m_isSiteRootValid = true;
|
|
m_isSiteRoot2 = 1;
|
|
//m_tagHash32Valid = true;
|
|
//m_tagHash32 = 0;
|
|
m_tagPairHash32Valid = true;
|
|
m_tagPairHash32 = 0;
|
|
m_siteHash64Valid = true;
|
|
m_siteHash64 = 0LL;
|
|
m_spiderLinksValid = true;
|
|
m_spiderLinks2 = 1;
|
|
m_langIdValid = true;
|
|
m_langId = 1;
|
|
m_siteNumInlinksValid = true;
|
|
m_siteNumInlinks = 0;
|
|
m_isIndexed = true;
|
|
m_isIndexedValid = true;
|
|
m_ipValid = true;
|
|
m_ip = 123456;
|
|
}
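	// . the placeholder values above (fake ip, fake site, etc.) just
	//   satisfy the validity checks further down so we can build a
	//   delete meta list and a spider reply without actually
	//   downloading or parsing anything.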
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
//
|
|
// BEGIN MULTI DOC QUERY REINDEX HACK
|
|
//
|
|
// this fixes it so we can do a query reindex on fake child urls
|
|
// of their original parent multidoc url. the child urls are
|
|
// subsections of the original parent url that were indexed as
|
|
// separate documents with their own docid. if we try to do a
|
|
// query reindex on such things, detect it, and add the request
|
|
// for the original parent multidoc url.
|
|
//
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex &&
|
|
// if it is a force delete, then allow the user to delete
|
|
// such diffbot reply json children documents, however.
|
|
! m_sreq.m_forceDelete ) {
|
|
// see if its diffbot json object
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
|
|
XmlDoc *od = *pod;
|
|
// if no old doc then we might have just been a diffbot
|
|
// json url that was directly injected into GLOBAL-INDEX
|
|
// like xyz.com/-diffbotxyz12345 (my format) or
|
|
if ( ! od ) goto skip9;
|
|
// if we are indexing a subdoc piece of a multidoc url
|
|
// then parentUrl should return non-NULL
|
|
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
|
|
if ( ! parentUrl && od->m_contentType != CT_STATUS )
|
|
goto skip9;
|
|
// in that case we need to reindex the parent url not the
|
|
// subdoc url, so make the spider reply gen quick
|
|
//SpiderReply *newsr = od->getFakeSpiderReply();
|
|
//if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
|
|
// use our ip though
|
|
//newsr->m_firstIp = od->m_firstIp;
|
|
// however we have to use our docid-based spider request
|
|
SpiderReply srep;
|
|
srep.reset();
|
|
// it MUST match up with original spider request so the
|
|
// lock key in Spider.cpp can unlock it. that lock key
|
|
// uses the "uh48" (48bit hash of the url) and "srep.m_firstIp"
|
|
// in this case the SpiderRequest, sreq, is docid-based because
|
|
// it was added through PageReindex.cpp (query reindex) so
|
|
// it will be the 48 bit hash64b() of the docid
|
|
// (see PageReindex.cpp)'s call to SpiderRequest::setKey()
|
|
srep.m_firstIp = m_sreq.m_firstIp;
|
|
// assume no error
|
|
srep.m_errCount = 0;
|
|
// do not inherit this one, it MIGHT HAVE CHANGED!
|
|
srep.m_siteHash32 = m_sreq.m_siteHash32;
|
|
srep.m_domHash32 = m_sreq.m_domHash32;
|
|
srep.m_spideredTime = getTimeGlobal();
|
|
int64_t uh48 = m_sreq.getUrlHash48();
|
|
int64_t parentDocId = 0LL;
|
|
srep.m_contentHash32 = 0;
|
|
// were we already in titledb before we started spidering?
|
|
// yes otherwise we would have called "goto skip9" above
|
|
srep.m_wasIndexed = 1;
|
|
srep.m_wasIndexedValid = 1;
|
|
srep.m_isIndexed = 1;
|
|
srep.m_isIndexedINValid = false;
|
|
srep.m_errCode = EREINDEXREDIR; // indexCode
|
|
srep.m_downloadEndTime = 0;
|
|
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
|
|
// lock of request needs to match that of reply so the
|
|
// reply, when received by Rdb.cpp which calls addSpiderReply()
|
|
// can unlock this url so it can be spidered again.
|
|
int64_t lock1 = makeLockTableKey(&m_sreq);
|
|
int64_t lock2 = makeLockTableKey(&srep);
|
|
if ( lock1 != lock2 ) { char *xx=NULL;*xx=0; }
|
|
// make a fake spider reply so this docid-based spider
|
|
// request is not used again
|
|
//SpiderReply srep;
|
|
// store the rdbid
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
if ( ! m_zbuf.pushChar(rd) )
|
|
return NULL;
|
|
// store that reply to indicate this spider request has
|
|
// been fulfilled!
|
|
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
|
|
return NULL;
|
|
|
|
// but also store a new spider request for the parent url
|
|
SpiderRequest ksr;
|
|
int64_t pd;
|
|
|
|
// skip if doc is a spider status "document". their docids
|
|
// often get added during a query reindex but we should ignore
|
|
// them completely.
|
|
if ( od->m_contentType == CT_STATUS )
|
|
goto returnList;
|
|
|
|
//goto returnList;
|
|
|
|
// complain
|
|
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
|
|
log("build: doing query reindex but diffbot api "
|
|
"url is not set in spider controls");
|
|
// just copy original request
|
|
gbmemcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
|
|
// do not spider links, it's a page reindex of a multidoc url
|
|
ksr.m_avoidSpiderLinks = 1;
|
|
// avoid EDOCUNCHANGED
|
|
ksr.m_ignoreDocUnchangedError = 1;
|
|
// no longer docid based; we set ksr.m_url to the parent url below
|
|
ksr.m_urlIsDocId = 0;
|
|
// but consider it a manual add. this should already be set.
|
|
ksr.m_isPageReindex = 1;
|
|
// but it is not docid based, so overwrite the docid
|
|
// in ksr.m_url with the parent multidoc url. it \0 terms it.
|
|
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
|
|
// this must be valid
|
|
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
// set the key, ksr.m_key. isDel = false
|
|
// fake docid
|
|
pd = g_titledb.getProbableDocId(parentUrl);
|
|
ksr.setKey ( m_sreq.m_firstIp, pd , false );
|
|
// store this
|
|
if ( ! m_zbuf.pushChar(rd) )
|
|
return NULL;
|
|
// then the request
|
|
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
|
|
return NULL;
|
|
returnList:
|
|
// prevent cores in indexDoc()
|
|
m_indexCode = EREINDEXREDIR;
|
|
m_indexCodeValid = true;
|
|
// for now we set this crap
|
|
m_metaList = m_zbuf.getBufStart();
|
|
m_metaListSize = m_zbuf.length();
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
//
|
|
// END DIFFBOT OBJECT QUERY REINDEX HACK
|
|
//
|
|
|
|
|
|
skip9:
|
|
// get our checksum
|
|
int32_t *plainch32 = getContentHash32();
|
|
if ( ! plainch32 || plainch32 == (void *)-1 ) return (char *)plainch32;
|
|
|
|
// get this too
|
|
int16_t *hs = getHttpStatus ();
|
|
if ( ! hs || hs == (void *)-1 ) return (char *)hs;
|
|
|
|
// make sure site is valid
|
|
char *site = getSite();
|
|
if ( ! site || site == (void *)-1 ) return (char *)site;
|
|
|
|
// this seems to be an issue as well for "unchanged" block below
|
|
char *isr = getIsSiteRoot();
|
|
if ( ! isr || isr == (void *)-1 ) return (char *)isr;
|
|
|
|
// get hash of all tags from tagdb that affect what we index
|
|
//int32_t *tagHash = getTagHash32 ( );
|
|
//if ( ! tagHash || tagHash == (void *)-1 ) return (char *)tagHash;
|
|
|
|
int64_t *sh64 = getSiteHash64();
|
|
if ( ! sh64 || sh64 == (void *)-1 ) return (char *)sh64;
|
|
|
|
// make sure docid valid
|
|
int64_t *mydocid = getDocId();
|
|
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
|
|
|
|
// . get the old version of our XmlDoc from the previous spider time
|
|
// . set using the old title rec in titledb
|
|
// . should really not do any more than set m_titleRec...
|
|
// . should not even uncompress it!
|
|
// . getNewSpiderReply() will use this to set the reply if
|
|
// m_indexCode == EDOCUNCHANGED...
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
|
|
// point to the old xml doc if no error, etc.
|
|
XmlDoc *od = *pod;
|
|
|
|
// check if we are already indexed
|
|
char *isIndexed = getIsIndexed ();
|
|
if ( ! isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
|
|
// do not delete anything in these cases, but do remove us from
|
|
// spiderdb, and from tfndb (except for EDOCNOTNEW)
|
|
//if ( m_indexCode == EDOCNOTNEW || m_indexCode == EDOCNOTOLD )
|
|
// od = NULL;
|
|
|
|
// why call this way down here? it ends up downloading the doc!
|
|
int32_t *indexCode = getIndexCode();
|
|
if ( ! indexCode || indexCode ==(void *)-1) return (char *)indexCode;
|
|
// sanity check
|
|
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this means to abandon the injection
|
|
if ( *indexCode == EABANDONED ||
|
|
*indexCode == EHITCRAWLLIMIT ||
|
|
*indexCode == EHITPROCESSLIMIT ) {
|
|
m_metaList = (char *)0x123456;
|
|
m_metaListSize = 0;
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
// if diffbot reply is empty, don't bother adding anything except
|
|
// for the spider reply... reply might be "-1" too!
|
|
//if ( m_useDiffbot &&
|
|
// ! m_isDiffbotJSONObject &&
|
|
// m_diffbotReplyValid &&
|
|
// m_diffbotReply.length() <= 3 )
|
|
// diffbotEmptyReply = true;
|
|
|
|
// . some index code warrant retries, like EDNSTIMEDOUT, ETCPTIMEDOUT,
|
|
// etc. these are deemed temporary errors. other errors basically
|
|
// indicate a document that will never be indexable and should,
|
|
// if currently indexed, be deleted.
|
|
// . just add the spider reply and we're done
|
|
if ( *indexCode == EDNSTIMEDOUT
|
|
|| *indexCode == ETCPTIMEDOUT
|
|
|| *indexCode == EUDPTIMEDOUT
|
|
|| *indexCode == EDNSDEAD
|
|
|| *indexCode == ENETUNREACH
|
|
|| *indexCode == EHOSTUNREACH
|
|
// . rejected from a diffbot regex url crawl filter?
|
|
// . or no json objects returned from diffbot?
|
|
// . or rejected from the processing regex filter?
|
|
// . then just add the SpiderReply to avoid respidering
|
|
// . NO! still need to add outlinks
|
|
//|| diffbotEmptyReply
|
|
// . treat this as a temporary error i guess
|
|
// . getNewSpiderReply() below will clear the error in it and
|
|
// copy stuff over from m_sreq and m_oldDoc for this case
|
|
|| *indexCode == EDOCUNCHANGED
|
|
) {
|
|
// sanity - in repair mode?
|
|
if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
|
|
// . this seems to be an issue for blocking
|
|
// . if we do not have a valid ip, we can't compute this,
|
|
// in which case it will not be valid in the spider reply
|
|
// . why do we need this for timeouts etc? if the doc is
|
|
// unchanged
|
|
// we should probably update its siteinlinks in tagdb
|
|
// periodically and reindex the whole thing...
|
|
// . i think we were getting the sitenuminlinks for
|
|
// getNewSpiderReply()
|
|
if ( m_ipValid &&
|
|
m_ip != 0 &&
|
|
m_ip != -1 ) {
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
}
|
|
// all done!
|
|
bool addReply = true;
|
|
// Scraper.cpp uses this
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
|
|
// page parser calls set4 and sometimes gets a dns time out!
|
|
if ( m_sreqValid && m_sreq.m_isPageParser ) addReply = false;
|
|
// return nothing if done
|
|
if ( ! addReply ) {
|
|
m_metaListSize = 0;
|
|
m_metaList = (char *)0x1;
|
|
return m_metaList;
|
|
}
|
|
// save this
|
|
int32_t savedCode = *indexCode;
|
|
// before getting our spider reply, assign crap from the old
|
|
// doc to us since we are unchanged! this will allow us to
|
|
// call getNewSpiderReply() without doing any processing, like
|
|
// setting the Xml or Words classes, etc.
|
|
copyFromOldDoc ( od );
|
|
// need this though! i don't want to print out "Success"
|
|
// in the log in the logIt() function
|
|
m_indexCode = savedCode;
|
|
m_indexCodeValid = true;
|
|
// but set our m_contentHash32 from the spider request
|
|
// which got it from the spiderreply in the case of
|
|
// EDOCUNCHANGED. this way ch32=xxx will log correctly.
|
|
// I think this is only when EDOCUNCHANGED is set in the
|
|
// Msg13.cpp code, when we have a spider compression proxy.
|
|
if ( *indexCode == EDOCUNCHANGED &&
|
|
m_sreqValid &&
|
|
! m_contentHash32Valid ) {
|
|
m_contentHash32 = m_sreq.m_contentHash32;
|
|
m_contentHash32Valid = true;
|
|
}
|
|
// we need these for getNewSpiderReply()
|
|
m_wasInIndex = false;
|
|
if ( od ) m_wasInIndex = true;
|
|
m_isInIndex = m_wasInIndex;
|
|
m_wasInIndexValid = true;
|
|
m_isInIndexValid = true;
|
|
|
|
// unset our ptr_linkInfo1 so we do not free it and core
|
|
// since we might have set it in copyFromOldDoc() above
|
|
ptr_linkInfo1 = NULL;
|
|
size_linkInfo1 = 0;
|
|
m_linkInfo1Valid = false;
|
|
|
|
bool indexNewTimeStamp = false;
|
|
if ( getUseTimeAxis() &&
|
|
od &&
|
|
m_hasMetadata &&
|
|
*indexCode == EDOCUNCHANGED
|
|
//m_spideredTimeValid &&
|
|
//od->m_spideredTime != m_spideredTime
|
|
)
|
|
indexNewTimeStamp = true;
|
|
|
|
|
|
|
|
// . if not using spiderdb we are done at this point
|
|
// . this happens for diffbot json replies (m_dx)
|
|
if ( ! m_useSpiderdb && ! indexNewTimeStamp ) {
|
|
m_metaList = NULL;
|
|
m_metaListSize = 0;
|
|
return (char *)0x01;
|
|
}
|
|
|
|
// get our spider reply
|
|
SpiderReply *newsr = getNewSpiderReply();
|
|
// return on error
|
|
if ( ! newsr ) return (char *)newsr;
|
|
// . panic on blocking! this is supposed to be fast!
|
|
// . it might still have to lookup the tagdb rec?????
|
|
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// how much we need
|
|
int32_t needx = sizeof(SpiderReply) + 1;
|
|
|
|
|
|
// . INDEX SPIDER REPLY (1a)
|
|
// . index ALL spider replies as separate doc. error or not.
|
|
// . then print out error histograms.
|
|
// . we should also hash this stuff when indexing the
|
|
// doc as a whole
|
|
|
|
// i guess it is safe to do this after getting the spiderreply
|
|
SafeBuf *spiderStatusDocMetaList = NULL;
|
|
// if ( cr->m_indexSpiderReplies &&
|
|
// m_useSpiderdb &&
|
|
// // doing it for diffbot throws off smoketests.
|
|
// // yeah, but we need it, so we'll just have to update
|
|
// // the smoketests
|
|
// ! cr->m_isCustomCrawl ) {
|
|
// get the spiderreply ready to be added
|
|
spiderStatusDocMetaList = getSpiderStatusDocMetaList(newsr ,
|
|
forDelete);
|
|
// error?
|
|
if ( ! spiderStatusDocMetaList ) return NULL;
|
|
// blocked?
|
|
if (spiderStatusDocMetaList==(void *)-1)
|
|
return (char *)-1;
|
|
|
|
// . now append the new stuff.
|
|
// . we overwrite the old titlerec with the new one that has
|
|
// some more json in the ptr_metaInfo buffer so we hash
|
|
// its new timestamp. 'gbspiderdate' and any info from
|
|
// the meta info given in the injection request if there.
|
|
// this allows you to tag each document, even multiple
|
|
// versions of the same url with the same content. so if
|
|
// you spider the doc again and it is unchanged since last
|
|
// time we still index some of this meta stuff.
|
|
if ( indexNewTimeStamp )
|
|
appendNewMetaInfo (spiderStatusDocMetaList,forDelete);
|
|
|
|
// need to alloc space for it too
|
|
int32_t len = spiderStatusDocMetaList->length();
|
|
needx += len;
|
|
// this too
|
|
m_addedStatusDocSize = len;
|
|
m_addedStatusDocSizeValid = true;
|
|
//}
|
|
|
|
// doledb key?
|
|
//if ( m_doledbKey.n0 || m_doledbKey.n1 )
|
|
// needx += 1 + sizeof(key_t); // + 4;
|
|
// the FAKEDB unlock key for msg12 in spider.cpp
|
|
//needx += 1 + sizeof(key_t); // FAKEDB
|
|
// make the buffer
|
|
m_metaList = (char *)mmalloc ( needx , "metalist");
|
|
if ( ! m_metaList ) return NULL;
|
|
// save size for freeing later
|
|
m_metaListAllocSize = needx;
|
|
// ptr and boundary
|
|
m_p = m_metaList;
|
|
m_pend = m_metaList + needx;
|
|
|
|
// save it
|
|
char *saved = m_p;
|
|
|
|
// first store spider reply "document"
|
|
if ( spiderStatusDocMetaList ) {
|
|
gbmemcpy ( m_p,
|
|
spiderStatusDocMetaList->getBufStart(),
|
|
spiderStatusDocMetaList->length() );
|
|
m_p += spiderStatusDocMetaList->length();
|
|
}
|
|
|
|
/*
|
|
|
|
Not any more, now we remove from doledb as soon
|
|
as we get all the lock grants in our group (shard)
|
|
using Msg4 in Spider.cpp. That way we can add a
|
|
"0" entry into the waiting tree (or a time X ms into
|
|
the future from now) to try to enforce a sameIpWait
|
|
constraint and also allow up to maxSpidersPerIP.
|
|
|
|
// remove from doledb if we had a valid key
|
|
// (BEFORE adding SpiderReply)
|
|
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
|
|
// note it
|
|
setStatus ( "removing key from doledb" );
|
|
// . now remove the original spider rec from "doledb"
|
|
// . rdbid first
|
|
*m_p = RDB_DOLEDB;
|
|
m_p++;
|
|
// then the key
|
|
*(key_t *)m_p = m_doledbKey;
|
|
// nukey, clear del bit to delete it
|
|
*m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key_t);
|
|
// then zero for data size
|
|
// *(int32_t *)m_p = 0;
|
|
//m_p += 4;
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
}
|
|
*/
|
|
|
|
// sanity check
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// . make a fake titledb key
|
|
// . remove the spider lock (Msg12 in Spider.cpp)
|
|
// . now Spider.cpp uses SpiderReply reception to remove lock
|
|
// - mdw 9/28/13
|
|
//*m_p++ = RDB_FAKEDB;
|
|
//*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
|
|
//key_t fakeKey;
|
|
//fakeKey.n1 = 0;
|
|
//fakeKey.n0 = m_docId;
|
|
//gbmemcpy ( m_p , &fakeKey , sizeof(key_t) );
|
|
//m_p += sizeof(key_t);
|
|
// now add the new rescheduled time
|
|
setStatus ( "adding SpiderReply to spiderdb" );
|
|
// rdbid first
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
*m_p++ = rd;
|
|
// get this
|
|
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
// store the spider rec
|
|
int32_t newsrSize = newsr->getRecSize();
|
|
gbmemcpy ( m_p , newsr , newsrSize );
|
|
m_p += newsrSize;
|
|
m_addedSpiderReplySize = newsrSize;
|
|
m_addedSpiderReplySizeValid = true;
|
|
// sanity check
|
|
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
// verify it
|
|
m_metaListValid = true;
|
|
// set size
|
|
m_metaListSize = m_p - m_metaList;
|
|
// all done
|
|
return m_metaList;
|
|
|
|
}
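	// . so for temporary errors and unchanged docs the meta list built
	//   above is just the spider status doc recs (if any) followed by
	//   the SpiderReply; no posdb/titledb/linkdb recs are touched.
	//   everything below is the normal full (re)index path.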
|
|
|
|
// if using diffbot do not index the content of the web page we
|
|
// got the json objects from, although, do keep it cached in titledb
|
|
// because that can be useful
|
|
// Not any more, now index the pages as well! then restrict search
|
|
// to type:json to just search json objects.
|
|
//if ( m_useDiffbot && ! m_isDiffbotJSONObject ) {
|
|
// m_usePosdb = false;
|
|
// m_useClusterdb = false;
|
|
//}
|
|
|
|
// get the old meta list if we had an old doc
|
|
char *oldList = NULL;
|
|
int32_t oldListSize = 0;
|
|
if ( od ) {
|
|
od->m_useSpiderdb = false;
|
|
od->m_useTagdb = false;
|
|
// do not use diffbot for old doc since we call
|
|
// od->nukeJSONObjects below()
|
|
od->m_diffbotApiUrlValid = true;
|
|
// api url should be empty by default
|
|
//od->m_diffbotApiNum = DBA_NONE;
|
|
//log("break it here. shit this is not getting the list!!!");
|
|
// if we are doing diffbot stuff, we are still indexing this
|
|
// page, so we need to get the old doc meta list
|
|
oldList = od->getMetaList ( true );
|
|
oldListSize = od->m_metaListSize;
|
|
if ( ! oldList || oldList ==(void *)-1) return (char *)oldList;
|
|
}
|
|
|
|
// . set whether we should add recs to titledb, posdb, linkdb, etc.
|
|
// . if this doc is set by titlerec we won't change these
|
|
// . we only turn off m_usePosdb, etc. if there is a
|
|
// <meta name=noindex content=1>
|
|
// . we will still add to spiderdb, but not posdb, linkdb, titledb
|
|
// and clusterdb.
|
|
// . so we'll add the spiderreply for this doc and the spiderrequests
|
|
// for all outlinks and "firstIp" tagrecs to tagdb for those outlinks
|
|
// . we use this for adding the url seed file gbdmoz.urls.txt
|
|
// which contains a list of all the dmoz urls we want to spider.
|
|
// gbdmoz.urls.txt is generated by dmozparse.cpp. we spider all
|
|
// these dmoz urls so we can search the CONTENT of the pages in dmoz,
|
|
// something dmoz won't let you do.
|
|
char *mt = hasNoIndexMetaTag();
|
|
if ( ! mt || mt == (void *)-1 ) return (char *)mt;
|
|
if ( *mt ) {
|
|
m_usePosdb = false;
|
|
m_useLinkdb = false;
|
|
m_useTitledb = false;
|
|
m_useClusterdb = false;
|
|
// do not add the "firstIp" tagrecs of the outlinks any more
|
|
// because it might hurt us?
|
|
m_useTagdb = false;
|
|
}
|
|
|
|
if ( cr->m_isCustomCrawl )
|
|
m_useLinkdb = false;
|
|
|
|
// . should we recycle the diffbot reply for this url?
|
|
// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
|
|
// our existing diffbot reply, i.e. recycle it, even though we
|
|
// respidered this page.
|
|
bool *recycle = getRecycleDiffbotReply();
|
|
if ( ! recycle || recycle == (void *)-1) return (char *)recycle;
|
|
// in that case inherit this from the old doc...
|
|
if ( od && *recycle && od->m_diffbotJSONCount &&
|
|
// somehow i am seeing that this is empty!
|
|
// this is how many title hashes of diffbot replies we've
|
|
// stored in the old doc's titlerec. if these are not equal
|
|
// and we call reindexJSONObjects() below then it cores
|
|
// in redoJSONObjects().
|
|
od->size_linkInfo2/4 == od->m_diffbotJSONCount &&
|
|
// only call this once otherwise we double stock
|
|
// m_diffbotTitleHashBuf
|
|
m_diffbotJSONCount == 0 ) {//cr->m_isCustomCrawl){
|
|
m_diffbotJSONCount = od->m_diffbotJSONCount;
|
|
m_sentToDiffbot = od->m_sentToDiffbot;
|
|
m_gotDiffbotSuccessfulReply = od->m_gotDiffbotSuccessfulReply;
|
|
// copy title hashes info. it goes hand in hand with the
|
|
// NUMBER of diffbot items we have.
|
|
int nh = 0;
|
|
int32_t *ohbuf = od->getDiffbotTitleHashes ( &nh );
|
|
if ( ! m_diffbotTitleHashBuf.safeMemcpy ( ohbuf , nh*4 ) )
|
|
return NULL;
|
|
ptr_linkInfo2 =(LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
|
|
size_linkInfo2=m_diffbotTitleHashBuf.length();
|
|
|
|
}
|
|
// but we might have to call reindexJSONObjects() multiple times if
|
|
// it would block
|
|
if ( od && *recycle &&
|
|
// only reindex if it is a query reindex i guess otherwise
|
|
// just leave it alone
|
|
m_sreqValid && m_sreq.m_isPageReindex &&
|
|
od->m_diffbotJSONCount &&
|
|
size_linkInfo2 ) {
|
|
// similar to od->nukeJSONObjects
|
|
int32_t *ohbuf =(int32_t *)m_diffbotTitleHashBuf.getBufStart();
|
|
int32_t nh =m_diffbotTitleHashBuf.length() / 4;
|
|
int32_t *status = reindexJSONObjects( ohbuf , nh );
|
|
if ( ! status || status == (void *)-1) return (char *)status;
|
|
}
|
|
|
|
|
|
// just delete the json items whose "title hashes" are present
|
|
// in the "old doc" but NOT i the "new doc".
|
|
// we use the title hash to construct a unique url for each json item.
|
|
// if the title hash is present in both the old and new docs then
|
|
// do not delete it here, but we will reindex it later in
|
|
// getMetaList() below when we call indexDoc() on each one after
|
|
// setting m_dx to each one.
|
|
bool nukeJson = true;
|
|
if ( ! od ) nukeJson = false;
|
|
if ( od && od->m_diffbotJSONCount <= 0 ) nukeJson = false;
|
|
// if recycling json objects, leave them there!
|
|
if ( *recycle ) nukeJson = false;
|
|
// you have to be a diffbot crawl to do this
|
|
// no, not if you have the diffbot api url set... so take this out
|
|
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
|
|
// do not remove old diffbot json objects if pageparser.cpp test
|
|
// because that can not change the index, etc.
|
|
if ( getIsPageParser() ) nukeJson = false;
|
|
|
|
if ( nukeJson ) {
|
|
// it should only nuke/delete the json items that we LOST,
|
|
// so if we still have the title hash in our latest
|
|
// diffbot reply, then do not nuke that json item, which
|
|
// will have a url ending in -diffbotxyz%" UINT32 "
|
|
// (where %" UINT32 " is the json item title hash).
|
|
// This will download the diffbot reply if not already there.
|
|
int32_t numHashes;
|
|
int32_t *th = getDiffbotTitleHashes(&numHashes);
|
|
if ( ! th && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
if ( ! th || th == (void *)-1 ) return (char *)th;
|
|
// this returns false if it blocks
|
|
int32_t *status = od->nukeJSONObjects( th , numHashes );
|
|
if ( ! status || status == (void *)-1) return (char *)status;
|
|
}
|
|
|
|
	// . need this if useTitledb is true
	// . otherwise XmlDoc::getTitleRecBuf() cores because it's invalid
	// . this cores if rebuilding just posdb because hashAll() needs
	//   the inlink texts for hashing
	//if ( m_useTitledb ) {
	LinkInfo *info1 = getLinkInfo1();
	if ( ! info1 || info1 == (LinkInfo *)-1 )
		return (char *)info1;
	//}

	// global debug
	g_od = od;

/*
|
|
// is the document content unchanged?
|
|
bool unchanged = false;
|
|
if ( od && od->m_contentHash32 == *ch32 ) unchanged = true;
|
|
// http status of 304 means "not modified since"
|
|
if ( od && *hs == 304 ) unchanged = true;
|
|
|
|
// compare to last time
|
|
if ( od && *tagHash != od->m_tagHash32 ) unchanged = false;
|
|
|
|
// do not do this if from pageparser.cpp
|
|
//if ( m_sreqValid && m_sreq.m_isPageParser ) unchanged = false;
|
|
if ( getIsPageParser() ) unchanged = false;
|
|
|
|
// force reindex if it was from query reindex (docid based spider req)
|
|
if ( m_sreqValid && m_sreq.m_urlIsDocId ) unchanged = false;
|
|
|
|
// if we were turked... how to tell????
|
|
if ( m_sreqValid && m_sreq.m_isInjecting ) unchanged = false;
|
|
|
|
// just turn it all off for now because our parsing logic might
|
|
// have changed
|
|
unchanged = false;
|
|
|
|
// set this i guess for doConsistencyTest()
|
|
m_unchanged = unchanged;
|
|
m_unchangedValid = true;
|
|
|
|
// . if doc content was unchanged just add the SpiderReply to the
|
|
// meta list so that spiderdb knows we attempted it at this time.
|
|
// . copy over member vars of the old titleRec/XmlDoc into us so
|
|
// we can save time and cpu
|
|
if ( unchanged ) {
|
|
// this seems to be an issue for blocking
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
// all done!
|
|
bool addReply = true;
|
|
// Scraper.cpp uses this
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
|
|
// return nothing if done
|
|
if ( ! addReply ) {
|
|
m_metaListSize = 0;
|
|
m_metaList = (char *)0x1;
|
|
return m_metaList;
|
|
}
|
|
// before getting our spider reply, assign crap from the old
|
|
// doc to us since we are unchanged! this will allow us to
|
|
// call getNewSpiderReply() without doing any processing, like
|
|
// setting the Xml or Words classes, etc.
|
|
copyFromOldDoc ( od );
|
|
// and don't forget to validate this
|
|
int32_t *ic = getIndexCode();
|
|
// should never block since we copied from old doc
|
|
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// get our spider reply
|
|
SpiderReply *newsr = getNewSpiderReply();
|
|
// return on error
|
|
if ( ! newsr ) return (char *)newsr;
|
|
// . panic on blocking! this is supposed to be fast!
|
|
// . it might still have to lookup the tagdb rec?????
|
|
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// unset our ptr_linkInfo1 so we do not free it and core
|
|
// since we might have set it in copyFromOldDoc() above
|
|
ptr_linkInfo1 = NULL;
|
|
size_linkInfo1 = 0;
|
|
// how much we need
|
|
int32_t needx = sizeof(SpiderReply) + 1;
|
|
// doledb key?
|
|
if ( m_doledbKey.n0 || m_doledbKey.n1 )
|
|
needx += 1 + sizeof(key_t); // + 4;
|
|
// the titledb unlock key for msg12 in spider.cpp
|
|
needx += 1 + sizeof(key_t);
|
|
// make the buffer
|
|
m_metaList = (char *)mmalloc ( needx , "metalist");
|
|
if ( ! m_metaList ) return NULL;
|
|
// save size for freeing later
|
|
m_metaListAllocSize = needx;
|
|
// ptr and boundary
|
|
m_p = m_metaList;
|
|
m_pend = m_metaList + needx;
|
|
// save it
|
|
char *saved = m_p;
|
|
// remove from doledb if we had a valid key (BEFORE adding SpiderReply)
|
|
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
|
|
// note it
|
|
setStatus ( "removing key from doledb" );
|
|
// . now remove the original spider rec from "doledb"
|
|
// . rdbid first
|
|
*m_p = RDB_DOLEDB;
|
|
m_p++;
|
|
// then the key
|
|
*(key_t *)m_p = m_doledbKey;
|
|
// nukey, clear del bit to delete it
|
|
*m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key_t);
|
|
// then zero for data size
|
|
// *(int32_t *)m_p = 0;
|
|
//m_p += 4;
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
}
|
|
// sanity check
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// . make a fake titledb key
|
|
// . remove the spider lock (Msg12 in Spider.cpp)
|
|
*m_p++ = RDB_FAKEDB;
|
|
*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
|
|
m_p += sizeof(key_t);
|
|
// now add the new rescheduled time
|
|
// note it
|
|
setStatus ( "adding SpiderReply to spiderdb" );
|
|
// rdbid first
|
|
*m_p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
|
|
m_p++;
|
|
// get this
|
|
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
// store the spider rec
|
|
int32_t newsrSize = newsr->getRecSize();
|
|
gbmemcpy ( m_p , newsr , newsrSize );
|
|
m_p += newsrSize;
|
|
// sanity check
|
|
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
// verify it
|
|
m_metaListValid = true;
|
|
// set size
|
|
m_metaListSize = m_p - m_metaList;
|
|
// all done
|
|
return m_metaList;
|
|
}
|
|
*/
|
|
|
|
	// so getSiteRank() works
	int32_t *sni = getSiteNumInlinks();
	if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
	// so addTable144 works
	uint8_t *langId = getLangId();
	if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;


	//
	// . before making the title rec we need to set all the ptrs!
	// . so at least now set all the data members we will need to
	//   serialize into the title rec because we can't be blocking further
	//   down below after we set all the hashtables and XmlDoc::ptr_ stuff
	if ( ! m_setFromTitleRec || m_useSecondaryRdbs ) {
		// all member vars should already be valid if set from titlerec
		char *ptg = prepareToMakeTitleRec ();
		// return NULL with g_errno set on error
		if ( ! ptg || ptg == (void *)-1 ) return (char *)ptg;
	}
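	// Reminder of the accessor convention used throughout this function
	// (a summary of existing behavior, not new behavior): the getX()
	// members return NULL with g_errno set on error, (void *)-1 if they
	// blocked and will re-enter via m_masterLoop later, or a valid
	// pointer when done. That is why each call site looks roughly like:
	//
	//   Foo *x = getFoo();
	//   if ( ! x || x == (void *)-1 ) return (char *)x;
	//
	// so both the error and the "would block" cases propagate up to the
	// caller of getMetaList() unchanged. ("Foo" here is illustrative.)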
|
|
|
|
// sanity check - if the valid title rec is null, m_indexCode is set!
|
|
//if ( ! *tr && ! m_indexCode ) { char *xx=NULL;*xx=0; }
|
|
// . bail. return an empty meta list, m_metaListSize should be 0!
|
|
// . NO! we need to add a SpiderReply to spiderdb...
|
|
//if ( ! *tr )
|
|
// log("HEY");
|
|
/*
|
|
if ( ! *tr ) {
|
|
m_metaList = "";
|
|
m_metaListSize = 0;
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
*/
|
|
|
|
	// get this for hashing stuff
	//Spam *sp = getSpam();
	//if ( ! sp || sp == (void *)-1 ) return (char *)sp;

	// our next slated spider priority
	char *spiderLinks3 = getSpiderLinks();
	if ( ! spiderLinks3 || spiderLinks3 == (char *)-1 )
		return (char *)spiderLinks3;

	bool spideringLinks = *spiderLinks3;

	// shortcut
	XmlDoc *nd = this;
|
|
|
|
	///////////////////////////////////
	///////////////////////////////////
	//
	//
	// if we had an error, do not add us to the index regardless,
	// although we might add SOME things depending on the error.
	// Like add the redirecting url if we had an ESIMPLIFIEDREDIR error.
	// So what we add to the Rdbs depends on the indexCode.
	//

	if ( m_indexCode ) nd = NULL;

	// OR if deleting from index, we just want to get the metalist
	// directly from "od"
	if ( m_deleteFromIndex ) nd = NULL;


	//
	//
	///////////////////////////////////
	///////////////////////////////////

	if ( ! nd )
		spideringLinks = false;

	// set these for getNewSpiderReply() so it can set
	// SpiderReply::m_wasIndexed and m_isIndexed...
	m_wasInIndex = false;
	m_isInIndex = false;
	if ( od ) m_wasInIndex = true;
	if ( nd ) m_isInIndex = true;
	m_wasInIndexValid = true;
	m_isInIndexValid = true;


	// if we are adding a simplified redirect as a link to spiderdb
	if ( m_indexCode == EDOCSIMPLIFIEDREDIR )
		spideringLinks = true;

	// likewise if the error was EDOCNONCANONICAL treat it like that
	if ( m_indexCode == EDOCNONCANONICAL )
		spideringLinks = true;
|
|
|
|
	//
	// . prepare the outlink info if we are adding links to spiderdb!
	// . do this before we start hashing so we do not block and re-hash!!
	//
	if ( spideringLinks && ! m_doingConsistencyCheck && m_useSpiderdb){
		setStatus ( "getting outlink info" );
		TagRec ***grv = getOutlinkTagRecVector();
		if ( ! grv || grv == (void *)-1 ) return (char *)grv;
		//char **iiv = getOutlinkIsIndexedVector();
		//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
		int32_t **ipv = getOutlinkFirstIpVector();
		if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
		//int8_t *hcv = getOutlinkHopCountVector();
		//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
		//char *ipi = getIsIndexed(); // is the parent indexed?
		//if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
	}


	// get the tag buf to add to tagdb
	SafeBuf *ntb = NULL;
	if ( m_useTagdb && ! m_deleteFromIndex ) {
		ntb = getNewTagBuf();
		if ( ! ntb || ntb == (void *)-1 ) return (char *)ntb;
	}


	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (void *)-1 ) return (char *)ww;
|
|
|
|
int64_t *pch64 = getExactContentHash64();
|
|
//int64_t *pch64 = getLooseContentHash64();
|
|
if ( ! pch64 || pch64 == (void *)-1 ) return (char *)pch64;
|
|
|
|
// get the voting table which we will add to sectiondb
|
|
SectionVotingTable *nsvt = NULL;
|
|
SectionVotingTable *osvt = NULL;
|
|
// seems like
|
|
// sectiondb takes up about 15% of the disk space like this. no!
|
|
// cuz then there is revdb, so we are 30%. so that's a no go.
|
|
bool addSectionVotes = false;
|
|
if ( nd ) addSectionVotes = true;
|
|
if ( ! m_useSectiondb ) addSectionVotes = false;
|
|
// to save disk space no longer add the roots! nto only saves sectiondb
|
|
// but also saves space in revdb
|
|
//if ( nd && *isRoot ) addSectionVotes = true;
|
|
if ( addSectionVotes ) {
|
|
nsvt = getNewSectionVotingTable();
|
|
if ( ! nsvt || nsvt == (void *)-1 ) return (char *)nsvt;
|
|
// get the old table too!
|
|
osvt = getNewSectionVotingTable();
|
|
if ( ! osvt || osvt == (void *)-1 ) return (char *)osvt;
|
|
}
|
|
|
|
// get the addresses for hashing tag hashes that indicate place names
|
|
Addresses *na = NULL;
|
|
//Addresses *oa = NULL;
|
|
if ( nd ) na = getAddresses();
|
|
//if ( od ) oa = od->getAddresses();
|
|
|
|
// get dates ready for hashing
|
|
Dates *ndp = NULL;
|
|
//Dates *odp = NULL;
|
|
if ( nd ) {
|
|
ndp = nd->getDates();
|
|
if ( ! ndp || ndp==(void *)-1) return (char *)ndp;
|
|
}
|
|
//if ( od ) {
|
|
// odp = od->getDates();
|
|
// if ( ! odp || odp==(void *)-1) return (char *)odp;
|
|
//}
|
|
|
|
// need firstip if adding a rebuilt spider request
|
|
if ( m_useSecondaryRdbs && ! m_isDiffbotJSONObject && m_useSpiderdb ) {
|
|
int32_t *fip = getFirstIp();
|
|
if ( ! fip || fip == (void *)-1 ) return (char *)fip;
|
|
}
|
|
|
|
|
|
// shit, we need a spider reply so that it will not re-add the
|
|
// spider request to waiting tree, we ignore docid-based
|
|
// recs that have spiderreplies in Spider.cpp
|
|
SpiderReply *newsr = NULL;
|
|
if ( m_useSpiderdb ) { // && ! m_deleteFromIndex ) {
|
|
newsr = getNewSpiderReply();
|
|
if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
|
|
}
|
|
|
|
// the site hash for hashing
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (char *)sh32;
|
|
|
|
// set ptr_clockCandidatesData
|
|
// if ( nd ) {
|
|
// HashTableX *cct = nd->getClockCandidatesTable();
|
|
// if ( ! cct || cct==(void *)-1) return (char *)cct;
|
|
// }
|
|
|
|
if ( m_useLinkdb && ! m_deleteFromIndex ) {
|
|
int32_t *linkSiteHashes = getLinkSiteHashes();
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
|
|
return (char *)linkSiteHashes;
|
|
}
|
|
|
|
//SafeBuf *au = getDiffbotApiUrl();
|
|
//if ( ! au || au == (void *)-1 ) return (char *)au;
|
|
|
|
|
|
// test json parser
|
|
//
|
|
/*
|
|
char *json = "{\"icon\":\"http://www.pixar.com/sites/default/files/pixar_2012_favicon_0.ico\",\"text\":\"\",\"title\":\"Pixar\",\"type\":\"article\",\"media\":[{\"primary\":\"true\",\"link\":\"http://www.pixar.com/sites/default/files/home_billboard_v7.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/roz1_0.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/home_bu-thumb_v1.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/mu_home_thumb.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/brenda_homepage.jpg\",\"type\":\"image\"}],\"url\":\"http://www.pixar.com/\"}";
|
|
char *dd = getNextJSONObject ( json );
|
|
if ( *dd ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
	///////////
	//
	// BEGIN the diffbot json object index hack
	//
	// if we are using diffbot, then each json object in the diffbot reply
	// should be indexed as its own document.
	//
	///////////


	// . get the reply of json objects from diffbot
	// . this will be empty if we are a json object!
	// . will also be empty if not meant to be sent to diffbot
	// . the TOKENIZED reply consists of \0 separated json objects that
	//   we create from the original diffbot reply
	SafeBuf *tdbr = getTokenizedDiffbotReply();
	if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr;
|
|
|
|
|
|
// i guess it is safe to do this after getting the spiderreply
|
|
SafeBuf *spiderStatusDocMetaList = NULL;
|
|
//if ( indexReply ) {
|
|
|
|
// get the spiderreply ready to be added to the rdbs w/ msg4
|
|
// but if doing a rebuild operation then do not get it, we'll rebuild
|
|
// it since it will have its own titlerec
|
|
if ( ! m_useSecondaryRdbs ) {
|
|
spiderStatusDocMetaList =
|
|
getSpiderStatusDocMetaList (newsr,forDelete);
|
|
if ( ! spiderStatusDocMetaList ) {
|
|
log("build: ss doc metalist null. bad!");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if ( spiderStatusDocMetaList == (void *)-1)
|
|
return (char *)spiderStatusDocMetaList;
|
|
//}
|
|
|
|
|
|
|
|
|
|
int32_t tdbrLen = tdbr->length();
|
|
|
|
// do not index json items as separate docs if we are page parser
|
|
if ( getIsPageParser() ) tdbrLen = 0;
|
|
|
|
// same goes if appending -diffbotxyz%UINT32 would be too long
|
|
if ( m_firstUrl.getUrlLen() + 11 + 10 > MAX_URL_LEN )
|
|
tdbrLen = 0;
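	// the 11 + 10 above is the worst-case suffix length: 11 bytes for
	// the literal "-diffbotxyz" plus up to 10 digits for a uint32 title
	// hash (4294967295), so any parent url longer than MAX_URL_LEN - 21
	// cannot host child json urls and the reply is skipped here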
|
|
|
|
	// once we have tokenized diffbot reply we can get a unique
	// hash of the title of each json item. that way, if a page changes
	// and it gains or loses a diffbot item, the old items will still
	// have the same url and we can set their m_indexCode to EDOCUNCHANGED
	// if the individual json item itself has not changed when we
	// call m_dx->indexDoc() below.
	int32_t numHashes = 0;
	int32_t *titleHashBuf = NULL;


	//
	// if we got a json object or two from diffbot, index them
	// as their own child xmldocs.
	// watch out for reply from diffbot of "-1" indicating error!
	//
	if ( tdbrLen > 3 ) {

		// get title hashes of the json items
		titleHashBuf = getDiffbotTitleHashes(&numHashes);
		if (!titleHashBuf || titleHashBuf == (void *)-1){
			char *xx=NULL;*xx=0;}

		// make sure diffbot reply is valid for sure
		if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
		// set status for this
		setStatus ( "indexing diffbot json doc");
		// new guy here
		if ( ! m_dx ) {
			try { m_dx = new ( XmlDoc ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("xmldoc: failed to alloc m_dx");
				return NULL;
			}
			mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
			// we now parse the array of products out of the
			// diffbot reply. each product is an item/object.
			m_diffbotObj = tdbr->getBufStart();
			m_diffbotJSONCount = 0;
		}
	// loop back up here to process next json object from below
	jsonloop:
|
|
// if m_dx has no url set, call set4 i guess
|
|
if ( ! m_dx->m_contentValid ) {
|
|
|
|
// sanity. ensure the json item we are trying to
|
|
// index has a title hash in this buf
|
|
if(m_diffbotJSONCount>=numHashes){char *xx=NULL;*xx=0;}
|
|
|
|
// get the title of the json we are indexing
|
|
int32_t jth = titleHashBuf [ m_diffbotJSONCount ];
|
|
|
|
// make the fake url for this json object for indexing
|
|
SafeBuf fakeUrl;
|
|
fakeUrl.set ( m_firstUrl.getUrl() );
|
|
// append -diffbot-0 etc. for fake url
|
|
fakeUrl.safePrintf("-diffbotxyz%" UINT32 "",
|
|
//(int32_t)m_diffbotJSONCount);
|
|
(uint32_t)jth);
|
|
if ( fakeUrl.length() > MAX_URL_LEN ) {
|
|
log("build: diffbot enhanced url too long for "
|
|
"%s",fakeUrl.getBufStart());
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
m_diffbotJSONCount++;
|
|
// this can go on the stack since set4() copies it
|
|
SpiderRequest sreq;
|
|
sreq.reset();
|
|
// string ptr
|
|
char *url = fakeUrl.getBufStart();
|
|
// use this as the url
|
|
strcpy( sreq.m_url, url );
|
|
// parentdocid of 0
|
|
int32_t firstIp = hash32n ( url );
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
sreq.setKey( firstIp,0LL, false );
|
|
sreq.m_isInjecting = 1;
|
|
sreq.m_isPageInject = 1;
|
|
sreq.m_hopCount = m_hopCount;
|
|
sreq.m_hopCountValid = m_hopCountValid;
|
|
sreq.m_fakeFirstIp = 1;
|
|
sreq.m_firstIp = firstIp;
|
|
// so we can match url filters' "insitelist" directive
|
|
// in Spider.cpp::getUrlFilterNum()
|
|
sreq.m_domHash32 = m_domHash32;
|
|
sreq.m_siteHash32 = m_siteHash32;
|
|
sreq.m_hostHash32 = m_siteHash32;
|
|
// set this
|
|
if (!m_dx->set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// give it a niceness of 1, we have
|
|
// to be careful since we are a
|
|
// niceness of 0!!!!
|
|
m_niceness, // 1 ,
|
|
// inject this content
|
|
m_diffbotObj,
|
|
false, // deleteFromIndex ,
|
|
0, // forcedIp ,
|
|
CT_JSON, // contentType ,
|
|
0, // lastSpidered ,
|
|
false )) // hasMime
|
|
// g_errno should be set!
|
|
return NULL;
|
|
// we are indexing json objects, don't use all these
|
|
m_dx->m_useClusterdb = false;
|
|
m_dx->m_useSpiderdb = false;
|
|
m_dx->m_useTagdb = false;
|
|
m_dx->m_usePlacedb = false;
|
|
m_dx->m_useLinkdb = false;
|
|
m_dx->m_isChildDoc = true;
|
|
m_dx->m_parentDocPtr = this;
|
|
// we like to sort json objects using
|
|
// 'gbsortby:spiderdate' query to get the most
|
|
// recent json objects, so this must be valid
|
|
if ( m_spideredTimeValid ) {
|
|
m_dx->m_spideredTimeValid = true;
|
|
m_dx->m_spideredTime = m_spideredTime;
|
|
}
|
|
|
|
m_dx->m_isDiffbotJSONObject = true;
|
|
}
|
|
|
|
		// when the indexdoc completes, or if it blocks, call us!
		// we should just pass through here
		//xd->setCallback ( this , getMetaListWrapper );
		m_dx->setCallback ( m_masterState , m_masterLoop );

		///////////////
		// . inject the content of the json using this fake url
		// . return -1 if this blocks
		// . if m_dx got its msg4 reply it ends up here, in which
		//   case do NOT re-call indexDoc() so check for
		//   m_listAdded.
		///////////////
		if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
			return (char *)-1;

		// critical error on our part trying to index it?
		// does not include timeouts or 404s, etc. mostly just
		// OOM errors.
		if ( g_errno ) return NULL;

		CollectionRec *cr = getCollRec();
		if ( ! cr ) return NULL;
		// count as added
		cr->m_localCrawlInfo.m_objectsAdded++;
		cr->m_globalCrawlInfo.m_objectsAdded++;
		cr->m_needsSave = true;
		// we successfully indexed the json object, skip to next one
		m_diffbotObj += gbstrlen(m_diffbotObj) + 1;
		// but gotta set this crap back
		log(LOG_INFO,"diffbot: resetting %s",m_dx->m_firstUrl.m_url);
		// clear for next guy if there is one. clears
		// m_dx->m_contentValid so the set4() can be called again above
		m_dx->reset();
		// have we breached the buffer of json objects? if not, do more
		if ( m_diffbotObj < tdbr->getBuf() ) goto jsonloop;
	}
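	// Recap of the child-doc loop that just ended (as inferred from the
	// code above): m_dx is a single reusable XmlDoc. m_dx->m_contentValid
	// gates whether set4() is called for the next \0-terminated json
	// object, m_dx->m_listAdded keeps us from re-calling indexDoc() when
	// we re-enter after its msg4 add completed, and m_dx->reset() plus
	// advancing m_diffbotObj moves on to the next object until the end
	// of the tokenized reply buffer is reached.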
|
|
|
|
/////
|
|
//
|
|
// END the diffbot json object index hack
|
|
//
|
|
/////
|
|
|
|
|
|
	//
	// CAUTION
	//
	// CAUTION
	//
	// We should never "block" after this point, lest the hashtables
	// we create get messed up.
	//

	//
	//
	// START HASHING
	//
	//


	// store what we hash into this table
	if ( (m_pbuf || m_storeTermListInfo) && ! m_wts ) {
		// init it. the value is a TermDebugInfo class. allowDups=true!
		m_wtsTable.set (12,sizeof(TermDebugInfo),
				0,NULL,0,true,m_niceness,
				"wts-tab");
		// point to it, make it active
		m_wts = &m_wtsTable;
	}
|
|
|
|
	// how much to alloc? compute an upper bound
	int32_t need = 0;
	// should we index this doc?
	bool index1 = true;

	setStatus ( "hashing posdb and datedb terms" );
	// . hash our document's terms into "tt1"
	// . hash the old document's terms into "tt2"
	// . by old, we mean the older versioned doc of this url spidered b4
	HashTableX tt1;
	HashTableX tt2;
	// how many words we got?
	int32_t nw = m_words.getNumWords();
	// . prepare it, 5000 initial terms
	// . make it nw*8 to avoid having to re-alloc the table!!!
	// . i guess we can have link and neighborhood text too! we don't
	//   count it here though... but add 5k for it...
	int32_t need4 = nw * 4 + 5000;
	if ( nd && index1 && m_usePosdb ) {
		if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,
				 "posdb-indx"))
			return NULL;
		int32_t did = tt1.m_numSlots;
		//bool index2 = true;
		// . hash the document terms into "tt1"
		// . this is a biggie!!!
		// . only hash ourselves if m_indexCode is false
		// . m_indexCode is non-zero if we should delete the doc from
		//   index
		// . i think this only adds to posdb
		//log("xmldoc: CALLING HASHALL");
		// shit, this blocks which is bad!!!
		char *nod = hashAll ( &tt1 ) ;
		// you can't block here because if we are re-called we lose tt1
		if ( nod == (char *)-1 ) { char *xx=NULL;*xx=0; }
		// error?
		if ( ! nod ) return NULL;
		int32_t done = tt1.m_numSlots;
		if ( done != did )
			log("xmldoc: reallocated big table! bad. old=%" INT32 " "
			    "new=%" INT32 " nw=%" INT32 "",did,done,nw);
	}
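	// Why the table is pre-sized and blocking is forbidden here: tt1
	// lives on this stack frame, so if hashAll() or anything after it
	// blocked and getMetaList() were re-entered, the table (and the
	// terms already hashed into it) would be lost and re-hashed. The
	// did/done check above only logs when the nw*4+5000 estimate was too
	// small and the table had to grow, which appears to be a performance
	// concern rather than a correctness one.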
|
|
|
|
// if indexing the spider reply as well under a different docid
|
|
// there is no reason we can't toss it into our meta list here
|
|
if ( spiderStatusDocMetaList )
|
|
need += spiderStatusDocMetaList->length();
|
|
|
|
// now we use revdb
|
|
// before hashing the old doc into it
|
|
//if ( od && index2 ) {
|
|
// // if this hash table init fails, return NULL
|
|
// if (!tt2.set(12,4,5000,NULL,0,false,m_niceness)) return NULL;
|
|
// char *rod = od->hash ( &tt2 ) ;
|
|
// if ( ! rod || rod == (char *)-1 ) return rod;
|
|
//}
|
|
// space for indexdb AND DATEDB! +2 for rdbids
|
|
int32_t needIndexdb = 0;
|
|
needIndexdb +=tt1.m_numSlotsUsed*(sizeof(key144_t)+2+sizeof(key128_t));
|
|
//needIndexdb+=tt2.m_numSlotsUsed * (sizeof(key_t)+2+sizeof(key128_t));
|
|
need += needIndexdb;
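	// rough byte accounting for the line above: each used slot in tt1
	// budgets sizeof(key144_t)=18 bytes for the posdb key plus
	// sizeof(key128_t)=16 bytes for the legacy datedb key plus 2 rdbId
	// bytes, i.e. 36 bytes per hashed term as an upper bound; the
	// "m_p - saved > needIndexdb" sanity check further down verifies we
	// never exceed it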
|
|
// sanity check
|
|
//if ( ! od && m_skipIndexing && needIndexdb ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . sanity check - must have one or the other!
|
|
// . well, not in the case of EDOCNOTNEW or EDOCNOTOLD, in which
|
|
// case we just remove ourselves from spiderdb, and in the case
|
|
// of EDOCNOTOLD, from tfndb as well
|
|
//if ( ! od && ! nd ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// what pub dates do the old and new doc have? -1 means none.
|
|
int32_t date1 = -1; if ( nd ) date1 = nd->m_pubDate;
|
|
//int32_t date2 = -1; if ( od ) date2 = od->m_pubDate;
|
|
|
|
// now we also add the title rec. true = ownsCbuf? ret NULL on error
|
|
// with g_errno set.
|
|
//if ( nd && ! nd->compress( true , m_niceness ) ) return NULL;
|
|
|
|
|
|
/*
|
|
now we have the bit in the posdb key, so this should not be needed...
|
|
use Posdb::isShardedByTermId() to see if it is such a special case key
|
|
like Hostdb::getShardNum() now does...
|
|
|
|
setStatus ( "hashing nosplit keys" );
|
|
// hash no split terms into ns1 and ns2
|
|
HashTableX ns1;
|
|
// prepare it, 500 initial terms
|
|
if ( ! ns1.set ( 18 , 4 , 500,NULL,0,false,m_niceness,"nosplt-indx" ))
|
|
return NULL;
|
|
// . hash for no splits
|
|
// . like above, but these are "no split" termids
|
|
if ( nd && m_usePosdb && ! hashNoSplit ( &ns1 ) ) return NULL;
|
|
//if(index2 && od && ! od->hashNoSplit ( &ns2 ) ) return NULL;
|
|
// needs for hashing no split terms
|
|
int32_t needNoSplit1 = 0;
|
|
// add em up. +1 for rdbId. add to both indexdb AND datedb i guess...
|
|
needNoSplit1 += ns1.m_numSlotsUsed * (18+1); // +16+1);
|
|
//needNoSplit += ns2.m_numSlotsUsed * (12+1+16+1);
|
|
// add it in
|
|
need += needNoSplit1;
|
|
// sanity check
|
|
//if ( ! od && m_skipIndexing && needNoSplit ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
|
|
setStatus ( "hashing sectiondb keys" );
|
|
// add in special sections keys. "ns" = "new sections", etc.
|
|
// add in the special nosplit datedb terms from the Sections class
|
|
// these hash into the term table so we can do incremental updating
|
|
HashTableX st1; // <key128_t,char> dt1;
|
|
//HashTableX st2; // <key128_t,char> dt2;
|
|
// set key/data size
|
|
int32_t svs = sizeof(SectionVote);
|
|
st1.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness,"sectdb-indx");
|
|
// tell hashtable to use the sectionhash for determining the slot,
|
|
// not the lower 4 bytes because that is the docid which is the
|
|
// same for every key
|
|
st1.m_maskKeyOffset = 6;
|
|
//st2.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness);
|
|
// do not bother if deleting
|
|
if ( m_indexCode ) nsvt = NULL;
|
|
|
|
// . now we hash the root just to get some section votes i guess
|
|
//if ( nts && ! *isr ) nsvt = NULL;
|
|
// if old voting table add more than 100,000 votes forget it!!! do
|
|
// not bloat sectiondb that big...
|
|
if ( osvt && osvt->m_totalSiteVoters >= MAX_SITE_VOTERS ) nsvt = NULL;
|
|
// hash terms into a table that uses full datedb keys
|
|
if ( nsvt && ! nsvt->hash (m_docId,&st1,*sh64,m_niceness)) return NULL;
|
|
// needs for hashing no split terms
|
|
int32_t needSectiondb = 0;
|
|
// add em up. plus one for rdbId
|
|
needSectiondb += st1.m_numSlotsUsed * (16+svs+1);
|
|
//needSectiondb += st2.m_numSlotsUsed * (16+svs+1);
|
|
// add it in
|
|
need += needSectiondb;
|
|
|
|
|
|
// Sections::respiderLineWaiters() adds one docid-based spider rec
|
|
// for every url waiting in line. Sections::m_numLineWaiters. assume
|
|
// 64 bytes per line waiter spider rec i guess
|
|
//int32_t needLineWaiters = 0;
|
|
// +1 for rdbId
|
|
//if ( ns ) needLineWaiters = ns->m_numLineWaiters * 64;
|
|
// forgot to add this?
|
|
//need += needLineWaiters;
|
|
// . for adding Sections.cpp keys
|
|
// . Sections::hash() does not bother with invalid sections
|
|
// . waitInLine might be true in Sections::hash() too, so always add 12
|
|
//if ( ns ) need += (ns->m_numSections - ns->m_numInvalids)*12 + 12;
|
|
//if ( os ) need += (os->m_numSections - os->m_numInvalids)*12 + 12;
|
|
|
|
|
|
// for adding Addresses::m_keys[] (Addresses::hash())
|
|
//if ( na ) need += (na->m_numKeys * 16);
|
|
//if ( oa ) need += (oa->m_numKeys * 16);
|
|
|
|
// don't forget Dates!
|
|
//if ( ndp ) need += ndp->m_numPubDates * sizeof(key_t);
|
|
//if ( odp ) need += odp->m_numPubDates * sizeof(key_t);
|
|
|
|
// clusterdb keys. plus one for rdbId
|
|
int32_t needClusterdb = 0;
|
|
//if ( nd && ! nd->m_skipIndexing ) needClusterdb += 13;
|
|
//if ( od && ! od->m_skipIndexing ) needClusterdb += 13;
|
|
if ( nd ) needClusterdb += 13;
|
|
//if ( od ) needClusterdb += 13;
|
|
need += needClusterdb;
|
|
|
|
// . LINKDB
|
|
// . linkdb records. assume one per outlink
|
|
// . we may index 2 16-byte keys for each outlink
|
|
Links *nl2 = NULL;
|
|
//if ( spideringLinks ) nl2 = &m_links;
|
|
// if injecting, spideringLinks is false, but then we don't
|
|
// add the links to linkdb, which causes the qainlinks() test to fail
|
|
nl2 = &m_links;
|
|
// do not bother if deleting. but we do add simplified redirects
|
|
// to spiderdb as SpiderRequests now.
|
|
int32_t code = m_indexCode;
|
|
if ( code == EDOCSIMPLIFIEDREDIR ) code = 0;
|
|
if ( code == EDOCNONCANONICAL ) code = 0;
|
|
if ( code ) nl2 = NULL;
|
|
//Links *ol = NULL; if ( od ) ol = od->getLinks();
|
|
// . set key/data size
|
|
// . use a 16 byte key, not the usual 12
|
|
// . use 0 for the data, since these are pure keys, which have no
|
|
// scores to accumulate
|
|
HashTableX kt1;
|
|
//HashTableX kt2;
|
|
int32_t nis = 0;
|
|
if ( nl2 && m_useLinkdb ) nis = nl2->getNumLinks() * 4;
|
|
// pre-grow table based on # outlinks
|
|
kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" );
|
|
// use magic to make fast
|
|
kt1.m_useKeyMagic = true;
|
|
// linkdb keys will have the same lower 4 bytes, so make hashing fast.
|
|
// they are 28 byte keys. bytes 20-23 are the hash of the linkEE
|
|
// so that will be the most random.
|
|
kt1.m_maskKeyOffset = 20;
|
|
// faster
|
|
//kt2.set ( sizeof(key128_t) , 0,0,NULL,0,false,m_niceness );
|
|
// do not add these
|
|
//bool add1 = true;
|
|
// do not add negative key if no old doc
|
|
//if ( ! od ) add2 = false;
|
|
// . we already have a Links::hash into the Termtable for links: terms,
|
|
// but this will have to be for adding to Linkdb. basically take a
|
|
// lot of it from Linkdb::fillLinkdbList()
|
|
// . these return false with g_errno set on error
|
|
if ( m_useLinkdb && nl2 && ! hashLinksForLinkdb(&kt1) ) return NULL;
|
|
//if ( add2 && ol && ! !od->m_skipIndexing &&
|
|
// ol->hash(&kt2,od,m_niceness) )
|
|
// return NULL;
|
|
// add up what we need. +1 for rdbId
|
|
int32_t needLinkdb = 0;
|
|
needLinkdb += kt1.m_numSlotsUsed * (sizeof(key224_t)+1);
|
|
//needLinkdb += kt2.m_numSlotsUsed * (sizeof(key128_t)+1);
|
|
need += needLinkdb;
|
|
// sanity check
|
|
//if ( ! od && m_skipIndexing && needLinkdb ) { char *xx=NULL;*xx=0; }
|
|
|
|
// PLACEDB
|
|
HashTableX pt1;
|
|
//HashTableX pt2;
|
|
// . set key/data size
|
|
// . limit every address to 512 bytes
|
|
pt1.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness,"placedb-indx");
|
|
//pt2.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness);
|
|
//
|
|
// if this is true, then we just store the placedb recs
|
|
// directly into the title rec. That way we do not have
|
|
// to store the content of the web page, and we save space.
|
|
//
|
|
// otherwise, we have to parse out the sections and it is much slower
|
|
//else if (oa && !oa->hashForPlacedb(m_docId,*sh32,*od->getIp(),&pt2) )
|
|
// return NULL;
|
|
// hash terms into a table that uses full datedb keys
|
|
if ( na && !na->hashForPlacedb(m_docId,*sh32,*nd->getIp(),&pt1))
|
|
return NULL;
|
|
|
|
|
|
setStatus("hashing place info");
|
|
int32_t needPlacedb = 0;
|
|
// . +1 for rdbId
|
|
// . up to 512 bytes per address
|
|
needPlacedb += pt1.m_numSlotsUsed * (sizeof(key128_t)+1+512);
|
|
//needPlacedb += pt2.m_numSlotsUsed * (sizeof(key128_t)+1+512);
|
|
need += needPlacedb;
|
|
// sanity check -- coring here because we respider the page and
|
|
// the address is gone so it tries to delete it!
|
|
//if ( ! od && m_skipIndexing && needPlacedb ) { char *xx=NULL;*xx=0; }
|
|
|
|
// we add a negative key to doledb usually (include datasize now)
|
|
int32_t needDoledb = sizeof(key_t) + 1 ; // + 4;
|
|
if ( forDelete ) needDoledb = 0;
|
|
need += needDoledb;
|
|
|
|
// for adding the SpiderReply to spiderdb (+1 for rdbId)
|
|
int32_t needSpiderdb1 = sizeof(SpiderReply) + 1;
|
|
if ( forDelete ) needSpiderdb1 = 0;
|
|
need += needSpiderdb1;
|
|
|
|
// if injecting we add a spiderrequest to be able to update it
|
|
// but don't do this if it is pagereindex. why is pagereindex
|
|
// setting the injecting flag anyway?
|
|
int32_t needSpiderdb3 = 0;
|
|
if ( m_sreqValid &&
|
|
m_sreq.m_isInjecting &&
|
|
m_sreq.m_fakeFirstIp &&
|
|
! m_sreq.m_forceDelete &&
|
|
// do not rebuild spiderdb if only rebuilding posdb
|
|
// this is explicitly for injecting so we need to add
|
|
// the spider request to spiderdb...
|
|
//m_useSpiderdb &&
|
|
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
|
|
! m_isDiffbotJSONObject ) {
|
|
needSpiderdb3 = m_sreq.getRecSize() + 1;
|
|
// NO! because when injecting a warc and the subdocs
|
|
// it contains, gb then tries to spider all of them !!! sux...
|
|
needSpiderdb3 = 0;
|
|
}
|
|
// or if we are rebuilding spiderdb
|
|
else if (m_useSecondaryRdbs && !m_isDiffbotJSONObject && m_useSpiderdb)
|
|
needSpiderdb3 = sizeof(SpiderRequest) + m_firstUrl.m_ulen+1;
|
|
|
|
need += needSpiderdb3;
|
|
|
|
//int32_t needSpiderdb3 = 0;
|
|
//if ( m_sreqValid ) needSpiderdb3 = m_sreq.getRecSize() + 1;
|
|
//need += needSpiderdb3;
|
|
|
|
// . for adding our outlinks to spiderdb
|
|
// . see SpiderRequest::getRecSize() for description
|
|
// . SpiderRequest::getNeededSize() will include the null terminator
|
|
int32_t hsize = SpiderRequest::getNeededSize ( 0 );
|
|
int32_t needSpiderdb2 = hsize * m_links.getNumLinks();
|
|
// and the url buffer of outlinks. includes \0 terminators i think
|
|
needSpiderdb2 += m_links.getLinkBufLen();
|
|
// don't need this if doing consistecy check
|
|
if ( m_doingConsistencyCheck ) needSpiderdb2 = 0;
|
|
// nor for generating the delete meta list for incremental indexing
|
|
if ( forDelete ) needSpiderdb2 = 0;
|
|
// accumulate it
|
|
need += needSpiderdb2;
|
|
|
|
// the new tags for tagdb
|
|
int32_t needTagdb = 0;
|
|
if ( ntb ) needTagdb = ntb->length() ;
|
|
// add 1 byte for up to 128 rdbids
|
|
//needTagdb += needTagdb/sizeof(Tag) + 1;
|
|
// add that in
|
|
need += needTagdb;
|
|
|
|
// . add in title rec size
|
|
// . should be valid because we called getTitleRecBuf() above
|
|
// . this should include the key
|
|
// . add in possible negative key for deleting old title rec
|
|
//int32_t needTitledb = sizeof(key96_t);
|
|
// +1 for rdbId
|
|
//if ( nd && m_useTitledb ) needTitledb = m_titleRecSize + 1;
|
|
//need += needTitledb;
|
|
|
|
|
|
//
|
|
// . CHECKSUM PARSING CONSISTENCY TEST
|
|
//
|
|
// . set m_metaListChecksum member (will be stored in titleRec header)
|
|
// . gotta set m_metaListCheckSum8 before making titleRec below
|
|
// . also, if set from titleRec, verify metalist is the same!
|
|
//
|
|
if ( ! m_computedMetaListCheckSum ) {
|
|
// do not call twice!
|
|
m_computedMetaListCheckSum = true;
|
|
// all keys in tt1, ns1, kt1 and pt1
|
|
int32_t ck32 = 0;
|
|
ck32 ^= tt1.getKeyChecksum32();
|
|
|
|
// show tt1
|
|
//
|
|
// UNCOMMENT this to debug parsing inconsistencies!!!
|
|
//
|
|
// SafeBuf sb;
|
|
// tt1.print(&sb);
|
|
// if(sb.getBufStart()) fprintf(stderr,"%s", sb.getBufStart());
|
|
|
|
//ck32 ^= ns1.getKeyChecksum32();
|
|
//ck32 ^= kt1.getKeyChecksum32();
|
|
//ck32 ^= pt1.getKeyChecksum32();
|
|
// set this before calling getTitleRecBuf() below
|
|
uint8_t currentMetaListCheckSum8 = (uint8_t)ck32;
|
|
// see if matches what was in old titlerec
|
|
if ( m_metaListCheckSum8Valid &&
|
|
// if we were set from a titleRec, see if we got
|
|
// a different hash of terms to index this time around...
|
|
m_setFromTitleRec &&
|
|
// fix for import log spam
|
|
! m_isImporting &&
|
|
m_version >= 120 &&
|
|
m_metaListCheckSum8 != currentMetaListCheckSum8 ) {
|
|
log("xmldoc: checksum parsing inconsistency for %s "
|
|
"(old)%i != %i(new). Uncomment tt1.print() "
|
|
"above to debug.",
|
|
m_firstUrl.getUrl(),
|
|
(int)m_metaListCheckSum8,
|
|
(int)currentMetaListCheckSum8);
|
|
// if doing qa test drop core
|
|
CollectionRec *cr = getCollRec();
|
|
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 ) {
|
|
log("xmldoc: sleep 1000");
|
|
sleep(1000);
|
|
exit(0);}//char *xx=NULL;*xx=0; }
|
|
}
|
|
// assign the new one, getTitleRecBuf() call below needs this
|
|
m_metaListCheckSum8 = currentMetaListCheckSum8;
|
|
m_metaListCheckSum8Valid = true;
|
|
}
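	// The consistency checksum above is deliberately tiny: a 32-bit XOR
	// of the posdb key checksums truncated to 8 bits and stored in the
	// titlerec header, so a respider (or a rebuild from the titlerec)
	// that hashes a different set of terms has roughly a 255/256 chance
	// of being flagged as a parsing inconsistency here.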
|
|
|
|
|
|
|
|
//
|
|
// now that we've set all the ptr_* members vars, we can make
|
|
// the title rec
|
|
//
|
|
|
|
// . MAKE the title rec from scratch, that is all we need at this point
|
|
// . sets m_indexCode to EDOCNOTNEW or EDOCNOTOLD sometimes
|
|
// . if repairing and not rebuilding titledb, we do not need the
|
|
// titlerec
|
|
if ( m_useTitledb ) {
|
|
// this buf includes key/datasize/compressdata
|
|
SafeBuf *tr = getTitleRecBuf ();
|
|
// panic if this blocks! it should not at this point because
|
|
// we'd have to re-hash the crap above
|
|
if ( tr == (void *) -1 ) { char *xx=NULL;*xx=0; }
|
|
// return NULL with g_errno set on error
|
|
if ( ! tr ) return (char *)tr;
|
|
// sanity check - if the valid title rec is null,
|
|
// m_indexCode is set!
|
|
if ( tr->length()==0 && ! m_indexCode ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// . add in title rec size
|
|
// . should be valid because we called getTitleRecBuf() above
|
|
// . this should include the key
|
|
// . add in possible negative key for deleting old title rec
|
|
int32_t needTitledb = sizeof(key96_t) + 1;
|
|
// +1 for rdbId
|
|
if ( nd && m_useTitledb && ! forDelete )
|
|
needTitledb += m_titleRecBuf.length();
|
|
// set new and old keys for titledb
|
|
//key_t ok;
|
|
key_t nk;
|
|
//ok.setMin();
|
|
nk.setMin();
|
|
//if ( od ) ok = *od->getTitleRecKey();
|
|
if ( nd && m_useTitledb ) nk = *nd->getTitleRecKey();
|
|
//if ( od && m_useTitledb && ok != nk ) needTitledb += sizeof(key_t)+1;
|
|
if ( m_useTitledb ) {
|
|
// then add it in
|
|
need += needTitledb;
|
|
// the titledb unlock key for msg12 in spider.cpp
|
|
need += sizeof(key_t);
|
|
}
|
|
|
|
//
|
|
// now space for the revdb record, which is the meta list itself!
|
|
//
|
|
//need = need + 12 + 4 + need;
|
|
|
|
// . alloc mem for metalist
|
|
// . sanity
|
|
if ( m_metaListSize > 0 ) { char *xx=NULL;*xx=0; }
|
|
// make the buffer
|
|
m_metaList = (char *)mmalloc ( need , "metalist");
|
|
if ( ! m_metaList ) return NULL;
|
|
// save size for freeing later
|
|
m_metaListAllocSize = need;
|
|
// ptr and boundary
|
|
m_p = m_metaList;
|
|
m_pend = m_metaList + need;
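	// Layout of the metalist being assembled below (matching what
	// verifyMetaList() and the old revdb scan code in this file expect):
	// a flat run of records, each one
	//
	//   [1 byte rdbId][key of getKeySizeFromRdbId(rdbId) bytes]
	//   [4 byte dataSize + data, only for rdbs with variable-size data]
	//
	// and a delete is encoded by clearing the low bit of the key
	// ( *m_p &= 0xfe ), the same trick used by the (now commented-out)
	// doledb removal code in this function.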
|
|
|
|
//
|
|
// TITLEDB
|
|
//
|
|
setStatus ("adding titledb recs");
|
|
// checkpoint
|
|
char *saved = m_p;
|
|
// . delete old title rec key if different
|
|
// . Repair.cpp might set useTitledb to false!
|
|
//if ( od && m_useTitledb && ok != nk ) {
|
|
// // rdbId
|
|
// *m_p++ = RDB_TITLEDB;
|
|
// // key
|
|
// *(key_t *)m_p = *od->getTitleRecKey();
|
|
// // make it negative
|
|
// *m_p &= 0xfe;
|
|
// // skip over it
|
|
// m_p += sizeof(key_t);
|
|
// // then data size, 0
|
|
// //*(int32_t *)m_p = 0;
|
|
// //m_p+= 4;
|
|
//}
|
|
// . store title rec
|
|
// . Repair.cpp might set useTitledb to false!
|
|
if ( nd && m_useTitledb ) {
|
|
// rdbId
|
|
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_TITLEDB2;
|
|
else *m_p++ = RDB_TITLEDB;
|
|
// sanity
|
|
if ( ! nd->m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
|
|
// key, dataSize, data is the whole rec
|
|
int32_t tsize = nd->m_titleRecBuf.length();
|
|
// if getting an "oldList" to do incremental posdb updates
|
|
// then do not include the data portion of the title rec
|
|
if ( forDelete ) tsize = sizeof(key_t);
|
|
gbmemcpy ( m_p , nd->m_titleRecBuf.getBufStart() , tsize );
|
|
// make it a negative key
|
|
//if ( forDelete ) *m_p = *m_p & 0xfe;
|
|
m_p += tsize;//nd->m_titleRecSize;
|
|
// store a zero datasize, key is still positive until the dt8
|
|
// table deletes it
|
|
//if ( forDelete ) { *(int32_t *)m_p = 0; m_p += 4; }
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needTitledb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// ADD BASIC INDEXDB/DATEDB TERMS
|
|
//
|
|
setStatus ( "adding posdb and datedb terms");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// store indexdb terms into m_metaList[]
|
|
if ( m_usePosdb && ! addTable144 ( &tt1 , m_docId )) return NULL;
|
|
//if(!addTable96 ( &tt2, &tt1, date2, date1, true ,false)) return NULL;
|
|
//if ( od ) tt2.clear();
|
|
// sanity check
|
|
if ( m_p - saved > needIndexdb ) { char*xx=NULL;*xx=0; }
|
|
// free all mem
|
|
tt1.reset();
|
|
//tt2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// ADD NOSPLIT INDEXDB/DATEDB TERMS
|
|
//
|
|
/*
|
|
we added these now in hashAll() to tt1, no longer ns1 since we
|
|
have the sharded by termid bit in the actual posdb key now so
|
|
Rebalance.cpp works
|
|
|
|
setStatus ( "adding posdb shardByTermId terms");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// no longer anything special now since the
|
|
// Posdb::isShardedyTermId() bit
|
|
// is in the key now so Rebalance.cpp can work
|
|
if ( m_usePosdb && ! addTable144 ( &ns1 )) return NULL;
|
|
//if(! addTable96 ( &ns2, &ns1, -1, -1, true ,true)) return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needNoSplit1 ) { char*xx=NULL;*xx=0; }
|
|
// free all mem
|
|
ns1.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
*/
|
|
|
|
|
|
/*
|
|
setStatus ( "adding datedb nosplit terms");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// this is now for datedb
|
|
if ( m_useDatedb && ! addTableDate(&ns2,m_docId,RDB_DATEDB,true))
|
|
return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needNoSplit2 ) { char*xx=NULL;*xx=0; }
|
|
// free all mem
|
|
ns2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
*/
|
|
|
|
//
|
|
// ADD SECTIONS SPECIAL TERMS
|
|
//
|
|
setStatus ( "adding sectiondb keys");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// add that table to the metalist
|
|
if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete))
|
|
return NULL;
|
|
//if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; }
|
|
// free mem
|
|
st1.reset();
|
|
//st2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
//
|
|
// ADD CLUSTERDB KEYS
|
|
//
|
|
setStatus ( "adding clusterdb keys" );
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . do we have adult content?
|
|
// . should already be valid!
|
|
if ( nd && ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
|
|
// . get new clusterdb key
|
|
// . we use the host hash for the site hash! hey, this is only 26 bits!
|
|
key_t newk ; newk.setMin();
|
|
if ( nd )
|
|
newk = g_clusterdb.makeClusterRecKey ( *nd->getDocId() ,
|
|
*nd->getIsAdult() ,
|
|
*nd->getLangId(),
|
|
nd->getHostHash32a(),
|
|
false ); // del?
|
|
//key_t oldk; oldk.setMin();
|
|
//if ( od ) // && add2 )
|
|
// oldk = g_clusterdb.makeClusterRecKey ( *od->getDocId(),
|
|
// *od->getIsAdult() ,
|
|
// *od->getLangId() ,
|
|
// od->getHostHash32a(),
|
|
// true ); // del?
|
|
// . store old only if new tr is good and keys are different from old
|
|
// . now we store even if skipIndexing is true because i'd like to
|
|
// see how many titlerecs we have and count them towards the
|
|
// docsIndexed count...
|
|
if ( nd && m_useClusterdb ) {
|
|
// store rdbid
|
|
*m_p = RDB_CLUSTERDB;
|
|
// use secondary if we should
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
|
|
// skip
|
|
m_p++;
|
|
// and key
|
|
*(key_t *)m_p = newk;
|
|
// skip it
|
|
m_p += sizeof(key_t);
|
|
}
|
|
// store new if different
|
|
//if ( od && ( ! nd || newk != oldk ) ) { // && !od->m_skipIndexing ) {
|
|
// // store rdbid
|
|
// *m_p = RDB_CLUSTERDB;
|
|
// // use secondary if we should
|
|
// if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
|
|
// // skip
|
|
// m_p++;
|
|
// // turn on last bit (undo del)
|
|
// //newk.n0 |= 0x01;
|
|
// // and key
|
|
// *(key_t *)m_p = oldk;
|
|
// // skip it
|
|
// m_p += sizeof(key_t);
|
|
//}
|
|
// sanity check
|
|
if ( m_p - saved > needClusterdb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
|
|
//
|
|
// ADD LINKDB KEYS
|
|
//
|
|
setStatus ( "adding linkdb keys" );
|
|
// checkpoint
|
|
saved = m_p;
|
|
// add that table to the metalist (LINKDB)
|
|
if ( m_useLinkdb && !addTable224(&kt1))
|
|
return NULL;
|
|
//if(add2&&!addTable128(&kt2,&kt1,RDB_LINKDB, false))return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needLinkdb ) { char *xx=NULL;*xx=0; }
|
|
// all done
|
|
kt1.reset();
|
|
//kt2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// . ADD ADDRESSES TO NAMEDB/PLACEDB
|
|
// . key is basically a hash of the address (excluding place name
|
|
// and street indicators)
|
|
//
|
|
setStatus ( "adding to placedb" );
|
|
// checkpoint
|
|
saved = m_p;
|
|
// add that table to the metalist
|
|
if ( m_usePlacedb && ! addTable128 ( &pt1, RDB_PLACEDB,forDelete))
|
|
return NULL;
|
|
//if(! addTable128 ( &pt2, &pt1, RDB_PLACEDB, true , true))return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needPlacedb ) { char *xx=NULL;*xx=0; }
|
|
// free mem
|
|
pt1.reset();
|
|
//pt2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
/*
|
|
//
|
|
// ADD REVDB RECORD
|
|
//
|
|
|
|
//
|
|
// . add the metalist to itself
|
|
// . this way, when we delete this doc from the index, we just
|
|
// lookup the original metalist in revdb, set all the
|
|
// delbits, and re-add that. this avoid having to ensure
|
|
// parsing consistency, which is a royal pain in the ass
|
|
// . now we also update getMetaList() to check revdb to get
|
|
// the meta list if the doc is already indexed...
|
|
//
|
|
// define current meta list
|
|
char *x = m_metaList;
|
|
char *xend = m_p;
|
|
// skip adding to revdb?
|
|
if ( ! m_useRevdb ) xend = x;
|
|
int32_t *dataSizePtr;
|
|
char *savedp;
|
|
// if nothing in current list do not add revdb rec
|
|
bool hadStuff = ( x < xend);
|
|
if ( hadStuff ) {
|
|
// put in the rdbId
|
|
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_REVDB2;
|
|
else *m_p++ = RDB_REVDB;
|
|
// the key
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
*(key_t *)m_p = g_revdb.makeKey ( m_docId , false );
|
|
m_p += sizeof(key_t);
|
|
// data size
|
|
dataSizePtr = (int32_t *)m_p;
|
|
// skip for now
|
|
m_p += 4;
|
|
// save it
|
|
savedp = m_p;
|
|
}
|
|
// scan the current metalist and add keys to the revdb record
|
|
for ( ; x < xend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save this
|
|
char byte = *x;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
//
|
|
// convert if adding to secondary rdbids!!!!!!!!
|
|
//
|
|
if ( m_useSecondaryRdbs ) {
|
|
if ( rdbId == RDB2_POSDB2 )
|
|
rdbId = RDB_POSDB;
|
|
else if ( rdbId == RDB2_DATEDB2 )
|
|
rdbId = RDB_DATEDB;
|
|
else if ( rdbId == RDB2_SECTIONDB2 )
|
|
rdbId = RDB_SECTIONDB;
|
|
else if ( rdbId == RDB2_PLACEDB2 )
|
|
rdbId = RDB_PLACEDB;
|
|
else if ( rdbId == RDB2_TITLEDB2 )
|
|
rdbId = RDB_TITLEDB;
|
|
else if ( rdbId == RDB2_LINKDB2 )
|
|
rdbId = RDB_LINKDB;
|
|
else if ( rdbId == RDB2_CLUSTERDB2 )
|
|
rdbId = RDB_CLUSTERDB;
|
|
else if ( rdbId == RDB2_SPIDERDB2 )
|
|
rdbId = RDB_SPIDERDB;
|
|
else if ( rdbId == RDB2_TAGDB2 )
|
|
rdbId = RDB_TAGDB;
|
|
// must be covered!!
|
|
else { char *xx=NULL;*xx=0; }
|
|
// rewrite byte now b/c we store it below
|
|
byte = (byte & 0x80) | rdbId;
|
|
}
|
|
// skip that
|
|
x++;
|
|
// copy that over
|
|
*m_p++ = byte;
|
|
// sanity check -- no negative keys allowed in here
|
|
if ( (x[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// copy that over
|
|
gbmemcpy ( m_p , x , ks );
|
|
// skip that
|
|
m_p += ks;
|
|
x += ks;
|
|
// datasize?
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
if ( ds == -1 ) {
|
|
ds = *(int32_t *)x;
|
|
x += 4;
|
|
}
|
|
// skip data
|
|
x += ds;
|
|
}
|
|
// record size of what we wrote
|
|
if ( hadStuff )
|
|
*dataSizePtr = ( m_p - savedp );
|
|
// sanity check
|
|
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
*/
|
|
|
|
	//////
	//
	// add SPIDERREPLY BEFORE the SPIDERREQUEST!!!
	//
	// add spider reply first so we do not immediately respider
	// this same url if we were injecting it because no SpiderRequest
	// may have existed, and SpiderColl::addSpiderRequest() will
	// spawn a spider of this url again unless there is already a REPLY
	// in spiderdb!!! crazy...
	bool addReply = true;
	// Scraper.cpp uses this
	if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
	// save it
	saved = m_p;
	// now add the new rescheduled time
	if ( addReply && m_useSpiderdb && ! forDelete ) {
		// note it
		setStatus ( "adding SpiderReply to spiderdb" );
		// rdbid first
		*m_p = RDB_SPIDERDB;
		// use secondary?
		if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
		m_p++;
		// get this
		if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
		// store the spider rec
		int32_t newsrSize = newsr->getRecSize();
		gbmemcpy ( m_p , newsr , newsrSize );
		m_p += newsrSize;

		m_addedSpiderReplySize = newsrSize;
		m_addedSpiderReplySizeValid = true;

		// sanity check - must not be a request, this is a reply
		if ( g_spiderdb.isSpiderRequest( &newsr->m_key ) ) {
			char *xx=NULL;*xx=0; }
		// sanity check
		if ( m_p - saved != needSpiderdb1 ) { char *xx=NULL;*xx=0; }
		// sanity check
		verifyMetaList( m_metaList , m_p , forDelete );
	}
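	// Ordering note: the SpiderReply record above is deliberately placed
	// in the metalist before any SpiderRequest added below, since (per
	// the comments elsewhere in this function) the requests are
	// evaluated after the replies, and seeing the reply first prevents
	// an injected url from being immediately rescheduled for spidering.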
|
|
|
|
|
|
// if we are injecting we must add the spider request
|
|
// we are injecting from so the url can be scheduled to be
|
|
// spidered again.
|
|
// NO! because when injecting a warc and the subdocs
|
|
// it contains, gb then tries to spider all of them !!! sux...
|
|
if ( needSpiderdb3 ) {
|
|
// note it
|
|
setStatus("adding spider request");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// store it here
|
|
SpiderRequest revisedReq;
|
|
|
|
// if doing a repair/rebuild of spiderdb...
|
|
if ( m_useSecondaryRdbs )
|
|
getRebuiltSpiderRequest ( &revisedReq );
|
|
|
|
// this fills it in for doing injections
|
|
if ( ! m_useSecondaryRdbs ) {
|
|
getRevisedSpiderRequest ( &revisedReq );
|
|
// sanity log
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
// sanity log
|
|
if ( m_firstIp == 0 || m_firstIp == -1 ) {
|
|
char *url = "unknown";
|
|
if ( m_sreqValid ) url = m_sreq.m_url;
|
|
log("build: error3 getting real firstip of "
|
|
"%" INT32 " for %s. not adding new request.",
|
|
(int32_t)m_firstIp,url);
|
|
goto skipNewAdd2;
|
|
}
|
|
}
|
|
|
|
// copy it
|
|
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_SPIDERDB2;
|
|
else *m_p++ = RDB_SPIDERDB;
|
|
// store it back
|
|
gbmemcpy ( m_p , &revisedReq , revisedReq.getRecSize() );
|
|
// skip over it
|
|
m_p += revisedReq.getRecSize();
|
|
// sanity check
|
|
if ( m_p - saved > needSpiderdb3 ) { char *xx=NULL;*xx=0; }
|
|
|
|
m_addedSpiderRequestSize = revisedReq.getRecSize();
|
|
m_addedSpiderRequestSizeValid = true;
|
|
|
|
}
|
|
|
|
skipNewAdd2:
|
|
|
|
//
|
|
// ADD SPIDERDB RECORDS of outlinks
|
|
//
|
|
// - do this AFTER computing revdb since we do not want spiderdb recs
|
|
// to be in revdb.
|
|
//
|
|
setStatus ( "adding spiderdb keys" );
|
|
// sanity check. cannot spider until in sync
|
|
if ( ! isClockInSync() ) { char *xx=NULL;*xx=0; }
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . should be fixed from Links::setRdbList
|
|
// . we should contain the msge that msg16 uses!
|
|
// . we were checking m_msg16.m_recycleContent, but i have not done
|
|
// that in years!!! MDW
|
|
// . we were also checking if the # of banned outlinks >= 2, then
|
|
// we would not do this...
|
|
// . should also add with a time of now plus 5 seconds to that if
|
|
// we spider an outlink linkdb should be update with this doc
|
|
// pointing to it so it can get link text then!!
|
|
if ( spideringLinks && nl2 && ! m_doingConsistencyCheck &&
|
|
m_useSpiderdb && ! forDelete ){
|
|
// returns NULL and sets g_errno on error
|
|
char *ret = addOutlinkSpiderRecsToMetaList ();
|
|
// sanity check
|
|
if ( ! ret && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// return NULL on error
|
|
if ( ! ret ) return NULL;
|
|
// this MUST not block down here, to avoid re-hashing above
|
|
if ( ret == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needSpiderdb2 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// ADD TAG RECORDS TO TAGDB
|
|
//
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . only do this if NOT setting from a title rec
|
|
// . it might add a bunch of forced spider recs to spiderdb
|
|
// . store into tagdb even if indexCode is set!
|
|
if ( ntb && m_useTagdb && ! forDelete ) {
|
|
// ntb is a safebuf of Tags, which are already Rdb records
|
|
// so just gbmemcpy them directly over
|
|
char *src = ntb->getBufStart();
|
|
int32_t srcSize = ntb->length();
|
|
gbmemcpy ( m_p , src , srcSize );
|
|
m_p += srcSize;
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needTagdb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
|
|
//
|
|
// ADD INDEXED SPIDER REPLY with different docid so we can
|
|
// search index of spider replies! (NEW!)
|
|
//
|
|
// . index spider reply with separate docid so they are all searchable.
|
|
// . see getSpiderStatusDocMetaList() function to see what we index
|
|
// and the titlerec we create for it
|
|
if ( spiderStatusDocMetaList ) {
|
|
gbmemcpy ( m_p ,
|
|
spiderStatusDocMetaList->getBufStart() ,
|
|
spiderStatusDocMetaList->length() );
|
|
m_p += spiderStatusDocMetaList->length();
|
|
m_addedStatusDocSize = spiderStatusDocMetaList->length();
|
|
m_addedStatusDocSizeValid = true;
|
|
}
|
|
|
|
/*
|
|
//
|
|
// ADD FORCED RESPIDER DOCID-BASED SPIDER RECS for Sections
|
|
//
|
|
// used by Sections.cpp to respider docs because we just identified an
|
|
// article section and they need to be re-indexed to take advantage
|
|
// of that
|
|
//
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . only do this if NOT setting from a title rec
|
|
// . it might add a bunch of forced spider recs to spiderdb
|
|
if ( ! m_setFromTitleRec && nd ) { // && ! m_isInjecting ) {
|
|
Sections *ss = &m_sections;
|
|
m_p = ss->respiderLineWaiters ( m_p , m_pend );
|
|
if ( ! m_p ) return NULL;
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needLineWaiters ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
*/
|
|
|
|
|
|
//
|
|
// NOW UPDATE OURSELVES (OUR URL) IN SPIDERDB
|
|
//
|
|
|
|
// but not if injecting!
|
|
//if ( ! m_sreqValid ) {
|
|
// // set the list size, different from the alloc size
|
|
// m_metaListSize = m_p - m_metaList;
|
|
// // all done
|
|
// return m_metaList;
|
|
//}
|
|
|
|
// note it
|
|
//setStatus ( "deleting old spider rec key" );
|
|
// rdbid first
|
|
// *p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
//if ( m_useSecondaryRdbs ) *p = RDB2_SPIDERDB2;
|
|
//p++;
|
|
// must be legit
|
|
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
// then the key
|
|
// *(key_t *)p = m_sreq.m_key;
|
|
// nukey, clear del bit to delete it
|
|
// *p &= 0xfe;
|
|
// skip key
|
|
//p += sizeof(key_t);
|
|
|
|
// int16_tcut
|
|
saved = m_p;
|
|
/*
|
|
|
|
See comment under DOLEDB above! this approach is no longer used.
|
|
|
|
// . remove from doledb if we had a valid key
|
|
// . DO THIS BEFORE adding the SpiderReply since
|
|
// Spider.cpp::addSpiderReply() will
|
|
// decrement the count for firstIp in m_doleIpTable
|
|
if ( (m_doledbKey.n0 || m_doledbKey.n1) &&
|
|
! m_useSecondaryRdbs &&
|
|
// do not add if we are generating the meta list for incremental
|
|
// indexing purposes from an old doc
|
|
! forDelete ) {
|
|
// note it
|
|
setStatus ( "removing key from doledb" );
|
|
// . now remove the original spider rec from "doledb"
|
|
// . rdbid first
|
|
*m_p = RDB_DOLEDB;
|
|
m_p++;
|
|
// then the key
|
|
*(key_t *)m_p = m_doledbKey;
|
|
// nukey, clear del bit to delete it
|
|
*m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key_t);
|
|
// datasize is 0
|
|
// *(int32_t *)m_p = 0;
|
|
//m_p += 4;
|
|
// sanity check
|
|
if ( m_p - saved != needDoledb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
}
|
|
*/
|
|
|
|
// note it
|
|
//setStatus ( "removing spider lock");
|
|
// . make a fake titledb key
|
|
// . remove the spider lock (Msg12 in Spider.cpp)
|
|
// . no need to do this if called from Repair.cpp
|
|
// . the uh48 is zero, that means fake!
|
|
// . i added "&& m_useSpiderdb" here because it was messing up
|
|
// the cacheTermLists() function which ONLY wants posdb keys and
|
|
// any other keys in the metalist messes it up. MDW 1/26/13
|
|
// . now SPider.cpp uses SpiderReply reception to remove lock
|
|
// - mdw 9/28/13
|
|
//if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) {
|
|
// *m_p++ = RDB_FAKEDB;
|
|
// ((key_t *)m_p)->n1 = 0;
|
|
// ((key_t *)m_p)->n0 = m_docId;
|
|
// //= g_titledb.makeKey ( m_docId , 0LL , true );
|
|
// m_p += sizeof(key_t);
|
|
//}
|
|
|
|
|
|
// MDW: new spider algo does not need this
|
|
/*
|
|
// save it
|
|
saved = m_p;
|
|
// re-add the same request since it was removed from Spider.cpp's
|
|
// m_urlBuf and the associated orderTree,ipTree, etc. and now
|
|
// since we are un-doling (undoling) it we need to re-add and this
|
|
// is the easiest way. it really was never removed from spiderdb
|
|
// but it will no longer be in the spider's cache since we delete
|
|
// it from there when we add it to doledb. so this is just a quick
|
|
// way of getting it back into the cache.
|
|
// now, we add this first since now Rdb.cpp calls evaluateAllRequests()
|
|
// AFTER the REPLY now
|
|
if ( m_sreqValid &&
|
|
// page parser has an invalid firstIp which causes printMetaList()
|
|
// to core when trying to print this out, so don't add it when
|
|
// doing page parser
|
|
! m_sreq.m_isPageParser ) {
|
|
// note it
|
|
setStatus ( "adding SpiderRequest back to spiderdb" );
|
|
// rdbid first
|
|
*m_p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
|
|
m_p++;
|
|
// store the spider rec
|
|
int32_t size = m_sreq.getRecSize();
|
|
gbmemcpy ( m_p , &m_sreq , size );
|
|
// set this one bit
|
|
SpiderRequest *rr = (SpiderRequest *)m_p;
|
|
rr->m_readd = 1;
|
|
// and hafta reset this junk otherwise it cores
|
|
// (see Spider.h::SpiderRequest::reset())
|
|
rr->m_ufn = -1;
|
|
rr->m_priority = -1;
|
|
rr->m_doled = 0;
|
|
// skip over the whole rec
|
|
m_p += size;
|
|
// sanity check - must not be a request, this is a reply
|
|
if ( ! g_spiderdb.isSpiderRequest( &m_sreq.m_key ) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( m_p - saved != needSpiderdb3 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
}
|
|
*/
|
|
|
|
// sanity check
|
|
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
/////////////////
|
|
//
|
|
// INCREMENTAL INDEXING / INCREMENTAL UPDATING
|
|
//
|
|
// now prune/manicure the metalist to remove records that
|
|
// were already added, and insert deletes for records that
|
|
// changed since the last time. this is how we do deletes
|
|
// now that we have revdb. this allows us to avoid
|
|
// parsing inconsistency errors.
|
|
//
|
|
/////////////////
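// In practice this works in three passes: (1) hash every key of the
// old meta list into dt8 (value = pointer to the rdbId/key in the old
// list), (2) walk the freshly built meta list and copy out only the
// records whose keys are not already in dt8 (removing matches from
// dt8 as we go), and (3) append whatever is left over in dt8 as
// negative (delete) keys, since those records existed last time but
// not in this version of the doc.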
|
|
|
|
// disable for parsing consistency testing of already indexed docs
|
|
//oldList = NULL;
|
|
|
|
if ( oldList ) { // && oldList->m_listSize > 16 ) {
|
|
// point to start of the old meta list, the first and only
|
|
// record in the oldList
|
|
char *om = oldList;// + 12 + 4;
|
|
// the size
|
|
int32_t osize = oldListSize;//*(int32_t *)(oldList + 12);
|
|
// the end
|
|
char *omend = om + osize;
|
|
int32_t needx = 0;
|
|
// init these. data is just the rdbid, a single byte.
|
|
//HashTableX dt12;
|
|
//HashTableX dt16;
|
|
//char dbuf12[30000];
|
|
//char dbuf16[40000];
|
|
//dt12.set ( 12,1,2048,dbuf12,30000,false,m_niceness);
|
|
//dt16.set ( 16,1,2048,dbuf16,40000,false,m_niceness);
|
|
HashTableX dt8;
|
|
char dbuf8[34900];
|
|
// value is the ptr to the rdbId/key in the oldList
|
|
dt8.set ( 8,sizeof(char *),2048,dbuf8,34900,
|
|
false,m_niceness,"dt8-tab");
|
|
// just for linkdb:
|
|
//HashTableX dt9;
|
|
//char dbuf9[30000];
|
|
//dt9.set ( 8,4,2048,dbuf9,30000,false,m_niceness,"dt9-tab");
|
|
// scan recs in the old list and hash them
|
|
for ( char *p = om ; p < omend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save this
|
|
char byte = *p;
|
|
// save this
|
|
char *rec = p;
|
|
// get the rdbid for this rec
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// get the key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// get that
|
|
char *k = p;
|
|
// unlike a real meta list, this meta list has
|
|
// no data field, just rdbIds and keys only! because
|
|
// we only use it for deleting, which only requires
|
|
// a key and not the data
|
|
p += ks;
|
|
// tally this up in case we have to add the delete
|
|
// version of this key back (add 1 for rdbId)
|
|
needx += ks + 1;
|
|
// always re-add titledb record!
|
|
// if our current/new list is basically empty
|
|
// except for a SpiderReply because it got deleted
|
|
// from the index, we need to store the titledb key
|
|
// in dt8 so we can add it as a negative! so i
|
|
// don't really know what this was trying to fix
|
|
// because it broke that!
|
|
//if ( rdbId == RDB_TITLEDB ) continue;
|
|
// for linkdb, sometimes we also add a "lost" link
|
|
// key in addition to deleting the old key! see below
|
|
if ( rdbId == RDB_LINKDB ) needx += ks + 1;
|
|
// do not add it if datasize > 0
|
|
uint64_t hk;
|
|
// do not include discovery or lost dates in the
|
|
// linkdb key...
|
|
if ( rdbId == RDB_LINKDB )
|
|
hk = hash64 (k+12,ks-12);
|
|
else
|
|
hk = hash64 (k,ks);
|
|
// sanity check
|
|
if ( rdbId == RDB_LINKDB &&
|
|
g_linkdb.getLinkerDocId_uk((key224_t *)k)!=
|
|
m_docId ) {
|
|
char *xx=NULL;*xx=0; }
|
|
//if ( getDataSize(rdbId) != 0 ) continue;
|
|
// hash this key
|
|
//bool status;
|
|
// sectiondb keys all have the same last few bits...
|
|
// so this clogs up the hash table.
|
|
// so mix up the key bits for hashing
|
|
//uint64_t hk = hash64 ( k,ks);
|
|
//if (ks == 12 ) status = dt12.addKey ( k, &byte);
|
|
//else if (ks == 16 ) status = dt16.addKey ( k, &byte);
|
|
//else { char *xx=NULL; *xx=0; }
|
|
if ( ! dt8.addKey(&hk,&rec) ) return NULL;
|
|
// return NULL with g_errno set on error
|
|
//if ( ! status ) return NULL;
|
|
}
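// at this point dt8 holds one entry per record from the previous
// version of this doc, and needx is an upper bound on the size of the
// manicured meta list (every new rec plus a delete key for every old
// rec, with an extra slot per linkdb key for the "lost" copy)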
|
|
// also need all the new keys just to be sure, in case none
|
|
// are already in the rdbs
|
|
needx += (m_p - m_metaList);
|
|
// now alloc for our new manicured metalist
|
|
char *nm = (char *)mmalloc( needx, "newmeta" );
|
|
if ( ! nm ) return NULL;
|
|
char *nptr = nm;
|
|
char *nmax = nm + needx;
|
|
// scan each rec in the current meta list, see if it's in
|
|
// the dt8 hash table; if it already is, then
|
|
// do NOT add it to the new metalist, nm, because there is
|
|
// no need to.
|
|
char *p = m_metaList;
|
|
char *pend = p + (m_p - m_metaList);
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save it with the flag
|
|
char byte = *p;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// get key
|
|
char *key = p;
|
|
// skip that
|
|
p += ks;
|
|
// get data size
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
// assume we do not store the datasize
|
|
bool neg = false;
|
|
// . if key is negative, no data is present
|
|
// . the doledb key is negative for us here
|
|
if ( (key[0] & 0x01) == 0x00 ) { neg = true; ds = 0; }
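// (the low bit of the first key byte is the add/delete flag:
//  1 = positive record, 0 = negative/delete record, and negative
//  records never carry a data payload)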
|
|
// if datasize variable, read it in
|
|
if ( ds == -1 ) {
|
|
// get data size
|
|
ds = *(int32_t *)p;
|
|
// skip data size int32_t
|
|
p += 4;
|
|
}
|
|
// point to data
|
|
char *data = p;
|
|
// skip data if not zero
|
|
p += ds;
|
|
|
|
// mix it up for hashtable speed
|
|
uint64_t hk ;//= hash64 ( key,ks);
|
|
|
|
// skip if for linkdb, we do that below
|
|
if ( rdbId == RDB_LINKDB )
|
|
hk = hash64(key+12,ks-12);
|
|
else
|
|
hk = hash64(key,ks);
|
|
|
|
// was this key already in the "old" list?
|
|
int32_t slot = dt8.getSlot(&hk);
|
|
|
|
// do we have a linkdb key that existed last time
|
|
// we indexed this doc? if so, inherit its discovery
|
|
// date.
|
|
if ( slot >= 0 && rdbId == RDB_LINKDB ) {
|
|
/*
|
|
// get old key from last time
|
|
char *oldk=*(char**)dt8.getValueFromSlot(slot);
|
|
// skip rdbid
|
|
oldk++;
|
|
// sanity
|
|
if(g_linkdb.getLinkerDocId_uk((key224_t *)oldk)
|
|
!=m_docId){
|
|
char *xx=NULL;*xx=0; }
|
|
// copy rdbid into new meta list
|
|
*nptr++ = byte;
|
|
// point to where key will be stored in new lst
|
|
char *nk = nptr;
|
|
// store the new key in the new meta list
|
|
gbmemcpy ( nptr , key , ks );
|
|
// advance ptr
|
|
nptr += ks;
|
|
// get discovery time of old key from last time
|
|
int32_t dd = g_linkdb.getDiscoveryDate_uk(oldk);
|
|
// sanity
|
|
if ( dd < 0 ) { char *xx=NULL;*xx=0; }
|
|
// but mod the new key's discovery time
|
|
g_linkdb.setDiscoveryDate_uk ( nk, dd );
|
|
*/
|
|
// . no need to deal with this any further
|
|
// . yeah, because there could be dups!
|
|
// so don't delete it just yet
|
|
// . but make the data ptr NULL so we
|
|
// know to disregard it below...???
|
|
dt8.removeSlot(slot);
|
|
// all done for this key
|
|
continue;
|
|
}
|
|
|
|
// see if already in an rdb, IFF dataless, otherwise
|
|
// the keys might be the same but with different data!
|
|
if ( slot >= 0 ) { // dt8.isInTable(&hk) ) {
|
|
// remove from hashtable so we do not add it
|
|
// as a delete key below
|
|
// dt8.removeKey(&hk);
|
|
dt8.removeSlot(slot);
|
|
// but do still add a rec, like a titledb rec that has the
|
|
// same key, because its data is probably
|
|
// different...
|
|
// HACK: enable for now since we lost
|
|
// the url:www.geico.com term somehow!!!
|
|
// geico got deleted but not the title rec!!
|
|
// MAKE SURE TITLEREC gets deleted then!!!
|
|
if ( ds==0 && g_conf.m_doIncrementalUpdating )
|
|
continue;
|
|
}
|
|
// ok, it is not already in an rdb, so add it
|
|
*nptr++ = byte;
|
|
// store key
|
|
gbmemcpy ( nptr, key , ks );
|
|
// skip over it
|
|
nptr += ks;
|
|
// store data size. BUT not if negative key!
|
|
if ( getDataSizeFromRdbId(rdbId) == -1 && ! neg ) {
|
|
*(int32_t *)nptr = ds;
|
|
nptr += 4;
|
|
}
|
|
// store data
|
|
if ( ds ) {
|
|
gbmemcpy ( nptr , data , ds );
|
|
nptr += ds;
|
|
}
|
|
}
|
|
// now scan dt8 and add their keys as del keys
|
|
for ( int32_t i = 0 ; i < dt8.m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! dt8.m_flags[i] ) continue;
|
|
// store rdbid first
|
|
char *rec = *(char **)dt8.getValueFromSlot(i);
|
|
// get rdbId with hi bit possibly set
|
|
char rdbId = rec[0] & 0x7f;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// sanity test - no negative keys
|
|
if ( (rec[1] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0;}
|
|
// copy the rdbId byte and key
|
|
gbmemcpy ( nptr , rec , 1 + ks );
|
|
// skip over rdbid
|
|
nptr++;
|
|
// make it a negative key by clearing lsb
|
|
*nptr = *nptr & 0xfe;
|
|
// skip it
|
|
nptr += ks;
|
|
// if it is from linkdb, and was not matched above, then it is a
|
|
// lost link, so set the lost date of it. we keep
|
|
// these so we can graph lost links
|
|
if ( rdbId == RDB_LINKDB ) {
|
|
// the real linkdb rec is at rec+1
|
|
int32_t lost = g_linkdb.getLostDate_uk( rec+1 );
|
|
// how can it be non-zero? it should have
|
|
// been freshly made from the old titlerec...
|
|
if ( lost ) { char *xx=NULL;*xx=0; }
|
|
// if zero, set it to now!
|
|
//g_linkdb.setLostDate_uk(realRec,now);
|
|
// copy the rdbId byte and key
|
|
gbmemcpy ( nptr , rec , 1 + ks );
|
|
// set it in there now
|
|
g_linkdb.setLostDate_uk(nptr+1,now);
|
|
// carry it through on revdb, do not delete
|
|
// it! we want a linkdb history for seomasters
|
|
nptr += 1 + ks;
|
|
// and go on to delete the old linkdb key that
|
|
// did not have a lost date
|
|
//continue;
|
|
}
|
|
|
|
}
|
|
// sanity. check for metalist breach
|
|
if ( nptr > nmax ) { char *xx=NULL;*xx=0; }
|
|
// free the old meta list
|
|
mfree ( m_metaList , m_metaListAllocSize , "fm" );
|
|
// now switch over to the new one
|
|
m_metaList = nm;
|
|
m_metaListAllocSize = needx;
|
|
m_p = nptr;
|
|
}
|
|
|
|
|
|
// if we only removed it from index, set this flag
|
|
if ( oldList && ! nd ) m_didDelete = true;
|
|
|
|
//
|
|
// repeat this logic special for linkdb since we keep lost links
|
|
// and may update the discovery date or lost date in the keys
|
|
//
|
|
// 1. hash keys of old linkdb keys into dt9 here
|
|
// 2. do not hash the discovery/lost dates when making key hash for dt9
|
|
// 3. scan keys in meta list and add directly into new meta list
|
|
// if not in dt9
|
|
// 4. if in dt9 then add dt9 key instead
|
|
// 5. remove dt9 keys as we add them
|
|
// 6. then add remaining dt9 keys into meta list but with lost date
|
|
// set to now UNLESS it's already set
|
|
//
|
|
|
|
|
|
|
|
|
|
//
|
|
// validate us!
|
|
//
|
|
m_metaListValid = true;
|
|
|
|
// set the list size, different from the alloc size
|
|
m_metaListSize = m_p - m_metaList;//end - m_p;
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_metaList + m_metaListSize , forDelete );
|
|
|
|
// all done
|
|
return m_metaList;
|
|
}
|
|
|
|
// . copy from old title rec to us to speed things up!
|
|
// . returns NULL and set g_errno on error
|
|
// . returns -1 if blocked
|
|
// . returns 1 otherwise
|
|
// . when the doc content is unchanged, just inherit crap from the old title
|
|
// rec so we can make the spider reply in getNewSpiderReply()
|
|
void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
|
|
// skip if none
|
|
if ( ! od ) return;
|
|
// skip if already did it
|
|
if ( m_copied1 ) return;
|
|
// do not repeat
|
|
m_copied1 = true;
|
|
// set these
|
|
m_percentChanged = 0;
|
|
m_percentChangedValid = true;
|
|
|
|
// copy over bit members
|
|
m_contentHash32 = od->m_contentHash32;
|
|
//m_tagHash32 = od->m_tagHash32;
|
|
m_tagPairHash32 = od->m_tagPairHash32;
|
|
//m_sitePop = od->m_sitePop;
|
|
m_httpStatus = od->m_httpStatus;
|
|
m_hasAddress = od->m_hasAddress;
|
|
m_hasTOD = od->m_hasTOD;
|
|
//m_hasSiteVenue = od->m_hasSiteVenue;
|
|
m_isRSS = od->m_isRSS;
|
|
m_isPermalink = od->m_isPermalink;
|
|
m_hasContactInfo= od->m_hasContactInfo;
|
|
m_hopCount = od->m_hopCount;
|
|
m_crawlDelay = od->m_crawlDelay;
|
|
|
|
// do not forget the shadow members of the bit members
|
|
m_hasAddress2 = m_hasAddress;
|
|
m_hasTOD2 = m_hasTOD;
|
|
//m_hasSiteVenue2 = m_hasSiteVenue;
|
|
m_isRSS2 = m_isRSS;
|
|
m_isPermalink2 = m_isPermalink;
|
|
|
|
// validate them
|
|
m_contentHash32Valid = true;
|
|
//m_tagHash32Valid = true;
|
|
m_tagPairHash32Valid = true;
|
|
//m_sitePopValid = true;
|
|
m_httpStatusValid = true;
|
|
m_hasAddressValid = true;
|
|
m_hasTODValid = true;
|
|
//m_hasSiteVenueValid = true;
|
|
m_isRSSValid = true;
|
|
m_isPermalinkValid = true;
|
|
m_hasContactInfoValid= true;
|
|
m_hopCountValid = true;
|
|
m_crawlDelayValid = true;
|
|
|
|
m_pubDate = od->m_pubDate;
|
|
m_langId = od->m_langId;
|
|
|
|
m_pubDateValid = true;
|
|
m_langIdValid = true;
|
|
|
|
// so getSiteNumInlinks() doesn't crash when called by getNewSpiderReply()
|
|
// because dns timed out. it timed out with EDNSTIMEDOUT before.
|
|
// so overwrite it here...
|
|
if ( m_ip == -1 || m_ip == 0 || ! m_ipValid ) {
|
|
m_ip = od->m_ip;
|
|
m_ipValid = true;
|
|
m_siteNumInlinks = od->m_siteNumInlinks;
|
|
// m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
|
|
// m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlo
|
|
// m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
|
|
|
|
m_siteNumInlinksValid =
|
|
od->m_siteNumInlinksValid;
|
|
// m_siteNumInlinksUniqueIpValid =
|
|
// od->m_siteNumInlinksUniqueIpValid;
|
|
// m_siteNumInlinksUniqueCBlockValid =
|
|
// od->m_siteNumInlinksUniqueCBlockValid;
|
|
// m_siteNumInlinksTotal =
|
|
// od->m_siteNumInlinksTotalValid;
|
|
}
|
|
|
|
m_indexCode = 0;//od->m_indexCode;
|
|
m_indexCodeValid = true;
|
|
|
|
// we need the link info too!
|
|
ptr_linkInfo1 = od->ptr_linkInfo1;
|
|
size_linkInfo1 = od->size_linkInfo1;
|
|
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
|
|
else m_linkInfo1Valid = false;
|
|
|
|
// turn off for debug
|
|
ptr_sectiondbData = NULL;
|
|
size_sectiondbData = 0;
|
|
}
|
|
|
|
// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
|
|
SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
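// a "fake" reply only needs enough valid fields for getNewSpiderReply()
// below to not hit one of its validity asserts; the tag rec, site hash,
// download end time, IP (1.2.3.4), spidered time and first IP filled in
// here are just placeholders so the reply can be generated and the url
// spider lock released.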
|
|
|
|
if ( ! m_tagRecValid ) {
|
|
m_tagRec.reset();
|
|
m_tagRecValid = true;
|
|
}
|
|
|
|
if ( ! m_siteHash32Valid ) {
|
|
m_siteHash32 = 1;
|
|
m_siteHash32Valid = true;
|
|
}
|
|
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
|
|
if ( ! m_ipValid ) {
|
|
m_ipValid = true;
|
|
m_ip = atoip("1.2.3.4");
|
|
}
|
|
|
|
if ( ! m_spideredTimeValid ) {
|
|
m_spideredTimeValid = true;
|
|
m_spideredTime = getTimeGlobal();//0; use now!
|
|
}
|
|
|
|
// don't let it get the diffbot reply either! it should be empty.
|
|
if ( ! m_diffbotReplyValid ) {
|
|
m_diffbotReplyValid = true;
|
|
}
|
|
|
|
// if doing diffbot query reindex
|
|
// TODO: does this shard the request somewhere else???
|
|
if ( ! m_firstIpValid ) {
|
|
m_firstIp = m_ip;//atoip("1.2.3.4");
|
|
m_firstIpValid = true;
|
|
}
|
|
|
|
// this was causing nsr to block and core below on a bad engineer
|
|
// error loading the old title rec
|
|
if ( ! m_isPermalinkValid ) {
|
|
m_isPermalink = false;
|
|
m_isPermalinkValid = true;
|
|
}
|
|
|
|
//if ( ! m_sreqValid ) {
|
|
// m_sreqValid = true;
|
|
// m_sreq.m_parentDocId = 0LL;
|
|
// }
|
|
|
|
|
|
// if error is EFAKEFIRSTIP, do not core
|
|
//if ( ! m_isIndexedValid ) {
|
|
// m_isIndexed = false;
|
|
// m_isIndexedValid = true;
|
|
//}
|
|
|
|
// if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT
|
|
// or ECORRUPTDATA (corrupt gzip reply)
|
|
// then this should not block. we need a spiderReply to release the
|
|
// url spider lock in SpiderLoop::m_lockTable.
|
|
// if m_isChildDoc is true, like for diffbot url, this should be
|
|
// a bogus one.
|
|
SpiderReply *nsr = getNewSpiderReply ();
|
|
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
|
|
if ( ! nsr ) {
|
|
log("doc: crap, could not even add spider reply "
|
|
"to indicate internal error: %s",mstrerror(g_errno));
|
|
if ( ! g_errno ) g_errno = EBADENGINEER;
|
|
//return true;
|
|
return NULL;
|
|
}
|
|
|
|
return nsr;
|
|
|
|
//if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; }
|
|
|
|
//CollectionRec *cr = getCollRec();
|
|
//if ( ! cr ) return true;
|
|
}
|
|
|
|
// getSpiderReply()
|
|
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
|
|
|
if ( m_srepValid ) return &m_srep;
|
|
|
|
setStatus ( "getting spider reply" );
|
|
|
|
// diffbot guys, robots.txt, frames, should not be here
|
|
if ( m_isChildDoc ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . get the mime first
|
|
// . if we are setting XmlDoc from a titleRec, this causes
|
|
// doConsistencyCheck() to block and core
|
|
//HttpMime *mime = getMime();
|
|
//if ( ! mime || mime == (HttpMime *)-1 ) return (SpiderReply *)mime;
|
|
|
|
// if we had a critical error, do not do this
|
|
int32_t *indexCode = getIndexCode();
|
|
if (! indexCode || indexCode == (void *)-1)
|
|
return (SpiderReply *)indexCode;
|
|
|
|
|
|
// if it has been abandoned early, i.e. cut-off, then we should
|
|
// add a "fake" spider reply to release the lock in
|
|
// SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply()
|
|
// to see what parts of this are relevant.
|
|
/*
|
|
if ( *indexCode == EABANDONED ||
|
|
// . any internal "error" needs to be here really
|
|
// . was there an error unzipping the title rec?
|
|
*indexCode == ECORRUPTDATA ||
|
|
*indexCode == EHITCRAWLLIMIT ||
|
|
*indexCode == EHITPROCESSLIMIT ) {
|
|
// clear everything
|
|
m_srep.reset();
|
|
// get from spider request, if there
|
|
int32_t firstIp = 0;
|
|
if ( m_sreqValid ) firstIp = m_sreq.m_firstIp;
|
|
// otherwise, wtf?
|
|
if ( ! firstIp )
|
|
log("build: no first ip to make fake spiderReply. "
|
|
"injected?");
|
|
// we at least need this
|
|
m_srep.m_firstIp = firstIp;
|
|
Url *fu = getFirstUrl();
|
|
// this is the lock key
|
|
int64_t uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
|
|
m_srep.setKey ( firstIp, 0 , uh48 , false );
|
|
// tell it we are fake and not to really add us to
|
|
// spiderdb, but just to release the lock
|
|
m_srep.m_errCode = *indexCode;
|
|
m_srepValid = true;
|
|
return &m_srep;
|
|
}
|
|
*/
|
|
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;
|
|
|
|
// can't call getIsPermalink() here without entering a dependency loop
|
|
//char *pp = getIsUrlPermalinkFormat();
|
|
//if ( !pp || pp == (char *)-1 ) return (SpiderReply *)pp;
|
|
|
|
// the site hash
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SpiderReply *)sh32;
|
|
|
|
int64_t *de = getDownloadEndTime();
|
|
if ( ! de || de == (void *)-1 ) return (SpiderReply *)de;
|
|
|
|
// need to set m_sentToDiffbot!!
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( ! dbr || dbr == (void *)-1 ) return (SpiderReply *)dbr;
|
|
|
|
// was the doc indexed when we started trying to spider this url?
|
|
//char *wasIndexed = getIsIndexed();
|
|
//if ( ! wasIndexed || wasIndexed == (void *)-1 )
|
|
// return (SpiderReply *)wasIndexed;
|
|
|
|
//Tag *vt = m_oldTagRec.getTag("venueaddress");
|
|
//bool siteHasVenue = (bool)vt;
|
|
|
|
|
|
// shortcut
|
|
Url *fu = NULL;
|
|
// watch out for titlerec lookup errors for docid based spider reqs
|
|
if ( m_firstUrlValid ) fu = getFirstUrl();
|
|
|
|
// reset
|
|
m_srep.reset();
|
|
|
|
int32_t firstIp = -1;
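// firstIp is resolved in increasing order of authority: the tagdb
// "firstip" tag, then m_firstIp if valid, then the original
// SpiderRequest's firstIp (unless that request was injected with a
// fake first IP and is not a page-reindex request)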
|
|
// inherit firstIp
|
|
Tag *tag = m_tagRec.getTag("firstip");
|
|
// tag must be there?
|
|
if ( tag ) firstIp = atoip(tag->getTagData());
|
|
|
|
// this is usually the authority
|
|
if ( m_firstIpValid )
|
|
firstIp = m_firstIp;
|
|
|
|
// otherwise, inherit from oldsr to be safe
|
|
// BUT NOT if it was a fakeip and we were injecting because
|
|
// the SpiderRequest was manufactured and not actually taken
|
|
// from spiderdb! see XmlDoc::injectDoc() because that is where
|
|
// it came from!! if it has m_sreq.m_isAddUrl and
|
|
// m_sreq.m_fakeFirstIp then we actually do add the reply with that
|
|
// fake ip so that they will exist in the same shard.
|
|
// BUT if it is docid passed from PageReindex.cpp (a query reindex)
|
|
// we set the injection bit and the pagereindex bit, we should let
|
|
// these guys keep the firstip because the docid-based spider request
|
|
// is in spiderdb. it needs to match up.
|
|
if ( m_sreqValid && (!m_sreq.m_isInjecting||m_sreq.m_isPageReindex) )
|
|
firstIp = m_sreq.m_firstIp;
|
|
|
|
// sanity
|
|
if ( firstIp == 0 || firstIp == -1 ) {
|
|
if ( m_firstUrlValid )
|
|
log("xmldoc: BAD FIRST IP for %s",m_firstUrl.getUrl());
|
|
else
|
|
log("xmldoc: BAD FIRST IP for %" INT64 "",m_docId);
|
|
firstIp = 12345;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
// store it
|
|
m_srep.m_firstIp = firstIp;
|
|
// assume no error
|
|
// MDW: not right...
|
|
m_srep.m_errCount = 0;
|
|
// otherwise, inherit from oldsr to be safe
|
|
//if ( m_sreqValid )
|
|
// m_srep.m_firstIp = m_sreq.m_firstIp;
|
|
|
|
// do not inherit this one, it MIGHT HAVE CHANGED!
|
|
m_srep.m_siteHash32 = m_siteHash32;
|
|
|
|
// need this for updating crawl delay table, m_cdTable in Spider.cpp
|
|
if ( fu ) m_srep.m_domHash32 = getDomHash32();
|
|
else m_srep.m_domHash32 = 0;
|
|
|
|
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . set other fields besides key
|
|
// . crap! if we are the "qatest123" collection then m_spideredTime
|
|
// was read from disk usually and is way in the past! watch out!!
|
|
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// crap, for the test coll this is often a very old time and it
|
|
// causes the spider request to be repeatedly executed, so let's
|
|
// fix that
|
|
if ( ! strcmp(cr->m_coll,"qatest123") )
|
|
m_srep.m_spideredTime = getTimeGlobal();
|
|
|
|
|
|
// TODO: expire these when "ownershipchanged" tag is newer!!
|
|
if ( gr->getTag ( "ingoogle" ) ) {
|
|
m_srep.m_inGoogle = 1;
|
|
m_srep.m_inGoogleValid = 1;
|
|
}
|
|
if ( gr->getTag ( "authorityinlink" ) )
|
|
m_srep.m_hasAuthorityInlink = 1;
|
|
// automatically valid either way
|
|
m_srep.m_hasAuthorityInlinkValid = 1;
|
|
// but for this tag, it must exist even if it has no contact info
|
|
//tag = gr->getTag ( "hascontactinfo" );
|
|
//if ( tag ) {
|
|
|
|
int64_t uh48 = 0LL;
|
|
// we might be a docid based spider request so fu could be invalid
|
|
// if the titlerec lookup failed
|
|
if ( fu ) uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
|
|
int64_t parentDocId = 0LL;
|
|
if ( m_sreqValid )
|
|
parentDocId = m_sreq.getParentDocId();
|
|
//else { char *xx=NULL;*xx=0; }
|
|
|
|
// for docid based urls from PageReindex.cpp we have to make
|
|
// sure to set the urlhash48 correctly from that.
|
|
if ( m_sreqValid ) uh48 = m_sreq.getUrlHash48();
|
|
|
|
// note it
|
|
if ( g_conf.m_logDebugSpider )
|
|
log("xmldoc: uh48=%" UINT64 " parentdocid=%" UINT64 "",uh48,parentDocId);
|
|
|
|
// set the key, m_srep.m_key
|
|
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
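// the reply key must collate with the originating SpiderRequest
// (same firstIp and 48-bit url hash) so Spider.cpp can match the two
// and release the url lock; the lock-key sanity check near the end of
// this function enforces exactly that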
|
|
|
|
// . did we download a page? even if indexcode is set we might have
|
|
// . if this is non-zero that means its valid
|
|
if ( m_contentHash32Valid )
|
|
m_srep.m_contentHash32 = m_contentHash32;
|
|
|
|
// injecting the content (url implied)
|
|
if ( m_contentInjected ) // m_sreqValid && m_sreq.m_isInjecting )
|
|
m_srep.m_fromInjectionRequest = 1;
|
|
|
|
// can be injecting a url too, content not necessarily implied
|
|
if ( m_sreqValid && m_sreq.m_isInjecting )
|
|
m_srep.m_fromInjectionRequest = 1;
|
|
|
|
if ( m_sentToDiffbotThisTime )
|
|
m_srep.m_sentToDiffbotThisTime = true;
|
|
else
|
|
m_srep.m_sentToDiffbotThisTime = false;
|
|
|
|
if ( m_diffbotReplyError )
|
|
m_srep.m_hadDiffbotError = true;
|
|
else
|
|
m_srep.m_hadDiffbotError = false;
|
|
|
|
// if we only had an error code in the diffbot reply, record that
|
|
if ( ! m_indexCode && m_diffbotReplyError )
|
|
m_srep.m_errCode = m_diffbotReplyError;
|
|
|
|
// sanity. if being called directly from indexDoc() because of
|
|
// an error like out of memory, then we do not know if it is
|
|
// indexed or not or was indexed...
|
|
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// were we already in titledb before we started spidering?
|
|
m_srep.m_wasIndexed = m_wasInIndex;
|
|
|
|
// note whether m_wasIndexed is valid because if it isn't then
|
|
// we shouldn't be counting this reply towards the page counts.
|
|
// if we never made it this far i guess we should not forcibly call
|
|
// getIsIndexed() at this point so our performance is fast in case
|
|
// this is an EFAKEFIRSTIP error or something similar where we
|
|
// basically just add this reply and we're done.
|
|
// NOTE: this also pertains to SpiderReply::m_isIndexed.
|
|
m_srep.m_wasIndexedValid = m_wasInIndexValid;
|
|
|
|
// assume no change
|
|
m_srep.m_isIndexed = m_isInIndex;
|
|
|
|
// we need to know if the m_isIndexed bit is valid or not
|
|
// because sometimes like if we are being called directly from
|
|
// indexDoc() because of an error situation, we do not know!
|
|
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
|
|
else m_srep.m_isIndexedINValid = true;
|
|
|
|
// likewise, we need to know if we deleted it so we can decrement the
|
|
// quota count for this subdomain/host in SpiderColl::m_quotaTable
|
|
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
|
|
|
|
// treat error replies special i guess, since langId, etc. will be
|
|
// invalid
|
|
if ( m_indexCode ) {
|
|
// validate
|
|
m_srepValid = true;
|
|
// set these items if valid already, but don't bother
|
|
// trying to compute them, since we are not indexing.
|
|
if ( m_siteNumInlinksValid ) {
|
|
m_srep.m_siteNumInlinks = m_siteNumInlinks;
|
|
m_srep.m_siteNumInlinksValid = true;
|
|
}
|
|
//if ( m_percentChangedValid )
|
|
// m_srep.m_percentChangedPerDay = m_percentChanged;
|
|
if ( m_crawlDelayValid && m_crawlDelay >= 0 )
|
|
// we already multiply x1000 in isAllowed2()
|
|
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
|
|
else
|
|
m_srep.m_crawlDelayMS = -1;
|
|
if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
|
|
if ( m_langIdValid ) m_srep.m_langId = m_langId;
|
|
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
|
|
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
|
|
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
|
|
// stuff that is automatically valid
|
|
m_srep.m_isPingServer = 0;
|
|
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
|
|
// this was replaced by m_contentHash32
|
|
//m_srep.m_newRequests = 0;
|
|
m_srep.m_errCode = m_indexCode;
|
|
if ( m_downloadEndTimeValid )
|
|
m_srep.m_downloadEndTime = m_downloadEndTime;
|
|
else
|
|
m_srep.m_downloadEndTime = 0;
|
|
// is the original spider request valid?
|
|
if ( m_sreqValid ) {
|
|
// preserve the content hash in case m_indexCode is
|
|
// EDOCUNCHANGED. so we can continue to get that
|
|
// in the future. also, if we had the doc indexed,
|
|
// just carry the contentHash32 forward for the other
|
|
// errors like EDNSTIMEDOUT or whatever.
|
|
m_srep.m_contentHash32 = m_sreq.m_contentHash32;
|
|
// shortcuts
|
|
SpiderReply *n = &m_srep;
|
|
SpiderRequest *o = &m_sreq;
|
|
// more stuff
|
|
n->m_inGoogle = o->m_inGoogle;
|
|
n->m_hasContactInfo = o->m_hasContactInfo;
|
|
n->m_isContacty = o->m_isContacty;
|
|
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
|
|
n->m_isPingServer = o->m_isPingServer;
|
|
// the validator flags
|
|
n->m_inGoogleValid = o->m_inGoogleValid;
|
|
n->m_hasContactInfoValid = o->m_hasContactInfoValid;
|
|
n->m_isContactyValid = o->m_isContactyValid;
|
|
n->m_hasAuthorityInlinkValid =
|
|
o->m_hasAuthorityInlinkValid;
|
|
// get error count from original spider request
|
|
int32_t newc = m_sreq.m_errCount;
|
|
// inc for us, since we had an error
|
|
newc++;
|
|
// contain to one byte
|
|
if ( newc > 255 ) newc = 255;
|
|
// store in our spiderreply
|
|
m_srep.m_errCount = newc;
|
|
}
|
|
// . and do not really consider this an error
|
|
// . i don't want the url filters treating it as an error reply
|
|
// . m_contentHash32 should have been carried forward from
|
|
// the block of code right above
|
|
if ( m_indexCode == EDOCUNCHANGED ) {
|
|
// we should have had a spider request, because that's
|
|
// where we got the m_contentHash32 we passed to
|
|
// Msg13Request.
|
|
if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
// make it a success
|
|
m_srep.m_errCode = 0;
|
|
// and no error count, it wasn't an error per se
|
|
m_srep.m_errCount = 0;
|
|
// call it 200
|
|
m_srep.m_httpStatus = 200;
|
|
}
|
|
// copy flags and data from old doc...
|
|
if ( m_indexCode == EDOCUNCHANGED &&
|
|
m_oldDocValid &&
|
|
m_oldDoc ) {
|
|
m_srep.m_pubDate = m_oldDoc->m_pubDate;
|
|
m_srep.m_langId = m_oldDoc->m_langId;
|
|
m_srep.m_isRSS = m_oldDoc->m_isRSS;
|
|
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
|
|
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
|
|
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
|
|
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
|
|
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
|
|
// they're all valid
|
|
m_srep.m_hasAddressValid = true;
|
|
m_srep.m_hasTODValid = true;
|
|
//m_srep.m_hasSiteVenueValid = true;
|
|
m_srep.m_siteNumInlinksValid = true;
|
|
}
|
|
// do special things if
|
|
return &m_srep;
|
|
}
|
|
|
|
// this will help us avoid hammering ips & respect same ip wait
|
|
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0; }
|
|
m_srep.m_downloadEndTime = m_downloadEndTime;
|
|
|
|
// . if m_indexCode was 0, we are indexed then...
|
|
// . this logic is now above
|
|
//m_srep.m_isIndexed = 1;
|
|
|
|
// get ptr to old doc/titlerec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (SpiderReply *)pod;
|
|
// this is non-NULL if it existed
|
|
XmlDoc *od = *pod;
|
|
|
|
// status is -1 if not found
|
|
int16_t *hs = getHttpStatus ();
|
|
if ( ! hs || hs == (void *)-1 ) return (SpiderReply *)hs;
|
|
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (SpiderReply *)sni;
|
|
|
|
float *pc = getPercentChanged();
|
|
if ( ! pc || pc == (void *)-1 ) return (SpiderReply *)pc;
|
|
|
|
// these are "non-dup" addresses (nondup)
|
|
bool *hasAddress = getHasAddress();
|
|
if ( ! hasAddress || hasAddress == (void *)-1 )
|
|
return (SpiderReply *)hasAddress;
|
|
// does it have a tod (i.e. 6pm) in there somewhere?
|
|
bool *hasTOD = getHasTOD();
|
|
if ( ! hasTOD || hasTOD == (void *)-1 )
|
|
return (SpiderReply *)hasTOD;
|
|
// does it have a venue address?
|
|
//bool *hasSiteVenue = getHasSiteVenue();
|
|
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
|
|
// return (SpiderReply *)hasSiteVenue;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (SpiderReply *)isRoot;
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (SpiderReply *)hci;
|
|
|
|
|
|
|
|
int32_t *pubDate = getPubDate();
|
|
if ( ! pubDate || pubDate == (int32_t *)-1 )
|
|
return (SpiderReply *)pubDate;
|
|
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 )
|
|
return (SpiderReply *)langId;
|
|
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS || isRSS == (char *)-1 )
|
|
return (SpiderReply *)isRSS;
|
|
|
|
char *pl = getIsPermalink();
|
|
if ( ! pl || pl == (char *)-1 )
|
|
return (SpiderReply *)pl;
|
|
|
|
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_hasContactInfo ) {
|
|
m_srep.m_hasContactInfo = 1;
|
|
m_srep.m_hasContactInfoValid = 1;
|
|
}
|
|
|
|
// this is only known if we download the robots.txt...
|
|
if ( od && m_recycleContent ) {
|
|
m_crawlDelay = od->m_crawlDelay;
|
|
m_crawlDelayValid = true;
|
|
}
|
|
|
|
// sanity checks
|
|
//if(! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_isSpamValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// httpStatus is -1 if not found (like for empty http replies)
|
|
m_srep.m_httpStatus = *hs;
|
|
|
|
// zero if none
|
|
//m_srep.m_percentChangedPerDay = 0;
|
|
// . only if had old one
|
|
// . we use this in url filters to set the respider wait time usually
|
|
if ( od ) {
|
|
int32_t spideredTime = getSpideredTime();
|
|
int32_t oldSpideredTime = od->getSpideredTime();
|
|
float numDays = spideredTime - oldSpideredTime;
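// note: both spidered times are unix timestamps in seconds, so this
// delta is measured in seconds despite the variable name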
|
|
m_srep.m_percentChangedPerDay = (m_percentChanged+.5)/numDays;
|
|
}
|
|
|
|
// . update crawl delay, but we must store now as milliseconds
|
|
// because Spider.cpp likes it better that way
|
|
// . -1 implies crawl delay unknown or not found
|
|
if ( m_crawlDelay >= 0 && m_crawlDelayValid )
|
|
// we already multiply x1000 in isAllowed2()
|
|
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
|
|
else
|
|
// -1 means invalid/unknown
|
|
m_srep.m_crawlDelayMS = -1;
|
|
|
|
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
|
|
|
|
// . we use this to store "bad" spider recs to keep from respidering
|
|
// a "bad" url over and over again
|
|
// . it is up to the url filters whether they want to retry this
|
|
// again or not!
|
|
// . TODO: how to represent "ETCPTIMEDOUT"????
|
|
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
|
|
// ENETUNREACH, EBADMIME, ECONNREFUSED, EHOSTUNREACH
|
|
m_srep.m_siteNumInlinks = m_siteNumInlinks;
|
|
m_srep.m_pubDate = *pubDate;
|
|
// this was replaced by m_contentHash32
|
|
//m_srep.m_newRequests = 0;
|
|
m_srep.m_langId = *langId;
|
|
m_srep.m_isRSS = (bool)*isRSS;
|
|
m_srep.m_isPermalink = (bool)*pl;
|
|
m_srep.m_isPingServer = (bool)fu->isPingServer();
|
|
//m_srep.m_isSpam = m_isSpam;
|
|
|
|
m_srep.m_siteNumInlinksValid = true;
|
|
|
|
// . ignore address in dup sections (nondup/non-dup addresses only)
|
|
// . this way if the place always has their address in the header or
|
|
// footer of every web page we will ignore it
|
|
m_srep.m_hasAddress = *hasAddress;
|
|
m_srep.m_isContacty = *hci;//getIsContacty(fu,
|
|
// info1,
|
|
// m_hopCount ,
|
|
// *ct , // contentType
|
|
// *isRoot ,
|
|
// m_niceness );
|
|
m_srep.m_hasTOD = *hasTOD;
|
|
//m_srep.m_hasSiteVenue = *hasSiteVenue;
|
|
|
|
// validate all
|
|
m_srep.m_inGoogleValid = 1;
|
|
m_srep.m_hasContactInfoValid = 1;
|
|
m_srep.m_hasAuthorityInlinkValid = 1;
|
|
m_srep.m_isContactyValid = 1;
|
|
m_srep.m_hasAddressValid = 1;
|
|
m_srep.m_hasTODValid = 1;
|
|
//m_srep.m_hasSiteVenueValid = 1;
|
|
|
|
// a quick validation. reply must unlock the url from the lock table.
|
|
// so the locks must be equal.
|
|
if ( m_sreqValid &&
|
|
// we create a new spiderrequest if injecting with a fake firstip
|
|
// so it will fail this test...
|
|
! m_sreq.m_isInjecting ) {
|
|
int64_t lock1 = makeLockTableKey(&m_sreq);
|
|
int64_t lock2 = makeLockTableKey(&m_srep);
|
|
if ( lock1 != lock2 ) {
|
|
log("build: lock1 != lock2 lock mismatch for %s",
|
|
m_firstUrl.m_url);
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
}
|
|
|
|
// validate
|
|
m_srepValid = true;
|
|
|
|
return &m_srep;
|
|
}
|
|
|
|
// . so Msg20 can see if we are banned now or not...
|
|
// . we must skip certain rules in getUrlFilterNum() when doing this for Msg20
|
|
// because things like "parentIsRSS" can be either true or false since a url
|
|
// can have multiple spider recs associated with it!
|
|
void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq ,
|
|
SpiderReply *srep ) {
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_isUrlPermalinkFormatValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
Url *fu = getFirstUrl();
|
|
|
|
// get this
|
|
//TagRec *gr = (TagRec *)ptr_tagRecData;
|
|
//Tag *tag = NULL;
|
|
//if ( gr ) tag = gr->getTag("sitenuminlinks");
|
|
// reset
|
|
sreq->reset();
|
|
// assume not valid
|
|
sreq->m_siteNumInlinks = -1;
|
|
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// how many site inlinks?
|
|
sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
sreq->m_siteNumInlinksValid = true;
|
|
|
|
// set other fields besides key
|
|
sreq->m_firstIp = m_ip;
|
|
sreq->m_hostHash32 = m_hostHash32a;
|
|
//sreq->m_domHash32 = m_domHash32;
|
|
//sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
//sreq->m_pageNumInlinks = m_pageNumInlinks;
|
|
sreq->m_hopCount = m_hopCount;
|
|
|
|
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
|
|
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
|
|
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
|
|
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
|
|
|
|
sreq->m_isNewOutlink = 0;
|
|
sreq->m_isAddUrl = 0;//m_isAddUrl;
|
|
sreq->m_isPingServer = fu->isPingServer();
|
|
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
|
|
|
|
// transcribe from old spider rec, stuff should be the same
|
|
sreq->m_addedTime = m_firstIndexedDate;
|
|
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
|
|
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
|
|
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
|
|
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
|
|
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
|
|
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
|
|
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
|
|
|
|
// validate the stuff so getUrlFilterNum() acks it
|
|
sreq->m_hopCountValid = 1;
|
|
|
|
srep->reset();
|
|
|
|
srep->m_spideredTime = getSpideredTime();//m_spideredTime;
|
|
//srep->m_isSpam = isSpam; // real-time update this!!!
|
|
srep->m_isRSS = m_isRSS;
|
|
srep->m_isPermalink = m_isPermalink;
|
|
srep->m_httpStatus = 200;
|
|
//srep->m_retryNum = 0;
|
|
srep->m_langId = m_langId;
|
|
srep->m_percentChangedPerDay = 0;//m_percentChanged;
|
|
|
|
// we need this now for ucp ucr upp upr new url filters that do
|
|
// substring matching on the url
|
|
if ( m_firstUrlValid )
|
|
strcpy(sreq->m_url,m_firstUrl.m_url);
|
|
}
|
|
|
|
// defined in PageCrawlBot.cpp
|
|
int32_t isInSeedBuf ( CollectionRec *cr , char *url, int len ) ;
|
|
|
|
// . add the spiderdb recs to the meta list
|
|
// . used by XmlDoc::setMetaList()
|
|
// . returns NULL and sets g_errno on error
|
|
// . otherwise returns the "new p"
|
|
// . Scraper.cpp, PageAddUrl.cpp and Msg7.cpp should all use the XmlDoc
|
|
// class even if just adding links. they should make a fake html page and
|
|
// "inject" it, with only m_useSpiderdb set to true...
|
|
char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
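// outline: gather the per-outlink vectors (tag recs, first IPs, link
// site hashes), dedup the outlinks by a 32-bit url hash, then build
// one SpiderRequest per surviving link, inheriting hop count, site
// inlink counts and parent flags, and append each one to m_p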
|
|
|
|
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do not do this if recycling content
|
|
// UNLESS REBUILDING...
|
|
if ( m_recycleContent && ! m_useSecondaryRdbs ) return (char *)0x01;
|
|
|
|
|
|
// for now skip in repair tool
|
|
if ( m_useSecondaryRdbs && ! g_conf.m_rebuildAddOutlinks )
|
|
return (char *)0x01;
|
|
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (char *)links;
|
|
|
|
char *spiderLinks = getSpiderLinks();
|
|
if ( ! spiderLinks || spiderLinks == (char *)-1 )
|
|
return (char *)spiderLinks;
|
|
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
|
|
//char **iiv = getOutlinkIsIndexedVector();
|
|
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
|
|
int32_t **ipv = getOutlinkFirstIpVector();
|
|
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
|
|
//int8_t *hcv = getOutlinkHopCountVector();
|
|
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
|
|
char *ipi = getIsIndexed(); // is the parent indexed?
|
|
if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (Addresses *)-1 ) return (char *)aa;
|
|
// sanity check
|
|
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . ignore address in dup sections
|
|
// . this way if the place always has their address in the header or
|
|
// footer of every web page we will ignore it (SEC_DUP section flag)
|
|
bool parentHasAddress = (bool)(aa->getNumNonDupAddresses()>0);
|
|
|
|
// need this
|
|
int32_t parentDomHash32 = getDomHash32();
|
|
if ( parentDomHash32 != m_domHash32 ) { char *xx=NULL;*xx=0; }
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
|
|
|
|
int32_t *psni = getSiteNumInlinks();
|
|
if ( ! psni || psni == (int32_t *)-1 ) return (char *)psni;
|
|
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (char *)pfip;
|
|
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
|
|
|
|
Url *fu = getFirstUrl();
|
|
if ( ! fu || fu == (void *)-1 ) return (char *)fu;
|
|
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (char *)cu;
|
|
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;
|
|
|
|
// validate this to prevent core for simplified redirect links
|
|
int32_t hostHash32a = getHostHash32a();
|
|
|
|
// so linkSites[i] is site for link #i in Links.cpp class
|
|
int32_t *linkSiteHashes = getLinkSiteHashes ( );
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
|
|
return (char *)linkSiteHashes;
|
|
|
|
|
|
XmlDoc *nd = this;
|
|
|
|
// set "od". will be NULL if no old xml doc, i.e. no old title rec
|
|
//XmlDoc **pod = getOldXmlDoc ( );
|
|
//if ( ! pod || pod == (void *)-1 ) return (char *)pod;
|
|
//XmlDoc *od = *pod;
|
|
|
|
// if this page is hacked, then do not spider external outlinks
|
|
//char *comp = getIsCompromised();
|
|
//if ( ! comp || comp == (char *)-1 ) return (char *)comp;
|
|
//if ( *comp )
|
|
// onlyInternal = true;
|
|
|
|
bool isParentRSS = false;
|
|
bool parentIsPermalink = false;
|
|
bool parentIsSiteMap = false;
|
|
// PageAddUrl.cpp does not supply a valid new doc, so this is NULL
|
|
if ( nd ) {
|
|
isParentRSS = *nd->getIsRSS() ;
|
|
parentIsPermalink = *nd->getIsPermalink();
|
|
parentIsSiteMap = *nd->getIsSiteMap();
|
|
}
|
|
|
|
int32_t n = links->m_numLinks;
|
|
// return early if nothing to do. do not return NULL though cuz we
|
|
// do not have g_errno set!
|
|
if ( n <= 0 ) return (char *)0x01;
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int64_t myUh48 = m_firstUrl.getUrlHash48();
|
|
|
|
// . pre-allocate a buffer to hold the spider recs
|
|
// . taken from SpiderRequest::store()
|
|
int32_t size = 0;
|
|
for ( int32_t i = 0 ; i < n ; i++ )
|
|
size += SpiderRequest::getNeededSize ( links->getLinkLen(i) );
|
|
|
|
// append spider recs to this list ptr
|
|
char *p = m_p;
|
|
|
|
// hash table to avoid dups
|
|
HashTableX ht;
|
|
char buf2[8192];
|
|
if ( ! ht.set ( 4,0,1000,buf2 , 8192,false,m_niceness,"linkdedup" ) )
|
|
return NULL;
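// the dedup table is keyed on a 32-bit hash of the outlink url; 0 is
// reserved as the empty-slot marker, which is why a hash of 0 gets
// remapped to 1 in the loop below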
|
|
|
|
// count how many we add
|
|
int32_t numAdded = 0;
|
|
int32_t numAddedFromSameDomain = 0;
|
|
int32_t linksBanned = 0;
|
|
int32_t linksFiltered = 0;
|
|
|
|
bool isParentPingServer = false;
|
|
if ( fu && fu->isPingServer() ) isParentPingServer = true;
|
|
if ( cu && cu->isPingServer() ) isParentPingServer = true;
|
|
|
|
// shortcut
|
|
bool isScraping = (m_sreqValid && m_sreq.m_isScraping);
|
|
//bool useTestSpiderDir = (m_sreqValid && m_sreq.m_useTestSpiderDir);
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// do not do this if not test collection for now
|
|
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
|
|
// turn off for now
|
|
isTestColl = false;
|
|
|
|
//char **wptrs = m_words.getWords();
|
|
//int32_t *wlens = m_words.getWordLens();
|
|
|
|
// need this for setting SpiderRequest::m_spiderTime
|
|
//int32_t nowGlobal = getTimeGlobal();
|
|
|
|
// for setting LF_CONTACTY bit on the outlinks
|
|
char disbuf[1000];
|
|
HashTableX disqualify;
|
|
disqualify.set(4,0,32,disbuf,1000,false,m_niceness,"disqual");
|
|
int32_t consec = 0;
|
|
int32_t linkTypes[2000];
|
|
int32_t lastType = 0;
|
|
|
|
|
|
|
|
// if the file we are indexing now has
|
|
// "<meta name=spiderlinkslinks value=0>" then that means to
|
|
// add the links to spiderdb, but do not spider their links!
|
|
// dmozparse uses this to make a file called gbdmoz.urs.txt.0
|
|
// that is just filled with urls that are in dmoz. and we want
|
|
// to index just those urls.
|
|
//
|
|
// now just make dmozparse output urls as <a href=> tags.
|
|
//
|
|
char mbuf[16];
|
|
mbuf[0] = '\0';
|
|
char *tag = "spiderlinkslinks";
|
|
int32_t tlen = gbstrlen(tag);
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
bool avoid = false;
|
|
if ( mbuf[0] == '0' ) avoid = true;
|
|
|
|
// if this is a simplified redir and we should not be spidering
|
|
// links then turn it off as well! because we now add simplified
|
|
// redirects back into spiderdb using this function.
|
|
if ( m_spiderLinksValid && ! m_spiderLinks )
|
|
avoid = true;
|
|
|
|
// it also has this meta tag now too
|
|
mbuf[0] = '\0';
|
|
tag = "ignorelinksexternalerrors";
|
|
tlen = gbstrlen(tag);
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
bool ignore = false;
|
|
if ( mbuf[0] == '1' ) ignore = true;
|
|
|
|
// for diffbot crawlbot, if we are a seed url and redirected to a
|
|
// different domain... like bn.com --> barnesandnoble.com
|
|
int32_t redirDomHash32 = 0;
|
|
int32_t redirHostHash32 = 0;
|
|
//int32_t redirSiteHash32 = 0;
|
|
if ( //cr->m_isCustomCrawl == 1 &&
|
|
//isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) &&
|
|
m_hopCount == 0 &&
|
|
m_redirUrlValid &&
|
|
ptr_redirUrl &&
|
|
//m_redirUrlPtr && (this gets reset to NULL as being LAST redir)
|
|
// this is the last non-empty redir here:
|
|
m_redirUrl.getUrlLen() > 0 ) {
|
|
log("build: seed REDIR: %s",m_redirUrl.getUrl());
|
|
redirDomHash32 = m_redirUrl.getDomainHash32();
|
|
redirHostHash32 = m_redirUrl.getHostHash32();
|
|
}
|
|
|
|
|
|
//SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
|
|
|
|
//
|
|
// serialize each link into the metalist now
|
|
//
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// grab our info
|
|
TagRec *gr = (*grv)[i];
|
|
int32_t firstIp = (*ipv)[i];
|
|
//char isIndexed = (*iiv)[i];
|
|
//int32_t hc = hcv[i];
|
|
// ip lookup failed? do not add to spiderdb then
|
|
if ( firstIp == 0 || firstIp == -1 ) continue;
|
|
|
|
// if firstIp is in the SpiderColl::m_overflowFirstIps list
|
|
// then do not add any more links to it. it already has
|
|
// more than 500MB worth.
|
|
// this was moved to Rdb.cpp's addRecord()
|
|
// if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
|
|
// m_linkOverflows++;
|
|
// g_stats.m_totalOverflows++;
|
|
// continue;
|
|
// }
|
|
|
|
// sanity check
|
|
//if ( firstIp == 0x03 ) {char *xx=NULL;*xx=0; }
|
|
// get flags
|
|
linkflags_t flags = links->m_linkFlags[i];
|
|
// . skip if we are rss page and this link is an <a href> link
|
|
// . we only harvest <link> urls from rss feeds, not href links
|
|
// . or in the case of feedburner, those orig tags
|
|
if ( isParentRSS && (flags & LF_AHREFTAG) ) continue;
|
|
// if we have a <feedburner:origLink> tag, then ignore <link>
|
|
// tags and only get the links from the original links
|
|
if ( links->m_isFeedBurner && !(flags & LF_FBTAG) ) continue;
|
|
// do not add self links, pointless
|
|
if ( flags & LF_SELFLINK ) continue;
|
|
// do not add if no follow
|
|
if ( flags & LF_NOFOLLOW ) continue;
|
|
// point to url
|
|
char *s = links->getLink (i);
|
|
int32_t slen = links->getLinkLen(i);
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get hash
|
|
int32_t uh = hash32 ( s , slen );
|
|
// it does not like keys of 0, that means empty slot
|
|
if ( uh == 0 ) uh = 1;
|
|
// skip if dup
|
|
if ( ht.isInTable ( &uh ) ) continue;
|
|
// add it, returns false and sets g_errno on error
|
|
if ( ! ht.addKey ( &uh ) ) return NULL;
|
|
// we now support HTTPS
|
|
if ( strncmp(s,"http://",7) && strncmp(s,"https://",8) )
|
|
continue;
|
|
// . do not add if "old"
|
|
// . Links::set() calls flagOldOutlinks()
|
|
// . that just means we probably added it the last time
|
|
// we spidered this page
|
|
// . no cuz we might have a different siteNumInlinks now
|
|
// and maybe this next hop count is now allowed where as
|
|
// before it was not!
|
|
//if ( flags & LF_OLDLINK ) continue;
|
|
|
|
// set it. addWWW = true! no.. make it false because of issues
|
|
// like tmblr.co/ZHw5yo1E5TAaW injection where
|
|
// www.tmblr.co has no IP
|
|
Url url; url.set ( s , slen , false ); // true );
|
|
|
|
// if hostname length is <= 2 then SILENTLY reject it
|
|
if ( url.getHostLen() <= 2 ) continue;
|
|
|
|
// are we a new outlink, i.e. a "hot link"? assume so
|
|
bool newOutlink = true;
|
|
// if no old links, can not be a new outlink then
|
|
if ( flags & LF_OLDLINK ) newOutlink = false;
|
|
// . do not consider outlinks of new pages to be newOutlinks.
|
|
// that is somewhat redundant.
|
|
// . you can use "parentisnew" to do what you want in the url
|
|
// filters table
|
|
//if ( ! isIndexed ) newOutlink = false;
|
|
|
|
// get # of inlinks to this site... if recorded...
|
|
int32_t ksni = -1;
|
|
Tag *st = NULL;
|
|
if ( gr ) st = gr->getTag ("sitenuminlinks");
|
|
if ( st ) ksni = atol(st->getTagData());
|
|
|
|
int32_t hostHash32 = url.getHostHash32();
|
|
// . consult our sitelinks.txt file
|
|
// . returns -1 if not found
|
|
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
|
|
|
// try with www if not there
|
|
if ( min < 0 && ! url.hasSubdomain() ) {
|
|
int32_t wwwHash32 = url.getHash32WithWWW();
|
|
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
|
}
|
|
|
|
if ( min >= 0 && ksni < min )
|
|
ksni = min;
|
|
|
|
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
//int32_t ksni = m_siteNumInlinks;
|
|
|
|
// . get possible pub date from url (.../2008/09/23/page.htm)
|
|
// . this returns 0 if none found
|
|
//int32_t urlPubDate = parseDateFromUrl(s);
|
|
|
|
// use zero for the timestamp so SiteGetter does not recompute
|
|
// any tags in the tagRec thereby blocking!
|
|
//SiteGetter sg;
|
|
//sg.getSite ( s , gr , 0, m_coll, m_niceness,false,NULL,NULL);
|
|
// get this
|
|
bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );
|
|
//int32_t siteHash32 = hash32n ( linkSite );
|
|
|
|
// get it quick
|
|
bool ispingserver = url.isPingServer();
|
|
int32_t domHash32 = url.getDomainHash32();
|
|
|
|
// is link rss?
|
|
//bool isrss = false;
|
|
//if (slen>6 && !strncasecmp(s+slen-4,".rss",4)) isrss = true;
|
|
bool isRSSExt = false;
|
|
char *ext = url.getExtension();
|
|
if ( ext && strcasecmp(ext,"rss" ) == 0 ) isRSSExt = true;
|
|
if ( ext && strcasecmp(ext,"xml" ) == 0 ) isRSSExt = true;
|
|
if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
|
|
|
|
|
|
// make the spider request rec for it
|
|
SpiderRequest ksr;
|
|
// to defaults (zero out)
|
|
ksr.reset();
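// one SpiderRequest per surviving outlink; the fields below come from
// the per-link vectors gathered above (firstIp, tag rec, link site
// hash) and from the parent doc (hop count, domain/host/site hashes,
// RSS/permalink/ping-server flags)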
|
|
// set other fields besides key
|
|
ksr.m_firstIp = firstIp;
|
|
ksr.m_hostHash32 = hostHash32;
|
|
ksr.m_domHash32 = domHash32;
|
|
ksr.m_siteHash32 = linkSiteHashes[i];//siteHash32;
|
|
ksr.m_siteNumInlinks = ksni;
|
|
ksr.m_siteNumInlinksValid = true;
|
|
ksr.m_isRSSExt = isRSSExt;
|
|
// continue using "test-spider" subdir to cache web pages
|
|
// if our parent was using that
|
|
//ksr.m_useTestSpiderDir = useTestSpiderDir;
|
|
ksr.m_parentIsSiteMap = parentIsSiteMap;
|
|
|
|
ksr.m_hasMediaExtension = url.hasMediaExtension();
|
|
ksr.m_hasMediaExtensionValid = 1;
|
|
|
|
// now we need this so we can share Msg12 spider locks with
|
|
// query reindex docid-based spider requests. that way
|
|
// we do not spider the same document at the same time.
|
|
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);
|
|
|
|
//ksr.m_pageNumInlinks = 0;
|
|
|
|
// hop count is now 16 bits so do not wrap that around
|
|
int32_t hc = m_hopCount + 1;
|
|
if ( hc > 65535 ) hc = 65535;
|
|
ksr.m_hopCount = hc;
|
|
|
|
// keep hopcount the same for redirs
|
|
if ( m_indexCodeValid &&
|
|
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
|
|
m_indexCode == EDOCNONCANONICAL ) )
|
|
ksr.m_hopCount = m_hopCount;
|
|
|
|
// for diffbot custom crawls we keep the computed hopcount
|
|
if ( ! cr->m_isCustomCrawl ) {
|
|
if ( issiteroot ) ksr.m_hopCount = 0;
|
|
if ( ispingserver ) ksr.m_hopCount = 0;
|
|
//if ( isrss ) ksr.m_hopCount = 0;
|
|
}
|
|
|
|
// log("ksr: url=%s hc=%i (isr=%i ips=%i icv=%i ic=%i mhc=%i)",
|
|
// url.getUrl(),(int)ksr.m_hopCount,
|
|
// (int)issiteroot,(int)ispingserver,(int)m_indexCodeValid,
|
|
// (int)m_indexCode,(int)m_hopCount
|
|
// );
|
|
|
|
// validate it
|
|
ksr.m_hopCountValid = true;
|
|
|
|
ksr.m_addedTime = getSpideredTime();//m_spideredTime;
|
|
//ksr.m_lastAttempt = 0;
|
|
//ksr.m_urlPubDate = urlPubDate;
|
|
//ksr.m_errCode = 0;
|
|
ksr.m_parentHostHash32 = hostHash32a;
|
|
ksr.m_parentDomHash32 = m_domHash32;
|
|
ksr.m_parentSiteHash32 = m_siteHash32;
|
|
|
|
// if a seed/hopcount0 url redirected to a different domain
|
|
// then use that if it is the same. that way we can satisfy
|
|
// the "isonsamedomain" expression in the url filters table.
|
|
if ( redirDomHash32 == domHash32 && redirDomHash32 )
|
|
ksr.m_parentDomHash32 = redirDomHash32;
|
|
if ( redirHostHash32 == hostHash32 && redirHostHash32 )
|
|
ksr.m_parentHostHash32 = redirHostHash32;
|
|
|
|
//ksr.m_parentFirstIp = *pfip;//m_ip;
|
|
ksr.m_pageNumInlinks = 0;
|
|
|
|
ksr.m_parentHasAddress = parentHasAddress;
|
|
// get this
|
|
bool isupf = ::isPermalink(NULL,&url,CT_HTML,NULL,isRSSExt);
|
|
// set some bit flags. the rest are 0 since we call reset()
|
|
if ( newOutlink ) ksr.m_isNewOutlink = 1;
|
|
if ( isupf ) ksr.m_isUrlPermalinkFormat = 1;
|
|
//if ( isIndexed ) ksr.m_isIndexed = 1;
|
|
if ( ispingserver ) ksr.m_isPingServer = 1;
|
|
|
|
// is it like www.xxx.com/* (does not include www.xxx.yyy.com)
|
|
// includes xxx.com/* however
|
|
ksr.m_isWWWSubdomain = url.isSimpleSubdomain();
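// illustrative examples of the flag above (assuming
// Url::isSimpleSubdomain() behaves as the comment describes):
//   http://www.xxx.com/page  -> m_isWWWSubdomain = 1
//   http://xxx.com/page      -> m_isWWWSubdomain = 1
//   http://www.xxx.yyy.com/  -> m_isWWWSubdomain = 0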
|
|
|
|
// get link text we use for this outlink
|
|
/*
|
|
char tbuf[200];
|
|
int32_t tlen = links->getLinkText2 ( i ,
|
|
tbuf ,
|
|
200 ,
|
|
NULL ,
|
|
NULL ,
|
|
NULL ,
|
|
m_niceness );
|
|
*/
|
|
|
|
// the updated isContacty algo to fix www.apha.org which
|
|
// has a ton of apha.org/about/* links
|
|
int32_t t = getIsContacty ( &url,
|
|
NULL ,
|
|
ksr.m_hopCount ,
|
|
0 , // content type
|
|
(ksr.m_hopCount==0),
|
|
m_niceness );
|
|
// if same type as last one we might disqualify if 3 in a row
|
|
if ( t && t == lastType ) consec++;
|
|
else consec = 0;
|
|
// disqualify this pattern as a contacty link if it is abused
|
|
if ( consec >= 3 )
|
|
if ( ! disqualify.addKey(&t) )
|
|
return NULL;
|
|
// remember. use numAdded as the index for this since we do
|
|
// not add all the outlinks to this list.
|
|
if ( numAdded < 2000 ) linkTypes[numAdded] = t;
|
|
// set this
|
|
lastType = t;
|
|
|
|
// validate
|
|
ksr.m_isContactyValid = 1;
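// worked example of the disqualification above: www.apha.org has
// a ton of apha.org/about/* links, so the same contacty type "t"
// repeats; once consec hits 3 that type is added to "disqualify"
// and the second pass below will not set m_isContacty on any
// request of that type.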
|
|
|
|
// if parent is a root of a popular site, then it is considered
|
|
// an authority linker. (see updateTagdb() function above)
|
|
if ( *isRoot && *psni >= 500 )
|
|
ksr.m_hasAuthorityInlink = 1;
|
|
// this is in request now as well as reply
|
|
//Tag *tag;
|
|
// hascontactinfo tag can have a value of 0 or 1
|
|
//tag = gr->getTag("hascontactinfo");
|
|
//if ( tag ) {
|
|
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_hasContactInfo ) {
|
|
ksr.m_hasContactInfo = 1;
|
|
ksr.m_hasContactInfoValid = true;
|
|
}
|
|
|
|
// if we just set the contact info, use us, more recent
|
|
if ( linkSiteHashes[i]==m_siteHash32 && m_hasContactInfoValid){
|
|
ksr.m_hasContactInfo = m_hasContactInfo;
|
|
ksr.m_hasContactInfoValid = true;
|
|
}
|
|
|
|
if ( gr->getTag("ingoogle" ) ) {
|
|
ksr.m_inGoogle = 1;
|
|
ksr.m_inGoogleValid = true;
|
|
}
|
|
// the mere existence of these tags is good
|
|
if ( gr->getTag("authorityinlink"))ksr.m_hasAuthorityInlink =1;
|
|
ksr.m_hasAuthorityInlinkValid = true;
|
|
|
|
// if our url was a seed and redirected to another domain
|
|
// allow outlinks on that other domain to be on domain too.
|
|
// only used for diffbot crawlbot right now.
|
|
if ( domHash32 == redirDomHash32 && redirDomHash32 )
|
|
ksr.m_sameDom = 1;
|
|
if ( hostHash32 == redirHostHash32 && redirHostHash32 )
|
|
ksr.m_sameHost = 1;
|
|
// if ( linkSiteHashes[i]==redirSiteHash32 && redirSiteHash32)
|
|
// ksr.m_sameSite = 1;
|
|
|
|
// set parent based info
|
|
if ( domHash32 == m_domHash32 ) ksr.m_sameDom = 1;
|
|
if ( hostHash32 == m_hostHash32a ) ksr.m_sameHost = 1;
|
|
if ( linkSiteHashes[i]==m_siteHash32 ) ksr.m_sameSite = 1;
|
|
if ( *ipi ) ksr.m_wasParentIndexed = 1;
|
|
if ( isParentRSS ) ksr.m_parentIsRSS = 1;
|
|
if ( parentIsPermalink ) ksr.m_parentIsPermalink = 1;
|
|
if ( isParentPingServer ) ksr.m_parentIsPingServer= 1;
|
|
if ( parentIsSiteMap ) ksr.m_parentIsSiteMap = 1;
|
|
|
|
// this is used for building dmoz. we just want to index
|
|
// the urls in dmoz, not their outlinks.
|
|
if ( avoid ) ksr.m_avoidSpiderLinks = 1;
|
|
|
|
// this is used for building dmoz. we need to index this
|
|
// url even in the case of ETCPTIMEDOUT, etc.
|
|
if ( ignore ) ksr.m_ignoreExternalErrors = 1;
|
|
|
|
// . if this is the 2nd+ time we were spidered and this outlink
|
|
// wasn't there last time, then set this!
|
|
// . if this is the first time spidering this doc then set it
|
|
// to zero so that m_minPubDate is set to -1 when the outlink
|
|
// defined by "ksr" is spidered.
|
|
if ( m_oldDocValid && m_oldDoc ) {
|
|
int32_t oldSpideredTime = m_oldDoc->getSpideredTime();
|
|
ksr.m_parentPrevSpiderTime = oldSpideredTime;
|
|
}
|
|
else
|
|
ksr.m_parentPrevSpiderTime = 0;
|
|
|
|
//
|
|
// . inherit manual add bit if redirecting to simplified url
|
|
// . so we always spider seed url even if prohibited by
|
|
// the regex, and even if it simplified redirects
|
|
//
|
|
if ( m_indexCodeValid &&
|
|
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
|
|
m_indexCode == EDOCNONCANONICAL ) &&
|
|
m_sreqValid ) {
|
|
if ( m_sreq.m_isInjecting )
|
|
ksr.m_isInjecting = 1;
|
|
if ( m_sreq.m_isAddUrl )
|
|
ksr.m_isAddUrl = 1;
|
|
}
|
|
|
|
// it is useful to know the primary langid of the parent
|
|
// when prioritizing links for spidering in the case of
|
|
// focussing the search engine on a particular set of langs
|
|
ksr.m_parentLangId = *langId;
|
|
|
|
// don't forget this one!
|
|
//ksr.m_spiderTime = nowGlobal;
|
|
|
|
// . is it "spam"? XmlDoc.cpp::isSpam()
|
|
// . we need to make that root quality into site root quality!
|
|
// . let's put spam detection logic into url filters
|
|
//if ( isSpam ( s,gr,m_spideredTime,true ) )
|
|
// // set the bit flag
|
|
// ksr.m_isSpam = 1;
|
|
// copy the url into SpiderRequest::m_url buffer
|
|
strcpy(ksr.m_url,s);
|
|
// this must be valid
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set the key, ksr.m_key. isDel = false
|
|
ksr.setKey ( firstIp, *d , false );
|
|
|
|
// we were hopcount 0, so if we link to ourselves we override
|
|
// our original hopcount of 0 with this guy that has a
|
|
// hopcount of 1. that sux... so don't do it.
|
|
if ( ksr.getUrlHash48() == myUh48 ) continue;
|
|
|
|
// if we've recently added this url to spiderdb in Spider.cpp, skip it
|
|
//if ( sc && sc->isInDupCache ( &ksr , false ) )
|
|
// continue;
|
|
|
|
// . technically speaking we do not have any reply so we
|
|
// should not be calling this! cuz we don't have all the info
|
|
// . see if banned or filtered, etc.
|
|
// . at least try to call it. getUrlFilterNum() should
|
|
// break out and return -1 if it encounters a filter rule
|
|
// that it does not have enough info to answer.
|
|
// so if your first X filters all map to a "FILTERED"
|
|
// priority and this url matches one of them we can
|
|
// confidently toss this guy out.
|
|
// . show this for debugging!
|
|
// int32_t ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
|
|
// false, m_niceness, cr,
|
|
// false,//true , // outlink?
|
|
// NULL ); // quotatable
|
|
// logf(LOG_DEBUG,"build: ufn=%" INT32 " for %s",
|
|
// ufn,ksr.m_url);
|
|
|
|
// bad?
|
|
//if ( ufn < 0 ) {
|
|
// log("build: link %s had bad url filter."
|
|
// , ksr.m_url );
|
|
// g_errno = EBADENGINEER;
|
|
// return NULL;
|
|
//}
|
|
|
|
//int32_t priority = -1;
|
|
//if ( ufn >= 0 )
|
|
// priority = cr->m_spiderPriorities[ufn];
|
|
|
|
// debug
|
|
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {
|
|
// print the tag rec out into sb2
|
|
SafeBuf sb2;
|
|
if ( gr ) gr->printToBuf ( &sb2 );
|
|
// get it
|
|
//SafeBuf sb1;
|
|
char *action = "add";
|
|
if ( isScraping ) action = "scrape";
|
|
logf(LOG_DEBUG,
|
|
"spider: attempting to %s link. "
|
|
"%s "
|
|
"tags=%s "
|
|
"onpage=%s"
|
|
,
|
|
action ,
|
|
ksr.m_url,
|
|
//sb1.getBufStart(),
|
|
sb2.getBufStart(),
|
|
m_firstUrl.m_url);
|
|
}
|
|
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
|
|
// . mdw: oct 24, 2013. now i add so the urls show up in
|
|
// the pagecrawlbot.cpp spiderdb dump, so you can examine
|
|
// exactly why a url was crawled or not. plus if you change
|
|
// your mind about banning/filtering then it'd be nice to
|
|
// have these urls readily available.
|
|
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
|
// linksFiltered++; continue; }
|
|
//if ( priority == SPIDER_PRIORITY_BANNED ) {
|
|
// linksBanned++; continue; }
|
|
|
|
|
|
// serialize into the buffer
|
|
int32_t need = ksr.getRecSize();
|
|
// is that what we thought it would be?
|
|
//int32_t thought = links->m_linkLens[i] + 1 + hsize;
|
|
// sanity check
|
|
//if ( need + 12 + 4 > thought ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( p + 1 + need > m_pend ) { char *xx=NULL;*xx=0; }
|
|
// store the rdbId
|
|
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
|
|
else *p++ = RDB_SPIDERDB;
|
|
// print it for debug
|
|
if ( isTestColl ) {
|
|
SafeBuf tmp;
|
|
ksr.print(&tmp);
|
|
log("spider: attempting to add outlink "
|
|
"%s",tmp.getBufStart());
|
|
}
|
|
// store the spider rec
|
|
gbmemcpy ( p , &ksr , need );
|
|
// skip it
|
|
p += need;
|
|
// count it
|
|
numAdded++;
|
|
// check domain
|
|
//if ( domHash32 == m_domHash32 ) numAddedFromSameDomain++;
|
|
if ( ksr.m_sameDom ) numAddedFromSameDomain++;
|
|
}
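//
// at this point each outlink we kept occupies one variable-size
// record in the metalist between m_p and p. a sketch of that
// layout, based on the serialization loop above (not a formal spec):
//
//   [1 byte rdbId (RDB_SPIDERDB or RDB2_SPIDERDB2)]
//   [SpiderRequest of ksr.getRecSize() bytes, url included]
//
// the scan below walks these same records to set m_isContacty.
//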
|
|
|
|
//
|
|
// scan through requests and set m_isContacty
|
|
//
|
|
char *s = m_p;
|
|
int32_t k = 0;
|
|
for ( ; s < p ; k++ ) {
|
|
// advance over rdbid
|
|
s++;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// cast
|
|
SpiderRequest *ksr = (SpiderRequest *)s;
|
|
// set size
|
|
int32_t size = ksr->getRecSize();
|
|
// advance over that
|
|
s += size;
|
|
// stop if breach
|
|
if ( k >= 2000 ) break;
|
|
// must be isContacty
|
|
if ( ! linkTypes[k] ) continue;
|
|
// and not disqualified
|
|
if ( disqualify.isInTable(&linkTypes[k] )) continue;
|
|
// ok, we are good to go
|
|
ksr->m_isContacty = 1;
|
|
}
|
|
|
|
// . this is just how many urls we tried to index
|
|
// . move into Spider::addSpiderRequest()
|
|
//cr->m_localCrawlInfo.m_urlsHarvested += numAdded;
|
|
//cr->m_globalCrawlInfo.m_urlsHarvested += numAdded;
|
|
//cr->m_needsSave = true;
|
|
|
|
// save it
|
|
m_numOutlinksAdded = numAdded;
|
|
m_numOutlinksAddedValid = true;
|
|
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
|
|
m_numOutlinksFiltered = linksFiltered;
|
|
m_numOutlinksBanned = linksBanned;
|
|
// update end of list once we have successfully added all spider recs
|
|
m_p = p;
|
|
// return current ptr
|
|
return m_p ;
|
|
}
|
|
|
|
|
|
/*
|
|
// add keys/recs from the table into the metalist
|
|
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
|
|
int32_t date1 ,
|
|
bool nosplit ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// docid is handy
|
|
int64_t d = *getDocId();
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// use secondary rdbs if repairing
|
|
//bool useRdb2 = ( g_repair.isRepairActive() &&
|
|
// ! g_repair.m_fullRebuild &&
|
|
// ! g_repair.m_removeBadPages );
|
|
char rdbId1 = RDB_INDEXDB;
|
|
char rdbId2 = RDB_DATEDB;
|
|
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
|
|
rdbId1 = RDB2_INDEXDB2;
|
|
rdbId2 = RDB2_DATEDB2;
|
|
}
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// get its key
|
|
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
|
|
// get the score
|
|
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
|
|
// sanity check
|
|
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// store rdbid
|
|
*m_p++ = (rdbId1 | f);
|
|
// store it. not a del key.
|
|
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,false);
|
|
// skip it
|
|
m_p += sizeof(key_t);
|
|
// add to datedb?
|
|
if ( date1 == -1 ) continue;
|
|
// yes
|
|
*m_p++ = (rdbId2 | f);
|
|
// store it. not a del key.
|
|
*(key128_t *)m_p=
|
|
g_datedb.makeKey(*termId1,date1,score1,d,false);
|
|
// advance over that
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
uint8_t rdbId ,
|
|
bool forDelete ) {
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
|
|
// store this rdbId into the list
|
|
char useRdbId = rdbId;
|
|
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) useRdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) useRdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) useRdbId = RDB2_DATEDB2;
|
|
if ( useRdb2 && rdbId == RDB_PLACEDB ) useRdbId = RDB2_PLACEDB2;
|
|
if ( useRdb2 && rdbId == RDB_SECTIONDB ) useRdbId = RDB2_SECTIONDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else if ( rdbId == RDB_SECTIONDB ) {
|
|
int32_t svs = sizeof(SectionVote);
|
|
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
int32_t count = 0;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key128_t *k = (key128_t *)tt1->getKey ( i );
|
|
// no key is allowed to have the del bit clear at this point
|
|
// because we reserve that for making negative keys!
|
|
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
|
|
// store rdbid
|
|
*m_p++ = useRdbId; // (useRdbId | f);
|
|
// store it
|
|
// *(key128_t *)m_p = *k; does this work?
|
|
gbmemcpy ( m_p , k , sizeof(key128_t) );
|
|
// all keys must be positive at this point
|
|
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
|
|
// or if getting for incremental indexing and this is
|
|
// from the "oldList"
|
|
//if ( forDelete ) *m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key128_t);
|
|
// count it
|
|
count++;
|
|
// do not add the data if deleting
|
|
if ( forDelete ) continue;
|
|
// skip if not sectiondb or placedb
|
|
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
|
|
// ok test it out (MDW)
|
|
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
|
|
//if ( count > 1 ) continue;
|
|
// get the data value
|
|
char *val = (char *)tt1->getValue ( k );
|
|
// get the size of the data to store. assume Sectiondb vote.
|
|
int32_t ds = sizeof(SectionVote);
|
|
// placedb is special even. include the \0 terminator
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
// "ds" is how many bytes we store as data
|
|
ds = gbstrlen(val)+1;
|
|
// store dataSize first
|
|
*(int32_t *)m_p = ds;
|
|
// skip it
|
|
m_p += 4;
|
|
}
|
|
// store possible accompanying date of the rdb record
|
|
gbmemcpy (m_p,val, ds );
|
|
// skip it
|
|
m_p += ds;
|
|
}
|
|
//if(rdbId==RDB_LINKDB ) log("doc: added %" INT32 " linkdb keys" ,count);
|
|
//if(rdbId==RDB_SECTIONDB ) log("doc: added %" INT32 " sectiondb keys",count);
|
|
return true;
|
|
}
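// Illustrative sketch of what addTable128() above appends to m_p for each
// occupied slot (derived from the loop, not a formal spec):
//
//   linkdb / datedb : [1 byte rdbId][16 byte key]
//   sectiondb       : [1 byte rdbId][16 byte key][sizeof(SectionVote) data]
//   placedb         : [1 byte rdbId][16 byte key][4 byte dataSize][string + \0]
//
// when "forDelete" is true only the rdbId and key are stored, no data.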
|
|
|
|
int32_t XmlDoc::getSiteRank ( ) {
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
return ::getSiteRank ( m_siteNumInlinks );
|
|
}
|
|
|
|
// . add keys/recs from the table into the metalist
|
|
// . we store the keys into "m_p" unless "buf" is given
|
|
bool XmlDoc::addTable144 ( HashTableX *tt1 , int64_t docId , SafeBuf *buf ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key144_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// assume we are storing into m_p
|
|
char *p = m_p;
|
|
|
|
// reserve space if we had a safebuf and point into it if there
|
|
if ( buf ) {
|
|
int32_t slotSize = (sizeof(key144_t)+2+sizeof(key128_t));
|
|
int32_t need = tt1->getNumSlotsUsed() * slotSize;
|
|
if ( ! buf->reserve ( need ) ) return false;
|
|
// get cursor into buf, NOT START of buf
|
|
p = buf->getBufStart();
|
|
}
|
|
|
|
int32_t siteRank = getSiteRank ();
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
char rdbId = RDB_POSDB;
|
|
if ( m_useSecondaryRdbs ) rdbId = RDB2_POSDB2;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// get its key
|
|
char *kp = (char *)tt1->getKey ( i );
|
|
// store rdbid
|
|
*p++ = rdbId; // (rdbId | f);
|
|
// store it as is
|
|
gbmemcpy ( p , kp , sizeof(key144_t) );
|
|
// sanity check
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//h64 &= TERMID_MASK;
|
|
//if ( g_posdb.getTermId(kp) == h64 ) {
|
|
// log("hey: docid=%" INT64 " float=%f",m_docId,
|
|
// g_posdb.getFloat(kp) );
|
|
//}
|
|
/*
|
|
// get the score
|
|
int32_t score = tt1->getScoreFromSlot ( i ) ;
|
|
// set the M-bits to the score. used to accumulate link texts
|
|
// that are the same so pages like google.com do not have
|
|
// the word 'google' like 1 million times. this should reduce
|
|
// our "score" logarithmically into the 7-bits or whatever.
|
|
//
|
|
// NO! now we just always increment the distance cursor
|
|
// m_dist so there will never be a collision of any posdb
|
|
// key we add... so we think
|
|
if ( score ) {
|
|
int32_t newScore = score;
|
|
if ( score >= 65 ) newScore = 65 +(score/100);
|
|
//if ( score >= 65+3200) newScore = 65 +(score/100);
|
|
if ( newScore > MAXMULTIPLIER )
|
|
newScore = MAXMULTIPLIER;
|
|
g_posdb.setMultiplierBits(m_p,(unsigned char)newScore);
|
|
}
|
|
*/
|
|
// the docid was zero when we added these keys to the table, so fix it
|
|
g_posdb.setDocIdBits ( p , docId );
|
|
// if this is a numeric field we do not want to set
|
|
// the siterank or langid bits because it will mess up
|
|
// sorting by the float which is basically in the position
|
|
// of the word position bits.
|
|
if ( g_posdb.isAlignmentBitClear ( p ) ) {
|
|
// make sure it is set again. it was just cleared
|
|
// to indicate that this key contains a float
|
|
// like a price or something, and we should not
|
|
// set siterank or langid so that its termlist
|
|
// remains sorted just by that float
|
|
g_posdb.setAlignmentBit ( p , 1 );
|
|
}
|
|
// otherwise, set the siterank and langid
|
|
else {
|
|
// this too
|
|
g_posdb.setSiteRankBits ( p , siteRank );
|
|
// set language here too
|
|
g_posdb.setLangIdBits ( p , m_langId );
|
|
}
|
|
// advance over it
|
|
p += sizeof(key144_t);
|
|
}
|
|
|
|
// all done
|
|
if ( ! buf ) { m_p = p; return true; }
|
|
|
|
// update safebuf otherwise
|
|
char *start = buf->getBufStart();
|
|
// fix SafeBuf::m_length
|
|
buf->setLength ( p - start );
|
|
// sanity
|
|
if ( buf->length() > buf->getCapacity() ) { char *xx=NULL;*xx=0; }
|
|
|
|
return true;
|
|
}
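// Note on the alignment-bit branch in addTable144() above: posdb keys made
// for numeric terms (e.g. a gbsortby-style price field) store a float where
// the word-position bits normally live, and they are flagged by a cleared
// alignment bit. For those keys we only restore the alignment bit; writing
// siterank or langid bits would corrupt the float and break sorting by it.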
|
|
|
|
// add keys/recs from the table into the metalist
|
|
bool XmlDoc::addTable224 ( HashTableX *tt1 ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key224_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 0 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
char rdbId = RDB_LINKDB;
|
|
if ( m_useSecondaryRdbs ) rdbId = RDB2_LINKDB2;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// get its key
|
|
char *kp = (char *)tt1->getKey ( i );
|
|
// store rdbid
|
|
*m_p++ = rdbId; // (rdbId | f);
|
|
// store it as is
|
|
gbmemcpy ( m_p , kp , sizeof(key224_t) );
|
|
// advance over it
|
|
m_p += sizeof(key224_t);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
// . add table into our metalist pointed to by m_p
|
|
// . k.n1 = date (see hashWords() below)
|
|
// . k.n0 = termId (see hashWords() below)
|
|
// . and the value is the score, 32-bits
|
|
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
uint64_t docId ,
|
|
uint8_t rdbId ,
|
|
bool nosplit ) {
|
|
|
|
if ( tt1->m_numSlotsUsed == 0 ) return true;
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity checks
|
|
if ( nosplit ) {
|
|
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key96_t *k = (key96_t *)tt1->getKey ( i );
|
|
// get its value
|
|
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
|
|
// convert to 8 bits
|
|
v = score32to8 ( v );
|
|
// . make the meta list key for datedb
|
|
// . a datedb key (see Datedb.h)
|
|
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
|
|
k->n1 , // date
|
|
v , // score (8 bits)
|
|
docId ,
|
|
false );// del key?
|
|
// store rdbid with optional "nosplit" flag
|
|
*m_p++ = (rdbId | f);
|
|
// store it. it is a del key.
|
|
*(key128_t *)m_p = mk;
|
|
// skip it
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// add keys/recs from the table into the metalist
|
|
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
|
|
HashTableX *tt2 ,
|
|
int32_t date1 ,
|
|
int32_t date2 ,
|
|
bool del ,
|
|
bool nosplit ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
if ( tt2->m_numSlots ) {
|
|
if ( tt2->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt2->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// docid is handy
|
|
int64_t d = *getDocId();
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// use secondary rdbs if repairing
|
|
//bool useRdb2 = ( g_repair.isRepairActive() &&
|
|
// ! g_repair.m_fullRebuild &&
|
|
// ! g_repair.m_removeBadPages );
|
|
char rdbId1 = RDB_INDEXDB;
|
|
char rdbId2 = RDB_DATEDB;
|
|
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
|
|
rdbId1 = RDB2_INDEXDB2;
|
|
rdbId2 = RDB2_DATEDB2;
|
|
}
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
|
|
// get the score
|
|
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
|
|
// sanity check
|
|
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// see if in "tt2"
|
|
int32_t slot = tt2->getSlot ( termId1 );
|
|
// assume 0
|
|
uint8_t score2 = 0;
|
|
// look it up in the positive key table
|
|
if ( slot >= 0 ) {
|
|
score2 = score32to8 ( tt2->getScoreFromSlot(slot) );
|
|
// sanity check
|
|
if ( score2 <= 0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// we annihilate!
|
|
if ( score1 != score2 ) {
|
|
// store rdbid
|
|
*m_p++ = (rdbId1 | f);
|
|
// store it. it is a del key.
|
|
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,del);
|
|
// skip it
|
|
m_p += sizeof(key_t);
|
|
}
|
|
// add to datedb?
|
|
if ( date1 == -1 ) continue;
|
|
// same dates too?
|
|
if ( date1 == date2 && score1 == score2 ) continue;
|
|
// yes
|
|
*m_p++ = (rdbId2 | f);
|
|
// store it. it is a del key.
|
|
*(key128_t *)m_p=g_datedb.makeKey(*termId1,date1,score1,d,del);
|
|
// advance over that
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . add table into our metalist pointed to by m_p
|
|
// . k.n1 = date (see hashWords() below)
|
|
// . k.n0 = termId (see hashWords() below)
|
|
// . and the value is the score, 32-bits
|
|
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
HashTableX *tt2 , // <key128_t,char> *tt2
|
|
uint64_t docId ,
|
|
uint8_t rdbId ,
|
|
bool del ,
|
|
bool nosplit ) {
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity checks
|
|
if ( nosplit ) {
|
|
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key96_t *k = (key96_t *)tt1->getKey ( i );
|
|
// get its value
|
|
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
|
|
// convert to 8 bits
|
|
v = score32to8 ( v );
|
|
// see if in "tt2"
|
|
int32_t slot = tt2->getSlot ( k );
|
|
// get value if there
|
|
if ( slot >= 0 ) {
|
|
// get it
|
|
uint32_t val =*(uint32_t *)tt2->getValueFromSlot(slot);
|
|
// convert to 8 bits
|
|
val = score32to8 ( val );
|
|
// compare, if same, skip it!
|
|
if ( val == v ) continue;
|
|
}
|
|
// . make the meta list key for datedb
|
|
// . a datedb key (see Datedb.h)
|
|
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
|
|
k->n1 , // date
|
|
v , // score (8 bits)
|
|
docId ,
|
|
del );// del key?
|
|
// store rdbid with optional "nosplit" flag
|
|
*m_p++ = (rdbId | f);
|
|
// store it. it is a del key.
|
|
*(key128_t *)m_p = mk;
|
|
// skip it
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
HashTableX *tt2 , // <key128_t,char> *tt2
|
|
uint8_t rdbId ,
|
|
bool del ,
|
|
bool nosplit ) {
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity checks
|
|
if ( nosplit ) {
|
|
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
|
|
if ( rdbId == RDB_DATEDB ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else if ( rdbId == RDB_SECTIONDB ) {
|
|
int32_t svs = sizeof(SectionVote);
|
|
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != svs ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
int32_t count = 0;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key128_t *k = (key128_t *)tt1->getKey ( i );
|
|
// no key is allowed to have the del bit clear at this point
|
|
// because we reserve that for making negative keys!
|
|
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
|
|
// see if in "tt2"
|
|
int32_t slot = tt2->getSlot ( k );
|
|
// . skip if already indexed
|
|
// . do not do incremental indexing for sectiondb/placedb since
|
|
// it may have the same key but different data!!!!!!!
|
|
if ( slot >= 0 &&
|
|
rdbId != RDB_SECTIONDB &&
|
|
rdbId != RDB_PLACEDB )
|
|
continue;
|
|
// store rdbid with optional "nosplit" flag
|
|
*m_p++ = (rdbId | f);
|
|
// store it
|
|
// *(key128_t *)m_p = *k; does this work?
|
|
gbmemcpy ( m_p , k , sizeof(key128_t) );
|
|
// all keys must be positive at this point
|
|
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
|
|
// clear the del bit if we are an unmatched key and "del"
|
|
// is true. we need to be a negative key now
|
|
if ( del ) m_p[0] = m_p[0] & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key128_t);
|
|
// count it
|
|
count++;
|
|
// skip if not sectiondb or placedb
|
|
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
|
|
// ok test it out (MDW)
|
|
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
|
|
//if ( count > 1 ) continue;
|
|
// if we were a negative key, do not add a value, even for
|
|
// sectiondb
|
|
if ( del ) continue;
|
|
// get the data value
|
|
char *val = (char *)tt1->getValue ( k );
|
|
// get the size of the data to store. assume Sectiondb vote.
|
|
int32_t ds = sizeof(SectionVote);
|
|
// placedb is special even. include the \0 terminator
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
// "ds" is how many bytes we store as data
|
|
ds = gbstrlen(val)+1;
|
|
// store dataSize first
|
|
*(int32_t *)m_p = ds;
|
|
// skip it
|
|
m_p += 4;
|
|
}
|
|
// store possible accompanying date of the rdb record
|
|
gbmemcpy (m_p,val, ds );
|
|
// skip it
|
|
m_p += ds;
|
|
}
|
|
//if(rdbId==RDB_LINKDB ) log("doc: added %" INT32 " linkdb keys" ,count);
|
|
//if(rdbId==RDB_SECTIONDB ) log("doc: added %" INT32 " sectiondb keys",count);
|
|
return true;
|
|
}
|
|
*/
//
|
|
// . hash terms that are sharded by TERMID not DOCID!!
|
|
//
|
|
// . returns false and sets g_errno on error
|
|
// . these terms are stored in indexdb/datedb, but all terms with the same
|
|
// termId reside in one and only one group. whereas normally the records
|
|
// are split based on docid and every group gets 1/nth of the termlist.
|
|
// . we do this "no splitting" so that only one disk seek is required, and
|
|
// we know the termlist is small, or the termlist is being used for spidering
|
|
// or parsing purposes and is usually not sent across the network.
|
|
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
|
|
|
//if ( m_pbuf )
|
|
// m_pbuf->safePrintf("<h3>Terms which are immune to indexdb "
|
|
// "splitting:</h3>");
|
|
|
|
//if ( m_skipIndexing ) return true;
|
|
|
|
// this should be ready to go and not block!
|
|
int64_t *pch64 = getExactContentHash64();
|
|
//int64_t *pch64 = getLooseContentHash64();
|
|
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// shortcut
|
|
Url *fu = getFirstUrl();
|
|
|
|
if ( ! hashVectors ( tt ) ) return false;
|
|
|
|
// constructor should set to defaults automatically
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
// usually we shard by docid, but these are terms we shard by termid!
|
|
hi.m_shardByTermId = true;
|
|
|
|
|
|
// for exact content deduping
|
|
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
|
|
char cbuf[64];
|
|
int32_t clen = sprintf(cbuf,"%" UINT64 "",*pch64);
|
|
hi.m_prefix = "gbcontenthash";
|
|
if ( ! hashString ( cbuf,clen,&hi ) ) return false;
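// illustrative note: this indexes a single term like
// gbcontenthash:<decimal 64-bit hash>, sharded by termid, so a
// later exact-content dedup check can fetch that one termlist
// with a single disk seek (see the comment above hashNoSplit).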
|
|
|
|
////
|
|
//
|
|
// let's stop here for now, until other stuff is actually used again
|
|
//
|
|
////
|
|
|
|
// let's bring back image thumbnail support for the widget project
|
|
//return true;
|
|
|
|
|
|
|
|
char *host = fu->getHost ();
|
|
//int32_t hlen = fu->getHostLen ();
|
|
|
|
/*
|
|
setStatus ( "hashing no-split qdom keys" );
|
|
|
|
char *dom = fu->getDomain ();
|
|
int32_t dlen = fu->getDomainLen();
|
|
|
|
// desc is NULL, prefix will be used as desc
|
|
hi.m_prefix = "qdom";
|
|
if ( ! hashString ( dom,dlen,&hi ) ) return false;
|
|
|
|
|
|
setStatus ( "hashing no-split qhost keys" );
|
|
|
|
// desc is NULL, prefix will be used as desc
|
|
hi.m_prefix = "qhost";
|
|
if ( ! hashString ( host,hlen,&hi ) ) return false;
|
|
*/
|
|
|
|
|
|
// now hash the site
|
|
|
|
|
|
setStatus ( "hashing no-split SiteGetter terms");
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// these are now no-split terms
|
|
//
|
|
char *s = fu->getUrl ();
|
|
int32_t slen = fu->getUrlLen();
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->m_plen <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
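// worked example of the siteterm hash above: for
// http://www.xyz.com/a/b/ the loop backs "end2" up over the last
// path component so we hash "www.xyz.com/a/", matching the
// www.xyz.com/a/b/ -> www.xyz.com/a/ mapping described in the
// comment before the "add" checks.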
|
|
|
|
//Dates *dp = getDates ();
|
|
// hash the clocks into indexdb
|
|
//if ( ! dp->hash ( m_docId , tt , this ) ) return false;
|
|
|
|
// . hash special site/hopcount thing for permalinks
|
|
// . used by Images.cpp for doing thumbnails
|
|
// . this returns false and sets g_errno on error
|
|
// . let's try thumbnails for all...
|
|
//if ( ! *getIsPermalink() ) return true;
|
|
|
|
setStatus ( "hashing no-split gbsitetemplate keys" );
|
|
|
|
// must be valid
|
|
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
|
|
char buf[MAX_URL_LEN+20];
|
|
//uint32_t th = m_tagVector.getVectorHash();
|
|
uint32_t tph = *getTagPairHash32();
|
|
// . skip this so we can do site:xyz.com queries
|
|
// . but if this is https:// then you will have to
|
|
// specify that...
|
|
char *site = getSite();
|
|
// sanity check, must NOT start with http://
|
|
if ( ! strncmp ( site , "http://", 7 ) ) { char *xx=NULL;*xx=0;}
|
|
// this must match what we search in Images.cpp::getThumbnail()
|
|
int32_t blen = sprintf(buf,"%" UINT32 "%s",tph,site);
|
|
|
|
// use the prefix as the description if description is NULL
|
|
hi.m_prefix = "gbsitetemplate";
|
|
//if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
if ( ! hashSingleTerm ( buf,blen,&hi ) ) return false;
|
|
|
|
|
|
setStatus ( "hashing no-split gbimage keys" );
|
|
|
|
hi.m_prefix = "gbimage";
|
|
// hash gbimage: for permalinks only for Images.cpp
|
|
for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) {
|
|
// get the node number
|
|
//int32_t nn = m_images.m_imageNodes[i];
|
|
// get the url of the image
|
|
//XmlNode *xn = m_xml.getNodePtr(nn);
|
|
int32_t srcLen;
|
|
char *src = m_images.getImageUrl(i,&srcLen);
|
|
// set it to the full url
|
|
Url iu;
|
|
// use "pageUrl" as the baseUrl
|
|
Url *cu = getCurrentUrl();
|
|
// we can addwww to normalize since this is for deduping kinda
|
|
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
|
|
char *u = iu.getUrl ();
|
|
int32_t ulen = iu.getUrlLen();
|
|
// hash each one
|
|
//if ( ! hashString ( u,ulen,&hi ) ) return false;
|
|
// hash a single entity
|
|
if ( ! hashSingleTerm ( u,ulen,&hi) ) return false;
|
|
//log("test: %s",u);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . "sr" is the tagdb Record
|
|
// . "ws" store the terms for PageParser.cpp display
|
|
char *XmlDoc::hashAll ( HashTableX *table ) {
|
|
|
|
setStatus ( "hashing document" );
|
|
|
|
if ( m_allHashed ) return (char *)1;
|
|
|
|
// sanity checks
|
|
if ( table->m_ks != 18 ) { char *xx=NULL;*xx=0; }
|
|
if ( table->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( m_wts && m_wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
// ptr to term = 4 + score = 4 + ptr to sec = 4
|
|
if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){char *xx=NULL;*xx=0;}
|
|
|
|
unsigned char *hc = (unsigned char *)getHopCount();
|
|
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
|
|
|
|
// need this for hashing
|
|
HashTableX *cnt = getCountTable();
|
|
if ( ! cnt ) return (char *)cnt;
|
|
if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// and this
|
|
//Weights *we = getWeights();
|
|
//if ( ! we || we == (void *)-1 ) return (char *)we;
|
|
// and this
|
|
Links *links = getLinks();
|
|
if ( ! links ) return (char *)links;
|
|
if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// and now this
|
|
//Synonyms *syn = getSynonyms();
|
|
//if ( ! syn || syn == (void *)-1 ) return (char *)syn;
|
|
|
|
char *wordSpamVec = getWordSpamVec();
|
|
if (!wordSpamVec) return (char *)wordSpamVec;
|
|
if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;}
|
|
|
|
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
|
|
if ( ! fragVec ) return (char *)fragVec;
|
|
if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// why do we need this?
|
|
if ( m_wts ) {
|
|
uint8_t *lv = getLangVector();
|
|
if ( ! lv ) return (char *)lv;
|
|
if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr ) return (char *)gr;
|
|
if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; }
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// just keep it somewhat sane...
|
|
//if ( nw > 30000 ) nw = 30000;
|
|
|
|
// then each singleton has one phrase, and 1 empty for good hashing
|
|
//if ( ! table->setTableSize ( nw * 4 ) )
|
|
// return log("build: Could not allocate %" INT32 " bytes for table "
|
|
// "for indexing document.",
|
|
// (nw*4)*(8+sizeof(int32_t)));
|
|
|
|
/*
|
|
const char *help =
|
|
"<table><td bgcolor=lightgreen>\n"
|
|
"Each document has several associated pieces. Each piece "
|
|
"is indexed individually. The pieces are listed below and "
|
|
"are preceeded with a table dictating the parameters with "
|
|
"which the piece was indexed."
|
|
|
|
"<br><br>"
|
|
|
|
"Below that table the actual text of the piece is displayed. "
|
|
"Each alphanumeric word in the text has two subscripts of the "
|
|
"form <i>X/Y</i> where X and Y are percentage weights on the "
|
|
"score of that particular alphanumeric word. X is the weight "
|
|
"on the word itself and Y is the weight on the phrase which "
|
|
"is started by that word. A weight of 100% "
|
|
"indicates a weight which does not affect the score."
|
|
|
|
"<br><br>"
|
|
|
|
"Words that are struck out and in a box with a red background "
|
|
"instead of light blue are considered to be spam, meaning "
|
|
"they are repeated in a pattern. They "
|
|
"contain a number in that box which indicates the probability "
|
|
"they are spam and 100 minus that probability is weighted "
|
|
"with their score to get a new, spam-adjusted score. "
|
|
"<br>\n"
|
|
"</tr>\n"
|
|
"</table>\n"
|
|
"</td></table>\n"
|
|
"<br><br>\n";
|
|
|
|
if ( m_pbuf ) m_pbuf->safePrintf("%s",help);
|
|
*/
|
|
|
|
/*
|
|
int32_t inlinks = *getSiteNumInlinks();
|
|
int32_t boost1 = getBoostFromSiteNumInlinks ( inlinks );
|
|
|
|
// . now we hard code "boost2"
|
|
// . based on # of alnum words
|
|
// . this makes us look at keyword density, not just the
|
|
// plain keyword count
|
|
int32_t naw = m_words.getNumAlnumWords();
|
|
// . keep at 100% for up to 200 words then reduce linearly
|
|
// . only do this for newer title recs to avoid undeletable data
|
|
// . if we have a huge document, it can still contain a very
|
|
// relevant paragraph that is dense in the query terms, so
|
|
// we really only want to punish enough so the post query
|
|
// reranking has some good candidates for doing proximity
|
|
// scoring.
|
|
// . back off by .90 every 1000 words
|
|
float nn = naw;
|
|
float bb = 100.0;
|
|
while ( nn > 1000 ) {
|
|
nn *= .9;
|
|
bb *= .9;
|
|
}
|
|
// never drop below %1
|
|
if ( bb < 1.0 ) bb = 1.0;
|
|
// set it
|
|
int64_t boost2 = (int64_t)bb;
|
|
*/
|
|
|
|
/*
|
|
int32_t siteNumInlinks = *getSiteNumInlinks();
|
|
|
|
if ( m_pbuf )
|
|
m_pbuf->safePrintf(
|
|
|
|
"<table border=1 cellpadding=2>"
|
|
|
|
"<tr><td>siteNumInlinks</td><td><b>%" INT32 "%%</b></td></tr>"
|
|
|
|
"<tr><td>siteNumInlinksBoost</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>numAlnumWords</td>"
|
|
"<td>%" INT32 "</td></tr> "
|
|
|
|
"<tr><td>scoreWeightFromNumAlnumWords"
|
|
"</td><td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>headerWeight</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>urlPathWeight</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>externalLinkTextWeight</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>internalLinkTextWeight</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>conceptWeight</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"<tr><td>titleWeight</td>"
|
|
"<td>%" INT32 "%%</td></tr>"
|
|
|
|
"</table>"
|
|
"<br>"
|
|
,
|
|
(int32_t)siteNumInlinks,
|
|
(int32_t)boost1,
|
|
//(int32_t)len,
|
|
(int32_t)naw,
|
|
(int32_t)boost2,
|
|
(int32_t)boost1,
|
|
(int32_t)boost2,
|
|
//(int32_t)boost1,
|
|
(int32_t)m_headerWeight,
|
|
(int32_t)m_urlPathWeight,
|
|
(int32_t)m_externalLinkTextWeight,
|
|
(int32_t)m_internalLinkTextWeight,
|
|
(int32_t)m_conceptWeight,
|
|
(int32_t)m_titleWeight,
|
|
(int32_t)m_titleWeight,
|
|
(int32_t)boost1,
|
|
(int32_t)boost1,
|
|
);
|
|
*/
|
|
|
|
// do not repeat this if the cachedb storage call blocks
|
|
m_allHashed = true;
|
|
|
|
// reset distance cursor
|
|
m_dist = 0;
|
|
|
|
// hash diffbot's json output here
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
/*
|
|
if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) {
|
|
// hash the content type for type:json query
|
|
if ( ! hashContentType ( table ) ) return NULL;
|
|
// and the url: query support
|
|
if ( ! hashUrl ( table ) ) return NULL;
|
|
// language support
|
|
if ( ! hashLanguage ( table ) ) return NULL;
|
|
// country?
|
|
if ( ! hashCountry ( table ) ) return NULL;
|
|
if ( ! hashTagRec ( table ) ) return NULL;
|
|
// hash for gbsortby:gbspiderdate
|
|
if ( ! hashDateNumbers ( table ) ) return NULL;
|
|
// has gbhasthumbnail:1 or 0
|
|
if ( ! hashImageStuff ( table ) ) return NULL;
|
|
// and the json itself
|
|
return hashJSON ( table );
|
|
}
|
|
*/
|
|
|
|
if ( ! hashContentType ( table ) ) return NULL;
|
|
if ( ! hashUrl ( table ) ) return NULL;
|
|
if ( ! hashLanguage ( table ) ) return NULL;
|
|
if ( ! hashCountry ( table ) ) return NULL;
|
|
if ( ! hashSiteNumInlinks( table ) ) return NULL;
|
|
if ( ! hashTagRec ( table ) ) return NULL;
|
|
if ( ! hashAds ( table ) ) return NULL;
|
|
if ( ! hashSubmitUrls ( table ) ) return NULL;
|
|
if ( ! hashIsAdult ( table ) ) return NULL;
|
|
|
|
// has gbhasthumbnail:1 or 0
|
|
if ( ! hashImageStuff ( table ) ) return NULL;
|
|
|
|
// . hash sectionhash:xxxx terms
|
|
// . diffbot still needs to hash this for voting info
|
|
if ( ! hashSections ( table ) ) return NULL;
|
|
|
|
// now hash the terms sharded by termid and not docid here since they
|
|
// just set a special bit in posdb key so Rebalance.cpp can work.
|
|
// this will hash the content checksum which we need for deduping
|
|
// which we use for diffbot custom crawls as well.
|
|
if ( ! hashNoSplit ( table ) ) return NULL;
|
|
|
|
|
|
// MDW: i think we just inject empty html with a diffbotreply into
|
|
// global index now, so don't need this... 9/28/2014
|
|
|
|
// stop indexing xml docs
|
|
bool indexDoc = true;
|
|
if ( cr->m_isCustomCrawl ) indexDoc = false;
|
|
if ( ! cr->m_indexBody ) indexDoc = false;
|
|
// if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject )
|
|
// indexDoc = true;
|
|
// always index diffbot json objects for GI (custom crawl is false)
|
|
if ( m_isDiffbotJSONObject )
|
|
indexDoc = true;
|
|
|
|
// global index unless this is a json object in which case it is
|
|
// hashed above in the call to hashJSON(). this will decrease disk
|
|
// usage by about half, posdb* files are pretty big.
|
|
if ( ! indexDoc ) return (char *)1;
|
|
|
|
// hash json fields
|
|
if ( *ct == CT_JSON ) {
|
|
// this hashes both with and without the fieldname
|
|
hashJSONFields ( table );
|
|
goto skip;
|
|
}
|
|
|
|
// same for xml now, so we can search for field:value like w/ json
|
|
if ( *ct == CT_XML ) {
|
|
// this hashes both with and without the fieldname
|
|
hashXMLFields ( table );
|
|
goto skip;
|
|
}
|
|
|
|
// hash the body of the doc first so m_dist is 0 to match
|
|
// the rainbow display of sections
|
|
if ( ! hashBody2 (table ) ) return NULL;
|
|
|
|
// hash the title now too so neighborhood singles have more
|
|
// to match. plus, we only hash these title terms iff they
|
|
// are not already in the hash table, so as to avoid hashing
|
|
// repeated title terms because we do not do spam detection
|
|
// on them. thus, we need to hash these first before anything
|
|
// else. give them triple the body score
|
|
if ( ! hashTitle ( table )) return NULL;
|
|
|
|
// . hash the keywords tag, limited to first 2k of them so far
|
|
// . hash above the neighborhoods so the neighborhoods only index
|
|
// what is already in the hash table
|
|
if ( ! hashMetaKeywords(table ) ) return NULL;
|
|
|
|
// then hash the incoming link text, NO ANOMALIES, because
|
|
// we index the single words in the neighborhoods next, and
|
|
// we had songfacts.com coming up for the 'street light facts'
|
|
// query because it had a bunch of anomalous inlink text.
|
|
if ( ! hashIncomingLinkText(table,false,true)) return NULL;
|
|
|
|
// then the meta summary and description tags with half the score of
|
|
// the body, and only hash a term if was not already hashed above
|
|
// somewhere.
|
|
if ( ! hashMetaSummary(table) ) return NULL;
|
|
|
|
skip:
|
|
|
|
// this will only increment the scores of terms already in the table
|
|
// because the neighborhoods are not technically in the document
|
|
// necessarily and we do not want to ruin our precision
|
|
if ( ! hashNeighborhoods ( table ) ) return NULL;
|
|
|
|
|
|
if ( ! hashLinks ( table ) ) return NULL;
|
|
if ( ! hashDateNumbers ( table ) ) return NULL;
|
|
if ( ! hashMetaTags ( table ) ) return NULL;
|
|
if ( ! hashMetaZip ( table ) ) return NULL;
|
|
if ( ! hashDMOZCategories( table ) ) return NULL;
|
|
if ( ! hashCharset ( table ) ) return NULL;
|
|
if ( ! hashRSSInfo ( table ) ) return NULL;
|
|
if ( ! hashPermalink ( table ) ) return NULL;
|
|
|
|
// hash gblang:de last for parsing consistency
|
|
if ( ! hashLanguageString ( table ) ) return NULL;
|
|
|
|
// we set this now in hashWords3()
|
|
if ( m_doingSEO )
|
|
m_wordPosInfoBufValid = true;
|
|
|
|
// store the m_wordPosInfoBuf into cachedb
|
|
// NO! we are not allowed to block in here it messes shit up!!!
|
|
//if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
|
|
// return (char *)-1;
|
|
|
|
// . hash gbkeyword:gbmininlinks where the score is the inlink count
|
|
// . the inlink count can go from 1 to 255
|
|
// . an ip neighborhood can vote no more than once
|
|
// . this is in LinkInfo::hash
|
|
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
|
|
|
|
if ( ! hashMetaData ( table ) ) return NULL;
|
|
|
|
// return true if we don't need to print parser info
|
|
//if ( ! m_pbuf ) return true;
|
|
// print out the table into g_bufPtr now if we need to
|
|
//table->print ( );
|
|
return (char *)1;
|
|
}
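// Rough order of operations in hashAll() above, summarized from the inline
// comments (json and xml docs hash their fields via hashJSONFields() /
// hashXMLFields() instead of steps 1-5):
//   1. body                 - first so m_dist starts at 0, matching the
//                             rainbow section display
//   2. title                - hashed early, at triple the body score
//   3. meta keywords        - hashed before the neighborhoods
//   4. incoming link text   - no anomalies, to avoid off-topic boosts
//   5. meta summary/description
//   6. neighborhoods        - only bumps terms already in the table
//   7. links, date numbers, meta tags, zip, DMOZ, charset, RSS, permalink,
//      language string, meta data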
|
|
|
|
// . "inlinks" is # of inlinks to the SITE
|
|
// . returns a percentage boost
|
|
int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) {
|
|
// . base on # of site inlinks
|
|
// . just hard code this for now
|
|
int32_t boost1 = 100;
|
|
if ( inlinks >= 10 ) boost1 = 150;
|
|
if ( inlinks >= 50 ) boost1 = 200;
|
|
if ( inlinks >= 100 ) boost1 = 250;
|
|
if ( inlinks >= 200 ) boost1 = 300;
|
|
if ( inlinks >= 400 ) boost1 = 350;
|
|
if ( inlinks >= 800 ) boost1 = 400;
|
|
if ( inlinks >= 1600 ) boost1 = 450;
|
|
if ( inlinks >= 3200 ) boost1 = 500;
|
|
if ( inlinks >= 6400 ) boost1 = 550;
|
|
if ( inlinks >= 12800 ) boost1 = 600;
|
|
if ( inlinks >= 25600 ) boost1 = 650;
|
|
if ( inlinks >= 51200 ) boost1 = 700;
|
|
return boost1;
|
|
}
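// Worked example: a site with 250 inlinks clears the ">= 200" tier but not
// ">= 400", so getBoostFromSiteNumInlinks(250) returns 300, i.e. a 300% boost.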
|
|
|
|
bool XmlDoc::appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) {
|
|
|
|
// set4() called from the inject sets these two things for meta data
|
|
// which is basically json that augments the doc, tags it with stuff
|
|
if ( ! m_hasMetadata ) return true;
|
|
if ( ! ptr_metadata ) return true;
|
|
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod ) { char *xx=NULL;*xx=0; }
|
|
if ( pod == (XmlDoc **)-1 ) { char *xx=NULL;*xx=0; }
|
|
// this is non-NULL if it existed
|
|
XmlDoc *od = *pod;
|
|
|
|
// wtf?
|
|
if ( ! od ) return true;
|
|
|
|
|
|
// dedup. if already in there, do not re-add it
|
|
if ( od->ptr_metadata && strstr ( od->ptr_metadata , ptr_metadata ) )
|
|
return true;
|
|
|
|
SafeBuf md;
|
|
|
|
// copy over and append
|
|
if ( ! md.safeMemcpy ( od->ptr_metadata , od->size_metadata ) )
|
|
return false;
|
|
// remove trailing \0 if there
|
|
md.removeLastChar ( '\0' );
|
|
// separate from the new stuff
|
|
if ( ! md.safePrintf(",\n") )
|
|
return false;
|
|
|
|
if ( ! md.safeMemcpy ( ptr_metadata , size_metadata ) )
|
|
return false;
|
|
|
|
if ( ! md.nullTerm ( ) )
|
|
return false;
|
|
// update his meta data
|
|
od->ptr_metadata = md.getBufStart();
|
|
od->size_metadata = md.length();
|
|
|
|
int32_t nw = od->size_metadata * 4;
|
|
|
|
HashTableX tt1;
|
|
int32_t need4 = nw * 4 + 5000;
|
|
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,"posdb-i2"))
|
|
return false;
|
|
|
|
od->hashMetaData ( &tt1 );
|
|
|
|
// store the posdb keys from tt1 into our safebuf, tmp
|
|
SafeBuf sb;
|
|
if ( m_usePosdb && ! addTable144 ( &tt1 , od->m_docId , &sb ) )
|
|
return false;
|
|
|
|
// this could use time axis so that is taken into account
|
|
int64_t uh48 = getFirstUrlHash48();
|
|
|
|
// and re-formulate (and compress) his new title rec
|
|
SafeBuf trec;
|
|
if ( ! od->setTitleRecBuf ( &trec , od->m_docId , uh48 ) )
|
|
return false;
|
|
|
|
// force the title rec key to be the same
|
|
// if ( od->m_titleRecKeyValid && trec.getLength() >= sizeof(key_t) ) {
|
|
// char *p = trec.getBufStart();
|
|
// *(key_t *)p = od->m_titleRecKey;
|
|
// }
|
|
// else {
|
|
// log("build: old titlerec invalid docid=%"INT64,od->m_docId);
|
|
// }
|
|
|
|
// store the posdb keys in the meta list
|
|
if ( m_usePosdb && ! metaList->safeMemcpy ( &sb ) )
|
|
return false;
|
|
|
|
// store the updated titlerec into the meta list
|
|
if ( m_useTitledb && ! metaList->pushChar(RDB_TITLEDB) )
|
|
return false;
|
|
if ( m_useTitledb && ! metaList->safeMemcpy(&trec) )
|
|
return false;
|
|
|
|
m_updatedMetaData = true;
|
|
|
|
return true;
|
|
}
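// Illustrative sketch of the metadata merge in appendNewMetaInfo() above,
// assuming both blobs are the json-ish strings that set4() stores
// (the contents here are made up):
//
//   old ptr_metadata : {"tag":"old stuff"}\0
//   new ptr_metadata : {"tag":"newly injected stuff"}\0
//   merged           : {"tag":"old stuff"},\n{"tag":"newly injected stuff"}\0
//
// the old trailing \0 is stripped, a ",\n" separator is added, the new blob
// is appended and the whole thing is re-NUL-terminated before rehashing.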
|
|
|
|
// . this is kinda hacky because it uses a short XmlDoc on the stack
|
|
// . no need to hash this stuff for regular documents since all the terms
|
|
// are fielded by gberrorstr, gberrornum or gbisreply.
|
|
// . normally we might use a separate xmldoc class for this but i wanted
|
|
// something more lightweight
|
|
SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
|
|
bool forDelete ) {
|
|
|
|
// set status for this
|
|
setStatus ( "getting spider reply meta list");
|
|
|
|
if ( m_spiderStatusDocMetaListValid )
|
|
return &m_spiderStatusDocMetaList;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
if ( ! cr->m_indexSpiderReplies || forDelete ) {
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// if docid based do not hash a spider reply. docid-based spider
|
|
// requests are added to spiderdb from the query reindex tool.
|
|
// do not do for diffbot subdocuments either, usespiderdb should be
|
|
// false for those.
|
|
// MDW: i disagree, i want to see when these get updated! 9/6/2014
|
|
// ok, let's index for diffbot objects so we can see if they are
|
|
// a dup of another diffbot object, or so we can see when they get
|
|
// revisited, etc.
|
|
//if ( m_setFromDocId || ! m_useSpiderdb ) {
|
|
if ( ! m_useSpiderdb && ! m_isDiffbotJSONObject ) {
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// do not add a status doc if doing a query delete on a status doc
|
|
if ( m_contentTypeValid && m_contentType == CT_STATUS ) {
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// doing it for diffbot throws off smoketests
|
|
// ok, smoketests are updated now, so remove this
|
|
// if ( strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
|
|
// m_spiderStatusDocMetaListValid = true;
|
|
// return &m_spiderStatusDocMetaList;
|
|
// }
|
|
|
|
// we double add regular html urls in a query reindex because the
|
|
// json url adds the parent, so the parent gets added twice sometimes,
|
|
// and for some reason it is adding a spider status doc the 2nd time
|
|
// so cut that out. this is kinda a hack b/c i'm not sure what's
|
|
// going on. but you can set a break point here and see what's up if
|
|
// you want.
|
|
// MDW: likewise, take this out, i want these recorded as well..
|
|
// if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
|
|
// m_spiderStatusDocMetaListValid = true;
|
|
// return &m_spiderStatusDocMetaList;
|
|
// }
|
|
|
|
// . fake this out so we do not core
|
|
// . hashWords3() uses it i guess
|
|
bool forcedLangId = false;
|
|
if ( ! m_langIdValid ) {
|
|
forcedLangId = true;
|
|
m_langIdValid = true;
|
|
m_langId = langUnknown;
|
|
}
|
|
|
|
// prevent more cores
|
|
bool forcedSiteNumInlinks = false;
|
|
if ( ! m_siteNumInlinksValid ) {
|
|
forcedSiteNumInlinks = true;
|
|
m_siteNumInlinks = 0;
|
|
m_siteNumInlinksValid = true;
|
|
}
|
|
|
|
SafeBuf *mbuf = getSpiderStatusDocMetaList2 ( reply );
|
|
|
|
if ( forcedLangId )
|
|
m_langIdValid = false;
|
|
|
|
if ( forcedSiteNumInlinks ) {
|
|
m_siteNumInlinksValid = false;
|
|
}
|
|
|
|
return mbuf;
|
|
}
|
|
|
|
// . the spider status doc
|
|
// . TODO:
|
|
// usedProxy:1
|
|
// proxyIp:1.2.3.4
|
|
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
|
|
|
|
setStatus ( "making spider reply meta list");
|
|
|
|
// . we also need a unique docid for indexing the spider *reply*
|
|
// as a separate document
|
|
// . use the same url, but use a different docid.
|
|
// . use now to mix it up
|
|
//int32_t now = getTimeGlobal();
|
|
//int64_t h = hash64(m_docId, now );
|
|
// to keep qa test consistent this docid should be consistent
|
|
// so base it on spidertime of parent doc.
|
|
// if doc is being force deleted then this is invalid!
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int64_t h = hash64(m_docId, m_spideredTime );
|
|
// mask it out
|
|
int64_t d = h & DOCID_MASK;
|
|
// try to get an available docid, preferring "d" if available
|
|
int64_t *uqd = getAvailDocIdOnly ( d );
|
|
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
|
|
|
|
m_addedStatusDocId = *uqd;
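// worked example of the docid choice above (numbers are made up):
// with m_docId 12345 and m_spideredTime 1400000000 we take
// h = hash64(12345, 1400000000), mask it with DOCID_MASK to get a
// preferred docid d, then getAvailDocIdOnly(d) hands back d itself
// if it is free, else some other available docid. basing d on the
// parent's spider time keeps the qa test docids consistent.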
|
|
|
|
// unsigned char *hc = (unsigned char *)getHopCount();
|
|
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
|
|
|
|
int32_t tmpVal = -1;
|
|
int32_t *priority = &tmpVal;
|
|
int32_t *ufn = &tmpVal;
|
|
|
|
// prevent a core if sreq is not valid, these will freak out
|
|
// diffbot replies may not have a valid m_sreq
|
|
if ( m_sreqValid ) {
|
|
priority = getSpiderPriority();
|
|
if ( ! priority || priority == (void *)-1 )
|
|
return (SafeBuf *)priority;
|
|
|
|
ufn = getUrlFilterNum();
|
|
if ( ! ufn || ufn == (void *)-1 )
|
|
return (SafeBuf *)ufn;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
Json *jp1 = NULL;
|
|
// i've seen ptr_utf8Content NULL and content type as html for
|
|
// some reason when deleting a diffbot object doc so check for that
|
|
// here and forget it. we don't want getParsedJson() to core.
|
|
if ( m_isDiffbotJSONObject &&
|
|
m_contentType == CT_JSON &&
|
|
m_contentTypeValid ) {
|
|
jp1 = getParsedJson();
|
|
if ( ! jp1 || jp1 == (void *)-1) return (SafeBuf *)jp1;
|
|
}
|
|
|
|
// sanity
|
|
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// why isn't gbhopcount: being indexed consistently?
|
|
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// reset just in case
|
|
m_spiderStatusDocMetaList.reset();
|
|
|
|
// sanity
|
|
if ( *uqd <= 0 || *uqd > MAX_DOCID ) {
|
|
log("xmldoc: avail docid = %" INT64 ". could not index spider "
|
|
"reply or %s",*uqd,m_firstUrl.m_url);
|
|
//char *xx=NULL;*xx=0; }
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// the old doc
|
|
XmlDoc *od = NULL;
|
|
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;
|
|
|
|
Url *fu = &m_firstUrl;
|
|
|
|
// . make a little json doc that we'll hash up
|
|
// . only index the fields in this doc, no extra gbdocid: inurl:
|
|
// hash terms
|
|
SafeBuf jd;
|
|
jd.safePrintf("{\n");
|
|
|
|
// so type:status query works
|
|
jd.safePrintf("\"type\":\"status\",\n");
|
|
|
|
jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );
|
|
|
|
if ( ptr_redirUrl )
|
|
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
|
|
ptr_redirUrl);
|
|
|
|
if ( m_indexCodeValid ) {
|
|
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
|
|
jd.safePrintf("\"gbssStatusMsg\":\"");
|
|
jd.jsonEncode (mstrerror(m_indexCode));
|
|
jd.safePrintf("\",\n");
|
|
}
|
|
else {
|
|
jd.safePrintf("\"gbssStatusCode\":-1,\n");
|
|
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
|
|
}
|
|
|
|
|
|
if ( m_httpStatusValid )
|
|
jd.safePrintf("\"gbssHttpStatus\":%" INT32 ",\n",
|
|
(int32_t)m_httpStatus);
|
|
|
|
// do not index gbssIsSeedUrl:0 because there will be too many usually
|
|
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
|
if ( isSeed )
|
|
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
|
|
|
|
if ( od )
|
|
jd.safePrintf("\"gbssWasIndexed\":1,\n");
|
|
else
|
|
jd.safePrintf("\"gbssWasIndexed\":0,\n");
|
|
|
|
int32_t now = getTimeGlobal();
|
|
if ( od )
|
|
jd.safePrintf("\"gbssAgeInIndex\":"
|
|
"%" UINT32 ",\n",now - od->m_spideredTime);
|
|
|
|
if ( m_isDiffbotJSONObject ) { // && cr->m_isCustomCrawl
|
|
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
|
|
JsonItem *jsonItem = NULL;
|
|
if ( jp1 ) jsonItem = jp1->getItem("diffbotUri");
|
|
if ( jsonItem ) {
|
|
jd.safePrintf("\"gbssDiffbotUri\":\"");
|
|
int32_t vlen;
|
|
char *val = jsonItem->getValueAsString( &vlen );
|
|
if ( val ) jd.safeMemcpy ( val , vlen );
|
|
jd.safePrintf("\",\n");
|
|
}
|
|
else
|
|
jd.safePrintf("\"gbssDiffbotUri\":"
|
|
"\"none\",\n");
|
|
// show the type as gbssDiffbotType:"article" etc.
|
|
JsonItem *dti = NULL;
|
|
if ( jp1 )
|
|
dti = jp1->getItem("type");
|
|
if ( dti ) {
|
|
jd.safePrintf("\"gbssDiffbotType\":\"");
|
|
int32_t vlen;
|
|
char *val = dti->getValueAsString( &vlen );
|
|
if ( val ) jd.jsonEncode ( val , vlen );
|
|
jd.safePrintf("\",\n");
|
|
}
|
|
|
|
}
|
|
else { // if ( cr->m_isCustomCrawl ) {
|
|
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
|
|
}
|
|
|
|
jd.safePrintf("\"gbssDomain\":\"");
|
|
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
|
|
jd.safePrintf("\",\n");
|
|
|
|
jd.safePrintf("\"gbssSubdomain\":\"");
|
|
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
|
|
jd.safePrintf("\",\n");
|
|
|
|
//if ( m_redirUrlPtr && m_redirUrlValid )
|
|
//if ( m_numRedirectsValid )
|
|
jd.safePrintf("\"gbssNumRedirects\":%" INT32 ",\n",m_numRedirects);
|
|
|
|
if ( m_docIdValid )
|
|
jd.safePrintf("\"gbssDocId\":%" INT64 ",\n", m_docId);//*uqd);
|
|
|
|
if ( m_parentDocPtr && m_isChildDoc && m_parentDocPtr->m_docIdValid )
|
|
jd.safePrintf("\"gbssParentDocId\":%" INT64 ",\n",
|
|
m_parentDocPtr->m_docId);
|
|
|
|
if ( m_hopCountValid )
|
|
//jd.safePrintf("\"gbssHopCount\":%" INT32 ",\n",(int32_t)*hc);
|
|
jd.safePrintf("\"gbssHopCount\":%" INT32 ",\n",(int32_t)m_hopCount);
|
|
|
|
// crawlbot round
|
|
if ( cr->m_isCustomCrawl )
|
|
jd.safePrintf("\"gbssCrawlRound\":%" INT32 ",\n",
|
|
cr->m_spiderRoundNum);
|
|
|
|
// for -diffbotxyz fake docs addedtime is 0
|
|
if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
|
|
// in Spider.cpp we try to set m_sreq's m_addedTime to the
|
|
// min of all the spider requests, and we try to ensure
|
|
// that in the case of deduping we preserve the one with
|
|
// the oldest time. no, now we actually use
|
|
// m_discoveryTime since we were using m_addedTime in
|
|
// the url filters as it was originally intended.
|
|
jd.safePrintf("\"gbssDiscoveredTime\":%" INT32 ",\n",
|
|
m_sreq.m_discoveryTime);
|
|
}
|
|
|
|
if ( m_isDupValid && m_isDup )
|
|
jd.safePrintf("\"gbssDupOfDocId\":%" INT64 ",\n",
|
|
m_docIdWeAreADupOf);
|
|
|
|
// how many spiderings were successful vs. failed
|
|
// these don't work because we only store one reply
|
|
// which overwrites any older reply. that's how the
|
|
// key is. we can change the key to use the timestamp
|
|
// and not parent docid in makeKey() for spider
|
|
// replies later.
|
|
// if ( m_sreqValid ) {
|
|
// jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%" INT32 ",\n",
|
|
// m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
|
|
// jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%" INT32 ",\n",
|
|
// m_sreq.m_reservedc1);
|
|
// jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%" INT32 ",\n",
|
|
// m_sreq.m_reservedc2);
|
|
// }
|
|
|
|
if ( m_spideredTimeValid )
|
|
jd.safePrintf("\"gbssSpiderTime\":%" INT32 ",\n",
|
|
m_spideredTime);
|
|
else
|
|
jd.safePrintf("\"gbssSpiderTime\":%" INT32 ",\n",0);
|
|
|
|
if ( m_firstIndexedDateValid )
|
|
jd.safePrintf("\"gbssFirstIndexed\":%" UINT32 ",\n",
|
|
m_firstIndexedDate);
|
|
|
|
if ( m_contentHash32Valid )
|
|
jd.safePrintf("\"gbssContentHash32\":%" UINT32 ",\n",
|
|
m_contentHash32);
|
|
|
|
// so we know what hostid spidered the url. this is not the
|
|
// same hostid that will store it necessarily
|
|
jd.safePrintf("\"gbssSpideredByHostId\":%" INT32 ",\n",
|
|
(int32_t)g_hostdb.getMyHostId());
|
|
|
|
// which shard will store the titlerec and index terms? it
|
|
// is based on docid.
|
|
if ( m_docIdValid ) {
|
|
int32_t shardNum = getShardNumFromDocId ( m_docId );
|
|
jd.safePrintf("\"gbssStoredOnShard\":%" INT32 ",\n",shardNum);
|
|
}
|
|
|
|
if ( m_downloadStartTimeValid && m_downloadEndTimeValid ) {
|
|
jd.safePrintf("\"gbssDownloadStartTimeMS\":%" INT64 ",\n",
|
|
m_downloadStartTime);
|
|
jd.safePrintf("\"gbssDownloadEndTimeMS\":%" INT64 ",\n",
|
|
m_downloadEndTime);
|
|
|
|
int64_t took = m_downloadEndTime - m_downloadStartTime;
|
|
jd.safePrintf("\"gbssDownloadDurationMS\":%" INT64 ",\n",took);
|
|
|
|
jd.safePrintf("\"gbssDownloadStartTime\":%" UINT32 ",\n",
|
|
(uint32_t)(m_downloadStartTime/1000));
|
|
|
|
jd.safePrintf("\"gbssDownloadEndTime\":%" UINT32 ",\n",
|
|
(uint32_t)(m_downloadEndTime/1000));
|
|
}
|
|
|
|
|
|
jd.safePrintf("\"gbssUsedRobotsTxt\":%" INT32 ",\n",
|
|
m_useRobotsTxt);
|
|
|
|
if ( m_linksValid )
|
|
jd.safePrintf("\"gbssNumOutlinksOnPage\":%" INT32 ",\n",
|
|
(int32_t)m_links.getNumLinks());
|
|
|
|
//if ( m_numOutlinksAddedValid )
|
|
// crap, this is not right because we only call addOutlinksToMetaList()
|
|
// after we call this function.
|
|
// jd.safePrintf("\"gbssNumOutlinksAdded\":%" INT32 ",\n",
|
|
// (int32_t)m_numOutlinksAdded);
|
|
|
|
// how many download/indexing errors we've had, including this one
|
|
// if applicable.
|
|
if ( m_srepValid )
|
|
jd.safePrintf("\"gbssConsecutiveErrors\":%" INT32 ",\n",
|
|
m_srep.m_errCount);
|
|
else
|
|
jd.safePrintf("\"gbssConsecutiveErrors\":%" INT32 ",\n",0);
|
|
|
|
|
|
if ( m_ipValid )
|
|
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
|
|
else
|
|
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");
|
|
|
|
if ( m_ipEndTime ) {
|
|
int64_t took = m_ipEndTime - m_ipStartTime;
|
|
jd.safePrintf("\"gbssIpLookupTimeMS\":%" INT64 ",\n",took);
|
|
}
|
|
|
|
if ( m_siteNumInlinksValid ) {
|
|
jd.safePrintf("\"gbssSiteNumInlinks\":%" INT32 ",\n",
|
|
(int32_t)m_siteNumInlinks);
|
|
char siteRank = getSiteRank();
|
|
jd.safePrintf("\"gbssSiteRank\":%" INT32 ",\n",
|
|
(int32_t)siteRank);
|
|
}
|
|
|
|
jd.safePrintf("\"gbssContentInjected\":%" INT32 ",\n",
|
|
(int32_t)m_contentInjected);
|
|
|
|
if ( m_percentChangedValid && od )
|
|
jd.safePrintf("\"gbssPercentContentChanged\""
|
|
":%.01f,\n",
|
|
m_percentChanged);
|
|
|
|
if ( ! m_isDiffbotJSONObject )
|
|
jd.safePrintf("\"gbssSpiderPriority\":%" INT32 ",\n",
|
|
*priority);
|
|
|
|
// this could be -1, careful
|
|
if ( *ufn >= 0 && ! m_isDiffbotJSONObject )
|
|
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
|
|
cr->m_regExs[*ufn].getBufStart());
|
|
|
|
// we forced the langid valid above
|
|
if ( m_langIdValid && m_contentLen )
|
|
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
|
|
getLangAbbr(m_langId));
|
|
|
|
if ( m_contentTypeValid && m_contentLen )
|
|
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
|
|
g_contentTypeStrings[m_contentType]);
|
|
|
|
if ( m_contentValid )
|
|
jd.safePrintf("\"gbssContentLen\":%" INT32 ",\n",
|
|
m_contentLen);
|
|
|
|
if ( m_isContentTruncatedValid )
|
|
jd.safePrintf("\"gbssIsContentTruncated\":%" INT32 ",\n",
|
|
(int32_t)m_isContentTruncated);
|
|
|
|
|
|
// do not show the -1 any more, just leave it out then
|
|
// to make things look prettier
|
|
if ( m_crawlDelayValid && m_crawlDelay >= 0 &&
|
|
! m_isDiffbotJSONObject )
|
|
// -1 if none?
|
|
jd.safePrintf("\"gbssCrawlDelayMS\":%" INT32 ",\n",
|
|
(int32_t)m_crawlDelay);
|
|
|
|
// was this url ever sent to diffbot either now or at a previous
|
|
// spider time?
|
|
if ( ! m_isDiffbotJSONObject ) {
|
|
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
|
|
(int)m_sentToDiffbot);
|
|
|
|
// sent to diffbot?
|
|
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
|
|
(int)m_sentToDiffbotThisTime);
|
|
}
|
|
|
|
// page must have been downloaded for this one
|
|
if ( cr->m_isCustomCrawl &&
|
|
m_utf8ContentValid &&
|
|
! m_isDiffbotJSONObject &&
|
|
m_content &&
|
|
m_contentValid &&
|
|
cr->m_diffbotPageProcessPattern.getBufStart() &&
|
|
cr->m_diffbotPageProcessPattern.getBufStart()[0] ) {
|
|
char match = doesPageContentMatchDiffbotProcessPattern();
|
|
jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n",
|
|
(int)match);
|
|
}
|
|
if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){
|
|
|
|
char *url = getFirstUrl()->getUrl();
|
|
|
|
// the crawl regex
|
|
int match = 1;
|
|
regex_t *ucr = &cr->m_ucr;
|
|
if ( ! cr->m_hasucr ) ucr = NULL;
|
|
if ( ucr && regexec(ucr,url,0,NULL,0) ) match = 0;
|
|
if ( ucr )
|
|
jd.safePrintf("\"gbssMatchesUrlCrawlRegEx\":%i,\n",
|
|
match);
|
|
|
|
// now the substring pattern
|
|
match = 1;
|
|
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
|
|
if ( ucp && ! ucp[0] ) ucp = NULL;
|
|
if ( ucp && ! doesStringContainPattern(url,ucp) ) match = 0;
|
|
if ( ucp )
|
|
jd.safePrintf("\"gbssMatchesUrlCrawlPattern\":%i,\n",
|
|
match);
|
|
|
|
// now process regex
|
|
match = 1;
|
|
regex_t *upr = &cr->m_upr;
|
|
if ( ! cr->m_hasupr ) upr = NULL;
|
|
if ( upr && regexec(upr,url,0,NULL,0) ) match = 0;
|
|
if ( upr )
|
|
jd.safePrintf("\"gbssMatchesUrlCrawlRegEx\":%i,\n",
|
|
match);
|
|
|
|
// now process pattern
|
|
match = 1;
|
|
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
|
|
if ( upp && ! upp[0] ) upp = NULL;
|
|
if ( upp && ! doesStringContainPattern(url,upp) ) match = 0;
|
|
if ( upp )
|
|
jd.safePrintf("\"gbssMatchesUrlProcessPattern\":%i,\n",
|
|
match);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime &&
|
|
! m_isDiffbotJSONObject ) {
|
|
jd.safePrintf("\"gbssDiffbotReplyCode\":%" INT32 ",\n",
|
|
m_diffbotReplyError);
|
|
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
|
|
jd.jsonEncode(mstrerror(m_diffbotReplyError));
|
|
jd.safePrintf("\",\n");
|
|
jd.safePrintf("\"gbssDiffbotReplyLen\":%" INT32 ",\n",
|
|
m_diffbotReply.length());
|
|
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
|
|
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%" INT64 ",\n",
|
|
took );
|
|
jd.safePrintf("\"gbssDiffbotReplyRetries\":%" INT32 ",\n",
|
|
m_diffbotReplyRetries );
|
|
		// this is not correct at this point; we haven't parsed the json yet
|
|
// jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%" INT32 ",\n",
|
|
// m_diffbotJSONCount);
|
|
}
|
|
|
|
// remove last ,\n
|
|
jd.incrementLength(-2);
|
|
// end the json spider status doc
|
|
jd.safePrintf("\n}\n");
|
|
|
|
// BEFORE ANY HASHING
|
|
int32_t savedDist = m_dist;
|
|
|
|
// add the index list for it. it returns false and sets g_errno on err
|
|
// otherwise it sets m_spiderStatusDocMetaList
|
|
if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
|
|
return NULL;
|
|
|
|
// now make the titlerec
|
|
char xdhead[2048];
|
|
// just the head of it. this is the hacky part.
|
|
XmlDoc *xd = (XmlDoc *)xdhead;
|
|
// clear it out
|
|
memset ( xdhead, 0 , 2048);
|
|
|
|
// copy stuff from THIS so the spider reply "document" has the same
|
|
// header info stuff
|
|
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
|
|
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
|
|
gbmemcpy ( xdhead , (char *)this , hsize );
|
|
|
|
// override spider time in case we had error to be consistent
|
|
// with the actual SpiderReply record
|
|
//xd->m_spideredTime = reply->m_spideredTime;
|
|
//xd->m_spideredTimeValid = true;
|
|
// sanity
|
|
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
|
|
|
|
// this will cause the maroon box next to the search result to
|
|
// say "STATUS" similar to "PDF" "DOC" etc.
|
|
xd->m_contentType = CT_STATUS;
|
|
|
|
int32_t fullsize = &m_dummyEnd - (char *)this;
|
|
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
// the ptr_* were all zero'd out, put the ones we want to keep back in
|
|
SafeBuf tmp;
|
|
// was "Spider Status: %s" but that is unnecessary
|
|
tmp.safePrintf("<title>%s</title>",
|
|
mstrerror(m_indexCode));
|
|
|
|
// if we are a dup...
|
|
if ( m_indexCode == EDOCDUP )
|
|
tmp.safePrintf("Dup of docid %" INT64 "<br>", m_docIdWeAreADupOf );
|
|
|
|
if ( m_redirUrlPtr && m_redirUrlValid )
|
|
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
|
|
*/
|
|
|
|
// put stats like we log out from logIt
|
|
//tmp.safePrintf("<div style=max-width:800px;>\n");
|
|
// store log output into doc
|
|
//logIt(&tmp);
|
|
//tmp.safePrintf("\n</div>");
|
|
|
|
// the content is just the title tag above
|
|
// xd->ptr_utf8Content = tmp.getBufStart();
|
|
// xd->size_utf8Content = tmp.length()+1;
|
|
xd->ptr_utf8Content = jd.getBufStart();
|
|
xd->size_utf8Content = jd.length()+1;
|
|
|
|
// keep the same url as the doc we are the spider reply for
|
|
xd->ptr_firstUrl = ptr_firstUrl;
|
|
xd->size_firstUrl = size_firstUrl;
|
|
|
|
// serps need site, otherwise search results core
|
|
xd->ptr_site = ptr_site;
|
|
xd->size_site = size_site;
|
|
|
|
// if this is null then ip lookup failed i guess so just use
|
|
// the subdomain
|
|
if ( ! ptr_site && m_firstUrlValid ) {
|
|
xd->ptr_site = m_firstUrl.getHost();
|
|
xd->size_site = m_firstUrl.getHostLen();
|
|
}
|
|
|
|
// use the same uh48 of our parent
|
|
int64_t uh48 = m_firstUrl.getUrlHash48();
|
|
// then make into a titlerec but store in metalistbuf, not m_titleRec
|
|
SafeBuf titleRecBuf;
|
|
// this should not include ptrs that are NULL when compressing
|
|
// using its m_internalFlags1
|
|
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
|
|
return NULL;
|
|
|
|
// concat titleRec to our posdb key records
|
|
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
|
|
return NULL;
|
|
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
|
|
return NULL;
|
|
|
|
// return the right val
|
|
m_dist = savedDist;
|
|
|
|
// ok, good to go, ready to add to posdb and titledb
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
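
// Rough layout of m_spiderStatusDocMetaList once the function above
// returns (record framing follows the rdbid-byte convention used for
// meta lists throughout this file; this is a sketch, not exact sizes):
//
//   [RDB_POSDB byte][18-byte posdb key]     ... one per gbss* term hashed
//   [RDB_TITLEDB byte][compressed title rec for the status doc]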
|
|
|
|
|
|
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
|
|
|
|
// the posdb table
|
|
HashTableX tt4;
|
|
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
|
|
return false;
|
|
|
|
|
|
Json jp2;
|
|
if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
|
|
g_errno = EBADJSONPARSER;
|
|
return false;
|
|
}
|
|
|
|
// re-set to 0
|
|
m_dist = 0;
|
|
|
|
// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = &tt4;
|
|
hi.m_desc = "json spider status object";
|
|
hi.m_useCountTable = false;
|
|
hi.m_useSections = false;
|
|
|
|
// fill up tt4. false -> do not hash without field prefixes.
|
|
hashJSONFields2 ( &tt4 , &hi , &jp2 , false );
|
|
|
|
|
|
/*
|
|
char buf[64];
|
|
int32_t bufLen;
|
|
|
|
// hash 'type:status' similar to 'type:json' etc.
|
|
hi.m_prefix = "type";
|
|
if ( ! hashString("status" , &hi ) ) return NULL;
|
|
|
|
// . hash gbstatus:0 for no error, otherwise the error code
|
|
// . this also hashes it as a number so we don't have to
|
|
// . so we can do histograms on this #
|
|
hi.m_prefix = "gbstatus";
|
|
hi.m_desc = "spider error number as string";
|
|
bufLen = sprintf ( buf , "%" UINT32 "", (uint32_t)m_indexCode );
|
|
if ( ! hashString( buf , &hi ) ) return NULL;
|
|
*/
|
|
|
|
/*
|
|
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
|
|
logf(LOG_DEBUG,"hashing indexcode=%" INT32 "",m_indexCode);
|
|
bool ok = false;
|
|
if ( m_indexCode ) ok = true;
|
|
// scan the keys in tt and make sure the termid fo
|
|
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
|
|
int32_t recSize = 0;
|
|
int32_t rcount = 0;
|
|
char *p = m_spiderStatusDocMetaList.getBufStart();
|
|
char *pend =m_spiderStatusDocMetaList.getBuf();
|
|
for ( ; p < pend ; p += recSize ) {
|
|
// get rdbid, RDB_POSDB
|
|
uint8_t rdbId = *p & 0x7f;
|
|
// skip
|
|
p++;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// init this
|
|
int32_t recSize = ks;
|
|
// convert into a key128_t, the biggest possible key
|
|
//key224_t k ;
|
|
char k[MAX_KEY_BYTES];
|
|
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
|
|
//k.setMin();
|
|
gbmemcpy ( &k , p , ks );
|
|
// is it a negative key?
|
|
char neg = false;
|
|
if ( ! ( p[0] & 0x01 ) ) neg = true;
|
|
// this is now a bit in the posdb key so we can rebalance
|
|
char shardByTermId = false;
|
|
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
|
|
shardByTermId = true;
|
|
// skip it
|
|
p += ks;
|
|
// . always zero if key is negative
|
|
// . this is not the case unfortunately...
|
|
if ( neg ) {char *xx=NULL;*xx=0; }
|
|
// print dbname
|
|
if ( rdbId != RDB_POSDB ) { char *xx=NULL;*xx=0; }
|
|
// get termid et al
|
|
key144_t *k2 = (key144_t *)k;
|
|
int64_t tid = g_posdb.getTermId(k2);
|
|
log("db: tid=%" INT64 "",tid);
|
|
if ( tid == 199947062354729LL ) ok = true;
|
|
//if ( m_indexCode == 0 && tid != 199947062354729LL ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
}
|
|
if ( ! ok ) { char *xx=NULL;*xx=0; }
|
|
goto SKIP;
|
|
// was here....
|
|
*/
|
|
|
|
/*
|
|
// gbstatus:"tcp timed out"
|
|
hi.m_prefix = "gbstatusmsg";
|
|
hi.m_desc = "spider error msg";
|
|
if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;
|
|
|
|
//hi.m_prefix = "gbdocid";
|
|
//hi.m_desc = "docid";
|
|
//bufLen = sprintf ( buf , "%" UINT64 "", *uqd ) ;
|
|
//if ( ! hashString( buf , &hi ) ) return NULL;
|
|
|
|
// . then the url. url: site: ip: etc. terms
|
|
// . do NOT hash non-fielded terms so we do not get "status"
|
|
// results polluting the serps => false
|
|
if ( ! hashUrl ( &tt4 , true ) ) return NULL;
|
|
|
|
// false --> do not hash the gbdoc* terms (CT_STATUS)
|
|
hashDateNumbers ( &tt4 , true );
|
|
*/
|
|
|
|
// store keys in safebuf then to make our own meta list
|
|
addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );
|
|
|
|
// debug this shit
|
|
//SafeBuf tmpsb;
|
|
//printMetaList ( m_spiderStatusDocMetaList.getBufStart() ,
|
|
// m_spiderStatusDocMetaList.getBuf(),
|
|
// &tmpsb );
|
|
//logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());
|
|
|
|
return true;
|
|
}
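
// Once those posdb keys are committed the status doc is searchable by
// the gbss* fields hashed above. Hypothetical example queries (exact
// query syntax depends on the front end, and field values are made up):
/*
type:status gbssDomain:example.com    // all status docs for one domain
gbssStatusCode:0                      // successfully indexed attempts
gbssIsSeedUrl:1                       // status docs for seed urls only
*/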
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta tags" );
|
|
|
|
// assume it's empty
|
|
char buf [ 32*1024 ];
|
|
int32_t bufLen = 32*1024 - 1;
|
|
buf[0] = '\0';
|
|
int32_t n = m_xml.getNumNodes();
|
|
XmlNode *nodes = m_xml.getNodes();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "custom meta tag";
|
|
|
|
// find the first meta summary node
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != 68 ) continue;
|
|
// only get content for <meta name=..> not <meta http-equiv=..>
|
|
int32_t tagLen;
|
|
char *tag = m_xml.getString ( i , "name" , &tagLen );
|
|
char *tptr = tag;
|
|
char tagLower[128];
|
|
int32_t j ;
|
|
int32_t code;
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// make tag name lower case and do not allow bad chars
|
|
if ( tagLen > 126 ) tagLen = 126 ;
|
|
to_lower3_a ( tag , tagLen , tagLower );
|
|
for ( j = 0 ; j < tagLen ; j++ ) {
|
|
// bail if has unacceptable chars
|
|
if ( ! is_alnum_a ( tag[j] ) &&
|
|
tag[j] != '-' &&
|
|
tag[j] != '_' &&
|
|
tag[j] != '.' ) break;
|
|
// convert to lower
|
|
tagLower[j] = to_lower_a ( tag[j] );
|
|
}
|
|
// skip this meta if had unacceptable chars
|
|
if ( j < tagLen ) continue;
|
|
// is it recognized?
|
|
code = getFieldCode ( tag , tagLen );
|
|
// after version 45 or more, do not allow gbrss
|
|
// meta tags, because those are now reserved for us
|
|
if ( code == FIELD_GBRSS ) continue;
|
|
// allow gbrss: fields for earlier versions though
|
|
if ( code == FIELD_GBRSS ) code = FIELD_GENERIC;
|
|
// . do not allow reserved tag names
|
|
// . title,url,suburl,
|
|
if ( code != FIELD_GENERIC ) continue;
|
|
// this is now reserved
|
|
// do not hash keyword, keywords, description, or summary metas
|
|
// because that is done in hashRange() below based on the
|
|
// tagdb (ruleset) record
|
|
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
|
|
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
|
|
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
|
|
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
|
|
continue;
|
|
// . don't allow reserved names: site, url, suburl, link and ip
|
|
// . actually, the colon is included as part of those
|
|
// field names, so we really lucked out...!
|
|
// . index this converted tag name
|
|
tptr = tagLower;
|
|
|
|
// get the content
|
|
int32_t len;
|
|
char *s = m_xml.getString ( i , "content" , &len );
|
|
if ( ! s || len <= 0 ) continue;
|
|
// . ensure not too big for our buffer (keep room for a \0)
|
|
// . TODO: this is wrong, should be len+1 > bufLen,
|
|
// but can't fix w/o resetting the index (COME BACK HERE
|
|
// and see where we index meta tags besides this place!!!)
|
|
// remove those other places, except... what about keywords
|
|
// and description?
|
|
if ( len+1 >= bufLen ) {
|
|
//len = bufLen - 1;
|
|
// assume no punct to break on!
|
|
len = 0;
|
|
// only cut off at punctuation
|
|
char *p = s;
|
|
char *pend = s + len;
|
|
char *last = NULL;
|
|
int32_t size ;
|
|
for ( ; p < pend ; p += size ) {
|
|
// skip if utf8 char
|
|
size = getUtf8CharSize(*p);
|
|
// skip if 2+ bytes
|
|
if ( size > 1 ) continue;
|
|
// skip if not punct
|
|
if ( is_alnum_a(*p) ) continue;
|
|
// mark it
|
|
last = p;
|
|
}
|
|
if ( last ) len = last - s;
|
|
// this old way was faster...:
|
|
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
|
|
}
|
|
// convert html entities to their chars
|
|
len = saftenTags ( buf , bufLen , s , len );
|
|
// NULL terminate the buffer
|
|
buf[len] = '\0';
|
|
|
|
// temp null term
|
|
char c = tptr[tagLen];
|
|
tptr[tagLen] = 0;
|
|
// custom
|
|
hi.m_prefix = tptr;
|
|
// desc is NULL, prefix will be used as desc
|
|
bool status = hashString ( buf,len,&hi );
|
|
// put it back
|
|
tptr[tagLen] = c;
|
|
// bail on error, g_errno should be set
|
|
if ( ! status ) return false;
|
|
|
|
// return false with g_errno set on error
|
|
//if ( ! hashNumber ( buf , bufLen , &hi ) )
|
|
// return false;
|
|
}
|
|
|
|
|
|
return true;
|
|
}
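
// A hypothetical illustration of what the loop above does with a page's
// meta tags (assuming "author" is not a reserved query field in
// getFieldCode()):
//
//   <meta name="author" content="Jane Doe">    -> indexed as author:jane
//                                                  author:doe
//   <meta name="title" content="...">          -> skipped, "title" is a
//                                                  reserved field
//   <meta name="description" content="...">    -> skipped here, handled
//                                                  separately
//   <meta http-equiv="refresh" content="...">  -> skipped, no name= attr
//
// i.e. only generically named metas are indexed, using the lower-cased
// tag name as the term prefix.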
|
|
|
|
|
|
bool XmlDoc::hashMetaData ( HashTableX *tt ) {
|
|
|
|
if ( ! ptr_metadata || !ptr_metadata[0] ) return true;
|
|
|
|
Json jp;
|
|
|
|
if ( ! jp.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
|
|
log("XmlDoc had error parsing json in metadata %s",
|
|
ptr_metadata);
|
|
return false;
|
|
}
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta data";
|
|
hi.m_useCountTable = false;
|
|
|
|
// always reset to word pos to 0 now when hashing a json field
|
|
// since it shouldn't matter because they are in a field so we
|
|
// have to search like myfield:whatever. this way we can
|
|
// augment ptr_metadata on an EDOCUNCHANGED error and
|
|
// not end up with undeleteable data in posdb. if we have
|
|
// duplicate fields in our doc and our doc is json, we could have
|
|
// some word position conflicts, which kinda sucks, but can be
|
|
	// avoided because this is HASHGROUP_INMETATAG, but should really
|
|
// be HASHGROUP_INMETADATA just to be sure.
|
|
int32_t saved = m_dist;
|
|
m_dist = 0;
|
|
|
|
hashJSONFields2 ( tt , &hi , &jp , false );
|
|
|
|
m_dist = saved;
|
|
|
|
return true;
|
|
}
|
|
|
|
// slightly greater than m_spideredTime, which is the download time.
|
|
// we use this for sorting as well, like for the widget so things
|
|
// don't really get added out of order and not show up in the top spot
|
|
// of the widget list.
|
|
int32_t XmlDoc::getIndexedTime() {
|
|
if ( m_indexedTimeValid ) return m_indexedTime;
|
|
m_indexedTime = getTimeGlobal();
|
|
return m_indexedTime;
|
|
}
|
|
|
|
// . hash dates for sorting by using gbsortby: and gbrevsortby:
|
|
// . do 'gbsortby:gbspiderdate' as your query to see this in action
|
|
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
|
|
|
|
// stop if already set
|
|
if ( ! m_spideredTimeValid ) return true;
|
|
|
|
int32_t indexedTime = getIndexedTime();
|
|
|
|
// first the last spidered date
|
|
HashInfo hi;
|
|
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "last spidered date";
|
|
hi.m_prefix = "gbspiderdate";
|
|
|
|
char buf[64];
|
|
int32_t bufLen = sprintf ( buf , "%" UINT32 "", (uint32_t)m_spideredTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
// and index time is >= spider time, so you want to sort by that for
|
|
// the widget for instance
|
|
hi.m_desc = "last indexed date";
|
|
hi.m_prefix = "gbindexdate";
|
|
bufLen = sprintf ( buf , "%" UINT32 "", (uint32_t)indexedTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
// do not index the rest if we are a "spider reply" document
|
|
// which is like a fake document for seeing spider statuses
|
|
//if ( isStatusDoc == CT_STATUS ) return true;
|
|
//if ( isStatusDoc ) return true;
|
|
|
|
// now for CT_STATUS spider status "documents" we also index
|
|
// gbspiderdate so index this so we can just do a
|
|
// gbsortby:gbdocspiderdate and only get real DOCUMENTS not the
|
|
// spider status "documents"
|
|
hi.m_desc = "doc last spidered date";
|
|
hi.m_prefix = "gbdocspiderdate";
|
|
bufLen = sprintf ( buf , "%" UINT32 "", (uint32_t)m_spideredTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
hi.m_desc = "doc last indexed date";
|
|
hi.m_prefix = "gbdocindexdate";
|
|
bufLen = sprintf ( buf , "%" UINT32 "", (uint32_t)indexedTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
|
|
// all done
|
|
return true;
|
|
}
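
// Example sort queries these terms serve (see the comment above
// hashDateNumbers; exact behavior depends on the query front end):
/*
gbsortby:gbspiderdate       // newest spidered first, status docs included
gbsortby:gbdocspiderdate    // same, but meant to exclude the CT_STATUS
                            // spider status docs per the comments above
gbrevsortby:gbindexdate     // oldest indexed first
*/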
|
|
|
|
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta zip" );
|
|
|
|
// . set the score based on quality
|
|
// . scores are multiplied by 256 to preserve fractions for adding
|
|
uint32_t score = *getSiteNumInlinks8() * 256 ;
|
|
if ( score <= 0 ) score = 1;
|
|
// search for meta date
|
|
char buf [ 32 ];
|
|
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
|
|
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
|
|
char *p = buf;
|
|
char *pend = buf + bufLen ;
|
|
if ( bufLen <= 0 ) return true;
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
//hi.m_prefix = "zipcode";
|
|
hi.m_prefix = "gbzipcode";
|
|
|
|
nextZip:
|
|
// . parse out the zip codes, may be multiple ones
|
|
// . skip non-digits
|
|
while ( p < pend && ! is_digit(*p) ) p++;
|
|
// skip if no digits
|
|
if ( p == pend ) return true;
|
|
// need at least 5 consecutive digits
|
|
if ( p + 5 > pend ) return true;
|
|
// if not a zip code, skip it
|
|
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
|
|
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
|
|
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
|
|
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
|
|
	// do we have too many consecutive digits?
|
|
if ( p + 5 != pend && is_digit(p[5]) ) {
|
|
// if so skip this whole string of digits
|
|
p += 5; while ( p < pend && is_digit(*p) ) p++;
|
|
goto nextZip;
|
|
}
|
|
// 90210 --> 90 902 9021 90210
|
|
for ( int32_t i = 0 ; i <= 3 ; i++ )
|
|
// use prefix as description
|
|
if ( ! hashString ( p,5-i,&hi ) ) return false;
|
|
p += 5;
|
|
goto nextZip;
|
|
}
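
// Sketch of what the prefix loop above indexes for a hypothetical
// <meta name="zipcode" content="90210"> tag: the full zip plus its
// leading prefixes, so partial zip queries can still match.
/*
gbzipcode:90210
gbzipcode:9021
gbzipcode:902
gbzipcode:90
*/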
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashContentType ( HashTableX *tt ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
uint8_t ctype = *getContentType();
|
|
char *s = NULL;
|
|
|
|
setStatus ( "hashing content type" );
|
|
|
|
|
|
// hash numerically so we can do gbfacetint:type on it
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "type";
|
|
|
|
char tmp[6];
|
|
sprintf(tmp,"%" UINT32 "",(uint32_t)ctype);
|
|
if ( ! hashString (tmp,gbstrlen(tmp),&hi ) ) return false;
|
|
|
|
|
|
// these ctypes are defined in HttpMime.h
|
|
switch (ctype) {
|
|
case CT_HTML: s = "html"; break;
|
|
case CT_TEXT: s = "text"; break;
|
|
case CT_XML : s = "xml" ; break;
|
|
case CT_PDF : s = "pdf" ; break;
|
|
case CT_DOC : s = "doc" ; break;
|
|
case CT_XLS : s = "xls" ; break;
|
|
case CT_PPT : s = "ppt" ; break;
|
|
case CT_PS : s = "ps" ; break;
|
|
// for diffbot. so we can limit search to json objects
|
|
// in Diffbot.cpp
|
|
case CT_JSON: s = "json" ; break;
|
|
}
|
|
// bail if unrecognized content type
|
|
if ( ! s ) return true;
|
|
|
|
// hack for diffbot. do not hash type:json because diffbot uses
|
|
// that for searching diffbot json objects
|
|
if ( cr->m_isCustomCrawl && ctype==CT_JSON && !m_isDiffbotJSONObject )
|
|
return true;
|
|
|
|
// . now hash it
|
|
// . use a score of 1 for all
|
|
// . TODO: ensure doc counting works ok with this when it does
|
|
// it's interpolation
|
|
return hashString (s,gbstrlen(s),&hi );
|
|
}
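
// The numeric hash above supports facet histograms over the CT_* codes
// and the string hash supports plain type: queries, e.g.:
/*
type:pdf            // string form ("html", "text", "pdf", "json", ...)
gbfacetint:type     // histogram over the numeric content type codes
*/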
|
|
|
|
// . hash the link: terms
|
|
// . ensure that more useful linkers are scored higher
|
|
// . useful for computing offsite link text for qdb-ish algorithm
|
|
// . NOTE: for now i do not hash links to the same domain in order to
|
|
// hopefully save 10%-25% index space
|
|
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
|
|
// different site links with no link text will be ranked behind them
|
|
// . the 8-bit bitmap of the score of a link: term:
|
|
// . 00ubdcss u = link is Unbanned? b = link isBanned?
|
|
// d = link dirty? c = link clean?
|
|
// s = 01 if no link text, 10 if link text
|
|
// . NOTE: this is used in Msg18.cpp for extraction
|
|
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
|
|
// so i moved the bits down
|
|
bool XmlDoc::hashLinks ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing links" );
|
|
|
|
	// shortcuts
|
|
bool isRSSFeed = *getIsRSS();
|
|
Url *cu = getCurrentUrl() ;
|
|
Url *ru = *getRedirUrl() ;
|
|
|
|
char dbuf[8*4*1024];
|
|
HashTableX dedup;
|
|
dedup.set( 8,0,1024,dbuf,8*4*1024,false,m_niceness,"hldt");
|
|
|
|
// see ../url/Url2.cpp for hashAsLink() algorithm
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
		// skip links with zero length
|
|
if ( m_links.m_linkLens[i] == 0 ) continue;
|
|
// . skip if we are rss page and this link is an <a href> link
|
|
// . we only harvest/index <link> urls from rss feeds
|
|
// . or in the case of feedburner, those orig tags
|
|
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) )
|
|
continue;
|
|
// if we have a <feedburner:origLink> tag, then ignore <link>
|
|
// tags and only get the links from the original links
|
|
if ( m_links.m_isFeedBurner &&
|
|
!(m_links.m_linkFlags[i] & LF_FBTAG) )
|
|
continue;
|
|
// normalize the link
|
|
Url link;
|
|
// now we always add "www" to these links so that any link
|
|
// to cnn.com is same as link to www.cnn.com, because either
|
|
// we index cnn.com or www.cnn.com but not both providing
|
|
// their content is identical (deduping). This way whichever
|
|
// one we index, we can take advantage of all link text whether
|
|
// it's to cnn.com or www.cnn.com.
|
|
// Every now and then we add new session ids to our list in
|
|
// Url.cpp, too, so we have to version that.
|
|
// Since this is just for hashing, it shouldn't matter that
|
|
// www.tmblr.co has no IP whereas only tmblr.co does.
|
|
link.set ( m_links.m_linkPtrs[i] ,
|
|
m_links.m_linkLens[i] ,
|
|
true , // addWWW?
|
|
m_links.m_stripIds ,
|
|
false , // stripPound?
|
|
false , // stripCommonFile?
|
|
m_version );// used for new session id stripping
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . the score depends on some factors:
|
|
// . NOTE: these are no longer valid! (see score bitmap above)
|
|
// . 4 --> if link has different domain AND has link text
|
|
// . 3 --> if link has same domain AND has link text
|
|
// . 2 --> if link has different domain AND no link text
|
|
// . 1 --> if link has sam domain AND no link text
|
|
// . is domain the same as ours?
|
|
// . NOTE: ideally, using the IP domain would be better, but
|
|
// we do not know the ip of the linker right now... so scores
|
|
// may be topped with a bunch of same-ip domain links so that
|
|
// we may not get as much link text as we'd like, since we
|
|
// only sample from one link text per ip domain
|
|
// . now we also just use the mid domain! (excludes TLD)
|
|
bool internal = false;
|
|
int32_t mdlen = cu->getMidDomainLen();
|
|
if ( mdlen == link.getMidDomainLen() &&
|
|
strncmp(cu->getMidDomain(),link.getMidDomain(),mdlen)==0)
|
|
//continue; // sameMidDomain = true;
|
|
internal = true;
|
|
// also check the redir url
|
|
if ( ru ) {
|
|
mdlen = ru->getMidDomainLen();
|
|
if ( mdlen == link.getMidDomainLen() &&
|
|
strncmp(ru->getMidDomain(),
|
|
link.getMidDomain(),mdlen)==0)
|
|
//continue; // sameMidDomain = true;
|
|
internal = true;
|
|
}
|
|
// now make the score
|
|
//unsigned char score ;
|
|
// . TODO: consider not hashing link w/o text!
|
|
// . otherwise, give it a higher score if it's got link TEXT
|
|
//bool gotLinkText = m_links.hasLinkText ( i, m_version );
|
|
// otherwise, beginning with version 21, allow internal links,
|
|
// but with lower scores
|
|
// score
|
|
// internal, no link text: 2
|
|
// internal, w/ link text: 4
|
|
// external, no link text: 6
|
|
// external, w/ link text: 8
|
|
//if ( internal ) {
|
|
// if ( ! gotLinkText ) score = 0x02;
|
|
// else score = 0x04;
|
|
//}
|
|
//else {
|
|
// if ( ! gotLinkText ) score = 0x06;
|
|
// else score = 0x08;
|
|
//}
|
|
|
|
|
|
// dedup this crap
|
|
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "link";
|
|
|
|
// hash link:<url>
|
|
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi ))
|
|
return false;
|
|
|
|
|
|
h = hash64 ( link.getHost() , link.getHostLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
|
|
// fix parm
|
|
hi.m_prefix = "sitelink";
|
|
|
|
// hash sitelink:<urlHost>
|
|
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi))
|
|
return false;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
}
|
|
|
|
// skip this for now
|
|
return true;
|
|
|
|
/*
|
|
setStatus ("hashing gbhasbannedoutlink" );
|
|
|
|
// only lets a domain vote once
|
|
int32_t numBannedOutlinks = *getNumBannedOutlinks();
|
|
//if ( numBannedOutlinks <= 0 ) return true;
|
|
// a score of 235 seems to give a negative return for score8to32()
|
|
uint32_t score = score8to32 ( numBannedOutlinks );
|
|
// make score at least 1!
|
|
if ( score <= 0 ) score = 1;
|
|
// a hack fix
|
|
if ( score > 0x7fffffff ) score = 0x7fffffff;
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "gbhasbannedoutlink";
|
|
|
|
// hash this special thing to help us de-spam the index
|
|
if ( numBannedOutlinks > 0 ) return hashString ("1",1,&hi );
|
|
else return hashString ("0",1,&hi );
|
|
*/
|
|
}
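
// A hypothetical outlink and the terms the loop above hashes for it.
// "www." is always added during normalization so links to cnn.com and
// www.cnn.com collapse to the same term, and the dedup table ensures
// each url/host is only hashed once per document (the exact normalized
// form comes from Url::set with addWWW=true):
/*
outlink:  http://cnn.com/world/story.html
hashed:   link:http://www.cnn.com/world/story.html
          sitelink:www.cnn.com
*/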
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . hash for linkdb
|
|
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
|
|
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key224_t) ) { char *xx=NULL;*xx=0; }
|
|
if ( dt->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this will be different with our new site definitions
|
|
uint32_t linkerSiteHash32 = *getSiteHash32();
|
|
|
|
char siteRank = getSiteRank();
|
|
|
|
if ( ! m_linksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// we need to store this in the title rec for re-building
|
|
// the meta list from the title rec...
|
|
// is this just site info?
|
|
//TagRec ***pgrv = getOutlinkTagRecVector();
|
|
//if ( ! pgrv || pgrv == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
//TagRec **grv = *pgrv;
|
|
|
|
int32_t *linkSiteHashes = getLinkSiteHashes();
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ){
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
// convert siteNumInlinks into a score
|
|
//int32_t numSiteInlinks = *xd->getSiteNumInlinks();
|
|
|
|
unsigned char hopCount = *getHopCount();
|
|
|
|
// use spidered time! might not be current time! like if rebuilding
|
|
// or injecting from a past spider time
|
|
int32_t discoveryDate = getSpideredTime();//TimeGlobal();
|
|
int32_t lostDate = 0;
|
|
|
|
// add in new links
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
// give up control
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if empty
|
|
if ( m_links.m_linkLens[i] == 0 ) continue;
|
|
// . skip if spam, ALWAYS allow internal outlinks though!!
|
|
// . CAUTION: now we must version islinkspam()
|
|
bool spam = m_links.isLinkSpam(i) ;
|
|
// or if it has no link text, skip it
|
|
//if ( ! links->hasLinkText(i,TITLEREC_CURRENT_VERSION) )
|
|
//continue;
|
|
// get site of outlink from tagrec if in there
|
|
int32_t linkeeSiteHash32 = linkSiteHashes[i];
|
|
/*
|
|
TagRec *gr = grv[i];
|
|
char *site = NULL;
|
|
int32_t siteLen = 0;
|
|
if ( gr ) {
|
|
int32_t dataSize = 0;
|
|
site = gr->getString("site",NULL,&dataSize);
|
|
if ( dataSize ) siteLen = dataSize - 1;
|
|
}
|
|
// otherwise, make it the host or make it cut off at
|
|
// a "/user/" or "/~xxxx" or whatever path component
|
|
if ( ! site ) {
|
|
// GUESS link site... TODO: augment for /~xxx
|
|
char *s = m_links.getLink(i);
|
|
//int32_t slen = m_links.getLinkLen(i);
|
|
//siteLen = slen;
|
|
site = ::getHost ( s , &siteLen );
|
|
}
|
|
uint32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
|
|
*/
|
|
|
|
//
|
|
// when setting the links class it should set the site hash
|
|
//
|
|
|
|
|
|
// set this key, it is the entire record
|
|
key224_t k;
|
|
k = g_linkdb.makeKey_uk ( linkeeSiteHash32 ,
|
|
m_links.getLinkHash64(i) ,
|
|
spam , // link spam?
|
|
siteRank , // was quality
|
|
hopCount,
|
|
*getIp() ,
|
|
*getDocId() ,
|
|
discoveryDate ,
|
|
lostDate ,
|
|
false , // new add?
|
|
linkerSiteHash32 ,
|
|
false );// delete?
|
|
/*
|
|
// debug
|
|
if ( m_links.getLinkHash64(i) != 0x3df1c439a364e18dLL )
|
|
continue;
|
|
//char c = site[siteLen];
|
|
//site[siteLen]=0;
|
|
//char tmp[1024];
|
|
//sprintf(tmp,"xmldoc: hashinglink site=%s sitelen=%" INT32 " ",
|
|
// site,siteLen);
|
|
//site[siteLen] = c;
|
|
log(//"%s "
|
|
"url=%s "
|
|
"linkeesitehash32=0x%08" XINT32 " "
|
|
"linkersitehash32=0x%08" XINT32 " "
|
|
"urlhash64=0x%16llx "
|
|
"docid=%" INT64 " k=%s",
|
|
//tmp,
|
|
m_links.getLink(i),
|
|
(int32_t)linkeeSiteHash32,
|
|
linkerSiteHash32,
|
|
m_links.getLinkHash64(i),
|
|
*getDocId(),
|
|
KEYSTR(&k,sizeof(key224_t))
|
|
);
|
|
*/
|
|
// store in hash table
|
|
if ( ! dt->addKey ( &k , NULL ) ) return false;
|
|
}
|
|
return true;
|
|
}
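
// Rough sketch of what the caller supplies for each linkdb key built
// above via g_linkdb.makeKey_uk() (the actual bit packing lives in
// Linkdb, this just labels the arguments):
//
//   linkeeSiteHash32   - site hash of the page being linked TO
//   linkHash64         - hash of the outlink url itself
//   spam               - was the outlink judged link spam?
//   siteRank, hopCount - of the linking page (this doc)
//   ip, docId          - of the linking page (this doc)
//   discoveryDate      - this doc's spidered time, not "now", so rebuilds
//                        and injections keep their original dates
//   linkerSiteHash32   - site hash of the linking page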
|
|
|
|
bool XmlDoc::getUseTimeAxis ( ) {
|
|
if ( m_useTimeAxisValid )
|
|
return m_useTimeAxis;
|
|
if ( m_setFromTitleRec )
|
|
// return from titlerec header
|
|
return m_useTimeAxis;
|
|
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
|
if ( ! cr ) return false;
|
|
m_useTimeAxis = cr->m_useTimeAxis;
|
|
m_useTimeAxisValid = true;
|
|
// sanity check
|
|
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
|
|
// log("build: custom crawls can't use time axis");
|
|
// char *xx=NULL;*xx=0;
|
|
// m_useTimeAxis = false;
|
|
// }
|
|
return m_useTimeAxis;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
|
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {
|
|
|
|
setStatus ( "hashing url colon" );
|
|
|
|
// get the first url
|
|
Url *fu = getFirstUrl();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
|
|
// we do not need diversity bits for this
|
|
hi.m_useCountTable = false;
|
|
//
|
|
// HASH url: term
|
|
//
|
|
// append a "www." for doing url: searches
|
|
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
|
|
hi.m_prefix = "url";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "url2";
|
|
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
|
return false;
|
|
|
|
if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
|
|
hi.m_prefix = "gbtimeurl";
|
|
SafeBuf *tau = getTimeAxisUrl();
|
|
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
|
|
}
|
|
|
|
// use hash of url as score so we can get a # of docs per site est.
|
|
//uint16_t score = hash16 ( fu->getUrl() , fu->getUrlLen() );
|
|
|
|
setStatus ( "hashing inurl colon" );
|
|
|
|
//
|
|
// HASH inurl: terms
|
|
//
|
|
char *s = fu->getUrl ();
|
|
int32_t slen = fu->getUrlLen();
|
|
hi.m_prefix = "inurl";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
|
|
if ( ! hashString ( s,slen, &hi ) ) return false;
|
|
|
|
setStatus ( "hashing ip colon" );
|
|
|
|
//
|
|
// HASH ip:a.b.c.d
|
|
//
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
// copy it to save it
|
|
char ipbuf[64];
|
|
int32_t iplen = sprintf(ipbuf,"%s",iptoa(m_ip));
|
|
//char *tmp = iptoa ( m_ip );
|
|
//int32_t tlen = gbstrlen(tmp);
|
|
hi.m_prefix = "ip";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "ip2";
|
|
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
|
|
|
|
//
|
|
// HASH ip:a.b.c
|
|
//
|
|
char *end1 = ipbuf + iplen - 1;
|
|
while ( *end1 != '.' ) end1--;
|
|
if ( ! hashSingleTerm(ipbuf,end1-ipbuf,&hi) ) return false;
|
|
|
|
|
|
// . sanity check
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// get the boost
|
|
//floatboost1=(float)getBoostFromSiteNumInlinks(m_siteNumInlinks)/100.0
|
|
|
|
|
|
|
|
//
|
|
// HASH the url path plain as if in body
|
|
//
|
|
// get number of components in the path. does not include the filename
|
|
int32_t pathDepth = fu->getPathDepth(false);
|
|
// make it a density thing
|
|
//pathScore /= ( pathDepth + 1 );
|
|
// ensure score positive
|
|
//if ( pathScore <= 0 ) pathScore = 1;
|
|
// get it
|
|
char *path = fu->getPath();
|
|
int32_t plen = fu->getPathLen();
|
|
/*
|
|
// update it
|
|
float boost2 = (float)m_urlPathWeight / 100;
|
|
// again
|
|
float boost3 = 1.0 / ((float)pathDepth + 1.0) ;
|
|
// make a description
|
|
char tmp3[190];
|
|
sprintf( tmp3 ,
|
|
"path score = "
|
|
"siteInlinksBoost * "
|
|
"urlPathWeight * "
|
|
"pathDepthBoost * "
|
|
"256 = %.02f * %.02f * %.02f * 256 " ,
|
|
boost1 ,
|
|
boost2 ,
|
|
boost3 );
|
|
*/
|
|
//int32_t pathScore = (int32_t) (256.0 * boost1 * boost2 * boost3);
|
|
// update parms
|
|
//hi.m_desc = tmp3;
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "url path";
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
|
|
// if parm "index article content only" is true, do not index this!
|
|
//if ( m_eliminateMenus ) skipIndex=true;
|
|
|
|
setStatus ( "hashing gbpathdepth");
|
|
|
|
//
|
|
// HASH gbpathdepth:X
|
|
//
|
|
// xyz.com/foo --> 0
|
|
// xyz.com/foo/ --> 1
|
|
// xyz.com/foo/boo --> 1
|
|
// xyz.com/foo/boo/ --> 2
|
|
char buf[20];
|
|
int32_t blen = sprintf(buf,"%" INT32 "",pathDepth);
|
|
// update parms
|
|
hi.m_prefix = "gbpathdepth";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// hash gbpathdepth:X
|
|
if ( ! hashString ( buf,blen,&hi) ) return false;
|
|
|
|
|
|
|
|
//
|
|
// HASH gbhopcount:X
|
|
//
|
|
setStatus ( "hashing gbhopcount");
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
blen = sprintf(buf,"%" INT32 "",(int32_t)m_hopCount);
|
|
// update parms
|
|
hi.m_prefix = "gbhopcount";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// hash gbpathdepth:X
|
|
if ( ! hashString ( buf,blen,&hi) ) return false;
|
|
|
|
|
|
|
|
setStatus ( "hashing gbhasfilename");
|
|
|
|
//
|
|
// HASH gbhasfilename:0 or :1
|
|
//
|
|
char *hm;
|
|
if ( fu->getFilenameLen() ) hm = "1";
|
|
else hm = "0";
|
|
// update parms
|
|
hi.m_prefix = "gbhasfilename";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
|
|
// hash gbhasfilename:[0|1]
|
|
if ( ! hashString ( hm,1,&hi) ) return false;
|
|
|
|
setStatus ( "hashing gbiscgi");
|
|
|
|
//
|
|
// HASH gbiscgi:0 or gbiscgi:1
|
|
//
|
|
if ( fu->isCgi() ) hm = "1";
|
|
else hm = "0";
|
|
hi.m_prefix = "gbiscgi";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
|
|
if ( ! hashString ( hm,1,&hi) ) return false;
|
|
|
|
|
|
setStatus ( "hashing gbext");
|
|
|
|
//
|
|
// HASH gbhasext:0 or gbhasext:1 (does it have a fileextension)
|
|
//
|
|
// . xyz.com/foo --> gbhasext:0
|
|
// . xyz.com/foo.xxx --> gbhasext:1
|
|
if ( fu->getExtensionLen() ) hm = "1";
|
|
else hm = "0";
|
|
hi.m_prefix = "gbhasext";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
|
|
if ( ! hashString ( hm,1,&hi) ) return false;
|
|
|
|
//
|
|
// HASH the url's mid domain and host as they were in the body
|
|
//
|
|
setStatus ( "hashing site colon terms");
|
|
|
|
//
|
|
// HASH the site: terms
|
|
//
|
|
// . hash the pieces of the site
|
|
// . http://host.domain.com/~harry/level1/ should hash to:
|
|
// . site:host.domain.com/~harry/level1/
|
|
// . site:host.domain.com/~harry/
|
|
// . site:host.domain.com/~
|
|
// . site:host.domain.com/
|
|
// . site:domain.com/~harry/level1/
|
|
// . site:domain.com/~harry/
|
|
// . site:domain.com/~
|
|
// . site:domain.com/
|
|
// ensure score is positive
|
|
//if ( siteScore <= 0 ) siteScore = 1;
|
|
// get the hostname (later we set to domain name)
|
|
char *name = fu->getHost();
|
|
int32_t nameLen = fu->getHostLen();
|
|
// . point to the end of the whole thing, including port field
|
|
// . add in port, if non default
|
|
char *end3 = name + fu->getHostLen() + fu->getPortLen();
|
|
loop:
|
|
// now loop through the sub paths of this url's path
|
|
for ( int32_t i = 0 ; ; i++ ) {
|
|
// get the subpath
|
|
int32_t len = fu->getSubPathLen(i);
|
|
// FIX: always include first /
|
|
if ( len == 0 ) len = 1;
|
|
// write http://www.whatever.com/path into buf
|
|
char buf[MAX_URL_LEN+10];
|
|
char *p = buf;
|
|
gbmemcpy ( p , "http://" , 7 ); p += 7;
|
|
gbmemcpy ( p , name , nameLen ); p += nameLen;
|
|
gbmemcpy ( p , fu->getPath() , len ); p += len;
|
|
*p = '\0';
|
|
// update hash parms
|
|
hi.m_prefix = "site";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "site2";
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
// this returns false on failure
|
|
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
|
|
// break when we hash the root path
|
|
if ( len <=1 ) break;
|
|
}
|
|
// now keep moving the period over in the hostname
|
|
while ( name < end3 && *name != '.' ) { name++; nameLen--; }
|
|
// skip the '.'
|
|
name++; nameLen--;
|
|
	// if more hostname remains, hash site: terms for that suffix too
|
|
if ( name < end3 ) goto loop;
|
|
|
|
setStatus ( "hashing ext colon");
|
|
|
|
//
|
|
// HASH ext: term
|
|
//
|
|
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
|
|
char *ext = fu->getExtension();
|
|
int32_t elen = fu->getExtensionLen();
|
|
// update hash parms
|
|
hi.m_prefix = "ext";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "ext2";
|
|
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
|
|
|
|
|
|
setStatus ( "hashing gbdocid" );
|
|
hi.m_prefix = "gbdocid";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
|
|
char buf2[32];
|
|
sprintf(buf2,"%" UINT64 "",(m_docId) );
|
|
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
|
|
|
|
// if indexing a json diffbot object, index
|
|
// gbparenturl:xxxx of the original url from which the json was
|
|
// datamined. we use this so we can act as a diffbot json cache.
|
|
if ( m_isDiffbotJSONObject ) {
|
|
setStatus ( "hashing gbparenturl term");
|
|
char *p = fu->getUrl() + fu->getUrlLen() - 1;
|
|
// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
|
|
for ( ; *p && *p != '-' ; p-- );
|
|
// set up the hashing parms
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "diffbot parent url";
|
|
// append a "www." as part of normalization
|
|
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
|
|
hi.m_prefix = "gbparenturl";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
|
|
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
|
return false;
|
|
}
|
|
|
|
//if ( isStatusDoc ) return true;
|
|
|
|
setStatus ( "hashing SiteGetter terms");
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->m_plen <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
|
|
char *host = fu->getHost ();
|
|
int32_t hlen = fu->getHostLen ();
|
|
|
|
// tags from here out
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_shardByTermId = true;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
hi.m_shardByTermId = false;
|
|
|
|
setStatus ( "hashing urlhashdiv10 etc");
|
|
|
|
//
|
|
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
|
|
//
|
|
// this is for proving how many docs are in the index
|
|
uint32_t h = hash32 ( s , slen );
|
|
blen = sprintf(buf,"%" UINT32 "",h);
|
|
hi.m_prefix = "urlhash";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
blen = sprintf(buf,"%" UINT32 "",h/10);
|
|
// update hashing parms
|
|
hi.m_prefix = "urlhashdiv10";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
blen = sprintf(buf,"%" UINT32 "",h/100);
|
|
// update hashing parms
|
|
hi.m_prefix = "urlhashdiv100";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
|
|
|
|
setStatus ( "hashing url mid domain");
|
|
// the final score
|
|
//int32_t plainScore = (int32_t)(256.0 * boost1 * boost2 * fw);
|
|
// update parms
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "middle domain";//tmp3;
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
// if parm "index article content only" is true, do not index this!
|
|
//if ( m_eliminateMenus ) plainScore = 0;
|
|
//char *mid = fu->getMidDomain ();
|
|
//int32_t mlen = fu->getMidDomainLen();
|
|
//hi.m_desc = "url mid dom";
|
|
//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
|
|
//hi.m_desc = "url host";
|
|
if ( ! hashString ( host,hlen,&hi)) return false;
|
|
|
|
|
|
setStatus ( "hashing url path");
|
|
|
|
// hash the path plain
|
|
if ( ! hashString (path,plen,&hi) ) return false;
|
|
|
|
return true;
|
|
}
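
// A rough, hypothetical summary of the terms hashUrl() generates for
//
//   http://example.com/~harry/level1/index.html
//
// (term spellings are conceptual; the exact normalized forms come from
// the Url class and the hashing helpers above):
/*
url:http://www.example.com/~harry/level1/index.html   // "www." added
inurl:<each word of the url>
ip:1.2.3.4   ip:1.2.3                                  // from m_ip
gbpathdepth:2   gbhasfilename:1   gbiscgi:0   gbhasext:1
ext:html
site:example.com/~harry/level1/  site:example.com/~harry/  site:example.com/
gbdocid:<docid>
urlhash:<h>   urlhashdiv10:<h/10>   urlhashdiv100:<h/100>
<plain host and path words with no prefix, HASHGROUP_INURL>
*/
// The siteterm: term for SiteGetter is skipped for this url since it
// does not end in '/'.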
|
|
/////////////
|
|
//
|
|
// CHROME DETECTION
|
|
//
|
|
// we search for these terms we hash here in getSectionsWithDupStats()
|
|
// so we can remove chrome.
|
|
//
|
|
/////////////
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
|
bool XmlDoc::hashSections ( HashTableX *tt ) {
|
|
|
|
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( m_contentType == CT_HTML ) return true;
|
|
|
|
setStatus ( "hashing sections" );
|
|
|
|
if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
Sections *ss = &m_sections;
|
|
|
|
int32_t siteHash32 = *getSiteHash32();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
// the prefix is custom set for each section below
|
|
//hi.m_prefix = "gbsectionhash";
|
|
// put all guys with the same xpath/site on the same shard
|
|
hi.m_shardByTermId = true;
|
|
|
|
Section *si = ss->m_rootSection;
|
|
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . skip if empty
|
|
// . this needs to be like 48 bits because 32 bits is not
|
|
// big enough!
|
|
//uint64_t ih64 = si->m_sentenceContentHash64;
|
|
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index
|
|
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
|
|
// skip if sentence, only hash tags now i guess for diffbot
|
|
//if ( si->m_sentenceContentHash64 )
|
|
// continue;
|
|
|
|
// get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
|
|
if ( ! val32 )
|
|
continue;
|
|
|
|
// the termid is now the xpath and the sitehash, the "value"
|
|
// will be the hash of the innerhtml, m_sentenceContentHash64
|
|
uint64_t thash64 = (uint32_t)si->m_turkTagHash32;
|
|
// combine with site hash
|
|
thash64 ^= (uint32_t)siteHash32;
|
|
|
|
// this is a special hack we need to make it the
|
|
// hash of the inner html
|
|
//hi.m_sentHash32 = (uint32_t)ih64;
|
|
|
|
// . get section xpath & site hash
|
|
// . now if user does a gbfacets:gbxpathsitehashxxxxxx query
|
|
// he will get back a histogram of the values it hash,
|
|
// which are 32-bit hashes of the innerhtml for that
|
|
// xpath on this site.
|
|
char prefix[96];
|
|
sprintf(prefix,"gbxpathsitehash%" UINT64 "",thash64);
|
|
|
|
// like a normal key but we store "ih64" the innerHTML hash
|
|
// of the section into the key instead of wordbits etc.
|
|
// similar to hashNumber*() functions.
|
|
//if ( ! hashSectionTerm ( term , &hi, (uint32_t)ih64 ) )
|
|
// return false;
|
|
|
|
// i guess use facets
|
|
hi.m_prefix = prefix;
|
|
|
|
// we already have the hash of the inner html of the section
|
|
hashFacet2 ( "gbfacetstr",
|
|
prefix,
|
|
//(int32_t)(uint32_t)ih64 ,
|
|
val32,
|
|
hi.m_tt ,
|
|
// shard by termId?
|
|
true );
|
|
}
|
|
|
|
return true;
|
|
}
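
// Chrome detection (getSectionsWithDupStats()) later searches these
// facet terms. Conceptually each tag section indexed above contributes:
//
//   term  : gbxpathsitehash<h>   where h = turkTagHash32 ^ siteHash32
//   value : 32-bit hash of the sentences under that xpath
//
// so a query like gbfacets:gbxpathsitehash1234567890 (hash value is
// hypothetical) returns a histogram of inner-content hashes for that
// xpath across the site; xpaths whose content repeats site-wide can be
// treated as chrome and removed.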
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
|
bool hashAnomalies ,
|
|
bool hashNonAnomalies ) {
|
|
|
|
// do not index ANY of the body if it is NOT a permalink and
|
|
// "menu elimination" technology is enabled.
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing link text" );
|
|
|
|
// . now it must have an rss item to be indexed in all its glory
|
|
// . but if it tells us it has an rss feed, toss it and wait for
|
|
// the feed.... BUT sometimes the rss feed outlink is 404!
|
|
// . NO, now we discard with ENORSS at Msg16.cpp
|
|
//if ( ! *getHasRSSItem() && m_eliminateMenus ) return true;
|
|
|
|
// sanity check
|
|
if ( hashAnomalies == hashNonAnomalies ) { char *xx = NULL; *xx =0; }
|
|
// display this note in page parser
|
|
char *note = "hashing incoming link text";
|
|
// sanity
|
|
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
|
|
// . finally hash in the linkText terms from the LinkInfo
|
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
|
// somewhere in TitleRec
|
|
// . otherwise, we generated it from merging a bunch of LinkInfos
|
|
// and storing them in this new TitleRec
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
LinkInfo **pinfo2 = getLinkInfo2 ();
|
|
LinkInfo *info2 = *pinfo2;
|
|
LinkInfo *linkInfo = info1;
|
|
// pick the one with the most inlinks with valid incoming link text,
|
|
// otherwise, we end up with major bias when we stop importing
|
|
// link text from another cluster, because some pages will have
|
|
// twice as many links as they should!
|
|
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
|
|
linkInfo = info2;
|
|
note = "hashing incoming link text from other cluster";
|
|
}
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// brought the following code in from LinkInfo.cpp
|
|
//
|
|
|
|
int32_t noteLen = 0;
|
|
if ( note ) noteLen = gbstrlen ( note );
|
|
// count "external" inlinkers
|
|
int32_t ecount = 0;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_useSynonyms = true;
|
|
// hashstring should update this like a cursor.
|
|
hi.m_startDist = 0;
|
|
|
|
// loop through the link texts and hash them
|
|
for ( Inlink *k = NULL; (k = linkInfo->getNextInlink(k)) ; ) {
|
|
// is this inlinker internal?
|
|
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// count external inlinks we have for indexing gbmininlinks:
|
|
if ( ! internal ) ecount++;
|
|
// get score
|
|
//int64_t baseScore = k->m_baseScore;
|
|
// get the weight
|
|
//int64_t ww ;
|
|
//if ( internal ) ww = m_internalLinkTextWeight;
|
|
//else ww = m_externalLinkTextWeight;
|
|
// modify the baseScore
|
|
//int64_t final = (baseScore * ww) / 100LL;
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// sanity check
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 2 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
// if it is anomalous, set this, we don't
|
|
//if ( k->m_isAnomaly )
|
|
// hi.m_hashIffNotUnique = true;
|
|
//hi.m_baseScore = final;
|
|
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
|
|
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
|
|
// store the siterank of the linker in this and use that
|
|
// to set the multiplier M bits i guess
|
|
hi.m_linkerSiteRank = k->m_siteRank;
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
k->m_wordPosStart = m_dist; // hi.m_startDist;
|
|
// . hash the link text into the table
|
|
// . returns false and sets g_errno on error
|
|
// . we still have the score punish from # of words though!
|
|
// . for inlink texts that are the same it should accumulate
|
|
// and use the reserved bits as a multiplier i guess...
|
|
if ( ! hashString ( txt,tlen,&hi) ) return false;
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
//k->m_wordPosEnd = hi.m_startDist;
|
|
// spread it out
|
|
hi.m_startDist += 20;
|
|
}
|
|
|
|
/*
|
|
// . hash gbkeyword:numinlinks where score is # of inlinks from 1-255
|
|
// . do not hash gbkeyword:numinlinks if we don't got any
|
|
if ( ecount <= 0 ) return true;
|
|
// limit it since our score can't be more than 255 (8-bits)
|
|
//if ( ecount > 255 ) ecount = 255;
|
|
// convert our 32 bit score to 8-bits so we trick it!
|
|
//int32_t score = score8to32 ( (uint8_t)ecount );
|
|
// watch out for wrap
|
|
//if ( score < 0 ) score = 0x7fffffff;
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "gbkeyword";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// for terms where word position/density/diversity is irrelevant,
|
|
// we can store this value...
|
|
hi.m_fakeValue = ecount;
|
|
// hash gbkeyword:numinlinks term
|
|
if ( ! hashString ( "numinlinks",10,&hi ) )return false;
|
|
*/
|
|
|
|
return true;
|
|
}
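
// Illustrative sketch (commented out): the internal-vs-external inlinker
// test used in the loop above. an inlinker whose masked ip matches ours
// (same network by this mask) is "internal" and its anchor text goes into
// the lower-weight hash group. parameter names are stand-ins.
/*
static int32_t pickLinkTextHashGroup ( int32_t linkerIp , int32_t ourIp ) {
        bool internal = ( (ourIp & 0x0000ffff) == (linkerIp & 0x0000ffff) );
        if ( internal ) return HASHGROUP_INTERNALINLINKTEXT;
        return HASHGROUP_INLINKTEXT;
}
*/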
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
|
|
|
// seems like iffUnique is off, so do this
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing neighborhoods" );
|
|
|
|
//g_tt = table;
|
|
|
|
// . now we also hash the neighborhood text of each inlink, that is,
|
|
// the text surrounding the inlink text.
|
|
// . this is also destructive in that it will remove termids that
|
|
// were not in the document being linked to in order to save
|
|
// space in the titleRec
|
|
// . now we only do one or the other, not both
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
LinkInfo **pinfo2 = getLinkInfo2 ();
|
|
LinkInfo *info2 = *pinfo2;
|
|
LinkInfo *linkInfo = info1;
|
|
|
|
char *note = " (internal cluster)";
|
|
// pick the one with the most inlinks with valid incoming link text
|
|
// otherwise, we end up with major bias when we stop importing
|
|
// link text from another cluster, because some pages will have
|
|
// twice as many links as they should!
|
|
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
|
|
linkInfo = info2;
|
|
note = " (external cluster)";
|
|
}
|
|
|
|
// loop over all the Inlinks
|
|
Inlink *k = NULL;
|
|
loop:
|
|
// get the next inlink
|
|
k = linkInfo->getNextInlink( k );
|
|
// break if done
|
|
if ( ! k ) return true;
|
|
|
|
// skip if internal, they often have the same neighborhood text
|
|
if ( (k->m_ip&0x0000ffff)==(m_ip&0x0000ffff) ) goto loop;
|
|
|
|
// get the left and right texts and hash both
|
|
char *s = k->getSurroundingText();
|
|
if ( ! s || k->size_surroundingText <= 1 ) goto loop;
|
|
|
|
//int32_t inlinks = *getSiteNumInlinks();
|
|
|
|
// HACK: to avoid having to pass a flag to TermTable, then to
|
|
// Words::hash(), Phrases::hash(), etc. just flip a bit in the
|
|
// table to make it not add anything unless it is already in there.
|
|
tt->m_addIffNotUnique = true;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "surrounding text";
|
|
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
|
|
|
|
// . hash that
|
|
// . this returns false and sets g_errno on error
|
|
int32_t len = k->size_surroundingText - 1;
|
|
if ( ! hashString ( s, len, &hi ) ) return false;
|
|
|
|
// now turn it back off
|
|
tt->m_addIffNotUnique = false;
|
|
|
|
// get the next Inlink
|
|
goto loop;
|
|
|
|
return true;
|
|
}
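
// Illustrative sketch (commented out): the m_addIffNotUnique toggle used
// above. flipping that bit on the table makes hashString() only accumulate
// terms already present, so surrounding text can reinforce existing terms
// without introducing new ones. "xd" is a stand-in for the XmlDoc.
/*
static bool hashSurroundingTextOnly ( XmlDoc *xd , HashTableX *tt ,
                                      char *s , int32_t slen ) {
        HashInfo hi;
        hi.m_tt        = tt;
        hi.m_desc      = "surrounding text";
        hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
        tt->m_addIffNotUnique = true;    // only bump terms already in the table
        bool ok = xd->hashString ( s , slen , &hi );
        tt->m_addIffNotUnique = false;   // restore normal behavior
        return ok;
}
*/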
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing rss info" );
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . finally hash in the linkText terms from the LinkInfo
|
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
|
// somewhere in TitleRec
|
|
// . otherwise, we generated it from merging a bunch of LinkInfos
|
|
// and storing them in this new TitleRec
|
|
LinkInfo *linkInfo = getLinkInfo1();
|
|
|
|
// get the xml of the first rss/atom item/entry referencing this url
|
|
Xml xml;
|
|
// . returns NULL if no item xml
|
|
// . this could also be a "channel" blurb now, so we index channel pgs
|
|
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;
|
|
|
|
if ( xml.isEmpty() )
|
|
// hash gbrss:0
|
|
return hashRSSTerm ( tt , false );
|
|
|
|
// parser info msg
|
|
//if ( m_pbuf ) {
|
|
// m_pbuf->safePrintf(
|
|
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
|
|
//}
|
|
|
|
// hash nothing if not a permalink and eliminating "menus"
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
// . IMPORTANT: you must be using the new link algo, so turn it on
|
|
// in the spider controls. this allows us to include LinkTexts from
|
|
// the same IP in our LinkInfo class in the TitleRec.
|
|
// . is it rss or atom? both use title tag, so doesn't matter
|
|
// . get the title tag
|
|
bool isHtmlEncoded;
|
|
int32_t titleLen;
|
|
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
|
|
char c = 0;
|
|
|
|
// sanity check
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
bool hashIffUnique = true;
|
|
// but if we had no content because we were an mp3 or whatever,
|
|
// do not worry about avoiding double hashing
|
|
if ( size_utf8Content <= 0 ) hashIffUnique = false;
|
|
|
|
	// decode it?
	// should we decode it? if they don't use <![CDATA[...]]> then we
	// should. ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has
	// CDATA, but most other feeds do not use it
	if ( isHtmlEncoded && title && titleLen > 0 ) {
		// it is html encoded so that the <'s are encoded to &lt;'s
		// so we must decode them back. this could turn latin1 into
		// utf8 though? no, because the &'s should have been
		// encoded, too!
		int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
		// make sure we don't overflow the buffer
		if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
		// reassign the length
		titleLen = newLen;
		// NULL terminate it
		c = title[titleLen];
		title[titleLen] = '\0';
	}
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
hi.m_desc = "rss title";
|
|
|
|
// . hash the rss title
|
|
// . only hash the terms if they are unique to stay balanced with docs
|
|
// that are not referenced by an rss feed
|
|
bool status = hashString ( title,titleLen,&hi ) ;
|
|
// pop the end back just in case
|
|
if ( c ) title[titleLen] = c;
|
|
// return false with g_errno set on error
|
|
if ( ! status ) return false;
|
|
|
|
// get the rss description
|
|
int32_t descLen;
|
|
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );
|
|
|
|
// for advanced hashing
|
|
Xml xml2;
|
|
Words w;
|
|
//Scores scores;
|
|
Words *wordsPtr = NULL;
|
|
//Scores *scoresPtr = NULL;
|
|
c = 0;
|
|
	// should we decode it? if they don't use <![CDATA[...]]> then we
	// should. ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has
	// CDATA, but most other feeds do not use it
	if ( isHtmlEncoded && desc && descLen > 0 ) {
		// it is html encoded so that the <'s are encoded to &lt;'s
		// so we must decode them back. this could turn latin1 into
		// utf8 though? no, because the &'s should have been
		// encoded, too!
		int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
		// make sure we don't overflow the buffer
		if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
		// reassign the length
		descLen = newLen;
	}
|
|
|
|
// NULL terminate it
|
|
if ( desc ) {
|
|
c = desc[descLen];
|
|
desc[descLen] = '\0';
|
|
// set the xml class from the decoded html
|
|
if ( ! xml2.set ( desc ,
|
|
descLen ,
|
|
false , // own data?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
true , // set parents?
|
|
m_niceness ,
|
|
*ct ) )
|
|
return false;
|
|
// set the words class from the xml, returns false and sets
|
|
// g_errno on error
|
|
if ( ! w.set ( &xml2 ,
|
|
true , // compute Ids
|
|
true ))// has html ents? (WERE encoded twice!)
|
|
|
|
return false;
|
|
// pass it in to TermTable::hash() below
|
|
wordsPtr = &w;
|
|
}
|
|
|
|
// update hash parms
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "rss body";
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
// . hash the rss/atom description
|
|
// . only hash the terms if they are unique to stay balanced with docs
|
|
// that are not referenced by an rss feed
|
|
status = hashString ( desc, descLen, &hi );
|
|
// pop the end back just in case
|
|
if ( c ) desc[descLen] = c;
|
|
// return false with g_errno set
|
|
if ( ! status ) return false;
|
|
|
|
// hash gbrss:1
|
|
if ( ! hashRSSTerm ( tt , true ) ) return false;
|
|
|
|
// parser info msg
|
|
//if ( m_pbuf ) {
|
|
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
|
|
// "</b><br><br>");
|
|
//}
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
	// hash gbrss:0 or gbrss:1
	char *value;
	if ( inRSS ) value = "1";
	else value = "0";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_prefix = "gbinrss";
	hi.m_hashGroup = HASHGROUP_INTAG;

	// returns false and sets g_errno on error
	if ( ! hashString(value,1,&hi ) ) return false;

	// hash gbisrss:1 if we are an rss page ourselves
	if ( *getIsRSS() ) value = "1";
	else value = "0";
	// update hash parms
	hi.m_prefix = "gbisrss";
	// returns false and sets g_errno on error
	if ( ! hashString(value,1,&hi) ) return false;
	return true;
}
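
// Illustrative note (commented out): the two boolean terms indexed above can
// be queried directly, e.g.
/*
	gbinrss:1   // docs referenced by an rss/atom item
	gbisrss:1   // docs that are themselves rss/atom feeds
*/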
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
// . the new Weights class hashes title as part of body now with a high weight
|
|
// given by "titleWeight" parm
|
|
bool XmlDoc::hashTitle ( HashTableX *tt ) {
|
|
// sanity check
|
|
if ( m_hashedTitle ) { char *xx=NULL ; *xx=0; }
|
|
|
|
setStatus ( "hashing title" );
|
|
|
|
// this has been called, note it
|
|
m_hashedTitle = true;
|
|
|
|
nodeid_t *tids = m_words.m_tagIds;
|
|
int32_t nw = m_words.m_numWords;
|
|
|
|
// find the first <title> tag in the doc
|
|
int32_t i ;
|
|
for ( i = 0 ; i < nw ; i++ )
|
|
if ( tids[i] == TAG_TITLE ) break;
|
|
|
|
// return true if no title
|
|
if ( i >= nw ) return true;
|
|
|
|
// skip tag
|
|
i++;
|
|
// mark it as start of title
|
|
int32_t a = i;
|
|
|
|
// limit end
|
|
int32_t max = i + 40;
|
|
if ( max > nw ) max = nw;
|
|
|
|
	// find end of title, either another <title> or a </title> tag
	for ( ; i < max ; i++ )
		if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break;

	// empty title? (the title region ends right where it starts)
	if ( i == a ) return true;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "title";
|
|
hi.m_useSynonyms= true;
|
|
|
|
// the new posdb info
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
|
|
// . hash it up! use 0 for the date
|
|
// . use XmlDoc::hashWords()
|
|
// . use "title" as both prefix and description
|
|
//if ( ! hashWords (a,i,&hi ) ) return false;
|
|
|
|
char **wptrs = m_words.getWords();
|
|
int32_t *wlens = m_words.getWordLens();
|
|
char *title = wptrs[a];
|
|
char *titleEnd = wptrs[i-1] + wlens[i-1];
|
|
int32_t titleLen = titleEnd - title;
|
|
if ( ! hashString ( title, titleLen, &hi) ) return false;
|
|
|
|
// now hash as without title: prefix
|
|
hi.m_prefix = NULL;
|
|
if ( ! hashString ( title, titleLen, &hi) ) return false;
|
|
|
|
return true;
|
|
}
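
// Illustrative sketch (commented out): the title above is hashed twice, once
// with the "title" prefix (so title:foo queries work) and once bare (so body
// queries still match title words at HASHGROUP_TITLE weight). "xd" is a
// stand-in for the XmlDoc.
/*
static bool hashTitleTwice ( XmlDoc *xd , HashTableX *tt ,
                             char *title , int32_t titleLen ) {
        HashInfo hi;
        hi.m_tt          = tt;
        hi.m_useSynonyms = true;
        hi.m_hashGroup   = HASHGROUP_TITLE;
        hi.m_prefix      = "title";
        if ( ! xd->hashString ( title , titleLen , &hi ) ) return false;
        hi.m_prefix      = NULL;
        return xd->hashString ( title , titleLen , &hi );
}
*/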
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
bool XmlDoc::hashBody2 ( HashTableX *tt ) {

	// do not index ANY of the body if it is NOT a permalink and
	// "menu elimination" technology is enabled.
	//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;

	setStatus ( "hashing body" );

	// if more than X% of words are spammed to some degree, index all
	// words with a minimum score
	//int64_t x[] = {30,40,50,70,90};
	//int64_t y[] = {6,8,10,20,30};
	//int32_t mp = getY ( *getSiteNumInlinks8() , x , y , 5 );

	//int32_t nw = m_words.getNumWords();

	// record this
	m_bodyStartPos = m_dist;
	m_bodyStartPosValid = true;

	HashInfo hi;
	hi.m_tt = tt;
	hi.m_desc = "body";
	hi.m_useSynonyms= true;
	hi.m_hashGroup = HASHGROUP_BODY;

	// use NULL for the prefix
	return hashWords (&hi );
}
|
|
|
|
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta keywords" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
|
|
int32_t mklen;
|
|
char *mk = getMetaKeywords( &mklen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta keywords";
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// call XmlDoc::hashString
|
|
return hashString ( mk , mklen , &hi);
|
|
}
|
|
|
|
|
|
// . hash the meta summary, description and keyword tags
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
|
|
|
|
// sanity check
|
|
if ( m_hashedMetas ) { char *xx=NULL ; *xx=0; }
|
|
|
|
// this has been called, note it
|
|
m_hashedMetas = true;
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta summary" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
|
|
int32_t mslen;
|
|
char *ms = getMetaSummary ( &mslen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// update hashing parms
|
|
hi.m_desc = "meta summary";
|
|
// hash it
|
|
if ( ! hashString ( ms , mslen , &hi )) return false;
|
|
|
|
|
|
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription ( &mdlen );
|
|
|
|
// update hashing parms
|
|
hi.m_desc = "meta desc";
|
|
// . TODO: only hash if unique????? set a flag on ht then i guess
|
|
if ( ! hashString ( md , mdlen , &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
//bool XmlDoc::linksToGigablast ( ) {
|
|
// // check m_links for a link to gigablast.com or www.gigablast.com
|
|
// return m_links.linksToGigablast();
|
|
//}
|
|
|
|
bool XmlDoc::searchboxToGigablast ( ) {
|
|
// . they may have a form variable like
|
|
// . <form method=get action=http://www.gigablast.com/cgi/0.cgi name=f>
|
|
return m_xml.hasGigablastForm();
|
|
}
|
|
|
|
// . bring back support for dmoz integration
|
|
// . when clicking on a "search within this category" it does a gbpdcat:<catid>
|
|
// search to capture all pages that have that dmoz category as one of their
|
|
// parent topics
|
|
bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
|
|
|
|
getDmozTitles();
|
|
|
|
|
|
char *titlePtr = ptr_dmozTitles;
|
|
char *sumPtr = ptr_dmozSumms;
|
|
//char *anchPtr = ptr_dmozAnchors;
|
|
|
|
char buf[128];
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
|
|
int32_t *catIds = (int32_t *)ptr_catIds;
|
|
int32_t numCatIds = size_catIds / 4;
|
|
// go through the catIds and hash them
|
|
for (int32_t i = 0; i < numCatIds; i++) {
|
|
// write the catid as a string
|
|
sprintf(buf, "%" UINT32 "", (uint32_t)catIds[i]);
|
|
// term prefix for hashing
|
|
hi.m_prefix = "gbcatid";
|
|
// hash it
|
|
hashString ( buf , gbstrlen(buf) , &hi );
|
|
// we also want to hash the parents
|
|
int32_t currCatId = catIds[i];
|
|
int32_t currParentId = catIds[i];
|
|
int32_t currCatIndex;
|
|
// loop to the Top, Top = 1
|
|
while ( currCatId > 1 ) {
|
|
// hash the parent
|
|
sprintf(buf, "%" UINT32 "", (uint32_t)currParentId);
|
|
hi.m_prefix = "gbpcatid";
|
|
hashString ( buf , gbstrlen(buf), &hi );
|
|
// next cat
|
|
currCatId = currParentId;
|
|
// get the index for this cat
|
|
currCatIndex = g_categories->getIndexFromId(currCatId);
|
|
if ( currCatIndex <= 0 ) break;
|
|
// get the parent for this cat
|
|
currParentId =
|
|
g_categories->m_cats[currCatIndex].m_parentid;
|
|
}
|
|
|
|
// do not hash titles or summaries if "index article content
|
|
// only" parm is on
|
|
//if ( tr->eliminateMenus() ) continue;
|
|
|
|
// hash dmoz title
|
|
hi.m_prefix = NULL;
|
|
// call this DMOZ title as regular title i guess
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
// hash the DMOZ title
|
|
hashString ( titlePtr , gbstrlen(titlePtr), &hi );
|
|
// next title
|
|
titlePtr += gbstrlen(titlePtr) + 1;
|
|
|
|
// hash DMOZ summary
|
|
hi.m_prefix = NULL;
|
|
// call this DMOZ summary as body i guess
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
// hash the DMOZ summary
|
|
hashString ( sumPtr , gbstrlen(sumPtr), &hi );
|
|
// next summary
|
|
sumPtr += gbstrlen(sumPtr) + 1;
|
|
}
|
|
|
|
int32_t numIndCatIds = size_indCatIds / 4;
|
|
int32_t *indCatIds = (int32_t *)ptr_indCatIds;
|
|
// go through the INDIRECT catIds and hash them
|
|
for (int32_t i = 0 ; i < numIndCatIds; i++) {
|
|
|
|
// write the catid as a string
|
|
sprintf(buf, "%" UINT32 "", (uint32_t)indCatIds[i]);
|
|
// use prefix
|
|
hi.m_prefix = "gbicatid";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// hash it
|
|
hashString ( buf , gbstrlen(buf), &hi );
|
|
|
|
// we also want to hash the parents
|
|
int32_t currCatId = indCatIds[i];
|
|
int32_t currParentId = indCatIds[i];
|
|
int32_t currCatIndex;
|
|
// loop to the Top, Top = 1
|
|
while (currCatId > 1) {
|
|
// hash the parent
|
|
sprintf(buf, "%" UINT32 "", (uint32_t)currParentId);
|
|
// new prefix
|
|
hi.m_prefix = "gbipcatid";
|
|
// hash it
|
|
hashString ( buf , gbstrlen(buf), &hi );
|
|
// next cat
|
|
currCatId = currParentId;
|
|
// get the index for this cat
|
|
currCatIndex = g_categories->getIndexFromId(currCatId);
|
|
if ( currCatIndex <= 0 ) break;
|
|
// get the parent for this cat
|
|
currParentId =
|
|
g_categories->m_cats[currCatIndex].m_parentid;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashLanguage ( HashTableX *tt ) {

	setStatus ( "hashing language" );

	int32_t langId = (int32_t)*getLangId();

	char s[32]; // numeric langid
	int32_t slen = sprintf(s, "%" INT32 "", langId );

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gblang";

	if ( ! hashString ( s, slen, &hi ) ) return false;

	// try lang abbreviation
	sprintf(s , "%s ", getLangAbbr(langId) );
	// NOTE: we deliberately keep using the old "slen" (the length of
	// the numeric langid string) here. this is the "broken way",
	// preserved for parsing consistency with already-indexed docs;
	// the abbreviation is hashed with its real length in
	// hashLanguageString() below.
	//sprintf(s , "%s ", getLangAbbr(langId) );
	if ( ! hashString ( s, slen, &hi ) ) return false;

	return true;
}
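
// Illustrative note (commented out): hashLanguage() above plus
// hashLanguageString() below index the language under the gblang prefix both
// as the numeric id and as the abbreviation, so queries of either form are
// possible (the values shown are placeholders):
/*
	gblang:2    // whatever numeric id getLangId() returned
	gblang:en   // the abbreviation from getLangAbbr()
*/
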
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {

	setStatus ( "hashing language string" );

	int32_t langId = (int32_t)*getLangId();

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gblang";

	// try lang abbreviation
	char s[32];
	int32_t slen = sprintf(s , "%s ", getLangAbbr(langId) );
	// go back to broken way to try to fix parsing consistency bug
	if ( ! hashString ( s, slen, &hi ) ) return false;

	return true;
}

bool XmlDoc::hashCountry ( HashTableX *tt ) {

	setStatus ( "hashing country" );

	//uint16_t *cids = getCountryIds();
	//if ( ! cids ) return true;
	//if ( cids == (uint16_t *)-1 ) return false;
	uint16_t *cid = getCountryId();
	if ( ! cid || cid == (uint16_t *)-1 ) return false;

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbcountry";

	for ( int32_t i = 0 ; i < 1 ; i++ ) {
		// get the ith country id
		//int32_t cid = cids[i];
		// convert it
		char buf[32];
		int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
		// hash it
		if ( ! hashString ( buf, blen, &hi ) ) return false;
	}
	// all done
	return true;
}

bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {

	setStatus ( "hashing site num inlinks" );

	char s[32];
	int32_t slen = sprintf(s, "%" INT32 "", (int32_t)*getSiteNumInlinks() );

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbsitenuminlinks";

	// hack test
	// slen = sprintf(s,"%" UINT32 "",
	//	       ((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
	// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);

	return hashString ( s, slen, &hi );
}

bool XmlDoc::hashCharset ( HashTableX *tt ) {

	setStatus ( "hashing charset" );

	char s[128]; // charset string
	int32_t slen;

	// hash the charset as a string
	if ( ! get_charset_str(*getCharset()))
		slen = sprintf(s, "unknown");
	else
		slen = sprintf(s, "%s", get_charset_str(*getCharset()));

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbcharset";

	if ( ! hashString ( s,slen, &hi ) ) return false;

	// hash charset as a number
	slen = sprintf(s, "%d", *getCharset());

	return hashString ( s,slen, &hi ) ;
}
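
// Illustrative note (commented out): the charset is indexed under gbcharset
// both as the string from get_charset_str() and as its numeric id, so either
// query form should match (106 is the IANA id commonly used for UTF-8):
/*
	gbcharset:UTF-8
	gbcharset:106
*/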
|
|
|
|
|
|
// . only hash certain tags (single byte scores and ST_COMMENT)
|
|
// . do not hash clocks, ST_SITE, ST_COMMENT
|
|
// . term = gbtag:blog1 score=0-100
|
|
// . term = gbtag:blog2 score=0-100
|
|
// . term = gbtag:english1 score=0-100
|
|
// . term = gbtag:pagerank1 score=0-100, etc. ...
|
|
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
|
|
// . later we can support query like gbtag:english1>30
|
|
bool XmlDoc::hashTagRec ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing tag rec" );
|
|
|
|
//char *field = "gbtag:";
|
|
//int32_t fieldlen = gbstrlen(field);
|
|
//bool retval = true;
|
|
|
|
// . this tag rec does not have the ST_SITE tag in it to save space
|
|
// . it does not have clocks either?
|
|
TagRec *gr = getTagRec();
|
|
|
|
// count occurrence of each tag id
|
|
//int16_t count [ LAST_TAG ];
|
|
//memset ( count , 0 , 2 * LAST_TAG );
|
|
|
|
// loop over all tags in the title rec
|
|
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get id
|
|
int32_t type = tag->m_type;
|
|
// skip tags we are not supposed to index, like
|
|
// ST_CLOCK, etc. or anything with a dataSize not 1
|
|
if ( ! tag->isIndexable() ) continue;
|
|
// hash these metas below
|
|
//if ( type == ST_META ) continue;
|
|
//if ( tag->isType("meta") ) continue;
|
|
// only single byters. this should have been covered by the
|
|
// isIndexable() function.
|
|
//if ( tag->getTagDataSize() != 1 ) continue;
|
|
// get the name
|
|
char *str = getTagStrFromType ( type );
|
|
// get data size
|
|
//uint8_t *data = (uint8_t *)tag->getTagData();
|
|
// make it a string
|
|
//char dataStr[6];
|
|
//sprintf ( dataStr , "%" INT32 "",(int32_t)*data );
|
|
// skip if has non numbers
|
|
//bool num = true;
|
|
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
|
|
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
|
|
// skip if it has more than just digits, we are not indexing
|
|
// strings at this point
|
|
//if ( ! num ) continue;
|
|
// point to it, should be a NULL terminated string
|
|
char *dataStr = tag->getTagData();
|
|
// skip if number is too big
|
|
//int32_t val = atol ( dataStr );
|
|
// boost by one so we can index "0" score
|
|
//val++;
|
|
// we really only want to index scores from 0-255
|
|
//if ( val > 255 ) continue;
|
|
// no negatives
|
|
//if ( val <= 0 ) continue;
|
|
// count occurrence
|
|
//count [ type ]++;
|
|
// . make the term name to hash after the gbtag:
|
|
// . we want to hash "gbtag:english3" for example, for the
|
|
// ST_ENGLISH tag id.
|
|
char prefix[64];
|
|
// . do not include the count for the first occurrence
|
|
// . follows the gbruleset:36 convention
|
|
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
|
|
//if ( count[type] == 1 )
|
|
sprintf ( prefix , "gbtag%s",str);
|
|
// assume that is good enough
|
|
//char *prefix = tmp;
|
|
// store prefix into m_wbuf so XmlDoc::print() works!
|
|
//if ( m_pbuf ) {
|
|
// int32_t tlen = gbstrlen(tmp);
|
|
// m_wbuf.safeMemcpy(tmp,tlen+1);
|
|
// prefix = m_wbuf.getBuf() - (tlen+1);
|
|
//}
|
|
//else
|
|
// sprintf ( tmp , "gbtag%s%" INT32 "",str,(int32_t)count[type]);
|
|
// "unmap" it so when it is hashed it will have the correct
|
|
// 8-bit score. IndexList.cpp will convert it back to 8 bits
|
|
// in IndexList::set(table), which sets our termlist from
|
|
// this "table".
|
|
//int32_t score = score8to32 ( val );
|
|
// we already incorporate the score as a string when we hash
|
|
// gbtagtagname:tagvalue so why repeat it?
|
|
//int32_t score = 1;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = prefix;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
|
|
// meta is special now
|
|
if ( tag->isType("meta") ) {
|
|
hi.m_prefix = NULL;
|
|
}
|
|
|
|
// hash it. like "gbtagenglish:1" with a score of 1, etc.
|
|
// or "gbtagspam:33" with a score of 33. this would also
|
|
// hash gbtagclock:0xfe442211 type things as well.
|
|
int32_t dlen = gbstrlen(dataStr);
|
|
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashPermalink ( HashTableX *tt ) {

	setStatus ( "hashing is permalink" );

	// put a colon in there so it can't be faked using a meta tag.
	char *s = "0";
	if ( *getIsPermalink() ) s = "1";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbpermalink";

	return hashString ( s,1,&hi );
}
|
|
|
|
|
|
//hash the tag pair vector, the gigabit vector and the sample vector
|
|
bool XmlDoc::hashVectors ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing vectors" );
|
|
|
|
int32_t score = *getSiteNumInlinks8() * 256;
|
|
if ( score <= 0 ) score = 1;
|
|
char buf[32];
|
|
uint32_t h;
|
|
//char *field;
|
|
//char *descr;
|
|
//h = m_tagVector.getVectorHash();
|
|
uint32_t tph = *getTagPairHash32();
|
|
int32_t blen = sprintf(buf,"%" UINT32 "", tph);
|
|
//field = "gbtagvector";
|
|
//descr = "tag vector hash";
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbtagvector";
|
|
hi.m_desc = "tag vector hash";
|
|
hi.m_shardByTermId = true;
|
|
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen, &hi ) ) return false;
|
|
|
|
h = *getGigabitVectorScorelessHash();
|
|
blen = sprintf(buf,"%" UINT32 "",(uint32_t)h);
|
|
// update hash parms
|
|
hi.m_prefix = "gbgigabitvector";
|
|
hi.m_desc = "gigabit vector hash";
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi) ) return false;
|
|
|
|
// . dup checking uses the two hashes above, not this hash!!! MDW
|
|
// . i think this vector is just used to see if the page changed
|
|
// significantly since last spidering
|
|
// . it is used by getPercentChanged() and by Dates.cpp
|
|
// . sanity check
|
|
//if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
|
|
//int32_t *pc = m_pageSampleVec;
|
|
//h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE);
|
|
//blen = sprintf(buf,"%" UINT32 "",(int32_t unsigned int)h);
|
|
//field = "gbsamplevector";
|
|
//descr = "sample vector hash";
|
|
// this returns false on failure
|
|
//if ( ! hashString ( tt,buf,blen,score,field,descr) )
|
|
// return false;
|
|
|
|
// . hash combined for Dup Detection
|
|
// . must match XmlDoc::getDupList ( );
|
|
//uint64_t h1 = m_tagVector.getVectorHash();
|
|
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
|
|
//uint64_t h64 = hash64 ( h1 , h2 );
|
|
|
|
// take this out for now
|
|
/*
|
|
uint64_t *dh = getDupHash ( );
|
|
blen = sprintf(buf,"%" UINT64 "", *dh );//h64);
|
|
//field = "gbduphash";
|
|
//descr = "dup vector hash";
|
|
// update hash parms
|
|
hi.m_prefix = "gbduphash";
|
|
hi.m_desc = "dup vector hash";
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
*/
|
|
|
|
// hash the wikipedia docids we match
|
|
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
|
|
for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) {
|
|
blen = sprintf(buf,"%" UINT64 "",ptr_wikiDocIds[i]);
|
|
// convert to int32_t
|
|
//int32_t convScore = (int32_t)ptr_wikiScores[i];
|
|
// get score
|
|
//uint32_t ws = score8to32 ( convScore );
|
|
// update hash parms
|
|
hi.m_prefix = "gbwikidocid";
|
|
hi.m_desc = "wiki docid";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashAds ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing ad ids" );
|
|
|
|
for(int32_t i = 0; i < size_adVector / 8 ; i++) {
|
|
int32_t score = *getSiteNumInlinks8() * 256;
|
|
if ( score <= 0 ) score = 1;
|
|
char buf[128];
|
|
char *field;
|
|
char *descr;
|
|
//buflen = snprintf(buf,128,"%s-%s",
|
|
// m_adProvider[i],m_adClient[i]);
|
|
snprintf(buf,128,"%" UINT64 "",ptr_adVector[i] );
|
|
int32_t bufLen = gbstrlen(buf);
|
|
field = "gbad";
|
|
descr = "ad provider and id";
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbad";
|
|
hi.m_desc = "ad provider and id";
|
|
//log(LOG_WARN, "build: url %s indexing ad termid %s:%s",
|
|
// getFirstUrl()->getUrl(), field, buf);
|
|
//this returns false on failure
|
|
if ( ! hashString ( buf,bufLen,&hi ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Url *XmlDoc::getBaseUrl ( ) {
	if ( m_baseUrlValid ) return &m_baseUrl;
	// need this
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
	// no longer set addWWW to true since tmblr.co has an IP but
	// www.tmblr.co does not
	m_baseUrl.set ( cu , false ); // addWWW = false
	// look for base url
	for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
		// 12 is the <base href> tag id
		if ( xml->getNodeId ( i ) != TAG_BASE ) continue;
		// get the href field of this base tag
		int32_t linkLen;
		char *link = (char *) xml->getString ( i, "href", &linkLen );
		// skip if not valid
		if ( ! link || linkLen == 0 ) continue;
		// set base to it. addWWW=false here as well now.
		m_baseUrl.set(link, linkLen, false);//true);
		break;
	}
	// fix invalid <base href="/" target="_self"/> tag
	if ( m_baseUrl.getHostLen () <= 0 || m_baseUrl.getDomainLen() <= 0 )
		m_baseUrl.set ( cu , false );

	m_baseUrlValid = true;
	return &m_baseUrl;
}
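
// Illustrative sketch (commented out): how the base url computed above is
// typically consumed, e.g. by hashSubmitUrls() below. a relative link is
// resolved against the <base href> (or the page url when the base tag is
// missing or invalid). the url literals here are hypothetical.
/*
static void resolveAgainstBase ( Url *baseUrl ) {
        // say baseUrl is http://example.com/dir/ and a form has
        // action="post.cgi"
        Url abs;
        abs.set ( baseUrl , "post.cgi" , 8 , true );
        // abs should now hold http://example.com/dir/post.cgi
}
*/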
|
|
|
|
// hash gbhasthumbnail:0|1
|
|
bool XmlDoc::hashImageStuff ( HashTableX *tt ) {

	setStatus ("hashing image stuff");

	char *val = "0";
	char **td = getThumbnailData();
	if ( *td ) val = "1";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbhasthumbnail";
	hi.m_desc = "has a thumbnail";

	// this returns false on failure
	if ( ! hashString ( val,1,&hi ) ) return false;

	return true;
}
|
|
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {

	setStatus ("hashing isadult");

	char *ia = getIsAdult();
	// this should not block or return error! should have been
	// set in prepareToMakeTitleRec() before hashAll() was called!
	if ( ! ia || ia == (void *)-1 ) {char *xx=NULL;*xx=0; }

	// index gbisadult:1 if adult or gbisadult:0 if not
	char *val;
	if ( *ia ) val = "1";
	else val = "0";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbisadult";
	hi.m_desc = "is document adult content";

	// this returns false on failure
	if ( ! hashString ( val,1,&hi ) ) return false;

	return true;
}
|
|
|
|
// hash destination urls for embedded gb search boxes
|
|
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {

	setStatus ( "hashing submit urls" );

	Url *baseUrl = getBaseUrl();
	if ( ! baseUrl || baseUrl == (Url *)-1) { char*xx=NULL;*xx=0;}

	for ( int32_t i = 0 ; i < m_xml.getNumNodes() ; i++ ) {
		// Find forms
		if ( m_xml.getNodeId(i) != TAG_FORM ) continue;
		if ( m_xml.isBackTag(i) ) continue;
		int32_t score = *getSiteNumInlinks8() * 256;
		if ( score <= 0 ) score = 1;
		int32_t len;
		char *s = m_xml.getString ( i , "action" , &len );
		if (!s || len == 0) continue;
		Url url; url.set(baseUrl, s, len, true);

		char *buf = url.getUrl();
		int32_t blen = url.getUrlLen();

		// update hash parms
		HashInfo hi;
		hi.m_tt = tt;
		hi.m_hashGroup = HASHGROUP_INTAG;
		hi.m_prefix = "gbsubmiturl";
		hi.m_desc = "submit url for form";

		// this returns false on failure
		if ( ! hashString ( buf,blen,&hi ) ) return false;
	}
	return true;
}
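
// Illustrative note (commented out): each <form action=...> found above is
// indexed under the gbsubmiturl prefix as an absolute url, so a query like
// the following finds pages embedding a search box that posts to that cgi:
/*
	gbsubmiturl:http://www.gigablast.com/cgi/0.cgi
*/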
|
|
|
|
|
|
//
|
|
// STUFF IMPORTED FROM INDEXLIST.CPP
|
|
//
|
|
|
|
// we also assume all scores are above 256, too
|
|
uint8_t score32to8 ( uint32_t score ) {
	// ensure score is > 0... no! not any more
	if ( score <= 0 ) return (unsigned char) 0;
	// extremely large scores need an adjustment to avoid wrapping
	if ( score < (uint32_t)0xffffffff - 128 )
		score += 128;
	// scores are multiplied by 256 to preserve fractions, so undo that
	score /= 256;
	// ensure score is > 0
	if ( score <= 0 ) return (unsigned char) 1;
	// if score < 128 return it now
	if ( score < 128 ) return (unsigned char) score;
	// now shrink it so it's now from 1 upwards
	score -= 127;

	// . take NATURAL log of score now
	// . PROBLEM: for low scores logscore may increase by close to 1.0
	//   for a score increase of 1.0. and since s_maxscore is about 22.0
	//   we end up moving 1.0/22.0 of 128 total pts causing a jump of
	//   2 or more score points!! oops!!! to fix, let's add 10 pts
	//   to the score
	score += 10;
	double logscore = ::log ( (double)score );
	// now the max it can be
	//double maxscore = ::log ( (double)(0x00ffffff - 127));
	static double s_maxscore = -1.0;
	static double s_minscore = -1.0;
	if ( s_maxscore == -1.0 ) {
		uint32_t max = ((0xffffffff + 0)/256) - 127 + 10;
		uint32_t min = ( 128 ) - 127 + 10;
		s_maxscore = ::log((double)max);
		s_minscore = ::log((double)min);
		// adjust
		s_maxscore -= s_minscore;
	}
	// adjust it
	logscore -= s_minscore;
	// scale it into [126,0] (add .5 for rounding)
	double scaled = (logscore* 127.0) / s_maxscore + .5;
	// sanity check
	if ( (unsigned char)scaled >= 128 ) { char *xx=NULL;*xx=0; }
	// . go into the 8 bit score now
	// . set the hi bit so they know we took its log
	unsigned char score8 = (unsigned char)scaled | 128;
	return score8;
}
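
// Illustrative note (commented out): score32to8() is linear for small scores
// and logarithmic above them. after the +128 and /256 steps, anything that
// lands below 128 is returned as-is (1..127); larger values are log-scaled
// into 128..255 with the high bit set to mark the log range. score8to32()
// below inverts this with the s_scoreMap lookup table.
/*
	uint8_t  small = score32to8 ( 256 );        // linear range, returns 1
	uint8_t  big   = score32to8 ( 2000000000 ); // log range, >= 128
	uint32_t back  = score8to32 ( big );        // approximate inverse
*/
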
// for score8to32() below
static uint32_t s_scoreMap[] = {
	0UL,1UL,385UL,641UL,897UL,1153UL,1409UL,1665UL,
	1921UL,2177UL,2433UL,2689UL,2945UL,3201UL,3457UL,3713UL,
	3969UL,4225UL,4481UL,4737UL,4993UL,5249UL,5505UL,5761UL,
	6017UL,6273UL,6529UL,6785UL,7041UL,7297UL,7553UL,7809UL,
	8065UL,8321UL,8577UL,8833UL,9089UL,9345UL,9601UL,9857UL,
	10113UL,10369UL,10625UL,10881UL,11137UL,11393UL,11649UL,11905UL,
	12161UL,12417UL,12673UL,12929UL,13185UL,13441UL,13697UL,13953UL,
	14209UL,14465UL,14721UL,14977UL,15233UL,15489UL,15745UL,16001UL,
	16257UL,16513UL,16769UL,17025UL,17281UL,17537UL,17793UL,18049UL,
	18305UL,18561UL,18817UL,19073UL,19329UL,19585UL,19841UL,20097UL,
	20353UL,20609UL,20865UL,21121UL,21377UL,21633UL,21889UL,22145UL,
	22401UL,22657UL,22913UL,23169UL,23425UL,23681UL,23937UL,24193UL,
	24449UL,24705UL,24961UL,25217UL,25473UL,25729UL,25985UL,26241UL,
	26497UL,26753UL,27009UL,27265UL,27521UL,27777UL,28033UL,28289UL,
	28545UL,28801UL,29057UL,29313UL,29569UL,29825UL,30081UL,30337UL,
	30593UL,30849UL,31105UL,31361UL,31617UL,31873UL,32129UL,32385UL,
	32641UL,32897UL,33488UL,33842UL,34230UL,34901UL,35415UL,35979UL,
	36598UL,37278UL,38025UL,39319UL,40312UL,41404UL,43296UL,44747UL,
	46343UL,48098UL,51138UL,53471UL,56037UL,58859UL,61962UL,65374UL,
	71287UL,75825UL,80816UL,86305UL,92342UL,98982UL,110492UL,119326UL,
	129042UL,139728UL,151481UL,171856UL,187496UL,204699UL,223622UL,244437UL,
	267333UL,307029UL,337502UL,371022UL,407893UL,448450UL,493062UL,570408UL,
	629783UL,695095UL,766938UL,845965UL,982981UL,1088163UL,1203862UL,1331130UL,
	1471124UL,1625117UL,1892110UL,2097072UL,2322530UL,2570533UL,2843335UL,3143416UL,
	3663697UL,4063102UL,4502447UL,4985726UL,5517332UL,6439034UL,7146599UL,7924919UL,
	8781070UL,9722836UL,10758778UL,12554901UL,13933735UL,15450451UL,17118838UL,18954063UL,
	20972809UL,24472927UL,27159874UL,30115514UL,33366717UL,36943040UL,43143702UL,47903786UL,
	53139877UL,58899576UL,65235244UL,72204478UL,84287801UL,93563849UL,103767501UL,114991518UL,
	127337936UL,140918995UL,164465962UL,182542348UL,202426372UL,224298798UL,248358466UL,290073346UL,
	322096762UL,357322519UL,396070851UL,438694015UL,485579494UL,566869982UL,629274552UL,697919578UL,
	773429105UL,856489583UL,947856107UL,1106268254UL,1227877095UL,1361646819UL,1508793514UL,1670654878UL,
	1951291651UL,2166729124UL,2403710344UL,2664389686UL,2951136962UL,3266558965UL,3813440635UL,4233267317UL
};
|
|
|
|
uint32_t score8to32 ( uint8_t score8 ) {
|
|
|
|
/*
|
|
int32_t test = score32to8((uint32_t)0xffffffff);
|
|
static bool s_set = false;
|
|
if ( ! s_set ) {
|
|
s_set = true;
|
|
uint8_t lasts = 0;
|
|
int32_t step = 128;
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
for ( uint64_t i=1 ; i<(uint32_t)0xffffffff ; i+=step) {
|
|
// get the score
|
|
uint8_t s = score32to8(i);
|
|
// print it out now
|
|
if ( s != lasts ) {
|
|
fprintf(stderr,"\t%" UINT32 "UL,\n",i);
|
|
}
|
|
// if no change, skip it
|
|
if (lasts != 0 && s == lasts ) {
|
|
if ( s > 128 )
|
|
step = (int32_t)((float)step * 1.1);
|
|
continue;
|
|
}
|
|
// otherwise set it
|
|
s_scoreMap[s] = i;
|
|
// reset
|
|
lasts = s;
|
|
}
|
|
// sanity test
|
|
for ( int32_t j = 1 ; j < 256 ; j++ ) {
|
|
uint32_t big = s_scoreMap[j];
|
|
if ( score32to8(big) != j ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
int64_t end = gettimeofdayInMilliseconds();
|
|
logf(LOG_DEBUG,
|
|
"gb: took %" INT64 " ms to build score table.",
|
|
end-start);
|
|
|
|
}
|
|
// sanity test
|
|
static bool s_set = false;
|
|
if ( ! s_set ) {
|
|
for ( int32_t j = 1 ; j < 256 ; j++ ) {
|
|
uint32_t big = s_scoreMap[j];
|
|
uint8_t tt;
|
|
tt = score32to8(big);
|
|
if ( tt != j ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
s_set = true;
|
|
}
|
|
*/
|
|
|
|
return(s_scoreMap[score8]);
|
|
}
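
// Illustrative note (commented out): the disabled generation/sanity code
// above documents the property the table is built for -- for every 8-bit
// score j in 1..255, s_scoreMap[j] is (roughly, given the generation step
// size) the smallest 32-bit score that compresses back to j:
/*
	for ( int32_t j = 1 ; j < 256 ; j++ ) {
		uint32_t big = s_scoreMap[j];
		// expect: score32to8 ( big ) == j
	}
*/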
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//
|
|
// Summary/Title generation for Msg20
|
|
//
|
|
////////////////////////////////////////////////////////////
|
|
|
|
void XmlDoc::set20 ( Msg20Request *req ) {
	// clear it all out
	reset();
	// this too
	m_reply.reset();

	m_pbuf = NULL;//pbuf;
	m_niceness = req->m_niceness;
	// remember this
	m_req = req;
	// and this!
	//m_coll = req->ptr_coll;
	//setCollNum ( req->ptr_coll );
	m_collnum = req->m_collnum;
	m_collnumValid = true;
	// make this stuff valid
	if ( m_req->m_docId > 0 ) {
		m_docId = m_req->m_docId;
		m_docIdValid = true;
	}
	// set url too if we should
	if ( m_req->size_ubuf > 1 )
		setFirstUrl ( m_req->ptr_ubuf , false );
}
|
|
|
|
#define MAX_LINK_TEXT_LEN 512
|
|
#define MAX_RSSITEM_SIZE 30000
|
|
|
|
void getMsg20ReplyWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
	// return if it blocked
	if ( THIS->getMsg20Reply ( ) == (void *)-1 ) return;
	// otherwise, all done, call the caller callback
	if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
	else THIS->m_callback2 ( THIS->m_state );
}
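
// Illustrative note (commented out): the return convention used by
// getMsg20Reply() and most getters in this file -- (void *)-1 means it
// blocked and the wrapper above will be re-entered via m_masterLoop; NULL
// means error with g_errno set; any other pointer is the finished result.
/*
	Msg20Reply *r = xd->getMsg20Reply();
	if ( r == (Msg20Reply *)-1 ) return;  // blocked, callback fires later
	if ( ! r ) return;                    // error, g_errno is set
	// otherwise r is the completed reply
*/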
|
|
|
|
// . returns NULL with g_errno set on error
|
|
// . returns -1 if blocked
|
|
Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
|
|
|
// return it right away if valid
|
|
if ( m_replyValid ) return &m_reply;
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block, this callback will be called
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getMsg20ReplyWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// used by Msg20.cpp to time this XmlDoc::getMsg20Reply() function
|
|
if ( ! m_startTimeValid && isClockInSync() ) {
|
|
m_startTime = gettimeofdayInMilliseconds();
|
|
m_startTimeValid = true;
|
|
}
|
|
|
|
	// caller should have the callback set
	if ( ! m_callback1 && ! m_callback2 ) { char *xx=NULL;*xx=0; }

	//char safeStack[100000];
	//safeStack[0] = 0;
	//safeStack[90000] = 0;

	// shortcut
	Msg20Reply *reply = &m_reply;
|
|
|
|
m_niceness = m_req->m_niceness;
|
|
|
|
m_collnum = m_req->m_collnum;//cr->m_collnum;
|
|
m_collnumValid = true;
|
|
|
|
//char *coll = m_req->ptr_coll;
|
|
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
|
if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }
|
|
|
|
|
|
//CollectionRec *cr = getCollRec();
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// set this important member var
|
|
//if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll));
|
|
// return NULL with g_errno set on error
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// . cache it for one hour
|
|
// . this will set our ptr_ and size_ member vars
|
|
char **otr = getOldTitleRec ( );
|
|
if ( ! otr || otr == (void *)-1 ) return (Msg20Reply *)otr;
|
|
|
|
// must have a title rec in titledb
|
|
if ( ! *otr ) { g_errno = ENOTFOUND; return NULL; }
|
|
|
|
// sanity
|
|
if ( *otr != m_oldTitleRec ) { char *xx=NULL;*xx=0; }
|
|
|
|
// what is this?
|
|
int32_t maxSize = 0;
|
|
|
|
// . set our ptr_ and size_ member vars from it after uncompressing
|
|
// . returns false and sets g_errno on error
|
|
if ( ! m_setTr ) {
|
|
// . this completely resets us
|
|
// . this returns false with g_errno set on error
|
|
bool status = set2( *otr, maxSize, cr->m_coll, NULL,
|
|
m_niceness);
|
|
// sanity check
|
|
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// if there was an error, g_errno should be set.
|
|
if ( ! status ) return NULL;
|
|
m_setTr = true;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// init
|
|
reply->m_nextMerged = NULL;
|
|
|
|
reply->m_collnum = m_collnum;
|
|
|
|
// MsgE uses this one
|
|
if ( m_req->m_getTitleRec ) {
|
|
// this is the original compressed titleRec, preceeded
|
|
// by key and dataSize and followed by the data
|
|
reply-> ptr_tr = m_oldTitleRec;
|
|
reply->size_tr = m_oldTitleRecSize;
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
|
|
// if they provided a query with gbfacet*: terms then we have
|
|
// to get those facet values.
|
|
if ( ! m_gotFacets ) {
|
|
// only do this once
|
|
m_gotFacets = true;
|
|
// get facet term
|
|
char *qs = m_req->ptr_qbuf;
|
|
facetPrintLoop:
|
|
for ( ; qs && *qs ; qs++ ) {
|
|
if ( qs[0] != 'g' ) continue;
|
|
if ( qs[1] != 'b' ) continue;
|
|
if ( qs[2] != 'f' ) continue;
|
|
if ( strncasecmp(qs,"gbfacet",7) ) continue;
|
|
qs += 7;
|
|
// gbfacetstr: gbfacetint: gbfacetfloat:
|
|
if ( strncasecmp(qs,"str:" ,4) == 0 ) qs += 4;
|
|
else if ( strncasecmp(qs,"int:" ,4) == 0 ) qs += 4;
|
|
else if ( strncasecmp(qs,"float:",6) == 0 ) qs += 6;
|
|
else continue;
|
|
break;
|
|
}
|
|
// if we had a facet, get the values it has in the doc
|
|
if ( qs && *qs ) {
|
|
// need this for storeFacetValues() if we are json
|
|
if ( m_contentType == CT_JSON ||
|
|
// spider status docs are really json
|
|
m_contentType == CT_STATUS ) {
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1)
|
|
return (Msg20Reply *)jp;
|
|
}
|
|
if ( m_contentType == CT_HTML ||
|
|
m_contentType == CT_XML ) {
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml==(void *)-1)
|
|
return (Msg20Reply *)xml;
|
|
}
|
|
// find end of it
|
|
char *e = qs;
|
|
for ( ; *e && ! is_wspace_a(*e) ; e++ );
|
|
// tmp null it
|
|
char c = *e; *e = '\0';
|
|
// this is zero if unspecifed
|
|
FacetValHash_t fvh = m_req->m_facetValHash;
|
|
// . this will store facetField/facetValue pairs
|
|
// . stores into safebuf, m_tmpBuf2
|
|
// . it will terminate all stored strings with \0
|
|
// . we check meta tags for html docs
|
|
// . otherwise we check xml/json doc fields
|
|
// . returns false with g_errno set on error
|
|
bool ret = storeFacetValues ( qs , &m_tmpBuf2 , fvh ) ;
|
|
// revert the \0
|
|
*e = c;
|
|
// return NULL with g_errno set on error
|
|
if ( ! ret ) return NULL;
|
|
// advance
|
|
qs = e;
|
|
// do another one
|
|
goto facetPrintLoop;
|
|
}
|
|
// assign
|
|
reply-> ptr_facetBuf = m_tmpBuf2.getBufStart();
|
|
reply->size_facetBuf = m_tmpBuf2.length();
|
|
}
|
|
|
|
if ( m_req->m_justGetFacets ) {
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
if ( m_req->m_getTermListBuf ) {
|
|
// ensure content is recycled from title rec
|
|
m_recycleContent = true;
|
|
//xd->m_recycleLinkInfo = true;
|
|
// only get posdb keys really for this stuff
|
|
m_useTitledb = false;
|
|
m_useTagdb = false;
|
|
m_useClusterdb = false;
|
|
m_useSpiderdb = false;
|
|
m_useLinkdb = false;
|
|
// time it
|
|
if ( m_tlbufTimer == 0 )
|
|
m_tlbufTimer = gettimeofdayInMilliseconds();
|
|
// . shit limit content for speed!!!
|
|
// . this is for getting matching queries/relatedqueries
|
|
// anyway, so should be ok
|
|
if ( size_utf8Content > 150000 ) {
|
|
char *p = ptr_utf8Content + 150000 - 1;
|
|
char *pstart = ptr_utf8Content;
|
|
// back up until we hit punct
|
|
for ( ; p > pstart ; p-- )
|
|
if ( is_punct_utf8(p) ) break;
|
|
// set new size then
|
|
*p = '\0';
|
|
size_utf8Content = p - pstart + 1;
|
|
}
|
|
// hack: should be sorted by lower 32bits of termids
|
|
// so handleRequest8e does not have to sort before doing
|
|
// its query matching algo with queries in g_qbuf.
|
|
// but these termlists are really mostly used for doing
|
|
// the gbdocid:|xxxx queries in handleRequest8e.
|
|
SafeBuf *tbuf = getTermListBuf();
|
|
if ( ! tbuf || tbuf == (void *)-1 ) return (Msg20Reply *)tbuf;
|
|
SafeBuf *tibuf = getTermId32Buf();
|
|
if ( ! tibuf || tibuf == (void *)-1)return (Msg20Reply *)tibuf;
|
|
// time it
|
|
int64_t took = gettimeofdayInMilliseconds() - m_tlbufTimer;
|
|
log("seo: tlistbuf gen took %" INT64 " ms for docid %" INT64 "",
|
|
took,m_docId);
|
|
// just that
|
|
reply-> ptr_tlistBuf = tbuf->getBufStart();
|
|
reply->size_tlistBuf = tbuf->length();
|
|
reply-> ptr_tiBuf = tibuf->getBufStart();
|
|
reply->size_tiBuf = tibuf->length();
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
// lookup the tagdb rec fresh if setting for a summary. that way we
|
|
// can see if it is banned or not. but for getting m_getTermListBuf
|
|
// and stuff above, skip the tagrec lookup!
|
|
// save some time when SPIDERING/BUILDING by skipping fresh
|
|
// tagdb lookup and using tags in titlerec
|
|
if ( m_req && ! m_req->m_getLinkText && ! m_checkedUrlFilters )
|
|
m_tagRecDataValid = false;
|
|
|
|
// set and validate member vars
|
|
//if ( ! m_setFromTitleRec )
|
|
// // return NULL with g_errno set on error
|
|
// if ( ! set ( tr , NULL , m_niceness ) ) return NULL;
|
|
|
|
// if shard responsible for tagrec is dead, then
|
|
// just recycle!
|
|
if ( m_req && ! m_checkedUrlFilters && ! m_tagRecDataValid ) {
|
|
char *site = getSite();
|
|
TAGDB_KEY tk1 = g_tagdb.makeStartKey ( site );
|
|
TAGDB_KEY tk2 = g_tagdb.makeDomainStartKey ( &m_firstUrl );
|
|
uint32_t shardNum1 = g_hostdb.getShardNum(RDB_TAGDB,&tk1);
|
|
uint32_t shardNum2 = g_hostdb.getShardNum(RDB_TAGDB,&tk2);
|
|
// shardnum1 and shardnum2 are often different!
|
|
// log("db: s1=%i s2=%i",(int)shardNum1,(int)shardNum2);
|
|
if ( g_hostdb.isShardDead ( shardNum1 ) ) {
|
|
log("query: skipping tagrec lookup for dead shard "
|
|
"# %" INT32 ""
|
|
,shardNum1);
|
|
m_tagRecDataValid = true;
|
|
}
|
|
if ( g_hostdb.isShardDead ( shardNum2 ) && m_firstUrlValid ) {
|
|
log("query: skipping tagrec lookup for dead shard "
|
|
"# %" INT32 ""
|
|
,shardNum2);
|
|
m_tagRecDataValid = true;
|
|
}
|
|
}
|
|
|
|
|
|
// if we are showing sites that have been banned in tagdb, we dont
|
|
// have to do a tagdb lookup. that should speed things up.
|
|
TagRec *gr = NULL;
|
|
if ( cr && cr->m_doTagdbLookups ) {
|
|
gr = getTagRec();
|
|
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
|
|
}
|
|
|
|
//reply-> ptr_tagRec = (char *)gr;
|
|
//reply->size_tagRec = gr->getSize();
|
|
|
|
// we use this instead of nowGlobal
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this should be valid, it is stored in title rec
|
|
if ( m_contentHash32Valid ) reply->m_contentHash32 = m_contentHash32;
|
|
else reply->m_contentHash32 = 0;
|
|
|
|
// if this page is potential spam, toss it!
|
|
//char *isSpam = getIsSpam();
|
|
//if ( ! isSpam || isSpam == (char *)-1 ) return (Msg20Reply *)isSpam;
|
|
|
|
if ( ! m_checkedUrlFilters ) {
|
|
// do it
|
|
//int32_t *rn = getRegExpNum2(-1);
|
|
//if ( ! rn || rn == (int32_t *)-1 ) return (Msg20Reply *)rn;
|
|
// do not re-check
|
|
m_checkedUrlFilters = true;
|
|
|
|
// a non-www url?
|
|
/*
|
|
|
|
now we allow domain-only urls in the index, so this is
|
|
hurting us...
|
|
|
|
if ( ! m_req->m_getLinkText ) {
|
|
Url tmp;
|
|
tmp.set ( ptr_firstUrl );
|
|
if ( tmp.getHostLen() == tmp.getDomainLen() ) {
|
|
// set m_errno
|
|
reply->m_errno = EDOCFILTERED;
|
|
// tmp debug
|
|
log("xmldoc: filtering non www url %s",
|
|
ptr_firstUrl);
|
|
// and this
|
|
reply->m_isFiltered = true;
|
|
// give back the url at least
|
|
reply->ptr_ubuf = getFirstUrl()->getUrl();
|
|
reply->size_ubuf =getFirstUrl()->getUrlLen()+1;
|
|
// validate
|
|
m_replyValid = true;
|
|
// and return
|
|
return reply;
|
|
}
|
|
}
|
|
*/
|
|
|
|
// get this
|
|
//time_t nowGlobal = getTimeGlobal();
|
|
// get this
|
|
SpiderRequest sreq;
|
|
SpiderReply srep;
|
|
setSpiderReqForMsg20 ( &sreq , &srep );//, *isSpam );
|
|
int32_t spideredTime = getSpideredTime();
|
|
int32_t langIdArg = -1;
|
|
if ( m_langIdValid ) langIdArg = m_langId;
|
|
// get it
|
|
int32_t ufn;
|
|
ufn=::getUrlFilterNum(&sreq,&srep,spideredTime,true,
|
|
m_niceness,cr,
|
|
false, // isOutlink?
|
|
NULL ,
|
|
langIdArg);
|
|
// sanity check
|
|
if ( ufn < 0 ) {
|
|
log("msg20: bad url filter for url %s", sreq.m_url);
|
|
}
|
|
|
|
// save it
|
|
reply->m_urlFilterNum = ufn;
|
|
// get spider priority if ufn is valid
|
|
int32_t pr = 0;
|
|
//if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
|
|
if ( cr->m_forceDelete[ufn] ) pr = -3;
|
|
|
|
// this is an automatic ban!
|
|
if ( gr && gr->getLong("manualban",0))
|
|
pr=-3;//SPIDER_PRIORITY_BANNED;
|
|
|
|
// is it banned
|
|
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
|
|
// set m_errno
|
|
reply->m_errno = EDOCBANNED;
|
|
// and this
|
|
reply->m_isBanned = true;
|
|
}
|
|
|
|
//
|
|
// for now always allow it until we can fix this better
|
|
// we probably should assume NOT filtered unless it matches
|
|
// a string match only url filter... but at least we will
|
|
// allow it to match "BANNED" filters for now...
|
|
//
|
|
pr = 0;
|
|
|
|
|
|
// if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
|
|
// // set m_errno
|
|
// reply->m_errno = EDOCFILTERED;
|
|
// // and this
|
|
// reply->m_isFiltered = true;
|
|
// }
|
|
// done if we are
|
|
if ( reply->m_errno && ! m_req->m_showBanned ) {
|
|
// give back the url at least
|
|
reply->ptr_ubuf = getFirstUrl()->getUrl();
|
|
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// a special hack for XmlDoc::getRecommendedLinksBuf() so we exclude
|
|
// links that link to the main url's site/domain as well as a
|
|
// competitor url (aka related docid)
|
|
Links *links = NULL;
|
|
if ( m_req->m_ourHostHash32 || m_req->m_ourDomHash32 ) {
|
|
links = getLinks();
|
|
if ( ! links || links==(Links *)-1) return (Msg20Reply *)links;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// truncate content length if we should
|
|
// this was hurting our linkdb lookups! do not do it for those!
|
|
/*
|
|
if ( size_utf8Content > cr->m_contentLenMaxForSummary &&
|
|
// fix for link text fetching!
|
|
! req->m_getLinkText ) {
|
|
logf(LOG_DEBUG,"summary: truncating doc of len %" INT32 " to %" INT32 " for "
|
|
"generating summary",
|
|
size_utf8Content,cr->m_contentLenMaxForSummary);
|
|
size_utf8Content = cr->m_contentLenMaxForSummary ;
|
|
// null term just in case
|
|
ptr_utf8Content[size_utf8Content-1] = '\0';
|
|
}
|
|
*/
|
|
// do they want a summary?
|
|
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
|
|
char *hsum = getHighlightedSummary();
|
|
|
|
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
|
|
//Summary *s = getSummary();
|
|
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
|
|
//int32_t sumLen = m_finalSummaryBuf.length();
|
|
// is it size and not length?
|
|
int32_t hsumLen = 0;
|
|
// seems like it can return 0x01 if none...
|
|
if ( hsum == (char *)0x01 ) hsum = NULL;
|
|
// get len. this is the HIGHLIGHTED summary so it is ok.
|
|
if ( hsum ) hsumLen = gbstrlen(hsum);
|
|
// must be \0 terminated. not any more, it can be a subset
|
|
// of a larger summary used for deduping
|
|
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
|
|
// assume size is 0
|
|
//int32_t sumSize = 0;
|
|
// include the \0 in size
|
|
//if ( sum ) sumSize = sumLen + 1;
|
|
// do not get any more than "me" lines/excerpts of summary
|
|
//int32_t max = m_req->m_numSummaryLines;
|
|
// grab stuff from it!
|
|
//reply->m_proximityScore = s->getProximityScore();
|
|
reply-> ptr_displaySum = hsum;//s->getSummary();
|
|
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
|
|
// this is unhighlighted for deduping, and it might be longer
|
|
// . seems like we are not using this for deduping but using
|
|
// the gigabit vector in Msg40.cpp, so take out for now
|
|
//reply-> ptr_dedupSum = s->m_summary;
|
|
//reply->size_dedupSum = s->m_summaryLen+1;
|
|
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
|
|
//reply->m_diversity = s->getDiversity();
|
|
}
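	// Note: the reply only stores a pointer here; ptr_displaySum points
	// into m_finalSummaryBuf, which is owned by this XmlDoc, so that
	// buffer has to stay alive until the Msg20Reply is serialized and
	// sent (the same pattern holds for the other ptr_* members below).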
|
|
|
|
reply->m_numAlnumWords = 0;
|
|
if ( m_wordsValid )
|
|
reply->m_numAlnumWords = m_words.m_numAlnumWords;
|
|
|
|
// . we filter out search results that do not have all the query terms
|
|
// . Matches.cpp checks the link text, dmoz, etc. for all query terms
|
|
	// . it must get into the results from indexdb corruption?
|
|
// . this filtering method is/was known as the "BIG HACK"
|
|
// . We also make sure that matches aren't based on
|
|
// . "anomalous" link text, where a doc has so many link texts
|
|
// . that most common dictionary terms appear in or around
|
|
// . a link to the site.
|
|
if ( m_req->size_qbuf > 1 ) {
|
|
Matches *mm = getMatches();
|
|
int32_t numInlinks = getLinkInfo1()->getNumLinkTexts( );
|
|
reply->m_hasAllQueryTerms = mm->docHasQueryTerms(numInlinks);
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// copy the link info stuff?
|
|
if ( ! m_req->m_getLinkText ) {
|
|
reply->ptr_linkInfo = (char *)ptr_linkInfo1;
|
|
reply->size_linkInfo = size_linkInfo1;
|
|
}
|
|
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
bool getThatTitle = true;
|
|
if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
|
|
if ( reply->ptr_tbuf ) getThatTitle = false;
|
|
// if steve's requesting the inlink summary we will want to get
|
|
// the title of each linker even if they are spammy!
|
|
// only get title here if NOT getting link text otherwise
|
|
// we only get it down below if not a spammy voter, because
|
|
// this sets the damn slow sections class
|
|
if ( m_req->m_getLinkText &&
|
|
! m_useSiteLinkBuf &&
|
|
! m_usePageLinkBuf &&
|
|
// m_pbuf is used by pageparser.cpp now, not the other two things
|
|
// above this.
|
|
! m_pbuf )
|
|
getThatTitle = false;
|
|
|
|
// if steve is getting the inlinks, bad and good, for displaying
|
|
// then get the title here now... otherwise, if we are just spidering
|
|
// and getting the inlinks, do not bother getting the title because
|
|
// the inlink might be linkspam... and we check down below...
|
|
if ( ! m_req->m_onlyNeedGoodInlinks )
|
|
getThatTitle = true;
|
|
|
|
// ... no more seo so stop it... disable this for sp
|
|
if ( m_req->m_getLinkText )
|
|
getThatTitle = false;
|
|
|
|
if ( getThatTitle ) {
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
|
|
char *tit = ti->getTitle();
|
|
int32_t titLen = ti->getTitleLen();
|
|
reply-> ptr_tbuf = tit;
|
|
reply->size_tbuf = titLen + 1; // include \0
|
|
// sanity
|
|
if ( tit && tit[titLen] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
if ( ! tit || titLen <= 0 ) {
|
|
reply->ptr_tbuf = NULL;
|
|
reply->size_tbuf = 0;
|
|
}
|
|
}
|
|
|
|
// this is not documented because i don't think it will be popular
|
|
if ( m_req->m_getHeaderTag ) {
|
|
SafeBuf *htb = getHeaderTagBuf();
|
|
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
|
|
// . it should be null terminated
|
|
// . actually now it is a \0 separated list of the first
|
|
// few h1 tags
|
|
// . we call SafeBuf::pushChar(0) to add each one
|
|
reply->ptr_htag = htb->getBufStart();
|
|
reply->size_htag = htb->getLength();
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( m_req->m_getMatches && ! reply->ptr_mbuf ) {
|
|
MatchOffsets *mo = getMatchOffsets();
|
|
if ( ! mo || mo == (MatchOffsets *)-1) return (Msg20Reply *)mo;
|
|
reply-> ptr_mbuf = (char *)mo->m_matchOffsets;
|
|
reply->size_mbuf = mo->m_numMatches*4;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// get site
|
|
reply->ptr_site = ptr_site;
|
|
reply->size_site = size_site;
|
|
|
|
// assume unknown
|
|
reply->m_noArchive = 0;
|
|
// are we noarchive? only check this if not getting link text
|
|
if ( ! m_req->m_getLinkText ) {
|
|
char *na = getIsNoArchive();
|
|
if ( ! na || na == (char *)-1 ) return (Msg20Reply *)na;
|
|
reply->m_noArchive = *na;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
int32_t nowUTC2 = m_req->m_nowUTC;
|
|
if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet;
|
|
|
|
// . summary vector for deduping
|
|
// . does not compute anything if we should not! (svSize will be 0)
|
|
if ( ! reply->ptr_vbuf &&
|
|
m_req->m_getSummaryVector &&
|
|
cr->m_percentSimilarSummary > 0 &&
|
|
cr->m_percentSimilarSummary < 100 ) {
|
|
int32_t *sv = getSummaryVector ( );
|
|
if ( ! sv || sv == (void *)-1 ) return (Msg20Reply *)sv;
|
|
reply-> ptr_vbuf = (char *)m_summaryVec;
|
|
reply->size_vbuf = m_summaryVecSize;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( m_req->m_numSummaryLines > 0 ) {
|
|
// turn off for now since we added this to posdb
|
|
uint8_t *sl = getSummaryLangId();
|
|
if ( ! sl || sl == (void *)-1 ) return (Msg20Reply *)sl;
|
|
reply->m_summaryLanguage = *sl;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// returns values of specified meta tags
|
|
if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) {
|
|
int32_t dsize; char *d;
|
|
d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
|
|
if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d;
|
|
reply->ptr_dbuf = d;
|
|
reply->size_dbuf = dsize; // includes \0
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// . sample buffer for doing gigabit generation
|
|
// . Msg40.cpp calls intersectGigabits on all these samples from
|
|
// all the Msg20Replies it gets in the search results
|
|
//if ( ! reply->ptr_gigabitQuery && m_req->m_bigSampleMaxLen > 0 ) {
|
|
if ( ! reply->ptr_gigabitSample && m_req->m_bigSampleMaxLen > 0 ) {
|
|
// before we got a chunk of text from the doc
|
|
SafeBuf *gsbuf = getSampleForGigabits();
|
|
if ( ! gsbuf||gsbuf ==(void *)-1) return (Msg20Reply *)gsbuf;
|
|
reply->ptr_gigabitSample = gsbuf->getBufStart();
|
|
reply->size_gigabitSample = gsbuf->length();
|
|
// . now we use the gigabit query!
|
|
// . this is really used to find out what wikipedia pages
|
|
// we match the best...
|
|
// . this also sets the vector
|
|
/*
|
|
char *gq = getGigabitQuery();
|
|
if ( ! gq || gq == (char *)-1) return (Msg20Reply *)gq;
|
|
reply-> ptr_gigabitQuery = m_gigabitQuery;
|
|
reply->size_gigabitQuery = gbstrlen(m_gigabitQuery)+1;
|
|
reply-> ptr_gigabitScores = ptr_gigabitScores;
|
|
reply->size_gigabitScores = size_gigabitScores;
|
|
*/
|
|
}
|
|
|
|
// get full image url. but not if we already have a thumbnail...
|
|
if ( ! reply->ptr_imgUrl&&!reply->ptr_imgData&&!m_req->m_getLinkText){
|
|
// && m_req->m_getImageUrl ) {
|
|
char **iu = getImageUrl();
|
|
if ( ! iu || iu == (char **)-1 ) return (Msg20Reply *)iu;
|
|
reply-> ptr_imgUrl = *iu;
|
|
reply->size_imgUrl = 0;
|
|
if ( *iu ) reply->size_imgUrl = gbstrlen(*iu)+1;
|
|
}
|
|
|
|
// get thumbnail image DATA
|
|
if ( ! reply->ptr_imgData && ! m_req->m_getLinkText ) {
|
|
// && m_req->m_getImageUrl ) {
|
|
reply-> ptr_imgData = ptr_imageData;
|
|
reply->size_imgData = size_imageData;
|
|
}
|
|
|
|
// . adids contained in the doc
|
|
// . get from title rec rather than generating
|
|
// . but we need to generate to store in titleRec at index time
|
|
// . they are 32 bits each
|
|
int64_t **avp = getAdVector();
|
|
if ( ! avp || avp == (void *)-1 ) return (Msg20Reply *)avp;
|
|
|
|
// get firstip
|
|
int32_t *fip = getFirstIp();
|
|
if ( ! fip || fip == (void *)-1 ) return (Msg20Reply *)fip;
|
|
|
|
|
|
//Url **redir = getRedirUrl();
|
|
//if ( ! redir || redir == (Url **)-1 ) return (Msg20Reply *)redir;
|
|
//int32_t redirSize = 0;
|
|
//if ( *redir ) redirSize = (*redir)->getUrlLen() + 1;
|
|
//char *ru = NULL;
|
|
//if ( *redir ) ru = (*redir)->getUrl();
|
|
char *ru = ptr_redirUrl;
|
|
int32_t rulen = 0;
|
|
if ( ru ) rulen = gbstrlen(ru)+1;
|
|
|
|
// . Msg25.cpp uses m_adIdHash for restricting voting
|
|
// . these are 64 bit termids hashes
|
|
reply-> ptr_gbAdIds = (char *)*avp;
|
|
// this size is in bytes and includes the \0
|
|
reply->size_gbAdIds = size_adVector;
|
|
|
|
// need full cached page of each search result?
|
|
// include it always for spider status docs.
|
|
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
|
|
reply-> ptr_content = ptr_utf8Content;
|
|
reply->size_content = size_utf8Content;
|
|
}
|
|
|
|
// if ( m_req->m_getSectionVotingInfo && m_tmpBuf3.getCapacity() <=0) {
|
|
// Sections *ss = getSections();
|
|
// if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
|
|
// // will at least store a \0 in there, but will not count
|
|
// // as part of the m_tmpBuf.length()
|
|
// ss->printVotingInfoInJSON ( &m_tmpBuf3 );
|
|
// reply-> ptr_sectionVotingInfo = m_tmpBuf3.getBufStart();
|
|
// reply->size_sectionVotingInfo = m_tmpBuf3.length() + 1;
|
|
// }
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// do they want to know if this doc has an outlink to a url
|
|
// that has the provided site and domain hash, Msg20Request::
|
|
// m_ourHostHash32 and m_ourDomHash32?
|
|
int32_t nl = 0;
|
|
if ( links ) nl = links->getNumLinks();
|
|
// scan all outlinks we have on this page
|
|
int32_t i ; for ( i = 0 ; i < nl ; i++ ) {
|
|
// get the normalized url
|
|
//char *url = links->getLinkPtr(i);
|
|
// get the site. this will not block or have an error.
|
|
int32_t hh32 = (int32_t)((uint32_t)links->getHostHash64(i));
|
|
if ( hh32 == m_req->m_ourHostHash32 ) break;
|
|
int32_t dh32 = links->getDomHash32(i);
|
|
if ( dh32 == m_req->m_ourDomHash32 ) break;
|
|
}
|
|
reply->m_hasLinkToOurDomOrHost = false;
|
|
if ( i < nl )
|
|
reply->m_hasLinkToOurDomOrHost = true;
|
|
|
|
|
|
// easy ones
|
|
reply->m_isPermalink = m_isPermalink;
|
|
reply->m_ip = m_ip;
|
|
reply->m_firstIp = *fip;
|
|
reply->m_domHash = getDomHash32();//domHash;
|
|
reply->m_docId = m_docId;
|
|
reply->m_urlHash48 = getFirstUrlHash48();
|
|
reply->m_contentLen = size_utf8Content;
|
|
reply->m_lastSpidered = getSpideredTime();//m_spideredTime;
|
|
reply->m_datedbDate = m_pubDate;
|
|
reply->m_firstIndexedDate = m_firstIndexedDate;
|
|
reply->m_firstSpidered = m_firstIndexedDate;
|
|
reply->m_contentType = m_contentType;
|
|
reply->m_hostHash = getHostHash32a();
|
|
//reply->m_contentHash = *getContentHash32();
|
|
reply->m_language = m_langId;
|
|
reply->m_country = *getCountryId();
|
|
//reply->m_hasAllQueryTerms = false;
|
|
reply->m_hopcount = m_hopCount;
|
|
reply->m_siteRank = getSiteRank();
|
|
|
|
reply->ptr_ubuf = getFirstUrl()->getUrl();
|
|
reply->ptr_rubuf = ru;
|
|
reply->ptr_catIds = ptr_catIds;
|
|
reply->ptr_indCatIds = ptr_indCatIds;
|
|
reply->ptr_dmozTitles = ptr_dmozTitles;
|
|
reply->ptr_dmozSumms = ptr_dmozSumms;
|
|
reply->ptr_dmozAnchors = ptr_dmozAnchors;
|
|
reply->ptr_metadataBuf = ptr_metadata;
|
|
|
|
|
|
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
|
|
reply->size_rubuf = rulen;
|
|
reply->size_catIds = size_catIds;
|
|
reply->size_indCatIds = size_indCatIds;
|
|
reply->size_dmozTitles = size_dmozTitles;
|
|
reply->size_dmozSumms = size_dmozSumms;
|
|
reply->size_dmozAnchors = size_dmozAnchors;
|
|
reply->size_metadataBuf = size_metadata;
|
|
|
|
|
|
// breathe
|
|
QUICKPOLL( m_req->m_niceness );
|
|
|
|
/*
|
|
// truncate if necessary (buzz)
|
|
int32_t maxLen = 150000;
|
|
// truncate it?
|
|
bool trunc = true;
|
|
// not if getting link text
|
|
if ( req->m_getLinkText ) trunc = false;
|
|
// or outlinks
|
|
if ( req->m_getOutlinks ) trunc = false;
|
|
// or any niceness 1+ for that matter, that indicates a build operation
|
|
if ( req->m_niceness > 0 ) trunc = false;
|
|
// this is causing us to get EMISSINGQUERYTERMS errors!!!
|
|
trunc = false;
|
|
	// MDW: shorten for speed test
|
|
//int32_t maxLen = 1000;
|
|
if ( trunc && contentLen > maxLen+1 ) {
|
|
contentLen = maxLen;
|
|
content [maxLen ] = '\0';
|
|
}
|
|
*/
|
|
|
|
// check the tag first
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
|
|
//Tag *tag1 = gr->getTag ("sitenuminlinks");
|
|
//Tag *tag2 = gr->getTag ("sitepop");
|
|
//int32_t sni = 0;
|
|
//int32_t spop = 0;
|
|
//if ( tag1 ) sni = atol(tag1->m_data);
|
|
//if ( tag2 ) spop = atol(tag2->m_data);
|
|
reply->m_siteNumInlinks = m_siteNumInlinks;
|
|
//reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
|
|
//reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
|
|
//reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
|
|
//reply->m_sitePop = m_sitePop;
|
|
|
|
// . get stuff from link info
|
|
// . this is so fast, just do it for all Msg20 requests
|
|
// . no! think about it -- this can be huge for pages like
|
|
// google.com!!!
|
|
LinkInfo *info1 = ptr_linkInfo1;
|
|
if ( info1 ) { // && m_req->m_getLinkInfo ) {
|
|
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
|
|
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
|
|
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;
|
|
reply->m_pageNumUniqueCBlocks = info1->m_numUniqueCBlocks;
|
|
reply->m_pageInlinksLastUpdated = info1->m_lastUpdated;
|
|
//reply->m_pagePop = 0;//info1->m_pagePop;
|
|
//reply->m_siteNumInlinks = info1->m_siteNumInlinks;
|
|
//reply->m_sitePop = info1->m_sitePop;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// getLinkText is true if we are getting the anchor text for a
|
|
// supplied url as part of the SPIDER process..
|
|
// this was done by Msg23 before
|
|
if ( ! m_req->m_getLinkText ) {
|
|
m_replyValid = true;
|
|
return &m_reply;
|
|
}
|
|
|
|
// use the first url of the linker by default
|
|
Url *linker = &m_firstUrl;
|
|
|
|
// the base url, used for doing links: terms, is the final url,
|
|
// just in case there were any redirects
|
|
Url redir;
|
|
if ( ru ) {
|
|
redir.set ( ru );
|
|
linker = &redir;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// . get score weight of link text
|
|
// . phase out the sitedb*.xml files
|
|
//int64_t x[] = {0,20,30,40,50,70,90,100}; qualities!
|
|
// map these siteNumInlinks (x) to a weight (y)
|
|
//int64_t x[] = {0,50,100,200,500,3000,10000,50000};
|
|
// these are the weights the link text will receive
|
|
//int64_t y[] = {10,30,2000,3000,4000,5000,6000,7000};
|
|
// sanity check
|
|
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
	// shortcut
|
|
//int32_t sni = m_siteNumInlinks;// *getSiteNumInlinks();
|
|
// get the final link text weight as a percentage
|
|
//int32_t ltw = getY ( m_siteNumInlinks , x , y , 8 );
|
|
// store the weight in the reply
|
|
//reply->m_linkTextScoreWeight = ltw;
|
|
|
|
//log(LOG_DEBUG,"build: got score weight of %" INT32 " for sni=%" INT32 "",
|
|
// (int32_t)reply->m_linkTextScoreWeight, m_siteNumInlinks);
|
|
|
|
// breathe
|
|
//QUICKPOLL( m_niceness );
|
|
|
|
	// . we need the mid domain hash in addition to the ip domain because
|
|
// chat.yahoo.com has different ip domain than www.yahoo.com , ...
|
|
// and we don't want them both to be able to vote
|
|
// . the reply is zeroed out in call the reply->reset() above so
|
|
// if this is not yet set it will be 0
|
|
if ( reply->m_midDomHash == 0 ) {
|
|
char *m = linker->getMidDomain();
|
|
int32_t mlen = linker->getMidDomainLen();
|
|
reply->m_midDomHash = hash32 ( m , mlen );
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
|
|
// if not set from above, set it here
|
|
if ( ! links ) links = getLinks ( true ); // do quick set?
|
|
if ( ! links || links == (Links *)-1 ) return (Msg20Reply *)links;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Msg20Reply *)pos;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Msg20Reply *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Msg20Reply *)xml;
|
|
//Sections *ss = getSections();
|
|
//if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
|
|
|
|
// . is this page a dynamic page?
|
|
// . like a guestbook, access log stats, etc.
|
|
// . we don't like to count such pages for links analysis because
|
|
// they can be spammed so easily
|
|
// . TODO: guestbooks and message boards typically contain cgi links
|
|
// can we use that to identify?
|
|
// . the coll size includes the \0
|
|
//CollectionRec *cr ;
|
|
//cr = g_collectiondb.getRec ( m_req->ptr_coll,m_req->size_coll-1);
|
|
// g_errno should be ENOCOLLREC
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// . we want link text for this url, "linkee"
|
|
// . TODO: true --> add "www" to see if that fixes our problem
|
|
// i guess Links.cpp does that with the outlinks, so when
|
|
// Linkdb::fillList() uses Links.cpp, the outlinks have "www"
|
|
// prepended on them...
|
|
//Url linkee;
|
|
//linkee.set ( m_req->ptr_linkee , m_req->size_linkee );
|
|
|
|
// get a ptr to the link in the content. will point to the
|
|
// stuff in the href field of the anchor tag. used for seeing if
|
|
// we have bad links or not.
|
|
int32_t linkNode = -1;
|
|
int32_t linkNum = -1;
|
|
// . get associated link text from the linker's document for our "url"
|
|
// . only gets from FIRST link to us
|
|
// . TODO: allow more link text from better quality pages?
|
|
// . TODO: limit score based on link text length?
|
|
// . should always be NULL terminated
|
|
// . should not break in the middle of a word
|
|
// . this will return the item/entry if we are extracting from an
|
|
// rss/atom feed
|
|
char *rssItem = NULL;
|
|
int32_t rssItemLen = 0;
|
|
// store link text in here
|
|
char linkTextBuf[MAX_LINK_TEXT_LEN];
|
|
|
|
//
|
|
// TODO: for getting siteinlinks just match the site in the url
|
|
	// not the full url... and maybe match the one with the shortest path.
|
|
//
|
|
|
|
// . get the link text
|
|
// . linkee might be a site if m_isSiteLinkInfo is true in which
|
|
// case we get the best inlink to that site, and linkee is
|
|
// something like blogspot.com/mary/ or some other site.
|
|
int32_t blen = links->getLinkText ( m_req->ptr_linkee ,//&linkee,
|
|
m_req->m_isSiteLinkInfo ,
|
|
linkTextBuf ,
|
|
MAX_LINK_TEXT_LEN-2 ,
|
|
&rssItem ,
|
|
&rssItemLen ,
|
|
&linkNode ,
|
|
&linkNum ,
|
|
m_niceness );
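	// On success blen is the number of bytes of anchor text copied into
	// linkTextBuf, linkNum indexes the matching outlink in "links", and
	// linkNode is the index of the <a> tag's node in m_xml (checked
	// below; -1 means the outlink was not found in the cached page).
	// If the link came from an rss/atom feed, rssItem/rssItemLen point
	// at the enclosing item/entry text inside the document.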
|
|
|
|
|
|
// . BUT this skips the news topic stuff too. bad?
|
|
// . THIS HAPPENED before because we were truncating the xml(see above)
|
|
if ( linkNode < 0 ) {
|
|
|
|
int64_t took = gettimeofdayInMilliseconds() - start;
|
|
if ( took > 100 )
|
|
log("build: took %" INT64 " ms to get link text for "
|
|
"%s from linker %s",
|
|
took,
|
|
m_req->ptr_linkee,
|
|
m_firstUrl.m_url );
|
|
|
|
logf(LOG_DEBUG,"build: Got linknode = %" INT32 " < 0. Cached "
|
|
"linker %s does not have outlink to %s like linkdb "
|
|
"says it should. page is probably too big and the "
|
|
"outlink is past our limit. contentLen=%" INT32 ". or "
|
|
"a sitehash collision, or an area tag link.",
|
|
linkNode,getFirstUrl()->getUrl(),m_req->ptr_linkee,
|
|
m_xml.getContentLen());
|
|
//g_errno = ECORRUPTDATA;
|
|
// do not let multicast forward to a twin! so use this instead
|
|
// of ECORRUTPDATA
|
|
g_errno = EBADENGINEER;
|
|
//char *xx=NULL;*xx=0;
|
|
return NULL;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
if ( ! verifyUtf8 ( linkTextBuf , blen ) ) {
|
|
log("xmldoc: bad OUT link text from url=%s for %s",
|
|
m_req->ptr_linkee,m_firstUrl.m_url);
|
|
linkTextBuf[0] = '\0';
|
|
blen = 0;
|
|
}
|
|
|
|
// verify for rss as well. seems like we end up coring because
|
|
// length/size is not in cahoots and [size-1] != '\0' sometimes
|
|
if ( ! verifyUtf8 ( rssItem , rssItemLen ) ) {
|
|
log("xmldoc: bad RSS ITEM text from url=%s for %s",
|
|
m_req->ptr_linkee,m_firstUrl.m_url);
|
|
rssItem[0] = '\0';
|
|
rssItemLen = 0;
|
|
}
|
|
|
|
// point to it, include the \0.
|
|
if ( blen > 0 ) {
|
|
reply->ptr_linkText = linkTextBuf;
|
|
// save the size into the reply, include the \0
|
|
reply->size_linkText = blen + 1;
|
|
// sanity check
|
|
if ( blen + 2 > MAX_LINK_TEXT_LEN ) { char *xx=NULL;*xx=0; }
|
|
// sanity check. null termination required.
|
|
if ( linkTextBuf[blen] ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// . the link we link to
|
|
// . important when getting site info because the link url
|
|
// can be different than the root url!
|
|
reply-> ptr_linkUrl = links->getLink (linkNum);
|
|
reply->size_linkUrl = links->getLinkLen(linkNum)+1;
|
|
|
|
// save the rss item in our state so we can point to it, include \0
|
|
//if(rssItemLen > MAX_RSSITEM_SIZE-2 ) rssItemLen = MAX_RSSITEM_SIZE-2;
|
|
//char rssItemBuf[MAX_RSSITEM_SIZE];
|
|
if ( rssItemLen > MAX_RSSITEM_SIZE )
|
|
rssItemLen = MAX_RSSITEM_SIZE;
|
|
if ( rssItemLen > 0) {
|
|
m_rssItemBuf.safeMemcpy ( rssItem , rssItemLen );
|
|
m_rssItemBuf.pushChar('\0');
|
|
// gbmemcpy ( rssItemBuf, rssItem , rssItemLen );
|
|
// // NULL terminate it
|
|
// rssItemBuf[rssItemLen] = 0;
|
|
}
|
|
|
|
// point to it, include the \0
|
|
if ( rssItemLen > 0 ) {
|
|
reply->ptr_rssItem = m_rssItemBuf.getBufStart();
|
|
reply->size_rssItem = m_rssItemBuf.getLength();
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
if ( ! m_req->m_doLinkSpamCheck )
|
|
reply->m_isLinkSpam = false;
|
|
|
|
if ( m_req->m_doLinkSpamCheck ) {
|
|
// reset to NULL to avoid gbstrlen segfault
|
|
char *note = NULL;
|
|
// need this
|
|
if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; }
|
|
// time it
|
|
//int64_t start = gettimeofdayInMilliseconds();
|
|
|
|
Url linkeeUrl;
|
|
linkeeUrl.set ( m_req->ptr_linkee );
|
|
|
|
// get it. does not block.
|
|
reply->m_isLinkSpam = ::isLinkSpam ( linker ,
|
|
m_ip ,
|
|
ptr_indCatIds ,
|
|
size_indCatIds / 4 ,
|
|
m_siteNumInlinks,
|
|
&m_xml,
|
|
links,
|
|
// if doc length more
|
|
// than 150k then consider
|
|
// it linkspam
|
|
// automatically so it
|
|
// can't vote
|
|
150000,//MAXDOCLEN//150000
|
|
¬e ,
|
|
&linkeeUrl , // url ,
|
|
linkNode ,
|
|
cr->m_coll ,
|
|
m_niceness );
|
|
// store it
|
|
if ( note ) {
|
|
// include the \0
|
|
reply->ptr_note = note;
|
|
reply->size_note = gbstrlen(note)+1;
|
|
}
|
|
	// log the reason why it is a spam page
|
|
if ( reply->m_isLinkSpam )
|
|
log(LOG_DEBUG,"build: linker %s: %s.",
|
|
linker->getUrl(),note);
|
|
// sanity
|
|
if ( reply->m_isLinkSpam && ! note )
|
|
log("linkspam: missing note for d=%" INT64 "!",m_docId);
|
|
// store times... nah, might have yielded cpu!
|
|
reply->m_timeLinkSpam = 0;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// sanity check
|
|
if ( reply->ptr_rssItem &&
|
|
reply->size_rssItem>0 &&
|
|
reply->ptr_rssItem[reply->size_rssItem-1]!=0) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
|
|
//log ("nogl=%" INT32 "",(int32_t)m_req->m_onlyNeedGoodInlinks );
|
|
|
|
// . skip all this junk if we are a spammy voter
|
|
// . we get the title above in "getThatTitle"
|
|
if ( reply->m_isLinkSpam ) {
|
|
m_replyValid = true; return reply; }
|
|
|
|
// . this vector is set from a sample of the entire doc
|
|
// . it is used to dedup voters in Msg25.cpp
|
|
// . this has pretty much been replaced by vector2, it was
|
|
// also saying a doc was a dup if all its words were
|
|
// contained by another, like if it was a small subset, which
|
|
// wasn't the best behaviour.
|
|
// . yeah neighborhood text is much better and this is setting
|
|
// the slow sections class, so i took it out
|
|
getPageSampleVector ();
|
|
// must not block or error out. sanity check
|
|
if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
|
|
//st->m_v1.setPairHashes ( ww , -1 , m_niceness );
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
//st->m_v2.setPairHashes ( ww,linkWordNum, m_niceness );
|
|
// . this vector is set from the text after the link text
|
|
// . it terminates at at a breaking tag
|
|
// . check it out in ~/fff/src/Msg20.cpp
|
|
getPostLinkTextVector ( linkNode );
|
|
// must not block or error out. sanity check
|
|
//if ( ! m_postLinkTextVecValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// set from the hashes of the tag id pairs
|
|
//st->m_v3.setTagPairHashes ( xml , m_niceness );
|
|
// get it
|
|
getTagPairHashVector();
|
|
// must not block or error out. sanity check
|
|
if ( ! m_tagPairHashVecValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// this vector is set from the hashes of the path components
|
|
// with punctuation stripped out
|
|
//v4.set ( xml, NULL , linker, -1 ,buf4,size);
|
|
// . the 4th vector is provided, this will point to m_topIps[] buffer
|
|
// . this is temporarily disabled
|
|
// . this is the top 2 bytes of the ips of each inlink
|
|
// . we were looking this info up in linkdb
|
|
// . so if two good inlinkers had their inlinks from the same ip
|
|
// neighborhoods, then one would have its voting power "deduped".
|
|
// . see the old LinkText.cpp for the logic that read these from linkdb
|
|
//v5.set2 ( (char *)incomingIps , numIncomingIps );
|
|
|
|
// reference the vectors in our reply
|
|
reply-> ptr_vector1 = m_pageSampleVec;//(char *)&st->m_v1;
|
|
reply->size_vector1 = m_pageSampleVecSize;//st->m_v1.getSize();
|
|
reply-> ptr_vector2 = m_postVec;//(char *)&st->m_v2;
|
|
reply->size_vector2 = m_postVecSize;//st->m_v2.getSize();
|
|
reply-> ptr_vector3 = m_tagPairHashVec; // (char *)&st->m_v3;
|
|
reply->size_vector3 = m_tagPairHashVecSize;//st->m_v3.getSize();
|
|
|
|
// crap, we gotta bubble sort these i think
|
|
// but only tag pair hash vec
|
|
bool flag = true;
|
|
uint32_t *d = (uint32_t *)m_tagPairHashVec;
|
|
// exclude the terminating 0 int32_t
|
|
int32_t nd = (m_tagPairHashVecSize / 4) - 1;
|
|
while ( flag ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
flag = false;
|
|
for ( int32_t i = 1 ; i < nd ; i++ ) {
|
|
if ( d[i-1] <= d[i] ) continue;
|
|
uint32_t tmp = d[i-1];
|
|
d[i-1] = d[i];
|
|
d[i] = tmp;
|
|
flag = true;
|
|
}
|
|
}
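	// The loop above is a simple in-place ascending sort of the tag-pair
	// hashes (repeat passes until no swaps occur). Presumably this lets
	// the receiving side treat vector3 as an order-independent set and
	// compare two docs' vectors with a linear merge when deduping voters.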
|
|
|
|
|
|
// just always do it
|
|
//if ( ! req->m_getInlinkNeighborhoods ) return true;
|
|
|
|
// convert "linkNode" into a string ptr into the document
|
|
char *node = xml->getNodePtr(linkNode)->m_node;
|
|
// . find the word index, "n" for this node
|
|
// . this is INEFFICIENT!!
|
|
char **wp = ww->getWords();
|
|
int32_t nw = ww->getNumWords();
|
|
int32_t n;
|
|
for ( n = 0; n < nw && wp[n] < node ; n++ )
|
|
QUICKPOLL(m_niceness);
|
|
// sanity check
|
|
//if ( n >= nw ) { char *xx=NULL; *xx=0; }
|
|
if ( n >= nw ) {
|
|
log("links: crazy! could not get word before linknode");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
|
|
// get the ptrs to the sections, 1-1 with words
|
|
//Section **sp = NULL;
|
|
//if ( ss ) sp = ss->m_sectionPtrs;
|
|
// . even tags in the article section have positive scores
|
|
// . the scores array is 1-1 with the words in Words, not the nodes
|
|
// in Xml. so we had to do that conversion.
|
|
//if ( ! sp || !(sp[n]->m_flags & NOINDEXFLAGS) )
|
|
// reply->m_outlinkInContent = true;
|
|
|
|
//
|
|
// get the surrounding link text, around "linkNode"
|
|
//
|
|
// radius of 80 characters around n
|
|
char sbuf[1201];
|
|
int32_t radius = 80;
|
|
char *p = sbuf;
|
|
char *pend = sbuf + 600;
|
|
// . make a neighborhood in the "words" space [a,b]
|
|
// . radius is in characters, so "convert" into words by dividing by 5
|
|
int32_t a = n - radius / 5;
|
|
int32_t b = n + radius / 5;
|
|
if ( a < 0 ) a = 0;
|
|
if ( b > nw ) b = nw;
|
|
int32_t *pp = pos->m_pos;
|
|
int32_t len;
|
|
	// if too big shrink the biggest, a or b?
|
|
while ( (len=pp[b]-pp[a]) >= 2 * radius + 1 ) {
|
|
// decrease the largest, a or b
|
|
if ( a<n && (pp[n]-pp[a])>(pp[b]-pp[n])) a++;
|
|
else if ( b>n ) b--;
|
|
}
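	// Quick example of the math above, assuming the default radius of 80:
	// a/b start roughly 80/5 = 16 words on either side of word n (5 being
	// a rough average of characters per word), then the loop trims
	// whichever side is farther from n, using the character offsets in
	// pos->m_pos, until the span covers fewer than 2*80+1 = 161 characters.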
|
|
// only store it if we can
|
|
if ( p + len + 1 < pend ) {
|
|
// store it
|
|
// FILTER the html entities!!
|
|
int32_t len2 = pos->filter(p,pend,ww,a,b,NULL);//ss);
|
|
// ensure NULL terminated
|
|
p[len2] = '\0';
|
|
// store in reply. it will be serialized when sent.
|
|
// thanks to isj for finding this bug fix.
|
|
m_surroundingTextBuf.safeMemcpy ( p , len2 + 1 );
|
|
reply->ptr_surroundingText =m_surroundingTextBuf.getBufStart();
|
|
reply->size_surroundingText=m_surroundingTextBuf.getLength();
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// get title? its slow because it sets the sections class
|
|
if ( m_req->m_titleMaxLen > 0 && ! reply->ptr_tbuf &&
|
|
// don't get it anymore if getting link info because it
|
|
// is slow...
|
|
getThatTitle ) {
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
|
|
char *tit = ti->getTitle();
|
|
int32_t titLen = ti->getTitleLen();
|
|
reply-> ptr_tbuf = tit;
|
|
reply->size_tbuf = titLen + 1; // include \0
|
|
if ( ! tit || titLen <= 0 ) {
|
|
reply->ptr_tbuf = NULL;
|
|
reply->size_tbuf = 0;
|
|
}
|
|
}
|
|
|
|
int64_t took = gettimeofdayInMilliseconds() - start;
|
|
if ( took > 100 )
|
|
log("build: took %" INT64 " ms to get link text for "
|
|
"%s from linker %s",
|
|
took,
|
|
m_req->ptr_linkee,
|
|
m_firstUrl.m_url );
|
|
|
|
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
//static void gotMsg5ListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
|
|
// XmlDoc *THIS = (XmlDoc *)state;
|
|
// THIS->m_masterLoop ( THIS->m_masterState );
|
|
//}
|
|
|
|
|
|
char **XmlDoc::getDiffbotPrimaryImageUrl ( ) {
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (char **)jp;
|
|
|
|
JsonItem *ji = jp->getFirstItem();
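	// The loop below stops at the first scalar item named "url" whose
	// parent is "images", or named "link" whose parent is "media" (the
	// product case). For illustration only (the exact diffbot reply
	// format is an assumption here), something shaped like
	//   { "images": [ { "url": "http://..." } ] }   or
	//   { "title": "...", "media": [ { "link": "http://..." } ] }
	// would match.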
|
|
|
|
// assume none
|
|
m_imageUrl2 = NULL;
|
|
m_imageUrl2Valid = true;
|
|
|
|
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
|
|
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not number or string
|
|
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
|
continue;
|
|
|
|
//char *topName = NULL;
|
|
// what name level are we?
|
|
// int32_t numNames = 1;
|
|
// JsonItem *pi = ji->m_parent;
|
|
// for ( ; pi ; pi = pi->m_parent ) {
|
|
// // empty name?
|
|
// if ( ! pi->m_name ) continue;
|
|
// if ( ! pi->m_name[0] ) continue;
|
|
// topName = pi->m_name;
|
|
// numNames++;
|
|
// }
|
|
|
|
char *name0 = ji->m_name;
|
|
char *name1 = NULL;
|
|
char *name2 = NULL;
|
|
		if ( ji->m_parent )
			name1 = ji->m_parent->m_name;
		// guard against a NULL parent before going up two levels
		if ( ji->m_parent && ji->m_parent->m_parent )
			name2 = ji->m_parent->m_parent->m_name;
|
|
|
|
// stop at first image for "images":[{ indicator
|
|
if ( strcmp(name0,"url") == 0 &&
|
|
name1 &&
|
|
strcmp(name1,"images") == 0 )
|
|
break;
|
|
|
|
|
|
// for products
|
|
if ( strcmp(name0,"link") == 0 &&
|
|
name1 &&
|
|
strcmp(name1,"media") == 0 )
|
|
break;
|
|
}
|
|
|
|
|
|
if ( ! ji )
|
|
return &m_imageUrl2;
|
|
|
|
int32_t vlen;
|
|
char *val = ji->getValueAsString( &vlen );
|
|
|
|
// ok, we got it, just copy that
|
|
m_imageUrlBuf2.safeMemcpy ( val , vlen );
|
|
m_imageUrlBuf2.nullTerm();
|
|
m_imageUrl2 = m_imageUrlBuf2.getBufStart();
|
|
return &m_imageUrl2;
|
|
}
|
|
|
|
// get the image url SPECIFIED by the page, so there is no guesswork here
|
|
// unlike with the Images.cpp class
|
|
char **XmlDoc::getImageUrl() {
|
|
// return if valid
|
|
if ( m_imageUrlValid ) return &m_imageUrl;
|
|
// get first url
|
|
Url *f = getFirstUrl();
|
|
if ( ! f || f == (Url *)-1 ) return (char **)f;
|
|
|
|
// assume none
|
|
m_imageUrl = NULL;
|
|
m_imageUrlValid = true;
|
|
|
|
// we use getDiffbotPrimaryImageUrl() above for doing thumbs
|
|
if ( m_isDiffbotJSONObject || m_contentType == CT_JSON )
|
|
return &m_imageUrl;
|
|
|
|
// all done if not youtube or meta cafe
|
|
char *host = f->getHost();
|
|
char found = 0;
|
|
if ( ! strncmp ( host , "www.youtube.com/" , 16 ) ) found = 1;
|
|
if ( ! strncmp ( host , "youtube.com/" , 12 ) ) found = 1;
|
|
if ( ! strncmp ( host , "www.metacafe.com/" , 17 ) ) found = 2;
|
|
if ( ! strncmp ( host , "metacafe.com/" , 13 ) ) found = 2;
|
|
if ( ! found ) return &m_imageUrl;
|
|
// char ptr
|
|
char *u = f->getUrl();
|
|
// make it
|
|
if ( found == 1 ) {
|
|
char *s = strstr(u,"v=");
|
|
// if url does not contain a "v=" then forget it
|
|
if ( ! s ) return &m_imageUrl;
|
|
// point to the id
|
|
s += 2;
|
|
//m_imageUrl = m_imageUrlBuf;
|
|
//char *p = m_imageUrlBuf;
|
|
m_imageUrlBuf.safeStrcpy("http://img.youtube.com/vi/");
|
|
// do not break
|
|
//char *pend = m_imageUrlBuf + 80;
|
|
// copy the id/number
|
|
//for ( ; is_digit(*s) && p < pend ; ) *p++ = *s++;
|
|
for ( ; is_digit(*s) ; s++ )
|
|
m_imageUrlBuf.pushChar(*s);
|
|
// wrap it up
|
|
m_imageUrlBuf.safeStrcpy ( "/2.jpg" );
|
|
// size includes \0;
|
|
//m_imageUrlSize = p - m_imageUrl ;
|
|
// sanity check
|
|
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
|
|
m_imageUrl = m_imageUrlBuf.getBufStart();
|
|
return &m_imageUrl;
|
|
}
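	// For illustration (hypothetical url): http://www.youtube.com/watch?v=12345
	// yields http://img.youtube.com/vi/12345/2.jpg above. Note the copy
	// loop only accepts digits, so the id is truncated at the first
	// non-digit character.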
|
|
// must be meta cafe now
|
|
// http://www.metacafe.com/watch/559561/surfer_girls_vol_2/
|
|
// http://s2.mcstatic.com/thumb/559561.jpg
|
|
// scan url path for first digit
|
|
	for ( char *t = f->getPath() ; *t ; t++ ) {
		// look for digit
		if ( ! is_digit ( *t ) ) continue;
		// grab that
		int32_t id = atol ( t );
		// skip if not good
		if ( id <= 0 ) continue;
|
|
// make the url
|
|
//m_imageUrl = m_imageUrlBuf;
|
|
//char *p = m_imageUrlBuf;
|
|
//gbmemcpy ( p , "http://s2.mcstatic.com/thumb/" , 29 );
|
|
//p += 29;
|
|
//p += sprintf ( p , "%" INT32 "" , id );
|
|
//gbmemcpy ( p , ".jpg\0" , 5 );
|
|
//p += 5;
|
|
m_imageUrlBuf.safePrintf("http://s2.mcstatic."
|
|
"com/thumb/%" INT32 ".jpg", id);
|
|
m_imageUrl = m_imageUrlBuf.getBufStart();
|
|
// size includes \0;
|
|
//m_imageUrlSize = p - m_imageUrl ;
|
|
// sanity check
|
|
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
|
|
break;
|
|
}
|
|
return &m_imageUrl;
|
|
}
|
|
|
|
|
|
MatchOffsets *XmlDoc::getMatchOffsets () {
	// return it if it is set
	if ( m_matchOffsetsValid ) return &m_matchOffsets;

	// need a buncha crap
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (MatchOffsets *)ww;
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (MatchOffsets *)xml;
	Matches *mm = getMatches();
	if ( ! mm || mm == (Matches *)-1 ) return (MatchOffsets *)mm;

	m_matchOffsets.set ( xml , ww , mm , true ); // getMatches=true
	m_matchOffsetsValid = true;
	return &m_matchOffsets;
}
|
|
|
|
Query *XmlDoc::getQuery() {
	if ( m_queryValid ) return &m_query;
	// bail if no query
	if ( ! m_req || ! m_req->ptr_qbuf ) {
		m_queryValid = true;
		return &m_query;
	}
	// return NULL with g_errno set on error
	if ( ! m_query.set2( m_req->ptr_qbuf ,
			     m_req->m_langId ,
			     true ) ) return NULL;
	m_queryValid = true;
	return &m_query;
}
|
|
|
|
Matches *XmlDoc::getMatches () {
|
|
// return it if it is set
|
|
if ( m_matchesValid ) return &m_matches;
|
|
|
|
// if no query, matches are empty
|
|
if ( ! m_req->ptr_qbuf ) {
|
|
m_matchesValid = true;
|
|
return &m_matches;
|
|
}
|
|
|
|
// cache it for one hour
|
|
//XmlDoc *od = getOldXmlDoc ( 3600 );
|
|
//if ( ! od || od == (XmlDoc *)-1 ) return (Matches *)od;
|
|
//if ( od->isEmpty() ) od = NULL;
|
|
|
|
// need a buncha crap
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Matches *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Matches *)xml;
|
|
Bits *bits = getBitsForSummary();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (Matches *)bits;
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss == (void *)-1) return (Matches *)ss;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Matches *)pos;
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Matches *)ti;
|
|
//Synonyms *syn = getSynonyms();
|
|
//if ( ! syn || syn == (void *)-1 ) return (Matches *)syn;
|
|
Phrases *phrases = getPhrases();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (Matches *)phrases;
|
|
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (Matches *)q;
|
|
|
|
// set it up
|
|
m_matches.setQuery ( q );
|
|
// returns false and sets g_errno on error
|
|
if ( ! m_matches.set ( this ,
|
|
ww ,
|
|
//syn ,
|
|
phrases ,
|
|
ss ,
|
|
bits ,
|
|
pos ,
|
|
xml ,
|
|
ti ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
// we got it
|
|
m_matchesValid = true;
|
|
return &m_matches;
|
|
}
|
|
|
|
// sender wants meta description, custom tags, etc.
|
|
char *XmlDoc::getDescriptionBuf ( char *displayMetas , int32_t *dsize ) {
|
|
// return the buffer if we got it
|
|
if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
// now get the content of the requested display meta tags
|
|
//char dbuf [ 1024*64 ];
|
|
char *dbufEnd = m_dbuf + 1024;//1024*64;
|
|
char *dptr = m_dbuf;
|
|
char *pp = displayMetas;
|
|
char *ppend = pp + gbstrlen(displayMetas);
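	// displayMetas is a whitespace-separated list of meta tag names, each
	// optionally followed by ":<maxlen>" to cap that tag's content, e.g.
	// (hypothetical) "description:150 keywords". Each tag's content is
	// appended to m_dbuf as a \0-separated string, and the whole buffer
	// is capped at 1024 bytes.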
|
|
// loop over the list of requested meta tag names
|
|
while ( pp < ppend && dptr < dbufEnd ) {
|
|
// skip initial spaces. meta tag names are ascii always i guess
|
|
while ( *pp && is_wspace_a(*pp) ) pp++;
|
|
// that's the start of the meta tag name
|
|
char *s = pp;
|
|
// . find end of that meta tag name
|
|
// . can end in :<integer> which specifies max len
|
|
while ( *pp && ! is_wspace_a(*pp) && *pp != ':' ) pp++;
|
|
// assume no max length to the content of this meta tag
|
|
int32_t maxLen = 0x7fffffff;
|
|
// save current char
|
|
char c = *pp;
|
|
// . NULL terminate the name
|
|
// . before, overflowed the request buffer and caused core!
|
|
// . seems like it is already NULL terminated
|
|
if ( *pp ) *pp = '\0';
|
|
// always advance regardless though
|
|
pp++;
|
|
// if ':' was specified, get the max length
|
|
if ( c == ':' ) {
|
|
if ( is_digit(*pp) ) maxLen = atoi ( pp );
|
|
// skip over the digits
|
|
while ( *pp && ! is_wspace_a (*pp) ) pp++;
|
|
}
|
|
// don't exceed our total buffer size (save room for \0 at end)
|
|
int32_t avail = dbufEnd - dptr - 1;
|
|
if ( maxLen > avail ) maxLen = avail;
|
|
// store the content at "dptr" (do not exceed "maxLen" bytes)
|
|
int32_t wlen = xml->getMetaContent ( dptr , // write buf
|
|
maxLen , // buf length
|
|
s , // name value
|
|
gbstrlen(s) , // name len
|
|
"name" , // http-equiv/name
|
|
false );// convert &#'s?
|
|
dptr[wlen] = '\0';
|
|
|
|
// test it out
|
|
if ( ! verifyUtf8 ( dptr ) ) {
|
|
log("xmldoc: invalid utf8 content for meta tag %s.",s);
|
|
continue;
|
|
}
|
|
|
|
// advance and NULL terminate
|
|
dptr += wlen;
|
|
*dptr++ = '\0';
|
|
// bitch if we truncated
|
|
if ( dptr >= dbufEnd )
|
|
log("query: More than %" INT32 " bytes of meta tag "
|
|
"content "
|
|
"was encountered. Truncating.",
|
|
(int32_t)(dbufEnd-m_dbuf));
|
|
}
|
|
// what is the size of the content of displayed meta tags?
|
|
m_dbufSize = dptr - m_dbuf;
|
|
m_dbufValid = true;
|
|
*dsize = m_dbufSize;
|
|
return m_dbuf;
|
|
}
|
|
|
|
SafeBuf *XmlDoc::getHeaderTagBuf() {
|
|
if ( m_htbValid ) return &m_htb;
|
|
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
|
|
|
|
int32_t count = 0;
|
|
|
|
// scan sections
|
|
Section *si = ss->m_rootSection;
|
|
|
|
moreloop:
|
|
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
if ( si->m_tagId != TAG_H1 ) continue;
|
|
		// if it contains no text, this will be -1
|
|
// so give up on it
|
|
if ( si->m_firstWordPos < 0 ) continue;
|
|
if ( si->m_lastWordPos < 0 ) continue;
|
|
// ok, it works, get it
|
|
break;
|
|
}
|
|
// if no h1 tag then make buf empty
|
|
if ( ! si ) {
|
|
m_htb.nullTerm();
|
|
m_htbValid = true;
|
|
return &m_htb;
|
|
}
|
|
// otherwise, set it
|
|
char *a = m_words.m_words[si->m_firstWordPos];
|
|
char *b = m_words.m_words[si->m_lastWordPos] ;
|
|
b += m_words.m_wordLens[si->m_lastWordPos];
|
|
|
|
// copy it
|
|
m_htb.safeMemcpy ( a , b - a );
|
|
m_htb.pushChar('\0');
|
|
|
|
si = si->m_next;
|
|
|
|
// add more?
|
|
if ( count++ < 3 ) goto moreloop;
|
|
|
|
m_htbValid = true;
|
|
return &m_htb;
|
|
}
|
|
|
|
|
|
Title *XmlDoc::getTitle ( ) {
|
|
if ( m_titleValid ) return &m_title;
|
|
// need a buncha crap
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Title *)ww;
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) return (Title *)sections;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Title *)pos;
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (Title *)q;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
int32_t titleMaxLen = cr->m_titleMaxLen;
|
|
if ( m_req ) titleMaxLen = m_req->m_titleMaxLen;
|
|
// limit for speed, some guys have a 100k word title!
|
|
if ( titleMaxLen > 256 ) titleMaxLen = 256;
|
|
|
|
m_titleValid = true;
|
|
if ( ! m_title.setTitle ( this ,
|
|
xml ,
|
|
ww ,
|
|
sections ,
|
|
pos ,
|
|
titleMaxLen ,
|
|
0xffff ,
|
|
NULL ,
|
|
q ,
|
|
cr ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
return &m_title;
|
|
}
|
|
|
|
|
|
Summary *XmlDoc::getSummary () {
|
|
if ( m_summaryValid ) return &m_summary;
|
|
|
|
// xml and json docs have empty summaries for now
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (Summary *)ct;
|
|
|
|
if ( *ct == CT_JSON || *ct == CT_XML ) {
|
|
m_summaryValid = true;
|
|
return &m_summary;
|
|
}
|
|
|
|
// need a buncha crap
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Summary *)xml;
|
|
Bits *bits = getBitsForSummary();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (Summary *)bits;
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) return (Summary *)sections;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Summary *)pos;
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (Summary *)site;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Summary *)d;
|
|
Matches *mm = getMatches();
|
|
if ( ! mm || mm == (Matches *)-1 ) return (Summary *)mm;
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Summary *)ti;
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (Summary *)q;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . get the highest number of summary lines that we need
|
|
// . the summary vector we generate for doing summary-based deduping
|
|
// typically has more lines in it than the summary we generate for
|
|
// displaying to the user
|
|
int32_t numLines = m_req->m_numSummaryLines;
|
|
if ( cr->m_percentSimilarSummary > 0 &&
|
|
cr->m_percentSimilarSummary < 100 &&
|
|
m_req->m_getSummaryVector &&
|
|
cr->m_summDedupNumLines > numLines )
|
|
// request more lines than we will display
|
|
numLines = cr->m_summDedupNumLines;
|
|
|
|
	// shortcut
|
|
Summary *s = &m_summary;
|
|
|
|
// time cpu set time
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
m_cpuSummaryStartTime = start;
|
|
|
|
// make sure summary does not include title
|
|
char *tbuf = ti->m_title;
|
|
// this does not include the terminating \0
|
|
int32_t tbufLen = ti->m_titleBytes;
|
|
|
|
// compute the summary
|
|
bool status;
|
|
status = s->set2( xml ,
|
|
ww ,
|
|
bits ,
|
|
sections ,
|
|
pos ,
|
|
q ,
|
|
(int64_t *)m_req->ptr_termFreqs ,
|
|
(float *)m_req->ptr_affWeights ,
|
|
false , // doStemming
|
|
m_req->m_summaryMaxLen ,
|
|
numLines ,
|
|
// . displayLines, # lines we are displaying
|
|
// . Summary::getDisplayLen() will return the
|
|
// length of the summary to display
|
|
m_req->m_numSummaryLines ,
|
|
m_req->m_summaryMaxNumCharsPerLine,
|
|
m_req->m_ratInSummary ,
|
|
getFirstUrl() ,
|
|
//&reply->m_queryProximityScore ,
|
|
mm ,
|
|
tbuf ,
|
|
tbufLen );
|
|
|
|
// error, g_errno should be set!
|
|
if ( ! status ) return NULL;
|
|
|
|
m_summaryValid = true;
|
|
|
|
return &m_summary;
|
|
}
|
|
|
|
char *XmlDoc::getHighlightedSummary ( ) {
|
|
|
|
if ( m_finalSummaryBufValid ) {
|
|
//char *fsum = m_finalSummaryBuf.getBufStart();
|
|
//if ( ! fsum ) fsum = (char *)0x01;
|
|
return m_finalSummaryBuf.getBufStart();
|
|
}
|
|
|
|
Summary *s = getSummary();
|
|
|
|
if ( ! s || s == (void *)-1 ) return (char *)s;
|
|
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (char *)q;
|
|
|
|
// get the summary
|
|
char *sum = s->getSummary();
|
|
//int32_t sumLen = s->getSummaryLen();
|
|
int32_t sumLen = s->getSummaryDisplayLen();
|
|
|
|
//sum[sumLen] = 0;
|
|
|
|
// assume no highlighting?
|
|
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
|
|
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
|
|
m_finalSummaryBuf.nullTerm();
|
|
m_finalSummaryBufValid = true;
|
|
return m_finalSummaryBuf.getBufStart();
|
|
//char *fsum = m_finalSummaryBuf.getBufStart();
|
|
//if ( ! fsum ) fsum = (char *)0x01;
|
|
//return fsum;
|
|
}
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
Highlight hi;
|
|
StackBuf(hb);
|
|
// highlight the query in it
|
|
int32_t hlen = hi.set ( &hb,
|
|
sum,
|
|
sumLen,
|
|
m_langId,
|
|
q,
|
|
false , // doStemming?
|
|
false , //click&scroll?
|
|
NULL , // base url
|
|
"<b>" , // front tag
|
|
"</b>" , // back tag
|
|
0,
|
|
m_niceness );
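	// hi.set() writes a copy of the summary into hb with every query-term
	// match wrapped in the front/back tags ("<b>"/"</b>"); the highlighted
	// copy is then copied into m_finalSummaryBuf below.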
|
|
|
|
|
|
	// bail if Highlight::set() failed (hlen < 0)
|
|
if ( hlen < 0 ) {
|
|
log("build: highlight class error = %s",mstrerror(g_errno));
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return NULL;
|
|
}
|
|
|
|
// store into our safebuf then
|
|
m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 );
|
|
m_finalSummaryBufValid = true;
|
|
m_finalSummaryBuf.nullTerm();
|
|
|
|
return m_finalSummaryBuf.getBufStart();
|
|
|
|
//char *fsum = m_finalSummaryBuf.getBufStart();
|
|
//if ( ! fsum ) fsum = (char *)0x01;
|
|
//return fsum;
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// GET GIGABIT SAMPLE
|
|
//
|
|
//
|
|
// This will get samples surrounding all the query terms for purposes
|
|
// of gigabits generation. We don't just generate gigabits from the
|
|
// WHOLE document because it takes much longer?? is that still true?
|
|
// We assume that the first call to getTopLines() above set
|
|
// matches/numMatches. We use those arrays to
|
|
// skip directly to just the query terms in the document and save time.
|
|
// We may have to reset the Scores array here if we want to use it ltr.
|
|
//
|
|
// aka getGigabitSample. get gigabit sample
|
|
//
|
|
SafeBuf *XmlDoc::getSampleForGigabits ( ) {
|
|
|
|
|
|
if ( m_gsbufValid ) return &m_gsbuf;
|
|
|
|
// assume empty
|
|
//m_gsbuf = NULL;
|
|
|
|
// basically, exit now if no sample needed
|
|
if ( m_req->m_bigSampleMaxLen <= 0 ||
|
|
m_req->m_bigSampleRadius <= 0 ) {
|
|
m_gsbufValid = true;
|
|
return &m_gsbuf;
|
|
}
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
|
|
|
|
|
|
// if it is json then only return the json fields that are strings
|
|
// and json decode them... separate each field with a \0.
|
|
if ( *ct == CT_JSON )
|
|
return getSampleForGigabitsJSON();
|
|
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
|
|
|
|
// just send back the whole page, but separate each section
|
|
// with \0. make only sentences end with ? ! or ., headers
|
|
// not with anything, and no menu items
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) return (SafeBuf *)sections;
|
|
Section *sp = sections->m_rootSection;
|
|
SafeBuf reply;
|
|
reply.setLabel("gbtrepbuf");
|
|
// m_contentLen is invalid, don't use that here use size_utf8Content
|
|
if ( ! reply.reserve ( size_utf8Content + 1000 ) ) return NULL;
|
|
// scan the sections of the document
|
|
for ( ; sp ; sp = sp->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// do not allow menu crap
|
|
if ( sp->m_flags & ( SEC_MENU |
|
|
SEC_MENU_SENTENCE |
|
|
SEC_MENU_HEADER ) )
|
|
continue;
|
|
// must be sentence or header
|
|
bool ok = false;
|
|
if ( sp->m_flags & SEC_SENTENCE ) ok = true;
|
|
// headings are ok, just don't use as sentences...
|
|
if ( sp->m_flags & SEC_HEADING ) ok = true;
|
|
if ( ! ok ) continue;
|
|
|
|
// store without tags
|
|
char *p = ww->m_words[sp->m_a];
|
|
// include period after final word in section
|
|
int32_t b = sp->m_b - 1;
|
|
char *e = ww->m_words[b] + ww->m_wordLens[b];
|
|
|
|
// if 3+ commas and one comma for every 4 words, forget it,
|
|
// it is probably a list! well, process it, but make sure it
|
|
// does not end in a period so we do not display it
|
|
// as a fast fact, but we use it for gigabits.
|
|
bool isList = false;
|
|
int32_t commaCount = 0;
|
|
int32_t bracketCount = 0;
|
|
for ( char *z = p ; z < e ; z++ ) {
|
|
if ( *z == ',' ) commaCount++;
|
|
// fix ] [AllTheWeb] [Gigablast] [Google] [HotBot]...
|
|
if ( *z == '[' ) bracketCount++;
|
|
}
|
|
int32_t naw = (b - sp->m_a) / 2;
|
|
|
|
// just skip even for gigabits if too long. most likely
|
|
// a spammy list of nouns.
|
|
if ( naw >= 130 ) continue;
|
|
|
|
if ( commaCount >= 3 && commaCount *4 >= naw )
|
|
isList = true;
|
|
if ( commaCount >= 10 )
|
|
isList = true;
|
|
if ( bracketCount >= 3 )
|
|
isList = true;
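		// Example of the list heuristic: a span with naw = 20 alnum
		// words and 5 commas gives commaCount*4 = 20 >= naw, so it is
		// flagged as a list. Lists (and the huge/yelling spans below)
		// still feed gigabits but never get the trailing '*' that
		// marks a sentence as printable for fast facts.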
|
|
|
|
// too much uppercase?
|
|
bool yelling = false;
|
|
int32_t upper = 0;
|
|
int32_t lower = 0;
|
|
char cs = 0;
|
|
for ( char *z = p ; z < e ; z += cs ) {
|
|
cs = getUtf8CharSize(z);
|
|
if ( ! is_alpha_utf8(z) ) continue;
|
|
if ( is_upper_utf8(z) ) upper++;
|
|
if ( is_lower_utf8(z) ) lower++;
|
|
}
|
|
if ( upper > lower ) yelling = true;
|
|
|
|
|
|
|
|
// ending ) or ]
|
|
if ( e[0] == ')' ) e++;
|
|
else if ( e[0] == ']' ) e++;
|
|
|
|
// incorporate period etc.
|
|
if ( e[0] == '.' ) e++;
|
|
else if ( e[0] == '!' ) e++;
|
|
else if ( e[0] == '?' ) e++;
|
|
else if ( e[0] == ';' ) e++;
|
|
|
|
|
|
// must end in a period, or .) or .]
|
|
bool endsInPeriod = false;
|
|
if ( e-2 >= p &&
|
|
( e[-1] =='.' ||
|
|
e[-1] =='!' ||
|
|
e[-1] =='?' ) )
|
|
endsInPeriod = true;
|
|
if ( (e[-1] == ')' ||
|
|
e[-1] == ']' ) &&
|
|
(e[-2] == '.' ||
|
|
e[-2] == '?' ||
|
|
e[-2] == '!' ) )
|
|
endsInPeriod = true;
|
|
|
|
//int32_t off = reply.length();
|
|
|
|
// filter out tags and \n's and \r's and store into "reply"
|
|
if ( ! reply.safePrintFilterTagsAndLines ( p , e-p ,false ) )
|
|
return NULL;
|
|
|
|
// if a sentence and does not end in period, toss one in
|
|
//if ( sp->m_flags & SEC_SENTENCE ) {
|
|
// if ( e[-1] !='.' &&
|
|
// e[-1] !='!' &&
|
|
// e[-1] !='?' &&
|
|
// e[-1] !=']' &&
|
|
// e[-1] !=')' )
|
|
// reply.pushChar('.');
|
|
//}
|
|
|
|
// too huge? if # of ALNUM words > 70 it's too big.
|
|
bool isHuge = false;
|
|
if ( naw > 70 ) isHuge = true;
|
|
|
|
|
|
// ending in a * indicates a printable sentence for fast facts
|
|
if ( (sp->m_flags & SEC_SENTENCE) &&
|
|
! isList &&
|
|
! isHuge &&
|
|
! yelling &&
|
|
endsInPeriod )
|
|
reply.pushChar('*');
|
|
|
|
// delineate sentences/headers/sections with | now so
|
|
// we can still allow a word to be a gigabit even if it is
|
|
// not in a sentence with a query term
|
|
//reply.pushChar('\0');
|
|
reply.pushChar('|');
|
|
char *pc = reply.getBufStart() + reply.length() - 1;
|
|
*pc = '\0';
|
|
|
|
// debug
|
|
//char *x = reply.getBufStart() + off;
|
|
// turn off fast fact debug for now
|
|
//log("fastfact: fastfact: %s",x);
|
|
// revert back to |
|
|
*pc = '|';
|
|
|
|
// stop? this fixes the query 'lesbain vedeo porno' on
|
|
// my cluster taking 10 seconds to get gigabits for.
|
|
// bigsamplemaxlen is 1000 as of 12/4/2013.
|
|
if ( reply.length() >= m_req->m_bigSampleMaxLen )
|
|
break;
|
|
}
|
|
// a final \0
|
|
reply.pushChar('\0');
|
|
// move it over to m_gsbuf now
|
|
m_gsbuf.stealBuf ( &reply );
|
|
// we are valid
|
|
m_gsbufValid = true;
|
|
// success
|
|
return &m_gsbuf;
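	// NOTE: everything from here to the end of this function is
	// unreachable (we always return above). It is the older sampler that
	// took a window of bigSampleRadius words around each query-term
	// match, presumably left in place for reference.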
|
|
|
|
|
|
|
|
|
|
|
|
// need a buncha crap
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (SafeBuf *)pos;
|
|
Matches *mm = getMatches();
|
|
if ( ! mm || mm == (Matches *)-1 ) return (SafeBuf *)mm;
|
|
|
|
// convert length to number of words
|
|
int32_t bigSampleRadius = m_req->m_bigSampleRadius / 5;
|
|
// at least 1
|
|
if ( bigSampleRadius <= 0 ) bigSampleRadius = 1;
|
|
|
|
// alloc for whole document?
|
|
int32_t max = xml->getContentLen() ;
|
|
// do not exceed
|
|
if ( max > m_req->m_bigSampleMaxLen ) max = m_req->m_bigSampleMaxLen;
|
|
// make sure we have something in words too. i guess no sample?
|
|
if ( max <= 2 ) { m_gsbufValid = true; return &m_gsbuf; }
|
|
// a flag so we don't overlap samples...
|
|
int32_t lastb = -1;
|
|
// . set m_buf to where we write the sample
|
|
// . add a byte for the terminating \0
|
|
int32_t gsbufAllocSize = max + 1;
|
|
// temp hack
|
|
//m_gsbuf = (char *)mmalloc(m_gsbufAllocSize,"gsbuf");
|
|
if ( ! m_gsbuf.reserve ( gsbufAllocSize, "gsbuf" ) ) return NULL;
|
|
// g_errno should be set...
|
|
//if ( ! m_gsbuf ) return NULL;
|
|
//m_freeBuf = true;
|
|
// set our pointer
|
|
char *pstart = m_gsbuf.getBufStart();
|
|
char *p = pstart;
|
|
char *pend = pstart + max;
|
|
|
|
int32_t nw = ww->m_numWords;
|
|
|
|
// skip to first query term
|
|
for ( int32_t i = 0 ; i < mm->m_numMatches ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the match
|
|
Match *m = &mm->m_matches[i];
|
|
// break out if match is not from the document's Words class
|
|
if ( m->m_words != ww ) break;
|
|
// the word #
|
|
int32_t n = m->m_wordNum;
|
|
// got a match, add this samplet, [a,b]
|
|
int32_t a = n - bigSampleRadius;
|
|
int32_t b = n + bigSampleRadius;
|
|
if ( a < 0 ) a = 0;
|
|
if ( b > nw ) b = nw;
|
|
if ( a < lastb ) a = lastb;
|
|
// ensure the samples are separated by \0
|
|
else if ( p > pstart && p + 2 < pend ) {
|
|
*p++ = '\0';
|
|
}
|
|
Pos *pos = m->m_pos;
|
|
int32_t *pp = pos->m_pos;
|
|
int32_t len = pp[b+1] - pp[a];
|
|
// if match would send us over, we are done
|
|
if ( p + len >= pend ) break;
|
|
len = pos->filter(p,pend,m->m_words,a,b,m->m_sections);
|
|
// for debug (mdw)
|
|
//log("query: gigabitsample#%" INT32 "=%s",i,p);
|
|
p += len;
|
|
// we are the new lastb
|
|
lastb = b;
|
|
}
|
|
// always null terminate
|
|
*p++ = '\0';
|
|
// . set sample size
|
|
// . this includes terminating 0\'s in this case
|
|
//int32_t gsbufSize = p - m_gsbuf;
|
|
m_gsbuf.setLength( p - m_gsbuf.getBufStart() );
|
|
// we are valid
|
|
m_gsbufValid = true;
|
|
// for debug (mdw)
|
|
//log("query: finalgigabitsample=%s",m_gsbuf);
|
|
// success
|
|
return &m_gsbuf;
|
|
}
|
|
|
|
// if it is json then only return the json fields that are strings
|
|
// and json decode them... separate each field with a \0.
|
|
SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
|
|
|
|
SafeBuf tmp;
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (SafeBuf *)jp;
|
|
JsonItem *ji = jp->getFirstItem();
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not string
|
|
if ( ji->m_type != JT_STRING )
|
|
continue;
|
|
// store field value
|
|
char *val = ji->getValue();
|
|
int valLen = ji->getValueLen();
|
|
// if it contains html then skip it as a gigabit candidate.
|
|
// otherwise our fast facts end up including html tags in them
|
|
// in computeFastFacts() in Msg40.cpp
|
|
int i;
|
|
for ( i = 0 ; i < valLen ; i++ )
|
|
if ( val[i] == '<' ) break;
|
|
if ( i < valLen ) continue;
|
|
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
// if ( ! tmp.safePrintf("<p>"))
|
|
// return NULL;
|
|
|
|
|
|
// decode the json
|
|
//SafeBuf xx;
|
|
if ( ! tmp.safeDecodeJSONToUtf8(val,valLen,m_niceness))
|
|
return NULL;
|
|
|
|
// escape out the html
|
|
// if ( ! tmp.htmlEncode ( xx.getBufStart() ))
|
|
// return NULL;
|
|
|
|
// two new lines
|
|
if ( ! tmp.safePrintf("<hr>"))
|
|
return NULL;
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
}
|
|
|
|
if ( ! tmp.nullTerm() )
|
|
return NULL;
|
|
|
|
Xml xml;
|
|
if ( ! xml.set ( tmp.getBufStart() ,
|
|
tmp.length() ,
|
|
false , // ownData?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
false , // setParentsArg?
|
|
m_niceness ,
|
|
CT_HTML ) ) // *ct ) )
|
|
return NULL;
|
|
Words ww;
|
|
if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL;
|
|
Bits bb;
|
|
if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL;
|
|
Phrases pp;
|
|
if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL;
|
|
// this uses the sectionsReply to see which sections are
|
|
// "text", etc. rather than compute it expensively
|
|
Sections sec;
|
|
if ( !sec.set ( &ww ,
|
|
&pp ,
|
|
&bb ,
|
|
getFirstUrl() ,
|
|
0,//*d ,
|
|
0,//*sh64 , // 64 bits
|
|
"",//cr->m_coll ,
|
|
m_niceness ,
|
|
NULL,//m_masterState , // state
|
|
NULL,//m_masterLoop , // callback
|
|
CT_JSON, // *ct ,
|
|
NULL,//&m_dates ,
|
|
NULL , // sd // sections data
|
|
true , // sections data valid?
|
|
NULL , // sv // for m_nsvt
|
|
NULL , // buf
|
|
0 )) { // bufSize
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// now add each sentence section into the buffer
|
|
// scan the sentences if we got those
|
|
char **wptrs = ww.getWords();
|
|
int32_t *wlens = ww.getWordLens();
|
|
Section *ss = sec.m_firstSent;
|
|
for ( ; ss ; ss = ss->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// count of the alnum words in sentence
|
|
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
|
|
// start with one word!
|
|
count--;
|
|
// how can it be less than one alnum word
|
|
if ( count < 0 ) continue;
|
|
// store it
|
|
char *wp1 = wptrs[ss->m_senta];
|
|
char *wp2 = wptrs[ss->m_sentb-1] + wlens[ss->m_sentb-1];
|
|
|
|
bool gotTerm = (wp2[0]=='.' || wp2[0]=='?' || wp2[0]=='!' ) ;
|
|
|
|
//if ( ! gotTerm ) continue;
|
|
|
|
if ( ! m_gsbuf.safeMemcpy ( wp1 , wp2 - wp1 ) )
|
|
return NULL;
|
|
|
|
// puncty?
|
|
if ( gotTerm && ! m_gsbuf.pushChar(wp2[0]))
|
|
return NULL;
|
|
|
|
// to indicate end of header or sentence, in order to
|
|
// qualify as a fast fact, we must add a '*'. see
|
|
// PageResults.cpp, search for ''*''
|
|
if ( gotTerm && ! m_gsbuf.pushChar('*') )
|
|
return NULL;
|
|
if ( ! m_gsbuf.pushChar('\0') )
|
|
return NULL;
|
|
}
|
|
m_gsbufValid = true;
|
|
return &m_gsbuf;
|
|
}
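// Illustrative walk-through (comment added, not in the original source; the
// sample JSON is hypothetical): for a doc like
//   { "title" : "Acme anvils are heavy." , "price" : "9.99" }
// the field loop above decodes each string value into "tmp" separated by
// "<hr>" and newlines, the temporary Xml/Words/Sections pass splits that
// text into sentences, and each terminated sentence is appended to m_gsbuf
// roughly as
//   "Acme anvils are heavy.*\0"
// where the trailing '*' marks a sentence/header end for the fast-fact code
// in PageResults.cpp and '\0' separates the records.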
|
|
|
|
|
|
// . good sites sometimes have hacked pages
|
|
// . try to identify those
|
|
char *XmlDoc::getIsCompromised ( ) {
|
|
if ( m_isCompromisedValid ) return &m_isCompromised;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// assume compromised
|
|
m_isCompromised = true;
|
|
m_isCompromisedValid = true;
|
|
	// scan the font tags for zero-width/zero-height inline styles
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a font tag
		if ( nodes[i].m_nodeId != TAG_FONT ) continue;
		// grab the inline style attribute, if any
		int32_t stlen;
		char *style = nodes[i].getFieldValue ( "style" , &stlen );
		// skip if none, or too short to hold "width:0"
		if ( ! style || stlen <= 6 ) continue;
		// NULL term temporarily
		char c = style[stlen];
		style[stlen] = '\0';
		char *hc = strstr(style,"height");
		char *wc = strstr(style,"width");
		// skip if neither, restoring the byte we clobbered
		if ( ! hc && ! wc ) { style[stlen] = c; continue; }
		// advance past the attribute names
		if ( hc ) hc += 6;
		if ( wc ) wc += 5;
		// skip whitespace and the ':' separator. guard against a
		// NULL hc or wc so we do not dereference a missing one.
		while ( hc && is_wspace_a(*hc) ) hc++;
		while ( wc && is_wspace_a(*wc) ) wc++;
		if ( hc && *hc == ':' ) hc++;
		if ( wc && *wc == ':' ) wc++;
		while ( hc && is_wspace_a(*hc) ) hc++;
		while ( wc && is_wspace_a(*wc) ) wc++;
		style[stlen] = c;
		// a zero height or width is a signal of invisible text and of
		// our syzygy compromised site to compromised site spammer
		if ( hc && *hc == '0' ) return &m_isCompromised;
		if ( wc && *wc == '0' ) return &m_isCompromised;
	}
|
|
m_isCompromised = false;
|
|
return &m_isCompromised;
|
|
}
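// Example of the markup getIsCompromised() is looking for (comment added for
// clarity, not in the original source): hidden text injected into a hacked
// page, e.g.
//   <font style="width:0px;height:0px"> ...spam links... </font>
// a zero width or height in an inline style on a font tag trips the check.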
|
|
|
|
// <meta name=robots value=noarchive>
|
|
// <meta name=gigabot value=noarchive>
|
|
char *XmlDoc::getIsNoArchive ( ) {
|
|
if ( m_isNoArchiveValid ) return &m_isNoArchive;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
|
|
m_isNoArchive = false;
|
|
m_isNoArchiveValid = true;
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// find the meta tags
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != TAG_META ) continue;
|
|
// get robots attribute
|
|
int32_t alen; char *att;
|
|
// <meta name=robots value=noarchive>
|
|
att = nodes[i].getFieldValue ( "name" , &alen );
|
|
// need a name!
|
|
if ( ! att ) continue;
|
|
// get end
|
|
char *end = att + alen;
|
|
// skip leading spaces
|
|
while ( att < end && *att && is_wspace_a(*att) ) att++;
|
|
// must be robots or gigabot. skip if not
|
|
if ( strncasecmp(att,"robots" ,6) &&
|
|
strncasecmp(att,"gigabot",7) ) continue;
|
|
// get the content value
|
|
att = nodes[i].getFieldValue("content",&alen);
|
|
// skip if none
|
|
if ( ! att ) continue;
|
|
// get end
|
|
end = att + alen;
|
|
// skip leading spaces
|
|
while ( att < end && *att && is_wspace_a(*att) ) att++;
|
|
// is is noarchive? skip if no such match
|
|
if ( strncasecmp(att,"noarchive",9) ) continue;
|
|
// ok, we got it
|
|
m_isNoArchive = true;
|
|
break;
|
|
}
|
|
// return what we got
|
|
return &m_isNoArchive;
|
|
}
|
|
|
|
// this vector's components are 64-bit, not the usual 32-bit
|
|
int64_t **XmlDoc::getAdVector ( ) {
|
|
if ( m_adVectorValid ) return &ptr_adVector;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int64_t **)xml;
|
|
setStatus ( "parsing out ad ids");
|
|
// assume valid
|
|
m_adVectorValid = true;
|
|
int32_t na = 0;
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// find the meta tags
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// continue if not a script tag
|
|
if ( nodes[i].m_nodeId != TAG_SCRIPT ) continue; // 83
|
|
// must be a front tag, not a back tag
|
|
if ( xml->isBackTag ( i ) ) continue;
|
|
		// find the back tag for it
		int32_t j;
		for ( j = i ; j < n ; j++ ) {
			// another script tag? (look at node j, not i)
			if ( nodes[j].m_nodeId != TAG_SCRIPT ) continue;
			// must be a back tag this time
			if ( ! xml->isBackTag ( j ) ) continue;
			// ok, we got it
			break;
		}
|
|
// if no back tag, give up
|
|
if ( j == n ) break;
|
|
|
|
// buf/len defines the script area
|
|
char *buf = xml->getNode(i);
|
|
int32_t len = xml->getNode(j) - buf;
|
|
|
|
// skip this script tag for next loop
|
|
i = j;
|
|
|
|
bool found = false;
|
|
|
|
// start off looking for google
|
|
char *needles[3] =
|
|
{ "google_ad_client" ,
|
|
"ctxt_ad_partner",
|
|
"http://ad" };
|
|
char *providers[3] =
|
|
{ "google" ,
|
|
"yahoo",
|
|
"doubleclick" };
|
|
|
|
for ( int32_t k = 0 ; k < 3 ; k++ ) {
|
|
// try to match this needle
|
|
char *match = needles[k];
|
|
// try to get a match
|
|
char *p = strnstr ( buf, match , len );
|
|
// go again
|
|
if ( ! p ) continue;
|
|
// do not exceed the script area
|
|
char *pend = buf + len;
|
|
|
|
// it is in quotes
|
|
// google ids look like pub-<number too big for 32 bits>; yahoo ids fit in 32 bits
|
|
|
|
// check for double or single quote
|
|
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
|
|
// it must have them!... i guess
|
|
if ( p >= pend ) continue;
|
|
|
|
// point to after the quote
|
|
char *pbegin = ++p;
|
|
// find the ending quote
|
|
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
|
|
// if none, bail
|
|
if ( p >= pend ) continue;
|
|
// get length of the ad client id between the quotes
|
|
int32_t adClientLen = p - pbegin;
|
|
|
|
if ( k == 2 ) {
|
|
p = strnstr(p,".doubleclick.net/",pend-p);
|
|
if ( ! p ) continue;
|
|
p += 17;
|
|
// look for doubleclick ads
|
|
// user name is the second element of the path
|
|
while(p < pend && *p != '/') p++;
|
|
pbegin = ++p;
|
|
while(p < pend && *p != '/') p++;
|
|
if(p >= pend) continue;
|
|
adClientLen = p - pbegin;
|
|
found = true;
|
|
}
|
|
|
|
char *f = pbegin;
|
|
char *fend = pbegin + adClientLen;
|
|
for ( ; f < fend ; f++ ) {
|
|
if ( is_alnum_a ( *f ) ) continue;
|
|
if ( *f == '-' || *f == '_' || *f == '.' )
|
|
continue;
|
|
break;
|
|
}
|
|
if ( f < fend ) continue;
|
|
if ( adClientLen >= 400 ) continue;
|
|
if ( adClientLen < 4 ) continue;
|
|
			// null term temp
			char c = *fend;
			*fend = '\0';
			// hash it. use a name that does not shadow the
			// outer script-area "buf" pointer.
			char tbuf[512];
			sprintf(tbuf,"gbad:%s-%s",providers[k],pbegin);
			// put it back
			*fend = c;
			// . make the query term id
			// . first hash the field
			uint64_t h = hash64 ( "gbad" , 4 );
			// then add in the other junk
			h = hash64 ( tbuf , gbstrlen(tbuf) , h );
|
|
// . now we will index that as-is
|
|
// . and Msg25/LinkInfo can use to dedup voters!
|
|
m_adIds[na++] = h;
|
|
// stop if too many. save room for NULL termination.
|
|
if ( na + 1 >= XD_MAX_AD_IDS ) break;
|
|
}
|
|
//look for another if not found or not ok.
|
|
}
|
|
// null term it like a good vector! no, those are 32-bit components,
|
|
// we are a 64-bit component vector
|
|
//m_adIds[na++] = 0;
|
|
// point to where we should put them
|
|
ptr_adVector = m_adIds;
|
|
// store this i guess
|
|
size_adVector = na * 8;
|
|
// *lastNode = nn;
|
|
return &ptr_adVector;
|
|
}
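// Illustrative note (comment added, not in the original source; the client
// id below is made up): for an AdSense snippet containing
//   google_ad_client = "pub-1234567890123456";
// the loop above extracts the quoted id, builds the string
// "gbad:google-pub-1234567890123456" and stores
//   hash64 ( thatString , gbstrlen(thatString) , hash64("gbad",4) )
// in m_adIds / ptr_adVector so Msg25/LinkInfo can dedup voters that share an
// ad account.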
|
|
|
|
|
|
|
|
char *XmlDoc::getIsLinkSpam ( ) {
|
|
if ( m_isLinkSpamValid ) return &m_isLinkSpam2;
|
|
|
|
setStatus ( "checking if linkspam" );
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (char *)links;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
|
|
int32_t **pici = getIndCatIds();
|
|
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
|
|
//LinkInfo *info1 = getLinkInfo1();
|
|
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// reset note
|
|
m_note = NULL;
|
|
|
|
// . if a doc is "link spam" then it cannot vote, or its
|
|
// voting power is reduced
|
|
// . look for indications that the link is from a guestbook
|
|
// . doc length over 100,000 bytes consider it link spam
|
|
m_isLinkSpamValid = true;
|
|
m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
|
|
*ip ,
|
|
ptr_indCatIds ,
|
|
size_indCatIds / 4 ,
|
|
*sni ,
|
|
xml,
|
|
links,
|
|
150000,//MAXDOCLEN,//maxDocLen ,
|
|
&m_note ,
|
|
NULL , // &linkee , // url ,
|
|
-1 , // linkNode ,
|
|
cr->m_coll ,
|
|
m_niceness );
|
|
// set shadow
|
|
m_isLinkSpam2 = (bool)m_isLinkSpam;
|
|
return &m_isLinkSpam2;
|
|
}
|
|
|
|
|
|
|
|
void *zliballoc ( void *opaque , unsigned int items , unsigned int size ) {
|
|
//log("db: got zlib alloc");
|
|
return (void *)mmalloc ( items * size , "zlib" );
|
|
}
|
|
|
|
void zlibfree ( void *opaque , void *address ) {
|
|
//log("db: got zlib free");
|
|
// -1 will tell Mem.cpp to look it up in the table
|
|
mfree ( address , -1 , "zlib" );
|
|
}
|
|
|
|
void *malloc_replace (void *pf , unsigned int nitems , unsigned int size ) {
|
|
return g_mem.gbmalloc(size*nitems,"malloc_replace");
|
|
}
|
|
|
|
void free_replace ( void *pf , void *s ) {
|
|
// -1 means we don't know the size
|
|
g_mem.gbfree(s,-1,"free_replace");
|
|
}
|
|
|
|
int gbuncompress ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ) {
|
|
z_stream stream;
|
|
int err;
|
|
|
|
stream.next_in = (Bytef*)source;
|
|
stream.avail_in = (uInt)sourceLen;
|
|
// Check for source > 64K on 16-bit machine:
|
|
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
|
|
|
|
stream.next_out = dest;
|
|
stream.avail_out = (uInt)*destLen;
|
|
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
|
|
|
|
//stream.zalloc = (alloc_func)0;
|
|
//stream.zfree = (free_func)0;
|
|
stream.zalloc = malloc_replace;//zliballoc;
|
|
stream.zfree = free_replace;//zlibfree;
|
|
|
|
// this calls memcpy so make sure Profiler.cpp doesn't crash
|
|
// since when it calls backtrace() that calls memcpy() too
|
|
// and it's not async safe
|
|
g_inMemcpy = 2;
|
|
|
|
//we can be gzip or deflate
|
|
err = inflateInit2(&stream, 47);
|
|
|
|
g_inMemcpy = 0;
|
|
|
|
if (err != Z_OK) return err;
|
|
|
|
err = inflate(&stream, Z_FINISH);
|
|
if (err != Z_STREAM_END) {
|
|
inflateEnd(&stream);
|
|
if (err == Z_NEED_DICT ||
|
|
(err == Z_BUF_ERROR && stream.avail_in == 0))
|
|
return Z_DATA_ERROR;
|
|
return err;
|
|
}
|
|
*destLen = stream.total_out;
|
|
|
|
err = inflateEnd(&stream);
|
|
return err;
|
|
}
|
|
|
|
void deflateQuickPoll ( ) {
|
|
QUICKPOLL(1);
|
|
}
|
|
|
|
int gbcompress ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ,
|
|
int32_t encoding ) {
|
|
|
|
int level = Z_DEFAULT_COMPRESSION;
|
|
z_stream stream;
|
|
int err;
|
|
int method = Z_DEFLATED;
|
|
//lots of mem, faster, more compressed, see zlib.h
|
|
int windowBits = 31;
|
|
int memLevel = 8;
|
|
int strategy = Z_DEFAULT_STRATEGY;
|
|
|
|
stream.next_in = (Bytef*)source;
|
|
stream.avail_in = (uInt)sourceLen;
|
|
#ifdef MAXSEG_64K
|
|
// Check for source > 64K on 16-bit machine:
|
|
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
|
|
#endif
|
|
stream.next_out = dest;
|
|
stream.avail_out = (uInt)*destLen;
|
|
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
|
|
|
|
//stream.zalloc = (alloc_func)0;
|
|
//stream.zfree = (free_func)0;
|
|
stream.zalloc = malloc_replace;//zliballoc;
|
|
stream.zfree = free_replace;//zlibfree;
|
|
|
|
stream.opaque = (voidpf)0;
|
|
|
|
//we can be gzip or deflate
|
|
if(encoding == ET_DEFLATE) err = deflateInit (&stream, level);
|
|
else err = deflateInit2(&stream, level,
|
|
method, windowBits,
|
|
memLevel, strategy);
|
|
if (err != Z_OK) {
|
|
// zlib's incompatible version error?
|
|
if ( err == -6 ) {
|
|
log("zlib: zlib did you forget to add #pragma pack(4) to "
|
|
"zlib.h when compiling libz.a so it aligns on 4-byte "
|
|
"boundaries because we have that pragma in "
|
|
"gb-include.h so its used when including zlib.h");
|
|
}
|
|
return err;
|
|
}
|
|
|
|
// cygwin uses the system libz.a which is not hacked for our quickpoll
|
|
#ifndef CYGWIN
|
|
// tell deflate() to call quickpoll
|
|
|
|
// MDW: 11/14/2014 don't do this for the 64bit zlib for now just to
|
|
// save some time. do it later when it proves to be an issue.
|
|
//setQuickPoll ( (char *)&g_loop.m_needsToQuickPoll, deflateQuickPoll);
|
|
#endif
|
|
|
|
// this calls memcpy so make sure Profiler.cpp doesn't crash
|
|
// since when it calls backtrace() that calls memcpy() too
|
|
// and it's not async safe
|
|
g_inMemcpy = 3;
|
|
|
|
err = deflate(&stream, Z_FINISH);
|
|
|
|
g_inMemcpy = 0;
|
|
|
|
if (err != Z_STREAM_END) {
|
|
deflateEnd(&stream);
|
|
return err == Z_OK ? Z_BUF_ERROR : err;
|
|
}
|
|
*destLen = stream.total_out;
|
|
|
|
err = deflateEnd(&stream);
|
|
return err;
|
|
}
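// Minimal usage sketch for gbuncompress()/gbcompress() above (comment added,
// not part of the original source; buffer sizes and variable names are
// arbitrary):
//
//   unsigned char raw[1024];               // filled in by the caller
//   unsigned char packed[2048];
//   unsigned char unpacked[1024];
//   uint32_t packedLen   = sizeof(packed);
//   uint32_t unpackedLen = sizeof(unpacked);
//   if ( gbcompress   ( packed   , &packedLen   , raw    , 1024 ,
//                       ET_DEFLATE ) == Z_OK &&
//        gbuncompress ( unpacked , &unpackedLen , packed , packedLen )
//        == Z_OK ) {
//           // on success unpackedLen comes back as 1024 and unpacked
//           // matches raw
//   }
//
// gbcompress() emits a gzip wrapper unless ET_DEFLATE is passed (see the
// deflateInit vs. deflateInit2(windowBits=31) branch above), and
// gbuncompress() accepts either format since it uses inflateInit2(&stream,47).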
|
|
|
|
//
|
|
// NO NO don't use until use replace in[64] with SafeBuf in and out below
|
|
//
|
|
int gbcompress7 ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ,
|
|
bool compress ) {
|
|
|
|
//int32_t id = 1;
|
|
// pass the input to the program through this file
|
|
// rather than a pipe, since popen() seems broken
|
|
char in[264];
|
|
if ( compress ) sprintf ( in , "%s/in.7z", g_hostdb.m_dir );
|
|
else sprintf ( in , "%s/out.7z", g_hostdb.m_dir );
|
|
unlink ( in );
|
|
// collect the output from the filter from this file
|
|
char out[264];
|
|
if ( compress ) sprintf ( out , "%s/out.7z", g_hostdb.m_dir );
|
|
else sprintf ( out , "%s/in.7z", g_hostdb.m_dir );
|
|
if ( ! compress )
|
|
unlink ( out );
|
|
// ignore errno from those unlinks
|
|
errno = 0;
|
|
// open the input file
|
|
retry11:
|
|
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry11;
|
|
log("build: Could not open file %s for writing: %s.",
|
|
in,mstrerror(errno));
|
|
return -1;
|
|
}
|
|
|
|
retry12:
|
|
// write the content into the input file
|
|
int32_t w = write ( fd , source , sourceLen );
|
|
// valgrind
|
|
if ( w < 0 && errno == EINTR ) goto retry12;
|
|
// did we get an error
|
|
if ( w != (int32_t)sourceLen ) {
|
|
log("build: Error writing to %s: %s.",in,mstrerror(errno));
|
|
close(fd);
|
|
return -1;
|
|
}
|
|
// close the file
|
|
close ( fd );
|
|
|
|
	// . build the 7za command line
	// . (the comments that used to be here described the pdf2html
	//   filter and did not apply to this function)
	//char cmd[2048];
	SafeBuf cmd;
	// "7za a out.7z in.7z" adds (compresses) the input file into the
	// archive; "7za -o<dir> -y e in.7z" extracts it, with -y answering
	// yes to any overwrite prompt. the tool's own output is discarded
	// via > /dev/null.
|
|
if ( compress )
|
|
// 7za a out.7z in.7z
|
|
cmd.safePrintf( "%s7za a %s %s > /dev/null",
|
|
g_hostdb.m_dir , out,in);
|
|
else
|
|
// -y = yes on all. so we overwrite "in.7z"
|
|
cmd.safePrintf( "%s7za -o%s -y e %s > /dev/null",
|
|
g_hostdb.m_dir,g_hostdb.m_dir , in);//,in);
|
|
// breach sanity check
|
|
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// execute it
|
|
int retVal = gbsystem ( cmd.getBufStart() );
|
|
if ( retVal == -1 )
|
|
log("gb: system(%s) : %s",cmd.getBufStart(),
|
|
mstrerror(g_errno));
|
|
|
|
// all done with input file
|
|
// clean up the binary input file from disk
|
|
//if ( unlink ( in ) != 0 ) {
|
|
// // log error
|
|
// log("gbfilter: unlink (%s): %s\n",in,strerror(errno));
|
|
// // ignore it, since it was not a processing error per se
|
|
// errno = 0;
|
|
//}
|
|
|
|
retry13:
|
|
fd = open ( out , O_RDONLY );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry13;
|
|
log("7zip: Could not open file %s for reading: %s.",
|
|
out,mstrerror(errno));
|
|
return -1;
|
|
}
|
|
// to read - leave room for \0
|
|
//int32_t toRead = MAXDOCLEN + 1000;
|
|
int32_t toRead = 150000 + 1000;
|
|
retry14:
|
|
// read right from pipe descriptor
|
|
int32_t r = read (fd, dest,toRead);
|
|
// note errors
|
|
if ( r < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry14;
|
|
log("7zip: reading output: %s",mstrerror(errno));
|
|
// this is often bad fd from an oom error, so ignore it
|
|
errno = 0;
|
|
r = 0;
|
|
}
|
|
// clean up shop
|
|
close ( fd );
|
|
// delete output file
|
|
//unlink ( out );
|
|
if ( r > (int32_t)*destLen ) { char *xx=NULL;*xx=0; }
|
|
// assign
|
|
*destLen = r;
|
|
// debug for now
|
|
char *pre = "";
|
|
if ( ! compress ) pre = "un";
|
|
log("7zip: %scompressed %" UINT32 " to %" UINT32 " bytes"
|
|
, pre,sourceLen , *destLen );
|
|
return Z_OK;
|
|
}
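// Caveat (comment added, not in the original source): gbcompress7() and
// gbuncompress7() pass data through the fixed files in.7z/out.7z under
// g_hostdb.m_dir and cap the read-back at roughly 150KB, so concurrent
// callers would clobber each other's temp files; this matches the
// "NO NO don't use" warning above the function.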
|
|
|
|
int gbuncompress7 ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ) {
|
|
return gbcompress7(dest,destLen,source,sourceLen,false);
|
|
}
|
|
|
|
/*
|
|
bool XmlDoc::hashSingleTerm ( int64_t termId , HashInfo *hi ) {
|
|
// combine with a non-NULL prefix
|
|
if ( hi->m_prefix ) {
|
|
int64_t prefixHash = hash64b ( hi->m_prefix );
|
|
// sanity test, make sure it is in supported list
|
|
if ( getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
|
|
char *xx=NULL;*xx=0; }
|
|
termId = hash64 ( termId , prefixHash );
|
|
}
|
|
|
|
// save it?
|
|
if ( m_wts && ! ::storeTerm ( "binary",6,termId,hi,0,0,
|
|
MAXDENSITYRANK,
|
|
MAXDIVERSITYRANK,
|
|
MAXWORDSPAMRANK,
|
|
hi->m_hashGroup,
|
|
false,&m_wbuf,m_wts,false) )
|
|
return false;
|
|
|
|
// shortcut
|
|
HashTableX *dt = hi->m_tt;
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key_t) ) { char *xx=NULL;*xx=0; }
|
|
// make the key like we do in hashWords()
|
|
key96_t k;
|
|
k.n1 = hi->m_date;
|
|
k.n0 = termId;
|
|
// get current score for this wordid
|
|
int32_t slot = dt->getSlot ( &k );
|
|
// does this termid/date already exist?
|
|
if ( slot >= 0 ) {
|
|
// done
|
|
return true;
|
|
}
|
|
// otherwise, add a new slot
|
|
char val = 1;
|
|
if ( ! hi->m_tt->addKey ( (char *)k , &val ) )
|
|
return false;
|
|
// return true on success
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
bool storeTerm ( char *s ,
|
|
int32_t slen ,
|
|
int64_t termId ,
|
|
HashInfo *hi ,
|
|
int32_t wordNum ,
|
|
int32_t wordPos ,
|
|
char densityRank,
|
|
char diversityRank ,
|
|
char wordSpamRank ,
|
|
char hashGroup,
|
|
//bool isPhrase ,
|
|
SafeBuf *wbuf ,
|
|
HashTableX *wts ,
|
|
char synSrc ,
|
|
char langId ,
|
|
POSDBKEY key ) {
|
|
|
|
// store prefix
|
|
int32_t poff = wbuf->length();
|
|
// shortcut
|
|
char *p = hi->m_prefix;
|
|
// add the prefix too!
|
|
if ( p && ! wbuf->safeMemcpy(p,gbstrlen(p)+1)) return false;
|
|
// none?
|
|
if ( ! p ) poff = -1;
|
|
|
|
|
|
// store description
|
|
int32_t doff = wbuf->length();
|
|
// shortcut
|
|
char *d = hi->m_desc;
|
|
// add the desc too!
|
|
if ( d && ! wbuf->safeMemcpy(d,gbstrlen(d)+1) ) return false;
|
|
// none?
|
|
if ( ! d ) doff = -1;
|
|
|
|
// store term
|
|
int32_t toff = wbuf->length();
|
|
// add it
|
|
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
|
|
|
|
// make this
|
|
TermDebugInfo ti;
|
|
ti.m_termOff = toff;
|
|
ti.m_termLen = slen;
|
|
ti.m_descOff = doff;
|
|
ti.m_prefixOff = poff;
|
|
ti.m_date = hi->m_date;
|
|
ti.m_shardByTermId = hi->m_shardByTermId;
|
|
ti.m_termId = termId;
|
|
//ti.m_weight = 1.0;
|
|
//ti.m_spam = -1.0;
|
|
ti.m_diversityRank = diversityRank;
|
|
ti.m_densityRank = densityRank;
|
|
ti.m_wordSpamRank = wordSpamRank;
|
|
ti.m_hashGroup = hashGroup;
|
|
ti.m_wordNum = wordNum;
|
|
ti.m_wordPos = wordPos;
|
|
ti.m_langId = langId;
|
|
ti.m_key = key;
|
|
|
|
// was sitehash32
|
|
//ti.m_facetVal32 = hi->m_facetVal32;//sentHash32 = hi->m_sentHash32;
|
|
|
|
// save for printing out an asterisk
|
|
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
|
|
|
|
// get language bit vec
|
|
ti.m_langBitVec64 = g_speller.getLangBits64(&termId);
|
|
|
|
//if ( isPhrase ) ti.m_synSrc = SOURCE_NGRAM;
|
|
|
|
/*
|
|
// the weight vec for the words and phrases
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) ti.m_rv[j] = 1.0;
|
|
|
|
int32_t *wscores = NULL;
|
|
|
|
if ( weights && ! isPhrase ) wscores = weights->m_ww;
|
|
if ( weights && isPhrase ) wscores = weights->m_pw;
|
|
|
|
// shortcut
|
|
int32_t i = wordNum;
|
|
|
|
if ( weights && ! weights->m_rvw ) { char *xx=NULL;*xx=0; }
|
|
if ( weights && ! weights->m_rvp ) { char *xx=NULL;*xx=0; }
|
|
|
|
float *rv = NULL;
|
|
if ( weights && ! isPhrase ) rv = &weights->m_rvw[i*MAX_RULES];
|
|
if ( weights && isPhrase ) rv = &weights->m_rvp[i*MAX_RULES];
|
|
|
|
if ( weights ) ti.m_weight = (float)wscores[i] / (float)DW;
|
|
|
|
if ( weights )
|
|
gbmemcpy ( &ti.m_rv, rv , MAX_RULES*sizeof(float));
|
|
|
|
// no, because if this is zero we force it up to 1!
|
|
//if ( weights )
|
|
// ti.m_score32 = (int32_t)((float)ti.m_score32 * ti.m_weight);
|
|
ti.m_score32 = score;
|
|
|
|
if ( isSynonym )
|
|
ti.m_score32 = score;
|
|
*/
|
|
|
|
// make the key
|
|
key96_t k;
|
|
k.n1 = 0; // date
|
|
k.n0 = termId;
|
|
|
|
// store it
|
|
return wts->addKey ( &k , &ti ) ;
|
|
}
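// Note (comment added for clarity, not in the original source): storeTerm()
// only feeds the PageParser.cpp debug display. it appends the prefix,
// description and term text to "wbuf" and records one TermDebugInfo per term
// in "wts", keyed by (date=0, termId); duplicate keys are expected, which is
// why hashWords3() below sanity-checks that wts->m_allowDups is set.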
|
|
|
|
|
|
|
|
bool XmlDoc::hashSingleTerm ( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ) {
|
|
// empty?
|
|
if ( slen <= 0 ) return true;
|
|
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
|
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
|
|
//
|
|
//if ( ! m_storeTermListInfo )
|
|
// return true;
|
|
|
|
|
|
// a single blob hash
|
|
int64_t termId = hash64 ( s , slen );
|
|
// combine with prefix
|
|
int64_t final = termId;
|
|
// combine with a non-NULL prefix
|
|
int64_t prefixHash = 0LL;
|
|
if ( hi->m_prefix ) {
|
|
prefixHash = hash64b ( hi->m_prefix );
|
|
final = hash64 ( termId , prefixHash );
|
|
}
|
|
// call the other guy now
|
|
//return hashSingleTerm ( final , hi );
|
|
|
|
// shortcut
|
|
HashTableX *dt = hi->m_tt;
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key144_t) ) { char *xx=NULL;*xx=0; }
|
|
// make the key like we do in hashWords()
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
final,
|
|
0LL, // docid
|
|
0, // dist
|
|
MAXDENSITYRANK, // density rank
|
|
MAXDIVERSITYRANK, // diversity rank
|
|
MAXWORDSPAMRANK, // wordspamrank
|
|
0, // siterank
|
|
hi->m_hashGroup,
|
|
// we set to docLang in final hash loop
|
|
langUnknown,// langid
|
|
0, // multiplier
|
|
0, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//
|
|
// HACK: mangle the key if its a gbsitehash:xxxx term
|
|
// used for doing "facets" like stuff on section xpaths.
|
|
//
|
|
// no longer do this because we just hash the term
|
|
// gbxpathsitehash1234567 where 1234567 is that hash.
|
|
// but
|
|
//
|
|
//static int64_t s_gbsectionhash = 0LL;
|
|
//if ( ! s_gbsectionhash ) s_gbsectionhash = hash64b("gbsectionhash");
|
|
//if ( prefixHash == s_gbsectionhash )
|
|
// g_posdb.setSectionSentHash32 ( &k, hi->m_sentHash32 );
|
|
|
|
// . otherwise, add a new slot
|
|
// . key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
if ( ! dt->addTerm144 ( &k ) ) return false;
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( m_wts && ! storeTerm ( s,slen,final,hi,
|
|
0, // wordnum
|
|
0, // wordPos,
|
|
MAXDENSITYRANK,
|
|
MAXDIVERSITYRANK,
|
|
MAXWORDSPAMRANK,
|
|
hi->m_hashGroup,
|
|
//false,
|
|
&m_wbuf,
|
|
m_wts,
|
|
SOURCE_NONE, // synsrc
|
|
langUnknown,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
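// Worked example (comment added, not in the original source; the field name
// and value are hypothetical): hashing the value "foo" under prefix "mytag"
// indexes a posdb key whose termid is
//   hash64 ( hash64("foo",3) , hash64b("mytag") )
// with word position 0 and maxed-out density/diversity/word-spam ranks, as
// set up by the g_posdb.makeKey() call above.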
|
|
|
|
bool XmlDoc::hashString ( char *s, HashInfo *hi ) {
|
|
return hashString ( s , gbstrlen(s), hi ); }
|
|
|
|
bool XmlDoc::hashString ( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ) {
|
|
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
|
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t *sni = getSiteNumInlinks();
|
|
return hashString3( s ,
|
|
slen ,
|
|
hi ,
|
|
&m_countTable ,
|
|
m_pbuf ,
|
|
m_wts ,
|
|
&m_wbuf ,
|
|
m_version ,
|
|
*sni ,
|
|
m_niceness );
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashString3( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ,
|
|
HashTableX *countTable ,
|
|
SafeBuf *pbuf ,
|
|
HashTableX *wts ,
|
|
SafeBuf *wbuf ,
|
|
int32_t version ,
|
|
int32_t siteNumInlinks ,
|
|
int32_t niceness ) {
|
|
Words words;
|
|
Bits bits;
|
|
Phrases phrases;
|
|
//Weights weights;
|
|
//Synonyms synonyms;
|
|
if ( ! words.set ( s , slen , version , true , niceness ) )
|
|
return false;
|
|
if ( ! bits.set ( &words , version , niceness ) )
|
|
return false;
|
|
if ( ! phrases.set(&words,&bits,true,false,version,niceness ) )
|
|
return false;
|
|
|
|
// use primary langid of doc
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// words
|
|
//SafeBuf myLangVec;
|
|
//if ( ! setLangVec ( &words , &myLangVec , m_niceness ) )
|
|
// return false;
|
|
//char *langVec = (char *)myLangVec.getBufStart();
|
|
|
|
/*
|
|
// debugBuf for synonyms? yes if we are debugging
|
|
SafeBuf synDebugBuf;
|
|
SafeBuf *sdbp = NULL;
|
|
if ( pbuf || m_storeTermListInfo ) sdbp = &synDebugBuf;
|
|
// now we can set it...
|
|
if ( hi->m_useSynonyms && !synonyms.set(&words,
|
|
NULL, // langVec,
|
|
m_langId,
|
|
&phrases,
|
|
niceness,
|
|
sdbp))
|
|
return false;
|
|
*/
|
|
|
|
// set weights because of count table
|
|
//if ( countTable && ! weights.set ( &words ,
|
|
/*
|
|
if ( hi->m_useWeights &&
|
|
! weights.set ( &words ,
|
|
&phrases ,
|
|
&bits ,
|
|
NULL ,
|
|
pbuf ,
|
|
false ,
|
|
false ,
|
|
version ,
|
|
100 , // titleWeight
|
|
100 , // headerWeight
|
|
countTable ,
|
|
false , // isLinkText
|
|
false , // isCntTable?
|
|
siteNumInlinks ,
|
|
niceness ) )
|
|
return false;
|
|
|
|
Weights *wp = &weights;
|
|
if ( ! hi->m_useWeights ) wp = NULL;
|
|
*/
|
|
|
|
//Synonyms *sp = NULL;
|
|
//if ( hi->m_useSynonyms ) sp = &synonyms;
|
|
|
|
return hashWords3 ( //0 ,
|
|
//words.getNumWords() ,
|
|
hi ,
|
|
&words ,
|
|
&phrases ,
|
|
NULL,//sp , synonyms
|
|
NULL , // sections
|
|
countTable ,
|
|
NULL , // fragvec
|
|
NULL , // wordspamvec
|
|
NULL , // langvec
|
|
langUnknown , // default langid doclangid
|
|
pbuf ,
|
|
wts ,
|
|
wbuf ,
|
|
niceness );
|
|
}
|
|
|
|
bool XmlDoc::hashWords ( //int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
HashInfo *hi ) {
|
|
// sanity checks
|
|
if ( ! m_wordsValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_phrasesValid ) { char *xx=NULL; *xx=0; }
|
|
if ( hi->m_useCountTable &&!m_countTableValid){char *xx=NULL; *xx=0; }
|
|
if ( ! m_bitsValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_sectionsValid) { char *xx=NULL; *xx=0; }
|
|
//if ( ! m_synonymsValid) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_fragBufValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_wordSpamBufValid ) { char *xx=NULL; *xx=0; }
|
|
if ( m_wts && ! m_langVectorValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL; *xx=0; }
|
|
// . is the word repeated in a pattern?
|
|
// . this should only be used for document body, for meta tags,
|
|
// inlink text, etc. we should make sure words are unique
|
|
char *wordSpamVec = getWordSpamVec();
|
|
char *fragVec = m_fragBuf.getBufStart();
|
|
char *langVec = m_langVec.getBufStart();
|
|
|
|
return hashWords3( //wordStart ,
|
|
//wordEnd ,
|
|
hi ,
|
|
&m_words ,
|
|
&m_phrases ,
|
|
NULL,//&m_synonyms ,
|
|
&m_sections ,
|
|
&m_countTable ,
|
|
fragVec ,
|
|
wordSpamVec ,
|
|
langVec ,
|
|
m_langId , // defaultLangId docLangId
|
|
m_pbuf ,
|
|
m_wts ,
|
|
&m_wbuf ,
|
|
m_niceness );
|
|
}
|
|
|
|
// . this now uses posdb exclusively
|
|
bool XmlDoc::hashWords3 ( //int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
HashInfo *hi ,
|
|
Words *words ,
|
|
Phrases *phrases ,
|
|
Synonyms *synonyms ,
|
|
Sections *sectionsArg ,
|
|
HashTableX *countTable ,
|
|
char *fragVec ,
|
|
char *wordSpamVec ,
|
|
char *langVec ,
|
|
char docLangId , // default lang id
|
|
//Weights *weights ,
|
|
SafeBuf *pbuf ,
|
|
HashTableX *wts ,
|
|
SafeBuf *wbuf ,
|
|
int32_t niceness ) {
|
|
|
|
//
|
|
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
|
|
//
|
|
//if ( ! m_storeTermListInfo )
|
|
// return true;
|
|
|
|
Sections *sections = sectionsArg;
|
|
// for getSpiderStatusDocMetaList() we don't use sections it'll
|
|
// mess us up
|
|
if ( ! hi->m_useSections ) sections = NULL;
|
|
|
|
// shortcuts
|
|
uint64_t *wids = (uint64_t *)words->getWordIds();
|
|
//nodeid_t *tids = words->m_tagIds;
|
|
uint64_t *pids2 = (uint64_t *)phrases->m_phraseIds2;
|
|
//uint64_t *pids3 = (uint64_t *)phrases->m_phraseIds3;
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// . sanity checks
|
|
// . posdb just uses the full keys with docid
|
|
if ( dt->m_ks != 18 ) { char *xx=NULL;*xx=0; }
|
|
if ( dt->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if provided...
|
|
if ( wts ) {
|
|
if ( wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( wts->m_ds != sizeof(TermDebugInfo)){char *xx=NULL;*xx=0; }
|
|
if ( ! wts->m_allowDups ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// ensure caller set the hashGroup
|
|
if ( hi->m_hashGroup < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// handy
|
|
char **wptrs = words->getWordPtrs();
|
|
int32_t *wlens = words->getWordLens();
|
|
|
|
// hash in the prefix
|
|
uint64_t prefixHash = 0LL;
|
|
int32_t plen = 0;
|
|
if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && plen ) {
|
|
// we gotta make this case insensitive, and skip spaces
|
|
// because if it is 'focal length' we can't search
|
|
// 'focal length:10' because that comes across as TWO terms.
|
|
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
|
|
// . sanity test, make sure it is in supported list
|
|
// . hashing diffbot json output of course fails this so
|
|
// skip in that case if diffbot
|
|
//if ( ! m_isDiffbotJSONObject &&
|
|
// getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
|
|
// if (hi->m_desc&&strcmp(hi->m_desc,"custom meta tag")) {
|
|
// char *xx=NULL;*xx=0; }
|
|
//}
|
|
}
|
|
|
|
bool hashIffUnique = false;
|
|
//if ( hi->m_hashGroup == HASHGROUP_INLINKTEXT ) hashIffUnique = true;
|
|
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
|
|
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
|
|
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,niceness,"uqtbl");
|
|
|
|
///////
|
|
//
|
|
// diversity rank vector.
|
|
//
|
|
///////
|
|
// the final diversity which is a multiplier
|
|
// is converted into a rank from 0-15 i guess.
|
|
// so 'mexico' in "new mexico" should receive a low word score but high
|
|
// phrase score. thus, a search for 'mexico' should not bring up
|
|
// the page for university of new mexico!
|
|
SafeBuf dwbuf;
|
|
if(!getDiversityVec ( words,phrases,countTable,&dwbuf,niceness))
|
|
return false;
|
|
char *wdv = dwbuf.getBufStart();
|
|
|
|
int32_t nw = words->getNumWords();
|
|
|
|
/////
|
|
//
|
|
// calculate density ranks
|
|
//
|
|
/////
|
|
//
|
|
// this now varies depending on the length of the sentence/header etc.
|
|
// so if the hashGroup is not title, link text or meta tag, we have to
|
|
// use a safebuf.
|
|
SafeBuf densBuf;
|
|
// returns false and sets g_errno on error
|
|
if ( ! getDensityRanks((int64_t *)wids,
|
|
nw,//wordStart,
|
|
//wordEnd,
|
|
hi->m_hashGroup,
|
|
&densBuf,
|
|
sections,
|
|
m_niceness))
|
|
return false;
|
|
// a handy ptr
|
|
char *densvec = (char *)densBuf.getBufStart();
|
|
|
|
////////////
|
|
//
|
|
// get word positions
|
|
//
|
|
///////////
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
SafeBuf wpos;
|
|
if ( ! getWordPosVec ( words ,
|
|
sections,
|
|
//wordStart,
|
|
//wordEnd,
|
|
m_dist, // hi->m_startDist,
|
|
fragVec,
|
|
niceness,
|
|
&wpos) ) return false;
|
|
// a handy ptr
|
|
int32_t *wposvec = (int32_t *)wpos.getBufStart();
|
|
|
|
/*
|
|
// show that for debug
|
|
if ( m_docId == 192304365235LL ) {
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
char buf[1000];
|
|
int32_t len = wlens[i];
|
|
if ( len > 900 ) len = 900;
|
|
gbmemcpy(buf,wptrs[i],len);
|
|
buf[len]='\0';
|
|
log("seopipe: wptr=%s pos[%" INT32 "]=%" INT32 "",buf,i,wposvec[i]);
|
|
}
|
|
}
|
|
*/
|
|
|
|
//int32_t wc = 0;
|
|
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
|
|
|
|
int32_t i;
|
|
for ( i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
if ( ! wids[i] ) continue;
|
|
// ignore if in repeated fragment
|
|
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
|
|
// ignore if in style section
|
|
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
|
|
|
|
// do not breach wordpos bits
|
|
if ( wposvec[i] > MAXWORDPOS ) break;
|
|
|
|
// . hash the startHash with the wordId for this word
|
|
// . we must mask it before adding it to the table because
|
|
// this table is also used to hash IndexLists into that come
|
|
// from LinkInfo classes (incoming link text). And when
|
|
// those IndexLists are hashed they used masked termIds.
|
|
// So we should too...
|
|
//uint64_t h = g_indexdb.getTermId ( startHash , wids[i] ) ;
|
|
uint64_t h ;
|
|
if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash );
|
|
else h = wids[i];
|
|
|
|
// . get word spam rank. 0 means not spammed
|
|
// . just mod Weights class to use a weight rank...
|
|
// . and diversity rank
|
|
// . need to separate weights by spam vs. diversity.
|
|
// . maybe just have a diversity class and a pattern class
|
|
// and leave the poor weights class alone
|
|
//int32_t wsr = 0;
|
|
|
|
int32_t hashGroup = hi->m_hashGroup;
|
|
|
|
Section *sx = NULL;
|
|
if ( sp ) {
|
|
sx = sp[i];
|
|
// . this is taken care of in hashTitle()
|
|
// . it is slightly different if the title is
|
|
// multiple sentences because when hashing the
|
|
// body the density rank is per sentence, but in
|
|
// hashTitle we count all the words in the title
|
|
// towards the density rank even if they are
|
|
// in different sentences
|
|
if ( sx->m_flags & SEC_IN_TITLE )
|
|
//hashGroup = HASHGROUP_TITLE;
|
|
continue;
|
|
if ( sx->m_flags & SEC_IN_HEADER )
|
|
hashGroup = HASHGROUP_HEADING;
|
|
if ( sx->m_flags & ( SEC_MENU |
|
|
SEC_MENU_SENTENCE |
|
|
SEC_MENU_HEADER ) )
|
|
hashGroup = HASHGROUP_INMENU;
|
|
}
|
|
|
|
// this is for link text and meta tags mostly
|
|
if ( hashIffUnique ) {
|
|
// skip if already did it
|
|
if ( ut.isInTable ( &h ) ) continue;
|
|
if ( ! ut.addKey ( &h ) ) return false;
|
|
}
|
|
|
|
char ws = 15;
|
|
if ( wordSpamVec ) ws = wordSpamVec[i];
|
|
|
|
// HACK:
|
|
// if this is inlink text, use the wordspamrank to hold the
|
|
// inlinker's site rank!
|
|
if ( hashGroup == HASHGROUP_INLINKTEXT )
|
|
ws = hi->m_linkerSiteRank;
|
|
|
|
// default to the document's primary language if it is not
|
|
// clear what language this word belongs to.
|
|
// if the word is only in german it should be german,
|
|
// otherwise it will be the document's primary language.
|
|
char langId = langUnknown;
|
|
if ( m_wts && langVec ) langId = langVec[i];
|
|
// keep it as the original vector. i'm not sure we use
|
|
// this for anything but for display, so show the user
|
|
// how we made our calculation of the document's primary lang
|
|
//if ( langId == langUnknown ) langId = docLangId;
|
|
|
|
char wd;
|
|
if ( hi->m_useCountTable ) wd = wdv[i];
|
|
else wd = MAXDIVERSITYRANK;
|
|
|
|
// if using posdb
|
|
key144_t k;
|
|
// if ( i == 11429 )
|
|
// log("foo");
|
|
g_posdb.makeKey ( &k ,
|
|
h ,
|
|
0LL,//docid
|
|
wposvec[i], // dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
wd, // diversityRank 0-15
|
|
ws, // wordSpamRank 0-15
|
|
0, // siterank
|
|
hashGroup ,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
false , // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
// get the one we lost
|
|
// char *kstr = KEYSTR ( &k , sizeof(POSDBKEY) );
|
|
// if (!strcmp(kstr,"0x0ca3417544e400000000000032b96bf8aa01"))
|
|
// log("got lost key");
|
|
|
|
// key should NEVER collide since we are always incrementing
|
|
// the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
|
|
|
|
// . make the m_wordPosInfoBuf here because we need to set
|
|
// WordPosInfo::m_wordPtr/m_wordLen.
|
|
// . could also use instead of the "wts" buffer?
|
|
if ( m_doingSEO ) {
|
|
// alloc in 10k chunks
|
|
if ( m_wordPosInfoBuf.getAvail() <
|
|
(int32_t)sizeof(WordPosInfo) ) {
|
|
int32_t newSize = m_wordPosInfoBuf.length();
|
|
newSize += 10000;
|
|
if ( ! m_wordPosInfoBuf.reserve ( newSize ) )
|
|
return false;
|
|
}
|
|
// make it
|
|
WordPosInfo wi;
|
|
wi.m_wordPtr = wptrs[i];
|
|
wi.m_wordLen = wlens[i];
|
|
wi.m_wordPos = wposvec[i];
|
|
wi.m_densityRank = densvec[i];
|
|
wi.m_wordSpamRank = ws;
|
|
wi.m_diversityRank = wd;//v[i];
|
|
wi.m_hashGroup = hashGroup;
|
|
wi.m_trafficGain = 0;
|
|
int32_t cs = sizeof(WordPosInfo);
|
|
if(!m_wordPosInfoBuf.safeMemcpy(&wi,cs)) return false;
|
|
}
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( wts ) {
|
|
if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i,
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
wd,//v[i],
|
|
ws,
|
|
hashGroup,
|
|
//false, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
SOURCE_NONE, // synsrc
|
|
langId ,
|
|
k))
|
|
return false;
|
|
}
|
|
|
|
//
|
|
// STRIP POSSESSIVE WORDS for indexing
|
|
//
|
|
// . for now do simple stripping here
|
|
// . if word is "bob's" hash "bob"
|
|
//
|
|
if ( wlens[i] >= 3 &&
|
|
wptrs[i][wlens[i]-2] == '\'' &&
|
|
to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
|
|
int64_t nah ;
|
|
nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 );
|
|
if ( plen>0 ) nah = hash64 ( nah , prefixHash );
|
|
g_posdb.makeKey ( &k ,
|
|
nah,
|
|
0LL,//docid
|
|
wposvec[i], // dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
wd,//v[i], // diversityRank ,
|
|
ws, // wordSpamRank ,
|
|
0, //siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
// keep going if not debug
|
|
if ( ! wts ) continue;
|
|
// print the synonym
|
|
if ( ! storeTerm(wptrs[i], // synWord,
|
|
wlens[i] -2, // gbstrlen(synWord),
|
|
nah, // termid
|
|
hi,
|
|
i, // wordnum
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
wd,//v[i],
|
|
ws,
|
|
hashGroup,
|
|
//false, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
SOURCE_GENERATED,
|
|
langId,
|
|
k) )
|
|
return false;
|
|
}
|
|
|
|
|
|
/////////////
|
|
//
|
|
// synonyms (alt words,morphs,synonyms)
|
|
//
|
|
/////////////
|
|
/*
|
|
int64_t *aids = NULL;
|
|
int16_t naids = 0;
|
|
int64_t syh;
|
|
if ( synonyms ) {
|
|
aids = synonyms->getAltIds (i);
|
|
naids = synonyms->getNumAlts(i);
|
|
//ascore = saved / 4;
|
|
//if ( ascore <= 0 ) ascore = 1;
|
|
//asaved = ascore;
|
|
}
|
|
for ( int32_t j = 0 ; j < naids ; j++ ) {
|
|
// skip if same as original
|
|
if ( (uint64_t)aids[j] == wids[i] ) continue;
|
|
// . hash it with the prefix if any
|
|
// . fixes gbwhere:galleries bug...
|
|
if ( plen>0 ) syh = hash64 ( aids[j] , prefixHash );
|
|
else syh = aids[j];
|
|
g_posdb.makeKey ( &k ,
|
|
syh ,
|
|
0LL,//docid
|
|
wposvec[i], // dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
wdv[i], // diversityRank ,
|
|
ws, // wordSpamRank ,
|
|
0, //siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false ); // delkey?
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
|
|
// keep going if not debug
|
|
if ( ! wts ) continue;
|
|
// get the junk
|
|
char *synWord = synonyms->getStringFromId(&aids[j]);
|
|
// sanity
|
|
if ( ! synWord ) { char *xx=NULL;*xx=0; }
|
|
// print the synonym
|
|
if ( ! storeTerm(synWord,
|
|
gbstrlen(synWord),
|
|
syh, // termid
|
|
hi,
|
|
i, // wordnum
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
wdv[i],
|
|
ws,
|
|
hashGroup,
|
|
//false, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
synonyms->m_source[i], // synsrc
|
|
langId) )
|
|
return false;
|
|
}
|
|
*/
|
|
|
|
////////
|
|
//
|
|
// two-word phrase
|
|
//
|
|
////////
|
|
|
|
int64_t npid = pids2[i];
|
|
int32_t npw = 2;
|
|
uint64_t ph2 = 0;
|
|
|
|
// repeat for the two word hash if different!
|
|
if ( npid ) {
|
|
// hash with prefix
|
|
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
|
|
else ph2 = npid;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0LL,//docid
|
|
wposvec[i],//dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK, //phrase
|
|
ws, // wordSpamRank ,
|
|
0,//siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
}
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( wts && npid ) {
|
|
// get phrase as a string
|
|
int32_t plen;
|
|
char *phr=phrases->getPhrase(i,&plen,npw);
|
|
// store it
|
|
if ( ! storeTerm ( phr,plen,ph2,hi,i,
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK,//phrase
|
|
ws,
|
|
hashGroup,
|
|
//true,
|
|
wbuf,
|
|
wts,
|
|
SOURCE_BIGRAM, // synsrc
|
|
langId,
|
|
k) )
|
|
return false;
|
|
}
|
|
|
|
////////
|
|
//
|
|
// three-word phrase
|
|
//
|
|
////////
|
|
/*
|
|
npid = pids3[i];
|
|
npw = 3;
|
|
|
|
// repeat for the two word hash if different!
|
|
if ( npid ) {
|
|
// hash with prefix
|
|
uint64_t ph2 ;
|
|
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
|
|
else ph2 = npid;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0LL,//docid
|
|
wposvec[i],//dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK, //phrase
|
|
ws, // wordSpamRank ,
|
|
0,//siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false ); // delkey?
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
}
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( wts && npid ) {
|
|
// get phrase as a string
|
|
int32_t plen;
|
|
char *phr=phrases->getPhrase(i,&plen,npw);
|
|
// store it
|
|
if ( ! storeTerm ( phr,plen,ph2,hi,i,
|
|
wposvec[i], // wordpos
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK,//phrase
|
|
ws,
|
|
hashGroup,
|
|
//true, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
SOURCE_TRIGRAM, // synsrc
|
|
langId ) )
|
|
return false;
|
|
}
|
|
*/
|
|
// update for hashIncomingLinkText()
|
|
//hi->m_startDist = wposvec[i];
|
|
|
|
// debug point
|
|
//if ( ph2 == (uint64_t)-233869093807964777LL ) {
|
|
// log("hey slot=%" INT32 " date=%" UINT32 " n0=%" INT64 " score=%" INT32 "",
|
|
// slot,
|
|
// k.n1,k.n0,
|
|
// score);
|
|
// //char *xx=NULL;*xx=0;
|
|
//}
|
|
|
|
//
|
|
// NUMERIC SORTING AND RANGES
|
|
//
|
|
|
|
// only store numbers in fields this way
|
|
if ( prefixHash == 0 ) continue;
|
|
|
|
// this may or may not be numeric.
|
|
if ( ! is_digit ( wptrs[i][0] ) ) continue;
|
|
|
|
// this might have to "back up" before any '.' or '-' symbols
|
|
if ( ! hashNumber ( wptrs[0] ,
|
|
wptrs[i] ,
|
|
wlens[i] ,
|
|
hi ) )
|
|
return false;
|
|
}
|
|
|
|
// hash a single term so they can do gbfacet:ext or
|
|
// gbfacet:siterank or gbfacet:price. a field on a field.
|
|
if ( prefixHash && words->m_numWords )
|
|
// hash gbfacet:price with and store the price in the key
|
|
hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi );
|
|
|
|
// between calls? i.e. hashTitle() and hashBody()
|
|
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
|
|
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
|
|
|
|
return true;
|
|
}
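// Summary (comment added for clarity, not in the original source): for each
// indexable alnum word the loop above adds (1) the word's posdb key, hashed
// against the prefix if one was given, (2) a possessive-stripped variant when
// the word ends in 's (so "bob's" also indexes "bob", flagged as a synonym),
// (3) the two-word bigram key under SOURCE_BIGRAM, and (4) the numeric
// gbsortby* terms when the field has a prefix and the word starts with a
// digit, plus one gbfacet* term per prefixed field after the loop.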
|
|
|
|
// just like hashNumber*() functions but we use "gbfacet" as the
|
|
// primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby,
|
|
// gbsortbyint, gbrevsortby, gbrevsortbyint
|
|
bool XmlDoc::hashFacet1 ( char *term ,
|
|
Words *words ,
|
|
HashTableX *tt ) {
|
|
|
|
// need a prefix
|
|
//if ( ! hi->m_prefix ) return true;
|
|
|
|
// hash the ENTIRE content, all words as one blob
|
|
int32_t nw = words->getNumWords();
|
|
char *a = words->m_words[0];
|
|
char *b = words->m_words[nw-1]+words->m_wordLens[nw-1];
|
|
// hash the whole string as one value, the value of the facet
|
|
int32_t val32 = hash32 ( a , b - a );
|
|
|
|
if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;
|
|
|
|
//
|
|
// why do this if we already do it for hashNumber() using gbsortby: ?
|
|
//
|
|
|
|
/*
|
|
// if it's a number hash as float and int
|
|
if ( nw != 1 ) return true;
|
|
char **wptrs = words->m_words;
|
|
if ( ! is_digit ( wptrs[0][0] ) ) return true;
|
|
|
|
// hash with a float val
|
|
float f = atof(wptrs[0]);
|
|
int32_t vf32 = *(int32_t *)&f;
|
|
if ( ! hashFacet2 ( "gbfacetfloat",term, vf32 , tt ) ) return false;
|
|
|
|
// and an int val
|
|
int32_t vi32 = atoi(wptrs[0]);
|
|
if ( ! hashFacet2 ( "gbfacetint",term, vi32 , tt ) ) return false;
|
|
*/
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashFacet2 ( char *prefix,
|
|
char *term ,
|
|
int32_t val32 ,
|
|
HashTableX *tt ,
|
|
// we only use this for gbxpathsitehash terms:
|
|
bool shardByTermId ) {
|
|
|
|
// need a prefix
|
|
//if ( ! hi->m_prefix ) return true;
|
|
//int32_t plen = gbstrlen ( hi->m_prefix );
|
|
//if ( plen <= 0 ) return true;
|
|
// we gotta make this case insensitive, and skip spaces
|
|
// because if it is 'focal length' we can't search
|
|
// 'focal length:10' because that comes across as TWO terms.
|
|
//int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen);
|
|
|
|
// now any field has to support gbfacet:thatfield
|
|
// and store the 32-bit termid into where we normally put
|
|
// the word position bits, etc.
|
|
//static int64_t s_facetPrefixHash = 0LL;
|
|
//if ( ! s_facetPrefixHash )
|
|
// s_facetPrefixHash = hash64n ( "gbfacet" );
|
|
|
|
// this is case-sensitive
|
|
int64_t prefixHash = hash64n ( prefix );
|
|
|
|
// term is like something like "object.price" or whatever.
|
|
// it is the json field itself, or the meta tag name, etc.
|
|
int64_t termId64 = hash64n ( term );
|
|
|
|
// combine with the "gbfacet" prefix. old prefix hash on right.
|
|
// like "price" on right and "gbfacetfloat" on left... see Query.cpp.
|
|
int64_t ph2 = hash64 ( termId64, prefixHash );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
g_posdb.setInt ( &k , val32 );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
HashTableX *dt = tt;//hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
bool isFloat = false;
|
|
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
|
|
|
|
// store in buffer for display on pageparser.cpp output
|
|
char buf[130];
|
|
if ( isFloat )
|
|
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
|
|
*(float *)&val32);
|
|
else
|
|
snprintf(buf,128,"facetField=%s facetVal32=%" UINT32 "",
|
|
term,(uint32_t)val32);
|
|
int32_t bufLen = gbstrlen(buf);
|
|
|
|
// make a special hashinfo for this facet
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
// the full prefix
|
|
char fullPrefix[66];
|
|
snprintf(fullPrefix,64,"%s:%s",prefix,term);
|
|
hi.m_prefix = fullPrefix;//"gbfacet";
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
ph2, // prefixHash, // s_facetPrefixHash,
|
|
&hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
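// Worked example (comment added, not in the original source): for a facet on
// a JSON field named "price", hashFacet1() above computes
//   val32 = hash32 ( <the field's text> )
// and this routine stores a posdb key whose termid is
//   hash64 ( hash64n("price") , hash64n("gbfacetstr") )
// with val32 packed into the key by g_posdb.setInt() and the alignment bit
// cleared so addTable144() treats it as a numeric key rather than an
// ordinary position key.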
|
|
|
|
bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
|
|
|
|
HashTableX *tt = hi->m_tt;
|
|
|
|
uint64_t val64 = hash64 ( val , vlen );
|
|
|
|
// term is like something like "object.price" or whatever.
|
|
// it is the json field itself, or the meta tag name, etc.
|
|
uint64_t middlePrefix = hash64n ( hi->m_prefix );
|
|
|
|
// hash "This is a new product." with "object.desc".
|
|
// "object.desc" (termId64) is case-sensitive.
|
|
uint64_t composite = hash64 ( val64 , middlePrefix );
|
|
|
|
// hash that with "gbfieldmatch"
|
|
char *prefix = "gbfieldmatch";
|
|
uint64_t prefixHash = hash64n ( prefix );
|
|
uint64_t ph2 = hash64 ( composite , prefixHash );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
false ) ; // shardByTermId? no, by docid.
|
|
|
|
HashTableX *dt = tt;//hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer for display on pageparser.cpp output
|
|
char buf[128];
|
|
int32_t bufLen ;
|
|
bufLen = sprintf(buf,"gbfieldmatch:%s:%" UINT64 "",hi->m_prefix,val64);
|
|
|
|
// make a special hashinfo for this facet
|
|
HashInfo hi2;
|
|
hi2.m_tt = tt;
|
|
// the full prefix
|
|
char fullPrefix[64];
|
|
snprintf(fullPrefix,62,"%s:%s",prefix,hi->m_prefix);
|
|
hi2.m_prefix = fullPrefix;//"gbfacet";
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
ph2, // prefixHash, // s_facetPrefixHash,
|
|
&hi2,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of the
|
|
// posdb key
|
|
// . the termid is the hash of the preceding field
|
|
// . in json docs a field is like "object.details.price"
|
|
// . in meta tags it is just the meta tag name
|
|
// . credit card numbers are 16 digits. we'd need like 58 bits to store those
|
|
// so we can't do that here, but we can approximate as a float
|
|
// . the binary representation of floating point numbers is ordered in the
|
|
// same order as the floating points themselves! so we are lucky and can
|
|
// keep our usually KEYCMP sorting algos to keep the floats in order.
|
|
bool XmlDoc::hashNumber ( char *beginBuf ,
|
|
char *buf ,
|
|
int32_t bufLen ,
|
|
HashInfo *hi ) {
|
|
|
|
if ( ! is_digit(buf[0]) ) return true;
|
|
|
|
char *p = buf;
|
|
char *bufEnd = buf + bufLen;
|
|
|
|
// back-up over any .
|
|
if ( p > beginBuf && p[-1] == '.' ) p--;
|
|
|
|
// negative sign?
|
|
if ( p > beginBuf && p[-1] == '-' ) p--;
|
|
|
|
// . convert it to a float
|
|
// . this now allows for commas in numbers like "1,500.62"
|
|
float f = atof2 ( p , bufEnd - p );
|
|
|
|
// debug
|
|
//log("build: hashing %s %f",hi->m_prefix,f);
|
|
|
|
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
|
|
return false;
|
|
|
|
// also hash in reverse order for sorting from low to high
|
|
f = -1.0 * f;
|
|
|
|
if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
|
|
return false;
|
|
|
|
//
|
|
// also hash as an int, a 4-byte integer, so our lastSpidered timestamps
|
|
// don't lose 128 seconds of resolution
|
|
//
|
|
|
|
int32_t i = (int32_t) atoll2 ( p , bufEnd - p );
|
|
|
|
if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) )
|
|
return false;
|
|
|
|
// also hash in reverse order for sorting from low to high
|
|
i = -1 * i;
|
|
|
|
if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) )
|
|
return false;
|
|
|
|
|
|
return true;
|
|
}
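// A small, guarded-out sketch illustrating the resolution comment in
// hashNumber() above: near current unix timestamps (~1.4e9) a 32-bit float can
// only represent multiples of 2^(30-23) = 128, so two lastSpidered times less
// than ~128 seconds apart can collapse to the same float, which is why the
// integer variants (gbsortbyint/gbrevsortbyint) are hashed as well. The two
// timestamps below are arbitrary example values, not taken from this codebase.
#if 0
#include <cstdio>
#include <cstdint>
static void floatTimestampResolutionDemo ( ) {
	int32_t t1 = 1400000000;           // some spider time
	int32_t t2 = 1400000050;           // 50 seconds later
	float   f1 = (float)t1;
	float   f2 = (float)t2;
	// prints "equal as floats? 1" because both round to 1400000000.0f
	printf ( "equal as floats? %d\n" , (int)(f1 == f2) );
	// as 32-bit ints they stay distinct, so sorting by spider time works
	printf ( "equal as ints?   %d\n" , (int)(t1 == t2) );
}
#endif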
|
|
|
|
// . THIS IS NOW replaced by ::hashFacet2() being called by hashSections()
|
|
// above. it is a more generic, faceted approach.
|
|
// . the term is gbxpathsite123456, the prefix is gbfacet, the val32
|
|
// stored in the posdb key is the inner html hash of the section, and
|
|
// the "123456" is the hash of the xpath and site. so the field names
|
|
// are very custom, not your typical "ext" or "title"
|
|
// . CHROME DETECTION
|
|
// . hash a special "gbxpathsitehash12345678" term which has the hash of the
|
|
// innerHTML content embedded in it.
|
|
// . we do this for doing gbfacetstr:gbxpathsitehash12345678 etc. on every
|
|
// section with innerHTML so we can figure out the histogram of each
|
|
// section on this page relative to its subdomain. like the distribution
|
|
// of the innerHTML for this section as it appears on other pages from
|
|
// this site. this allows killer CHROME DETECTION!!!!
|
|
/*
|
|
bool XmlDoc::hashSectionTerm ( char *term , HashInfo *hi , int32_t sentHash32 ) {
|
|
|
|
int64_t termId = hash64 ( term , gbstrlen(term) );
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
termId,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
g_posdb.setInt ( &k , sentHash32 );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
// sanity
|
|
int t = g_posdb.getInt ( &k );
|
|
if ( t != sentHash32 ) { char *xx=NULL;*xx=0; }
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer
|
|
//char buf[128];
|
|
//int32_t bufLen = sprintf(buf,"%" UINT32 "",sentHash32);
|
|
|
|
// if no gbmin or gbmax or gbsorty or gbrevsortby we need gbfacet
|
|
//int64_t truePrefix64 = hash64n ( "gbfacet" );
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( term,//buf,
|
|
gbstrlen(term),//bufLen,
|
|
0LL,//truePrefix64,
|
|
hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
|
|
|
|
// prefix is something like price. like the meta "name" or
|
|
// the json name with dots in it like "product.info.price" or something
|
|
int64_t nameHash = 0LL;
|
|
int32_t nameLen = 0;
|
|
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && nameLen )
|
|
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
|
|
// need a prefix for hashing numbers... for now
|
|
else { char *xx=NULL; *xx=0; }
|
|
|
|
// combine prefix hash with a special hash to make it unique to avoid
|
|
// collisions. this is the "TRUE" prefix.
|
|
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
|
|
// hash with the "TRUE" prefix
|
|
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
g_posdb.setFloat ( &k , f );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
// sanity
|
|
float t = g_posdb.getFloat ( &k );
|
|
if ( t != f ) { char *xx=NULL;*xx=0; }
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer
|
|
char buf[128];
|
|
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
|
|
int32_t bufLen = gbstrlen(buf);
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
truePrefix64,
|
|
hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
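// A minimal, guarded-out sketch of the ordering property the float keys above
// rely on: for non-negative IEEE-754 floats the raw 32-bit patterns compare
// (as unsigned ints, i.e. the way key compares see them) in the same order as
// the float values themselves, so an ordinary key sort keeps a gbsortby
// termlist in numeric order. Negative floats do not have this property in raw
// sign-magnitude form, so they would need extra care; this demo only asserts
// the non-negative case. Illustrative only, not part of XmlDoc.
#if 0
#include <cstdint>
#include <cstring>
#include <cassert>
static uint32_t floatBits ( float f ) {
	uint32_t u;
	memcpy ( &u , &f , 4 );
	return u;
}
static void floatBitOrderDemo ( ) {
	float a = 0.5f , b = 1.0f , c = 1500.62f;
	// non-negative floats: value order == bit-pattern order
	assert ( floatBits(a) < floatBits(b) && floatBits(b) < floatBits(c) );
	// the gbrevsortby trick above: negating every value reverses the
	// numeric order, so an ascending key sort yields a descending list
	assert ( -c < -b && -b < -a );
}
#endif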
|
|
|
|
bool XmlDoc::hashNumber3 ( int32_t n , HashInfo *hi , char *sortByStr ) {
|
|
|
|
// prefix is something like price. like the meta "name" or
|
|
// the json name with dots in it like "product.info.price" or something
|
|
int64_t nameHash = 0LL;
|
|
int32_t nameLen = 0;
|
|
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && nameLen )
|
|
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
|
|
// need a prefix for hashing numbers... for now
|
|
else { char *xx=NULL; *xx=0; }
|
|
|
|
// combine prefix hash with a special hash to make it unique to avoid
|
|
// collisions. this is the "TRUE" prefix.
|
|
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
|
|
// hash with the "TRUE" prefix
|
|
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
//g_posdb.setFloat ( &k , f );
|
|
g_posdb.setInt ( &k , n );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
// sanity
|
|
//float t = g_posdb.getFloat ( &k );
|
|
int32_t x = g_posdb.getInt ( &k );
|
|
if ( x != n ) { char *xx=NULL;*xx=0; }
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer
|
|
char buf[128];
|
|
snprintf(buf,126,"%s:%s int32=%" INT32 "",sortByStr, hi->m_prefix,n);
|
|
int32_t bufLen = gbstrlen(buf);
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
truePrefix64,
|
|
hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
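// Hedged aside on the gbrevsortbyint trick used in hashNumber() above: it
// negates the int32 ("i = -1 * i") so that an ascending sort of the reversed
// termlist returns values from high to low. One edge case worth noting is that
// negating INT32_MIN overflows a 32-bit int (undefined behavior), so extreme
// values would need special handling. A tiny guarded sketch of the idea; the
// helper name is made up and not part of this codebase:
#if 0
#include <cstdint>
static int32_t revSortKey ( int32_t n ) {
	// clamp the one value whose negation does not fit; this collides with
	// the key for INT32_MIN+1, which is usually acceptable for a sort key
	if ( n == INT32_MIN ) return INT32_MAX;
	// larger n --> smaller key, so ascending key order = descending value
	return -n;
}
#endif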
|
|
|
|
// . many, many websites have hijacked pages in them...
|
|
// . revkim.org/mcdrt/mgntf/sata/sata.htm
|
|
// . collegefootballweekly.net/hswsj/riime/sata/sata.htm
|
|
char *XmlDoc::getIsHijacked() {
|
|
bool hj = false;
|
|
if ( ! hj ) hj = isHijackerFormat ( ptr_firstUrl );
|
|
if ( ! hj ) hj = isHijackerFormat ( ptr_redirUrl );
|
|
if ( ! hj ) {
|
|
m_isHijacked = false;
|
|
m_isHijackedValid = true;
|
|
return &m_isHijacked;
|
|
}
|
|
uint32_t *h1 = getTagPairHash32();
|
|
if ( ! h1 || h1 == (void *)-1 ) return (char *)h1;
|
|
// TODO: check it for the malicious tag formats here!!
|
|
m_isHijacked = false;
|
|
m_isHijackedValid = true;
|
|
return &m_isHijacked;
|
|
}
|
|
|
|
// is it a custom error page? ppl do not always use status 404!
|
|
char *XmlDoc::getIsErrorPage ( ) {
|
|
if ( m_isErrorPageValid ) return &m_isErrorPage;
|
|
|
|
setStatus ( "getting is error page");
|
|
|
|
// need a buncha crap
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
// get local link info
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// error or blocked
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
// get remote link info
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
// error or blocked
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
|
|
// convenience
|
|
LinkInfo *info2 = *pinfo2;
|
|
|
|
// default
|
|
LinkInfo *li = info1;
|
|
|
|
//we have to be more sophisticated with longer pages because they
|
|
//could actually be talking about an error message.
|
|
//if(xml->getContentLen() > 4096) return false;
|
|
|
|
|
|
// assume not
|
|
m_isErrorPage = false;
|
|
m_isErrorPageValid = true;
|
|
|
|
int32_t nn = xml->getNumNodes();
|
|
int32_t i;
|
|
|
|
char* s;
|
|
int32_t len;
|
|
int32_t len2;
|
|
|
|
char* errMsg = NULL;
|
|
|
|
int32_t numChecked = 0;
|
|
// check the first header and title tag
|
|
// limit it to first 32 nodes
|
|
if(nn > 32) nn = 32;
|
|
for ( i = 0 ; i < nn ; i++ ) {
|
|
switch(xml->getNodeId(i)) {
|
|
case TAG_TITLE:
|
|
case TAG_H1:
|
|
case TAG_H2:
|
|
case TAG_H3:
|
|
case TAG_SPAN:
|
|
char* p = xml->getString(i,true,&len);
|
|
if(len == 0 || len > 1024) continue;
|
|
char* pend = p + len;
|
|
errMsg = matchErrorMsg(p, pend );
|
|
++numChecked;
|
|
break;
|
|
}
|
|
if(errMsg || numChecked > 1) break;
|
|
}
|
|
if(!errMsg) return &m_isErrorPage;
|
|
len = gbstrlen(errMsg);
|
|
|
|
// make sure the error message was not present in the link text
|
|
loop:
|
|
if ( li && li->getNumGoodInlinks() > 5 ) return &m_isErrorPage;
|
|
for (Inlink *k=NULL;li && (k=li->getNextInlink(k)); ) {
|
|
//int32_t nli = li->getNumLinkTexts();
|
|
//if we can index some link text from the page, then do it
|
|
//if(nli > 5) return false;
|
|
//for ( int32_t i = 0 ; i < nli ; i++ ) {
|
|
s = k->getLinkText();
|
|
len2 = k->size_linkText - 1; // exclude \0
|
|
//if(!s) break;
|
|
//allow error msg to contain link text or vice versa
|
|
if(len < len2) {
|
|
if(strncasestr(errMsg, s,len,len2) != NULL)
|
|
return &m_isErrorPage;
|
|
}
|
|
else {
|
|
if(strncasestr(s, errMsg,len2,len) != NULL)
|
|
return &m_isErrorPage;
|
|
}
|
|
}
|
|
|
|
if ( li ) { li = info2; info2 = NULL; goto loop; }
|
|
|
|
m_isErrorPage = true;
|
|
return &m_isErrorPage;
|
|
}
|
|
|
|
|
|
char* XmlDoc::matchErrorMsg(char* p, char* pend ) {
|
|
char utf8Buf[1024];
|
|
// int32_t utf8Len = 0;
|
|
int32_t len = pend - p;
|
|
|
|
if(len > 1024) len = 1024;
|
|
pend = p + len;
|
|
char* tmp = utf8Buf;
|
|
while(p < pend) {
|
|
*tmp = to_lower_a(*p);
|
|
tmp++; p++;
|
|
}
|
|
|
|
p = utf8Buf;
|
|
pend = p + len;
|
|
|
|
char* errMsg = NULL;
|
|
|
|
while(p < pend) {
|
|
int32_t r = pend - p;
|
|
switch (*p) { //sorted by first letter, then by frequency
|
|
case '4':
|
|
errMsg = "404 error";
|
|
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
|
|
errMsg = "403 forbidden";
|
|
if(r>=13&&strncmp(p, errMsg, 13) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'd':
|
|
errMsg = "detailed error information follows";
|
|
if(r>=34&&strncmp(p, errMsg, 34) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'e':
|
|
errMsg = "error 404";
|
|
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
|
|
errMsg = "error was encountered while processing "
|
|
"your request";
|
|
if(r>=51&&strncmp(p, errMsg,51) == 0) return errMsg;
|
|
|
|
errMsg = "error occurred while processing request";
|
|
if(r>=39&&strncmp(p, errMsg, 39) == 0) return errMsg;
|
|
errMsg = "exception error has occurred";
|
|
if(r>=28&&strncmp(p, errMsg,28) == 0) return errMsg;
|
|
errMsg = "error occurred";
|
|
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
|
|
//http://www.gnu.org/fun/jokes/unix.errors.html
|
|
//errMsg = "error message";
|
|
//if(strncmp(p, errMsg, 13) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'f':
|
|
errMsg = "file not found";
|
|
if(r>=14&&strncmp(p, errMsg, 14) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'h':
|
|
errMsg = "has moved";
|
|
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'n':
|
|
errMsg = "no referrer";
|
|
if(r>=11&&strncmp(p, errMsg,11) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'o':
|
|
errMsg = "odbc error code = ";
|
|
if(r>=18&&strncmp(p, errMsg,18) == 0) return errMsg;
|
|
errMsg = "object not found";
|
|
if(r>=16&&strncmp(p, errMsg,16) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'p':
|
|
errMsg = "page not found";
|
|
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
|
|
break;
|
|
|
|
case 's':
|
|
errMsg = "system error";
|
|
if(r>=12&&strncmp(p, errMsg, 12) == 0) return errMsg;
|
|
break;
|
|
case 't':
|
|
errMsg = "the application encountered an "
|
|
"unexpected problem";
|
|
if(r>=49&&strncmp(p, errMsg, 49) == 0) return errMsg;
|
|
errMsg = "the page you requested has moved";
|
|
if(r>=32&&strncmp(p, errMsg, 32) == 0) return errMsg;
|
|
errMsg = "this page has moved";
|
|
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'u':
|
|
errMsg = "unexpected problem has occurred";
|
|
if(r>=31&&strncmp(p, errMsg, 31) == 0) return errMsg;
|
|
errMsg = "unexpected error has occurred";
|
|
if(r>=29&&strncmp(p, errMsg, 29) == 0) return errMsg;
|
|
errMsg = "unexpected problem occurred";
|
|
if(r>=27&&strncmp(p, errMsg, 27) == 0) return errMsg;
|
|
errMsg ="unexpected error occurred";
|
|
if(r>=25&&strncmp(p, errMsg, 25) == 0) return errMsg;
|
|
errMsg ="unexpected result has occurred";
|
|
if(r>=30&&strncmp(p, errMsg, 30) == 0) return errMsg;
|
|
errMsg ="unhandled exception";
|
|
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
|
|
|
|
break;
|
|
|
|
case 'y':
|
|
errMsg = "you have been blocked";
|
|
if(r>=21&&strncmp(p, errMsg, 21) == 0) return errMsg;
|
|
break;
|
|
}
|
|
//skip to the beginning of the next word
|
|
while(p < pend && !is_wspace_a(*p)) p++;
|
|
while(p < pend && is_wspace_a(*p)) p++;
|
|
}
|
|
return NULL;
|
|
}
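// A short, guarded-out usage sketch for matchErrorMsg() above: it takes a raw
// text span (it lowercases a copy internally), scans word by word using the
// first letter to pick candidate phrases, and returns the matched error phrase
// or NULL. The sample text below is invented, and the sketch assumes
// matchErrorMsg() is callable from here.
#if 0
static void matchErrorMsgDemo ( XmlDoc *xd ) {
	char sample[] = "Sorry - Page Not Found. Try the homepage instead.";
	char *hit = xd->matchErrorMsg ( sample , sample + gbstrlen(sample) );
	// hit points at the static string "page not found" here; a page whose
	// title/header text yields a hit like this is only flagged by
	// getIsErrorPage() if the phrase is not also in the inlink text.
	if ( hit ) log ( "build: matched error phrase '%s'" , hit );
}
#endif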
|
|
|
|
#include "Spider.h"
|
|
|
|
static SafeBuf *s_wbuf = NULL;
|
|
|
|
// . this is used by gbsort() below
|
|
// . sorts TermDebugInfos alphabetically, by prefix first and then by term
|
|
int cmptp (const void *v1, const void *v2) {
|
|
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
|
|
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
|
|
|
|
char *start = s_wbuf->getBufStart();
|
|
|
|
// prefix first
|
|
char *ps1 = start + t1->m_prefixOff;
|
|
char *ps2 = start + t2->m_prefixOff;
|
|
if ( t1->m_prefixOff < 0 ) ps1 = NULL;
|
|
if ( t2->m_prefixOff < 0 ) ps2 = NULL;
|
|
int32_t plen1 = 0; if ( ps1 ) plen1 = gbstrlen(ps1);
|
|
int32_t plen2 = 0; if ( ps2 ) plen2 = gbstrlen(ps2);
|
|
int32_t pmin = plen1;
|
|
if ( plen2 < pmin ) pmin = plen2;
|
|
int32_t pn = strncmp ( ps1 , ps2 , pmin );
|
|
if ( pn ) return pn;
|
|
if ( plen1 != plen2 ) return ( plen1 - plen2 );
|
|
|
|
// return if groups differ
|
|
int32_t len1 = t1->m_termLen;
|
|
int32_t len2 = t2->m_termLen;
|
|
int32_t min = len1;
|
|
if ( len2 < min ) min = len2;
|
|
char *s1 = start + t1->m_termOff;
|
|
char *s2 = start + t2->m_termOff;
|
|
int32_t n = strncasecmp ( s1 , s2 , min );
|
|
if ( n ) return n;
|
|
// . if length same, we are tied
|
|
// . otherwise, prefer the shorter
|
|
return ( len1 - len2 );
|
|
}
|
|
|
|
// . this is used by gbsort() above
|
|
// . sorts TermDebugInfos by their TermDebugInfo::m_wordPos member
|
|
int cmptp2 (const void *v1, const void *v2) {
|
|
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
|
|
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
|
|
// word position first
|
|
int32_t d = t1->m_wordPos - t2->m_wordPos;
|
|
if ( d ) return d;
|
|
// secondly drop back to hashgroup i guess
|
|
//d = t1->m_hashGroup - t2->m_hashGroup;
|
|
d = t1->m_synSrc - t2->m_synSrc;
|
|
if ( d ) return d;
|
|
// word len
|
|
d = t1->m_termLen - t2->m_termLen;
|
|
if ( d ) return d;
|
|
return 0;
|
|
}
|
|
|
|
bool printLangBits ( SafeBuf *sb , TermDebugInfo *tp ) {
|
|
|
|
char printed = false;
|
|
if ( tp->m_synSrc ) {
|
|
sb->safePrintf(" ");
|
|
printed = true;
|
|
}
|
|
int32_t j = 0;
|
|
if ( printed ) j = MAX_LANGUAGES;
|
|
for ( ; j < MAX_LANGUAGES ; j++ ) {
|
|
int64_t mask = 1LL << j;
|
|
//if ( j == tp->m_langId )
|
|
// sb->safePrintf("[%s]",
|
|
// getLangAbbr(tp->m_langId));
|
|
if ( ! (tp->m_langBitVec64 & mask) ) continue;
|
|
char langId = j+1;
|
|
// match in langvec? that means even if the
|
|
// word is in multiple languages we put it in
|
|
// this language because we intersect its lang bit
|
|
// vec with its neighbors in the sliding window
|
|
// algo in setLangVector.
|
|
if ( langId == tp->m_langId )
|
|
sb->safePrintf("<b>");
|
|
sb->safePrintf("%s ", getLangAbbr(langId) );
|
|
if ( langId == tp->m_langId )
|
|
sb->safePrintf("</b>");
|
|
printed = true;
|
|
}
|
|
if ( ! printed ) {
|
|
sb->safePrintf("??");
|
|
}
|
|
return true;
|
|
}
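// The comments in printLangBits() refer to the sliding-window intersection of
// per-word language bit vectors done in setLangVector(). The guarded sketch
// below is NOT that implementation, just a minimal illustration of the idea:
// a word that could belong to several languages keeps only the language bits
// it shares with its neighbors, when that intersection is non-empty. Window
// size and tie-breaking here are arbitrary choices for the example.
#if 0
#include <cstdint>
static void intersectLangWindow ( int64_t *vecs , int32_t nw , int64_t *out ) {
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		int64_t v = vecs[i];
		// intersect with each immediate neighbor unless doing so
		// would wipe the vector out completely
		if ( i > 0      && ( v & vecs[i-1] ) ) v &= vecs[i-1];
		if ( i + 1 < nw && ( v & vecs[i+1] ) ) v &= vecs[i+1];
		out[i] = v;
	}
}
#endif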
|
|
|
|
bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
|
|
|
if ( ! sb ) return true;
|
|
|
|
Url *u = getFirstUrl();
|
|
// hash the url into 64 bits
|
|
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());
|
|
|
|
|
|
// shortcut
|
|
char *fu = ptr_firstUrl;
|
|
|
|
char *allowed = "???";
|
|
if ( m_isAllowedValid && m_isAllowed ) allowed = "yes";
|
|
else if ( m_isAllowedValid ) allowed = "no";
|
|
|
|
int32_t ufn = -1;
|
|
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
|
|
time_t spideredTime = getSpideredTime();
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
|
|
"content=\"text/html; charset=utf-8\">"
|
|
|
|
"<table cellpadding=3 border=0>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">docId</td>"
|
|
"<td><a href=/get?c=%s&d=%" UINT64 ">%" UINT64 "</a></td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">uh48</td>"
|
|
"<td>%" UINT64 "</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">uh64</td>"
|
|
"<td>%" UINT64 "</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>index error code</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>http status</td>"
|
|
"<td>%i</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>url filter num</td>"
|
|
"<td>%" INT32 "</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>other - errno</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>robots.txt allows</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>metalist size</td>"
|
|
"<td>%" INT32 "</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
|
|
,
|
|
cr->m_coll,
|
|
m_docId ,
|
|
m_docId ,
|
|
getFirstUrlHash48(), // uh48
|
|
getFirstUrlHash64(), // uh48
|
|
|
|
mstrerror(m_indexCode),
|
|
m_httpStatus,
|
|
ufn,
|
|
mstrerror(g_errno),
|
|
allowed,
|
|
|
|
m_metaListSize,
|
|
|
|
fu,
|
|
fu
|
|
|
|
);
|
|
|
|
if ( ptr_redirUrl )
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>redir url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
,ptr_redirUrl
|
|
,ptr_redirUrl
|
|
);
|
|
else
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>redir url</td>"
|
|
"<td>--</td>"
|
|
"</tr>\n"
|
|
);
|
|
|
|
|
|
sb->safePrintf("<tr><td>hostHash64</td><td>0x%" XINT64 "</td></tr>",
|
|
(uint64_t)getHostHash32a());
|
|
sb->safePrintf("<tr><td>site</td><td>");
|
|
sb->safeMemcpy(ptr_site,size_site-1);
|
|
sb->safePrintf("</td></tr>\n");
|
|
if ( m_siteHash32Valid )
|
|
sb->safePrintf("<tr><td>siteHash32</td><td>0x%" XINT32 "</td></tr>\n",
|
|
m_siteHash32);
|
|
if ( m_domHash32Valid )
|
|
sb->safePrintf("<tr><td>domainHash32</td><td>0x%" XINT32 "</td></tr>\n",
|
|
m_domHash32);
|
|
sb->safePrintf ( "<tr>"
|
|
"<td>domainHash8</td>"
|
|
"<td>0x%" XINT32 "</td>"
|
|
"</tr>\n"
|
|
,
|
|
(int32_t)g_titledb.getDomHash8FromDocId(m_docId)
|
|
);
|
|
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>coll</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>spidered date</td>"
|
|
"<td>%s UTC</td>"
|
|
"</tr>\n"
|
|
,
|
|
cr->m_coll,
|
|
asctime(gmtime ( &spideredTime ))
|
|
);
|
|
|
|
|
|
/*
|
|
char *ms = "-1";
|
|
if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
|
|
sb->safePrintf (
|
|
"<tr>"
|
|
"<td>min pub date</td>"
|
|
"<td>%s UTC</td>"
|
|
"</tr>\n" , ms );
|
|
|
|
ms = "-1";
|
|
if ( m_maxPubDate != -1 ) ms = asctime(gmtime ( &m_maxPubDate ));
|
|
sb->safePrintf (
|
|
"<tr>"
|
|
"<td>max pub date</td>"
|
|
"<td>%s UTC</td>"
|
|
"</tr>\n" , ms );
|
|
*/
|
|
|
|
// our html template fingerprint
|
|
sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
|
|
if ( m_tagPairHash32Valid )sb->safePrintf("%" UINT32 "",
|
|
(uint32_t)m_tagPairHash32);
|
|
else sb->safePrintf("invalid");
|
|
sb->safePrintf("</td></tr>\n" );
|
|
|
|
|
|
// print list we added to delete stuff
|
|
if ( m_indexCode && m_oldDocValid && m_oldDoc ) {
|
|
// skip debug printing for now...
|
|
//return true;
|
|
sb->safePrintf("</table><br>\n");
|
|
sb->safePrintf("<h2>Delete Meta List</h2>");
|
|
printMetaList ( m_metaList , m_metaList + m_metaListSize ,sb);
|
|
}
|
|
|
|
|
|
if ( m_indexCode || g_errno ) {
|
|
printMetaList ( m_metaList , m_metaList + m_metaListSize, sb );
|
|
}
|
|
|
|
if ( m_indexCode ) return true;
|
|
if ( g_errno ) return true;
|
|
|
|
|
|
// sanity check
|
|
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
sb->safePrintf("<tr><td>next spider date</td>"
|
|
"<td>%s UTC</td></tr>\n"
|
|
|
|
"<tr><td>next spider priority</td>"
|
|
"<td>%" INT32 "</td></tr>\n" ,
|
|
asctime(gmtime( &m_nextSpiderTime )) ,
|
|
(int32_t)m_nextSpiderPriority );
|
|
*/
|
|
|
|
// must always start with http i guess!
|
|
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
|
|
// show the host that should spider it
|
|
//int32_t domLen ; char *dom = getDomFast ( fu , &domLen , true );
|
|
//int32_t hostId;
|
|
if ( m_sreqValid ) {
|
|
// must not block
|
|
SpiderRequest *oldsr = &m_sreq;
|
|
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
|
|
sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
|
|
"</td>\n"
|
|
"<td><b>%" UINT32 "</b></td></tr>\n",shard);
|
|
}
|
|
|
|
time_t ts = m_firstIndexedDate;
|
|
sb->safePrintf("<tr><td>first indexed date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
|
|
ts = m_outlinksAddedDate;
|
|
sb->safePrintf("<tr><td>outlinks last added date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
|
|
// hop count
|
|
sb->safePrintf("<tr><td>hop count</td><td>%" INT32 "</td></tr>\n",
|
|
(int32_t)m_hopCount);
|
|
|
|
// thumbnails
|
|
ThumbnailArray *ta = (ThumbnailArray *) ptr_imageData;
|
|
if ( ta ) {
|
|
int32_t nt = ta->getNumThumbnails();
|
|
sb->safePrintf("<tr><td># thumbnails</td>"
|
|
"<td>%" INT32 "</td></tr>\n",nt);
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
ThumbnailInfo *ti = ta->getThumbnailInfo(i);
|
|
sb->safePrintf("<tr><td>thumb #%" INT32 "</td>"
|
|
"<td>%s (%" INT32 "x%" INT32 ",%" INT32 "x%" INT32 ") "
|
|
, i
|
|
, ti->getUrl()
|
|
, ti->m_origDX
|
|
, ti->m_origDY
|
|
, ti->m_dx
|
|
, ti->m_dy
|
|
);
|
|
ti->printThumbnailInHtml ( sb , 100,100,true,NULL) ;
|
|
// end the row for this thumbnail
|
|
sb->safePrintf("</td></tr>\n");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
char *ddd;
|
|
time_t datedbDate = (time_t)m_pubDate;
|
|
if ( datedbDate != -1 ) ddd = asctime ( gmtime(&datedbDate ));
|
|
else ddd = "---";
|
|
|
|
char strLanguage[128];
|
|
languageToString(m_langId, strLanguage);
|
|
|
|
// print tags
|
|
//if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
SafeBuf tb;
|
|
|
|
TagRec *ogr = NULL;
|
|
if ( m_tagRecValid ) ogr = &m_tagRec;
|
|
if ( ogr ) ogr->printToBufAsHtml ( &tb , "old tag" );
|
|
|
|
SafeBuf *ntb = NULL;
|
|
if ( m_newTagBufValid ) ntb = getNewTagBuf();
|
|
if ( ntb ) {
|
|
// this is just a sequence of tags like an rdblist
|
|
char *pt = ntb->getBufStart();
|
|
char *ptend = pt + ntb->length();
|
|
for ( ; pt < ptend ; ) {
|
|
// skip rdbid
|
|
pt++;
|
|
// cast it
|
|
Tag *tag = (Tag *)pt;
|
|
// skip it
|
|
pt += tag->getRecSize();
|
|
// print tag out
|
|
tag->printToBufAsHtml ( &tb, "new tag");
|
|
}
|
|
}
|
|
|
|
|
|
// prevent (null) from being displayed
|
|
tb.pushChar('\0');
|
|
|
|
|
|
//Tag *tag1 = gr->getTag ("sitenuminlinks");
|
|
//Tag *tag2 = gr->getTag ("sitepop");
|
|
//int32_t sni = 0;
|
|
//int32_t spop = 0;
|
|
//if ( tag1 ) sni = atol(tag1->m_data);
|
|
//if ( tag2 ) spop = atol(tag2->m_data);
|
|
int32_t sni = m_siteNumInlinks;
|
|
//int32_t spop = m_sitePop;
|
|
|
|
LinkInfo *info1 = ptr_linkInfo1;
|
|
//LinkInfo *info2 = ptr_linkInfo2;
|
|
//int32_t sni ;
|
|
//int32_t extrapolated = 0;
|
|
//if ( info1 ) extrapolated = info1->m_numInlinksExtrapolated;
|
|
//if ( info1 ) sni = info1->m_siteNumInlinks;
|
|
|
|
char *ipString = iptoa(m_ip);
|
|
char *estimated = "";
|
|
if ( datedbDate & 0x01 ) // tr->datedbDateIsEstimated() )
|
|
estimated = "<nobr><b>[estimated from bisection]</b></nobr>";
|
|
|
|
//char *ls = getIsLinkSpam();
|
|
Links *links = getLinks();
|
|
// sanity check. should NEVER block!
|
|
if ( links == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this is all to get "note"
|
|
//char *note = NULL;
|
|
// make it a URL
|
|
Url uu; uu.set ( ptr_firstUrl , false );
|
|
// sanity check
|
|
Xml *xml = getXml();
|
|
// sanity check
|
|
if ( xml == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
sb->safePrintf (
|
|
"<tr><td>datedb date</td><td>%s UTC (%" UINT32 ")%s"
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>compressed size</td><td>%" INT32 " bytes</td></tr>\n"
|
|
|
|
"<tr><td>original charset</td><td>%s</td></tr>\n"
|
|
|
|
//"<tr><td>site num inlinks</td><td><b>%" INT32 "%</b></td></tr>\n"
|
|
|
|
//"<tr><td>total extrapolated linkers</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
"<tr><td><b>title rec version</b></td><td><b>%" INT32 "</b>"
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>adult bit</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
//"<tr><td>is link spam?</td><td>%" INT32 " <b>%s</b></td></tr>\n"
|
|
|
|
"<tr><td>is permalink?</td><td>%" INT32 "</td></tr>\n"
|
|
"<tr><td>is RSS feed?</td><td>%" INT32 "</td></tr>\n"
|
|
//"<tr><td>index article only?</td><td>%" INT32 "</td></tr>\n"
|
|
"%s\n"
|
|
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
|
|
"%s</td></tr>\n"
|
|
"<tr><td>content len</td><td>%" INT32 " bytes</td></tr>\n"
|
|
"<tr><td>content truncated</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
"<tr><td>content type</td><td>%" INT32 " (%s)</td></tr>\n"
|
|
"<tr><td>language</td><td>%" INT32 " (%s)</td></tr>\n"
|
|
"<tr><td>country</td><td>%" INT32 " (%s)</td></tr>\n"
|
|
"<tr><td>time axis used</td><td>%" INT32 "</td></tr>\n"
|
|
"<tr><td>metadata</td><td>%s</td></tr>\n"
|
|
"</td></tr>\n",
|
|
|
|
ddd ,
|
|
(uint32_t)datedbDate ,
|
|
estimated ,
|
|
|
|
m_oldTitleRecSize,
|
|
|
|
get_charset_str(m_charset),
|
|
|
|
//sni ,
|
|
|
|
//ptr_linkInfo1->m_numInlinksExtrapolated,
|
|
|
|
(int32_t)m_version ,
|
|
|
|
(int32_t)m_isAdult,
|
|
|
|
//(int32_t)m_isLinkSpam,
|
|
//m_note,
|
|
|
|
(int32_t)m_isPermalink,
|
|
|
|
(int32_t)m_isRSS,
|
|
|
|
|
|
//(int32_t)m_eliminateMenus,
|
|
|
|
|
|
// tag rec
|
|
tb.getBufStart(),
|
|
|
|
ipString,
|
|
cr->m_coll,
|
|
ipString,
|
|
size_utf8Content - 1,
|
|
(int32_t)m_isContentTruncated,
|
|
|
|
(int32_t)m_contentType,
|
|
g_contentTypeStrings[(int)m_contentType] ,
|
|
|
|
(int32_t)m_langId,
|
|
strLanguage,
|
|
|
|
(int32_t)m_countryId,
|
|
g_countryCode.getName(m_countryId),
|
|
m_useTimeAxis,
|
|
ptr_metadata);
|
|
|
|
|
|
/*
|
|
int32_t boost1 = getBoostFromSiteNumInlinks ( sni );
|
|
|
|
sb->safePrintf (
|
|
"<tr><td><b>title weight</b></td>"
|
|
"<td><b>%" UINT32 "%%</b></td></tr>\n"
|
|
|
|
"<tr><td>header weight</td>"
|
|
"<td>%" UINT32 "%%</td></tr>\n"
|
|
|
|
"<tr><td>url path weight</td>"
|
|
"<td>%" UINT32 "%%</td></tr>\n"
|
|
|
|
"<tr><td>external link text weight</td>"
|
|
"<td>%" UINT32 "%%</td></tr>\n"
|
|
|
|
"<tr><td>internal link text weight</td>"
|
|
"<td>%" UINT32 "%%</td></tr>\n"
|
|
|
|
"<tr><td>concept weight</td>"
|
|
"<td>%" UINT32 "%%</td></tr>\n"
|
|
|
|
"<tr><td>score boost from site num inlinks</td>"
|
|
"<td>%" INT32 "%%</td>"
|
|
"</tr>\n",
|
|
|
|
(int32_t)m_titleWeight,
|
|
(int32_t)m_headerWeight,
|
|
(int32_t)m_urlPathWeight,
|
|
(int32_t)m_externalLinkTextWeight,
|
|
(int32_t)m_internalLinkTextWeight,
|
|
(int32_t)m_conceptWeight ,
|
|
boost1 );
|
|
*/
|
|
|
|
// print title
|
|
//sb->safePrintf( "<tr><td>title</td><td>%s</td></tr>\n" ,
|
|
// ti->m_title );
|
|
|
|
// print the new, unstored, gigabit vector
|
|
if ( size_gigabitHashes ) {
|
|
// get gigabit vector
|
|
int32_t *vec = ptr_gigabitHashes;
|
|
// point to scores
|
|
int32_t *ss = ptr_gigabitScores;
|
|
int32_t count = 0;
|
|
int32_t total = 0;
|
|
sb->safePrintf ( "<tr><td>stored gigabit vector</td><td>");
|
|
while ( *vec ) {
|
|
sb->safePrintf ( "%08" XINT32 " ", *vec );
|
|
sb->safePrintf ( "(%05" INT32 ") ", *ss );
|
|
vec++;
|
|
ss++;
|
|
count++;
|
|
total++;
|
|
//if ( total >= GIGABITS_IN_VECTOR ) break;
|
|
if ( count < 4 ) continue;
|
|
count = 0;
|
|
sb->safePrintf ( "<br>\n");
|
|
}
|
|
sb->safePrintf ( "</tr>\n");
|
|
}
|
|
|
|
// print dmoz stuff
|
|
int32_t numCatIds = size_catIds/4;
|
|
int32_t numIndCatIds = size_indCatIds/4;
|
|
sb->safePrintf( "<tr><td>Number of Category IDs</td>"
|
|
"<td>%" INT32 "</td></tr>\n", numCatIds );
|
|
char *dtp = ptr_dmozTitles;
|
|
char *dsp = ptr_dmozSumms;
|
|
char *dap = ptr_dmozAnchors;
|
|
for (int32_t i = 0; i < numCatIds; i++) {
|
|
// print the ID
|
|
sb->safePrintf( "<tr><td>ID #%" INT32 "</td><td>%" INT32 "</td></tr>\n",
|
|
i, ptr_catIds[i]);
|
|
// print the title
|
|
if ( dtp ) {
|
|
sb->safePrintf( "<tr><td>Title #%" INT32 " </td><td>",i);
|
|
sb->safeMemcpy( dtp,gbstrlen(dtp) );
|
|
sb->safePrintf( "</td></tr>\n");
|
|
dtp += gbstrlen(dtp) + 1;
|
|
}
|
|
// print the summary
|
|
if ( dsp ) {
|
|
sb->safePrintf( "<tr><td>Summary #%" INT32 "</td><td>", i);
|
|
sb->safeMemcpy( dsp , gbstrlen(dsp ) ) ;
|
|
sb->safePrintf( "</td></tr>\n");
|
|
dsp += gbstrlen ( dsp ) + 1;
|
|
}
|
|
// print the anchor
|
|
if ( dap ) {
|
|
sb->safePrintf( "<tr><td>Anchor #%" INT32 "</td><td>",i);
|
|
sb->safeMemcpy( dap , gbstrlen(dap) );
|
|
sb->safePrintf( "</td></tr>\n");
|
|
dap += gbstrlen ( dap ) + 1;
|
|
}
|
|
}
|
|
sb->safePrintf( "<tr><td>Number of Indirect Category IDs</td>"
|
|
"<td>%" INT32 "</td></tr>\n", numIndCatIds);
|
|
|
|
for (int32_t i = 0; i < numIndCatIds; i++) {
|
|
// print the ID
|
|
sb->safePrintf( "<tr><td>Indirect ID #%" INT32 "</td>"
|
|
"<td>%" INT32 "</td></tr>\n",
|
|
i, ptr_indCatIds[i]);
|
|
}
|
|
|
|
if ( info1 ) {
|
|
//sb->safePrintf("<tr><td>page pop</td><td>%" INT32 "</td></tr>\n",
|
|
// info1->m_pagePop );
|
|
//sb->safePrintf("<tr><td>whole site pop</td>"
|
|
// "<td>%" INT32 "</td></tr>\n",
|
|
// spop );
|
|
sb->safePrintf("<tr><td>num GOOD links to whole site</td>"
|
|
"<td>%" INT32 "</td></tr>\n",
|
|
sni );
|
|
}
|
|
|
|
// close the table
|
|
sb->safePrintf ( "</table></center><br>\n" );
|
|
|
|
//
|
|
// convert document into json representing multiple documents
|
|
// if it makes sense. sometimes a single url contains multiple
|
|
// subdocuments that each should have their own url, but do not,
|
|
// so we fix that here.
|
|
//
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( dbr->length() ) {
|
|
sb->safePrintf("<b>START EXACT DIFFBOT REPLY</b><br>\n");
|
|
sb->safePrintf("<pre>");
|
|
sb->safeMemcpy ( dbr );
|
|
sb->safePrintf("</pre>");
|
|
sb->safePrintf("<b>END EXACT DIFFBOT REPLY</b><br><br>\n");
|
|
}
|
|
|
|
// print outlinks
|
|
links->print( sb );
|
|
|
|
//
|
|
// PRINT ADDRESSES (prints streets first)
|
|
//
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (Addresses *)-1 ) { char *xx=NULL;*xx=0;}
|
|
aa->print(sb,uh64);
|
|
|
|
|
|
|
|
//
|
|
// PRINT PUB DATE CANDIDATES
|
|
//
|
|
|
|
// print stored pub date candidates which we indexed as clock
|
|
// or not clock!
|
|
Dates *dp = getDates() ;
|
|
// should never block!
|
|
if ( dp == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// print it out
|
|
if ( dp ) dp->printDates ( sb );
|
|
|
|
//return true;
|
|
|
|
//
|
|
// PRINT SECTIONS
|
|
//
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;}
|
|
//SectionVotingTable *nsvt = getNewSectionVotingTable();
|
|
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
|
|
|
|
// these are nice
|
|
//HashTableX *pt = dp->getPhoneTable();
|
|
//HashTableX *et = dp->getEmailTable();
|
|
//HashTableX *at = aa->getPlaceTable();
|
|
//HashTableX *tt = dp->getTODTable();
|
|
//HashTableX *rt = ev->getRegistrationTable();
|
|
//HashTableX *priceTable = dp->getPriceTable();
|
|
|
|
//sections->print ( sb , pt , et , NULL , at , tt , priceTable );
|
|
|
|
// try the new print function
|
|
//sections->print2 ( sb , NULL, NULL , NULL , false );
|
|
|
|
printRainbowSections ( sb , NULL );
|
|
|
|
//nsvt->print ( sb , "NEW Sections Voting Table" );
|
|
|
|
//osvt->print ( sb , "OLD Sections Voting Table" );
|
|
|
|
|
|
//
|
|
// PRINT LINKINFO
|
|
//
|
|
|
|
//if ( info1 )
|
|
// info1->print ( sb , cr->m_coll );
|
|
|
|
//if ( info2 ) {
|
|
// sb->safePrintf ( "<tr><td><b>IMPORTED LINK INFO:"
|
|
// "</b></td></tr>" );
|
|
// info2->print ( sb , cr->m_coll );
|
|
//}
|
|
|
|
|
|
// cut it short for debugging
|
|
logf(LOG_DEBUG,"xmldoc: FIX ME remove return");
|
|
|
|
//return true;
|
|
|
|
//
|
|
// PRINT LINKINFO
|
|
//
|
|
|
|
char *p = m_pageLinkBuf.getBufStart();
|
|
int32_t plen = m_pageLinkBuf.length();
|
|
sb->safeMemcpy ( p , plen );
|
|
|
|
|
|
//
|
|
// PRINT SITE LINKINFO
|
|
//
|
|
p = m_siteLinkBuf.getBufStart();
|
|
plen = m_siteLinkBuf.length();
|
|
sb->safeMemcpy ( p , plen );
|
|
|
|
|
|
//
|
|
// BEGIN PRINT GIGABITS
|
|
//
|
|
|
|
// print out for PageParser.cpp
|
|
const char *help =
|
|
"The <i>Gigabits</i> are words extracted from the document "
|
|
"that are deemed to best represent it. The <i>Pop</i> column "
|
|
"is the popularity of the word and it ranges from 0 to 1000 "
|
|
"and is how many documents out of a sample of 1000 that "
|
|
"contained that word. The <i>Score</i> of each Gigabit is "
|
|
"based on the popularity and how many times the word appeared "
|
|
"in the document. Higher scores are deemed more "
|
|
"representative of the document. The hashes of these Gigabits "
|
|
"are stored with the cached copy of the document as numeric "
|
|
"hashes for purposes of topic clustering. You can see these "
|
|
"hashes by clicking on the <i>[info]</i> link next to "
|
|
"any search result.<br><br>";
|
|
|
|
if ( m_numTop > 0 )
|
|
sb->safePrintf( "<table width=100%%>"
|
|
"<td bgcolor=pink>\n"
|
|
"%s"
|
|
"<table>"
|
|
"<tr><td>#</td><td>"
|
|
"<b>%" INT32 " Gigabits</b></td><td><b>Score</b>"
|
|
"</td>"
|
|
"<td><b>Pop</b></td>"
|
|
"<td><b>Hash</b></td>"
|
|
"</tr>\n",
|
|
help,m_numTop);
|
|
|
|
// . print out the top gigabits we harvested
|
|
// . start with the highest scoring node first, the last node since
|
|
// nodes are ranked by lowest to highest key
|
|
int32_t total = 0;
|
|
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
|
|
// get the info
|
|
GigabitInfo *gi = m_top[i];
|
|
// print row
|
|
sb->safePrintf("<tr><td>%" INT32 "</td><td>",i);
|
|
// print gigabit
|
|
sb->safeMemcpy(gi->m_ptr , gi->m_len );
|
|
// get 32 bit hash
|
|
uint32_t h = gi->m_hash & 0xffffffff;
|
|
// never allow 0
|
|
if ( h == 0 ) h = 1;
|
|
// if unicode, pop's hi bit is set
|
|
sb->safePrintf( "</td>"
|
|
"<td>%" INT32 "</td>"
|
|
"<td>%" INT32 "</td>"
|
|
"<td>%08" XINT32 "</td>"
|
|
"</tr>\n",
|
|
(int32_t)gi->m_pts,
|
|
(int32_t)gi->m_pop,
|
|
(int32_t)h );
|
|
// add up all scores
|
|
total += gi->m_pts;
|
|
}
|
|
|
|
// close table
|
|
if ( m_numTop > 0 ) {
|
|
sb->safePrintf("<tr><td></td><td></td><td>"
|
|
"<b>%" INT32 "</b></td></tr>\n",total);
|
|
sb->safePrintf("</table>\n");
|
|
}
|
|
|
|
|
|
//
|
|
// END PRINT GIGABITS
|
|
//
|
|
|
|
|
|
// note this
|
|
sb->safePrintf("<h2>NEW Meta List</h2>");
|
|
|
|
printMetaList ( m_metaList , m_metaList + m_metaListSize , sb );
|
|
|
|
|
|
// all done if no term table to print out
|
|
if ( ! m_wts ) return true;
|
|
|
|
|
|
// print out the rules in Weights.cpp
|
|
/*
|
|
sb->safePrintf ("<br>"
|
|
"<table border=1 cellpadding=0>"
|
|
|
|
"<tr><td>Rule #3</td>"
|
|
"<td>First 40 words in ()'s.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #4</td>"
|
|
"<td>Adjacent to bad punct.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #5</td>"
|
|
"<td>In a link.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #6</td>"
|
|
"<td>First occurrence in a section. Actual weight "
|
|
"depends on section word count.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #7</td>"
|
|
"<td>In a header tag. h1 is most weight.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #8</td>"
|
|
"<td>In a \"ul\" list.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #9</td>"
|
|
"<td>Repeated occurrence in the same fragment or "
|
|
"sentence.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #10</td>"
|
|
"<td>In a comma-separated list.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #11</td>"
|
|
"<td>Promoted isolated capitalized words, demote "
|
|
"if it is in a capitalized phrase.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #13</td>"
|
|
"<td>First occurrence in document.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #15</td>"
|
|
"<td>Word to phrase ratio weight.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #16</td>"
|
|
"<td>At the beginning of a fragment or sentence."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>Rule #17</td>"
|
|
"<td>If immediately after a quote, iff not "
|
|
"promoted by Rule #18.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #18</td>"
|
|
"<td>Promote phrase if capitalized. Demote phrase "
|
|
"if mixed case without hypehn.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #22</td>"
|
|
"<td>Demote phrases containing bad punct.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #23</td>"
|
|
"<td>In script, style, select or marquee tag. "
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>Rule #23</td>"
|
|
"<td>Follows a number.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #25</td>"
|
|
"<td>Demote non-hyphenated phrases that would split "
|
|
"adjacent hyphenated phrases.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #26</td>"
|
|
"<td>Demote if in a repeated fragment.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #27</td>"
|
|
"<td>Demote if in a menu section.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #28</td>"
|
|
"<td>Pattern spam detector.</td></tr>\n"
|
|
|
|
"</table>\n"
|
|
"<br>"
|
|
);
|
|
*/
|
|
|
|
|
|
//
|
|
// BEGIN PRINT HASHES TERMS
|
|
//
|
|
|
|
// shortcut
|
|
HashTableX *wt = m_wts;
|
|
|
|
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
|
|
TermDebugInfo **tp = NULL;
|
|
// add them with this counter
|
|
int32_t nt = 0;
|
|
|
|
int32_t nwt = 0;
|
|
if ( wt ) {
|
|
nwt = wt->m_numSlots;
|
|
tp = (TermDebugInfo **)wt->m_keys;
|
|
}
|
|
|
|
// now print the table we stored all we hashed into
|
|
for ( int32_t i = 0 ; i < nwt ; i++ ) {
|
|
// skip if empty
|
|
if ( wt->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
//QUICKPOLL(m_niceness);
|
|
// get its key, date=32bits termid=64bits
|
|
//key96_t *k = (key96_t *)wt->getKey ( i );
|
|
// get the TermDebugInfo
|
|
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
|
|
// point to it for sorting
|
|
tp[nt++] = ti;
|
|
}
|
|
|
|
// set this for cmptp
|
|
s_wbuf = &m_wbuf;
|
|
|
|
// sort them alphabetically by Term
|
|
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
|
|
|
|
// determine how many non 1.0 weight fields we got in the vectors
|
|
/*
|
|
int32_t count [ MAX_RULES ];
|
|
memset ( count , 0 , MAX_RULES * 4 );
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
TermDebugInfo *ti = tp[i];
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ )
|
|
if ( ti->m_rv[j] != 1.0 ) count[j]++;
|
|
}
|
|
// count the counts
|
|
char fbuf[9024];
|
|
char *fp = fbuf;
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
|
|
if ( ! count[j] ) continue;
|
|
fp += sprintf(fp ,"<td><b>R#%" INT32 "</b></td>",j);
|
|
}
|
|
*/
|
|
|
|
// print them out in a table
|
|
char hdr[1000];
|
|
sprintf(hdr,
|
|
"<table border=1 cellpadding=0>"
|
|
"<tr>"
|
|
// this messes up Test.cpp diff'ing
|
|
//"<td><b>#</b></td>"
|
|
"<td><b>Prefix</b></td>"
|
|
"<td><b>WordNum</b></td>"
|
|
"<td><b>Lang</b></td>"
|
|
"<td><b>Term</b></td>"
|
|
|
|
//"%s"
|
|
|
|
//"<td><b>Weight</b></td>"
|
|
//"<td><b>Spam</b></td>"
|
|
|
|
"<td><b>Desc</b></td>"
|
|
"<td><b>TermId/TermHash48</b></td>"
|
|
"<td><b>ShardByTermId?</b></td>"
|
|
"<td><b>Note</b></td>"
|
|
"</tr>\n"
|
|
//,fbuf
|
|
);
|
|
|
|
sb->safePrintf("%s",hdr);
|
|
|
|
char *start = m_wbuf.getBufStart();
|
|
int32_t rcount = 0;
|
|
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
|
|
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS) == 0 )
|
|
sb->safePrintf("<!--ignore--></table>%s",hdr);
|
|
|
|
char *prefix = " ";
|
|
if ( tp[i]->m_prefixOff >= 0 )
|
|
prefix = start + tp[i]->m_prefixOff;
|
|
|
|
bool isFacet = false;
|
|
if ( prefix &&
|
|
prefix[0]=='g' &&
|
|
strncmp(prefix,"gbfacet",7)== 0 )
|
|
isFacet = true;
|
|
|
|
sb->safePrintf ( "<tr>"
|
|
//"<td><b>%" INT32 "</b></td>"
|
|
"<td>%s</td>"
|
|
//i ,
|
|
, prefix
|
|
);
|
|
|
|
if ( isFacet )
|
|
sb->safePrintf("<td>--</td>");
|
|
else
|
|
sb->safePrintf( "<td>%" INT32 "</td>"
|
|
, tp[i]->m_wordNum );
|
|
|
|
|
|
// print lang
|
|
//char langId = tp[i]->m_langId;
|
|
|
|
// print out all langs word is in if it's not clear
|
|
// what language it is. we use a sliding window to
|
|
// resolve some ambiguity, but not all, so print out
|
|
// the possible langs here
|
|
sb->safePrintf("<td>");
|
|
if ( isFacet )
|
|
sb->safePrintf("--");
|
|
else
|
|
printLangBits ( sb , tp[i] );
|
|
sb->safePrintf("</td>");
|
|
|
|
|
|
// print the term
|
|
sb->safePrintf("<td><nobr>");
|
|
|
|
if ( tp[i]->m_synSrc )
|
|
sb->pushChar('*');
|
|
|
|
char *term = start + tp[i]->m_termOff;
|
|
int32_t termLen = tp[i]->m_termLen;
|
|
sb->safeMemcpy ( term , termLen );
|
|
|
|
/*
|
|
char *dateStr = " ";
|
|
int32_t ddd = tp[i]->m_date;
|
|
uint8_t *tddd = (uint8_t *)&ddd;
|
|
char tbbb[32];
|
|
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
|
|
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
|
|
sprintf(tbbb,"evIds %" INT32 "-%" INT32 "",
|
|
(int32_t)tddd[1],(int32_t)tddd[0]);
|
|
dateStr = tbbb;
|
|
}
|
|
else if ( ddd )
|
|
dateStr = asctime ( gmtime(&ddd ));
|
|
*/
|
|
|
|
//char ss[30];
|
|
//if ( tp[i]->m_spam == -1.0 ) sprintf(ss," ");
|
|
//else if ( tp[i]->m_spam == 0.0 ) sprintf(ss,"--");
|
|
//else sprintf ( ss , "%.03f",1.0-tp[i]->m_spam);
|
|
|
|
|
|
sb->safePrintf ( "</nobr></td>"
|
|
);
|
|
|
|
// print the weight vector before Weight and Spam
|
|
/*
|
|
float prod = 1.0;
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
|
|
if ( ! count[j] ) continue;
|
|
if ( tp[i]->m_isSynonym )
|
|
sb->safePrintf("<td> </td>" );
|
|
else if ( tp[i]->m_rv[j] == 1.0 )
|
|
sb->safePrintf("<td> </td>" );
|
|
else sb->safePrintf("<td>%.02f</td>",tp[i]->m_rv[j] );
|
|
// product up
|
|
prod *= tp[i]->m_rv[j];
|
|
}
|
|
|
|
// sanity check
|
|
// maybe look into this at some point, but not a big deal!!
|
|
//float err = prod - tp[i]->m_weight;
|
|
//if ( err > .05 )
|
|
// logf(LOG_DEBUG,"weights: prod was %.02f should be "
|
|
// "%.02f",prod,tp[i]->m_weight);
|
|
*/
|
|
|
|
//char *desc = " ";
|
|
//if ( tp[i]->m_descOff >= 0 )
|
|
// desc = start + tp[i]->m_descOff;
|
|
|
|
/*
|
|
// synonyms are always 1/4 weight of original
|
|
if ( tp[i]->m_isSynonym )
|
|
sb->safePrintf("<td> </td>" );
|
|
else
|
|
sb->safePrintf("<td>%.03f</td>", tp[i]->m_weight );
|
|
*/
|
|
|
|
sb->safePrintf ( //"<td>%s</td>"
|
|
//"<td><b>%" UINT32 "</b></td>"
|
|
//"<td><nobr>%s</nobr></td>"
|
|
"<td><nobr>%s",
|
|
getHashGroupString(tp[i]->m_hashGroup)
|
|
);
|
|
|
|
//if ( tp[i]->m_synSrc ) {
|
|
// char ss = tp[i]->m_synSrc;
|
|
// sb->safePrintf(" - %s",g_synonyms.getSourceString(ss));
|
|
//}
|
|
|
|
sb->safePrintf ( "</nobr></td>" );
|
|
|
|
sb->safePrintf ( "<td>%016" UINT64 "</td>"
|
|
,
|
|
//ss ,
|
|
//(uint32_t)tp[i]->m_score32 ,
|
|
//dateStr ,
|
|
//desc, // start + tp[i]->m_descOff ,
|
|
(uint64_t)(tp[i]->m_termId & TERMID_MASK) );
|
|
|
|
if ( tp[i]->m_shardByTermId ) sb->safePrintf("<td><b>1</b></td>" );
|
|
else sb->safePrintf("<td>0</td>" );
|
|
|
|
|
|
sb->safePrintf("<td>");
|
|
|
|
// there is no prefix for such terms now
|
|
// TODO: store actual key in there i guess?? or just this bit.
|
|
int32_t val32 = 0;
|
|
if ( strncmp(prefix,"gbfacet",7) == 0 )
|
|
val32 = g_posdb.getInt(&tp[i]->m_key);
|
|
|
|
// . this is like gbxpathsitehash1234567
|
|
// . the number following it is the hash
|
|
// . the value stored in the posdb key is the hash of the
|
|
// inner html content of that xpath/site for this page
|
|
if ( strncmp(term,"facetField=gbxpathsitehash",26)==0)
|
|
sb->safePrintf("<b>Term</b> is a 32-bit hash of the "
|
|
"X-path of "
|
|
"a section XOR'ed with the 32-bit "
|
|
"hash of this document's subdomain. "
|
|
"[%" UINT32 "] is the 32-bit hash of the "
|
|
"Inner HTML of this section stored "
|
|
"in the posdb key instead of "
|
|
"the usual stuff. This is also "
|
|
"sharded by termId!",
|
|
(uint32_t)val32
|
|
//(int32_t)tp[i]->m_sentHash32
|
|
);
|
|
|
|
sb->safePrintf("</td>");
|
|
|
|
|
|
sb->safePrintf("</tr>\n");
|
|
}
|
|
|
|
|
|
sb->safePrintf("</table><br>\n");
|
|
|
|
//
|
|
// END PRINT HASHES TERMS
|
|
//
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printMenu ( SafeBuf *sb ) {
|
|
|
|
// encode it
|
|
SafeBuf ue;
|
|
ue.urlEncode ( ptr_firstUrl );
|
|
|
|
// get
|
|
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
|
|
"content=\"text/html; charset=utf-8\">" );
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
/*
|
|
char *coll = cr->m_coll;
|
|
|
|
int64_t d = m_docId;
|
|
|
|
// print links at top
|
|
sb->safePrintf(
|
|
//"<a href=/print?c=%s&u=%s&page=1>general info</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=2>page inlinks</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=3>site inlinks</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=4>sections</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=5>indexed terms</a> | "
|
|
// the breakdown of when it was spidered and when it
|
|
// is due to be spidered again. and any errors
|
|
// encountered when spidering
|
|
//"<a href=/print?c=%s&u=%s&page=6>spider stats</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=7>cached page</a>"
|
|
"<a href=/print?c=%s&d=%" INT64 "&page=1>general info</a> | "
|
|
"<a href=/print?c=%s&d=%" INT64 "&page=2&recompute=1>"
|
|
"page inlinks</a> | "
|
|
"<a href=/print?c=%s&d=%" INT64 "&page=3>site inlinks</a> | "
|
|
//"<a href=/print?c=%s&d=%" INT64 "&page=4>sections</a> | "
|
|
"<a href=/print?c=%s&d=%" INT64 "&page=5>indexed terms</a>"
|
|
// the breakdown of when it was spidered and when it
|
|
// is due to be spidered again. and any errors
|
|
// encountered when spidering
|
|
//"<a href=/print?c=%s&d=%" INT64 "&page=6>spider stats</a> |"
|
|
//" <a href=/print?c=%s&d=%" INT64 "&page=7>cached page</a>"
|
|
"<br>"
|
|
"<br>"
|
|
,coll,d//ue.getBufStart()
|
|
,coll,d//ue.getBufStart()
|
|
,coll,d//ue.getBufStart()
|
|
//,coll,d//ue.getBufStart()
|
|
,coll,d//ue.getBufStart()
|
|
//,coll,d//ue.getBufStart()
|
|
//,coll,d//ue.getBufStart()
|
|
);
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
// if printDocForProCog, an entry function, blocks, we gotta re-call it
|
|
static void printDocForProCogWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in print doc for pro cog wrapper" );
|
|
// get it
|
|
bool status = THIS->printDocForProCog ( THIS->m_savedSb ,
|
|
THIS->m_savedHr );
|
|
// return if it blocked
|
|
if ( ! status ) return;
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// in PageRoot.cpp
|
|
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
|
|
bool printGigablast );
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno and returns true on error
|
|
bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
if ( ! sb ) return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
m_masterLoop = printDocForProCogWrapper;
|
|
m_masterState = this;
|
|
|
|
m_savedSb = sb;
|
|
m_savedHr = hr;
|
|
|
|
// if we are generating site or page inlinks info for a
|
|
// non docid based url, then store that info in the respective
|
|
// safe bufs
|
|
m_useSiteLinkBuf = true;
|
|
m_usePageLinkBuf = true;
|
|
|
|
|
|
int32_t page = hr->getLong("page",1);
|
|
|
|
|
|
// for some reason sections page blocks forever in browser
|
|
if ( page != 7 && ! m_printedMenu ) { // && page != 5 )
|
|
printFrontPageShell ( sb , "search" , cr , false );
|
|
m_printedMenu = true;
|
|
//printMenu ( sb );
|
|
}
|
|
|
|
|
|
if ( page == 1 )
|
|
return printGeneralInfo(sb,hr);
|
|
|
|
if ( page == 2 )
|
|
return printPageInlinks(sb,hr);
|
|
|
|
if ( page == 3 )
|
|
return printSiteInlinks(sb,hr);
|
|
|
|
if ( page == 4 )
|
|
return printRainbowSections(sb,hr);
|
|
|
|
if ( page == 5 )
|
|
return printTermList(sb,hr);
|
|
|
|
if ( page == 6 )
|
|
return printSpiderStats(sb,hr);
|
|
|
|
if ( page == 7 )
|
|
return printCachedPage(sb,hr);
|
|
|
|
return true;
|
|
}
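// A hedged sketch of the calling convention printDocForProCog() and its
// getters follow (see printDocForProCogWrapper above): getters return NULL on
// error (with g_errno set), (T *)-1 when they blocked and will fire
// m_masterLoop later, or a real pointer when ready; entry functions return
// false when blocked so the wrapper can simply re-call them. The function
// name below is hypothetical and not declared in XmlDoc.h.
#if 0
bool XmlDoc::printSomethingSketch ( SafeBuf *sb , HttpRequest *hr ) {
	m_masterLoop  = printDocForProCogWrapper; // re-entry point when blocked
	m_masterState = this;
	Xml *xml = getXml();
	if ( ! xml )            return true;      // error; g_errno is set
	if ( xml == (Xml *)-1 ) return false;     // blocked; will be re-called
	sb->safePrintf ( "<!-- got %" INT32 " xml nodes -->" ,
			 xml->getNumNodes() );
	return true;                              // done
}
#endif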
|
|
|
|
bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// shortcut
|
|
char *fu = ptr_firstUrl;
|
|
|
|
// sanity check
|
|
Xml *xml = getXml();
|
|
// blocked?
|
|
if ( xml == (void *)-1 ) return false;
|
|
// error?
|
|
if ( ! xml ) return true;
|
|
|
|
char *ict = getIsContentTruncated();
|
|
if ( ! ict ) return true; if ( ict == (char *)-1 ) return false;
|
|
char *at = getIsAdult();
|
|
if ( ! at ) return true; if ( at == (void *)-1 ) return false;
|
|
char *ls = getIsLinkSpam();
|
|
if ( ! ls ) return true; if ( ls == (void *)-1 ) return false;
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return true; if ( ct == (void *)-1 ) return false;
|
|
uint16_t *cs = getCharset ( );
|
|
if ( ! cs ) return true; if ( cs == (uint16_t *)-1 ) return false;
|
|
char *pl = getIsPermalink();
|
|
if ( ! pl ) return true; if ( pl == (char *)-1 ) return false;
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS ) return true; if ( isRSS == (char *)-1 ) return false;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip ) return true; if ( ip == (int32_t *)-1 ) return false;
|
|
uint8_t *li = getLangId();
|
|
if ( ! li ) return true; if ( li == (uint8_t *)-1 ) return false;
|
|
uint16_t *cid = getCountryId();
|
|
if ( ! cid ) return true; if ( cid == (uint16_t *)-1 ) return false;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 ) return true; if ( info1 == (void *)-1 ) return false;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
//char *ls = getIsLinkSpam();
|
|
//Links *links = getLinks();
|
|
// blocked?
|
|
//if ( links == (void *)-1 ) { char *xx=NULL;*xx=0;}//return false;
|
|
// error?
|
|
//if ( ! links ) return true;
|
|
|
|
// make it a URL
|
|
Url uu; uu.set ( fu , false );
|
|
|
|
|
|
|
|
char *allowed = "???";
|
|
int32_t allowedInt = 1;
|
|
if ( m_isAllowedValid && m_isAllowed ) {
|
|
allowed = "yes";
|
|
allowedInt = 1;
|
|
}
|
|
else if ( m_isAllowedValid ) {
|
|
allowed = "no";
|
|
allowedInt = 0;
|
|
}
|
|
|
|
int32_t ufn = -1;
|
|
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
|
|
|
|
char *es = mstrerror(m_indexCode);
|
|
if ( ! m_indexCode ) es = mstrerror(g_errno);
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
//int32_t groupId = g_hostdb.getGroupIdFromDocId(m_docId);
|
|
//Host *group = g_hostdb.getGroup(groupId);
|
|
int32_t shardNum = getShardNumFromDocId ( m_docId );
|
|
Host *hosts = g_hostdb.getShard ( shardNum );
|
|
Host *h = &hosts[0];
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf (
|
|
"<table cellpadding=3 border=0>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">docId</td>"
|
|
"<td><a href=/get?c=%s&d=%" UINT64 ">%" UINT64 "</a></td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">on host #</td>"
|
|
"<td>%" INT32 "</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>index error code</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>robots.txt allows</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
|
|
,
|
|
cr->m_coll,
|
|
m_docId ,
|
|
m_docId ,
|
|
|
|
h->m_hostId,
|
|
|
|
es,
|
|
allowed,
|
|
|
|
fu,
|
|
fu
|
|
|
|
);
|
|
else
|
|
sb->safePrintf (
|
|
"<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
"\t<coll><![CDATA[%s]]></coll>\n"
|
|
"\t<docId>%" INT64 "</docId>\n"
|
|
"\t<indexError><![CDATA[%s]]></indexError>\n"
|
|
"\t<robotsTxtAllows>%" INT32 ""
|
|
"</robotsTxtAllows>\n"
|
|
"\t<url><![CDATA[%s]]></url>\n"
|
|
,
|
|
cr->m_coll,
|
|
m_docId ,
|
|
es,
|
|
allowedInt,//(int32_t)m_isAllowed,
|
|
fu
|
|
);
|
|
|
|
char *redir = ptr_redirUrl;
|
|
if ( redir && ! isXml ) {
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>redir url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
,redir
|
|
,redir );
|
|
}
|
|
else if ( redir ) {
|
|
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
|
|
"</redirectUrl>\n" ,redir );
|
|
}
|
|
|
|
|
|
if ( m_indexCode || g_errno ) {
|
|
if ( ! isXml ) sb->safePrintf("</table><br>\n");
|
|
else sb->safePrintf("</response>\n");
|
|
return true;
|
|
}
|
|
|
|
|
|
// must always start with http i guess!
|
|
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
|
|
|
|
time_t ts = (time_t)m_firstIndexedDate;
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>first indexed date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts)) );
|
|
else
|
|
sb->safePrintf("\t<firstIndexedDateUTC>%" UINT32 ""
|
|
"</firstIndexedDateUTC>\n",
|
|
(uint32_t)m_firstIndexedDate );
|
|
|
|
ts = m_spideredTime;
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>last indexed date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
else
|
|
sb->safePrintf("\t<lastIndexedDateUTC>%" UINT32 ""
|
|
"</lastIndexedDateUTC>\n",
|
|
(uint32_t)m_spideredTime );
|
|
|
|
ts = m_outlinksAddedDate;
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>outlinks last added date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
else
|
|
sb->safePrintf("\t<outlinksLastAddedUTC>%" UINT32 ""
|
|
"</outlinksLastAddedUTC>\n",
|
|
(uint32_t)m_outlinksAddedDate );
|
|
|
|
// hop count
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>hop count</td><td>%" INT32 "</td>"
|
|
"</tr>\n",
|
|
(int32_t)m_hopCount);
|
|
else
|
|
sb->safePrintf("\t<hopCount>%" INT32 "</hopCount>\n",
|
|
(int32_t)m_hopCount);
|
|
|
|
|
|
char strLanguage[128];
|
|
languageToString(m_langId, strLanguage);
|
|
|
|
// print tags
|
|
//SafeBuf tb;
|
|
int32_t sni = m_siteNumInlinks;
|
|
|
|
char *ipString = iptoa(m_ip);
|
|
|
|
//int32_t sni = info1->getNumGoodInlinks();
|
|
|
|
time_t tlu = info1->getLastUpdated();
|
|
struct tm *timeStruct3 = gmtime ( &tlu );//info1->m_lastUpdated );
|
|
char tmp3[64];
|
|
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
|
|
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf (
|
|
"<tr><td>original charset</td><td>%s</td></tr>\n"
|
|
"<tr><td>adult bit</td><td>%" INT32 "</td></tr>\n"
|
|
//"<tr><td>is link spam?</td><td>%" INT32 " <b>%s</b></td></tr>\n"
|
|
"<tr><td>is permalink?</td><td>%" INT32 "</td></tr>\n"
|
|
"<tr><td>is RSS feed?</td><td>%" INT32 "</td></tr>\n"
|
|
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
|
|
"%s</td></tr>\n"
|
|
"<tr><td>content len</td><td>%" INT32 " bytes</td></tr>\n"
|
|
"<tr><td>content truncated</td><td>%" INT32 "</td></tr>\n"
|
|
"<tr><td>content type</td><td>%s</td></tr>\n"
|
|
"<tr><td>language</td><td>%s</td></tr>\n"
|
|
"<tr><td>country</td><td>%s</td></tr>\n"
|
|
|
|
"<tr><td><b>good inlinks to site</b>"
|
|
"</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
// "<tr><td>unique IP inlinks to site"
|
|
// "</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
// "<tr><td>unique CBlock inlinks to site"
|
|
// "</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
"<tr><td><b>site rank</b></td><td>%" INT32 "</td></tr>\n"
|
|
|
|
"<tr><td>good inlinks to page"
|
|
"</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
// "<tr><td>unique IP inlinks to page"
|
|
// "</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
// "<tr><td>unique CBlock inlinks to page"
|
|
// "</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
// "<tr><td>total inlinks to page"
|
|
// "</td><td>%" INT32 "</td></tr>\n"
|
|
|
|
"<tr><td><nobr>page inlinks last computed</nobr></td>"
|
|
"<td>%s</td></tr>\n"
|
|
"</td></tr>\n",
|
|
get_charset_str(m_charset),
|
|
(int32_t)m_isAdult,
|
|
//(int32_t)m_isLinkSpam,
|
|
//m_note,
|
|
(int32_t)m_isPermalink,
|
|
(int32_t)m_isRSS,
|
|
ipString,
|
|
cr->m_coll,
|
|
ipString,
|
|
size_utf8Content - 1,
|
|
(int32_t)m_isContentTruncated,
|
|
g_contentTypeStrings[(int)m_contentType] ,
|
|
strLanguage,
|
|
g_countryCode.getName(m_countryId) ,
|
|
sni,
|
|
//m_siteNumInlinksUniqueIp,
|
|
//m_siteNumInlinksUniqueCBlock,
|
|
::getSiteRank(sni),
|
|
//info1->getNumTotalInlinks(),
|
|
info1->getNumGoodInlinks(),
|
|
// info1->m_numUniqueIps,
|
|
// info1->m_numUniqueCBlocks,
|
|
// info1->m_totalInlinkingDocIds,
|
|
|
|
tmp3
|
|
);
|
|
else {
|
|
sb->safePrintf (
|
|
"\t<charset><![CDATA[%s]]></charset>\n"
|
|
"\t<isAdult>%" INT32 "</isAdult>\n"
|
|
"\t<isLinkSpam>%" INT32 "</isLinkSpam>\n"
|
|
"\t<siteRank>%" INT32 "</siteRank>\n"
|
|
|
|
"\t<numGoodSiteInlinks>%" INT32 "</numGoodSiteInlinks>\n"
|
|
//"\t<numTotalSiteInlinks>%" INT32 "</numTotalSiteInlinks>\n"
|
|
// "\t<numUniqueIpsLinkingToSite>%" INT32 ""
|
|
// "</numUniqueIpsLinkingToSite>\n"
|
|
// "\t<numUniqueCBlocksLinkingToSite>%" INT32 ""
|
|
// "</numUniqueCBlocksLinkingToSite>\n"
|
|
|
|
|
|
|
|
|
|
// how many inlinks, external and internal, we have
|
|
// to this page not filtered in any way!!!
|
|
//"\t<numTotalPageInlinks>%" INT32 "</numTotalPageInlinks>\n"
|
|
// how many inlinking ips we got, including our own if
|
|
// we link to ourself
|
|
// "\t<numUniqueIpsLinkingToPage>%" INT32 ""
|
|
// "</numUniqueIpsLinkingToPage>\n"
|
|
// how many inlinking cblocks we got, including our own
|
|
// if we link to ourself
|
|
// "\t<numUniqueCBlocksLinkingToPage>%" INT32 ""
|
|
// "</numUniqueCBlocksLinkingToPage>\n"
|
|
|
|
|
|
"\t<numGoodPageInlinks>%" INT32 "</numGoodPageInlinks>\n"
|
|
"\t<pageInlinksLastComputed>%" INT32 ""
|
|
"</pageInlinksLastComputed>\n"
|
|
|
|
,get_charset_str(m_charset)
|
|
,(int32_t)m_isAdult
|
|
,(int32_t)m_isLinkSpam
|
|
,::getSiteRank(sni)
|
|
,sni
|
|
// ,m_siteNumInlinksTotal
|
|
// ,m_siteNumInlinksUniqueIp
|
|
// ,m_siteNumInlinksUniqueCBlock
|
|
|
|
//,info1->m_totalInlinkingDocIds
|
|
//,info1->m_numUniqueIps
|
|
//,info1->m_numUniqueCBlocks
|
|
|
|
,info1->getNumGoodInlinks()
|
|
//,tmp3
|
|
,(int32_t)info1->m_lastUpdated
|
|
);
|
|
//if ( m_note )
|
|
// sb->safePrintf("\t<isLinkSpamReason><![CDATA[%s]]>"
|
|
// "</isLinkSpamReason>\n"
|
|
// , m_note );
|
|
sb->safePrintf("\t<isPermalink>%" INT32 "</isPermalink>\n"
|
|
"\t<isRSSFeed>%" INT32 "</isRSSFeed>\n"
|
|
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
|
|
"\t<contentLenInBytes>%" INT32 ""
|
|
"</contentLenInBytes>\n"
|
|
"\t<isContentTruncated>%" INT32 ""
|
|
"</isContentTruncated>\n"
|
|
"\t<contentType><![CDATA[%s]]></contentType>\n"
|
|
"\t<language><![CDATA[%s]]></language>\n"
|
|
"\t<country><![CDATA[%s]]></country>\n",
|
|
(int32_t)m_isPermalink,
|
|
(int32_t)m_isRSS,
|
|
ipString,
|
|
size_utf8Content - 1,
|
|
(int32_t)m_isContentTruncated,
|
|
g_contentTypeStrings[(int)m_contentType] ,
|
|
strLanguage,
|
|
g_countryCode.getName(m_countryId) );
|
|
}
|
|
|
|
//sb->safePrintf("<tr><td>site</td><td>");
|
|
//sb->safeMemcpy(ptr_site,size_site-1);
|
|
//sb->safePrintf("</td></tr>\n");
|
|
|
|
|
|
TagRec *ogr = NULL;
|
|
if ( m_tagRecDataValid && m_version >= 118 ) {
|
|
ogr = getTagRec(); // &m_tagRec;
|
|
// sanity. should be set from titlerec, so no blocking!
|
|
if ( ! ogr || ogr == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
|
|
else if ( ogr ) ogr->printToBufAsXml ( sb );
|
|
|
|
// show the good inlinks we used when indexing this
|
|
if ( ! isXml )
|
|
info1->print(sb,cr->m_coll);
|
|
|
|
// close the table
|
|
if ( ! isXml )
|
|
sb->safePrintf ( "</table></center><br>\n" );
|
|
else
|
|
sb->safePrintf("</response>\n");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// use msg25 to hit linkdb and give us a link info class i guess
|
|
// but we need paging functionality so we can page through like
|
|
// 100 links at a time. clustered by c-class ip.
|
|
|
|
// do we need to mention how many from each ip c-class then? because
|
|
// then we'd have to read the whole termlist, might be several
|
|
// separate disk reads.
|
|
|
|
// we need to re-get both if either is NULL
|
|
LinkInfo *sinfo = getSiteLinkInfo();
|
|
// block or error?
|
|
if ( ! sinfo ) return true; if ( sinfo == (LinkInfo *)-1) return false;
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
|
|
|
|
sb->safeMemcpy ( &m_siteLinkBuf );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("</response>\n" );
|
|
|
|
// just print that
|
|
//sinfo->print ( sb , cr->m_coll );
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// we need to re-get both if either is NULL
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// block or error?
|
|
if ( ! info1 ) return true; if ( info1 == (LinkInfo *)-1) return false;
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
|
|
int32_t recompute = hr->getLong("recompute",0);
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
// i guess we need this
|
|
if ( ! recompute ) // m_setFromTitleRec )
|
|
info1->print ( sb , cr->m_coll );
|
|
else
|
|
sb->safeMemcpy ( &m_pageLinkBuf );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("</response>\n" );
|
|
|
|
return true;
|
|
}
|
|
|
|
static void getInlineSectionVotingBufWrapper ( void *state ) {
|
|
XmlDoc *xd = (XmlDoc *)state;
|
|
SafeBuf *vb = xd->getInlineSectionVotingBuf();
|
|
// return if blocked
|
|
if ( vb == (void *)-1 ) return;
|
|
// error?
|
|
if ( ! vb ) log("xmldoc: error getting inline section votes: %s",
|
|
mstrerror(g_errno));
|
|
// all done then. call original entry callback
|
|
log("xmldoc: returning control to original caller");
|
|
xd->m_callback1 ( xd->m_state );
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true with g_errno set on error
|
|
// . this actually returns the page content with inserted information
|
|
// based on sectiondb data
|
|
// . for example, <div id=poo> --> <div id=poo d=5 n=20>
|
|
// means that the section is repeated on 20 pages from this site and 5 of
|
|
// which have the same innerHtml as us
|
|
SafeBuf *XmlDoc::getInlineSectionVotingBuf ( ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . if we block anywhere below we want to come back here until done
|
|
// . this can be a main entry point, so set m_masterLoop
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getInlineSectionVotingBufWrapper;
|
|
m_masterState = this;
|
|
log("xmldoc: getting section voting info from coll=%s",
|
|
cr->m_coll);
|
|
}
|
|
|
|
if ( m_inlineSectionVotingBufValid )
|
|
return &m_inlineSectionVotingBuf;
|
|
|
|
Sections *sections = getSectionsWithDupStats();
|
|
if ( ! sections || sections == (void *)-1 ) return (SafeBuf *)sections;
|
|
Words *words = getWords();
|
|
if ( ! words || words == (void *)-1 ) return (SafeBuf *)words;
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (void *)-1 ) return (SafeBuf *)mime;
|
|
|
|
int32_t siteHash32 = *getSiteHash32();
|
|
|
|
//int32_t nw = words->getNumWords();
|
|
//int64_t *wids = words->getWordIds();
|
|
|
|
SafeBuf *sb = &m_inlineSectionVotingBuf;
|
|
|
|
// store mime first then content
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// we no longer use this through a proxy, so take this out
|
|
//sb->safeMemcpy ( m_httpReply , mime->getMimeLen() );
|
|
// but hack the Content-Length: field to something alien
|
|
// because we markup the html and the length will be different...
|
|
//sb->nullTerm();
|
|
|
|
// we no longer use this through a proxy so take this out
|
|
//char *cl = strstr(sb->getBufStart(),"\nContent-Length:");
|
|
//if ( cl ) cl[1] = 'Z';
|
|
|
|
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
|
|
|
|
// just print out each word
|
|
// map the word to a section.
|
|
// if it's the first time we've printed the section then we
|
|
// can inject the stuff
|
|
// set a printed bit to indicate when we print out a section so
|
|
// we do not re-print it...
|
|
|
|
// these are 1-1 with words
|
|
Section **sptrs = sections->m_sectionPtrs;
|
|
int32_t nw = words->getNumWords();
|
|
char **wptrs = words->m_words;
|
|
int32_t *wlens = words->m_wordLens;
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
char *a = wptrs[i];
|
|
// skip if not a front tag
|
|
if ( *a != '<' || a[1] == '/' ) {
|
|
sb->safeMemcpy(a,wlens[i]);
|
|
continue;
|
|
}
|
|
Section *sa = sptrs[i];
|
|
// straight copy if no stats
|
|
if ( ! sa || ! sa->m_stats.m_totalEntries ) {
|
|
sb->safeMemcpy ( a , wlens[i] );
|
|
continue;
|
|
}
|
|
// should be tag then
|
|
char *e = a;
|
|
for ( ; *e && *e != '>' && ! is_wspace_a(*e) ; e++);
|
|
// copy that
|
|
sb->safeMemcpy ( a , e-a);
|
|
|
|
// the hash of the turktaghash and sitehash32 combined
|
|
// so you can do gbfacetstr:gbxpathsitehash12345
|
|
// where the 12345 is this h32 value.
|
|
uint32_t h32 = sa->m_turkTagHash32 ^ siteHash32;
|
|
|
|
// insert our stuff into the tag
|
|
//sb->safePrintf("<!--");
|
|
//sb->safePrintf("<font color=red>");
|
|
SectionStats *sx = &sa->m_stats;
|
|
// # docs from our site had the same innerHTML?
|
|
sb->safePrintf(" _s=M%" INT32 "D%" INT32 "n%" INT32 "u%" INT32 "h%" UINT32 "",
|
|
// total # of docs that had an xpath with
|
|
// our same innerHtml
|
|
(int32_t)sx->m_totalMatches,
|
|
// # of of docids with this facet
|
|
(int32_t)sx->m_totalDocIds,
|
|
// . total # of times this xpath occurred
|
|
// . can be multiple times per doc
|
|
(int32_t)sx->m_totalEntries,
|
|
// unique values in the xpath innerhtml
|
|
(int32_t)sx->m_numUniqueVals,
|
|
// xpathsitehash
|
|
h32 );
|
|
// copy the rest of the tag
|
|
sb->safeMemcpy( e, wlens[i]-(e-a) );
|
|
//sb->safePrintf("-->");
|
|
//sb->safePrintf("</font>");
|
|
// print it here
|
|
}
|
|
sb->nullTerm();
|
|
m_inlineSectionVotingBufValid = true;
|
|
return &m_inlineSectionVotingBuf;
|
|
}
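// A minimal sketch (illustration only; not called anywhere) of how a consumer
// could decode the "_s=" attribute injected by getInlineSectionVotingBuf()
// above. The field order mirrors that safePrintf() format string:
// M=total matches, D=total docids, n=total entries, u=unique values,
// h=xpath/site hash. Assumes sscanf() is available via gb-include.h.
static bool parseInlineSectionStats ( char *attr ,
				      int *totalMatches ,
				      int *totalDocIds ,
				      int *totalEntries ,
				      int *numUniqueVals ,
				      unsigned int *xpathSiteHash ) {
	// expects something like "M5D20n31u7h123456789"
	return sscanf ( attr , "M%dD%dn%du%dh%u" ,
			totalMatches , totalDocIds , totalEntries ,
			numUniqueVals , xpathSiteHash ) == 5;
}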
|
|
|
|
bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// what wordposition to scroll to and blink?
|
|
int32_t hiPos = -1;
|
|
if ( hr ) hiPos = hr->getLong("hipos",-1);
|
|
|
|
//
|
|
// PRINT SECTIONS
|
|
//
|
|
Sections *sections ;
|
|
// hr is NULL if being called from page parser which does not have the
|
|
// dup stats! and we core if we block here!
|
|
if ( hr ) sections = getSectionsWithDupStats();
|
|
else sections = getSections();
|
|
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
|
|
//SectionVotingTable *nsvt = getNewSectionVotingTable();
|
|
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
Words *words = getWords();
|
|
if ( ! words ) return true; if ( words == (Words *)-1 ) return false;
|
|
Phrases *phrases = getPhrases();
|
|
if ( ! phrases ) return true; if (phrases == (void *)-1 ) return false;
|
|
HashTableX *cnt = getCountTable();
|
|
if ( ! cnt ) return true; if ( cnt == (void *)-1 ) return false;
|
|
|
|
|
|
int32_t nw = words->getNumWords();
|
|
//int32_t wordStart = 0;
|
|
//int32_t wordEnd = nw;
|
|
int64_t *wids = words->getWordIds();
|
|
|
|
int32_t isXml = false;
|
|
if ( hr ) isXml = (bool)hr->getLong("xml",0);
|
|
|
|
//if ( ! isXml ) printMenu ( sb );
|
|
|
|
// now complement, cuz bigger is better in the ranking world
|
|
//int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY );
|
|
SafeBuf densBuf;
|
|
// returns false and sets g_errno on error
|
|
if ( ! getDensityRanks((int64_t *)wids,
|
|
nw,
|
|
HASHGROUP_BODY,//hi->m_hashGroup,
|
|
&densBuf,
|
|
sections,
|
|
m_niceness))
|
|
return true;
|
|
// a handy ptr
|
|
char *densityVec = (char *)densBuf.getBufStart();
|
|
|
|
|
|
/*
|
|
if ( ! isXml )
|
|
sb->safePrintf("<br><b>density rank of body = %" INT32 "</b> "
|
|
"(out of %" INT32 ")"
|
|
"<br>"
|
|
"<br>"
|
|
, densityRank
|
|
, (int32_t)MAXDENSITYRANK
|
|
);
|
|
*/
|
|
|
|
|
|
char *wordSpamVec = getWordSpamVec();
|
|
char *fragVec = m_fragBuf.getBufStart();
|
|
|
|
SafeBuf dwbuf;
|
|
if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true;
|
|
char *diversityVec = dwbuf.getBufStart();
|
|
|
|
// hack for debugging
|
|
//m_bodyStartPos =2136;
|
|
|
|
SafeBuf wpos;
|
|
if ( ! getWordPosVec ( words ,
|
|
sections,
|
|
//wordStart,
|
|
//wordEnd,
|
|
// we save this in the titlerec, when we
|
|
// start hashing the body. we have the url
|
|
// terms before the body, so this is necessary.
|
|
m_bodyStartPos,//0, // hi->m_startDist,
|
|
fragVec,
|
|
m_niceness,
|
|
&wpos) ) return true;
|
|
// a handy ptr
|
|
int32_t *wposVec = (int32_t *)wpos.getBufStart();
|
|
|
|
if ( ! isXml ) {
|
|
// put url in for steve to parse out
|
|
sb->safePrintf("%s\n",
|
|
m_firstUrl.m_url);
|
|
sb->safePrintf("<font color=black>w</font>"
|
|
"/"
|
|
"<font color=purple>x</font>"
|
|
//"/"
|
|
//"<font color=green>y</font>"
|
|
"/"
|
|
"<font color=red>z</font>"
|
|
": "
|
|
"w=wordPosition "
|
|
"x=densityRank "
|
|
//"y=diversityRank "
|
|
"z=wordSpamRank "
|
|
"<br>"
|
|
"<br>"
|
|
""
|
|
);
|
|
}
|
|
|
|
if ( ! isXml ) {
|
|
// try the new print function
|
|
sections->print2 ( sb ,
|
|
hiPos,
|
|
wposVec,
|
|
densityVec,
|
|
diversityVec,
|
|
wordSpamVec,
|
|
fragVec,
|
|
NULL,
|
|
NULL ,
|
|
&m_addresses ,
|
|
true );
|
|
return true;
|
|
}
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
|
|
Section *si = sections->m_rootSection;
|
|
|
|
sec_t mflags = SEC_SENTENCE | SEC_MENU;
|
|
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// print it out
|
|
sb->safePrintf("\t<section>\n");
|
|
// get our offset in the array of sections
|
|
int32_t num = si - sections->m_sections;
|
|
sb->safePrintf("\t\t<id>%" INT32 "</id>\n",num);
|
|
Section *parent = si->m_parent;
|
|
if ( parent ) {
|
|
int32_t pnum = parent - sections->m_sections;
|
|
sb->safePrintf("\t\t<parent>%" INT32 "</parent>\n",pnum);
|
|
}
|
|
char *byte1 = words->m_words[si->m_a];
|
|
char *byte2 = words->m_words[si->m_b-1] +
|
|
words->m_wordLens[si->m_b-1];
|
|
int32_t off1 = byte1 - words->m_words[0];
|
|
int32_t size = byte2 - byte1;
|
|
sb->safePrintf("\t\t<byteOffset>%" INT32 "</byteOffset>\n",off1);
|
|
sb->safePrintf("\t\t<numBytes>%" INT32 "</numBytes>\n",size);
|
|
if ( si->m_flags & mflags ) {
|
|
sb->safePrintf("\t\t<flags><![CDATA[");
|
|
bool printed = false;
|
|
if ( si->m_flags & SEC_SENTENCE ) {
|
|
sb->safePrintf("sentence");
|
|
printed = true;
|
|
}
|
|
if ( si->m_flags & SEC_MENU ) {
|
|
if ( printed ) sb->pushChar(' ');
|
|
sb->safePrintf("ismenu");
|
|
printed = true;
|
|
}
|
|
sb->safePrintf("]]></flags>\n");
|
|
}
|
|
int32_t bcolor = (int32_t)si->m_colorHash& 0x00ffffff;
|
|
int32_t fcolor = 0x000000;
|
|
//int32_t rcolor = 0x000000;
|
|
uint8_t *bp = (uint8_t *)&bcolor;
|
|
bool dark = false;
|
|
if ( bp[0]<128 && bp[1]<128 && bp[2]<128 )
|
|
dark = true;
|
|
// or if any two are less than 100
|
|
if ( bp[0]<100 && bp[1]<100 ) dark = true;
|
|
if ( bp[1]<100 && bp[2]<100 ) dark = true;
|
|
if ( bp[0]<100 && bp[2]<100 ) dark = true;
|
|
// if bg color is dark, make font color light
|
|
if ( dark ) {
|
|
fcolor = 0x00ffffff;
|
|
//rcolor = 0x00ffffff;
|
|
}
|
|
sb->safePrintf("\t\t<bgColor>%06" XINT32 "</bgColor>\n",bcolor);
|
|
sb->safePrintf("\t\t<textColor>%06" XINT32 "</textColor>\n",fcolor);
|
|
// count stats
|
|
uint64_t ch64 = (int32_t)si->m_sentenceContentHash64;
|
|
if ( ! ch64 ) {
|
|
sb->safePrintf("\t</section>\n");
|
|
continue;
|
|
}
|
|
/* take this out for now it is not quite right any more.
|
|
we now use the xpath hash and site hash as the key
|
|
and the "value" is the sentence/innerHtml hash
|
|
sb->safePrintf("\t\t<numOnSitePagesThatDuplicateContent>%" INT32 ""
|
|
"</numOnSitePagesThatDuplicateContent>\n",
|
|
(int32_t)si->m_stats.m_onSiteDocIds);
|
|
sb->safePrintf("\t\t<numOffSitePagesThatDuplicateContent>%" INT32 ""
|
|
"</numOffSitePagesThatDuplicateContent>\n",
|
|
(int32_t)si->m_stats.m_offSiteDocIds);
|
|
sb->safePrintf("\t\t<numSitesThatDuplicateContent>%" INT32 ""
|
|
"</numSitesThatDuplicateContent>\n",
|
|
(int32_t)si->m_stats.m_numUniqueSites);
|
|
*/
|
|
// you can do a sitehash:xxxxx this number to see who the
|
|
// dups are!
|
|
sb->safePrintf("\t\t<innerContentHash64>%" UINT64 ""
|
|
"</innerContentHash64>\n",
|
|
si->m_sentenceContentHash64);
|
|
sb->safePrintf("\t</section>\n");
|
|
}
|
|
|
|
// now print out the entire page content so the offsets make sense!
|
|
sb->safePrintf("\t<utf8Content><![CDATA[");
|
|
if ( ptr_utf8Content )
|
|
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,false);
|
|
sb->safePrintf("]]></utf8Content>\n");
|
|
|
|
// end xml response
|
|
sb->safePrintf("</response>\n");
|
|
|
|
return true;
|
|
}
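// A small sketch (not referenced by the code above) of the background-darkness
// heuristic used when picking a text color in printRainbowSections(): the
// color is treated as dark if all three low bytes are below 128, or if any
// two of them are below 100.
static bool isDarkBgColorSketch ( int32_t bgColor ) {
	uint8_t *bp = (uint8_t *)&bgColor;
	// dark if every channel byte is below 128
	if ( bp[0] < 128 && bp[1] < 128 && bp[2] < 128 ) return true;
	// or if any two channel bytes are below 100
	if ( bp[0] < 100 && bp[1] < 100 ) return true;
	if ( bp[1] < 100 && bp[2] < 100 ) return true;
	if ( bp[0] < 100 && bp[2] < 100 ) return true;
	return false;
}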
|
|
|
|
bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// set debug buffer
|
|
m_storeTermListInfo = true;
|
|
|
|
// default to sorting by wordpos
|
|
m_sortTermListBy = hr->getLong("sortby",1);
|
|
|
|
// cores in getNewSpiderReply() if we do not have this and provide
|
|
// the docid...
|
|
m_useSpiderdb = false;
|
|
|
|
char *metaList = getMetaList ( );
|
|
if ( ! metaList ) return true; if (metaList==(char *) -1) return false;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( isXml ) {
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
sb->safePrintf(
|
|
"\t<maxDens>%" INT32 "</maxDens>\n"
|
|
//"\t<maxDiv>%" INT32 "</maxDiv>\n"
|
|
"\t<maxSpam>%" INT32 "</maxSpam>\n"
|
|
, (int32_t)MAXDENSITYRANK
|
|
//, (int32_t)MAXDIVERSITYRANK
|
|
, (int32_t)MAXWORDSPAMRANK
|
|
);
|
|
}
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! isXml ) {
|
|
//printMenu ( sb );
|
|
//sb->safePrintf("<i>* indicates word is a synonym or "
|
|
// "alternative word form<br><br>");
|
|
sb->safePrintf("N column = DensityRank (0-%" INT32 ")<br>"
|
|
//"V column = DiversityRank (0-%" INT32 ")<br>"
|
|
"S column = WordSpamRank (0-%" INT32 ") "
|
|
"[or linker "
|
|
"siterank if its offsite link text]<br>"
|
|
|
|
"Lang column = language used for purposes "
|
|
"of detecting the document's primary language "
|
|
"using a simple majority vote"
|
|
"<br>"
|
|
|
|
"</i>"
|
|
"<br>"
|
|
"Document Primary Language: <b>%s</b> (%s)"
|
|
"<br>"
|
|
"<br>"
|
|
, (int32_t)MAXDENSITYRANK
|
|
//, (int32_t)MAXDIVERSITYRANK
|
|
, (int32_t)MAXWORDSPAMRANK
|
|
, getLanguageString (m_langId)
|
|
, getLangAbbr(m_langId)
|
|
);
|
|
// encode it
|
|
SafeBuf ue;
|
|
ue.urlEncode ( ptr_firstUrl );
|
|
|
|
sb->safePrintf("Sort by: " );
|
|
if ( m_sortTermListBy == 0 )
|
|
sb->safePrintf("<b>Term</b>");
|
|
else
|
|
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
|
|
"sortby=0>"
|
|
"Term</a>"
|
|
, cr->m_coll
|
|
, ue.getBufStart()
|
|
);
|
|
sb->safePrintf(" | ");
|
|
if ( m_sortTermListBy == 1 )
|
|
sb->safePrintf("<b>WordPos</b>");
|
|
else
|
|
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
|
|
"sortby=1>"
|
|
"WordPos</a>"
|
|
, cr->m_coll
|
|
, ue.getBufStart()
|
|
);
|
|
sb->safePrintf("<br>"
|
|
"<br>"
|
|
);
|
|
}
|
|
|
|
|
|
//
|
|
// BEGIN PRINT HASHES TERMS (JUST POSDB)
|
|
//
|
|
|
|
// shortcut
|
|
HashTableX *wt = m_wts;
|
|
|
|
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
|
|
TermDebugInfo **tp = NULL;
|
|
// add them with this counter
|
|
int32_t nt = 0;
|
|
|
|
int32_t nwt = 0;
|
|
if ( wt ) {
|
|
nwt = wt->m_numSlots;
|
|
tp = (TermDebugInfo **)wt->m_keys;
|
|
}
|
|
|
|
// now print the table we stored all we hashed into
|
|
for ( int32_t i = 0 ; i < nwt ; i++ ) {
|
|
// skip if empty
|
|
if ( wt->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
//QUICKPOLL(m_niceness);
|
|
// get its key, date=32bits termid=64bits
|
|
//key96_t *k = (key96_t *)wt->getKey ( i );
|
|
// get the TermDebugInfo
|
|
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
|
|
// point to it for sorting
|
|
tp[nt++] = ti;
|
|
}
|
|
|
|
// set this for cmptp
|
|
s_wbuf = &m_wbuf;
|
|
|
|
if ( m_sortTermListBy == 0 )
|
|
// sort them alphabetically
|
|
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
|
|
else
|
|
// sort by word pos
|
|
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp2 , m_niceness );
|
|
|
|
|
|
// print the weight tables
|
|
//printLocationWeightsTable(sb,isXml);
|
|
//printDiversityWeightsTable(sb,isXml);
|
|
//printDensityWeightsTable(sb,isXml);
|
|
//printWordSpamWeightsTable(sb,isXml);
|
|
|
|
// print them out in a table
|
|
char hdr[1000];
|
|
sprintf(hdr,
|
|
"<table border=1 cellpadding=0>"
|
|
"<tr>"
|
|
// this messes up Test.cpp diff'ing
|
|
//"<td><b>#</b></td>"
|
|
"<td><b>Prefix</b></td>"
|
|
"<td><b>WordPos</b></td>"
|
|
"<td><b>Lang</b></td>"
|
|
|
|
"<td><b>Term</b></td>"
|
|
|
|
//"%s"
|
|
|
|
//"<td><b>Weight</b></td>"
|
|
//"<td><b>Spam</b></td>"
|
|
|
|
"<td><b>Desc</b></td>"
|
|
|
|
"<td><b>N</b></td>"
|
|
//"<td><b>V</b></td>" // diversityRank
|
|
"<td><b>S</b></td>"
|
|
"<td><b>Score</b></td>"
|
|
|
|
//"<td><b>Date</b></td>"
|
|
//"<td><b>Desc</b></td>"
|
|
//"<td><b>TermId</b></td>"
|
|
"</tr>\n"
|
|
//,fbuf
|
|
);
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("%s",hdr);
|
|
|
|
char *start = m_wbuf.getBufStart();
|
|
int32_t rcount = 0;
|
|
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS) == 0 && ! isXml )
|
|
sb->safePrintf("<!--ignore--></table>%s",hdr);
|
|
|
|
char *prefix = NULL;//" ";
|
|
if ( tp[i]->m_prefixOff >= 0 )
|
|
prefix = start + tp[i]->m_prefixOff;
|
|
|
|
if ( isXml ) sb->safePrintf("\t<term>\n");
|
|
|
|
if ( isXml && prefix )
|
|
sb->safePrintf("\t\t<prefix><![CDATA[%s]]>"
|
|
"</prefix>\n",prefix);
|
|
|
|
if ( ! isXml ) {
|
|
sb->safePrintf ( "<tr>");
|
|
if ( prefix )
|
|
sb->safePrintf("<td>%s:</td>",prefix);
|
|
else
|
|
sb->safePrintf("<td> </td>");
|
|
}
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<td>%" INT32 ""
|
|
"/%" INT32 ""
|
|
"</td>" ,
|
|
tp[i]->m_wordPos
|
|
,tp[i]->m_wordNum
|
|
);
|
|
|
|
//char *abbr = getLangAbbr(tp[i]->m_langId);
|
|
//if ( tp[i]->m_langId == langTranslingual ) abbr ="??";
|
|
//if ( tp[i]->m_langId == langUnknown ) abbr ="--";
|
|
//if ( tp[i]->m_synSrc ) abbr = "";
|
|
|
|
|
|
// print out all langs word is in if it's not clear
|
|
// what language it is. we use a sliding window to
|
|
// resolve some ambiguity, but not all, so print out
|
|
// the possible langs here
|
|
if ( ! isXml ) {
|
|
sb->safePrintf("<td>");
|
|
printLangBits ( sb , tp[i] );
|
|
sb->safePrintf("</td>");
|
|
}
|
|
|
|
|
|
//if ( ! isXml && abbr[0] )
|
|
// sb->safePrintf("<td>%s</td>", abbr );
|
|
//else if ( ! isXml )
|
|
// sb->safePrintf("<td> </td>" );
|
|
//else if ( abbr[0] )
|
|
// sb->safePrintf("\t\t<lang><![CDATA["
|
|
// "]]>%s</lang>\n", abbr );
|
|
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<s><![CDATA[");
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf ("<td><nobr>" );
|
|
|
|
//if ( tp[i]->m_synSrc )
|
|
// sb->pushChar('*');
|
|
|
|
sb->safeMemcpy_nospaces ( start + tp[i]->m_termOff ,
|
|
tp[i]->m_termLen );
|
|
|
|
/*
|
|
char *dateStr = " ";
|
|
int32_t ddd = tp[i]->m_date;
|
|
uint8_t *tddd = (uint8_t *)&ddd;
|
|
char tbbb[32];
|
|
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
|
|
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
|
|
sprintf(tbbb,"evIds %" INT32 "-%" INT32 "",
|
|
(int32_t)tddd[1],(int32_t)tddd[0]);
|
|
dateStr = tbbb;
|
|
}
|
|
else if ( ddd )
|
|
dateStr = asctime ( gmtime(&ddd ));
|
|
|
|
char tmp[20];
|
|
if ( tp[i]->m_noSplit ) sprintf ( tmp,"<b>1</b>" );
|
|
else sprintf ( tmp,"0" );
|
|
*/
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("]]></s>\n");
|
|
else
|
|
sb->safePrintf ( "</nobr></td>" );
|
|
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<wordPos>%" INT32 "</wordPos>\n",
|
|
tp[i]->m_wordPos);
|
|
|
|
char *desc = NULL;
|
|
if ( tp[i]->m_descOff >= 0 )
|
|
desc = start + tp[i]->m_descOff;
|
|
|
|
// use hashgroup
|
|
int32_t hg = tp[i]->m_hashGroup;
|
|
if ( ! desc || ! strcmp(desc,"body") )
|
|
desc = getHashGroupString(hg);
|
|
|
|
if ( isXml && desc )
|
|
sb->safePrintf("\t\t<loc>%s</loc>\n", desc);
|
|
else if ( ! isXml ) {
|
|
if ( ! desc ) desc = "&nbsp;";
|
|
sb->safePrintf ( "<td>%s", desc );
|
|
char ss = tp[i]->m_synSrc;
|
|
if ( ss )
|
|
sb->safePrintf(" - %s",
|
|
getSourceString(ss));
|
|
sb->safePrintf("</td>");
|
|
}
|
|
|
|
int32_t dn = (int32_t)tp[i]->m_densityRank;
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<dens>%" INT32 "</dens>\n",dn);
|
|
|
|
if ( ! isXml && dn >= MAXDENSITYRANK )
|
|
sb->safePrintf("<td>%" INT32 "</td>\n",dn);
|
|
else if ( ! isXml )
|
|
sb->safePrintf("<td><font color=purple>%" INT32 "</font>"
|
|
"</td>",dn);
|
|
|
|
// the diversityrank/wordspamrank
|
|
/*
|
|
int32_t ds = (int32_t)tp[i]->m_diversityRank;
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<div>%" INT32 "</div>\n",ds);
|
|
if ( ! isXml && ds >= MAXDIVERSITYRANK )
|
|
sb->safePrintf("<td>%" INT32 "</td>\n",ds);
|
|
else if ( ! isXml )
|
|
sb->safePrintf("<td><font color=green>%" INT32 "</font>"
|
|
"</td>",ds);
|
|
*/
|
|
|
|
int32_t ws = (int32_t)tp[i]->m_wordSpamRank;
|
|
|
|
if ( isXml && hg == HASHGROUP_INLINKTEXT )
|
|
sb->safePrintf("\t\t<linkerSiteRank>%" INT32 ""
|
|
"</linkerSiteRank>\n",ws);
|
|
else if ( isXml )
|
|
sb->safePrintf("\t\t<spam>%" INT32 "</spam>\n",ws);
|
|
|
|
if ( ! isXml && ws >= MAXWORDSPAMRANK )
|
|
sb->safePrintf("<td>%" INT32 "</td>",ws);
|
|
else if ( ! isXml )
|
|
sb->safePrintf("<td><font color=red>%" INT32 "</font></td>",
|
|
ws);
|
|
|
|
float score = 1.0;
|
|
// square this like we do in the query ranking algo
|
|
score *= getHashGroupWeight(hg) * getHashGroupWeight(hg);
|
|
//score *= getDiversityWeight(tp[i]->m_diversityRank);
|
|
score *= getDensityWeight(tp[i]->m_densityRank);
|
|
if ( tp[i]->m_synSrc ) score *= SYNONYM_WEIGHT;
|
|
if ( hg == HASHGROUP_INLINKTEXT ) score *= getLinkerWeight(ws);
|
|
else score *= getWordSpamWeight(ws);
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<score>%.02f</score>\n",score);
|
|
else
|
|
sb->safePrintf("<td>%.02f</td>\n",score);
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("\t</term>\n");
|
|
else
|
|
sb->safePrintf("</tr>\n");
|
|
}
|
|
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("</response>\n" );
|
|
else
|
|
sb->safePrintf("</table><br>\n");
|
|
|
|
//
|
|
// END PRINT HASHES TERMS
|
|
//
|
|
|
|
return true;
|
|
}
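// A minimal sketch (not used by the table code above) of how the per-term
// "Score" column in printTermList() is composed, assuming the weight helpers
// behave exactly as they are called above: the hash-group weight is squared
// (mirroring the query ranking algo), then the density, synonym and
// word-spam/linker weights are multiplied in.
static float getTermListScoreSketch ( int32_t hashGroup ,
				      int32_t densityRank ,
				      int32_t wordSpamRank ,
				      char synSrc ) {
	float score = 1.0;
	// squared, like in the query ranking algo
	score *= getHashGroupWeight(hashGroup) * getHashGroupWeight(hashGroup);
	score *= getDensityWeight(densityRank);
	if ( synSrc ) score *= SYNONYM_WEIGHT;
	// inlink text is weighted by the linker's siterank, everything
	// else by the word spam rank
	if ( hashGroup == HASHGROUP_INLINKTEXT )
		score *= getLinkerWeight(wordSpamRank);
	else
		score *= getWordSpamWeight(wordSpamRank);
	return score;
}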
|
|
|
|
bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
sb->safePrintf("<b>Coming Soon</b>");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
char **c = getUtf8Content();
|
|
if ( ! c ) return true; if ( c==(void *)-1) return false;
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
int32_t raw = hr->getLong("raw",0);
|
|
|
|
if ( ! isXml && ! raw ) printMenu ( sb );
|
|
|
|
if ( ! isXml ) {
|
|
// just copy it otherwise
|
|
if ( ptr_utf8Content )
|
|
sb->safeMemcpy ( ptr_utf8Content ,size_utf8Content -1);
|
|
return true;
|
|
}
|
|
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
sb->safePrintf("\t<utf8Content><![CDATA[");
|
|
if ( ptr_utf8Content )
|
|
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,
|
|
false);
|
|
sb->safePrintf("]]></utf8Content>\n");
|
|
// end xml response
|
|
sb->safePrintf("</response>\n");
|
|
return true;
|
|
}
|
|
|
|
|
|
// . get the possible titles of the root page
|
|
// . includes the title tag text
|
|
// . includes various inlink text
|
|
// . used to match the VERIFIED place name 1 or 2 of addresses on this
|
|
// site in order to set Address::m_flags's AF_VENUE_DEFAULT bit which
|
|
// indicates the address is the address of the website (a venue website)
|
|
char **XmlDoc::getRootTitleBuf ( ) {
|
|
|
|
// return if valid
|
|
if ( m_rootTitleBufValid ) return (char **)&m_rootTitleBuf;
|
|
|
|
// get it from the tag rec first
|
|
setStatus ( "getting root title buf");
|
|
|
|
// sanity check, root must have been indexed
|
|
//if ( ! m_sreq.m_rootIndexed ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . update it first before reading it!
|
|
// . do not update it here, just update it in getTitleRec() because
|
|
// this makes doConsistencyCheck() block and core
|
|
//bool *status2 = updateSiteTitleBuf();
|
|
//if ( ! status2 || status2 == (void *)-1 ) return (char **)status2;
|
|
|
|
// get it from the tag rec if we can
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (void *)-1 ) return (char **)gr;
|
|
|
|
// clear this if not set from title rec
|
|
//if ( ! m_setFromTitleRec ) {
|
|
// ptr_siteTitleBuf = NULL;
|
|
// size_siteTitleBuf = 0;
|
|
//}
|
|
|
|
// PROBLEM: new title rec is the only thing which has sitetitles tag
|
|
// sometimes and we do not store that in the title rec. in this case
|
|
// we should maybe store ptr_siteTitleBuf/size_siteTitleBuf in the
|
|
// title rec?
|
|
Tag *tag = gr->getTag("roottitles");
|
|
|
|
char *src = NULL;
|
|
int32_t srcSize = 0;
|
|
|
|
if ( ptr_rootTitleBuf || m_setFromTitleRec ) {
|
|
src = ptr_rootTitleBuf;
|
|
srcSize = size_rootTitleBuf;
|
|
}
|
|
else if ( tag ) {
|
|
src = tag->getTagData();
|
|
srcSize = tag->getTagDataSize();
|
|
// no need to add to title rec since already in the tag so
|
|
// make sure we did not double add
|
|
if ( ptr_rootTitleBuf ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
// . get the root doc
|
|
// . allow for a one hour cache of the titleRec
|
|
XmlDoc **prd = getRootXmlDoc( 3600 );
|
|
if ( ! prd || prd == (void *)-1 ) return (char **)prd;
|
|
// shortcut
|
|
XmlDoc *rd = *prd;
|
|
// . if no root doc, then assume no root title
|
|
// . this happens if we are injecting because we do not want
|
|
// to download the root page for speed purposes
|
|
if ( ! rd ) {
|
|
m_rootTitleBuf[0] = '\0';
|
|
m_rootTitleBufSize = 0;
|
|
m_rootTitleBufValid = true;
|
|
return (char **)&m_rootTitleBuf;
|
|
}
|
|
// . ONLY do this if root doc was NOT set from titleRec to
|
|
// avoid that core in updateSiteTitleBuf(). this can happen
|
|
// if the root doc had no title! (or no content)
|
|
//if ( rd->m_setFromTitleRec ) {
|
|
// // empty
|
|
// m_siteTitleBuf[0] = '\0';
|
|
// // set the size of it
|
|
// m_siteTitleBufSize = 0;
|
|
// // validate it
|
|
// m_siteTitleBufValid = true;
|
|
// // return a ptr to it
|
|
// return (char **)&m_siteTitleBuf;
|
|
//}
|
|
|
|
// a \0 separated list
|
|
char **rtl = rd->getTitleBuf();
|
|
if ( ! rtl || rtl == (void *)-1 ) return (char **)rtl;
|
|
|
|
// ptr
|
|
src = rd->m_titleBuf;
|
|
srcSize = rd->m_titleBufSize;
|
|
}
|
|
|
|
int32_t max = (int32_t)ROOT_TITLE_BUF_MAX - 5;
|
|
// sanity
|
|
if ( srcSize >= max ) {
|
|
// truncate
|
|
srcSize = max;
|
|
// back up so we split on a space
|
|
for ( ; srcSize>0 && ! is_wspace_a(src[srcSize]); srcSize--);
|
|
// null term
|
|
src[srcSize] = '\0';
|
|
// include it
|
|
srcSize++;
|
|
}
|
|
|
|
// copy that over in case root is destroyed
|
|
gbmemcpy ( m_rootTitleBuf , src , srcSize );
|
|
m_rootTitleBufSize = srcSize;
|
|
|
|
// sanity check, must include the null in the size
|
|
if ( m_rootTitleBufSize > 0 &&
|
|
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
|
|
log("build: bad root titlebuf size not end in null char for "
|
|
"collnum=%i",(int)m_collnum);
|
|
ptr_rootTitleBuf = NULL;
|
|
size_rootTitleBuf = 0;
|
|
m_rootTitleBufValid = true;
|
|
return (char **)&m_rootTitleBuf;
|
|
char *xx=NULL;*xx=0;
|
|
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
|
|
//m_rootTitleBufSize++;
|
|
}
|
|
|
|
// sanity check - breach check
|
|
if ( m_rootTitleBufSize > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0;}
|
|
|
|
// serialize into our titlerec
|
|
ptr_rootTitleBuf = m_rootTitleBuf;
|
|
size_rootTitleBuf = m_rootTitleBufSize;
|
|
|
|
m_rootTitleBufValid = true;
|
|
|
|
return (char **)&m_rootTitleBuf;
|
|
}
|
|
|
|
|
|
char **XmlDoc::getFilteredRootTitleBuf ( ) {
|
|
|
|
if ( m_filteredRootTitleBufValid )
|
|
return (char **)&m_filteredRootTitleBuf;
|
|
|
|
// get unfiltered. m_rootTitleBuf should be set from this call.
|
|
char **rtbp = getRootTitleBuf();
|
|
if ( ! rtbp || rtbp == (void *)-1 ) return (char **)rtbp;
|
|
|
|
/*
|
|
// assume none
|
|
m_filteredRootTitleBuf[0] = '\0';
|
|
m_filteredRootTitleBufSize = 0;
|
|
m_filteredRootTitleBufValid = true;
|
|
return (char **)&m_filteredRootTitleBuf;
|
|
*/
|
|
|
|
// filter all the punct to \0 so that something like
|
|
// "walmart.com : live better" is reduced to 3 potential
|
|
// names, "walmart", "com" and "live better"
|
|
char *src = m_rootTitleBuf;
|
|
char *srcEnd = src + m_rootTitleBufSize;
|
|
char *dst = m_filteredRootTitleBuf;
|
|
// save some room to add a \0, so subtract 5
|
|
char *dstEnd = dst + ROOT_TITLE_BUF_MAX - 5;
|
|
//char *src = tag->getTagData();
|
|
//char *srcEnd = src + tag->getTagDataSize();
|
|
int32_t size = 0;
|
|
bool lastWasPunct = true;
|
|
for ( ; src < srcEnd && dst < dstEnd ; src += size ) {
|
|
// set the char size
|
|
size = getUtf8CharSize(src);
|
|
// space?
|
|
if ( is_wspace_a (*src) ||
|
|
// allow periods too
|
|
*src=='.' ) {
|
|
// no back to back punct
|
|
if ( lastWasPunct ) continue;
|
|
// flag it
|
|
lastWasPunct = true;
|
|
// add it in
|
|
*dst++ = '.';
|
|
// that's it
|
|
continue;
|
|
}
|
|
// x'y or x-y
|
|
if ( ( *src == '\'' ||
|
|
*src == '.' ||
|
|
*src == '-' ) &&
|
|
! lastWasPunct &&
|
|
is_alnum_a(src[1]) ) {
|
|
// add it in
|
|
*dst++ = *src;
|
|
// that's it
|
|
continue;
|
|
}
|
|
// x & y is ok
|
|
if ( *src == '&' ) {
|
|
// assume not punct (stands for and)
|
|
lastWasPunct = false;
|
|
// add it in
|
|
*dst++ = *src;
|
|
// that's it
|
|
continue;
|
|
}
|
|
// store alnums right in
|
|
if ( is_alnum_a(*src) ) {
|
|
// flag it
|
|
lastWasPunct = false;
|
|
// copy it over
|
|
gbmemcpy ( dst , src , size );
|
|
// skip what we copied
|
|
dst += size;
|
|
continue;
|
|
}
|
|
// other punct: if the last thing we stored was punct, turn it into a
// terminator (guard against writing before the start of the buffer)
if ( lastWasPunct ) { if ( dst > m_filteredRootTitleBuf ) dst[-1] = '\0'; }
// otherwise terminate the name we were building
else *dst++ = '\0';
|
|
}
|
|
// make sure we end on a \0
|
|
if ( dst > m_filteredRootTitleBuf && dst[-1] != '\0' )
|
|
*dst++ = '\0';
|
|
|
|
// shortcut
|
|
char *str = m_filteredRootTitleBuf;
|
|
int32_t strSize = dst - m_filteredRootTitleBuf;
|
|
|
|
// str already points into m_filteredRootTitleBuf, so this copy is a no-op; we mainly need the size
|
|
gbmemcpy ( m_filteredRootTitleBuf , str , strSize );
|
|
m_filteredRootTitleBufSize = strSize;
|
|
|
|
// sanity check, must include the null in the size
|
|
if ( m_filteredRootTitleBufSize > 0 &&
|
|
m_filteredRootTitleBuf [ m_filteredRootTitleBufSize - 1 ] ) {
|
|
char *xx=NULL;*xx=0;
|
|
//m_filteredRootTitleBuf [ m_filteredRootTitleBufSize-1]='\0';
|
|
//m_filteredRootTitleBufSize++;
|
|
}
|
|
|
|
// sanity check - breach check
|
|
if ( m_filteredRootTitleBufSize > ROOT_TITLE_BUF_MAX ) {
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
m_filteredRootTitleBufValid = true;
|
|
|
|
// make this static to avoid compiler warning
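// (note: being a function-local static, fp is only initialized on the first
// call, so it keeps pointing at that first XmlDoc instance's buffer)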
|
|
static char *fp = m_filteredRootTitleBuf;
|
|
|
|
return (char **)&fp;
|
|
//return (char **)&m_filteredRootTitleBuf;
|
|
}
|
|
|
|
//static bool s_dummyBool = 1;
|
|
|
|
class Binky {
|
|
public:
|
|
char *m_text;
|
|
int32_t m_textLen;
|
|
int32_t m_score;
|
|
int64_t m_hash;
|
|
};
|
|
|
|
|
|
int cmpbk ( const void *v1, const void *v2 ) {
|
|
Binky *b1 = (Binky *)v1;
|
|
Binky *b2 = (Binky *)v2;
|
|
return b1->m_score - b2->m_score;
|
|
}
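// (note: as written this sorts ascending by m_score; the commented-out
// sanity check in getTitleBuf() below assumes a descending sort)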
|
|
|
|
char **XmlDoc::getTitleBuf ( ) {
|
|
if ( m_titleBufValid ) return (char **)&m_titleBuf;
|
|
|
|
// recalc this every time the root page is indexed
|
|
setStatus ( "getting title buf on root");
|
|
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
|
|
// this should only be called on the root!
|
|
// . if the site changed for us, but the title rec of what we
|
|
// think is now the root thinks that it is not the root because
|
|
// it is using the old site, then it cores here!
|
|
// . i.e. if the new root is www.xyz.com/user/ted/ and the old root
|
|
// is www.xyz.com then and the old root is stored in ptr_site for
|
|
// the title rec for www.xyz.com/user/ted/ then we core here,
|
|
// . so take this sanity check out
|
|
// . but if the title rec does not think he is the site root yet
|
|
// then just wait until he does so we can get his
|
|
// ptr_rootTitleBuf below
|
|
if ( ! *isRoot ) {
|
|
m_titleBuf[0] = '\0';
|
|
m_titleBufSize = 0;
|
|
m_titleBufValid = true;
|
|
return (char **)&m_titleBuf;
|
|
}
|
|
|
|
// sanity check
|
|
if ( m_setFromTitleRec ) {
|
|
gbmemcpy(m_titleBuf, ptr_rootTitleBuf, size_rootTitleBuf );
|
|
m_titleBufSize = size_rootTitleBuf;
|
|
m_titleBufValid = true;
|
|
return (char **)&m_titleBuf;
|
|
}
|
|
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
|
|
// get link info first
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// error or blocked
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char **)info1;
|
|
|
|
// sanity check
|
|
Xml *xml = getXml();
|
|
// return -1 if it blocked
|
|
if ( xml == (void *)-1 ) return (char **)-1;
|
|
// set up for title
|
|
int32_t tlen ;
|
|
char *title ;
|
|
// on error, ignore it to avoid hammering the root!
|
|
if ( xml == (void *)NULL ) {
|
|
// log it
|
|
log("build: error downloading root xml: %s",
|
|
mstrerror(g_errno));
|
|
// clear it
|
|
g_errno = 0;
|
|
// make it 0
|
|
tlen = 0;
|
|
title = NULL;
|
|
}
|
|
else {
|
|
// get the title
|
|
title = m_xml.getTextForXmlTag ( 0,
|
|
999999 ,
|
|
"title" ,
|
|
&tlen ,
|
|
true ); // skip leading spaces
|
|
}
|
|
|
|
// truncate to 100 chars
|
|
//for ( ; tlen>0 && (tlen > 100 || is_alnum_a(title[tlen])) ; tlen-- )
|
|
// if ( tlen == 0 ) break;
|
|
if ( tlen > 100 ) {
|
|
char *tpend = title + 100;
|
|
char *prev = getPrevUtf8Char ( tpend , title );
|
|
// make that the end so we don't split a utf8 char
|
|
tlen = prev - title;
|
|
}
|
|
|
|
// store tag in here
|
|
char tmp[1024];
|
|
// point to it
|
|
char *ptmp = tmp;
|
|
// set this
|
|
char *pend = tmp + 1024;
|
|
// add that in
|
|
gbmemcpy ( ptmp, title, tlen); ptmp += tlen;
|
|
// null terminate it
|
|
*ptmp++ = '\0';
|
|
|
|
// at most two internal inlinks get counted (see the internalCount check below)
|
|
int32_t internalCount = 0;
|
|
// count inlinkers
|
|
int32_t linkNum = 0;
|
|
Binky bk[1000];
|
|
// init this
|
|
//char stbuf[2000];
|
|
//HashTableX scoreTable;
|
|
//scoreTable.set(8,4,64,stbuf,2000,false,m_niceness,"xmlscores");
|
|
// scan each link in the link info
|
|
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
|
|
// do not breach
|
|
if ( linkNum >= 1000 ) break;
|
|
// is this inlinker internal?
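// (the two IPs share their low two bytes, so treat the linker as being
// on the same network; such inlinks are capped via internalCount below)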
|
|
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// skip corrupted
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 4 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
// store these
|
|
// zero out hash
|
|
bk[linkNum].m_hash = 0;
|
|
bk[linkNum].m_text = txt;
|
|
bk[linkNum].m_textLen = tlen;
|
|
bk[linkNum].m_score = 0;
|
|
// internal count
|
|
if ( internal && ++internalCount >= 3 ) continue;
|
|
// it's good
|
|
bk[linkNum].m_score = 1;
|
|
linkNum++;
|
|
/*
|
|
// set into words
|
|
Words w;
|
|
// return NULL on error with g_errno set
|
|
if ( ! w.setx ( txt , tlen , m_niceness ) ) return NULL;
|
|
// shortcut
|
|
int64_t *wids = w.getWordIds();
|
|
// init hash
|
|
int64_t h = 0LL;
|
|
// hash all words together
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if not hashable
|
|
if ( ! wids[i] ) continue;
|
|
// mix it up
|
|
h <<= 1LL;
|
|
// xor it in
|
|
h ^= wids[i];
|
|
}
|
|
// update hash
|
|
bk[linkNum].m_hash = h;
|
|
// store in table, return NULL with g_errno set on error
|
|
if ( ! scoreTable.addTerm ( &h ) ) return NULL;
|
|
*/
|
|
}
|
|
// init this
|
|
char dtbuf[1000];
|
|
HashTableX dupTable;
|
|
dupTable.set(8,0,64,dtbuf,1000,false,m_niceness,"xmldup");
|
|
// now set the scores and isdup
|
|
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
|
|
// skip if ignored
|
|
if ( bk[i].m_score == 0 ) continue;
|
|
// get hash
|
|
int64_t h = bk[i].m_hash;
|
|
// assume a dup
|
|
bk[i].m_score = 0;
|
|
// skip if zero'ed out
|
|
if ( ! h ) continue;
|
|
// only do each hash once!
|
|
if ( dupTable.isInTable(&h) ) continue;
|
|
// add to it. return NULL with g_errno set on error
|
|
if ( ! dupTable.addKey(&h) ) return NULL;
|
|
// is it in there?
|
|
bk[i].m_score = 1; // scoreTable.getScore ( &h );
|
|
}
|
|
// now sort the bk array by m_score
|
|
//gbsort ( bk , linkNum , sizeof(Binky), cmpbk , m_niceness );
|
|
|
|
// sanity check - make sure sorted right
|
|
//if ( linkNum >= 2 && bk[0].m_score < bk[1].m_score ) {
|
|
// char *xx=NULL; *xx=0; }
|
|
|
|
// . now add the winners to the buffer
|
|
// . skip if score is 0
|
|
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
|
|
// skip if score is zero
|
|
if ( bk[i].m_score == 0 ) continue;
|
|
// skip if too big
|
|
if ( bk[i].m_textLen + 1 > pend - ptmp ) continue;
|
|
// store it
|
|
gbmemcpy ( ptmp , bk[i].m_text , bk[i].m_textLen );
|
|
// advance
|
|
ptmp += bk[i].m_textLen;
|
|
// null terminate it
|
|
*ptmp++ = '\0';
|
|
}
|
|
|
|
// sanity
|
|
int32_t size = ptmp - tmp;
|
|
if ( size > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0; }
|
|
|
|
gbmemcpy ( m_titleBuf , tmp , ptmp - tmp );
|
|
m_titleBufSize = size;
|
|
m_titleBufValid = true;
|
|
// ensure null terminated
|
|
if ( size > 0 && m_titleBuf[size-1] ) { char *xx=NULL;*xx=0; }
|
|
//ptr_siteTitleBuf = m_siteTitleBuf;
|
|
//size_siteTitleBuf = m_siteTitleBufSize;
|
|
return (char **)&m_titleBuf;
|
|
}
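// A small sketch (illustration only) of the re-add policy the tagdb updates
// in getNewTagBuf() below follow: a tag is (re)written if it was missing,
// if its value changed, or if the existing tag is ten or more days old.
static bool shouldRefreshTagSketch ( bool hadOldTag ,
				     bool valueChanged ,
				     int32_t nowGlobal ,
				     int32_t tagTimestamp ) {
	if ( ! hadOldTag ) return true;
	if ( valueChanged ) return true;
	// ten days, matching the now-timestamp > 10*86400 checks below
	if ( nowGlobal - tagTimestamp > 10*86400 ) return true;
	return false;
}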
|
|
|
|
|
|
// . now we just get all the tagdb rdb recs to add using this function
|
|
// . then we just use the metalist to update tagdb
|
|
SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
|
|
|
if ( m_newTagBufValid ) return &m_newTagBuf;
|
|
|
|
setStatus ( "getting new tags");
|
|
|
|
int32_t *ic = getIndexCode();
|
|
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get our ip
|
|
int32_t *ip = getIp();
|
|
// this must not block to avoid re-computing "addme" above
|
|
if ( ip == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! ip || ip == (int32_t *)-1) return (SafeBuf *)ip;
|
|
|
|
// . do not bother if there is a problem
|
|
// . otherwise if our ip is invalid (0 or 1) we core in
|
|
// getNumSiteInlinks() which requires a valid ip
|
|
// . if its robots.txt disallowed, then indexCode will be set, but we
|
|
// still want to cache our sitenuminlinks in tagdb! delicious.com was
|
|
// recomputing the sitelinkinfo each time because we were not storing
|
|
// these tags in tagdb!!
|
|
if ( ! *ip || *ip == -1 ) { // *ic ) {
|
|
m_newTagBuf.reset();
|
|
m_newTagBufValid = true;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
// get the tags already in tagdb
|
|
TagRec *gr = getTagRec ( );
|
|
if ( ! gr || gr == (void *)-1 ) return (SafeBuf *)gr;
|
|
|
|
// get our site
|
|
char *mysite = getSite();
|
|
// this must not block to avoid re-computing "addme" above
|
|
if ( mysite == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! mysite || mysite == (char *)-1 ) return (SafeBuf *)mysite;
|
|
|
|
// age of tag in seconds
|
|
int32_t timestamp;
|
|
|
|
// always just use the primary tagdb so we can cache our sitenuminlinks
|
|
char rdbId = RDB_TAGDB;
|
|
//if ( m_useSecondaryRdbs ) rdbId = RDB2_TAGDB2;
|
|
//else rdbId = RDB_TAGDB;
|
|
|
|
// sitenuminlinks special for repair
|
|
if ( m_useSecondaryRdbs &&
|
|
// and not rebuilding titledb
|
|
! m_useTitledb ) {
|
|
m_newTagBuf.reset();
|
|
m_newTagBufValid = true;
|
|
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,×tamp);
|
|
if ( old1 == m_siteNumInlinks &&
|
|
old1 != -1 &&
|
|
! m_updatingSiteLinkInfoTags )
|
|
return &m_newTagBuf;
|
|
int32_t now = getTimeGlobal();
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: adding tag site=%s sitenuminlinks=%" INT32 "",
|
|
mysite,m_siteNumInlinks);
|
|
if ( ! m_newTagBuf.addTag2(mysite,"sitenuminlinks",now,
|
|
"xmldoc",
|
|
*ip,m_siteNumInlinks,rdbId) )
|
|
return NULL;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
// if doing consistency check, this buf is for adding to tagdb
|
|
// so just ignore those. we use ptr_tagRecData in getTagRec() function
|
|
// but this is really for updating tagdb.
|
|
if ( m_doingConsistencyCheck ) {
|
|
m_newTagBuf.reset();
|
|
m_newTagBufValid = true;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
|
|
|
|
char *isIndexed = getIsIndexed();
|
|
if ( !isIndexed || isIndexed==(char *)-1 ) return (SafeBuf *)isIndexed;
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (SafeBuf *)isRoot;
|
|
|
|
int32_t *siteNumInlinks = getSiteNumInlinks();
|
|
if ( ! siteNumInlinks ) return NULL;
|
|
if ( siteNumInlinks == (int32_t *)-1) return (SafeBuf *)-1;
|
|
|
|
// ok, get the sites of the external outlinks and they must
|
|
// also be NEW outlinks, added to the page since the last time
|
|
// we spidered it...
|
|
Links *links = getLinks ();
|
|
if ( ! links || links == (Links *)-1 ) return (SafeBuf *)links;
|
|
|
|
// our next slated spider priority
|
|
char *spiderLinks = getSpiderLinks();
|
|
if ( ! spiderLinks || spiderLinks == (char *)-1 )
|
|
return (SafeBuf *)spiderLinks;
|
|
|
|
// . get ips of all outlinks.
|
|
// . use m_msgeForIps class just for that
|
|
// . it sucks if the outlink's ip is a dns timeout, then we never
|
|
// end up being able to store it in tagdb, that is why when
|
|
// rebuilding we need to skip adding firstip tags for the outlinks
|
|
int32_t **ipv = NULL;
|
|
TagRec ***grv = NULL;
|
|
bool addLinkTags = true;
|
|
if ( ! *spiderLinks ) addLinkTags = false;
|
|
if ( ! m_useSpiderdb ) addLinkTags = false;
|
|
if ( addLinkTags ) {
|
|
ipv = getOutlinkFirstIpVector ();
|
|
if ( ! ipv || ipv == (void *)-1 ) return (SafeBuf *)ipv;
|
|
// . uses m_msgeForTagRecs for this one
|
|
grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
|
|
}
|
|
|
|
// get root langid of root page
|
|
uint8_t *rl = getRootLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
|
|
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (SafeBuf *)hci;
|
|
|
|
// get the address class
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (Addresses *)-1 ) return (SafeBuf *)aa;
|
|
|
|
// get comma separated list of email address on page
|
|
char *emails = getEmailBuf ( );
|
|
if ( ! emails || emails == (void *)-1 ) return (SafeBuf *)emails;
|
|
|
|
#ifdef _USETURKS_
|
|
//HashTableX *tvt = getTurkVotingTable ();
|
|
//if ( ! tvt || tvt == (void *)-1 ) return (SafeBuf *)tvt;
|
|
#endif
|
|
|
|
//
|
|
// init stuff
|
|
//
|
|
|
|
// . this gets the root doc and and parses titles out of it
|
|
// . sets our m_rootTitleBuf/m_rootTitleBufSize
|
|
char **rtbufp = getRootTitleBuf();
|
|
if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf *)rtbufp;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// overwrite "getting root title buf" status
|
|
setStatus ("computing new tags");
|
|
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: adding tags for mysite=%s",mysite);
|
|
|
|
// int16_tcut
|
|
//TagRec *tr = &m_newTagRec;
|
|
// current time
|
|
int32_t now = getTimeGlobal();
|
|
// actually, use spider download time if we can. that way
|
|
// Test.cpp's injection runs will be more consistent!
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
now = getSpideredTime();//m_spideredTime;
|
|
}
|
|
// store tags into here
|
|
SafeBuf *tbuf = &m_newTagBuf;
|
|
// allocate space to hold the tags we will add
|
|
Tag *tag;
|
|
int32_t need = 512;
|
|
// add in root title buf in case we add it too
|
|
need += m_rootTitleBufSize;
|
|
// reserve it all now
|
|
if ( ! tbuf->reserve(need) ) return NULL;
|
|
|
|
|
|
|
|
//
|
|
// add root langid if we need to
|
|
//
|
|
char *oldrl = gr->getString("rootlang",NULL,×tamp);
|
|
// assume no valid id
|
|
int32_t oldrlid = -99;
|
|
// convert to id
|
|
if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
|
|
|
|
// if not in old tag, or changed from what was in tag, or it has
|
|
// been 10 days or more, then update tagdb with this tag.
|
|
bool addRootLang = false;
|
|
if ( ! oldrl ) addRootLang = true;
|
|
if ( oldrlid != *rl ) addRootLang = true;
|
|
if ( now-timestamp > 10*86400 ) addRootLang = true;
|
|
// injects do not download the root doc for speed reasons, so do not
|
|
// bother for them unless the doc itself is the root.
|
|
if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
|
|
// . get the two letter (usually) language code from the id
|
|
// . i think the two chinese languages are 5 letters
|
|
char *newrl = NULL;
|
|
if ( addRootLang )
|
|
// i've seen this return NULL because *rl is a corrupt 215
|
|
// for some reason
|
|
newrl = getLanguageAbbr( *rl );
|
|
|
|
if ( newrl )
|
|
tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
|
|
|
|
//
|
|
// add hascontactinfo if we need to
|
|
//
|
|
int32_t oldhci = gr->getLong("hascontactinfo",-1,NULL,×tamp);
|
|
if ( oldhci == -1 || oldhci != *hci || now-timestamp > 10 *86400 ) {
|
|
char *val = "0";
|
|
if ( m_hasContactInfo ) val = "1";
|
|
tbuf->addTag3 (mysite,"hascontactinfo",now,"xmldoc",*ip,val,
|
|
rdbId);
|
|
}
|
|
//
|
|
// add "site" tag
|
|
//
|
|
char *oldsite = gr->getString("site",NULL);
|
|
if ( ! oldsite || strcmp(oldsite,mysite) || now-timestamp > 10*86400)
|
|
tbuf->addTag3(mysite,"site",now,"xmldoc",*ip,mysite,rdbId);
|
|
|
|
//
|
|
// add firstip if not there at all
|
|
//
|
|
char *oldfip = gr->getString("firstip",NULL);
|
|
// convert it
|
|
int32_t ip3 = 0;
|
|
if ( oldfip ) ip3 = atoip(oldfip);
|
|
// if not there or if bogus, add it!! should override bogus firstips
|
|
if ( ! ip3 || ip3 == -1 ) {
|
|
char *ipstr = iptoa(m_ip);
|
|
//if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0; }
|
|
//int32_t iplen = gbstrlen(ipstr);
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
tbuf->addTag3(mysite,"firstip",now,"xmldoc",*ip,ipstr,
|
|
rdbId);
|
|
}
|
|
|
|
//if ( strncmp(m_firstUrl.m_url,"http://delicious.com/",21)==0 )
|
|
// log("boo");
|
|
|
|
// sitenuminlinks
|
|
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,×tamp);
|
|
if ( old1 == -1 || old1 != m_siteNumInlinks ||
|
|
m_updatingSiteLinkInfoTags ) {
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: adding tag site=%s sitenuminlinks=%" INT32 "",
|
|
mysite,m_siteNumInlinks);
|
|
if ( ! tbuf->addTag2(mysite,"sitenuminlinks",now,"xmldoc",
|
|
*ip,m_siteNumInlinks,rdbId) )
|
|
return NULL;
|
|
}
|
|
|
|
//int32_t old2, old3, old4;
|
|
|
|
// if running for diffbot crawlbot then isCustomCrawl is true
|
|
// so do not update the siteinlink info already in tagdb since i
|
|
// imported it from my main collection. we do not want to overwrite it.
|
|
// NO, because for single site crawls we bottleneck on msg25
|
|
// when there are millions of urls. we only skip this
|
|
// for the global-index and if already in tagdb!
|
|
// No, let's just not invalidate the sitenuminlinks* tags
|
|
// in XmlDoc::getSiteNumInlinks()
|
|
//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks;
|
|
|
|
// sitenuminlinksfresh
|
|
// old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,×tamp);
|
|
// if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
|
|
// m_updatingSiteLinkInfoTags )
|
|
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
|
|
// now,"xmldoc",
|
|
// *ip,m_siteNumInlinksUniqueIp,rdbId))
|
|
// return NULL;
|
|
// // sitepop
|
|
// old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
|
|
// &timestamp);
|
|
// if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
|
|
// m_updatingSiteLinkInfoTags )
|
|
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
|
|
// now,"xmldoc",
|
|
// *ip,m_siteNumInlinksUniqueCBlock,rdbId))
|
|
// return NULL;
|
|
// // total site inlinks
|
|
	// old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
	// 		      &timestamp);
|
|
// if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
|
|
// m_updatingSiteLinkInfoTags )
|
|
// if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
|
|
// now,"xmldoc",
|
|
// *ip,m_siteNumInlinksTotal,rdbId))
|
|
// return NULL;
|
|
|
|
// skipSiteInlinks:
|
|
|
|
// get root title buf from old tag
|
|
char *data = NULL;
|
|
int32_t dsize = 0;
|
|
Tag *rt = gr->getTag("roottitles");
|
|
if ( rt ) {
|
|
data = rt->getTagData();
|
|
dsize = rt->getTagDataSize();
|
|
}
|
|
|
|
bool addRootTitle = false;
|
|
// store the root title buf if we need to. if we had no tag yet...
|
|
if ( ! rt )
|
|
addRootTitle = true;
|
|
// or if differs in size
|
|
else if ( dsize != m_rootTitleBufSize )
|
|
addRootTitle = true;
|
|
// or if differs in content
|
|
else if ( memcmp(data,m_rootTitleBuf,m_rootTitleBufSize))
|
|
addRootTitle =true;
|
|
// or if it is 10 days old or more
|
|
if ( now-timestamp > 10*86400 ) addRootTitle = true;
|
|
// but not if injected
|
|
if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false;
|
|
// add it then
|
|
if ( addRootTitle &&
|
|
! tbuf->addTag(mysite,"roottitles",now,"xmldoc",
|
|
*ip,m_rootTitleBuf,m_rootTitleBufSize,
|
|
rdbId,true) )
|
|
return NULL;
|
|
|
|
|
|
//
|
|
// add the VENUEADDRESS tags
|
|
//
|
|
|
|
// init the dedup table so we do not add the same address many times
|
|
char dtbuf[1000];
|
|
HashTableX dt;
|
|
dt.set(8,0,32,dtbuf,1000,false,m_niceness,"xmldt");
|
|
// reset counts
|
|
int32_t numContactAddressTags = 0;
|
|
int32_t numContactEmailTags = 0;
|
|
int32_t tagType2 = getTagTypeFromStr ( "contactaddress" );
|
|
int32_t tagType3 = getTagTypeFromStr ( "contactemails" );
|
|
// before we add the sitevenue to the tagrec let's make sure it is
|
|
// not a dedup.. i.e. that we do not already have this address
|
|
// in there.
|
|
int32_t tagType = getTagTypeFromStr ( "venueaddress" );
|
|
// start at the first tag
|
|
tag = gr->getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
|
|
// count current contact addresses we have
|
|
if ( tag->m_type == tagType2 ) numContactAddressTags++;
|
|
if ( tag->m_type == tagType3 ) numContactEmailTags++;
|
|
// skip if not a venueaddress tag
|
|
if ( tag->m_type != tagType ) continue;
|
|
// point to the serialized address
|
|
char *data = tag->getTagData();
|
|
// get that address hash i guess
|
|
uint64_t ah = getHashFromAddr ( data );
|
|
// add to dedup table - return NULL with g_errno set on error
|
|
if ( ! dt.addKey ( &ah ) ) return NULL;
|
|
}
|
|
int32_t na = aa->getNumAddresses();
|
|
// add up to 10 for now
|
|
for ( int32_t i = 0 ; i < na ; i++ ) {
|
|
// get it
|
|
Address *a = (Address *)aa->m_am.getPtr(i);
|
|
// check if venue
|
|
if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue;
|
|
// must have street on the page, not pointing into a tagrec
|
|
// from tagdb... otherwise we keep re-adding
|
|
if ( a->m_street->m_a < 0 ) continue;
|
|
// dedup! dedup against
|
|
// addresses in tagdb for venueaddress tag. can we use
|
|
// the dc[] array from Address.cpp... we need another
|
|
// set of bit flags for address class:
|
|
if ( dt.isInTable ( &a->m_hash ) ) continue;
|
|
// sanity
|
|
if ( a->m_hash == 0 ) { char *xx=NULL;*xx=0; }
|
|
// . serialize it
|
|
// . TODO: get rid of Address::addToTagRec() functions
|
|
char abuf[5000];
|
|
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
|
|
// store in safebuf of tags
|
|
if ( ! tbuf->addTag3 (mysite,"venueaddress",now,"xmldoc",
|
|
*ip,abuf,rdbId) ) return NULL;
|
|
// only add once
|
|
if ( ! dt.addKey (&a->m_hash) ) return NULL;
|
|
}
|
|
|
|
	//
	//
	// contact info stuff
	//
	//

	// ensure m_numContactAddresses etc. are valid
	Address **ca = getContactAddresses();
	// blocked?
	if ( ! ca || ca == (void *)-1 ) return (SafeBuf *)ca;
|
|
|
|
// do not do this for root if multiple addresses. this
|
|
// fixes http://obits.abqjournal.com/
|
|
if ( *isRoot && aa->m_uniqueStreetHashes > 1 ) na = 0;
|
|
|
|
// do not store more than 2 contact addresses, or 2 contact emails
|
|
// to avoid tagdb bloat. and also because we do not need that many.
|
|
|
|
// . store contact address if we had one
|
|
// . this is a buffer of Address ptrs
|
|
for ( int32_t i = 0 ; i < m_numContactAddresses ; i++ ) {
|
|
// stop on breach
|
|
if ( numContactAddressTags >= 2 ) break;
|
|
// inc it
|
|
numContactAddressTags++;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *a = ca[i];
|
|
// . serialize it
|
|
// . TODO: get rid of Address::addToTagRec() functions
|
|
char abuf[5000];
|
|
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
|
|
// store in safebuf of tags
|
|
if ( ! tbuf->addTag3 (mysite,"contactaddress",now,"xmldoc",
|
|
*ip,abuf,rdbId) ) return NULL;
|
|
}
|
|
|
|
// . add email addresses and submission forms to tag
|
|
// . this does not block, so make sure only called once!
|
|
// . contact emails. comma separated list
|
|
if ( emails && numContactEmailTags <= 1 ) {
|
|
numContactEmailTags++;
|
|
if ( ! tbuf->addTag3 (mysite,"contactemails",now,"xmldoc",
|
|
*ip,emails,rdbId) ) return NULL;
|
|
}
|
|
|
|
|
|
//
|
|
//
|
|
// NOW add tags for our outlinks
|
|
//
|
|
//
|
|
|
|
bool oldHighQualityRoot = true;
|
|
// if we are new, do not add anything, because we only add a tagdb
|
|
// rec entry for "new" outlinks that were added to the page since
|
|
// the last time we spidered it
|
|
if ( ! *isIndexed ) oldHighQualityRoot = false;
|
|
// special tags for google search results pages for scraping
|
|
char inGoogle = false;
|
|
if ( strstr(mysite,"google.com") ) inGoogle = true;
|
|
// no updating if we are not root
|
|
if ( ! inGoogle && ! *isRoot ) oldHighQualityRoot = false;
|
|
// must be high quality, too
|
|
if ( ! inGoogle && *siteNumInlinks < 500 ) oldHighQualityRoot = false;
|
|
// . if we are a google url then add tags for each outlink!
|
|
// . more google special tags to replace Scraper.cpp
|
|
char *fu = m_firstUrl.getUrl();
|
|
//char *name = NULL;
|
|
bool inGoogleBlogs = false;
|
|
bool inGoogleNews = false;
|
|
if ( ! strncmp ( fu , "http://www.google.com/blogsearch?", 33 ) )
|
|
inGoogleBlogs = true;
|
|
if ( ! strncmp ( fu , "http://blogsearch.google.com/blogsearch?", 40 ))
|
|
inGoogleBlogs = true;
|
|
if ( ! strncmp ( fu , "http://news.google.com/", 23 ))
|
|
inGoogleNews = true;
|
|
// only do once per site
|
|
char buf[1000];
|
|
HashTableX ht; ht.set (4,0,-1 , buf , 1000 ,false,m_niceness,"sg-tab");
|
|
// get site of outlink
|
|
SiteGetter siteGetter;
|
|
// . must be from an EXTERNAL DOMAIN and must be new
|
|
// . we should already have its tag rec, if any, since we have msge
|
|
int32_t n = links->getNumLinks();
|
|
// not if not spidering links
|
|
if ( ! addLinkTags ) n = 0;
|
|
// get the flags
|
|
linkflags_t *flags = links->m_linkFlags;
|
|
// scan all outlinks we have on this page
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
// get its tag rec
|
|
TagRec *gr = (*grv)[i];
|
|
|
|
// does this hostname have a "firstIp" tag?
|
|
char *ips = gr->getString("firstip",NULL);
|
|
|
|
bool skip = false;
|
|
// skip if we are not "old" high quality root
|
|
if ( ! oldHighQualityRoot ) skip = true;
|
|
// . skip if not external domain
|
|
// . we added this above, so just "continue"
|
|
if ( flags[i] & LF_SAMEDOM ) continue;//skip = true;
|
|
// skip links in the old title rec
|
|
if ( flags[i] & LF_OLDLINK ) skip = true;
|
|
// skip if determined to be link spam! should help us
|
|
// with the text ads we hate so much
|
|
if ( links->m_spamNotes[i] ) skip = true;
|
|
|
|
// if we should skip, and they have firstip already...
|
|
if ( skip && ips ) continue;
|
|
|
|
// get the normalized url
|
|
char *url = links->getLinkPtr(i);
|
|
// get the site. this will not block or have an error.
|
|
siteGetter.getSite(url,gr,timestamp,cr->m_collnum,m_niceness);
|
|
// these are now valid and should reference into
|
|
// Links::m_buf[]
|
|
char *site = siteGetter.m_site;
|
|
int32_t siteLen = siteGetter.m_siteLen;
|
|
|
|
int32_t linkIp = (*ipv)[i];
|
|
|
|
// get site hash
|
|
uint32_t sh = hash32 ( site , siteLen );
|
|
// ensure site is unique
|
|
if ( ht.getSlot ( &sh ) >= 0 ) continue;
|
|
// add it. returns false and sets g_errno on error
|
|
if ( ! ht.addKey ( &sh ) ) return NULL;
|
|
|
|
// . need to add firstip tag for this link's subdomain?
|
|
// . this was in Msge1.cpp but now we do it here
|
|
if ( ! ips && linkIp && linkIp != -1 ) {
|
|
// make it
|
|
char *ips = iptoa(linkIp);
|
|
if (!tbuf->addTag3(site,"firstip",now,"xmldoc",*ip,ips,
|
|
rdbId))
|
|
return NULL;
|
|
}
|
|
|
|
if ( skip ) continue;
|
|
|
|
// if outlink is a .gov or .edu site, do not bother, because
|
|
// getIsSpam() always returns false for those
|
|
// TODO: verify this
|
|
//if ( flags[i] & LF_EDUTLD ) continue;
|
|
//if ( flags[i] & LF_GOVTLD ) continue;
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
//int32_t timestamp = m_spideredTime;
|
|
|
|
// how much avail for adding tags?
|
|
int32_t avail = tbuf->getAvail();
|
|
// reserve space
|
|
int32_t need = 512;
|
|
// make sure enough
|
|
if ( need > avail && ! tbuf->reserve ( need ) ) return NULL;
|
|
|
|
// add tag for this outlink
|
|
if ( inGoogle ) {// && ! gr->getTag("ingoogle") ) {
|
|
if ( ! tbuf->addTag(site,"ingoogle",now,"xmldoc",
|
|
*ip,"1",2,rdbId,true) )
|
|
return NULL;
|
|
}
|
|
if ( inGoogleBlogs && //! gr->getTag("ingoogleblogs") &&
|
|
!tbuf->addTag(site,"ingoogleblogs",now,"xmldoc",*ip,"1",2,
|
|
rdbId,true))
|
|
return NULL;
|
|
if ( inGoogleNews && //! gr->getTag("ingooglenews") &&
|
|
!tbuf->addTag(site,"ingooglenews",now,"xmldoc",*ip,"1",2,
|
|
rdbId,true))
|
|
return NULL;
|
|
// link is linked to by a high quality site! 500+ inlinks.
|
|
if ( gr->getNumTagTypes("authorityinlink") < 5 &&
|
|
! tbuf->addTag(site,"authorityinlink",now,"xmldoc",
|
|
*ip,"1",2,rdbId,true) )
|
|
return NULL;
|
|
}
|
|
|
|
m_newTagBufValid = true;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
|
|
//
//
// BEGIN OLD SPAM.CPP class
//
//

#define WTMPBUFSIZE (MAX_WORDS *21*3)
|
|
|
|
// . RULE #28, repetitive word/phrase spam detector
// . sets the "spam" member of each word from 0 (no spam) to 100 (100% spam)
// . "bits" describe each word in phrasing terminology
// . if more than maxPercent of the words are spammed to some degree then we
//   consider all of the words to be spammed, and give each word the minimum
//   score possible when indexing the document.
// . returns NULL and sets g_errno on error
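// A rough worked example of the threshold logic in getWordSpamVec() below
// (illustrative only; the numbers are made up): suppose the page yields
// 100 distinct candidate words (stop words and numbers excluded) and
// setSpam() flags 40 of them as spammed. Then percent = 40*100/100 = 40,
// which exceeds maxPercent (currently forced to 25), so m_totallySpammed
// gets set and every indexable word's spam level is raised to 99 before
// the percentages are converted to MAXWORDSPAMRANK-based ranks.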
|
|
char *XmlDoc::getWordSpamVec ( ) {
|
|
|
|
if ( m_wordSpamBufValid ) {
|
|
char *wbuf = m_wordSpamBuf.getBufStart();
|
|
if ( ! wbuf ) return (char *)0x01;
|
|
return wbuf;
|
|
}
|
|
|
|
setStatus("getting word spam vec");
|
|
|
|
// assume not the repeat spammer
|
|
m_isRepeatSpammer = false;
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (char *)words;
|
|
|
|
m_wordSpamBuf.purge();
|
|
|
|
int32_t nw = words->getNumWords();
|
|
if ( nw <= 0 ) {
|
|
m_wordSpamBufValid = true;
|
|
return (char *)0x01;
|
|
}
|
|
|
|
Phrases *phrases = getPhrases ();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (char *)phrases;
|
|
Bits *bits = getBits();
|
|
if ( ! bits ) return (char *)NULL;
|
|
|
|
m_wordSpamBufValid = true;
|
|
|
|
//if ( m_isLinkText ) return true;
|
|
//if ( m_isCountTable ) return true;
|
|
|
|
	// shortcuts
	//Words *words = m_words;
	//Bits *bits = m_bits;

	// if 20 words totally spammed, call it all spam?
	m_numRepeatSpam = 20;

	// shortcut
	int32_t sni = m_siteNumInlinks;
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set "m_maxPercent"
|
|
int32_t maxPercent = 6;
|
|
if ( sni > 10 ) maxPercent = 8;
|
|
if ( sni > 30 ) maxPercent = 10;
|
|
if ( sni > 100 ) maxPercent = 20;
|
|
if ( sni > 500 ) maxPercent = 30;
|
|
// fix this a bit so we're not always totally spammed
|
|
maxPercent = 25;
|
|
|
|
// assume not totally spammed
|
|
m_totallySpammed = false;
|
|
// get # of words we have to set spam for
|
|
int32_t numWords = words->getNumWords();
|
|
|
|
// set up the size of the hash table (number of buckets)
|
|
int32_t size = numWords * 3;
|
|
|
|
// . add a tmp buf as a scratch pad -- will be freed right after
|
|
// . allocate this second to avoid mem fragmentation more
|
|
// . * 2 for double the buckets
|
|
char tmpBuf [ WTMPBUFSIZE ];
|
|
char *tmp = tmpBuf;
|
|
int32_t need = (numWords * 21) * 3 + numWords;
|
|
if ( need > WTMPBUFSIZE ) {
|
|
tmp = (char *) mmalloc ( need , "Spam" );
|
|
if ( ! tmp ) {
|
|
log("build: Failed to allocate %" INT32 " more "
|
|
"bytes for spam detection: %s.",
|
|
need,mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
// set up ptrs
|
|
char *p = tmp;
|
|
// first this
|
|
unsigned char *spam = (unsigned char *)p; p += numWords ;
|
|
// . this allows us to make linked lists of indices of words
|
|
// . i.e. next[13] = 23--> word #23 FOLLOWS word #13 in the linked list
|
|
int32_t *next = (int32_t *)p; p += size * 4;
|
|
// hash of this word's stem (or word itself if useStem if false)
|
|
int64_t *bucketHash = (int64_t *)p; p += size * 8;
|
|
// that word's position in document
|
|
int32_t *bucketWordPos = (int32_t *)p; p += size * 4;
|
|
// profile of a word
|
|
int32_t *profile = (int32_t *)p; p += size * 4;
|
|
// is it a common word?
|
|
char *commonWords = (char *)p; p += size * 1;
|
|
|
|
// sanity check
|
|
if ( p - tmp > need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// clear all our spam percentages for these words
|
|
memset ( spam , 0 , numWords );
|
|
|
|
int32_t np;
|
|
// clear the hash table
|
|
int32_t i;
|
|
for ( i = 0 ; i < size ; i++ ) {
|
|
bucketHash [i] = 0;
|
|
bucketWordPos[i] = -1;
|
|
commonWords [i] = 0;
|
|
}
|
|
|
|
// count position since Words class can now have tags in it
|
|
//
|
|
//int32_t pos = 0;
|
|
//bool usePos = false;
|
|
//if ( words->m_tagIds ) usePos = true;
|
|
|
|
int64_t *wids = words->getWordIds();
|
|
|
|
// . loop through each word
|
|
// . hash their stems and place in linked list
|
|
// . if no stemming then don't do stemming
|
|
for ( i = 0 ; i < numWords ; i++ ) {
|
|
// . skip punctuation
|
|
// . this includes tags now , too i guess
|
|
//if ( words->isPunct(i) ) continue;
|
|
if ( wids[i] == 0 ) continue;
|
|
// skip if will not be indexed cuz score is too low
|
|
//if ( wscores && wscores[i] <= 0 ) continue;
|
|
QUICKPOLL(m_niceness);
|
|
		// TODO: get phrase stem if stemming is on
		// store the phrase stem for this word into the buffer
		// blen = words->getPhraseStem(i,buf,100);
		// if (blen<=0) continue;
|
|
// get the hash of the ith word
|
|
int64_t h = words->getWordId(i);
|
|
// use secondary wordId if available
|
|
//if ( words->getStripWordId(i) )
|
|
// h = words->getStripWordId(i);
|
|
// "j" is the bucket index
|
|
int32_t j = (uint64_t)h % size;
|
|
// make sure j points to the right bucket
|
|
while (bucketHash[j]) {
|
|
if ( h == bucketHash[j] ) break;
|
|
if (++j == size) j = 0;
|
|
}
|
|
// if this bucket is occupied by a word then replace it but
|
|
// make sure it adds onto the "linked list"
|
|
if (bucketHash[j]) {
|
|
// if Words class contain tags as words, do this
|
|
//if ( usePos ) {
|
|
// next [pos] = bucketWordPos[j];
|
|
// bucketWordPos[ j] = pos++;
|
|
//}
|
|
//else {
|
|
// add onto linked list for the ith word
|
|
next[i] = bucketWordPos[j];
|
|
// replace bucket with index to this word
|
|
bucketWordPos[j] = i;
|
|
//}
|
|
}
|
|
// otherwise, we have a new occurrence of this word
|
|
else {
|
|
bucketHash [j] = h;
|
|
// if Words class contain tags as words, do this
|
|
//if ( usePos ) {
|
|
// bucketWordPos[ j] = pos++;
|
|
// next [pos] = -1;
|
|
//}
|
|
//else {
|
|
// store our position # (i) in bucket
|
|
bucketWordPos[j] = i;
|
|
// no next occurrence of the ith word yet
|
|
next[i] = -1;
|
|
//}
|
|
}
|
|
// if stop word or number then mark it
|
|
if ( bits->isStopWord(i) ) commonWords[j] = 1;
|
|
if ( words->isNum ( i ) ) commonWords[j] = 1;
|
|
}
|
|
// count distinct candidates that had spam and did not have spam
|
|
int32_t spamWords = 0;
|
|
int32_t goodWords = 0;
|
|
// . now cruise down the hash table looking for filled buckets
|
|
// . grab the linked list of indices and make a "profile"
|
|
for ( i = 0 ; i < size ; i++ ) {
|
|
// skip empty buckets
|
|
if (bucketHash[i] == 0) continue;
|
|
np=0;
|
|
// word #j is in bucket #i
|
|
int32_t j = bucketWordPos[i];
|
|
// . cruise down the linked list for this word
|
|
while ( j!=-1) {
|
|
// store position of occurrence of this word in profile
|
|
profile [ np++ ] = j;
|
|
// get the position of next occurrence of this word
|
|
j = next[ j ];
|
|
}
|
|
// if 2 or less occurrences of this word, don't check for spam
|
|
if ( np < 3 ) { goodWords++; continue; }
|
|
|
|
//
|
|
// set m_isRepeatSpammer
|
|
//
|
|
// look for a word repeated in phrases, in a big list,
|
|
// where each phrase is different
|
|
//
|
|
int32_t max = 0;
|
|
int32_t count = 0;
|
|
int32_t knp = np;
|
|
// must be 3+ letters, not a stop word, not a number
|
|
if ( words->m_wordLens[profile[0]] <= 2 || commonWords[i] )
|
|
knp = 0;
|
|
// scan to see if they are a tight list
|
|
for ( int32_t k = 1 ; k < knp ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// are they close together? if not, bail
|
|
if ( profile[k-1] - profile[k] >= 25 ) {
|
|
count = 0;
|
|
continue;
|
|
}
|
|
// otherwise inc it
|
|
count++;
|
|
// must have another word in between or tag
|
|
int32_t a = profile[k];
|
|
int32_t b = profile[k-1];
|
|
bool gotSep = false;
|
|
bool inLink = false;
|
|
for ( int32_t j = a+1 ; j <b ; j++ ) {
|
|
// if in link do not count, chinese spammer
|
|
// does not have his crap in links
|
|
if ( words->m_words[j][0] == '<' &&
|
|
words->m_wordLens[j]>=3 ) {
|
|
// get the next char after the <
|
|
char nc;
|
|
nc=to_lower_a(words->m_words[j][1]);
|
|
// now check it for anchor tag
|
|
if ( nc == 'a' ) {
|
|
inLink = true; break; }
|
|
}
|
|
if ( words->m_words[j][0] == '<' )
|
|
gotSep = true;
|
|
if ( is_alnum_a(words->m_words[j][0]) )
|
|
gotSep = true;
|
|
}
|
|
// . the chinese spammer always has a separator,
|
|
// usually another tag
|
|
// . and fix "BOW BOW BOW..." which has no separators
|
|
if ( ! gotSep ) count--;
|
|
else if ( inLink ) count--;
|
|
// get the max
|
|
if ( count > max ) max = count;
|
|
}
|
|
// a count of 50 such monsters indicates the chinese spammer
|
|
if ( max >= 50 )
|
|
m_isRepeatSpammer = true;
|
|
//
|
|
// end m_isRepeatSpammer detection
|
|
//
|
|
|
|
		// . determine the probability this word was spammed by
		//   looking at the distribution of its positions in the doc
		// . sets "spam" member of each word in this profile
		// . don't check if word occurred 2 or fewer times
		// . TODO: what about TORA! TORA! TORA!
		// . returns true if 1+ occurrences were considered spam
|
|
QUICKPOLL(m_niceness);
|
|
bool isSpam = setSpam ( profile , np , numWords , spam );
|
|
// don't count stop words or numbers towards this threshold
|
|
if ( commonWords[i] ) continue;
|
|
// tally them up
|
|
if ( isSpam ) spamWords++;
|
|
else goodWords++;
|
|
}
|
|
	// what percent of distinct candidate words were spammed?
	int32_t totalWords = spamWords + goodWords;
	// if no or very few words, return true
	int32_t percent;
	if ( totalWords <= 10 ) goto done;
	percent = ( spamWords * 100 ) / totalWords;
	// if more than maxPercent of the words were spammed, punish
	// everybody now to 100% spam
	// if we had < 100 candidates and < 20% spam, don't bother
	//if ( percent < 5 ) goto done;
	if ( percent <= maxPercent ) goto done;
|
|
// set flag so linkspam.cpp can see if all is spam and will not allow
|
|
// this page to vote
|
|
m_totallySpammed = true;
|
|
// now only set to 99 so each singleton usually gets hashed
|
|
for ( i = 0 ; i < numWords ; i++ )
|
|
if ( words->getWordId(i) && spam[i] < 99 )
|
|
spam[i] = 99;
|
|
done:
|
|
|
|
// update the weights for the words
|
|
//for ( i = 0 ; i < numWords ; i++ ) {
|
|
// m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
|
|
//}
|
|
|
|
// TODO: use the min word spam algo as in Phrases.cpp for this!
|
|
//for ( i = 0 ; i < numWords ; i++ ) {
|
|
// m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100;
|
|
//}
|
|
|
|
// convert from percent spammed into rank.. from 0 to 10 i guess
|
|
for ( i = 0 ; i < numWords ; i++ )
|
|
spam[i] = (MAXWORDSPAMRANK * (100 - spam[i])) / 100;
|
|
|
|
// copy into our buffer
|
|
if ( ! m_wordSpamBuf.safeMemcpy ( (char *)spam , numWords ) )
|
|
return NULL;
|
|
|
|
// free our temporary table stuff
|
|
if ( tmp != tmpBuf ) mfree ( tmp , need , "Spam" );
|
|
|
|
return m_wordSpamBuf.getBufStart();
|
|
}
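// Illustrative note on the rank conversion above (assuming MAXWORDSPAMRANK
// is 10, which is defined elsewhere): a word that setSpam() marked 75%
// spammed is stored as (10 * (100 - 75)) / 100 = 2, and a clean word gets
// the full 10. The resulting one byte per word lives in m_wordSpamBuf for
// the indexing code to consume.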
|
|
|
|
|
|
// . a "profile" is an array of all the positions of a word in the document
|
|
// . a "position" is just the word #, like first word, word #8, etc...
|
|
// . we map "each" subProfile to a probability of spam (from 0 to 100)
|
|
// . if the profile is really big we get really slow (O(n^2)) iterating through
|
|
// many subProfiles
|
|
// . so after the first 25 words, it's automatically considered spam
|
|
// . return true if one word was spammed w/ probability > 20%
|
|
bool XmlDoc::setSpam ( int32_t *profile, int32_t plen , int32_t numWords ,
|
|
unsigned char *spam ) {
|
|
// don't bother detecting spam if 2 or less occurrences of the word
|
|
if ( plen < 3 ) return false;
|
|
int32_t i;
|
|
// if we have more than 10 words and this word is 20% or more of
|
|
// them then all but the first occurrence is spammed
|
|
//log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam);
|
|
if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) {
|
|
for (i=1; i<plen; i++) spam[profile[i]] = 100;
|
|
return true ;
|
|
}
|
|
// . over 50 repeated words is ludicrous
|
|
// . set all past 50 to spam and continue detecting
|
|
// . no, our doc length based weight takes care of that kind of thing
|
|
//if (plen > 50 && m_version < 93 ) {
|
|
// // TODO: remember, profile[i] is in reverse order!! we should
|
|
// // really do i=0;i<plen-50, but this is obsolete anyway...
|
|
// for (i=50; i<plen;i++) m_spam[profile[i]] = 100;
|
|
// plen = 50;
|
|
//}
|
|
|
|
|
|
// we have to do this otherwise it takes FOREVER to do for plens in
|
|
// the thousands, like i saw a plen of 8338!
|
|
if ( plen > 50 ) { // && m_version >= 93 ) {
|
|
// . set all but the last 50 to a spam of 100%
|
|
// . the last 50 actually occur as the first 50 in the doc
|
|
for (i=0; i<plen-50;i++) spam[profile[i]] = 100;
|
|
// we now have only 50 occurrences
|
|
plen = 50;
|
|
// we want to skip the first plen-50 because they actually
|
|
// occur at the END of the document
|
|
profile += plen - 50;
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
// higher quality docs allow more "freebies", but only starting with
|
|
// version 93... (see Titledb.h)
|
|
// profile[i] is actually in reverse order so we subtract off from wlen
|
|
//int32_t off ;
|
|
//if ( m_version >= 93 ) {
|
|
// off = (m_docQuality - 30) / 3;
|
|
// if ( off < 0 ) off = 0;
|
|
//}
|
|
// just use 40% "quality"
|
|
int32_t off = 3;
|
|
|
|
// . now the nitty-gritty part
|
|
// . compute all sub sequences of the profile
|
|
// . similar to a compression scheme (wavelets?)
|
|
// . TODO: word positions should count by two's since punctuation is
|
|
// not included so start step @ 2 instead of 1
|
|
// . if "step" is 1 we look at every word position in the profile
|
|
// . if "step" is 2 we look at every other word position
|
|
// . if "step" is 3 we look at every 3rd word position, etc...
|
|
int32_t maxStep = plen / 4;
|
|
if ( maxStep > 4 ) maxStep = 4;
|
|
// . loop through all possible tuples
|
|
int32_t window, wlen, step, prob;
|
|
for ( step = 1 ; step <= maxStep ; step++ ) {
|
|
for ( window = 0 ; window + 3 < plen ; window+=1) {
|
|
for (wlen = 3; window+wlen <= plen ; wlen+=1) {
|
|
// continue if step isn't aligned with window
|
|
// length
|
|
if (wlen % step != 0) continue;
|
|
// . get probability that this tuple is spam
|
|
// . returns 0 to 100
|
|
prob = getProbSpam ( profile + window ,
|
|
wlen , step);
|
|
// printf("(%i,%i,%i)=%i\n",step,window,
|
|
// wlen,prob);
|
|
// . if the probability is too low continue
|
|
// . was == 100
|
|
if ( prob <= 20 ) continue;
|
|
// set the spammed words spam to "prob"
|
|
// only if it's bigger than their current spam
|
|
for (i=window; i<window+wlen;i++) {
|
|
// first occurrences can have immunity
|
|
// due to doc quality being high
|
|
if ( i >= plen - off ) break;
|
|
if (spam[profile[i]] < prob)
|
|
spam[profile[i]] = prob;
|
|
}
|
|
QUICKPOLL(m_niceness);
|
|
}
|
|
|
|
}
|
|
}
|
|
// was this word spammed at all?
|
|
bool hadSpam = false;
|
|
for (i=0;i<plen;i++) if ( spam[profile[i]] > 20 ) hadSpam = true;
|
|
// make sure at least one word survives
|
|
for (i=0;i<plen;i++) if ( spam[profile[i]] == 0) return hadSpam;
|
|
// clear the spam level on this guy
|
|
spam[profile[0]] = 0;
|
|
// return true if we had spam, false if not
|
|
return hadSpam;
|
|
}
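// Small illustrative trace of setSpam() (hypothetical numbers): for a word
// with a profile length of plen = 12, the outer loops try step = 1..3
// (maxStep is plen/4 capped at 4), every window start, and every window
// length >= 3 that is a multiple of "step". Each window is handed to
// getProbSpam(), and only probabilities above 20 are written into spam[],
// except for the last "off" (3) profile entries, which are the first
// occurrences in the document and keep their immunity. If every occurrence
// ended up spammed, profile[0] is cleared at the end so the word still
// gets indexed at least once.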
|
|
|
|
bool getWordPosVec ( Words *words ,
|
|
Sections *sections,
|
|
//int32_t wordStart,
|
|
//int32_t wordEnd,
|
|
int32_t startDist, // m_dist
|
|
char *fragVec,
|
|
int32_t niceness ,
|
|
SafeBuf *wpos ) {
|
|
|
|
int32_t dist = startDist; // 0;
|
|
Section *lastsx = NULL;
|
|
int32_t tagDist = 0;
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
nodeid_t *tids = words->m_tagIds;
|
|
int64_t *wids = words->m_wordIds;
|
|
int32_t *wlens = words->getWordLens();
|
|
char **wptrs = words->getWords();
|
|
int32_t nw = words->getNumWords();
|
|
|
|
if ( ! wpos->reserve ( nw * 4 ) ) return false;
|
|
int32_t *wposvec = (int32_t *)wpos->getBufStart();
|
|
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
|
|
// save it
|
|
wposvec[i] = dist;
|
|
|
|
// tags affect the distance/wordposition cursor
|
|
if ( tids && tids[i] ) {
|
|
// tag distance affects
|
|
nodeid_t tid = tids[i] & BACKBITCOMP;
|
|
if ( isBreakingTagId ( tid ) ) tagDist += SENT_UNITS;
|
|
dist++;
|
|
continue;
|
|
}
|
|
// . and so do sequences of punct
|
|
// . must duplicate this code in Query.cpp for setting
|
|
// QueryWord::m_posNum
|
|
if ( ! wids[i] ) {
|
|
// simple space or sequence of just white space
|
|
if ( words->isSpaces(i) )
|
|
dist++;
|
|
// 'cd-rom'
|
|
else if ( wptrs[i][0]=='-' && wlens[i]==1 )
|
|
dist++;
|
|
// 'mr. x'
|
|
else if ( wptrs[i][0]=='.' && words->isSpaces2(i,1))
|
|
dist++;
|
|
// animal (dog)
|
|
else
|
|
dist += 2;
|
|
continue;
|
|
}
|
|
// ignore if in repeated fragment
|
|
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) {
|
|
dist++; continue; }
|
|
|
|
Section *sx = NULL;
|
|
if ( sp ) {
|
|
sx = sp[i];
|
|
// ignore if in style tag, etc. and do not
|
|
// increment the distance
|
|
if ( sx->m_flags & NOINDEXFLAGS )
|
|
continue;
|
|
}
|
|
|
|
// different sentence?
|
|
if ( sx &&
|
|
( ! lastsx ||
|
|
sx->m_sentenceSection != lastsx->m_sentenceSection ) ) {
|
|
// separate different sentences with 30 units
|
|
dist += SENT_UNITS; // 30;
|
|
// limit this!
|
|
if ( tagDist > 120 ) tagDist = 120;
|
|
// and add in tag distances as well here, otherwise
|
|
// we do not want "<br>" to really increase the
|
|
// distance if the separated words are in the same
|
|
// sentence!
|
|
dist += tagDist;
|
|
// new last then
|
|
lastsx = sx;
|
|
// store the vector AGAIN
|
|
wposvec[i] = dist;
|
|
}
|
|
|
|
tagDist = 0;
|
|
|
|
dist++;
|
|
}
|
|
return true;
|
|
}
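// Rough illustration of the position counter above (approximate numbers):
// alnum words and simple whitespace advance "dist" by 1, other punctuation
// runs by 2, and a breaking tag like <p> adds SENT_UNITS (30) to tagDist,
// which is only folded in (capped at 120) at the next sentence boundary,
// where another SENT_UNITS is added as well. So two words separated by a
// sentence break plus a <p> end up roughly 60+ position units apart rather
// than 2, which presumably keeps proximity scoring from treating text on
// either side of the break as adjacent.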
|
|
|
|
bool getDensityRanks ( int64_t *wids ,
|
|
int32_t nw ,
|
|
int32_t hashGroup ,
|
|
SafeBuf *densBuf ,
|
|
Sections *sections ,
|
|
int32_t niceness ) {
|
|
|
|
//int32_t nw = wordEnd - wordStart;
|
|
|
|
// make the vector
|
|
if ( ! densBuf->reserve ( nw ) ) return false;
|
|
|
|
// convenience
|
|
char *densVec = densBuf->getBufStart();
|
|
|
|
// clear i guess
|
|
memset ( densVec , 0 , nw );
|
|
|
|
if ( hashGroup != HASHGROUP_BODY &&
|
|
hashGroup != HASHGROUP_HEADING )
|
|
sections = NULL;
|
|
|
|
// scan the sentences if we got those
|
|
Section *ss = NULL;
|
|
if ( sections ) ss = sections->m_firstSent;
|
|
// sanity
|
|
//if ( sections && wordStart != 0 ) { char *xx=NULL;*xx=0; }
|
|
for ( ; ss ; ss = ss->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// count of the alnum words in sentence
|
|
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
|
|
// start with one word!
|
|
count--;
|
|
// how can it be less than one alnum word
|
|
if ( count < 0 ) continue;
|
|
// . base density rank on that
|
|
// . count is 0 for one alnum word now
|
|
int32_t dr = MAXDENSITYRANK - count;
|
|
// ensure not negative. make it at least 1. zero means un-set.
|
|
if ( dr < 1 ) dr = 1;
|
|
// mark all in sentence then
|
|
for ( int32_t i = ss->m_senta ; i < ss->m_sentb ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// assign
|
|
densVec[i] = dr;
|
|
}
|
|
}
|
|
// all done if using sections
|
|
if ( sections ) return true;
|
|
|
|
|
|
// count # of alphanumeric words in this string
|
|
int32_t na = 0;
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) if ( wids[i] ) na++;
|
|
// a single alnum should map to 0 "na"
|
|
na--;
|
|
// wtf?
|
|
if ( na < 0 ) return true;
|
|
// compute density rank
|
|
int32_t dr = MAXDENSITYRANK - na ;
|
|
// at least 1 to not be confused with 0 which means un-set
|
|
if ( dr < 1 ) dr = 1;
|
|
// assign
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// assign
|
|
densVec[i] = dr;
|
|
}
|
|
return true;
|
|
}
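// Quick arithmetic example for the density ranks above (MAXDENSITYRANK is
// defined elsewhere; assume it is 31 purely for illustration): a sentence
// with 5 alnum words gives count = 5 - 1 = 4, so every word in that
// sentence gets densVec[i] = 31 - 4 = 27, while a one-word heading keeps
// the full 31. Values are clamped to at least 1 because 0 is reserved for
// "not set".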
|
|
|
|
// . called by hashString() for hashing purposes, i.e. creating posdb keys
|
|
// . string is usually the document body or inlink text of an inlinker or
|
|
// perhaps meta keywords. it could be anything. so we need to create this
|
|
// vector based on that string, which is represented by words/phrases here.
|
|
bool getDiversityVec ( Words *words ,
|
|
Phrases *phrases ,
|
|
HashTableX *countTable ,
|
|
SafeBuf *sbWordVec ,
|
|
//SafeBuf *sbPhraseVec ,
|
|
int32_t niceness ) {
|
|
|
|
int64_t *wids = words->getWordIds ();
|
|
//nodeid_t *tids = words->getTagIds ();
|
|
int32_t nw = words->getNumWords();
|
|
int64_t *pids = phrases->getPhraseIds2();
|
|
|
|
// . make the vector
|
|
// . it will be diversity ranks, so one float per word for now
|
|
// cuz we convert to rank below though, one byte rank
|
|
if ( ! sbWordVec ->reserve ( nw*4 ) ) return false;
|
|
//if ( ! sbPhraseVec->reserve ( nw*4 ) ) return false;
|
|
|
|
// get it
|
|
float *ww = (float *)sbWordVec ->getBufStart();
|
|
//float *pw = (float *)sbPhraseVec->getBufStart();
|
|
|
|
int32_t nexti = -10;
|
|
int64_t pidLast = 0;
|
|
|
|
// . now consider ourselves the last word in a phrase
|
|
// . adjust the score of the first word in the phrase to be
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// yield
|
|
QUICKPOLL ( niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) { ww[i] = 0.0; continue; }
|
|
// try to inline this
|
|
int64_t nextWid = 0;
|
|
int64_t lastPid = 0;
|
|
// how many words in the bigram?
|
|
int32_t nwp = phrases->getNumWordsInPhrase2(i);
|
|
if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ;
|
|
if ( i == nexti ) lastPid = pidLast;
|
|
// get current pid
|
|
int64_t pid = pids[i];
|
|
// get the word and phrase weights for term #i
|
|
float ww2;
|
|
//float pw2;
|
|
getWordToPhraseRatioWeights ( lastPid , // pids[i-1],
|
|
wids[i] ,
|
|
pid ,
|
|
nextWid , // wids[i+1] ,
|
|
&ww2 ,
|
|
//&pw2 ,
|
|
countTable ,
|
|
1);//m_version );
|
|
// 0 to 1.0
|
|
if ( ww2 < 0 || ww2 > 1.0 ) { char *xx=NULL;*xx=0; }
|
|
// save the last phrase id
|
|
if ( nwp > 0 ) {
|
|
nexti = i + nwp - 1;
|
|
pidLast = pid; // pids[i] ;
|
|
}
|
|
// . apply the weights
|
|
// . do not hit all the way down to zero though...
|
|
// . Words.cpp::hash() will not index it then...
|
|
//if ( ww[i] > 0 ) {
|
|
ww[i] = ww2;
|
|
//}
|
|
/*
|
|
//if ( pw[i] > 0 ) {
|
|
pw[i] = (int32_t)(pw[i] * pw2);
|
|
if ( pw[i] <= 0 ) pw[i] = 1;
|
|
//}
|
|
|
|
// MDW: why was this here?
|
|
//if ( isLinkText ) continue;
|
|
|
|
// do not demote all the way to 0
|
|
//if ( ww[i] <= 0 ) ww[i] = 2;
|
|
|
|
// skip if phrase score is 0
|
|
if ( ! pw[i] ) continue;
|
|
|
|
if ( pid == 0 ) { pw[i] = 0; continue; }
|
|
// skip if does not start phrase
|
|
if ( nwp <= 0 ) continue;
|
|
// sanity check
|
|
if ( nwp == 99 ) { char *xx = NULL; *xx = 0; }
|
|
// now mod the score
|
|
float avg = pw[i];
|
|
// weight by punct in between
|
|
//for ( int32_t j = i+1 ; j < i+nwp ; j++ ) {
|
|
// if ( wids[j] ) continue;
|
|
// avg = (avg * (int64_t)pw[j]) / DW;
|
|
//}
|
|
// do not demote all the way to zero, we still want to index it
|
|
// and when normalized on a 100 point scale, like when printed
|
|
// out by PageParser.cpp, a score of 1 here gets normalized to
|
|
// 0, so make sure it is at least 2.
|
|
if ( avg < 2 )
|
|
avg = 2;
|
|
// set that as our new score
|
|
pw[i] = avg;
|
|
*/
|
|
}
|
|
|
|
// overwrite the array of floats with an array of chars (ranks)
|
|
char *nww = (char *)ww;
|
|
//char *npw = (char *)pw;
|
|
|
|
// convert from float into a rank from 0-15
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
if ( ! ww[i] ) { nww[i] = 0; continue; }
|
|
// 2.50 is max in getWordToPhraseRatioWeights() function
|
|
char wrank = (char) ((ww[i] * ((float)MAXDIVERSITYRANK))/.55);
|
|
// sanity
|
|
if ( wrank > MAXDIVERSITYRANK ) wrank = MAXDIVERSITYRANK;
|
|
if ( wrank < 0 ) { char *xx=NULL;*xx=0; }
|
|
//char prank = (char) ((pw[i] * 15.0) / 2.50);
|
|
// assign now
|
|
nww[i] = wrank;
|
|
//npw[i] = prank;
|
|
}
|
|
|
|
return true;
|
|
}
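// Note on the rank conversion above (illustrative, assuming
// MAXDIVERSITYRANK is 15): a word whose weight came back from
// getWordToPhraseRatioWeights() as ww = 0.50 is stored as
// (char)((0.50 * 15) / .55) = 13, and anything that would exceed
// MAXDIVERSITYRANK is clamped down to it, so weights of 0.55 and above
// all map to the top rank.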
|
|
|
|
// match word sequences of NUMWORDS or more words
|
|
#define NUMWORDS 5
|
|
|
|
// . repeated sentence frags
|
|
// . 1-1 with words in body of doc
|
|
char *XmlDoc::getFragVec ( ) {
|
|
|
|
if ( m_fragBufValid ) {
|
|
char *fb = m_fragBuf.getBufStart();
|
|
if ( ! fb ) return (char *)0x01;
|
|
return fb;
|
|
}
|
|
|
|
setStatus("getting frag vec");
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (char *)words;
|
|
Bits *bits = getBits();
|
|
if ( ! bits ) return NULL;
|
|
|
|
m_fragBuf.purge();
|
|
|
|
// ez vars
|
|
int64_t *wids = words->getWordIds ();
|
|
int32_t nw = words->getNumWords();
|
|
|
|
// if no words, nothing to do
|
|
if ( nw == 0 ) {
|
|
m_fragBufValid = true;
|
|
return (char *)0x01;//true;
|
|
}
|
|
|
|
	// truncate for performance reasons. i've seen this be over 4M
	// and it was VERY VERY SLOW... over 10 minutes...
	// - i saw this take over 200MB for an alloc for
	// WeightsSet3 below, so lower from 200k to 50k. this will probably
	// cause parsing inconsistencies for really large docs...
	if ( nw > MAXFRAGWORDS ) nw = MAXFRAGWORDS;
|
|
|
|
int64_t ringWids [ NUMWORDS ];
|
|
int32_t ringPos [ NUMWORDS ];
|
|
int32_t ringi = 0;
|
|
int32_t count = 0;
|
|
uint64_t h = 0;
|
|
|
|
// . make the hash table
|
|
// . make it big enough so there are gaps, so chains are not too long
|
|
int32_t minBuckets = (int32_t)(nw * 1.5);
|
|
uint32_t nb = 2 * getHighestLitBitValue ( minBuckets ) ;
|
|
int32_t need = nb * (8+4+4);
|
|
char *buf = NULL;
|
|
char tmpBuf[50000];
|
|
if ( need < 50000 ) buf = tmpBuf;
|
|
else buf = (char *)mmalloc ( need , "WeightsSet3" );
|
|
char *ptr = buf;
|
|
uint64_t *hashes = (uint64_t *)ptr; ptr += nb * 8;
|
|
int32_t *vals = (int32_t *)ptr; ptr += nb * 4;
|
|
float *ww = (float *)ptr; ptr += nb * 4;
|
|
if ( ! buf ) return NULL;
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) ww[i] = 1.0;
|
|
|
|
if ( ptr != buf + need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make the mask
|
|
uint32_t mask = nb - 1;
|
|
|
|
// clear the hash table
|
|
memset ( hashes , 0 , nb * 8 );
|
|
|
|
// clear ring of hashes
|
|
memset ( ringWids , 0 , NUMWORDS * 8 );
|
|
|
|
// for sanity check
|
|
int32_t lastStart = -1;
|
|
|
|
// . hash EVERY NUMWORDS-word sequence in the document
|
|
// . if we get a match look and see what sequences it matches
|
|
// . we allow multiple instances of the same hash to be stored in
|
|
// the hash table, so keep checking for a matching hash until you
|
|
// chain to a 0 hash, indicating the chain ends
|
|
// . check each matching hash to see if more than NUMWORDS words match
|
|
// . get the max words that matched from all of the candidates
|
|
// . demote the word and phrase weights based on the total/max
|
|
// number of words matching
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// yield
|
|
QUICKPOLL ( m_niceness );
|
|
// add new to the 5 word hash
|
|
h ^= wids[i];
|
|
// . remove old from 5 word hash before adding new...
|
|
// . initial ring wids are 0, so should be benign at startup
|
|
h ^= ringWids[ringi];
|
|
// add to ring
|
|
ringWids[ringi] = wids[i];
|
|
// save our position
|
|
ringPos[ringi] = i;
|
|
// wrap the ring ptr if we need to, that is why we are a ring
|
|
if ( ++ringi >= NUMWORDS ) ringi = 0;
|
|
// this 5-word sequence starts with word # "start"
|
|
int32_t start = ringPos[ringi];
|
|
// need at least NUMWORDS words in ring buffer to do analysis
|
|
if ( ++count < NUMWORDS ) continue;
|
|
// . skip if it starts with a word which can not start phrases
|
|
// . that way "a new car" being repeated a lot will not
|
|
// decrease the weight of the phrase term "new car"
|
|
// . setCountTable() calls set3() with this set to NULL
|
|
//if ( bits && ! bits->canStartPhrase(start) ) continue;
|
|
// sanity check
|
|
if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
|
|
// reset max matched
|
|
int32_t max = 0;
|
|
// look up in the hash table
|
|
uint32_t n = h & mask;
|
|
// sanity breach check
|
|
if ( n >= nb ) { char *xx=NULL;*xx=0; }
|
|
loop:
|
|
// all done if empty
|
|
if ( ! hashes[n] ) {
|
|
// sanity check
|
|
//if ( n >= nb ) { char *xx = NULL; *xx = 0; }
|
|
// add ourselves to the hash table now
|
|
hashes[n] = h;
|
|
// sanity check
|
|
//if ( wids[start] == 0 ) { char *xx = NULL; *xx = 0; }
|
|
// this is where the 5-word sequence starts
|
|
vals [n] = start;
|
|
// save it
|
|
lastStart = start;
|
|
// debug point
|
|
//if ( start == 7948 )
|
|
// log("heystart");
|
|
// do not demote words if less than NUMWORDS matched
|
|
if ( max < NUMWORDS ) continue;
|
|
// . how much we should we demote
|
|
// . 10 matching words pretty much means 0 weights
|
|
float demote = 1.0 - ((max-5)*.10);
|
|
if ( demote >= 1.0 ) continue;
|
|
if ( demote < 0.0 ) demote = 0.0;
|
|
|
|
// . RULE #26 ("long" phrases)
|
|
// . if we got 3, 4 or 5 in our matching sequence
|
|
// . basically divide by the # of *phrase* terms
|
|
// . multiply by 1/(N-1)
|
|
// . HOWEVER, should we also look at HOW MANY other
|
|
// sequences matches this too!???
|
|
//float demote = 1.0 / ((float)max-1.0);
|
|
// set3() is still called from setCountTable() to
|
|
// discount the effects of repeated fragments, and
|
|
// the count table only understands score or no score
|
|
//if ( max >= 15 ) demote = 0.0;
|
|
|
|
// demote the next "max" words
|
|
int32_t mc = 0;
|
|
int32_t j;
|
|
for ( j = start ; mc < max ; j++ ) {
|
|
// sanity
|
|
if ( j >= nw ) { char *xx=NULL;*xx=0; }
|
|
if ( j < 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip if not an alnum word
|
|
if ( ! wids[j] ) continue;
|
|
// count it
|
|
mc++;
|
|
// demote it
|
|
ww[j] = (int32_t)(ww[j] * demote);
|
|
if ( ww[j] <= 0 ) ww[j] = 2;
|
|
}
|
|
// save the original i
|
|
int32_t mini = i;
|
|
// advance i, it will be incremented by 1 immediately
|
|
// after hitting the "continue" statement
|
|
i = j - 1;
|
|
			// must be at least the original i, we are monotonic
			// otherwise ringPos[] will not be monotonic and core
			// dump ultimately cuz j and k will be equal below
			// and we increment matched++ forever.
			if ( i < mini ) i = mini;
|
|
// get next word
|
|
continue;
|
|
}
|
|
// get next in chain if hash does not match
|
|
if ( hashes[n] != h ) {
|
|
// wrap around the hash table if we hit the end
|
|
if ( ++n >= nb ) n = 0;
|
|
// check out bucket #n now
|
|
goto loop;
|
|
}
|
|
// how many words match so far
|
|
int32_t matched = 0;
|
|
// . we have to check starting at the beginning of each word
|
|
// sequence since the XOR compositional hash is order
|
|
// independent
|
|
// . see what word offset this guy has
|
|
int32_t j = vals[n] ;
|
|
// k becomes the start of the current 5-word sequence
|
|
int32_t k = start;
|
|
// sanity check
|
|
if ( j == k ) { char *xx = NULL; *xx = 0; }
|
|
// skip to next in chain to check later
|
|
if ( ++n >= nb ) n = 0;
|
|
		// keep advancing k and j as long as the words match
	matchLoop:
|
|
// get next wid for k and j
|
|
while ( k < nw && ! wids[k] ) k++;
|
|
while ( j < nw && ! wids[j] ) j++;
|
|
if ( k < nw && wids[k] == wids[j] ) {
|
|
matched++;
|
|
k++;
|
|
j++;
|
|
goto matchLoop;
|
|
}
|
|
// keep track of the max matched for i0
|
|
if ( matched > max ) max = matched;
|
|
// get another matching string of words, if possible
|
|
goto loop;
|
|
}
|
|
|
|
if ( nw <= 0 ) { char *xx=NULL;*xx=0;}
|
|
|
|
// make space
|
|
if ( ! m_fragBuf.reserve ( nw ) ) {
|
|
// save it
|
|
int32_t saved = g_errno;
|
|
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
|
|
// reinstate it
|
|
g_errno = saved;
|
|
return NULL;
|
|
}
|
|
// validate
|
|
m_fragBufValid = true;
|
|
// handy ptr
|
|
char *ff = m_fragBuf.getBufStart();
|
|
|
|
// convert from floats into frag score, 0 or 1 really
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
if ( ww[i] <= 0.0 ) ff[i] = 0;
|
|
else ff[i] = 1;
|
|
}
|
|
|
|
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
|
|
|
|
// wtf?
|
|
if ( ! ff ) { char *xx=NULL;*xx=0; }
|
|
|
|
return ff;
|
|
}
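// Sketch of the rolling hash used above (hypothetical wids): with
// NUMWORDS = 5 the hash h is just the XOR of the last five word ids, so
// advancing one word is h ^= newWid followed by h ^= widThatFellOut (kept
// in ringWids[]). Because XOR is order independent, a bucket hit only
// means "same five words in some order", which is why the code then walks
// wids[] from both starting positions and counts how many words actually
// match before demoting anything.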
|
|
|
|
float g_wtab[30][30];
|
|
|
|
// . inline this for speed
|
|
// . if a word repeats in different phrases, promote the word
|
|
// and demote the phrase
|
|
// . if a word repeats in pretty much the same phrase, promote
|
|
// the phrase and demote the word
|
|
// . if you have the window of text "new mexico good times"
|
|
// and word #i is mexico, then:
|
|
// pid1 is "new mexico"
|
|
// wid1 is "mexico"
|
|
// pid2 is "mexico good"
|
|
// wid2 is "good"
|
|
// . we store sliderParm in titleRec so we can update it along
|
|
// with title and header weights on the fly from the spider controls
|
|
void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
|
|
int64_t wid1 ,
|
|
int64_t pid2 ,
|
|
int64_t wid2 , // post word
|
|
float *retww ,
|
|
//float *retpw ,
|
|
HashTableX *tt1 ,
|
|
int32_t titleRecVersion ) {
|
|
|
|
static float s_fsp;
|
|
// from 0 to 100
|
|
char sliderParm = g_conf.m_sliderParm;
|
|
// i'm not too keen on putting this as a parm in the CollectionRec
|
|
// because it is so cryptic...
|
|
//static char sliderParm = 25;
|
|
|
|
// . to support RULE #15 (word to phrase ratio)
|
|
// . these weights are based on the ratio of word to phrase count
|
|
// for a particular word
|
|
static char s_sp = -1;
|
|
if ( s_sp != sliderParm ) {
|
|
// . set it to the newly updated value
|
|
// . should range from 0 up to 100
|
|
s_sp = sliderParm;
|
|
// the float version
|
|
s_fsp = (float)sliderParm / 100.0;
|
|
// sanity test
|
|
if ( s_fsp < 0.0 || s_fsp > 1.0 ) { char *xx = NULL; *xx = 0; }
|
|
// i is the word count, how many times a particular word
|
|
// occurs in the document
|
|
for ( int32_t i = 0 ; i < 30 ; i++ ) {
|
|
// . k is the phrase count, how many times a particular phrase
|
|
// occurs in the document
|
|
// . k can be GREATER than i because we index only phrase terms
|
|
// sometimes when indexing neighborhoods, and not the
|
|
// single words that compose them
|
|
for ( int32_t k = 0 ; k < 30 ; k++ ) {
|
|
// do not allow phrase count to be greater than
|
|
// word count, even though it can happen since we
|
|
// add imported neighborhood pwids to the count table
|
|
int32_t j = k;
|
|
if ( k > i ) j = i;
|
|
// get ratio
|
|
//float ratio = (float)phrcount / (float)wrdcount;
|
|
float ratio = (float)j/(float)i;
|
|
// it should be impossible that this can be over 1.0
|
|
// but might happen due to hash collisions
|
|
if ( ratio > 1.0 ) ratio = 1.0;
|
|
// restrict the range we can weight a word or phrase
|
|
// based on the word count
|
|
//float r = 1.0;
|
|
//if ( i >= 20 ) r = 2.1;
|
|
//else if ( i >= 10 ) r = 1.8;
|
|
//else if ( i >= 4 ) r = 1.5;
|
|
//else r = 1.3;
|
|
//g_ptab[i][k] = 1.00;
|
|
g_wtab[i][k] = 1.00;
|
|
if ( i <= 1 ) continue;
|
|
// . we used to have a sliding bar between 0.0 and 1.0.
|
|
// word is weighted (1.0 - x) and phrase is weighted
|
|
// by (x). however, x could go all the way to 1.0
|
|
// even when i = 2, so we need to restrict x.
|
|
// . x is actually "ratio"
|
|
// . when we have 8 or less word occurrences, do not
|
|
// remove more than 80% of its score, a 1/5 penalty
|
|
// is good enough for now. but for words that occur
|
|
// a lot in the link text or pwids, go to town...
|
|
if ( i <= 2 && ratio >= .50 ) ratio = .50;
|
|
else if ( i <= 4 && ratio >= .60 ) ratio = .60;
|
|
else if ( i <= 8 && ratio >= .80 ) ratio = .80;
|
|
else if ( i <= 12 && ratio >= .95 ) ratio = .95;
|
|
// round up, so many "new mexico" phrases but only
|
|
// make it up to 95%...
|
|
if ( ratio >= .95 ) ratio = 1.00;
|
|
// if word's phrase is repeated 3 times or more then
|
|
// is a pretty good indication that we should weight
|
|
// the phrase more and the word itself less
|
|
//if ( k >= 3 && ratio < .90 ) ratio = .90;
|
|
// compute the weights
|
|
//float pw = 2.0 * ratio;
|
|
//float ww = 2.0 * (1.0 - ratio);
|
|
float ww = (1.0 - ratio);
|
|
|
|
// . punish words a little more
|
|
// . if we got 50% ratio, words should not get as much
|
|
// weight as the phrase
|
|
//ww *= .45;
|
|
// do not weight to 0, no less than .15
|
|
if ( ww < 0.0001 ) ww = 0.0001;
|
|
//if ( pw < 0.0001 ) pw = 0.0001;
|
|
// do not overpromote either
|
|
//if ( ww > 2.50 ) ww = 2.50;
|
|
//if ( pw > 2.50 ) pw = 2.50;
|
|
// . do a sliding weight of the weight
|
|
// . a "ww" of 1.0 means to do no weight
|
|
// . can't do this for ww cuz we use "mod" below
|
|
//float newWW = s_fsp*ww + (1.0-s_fsp)*1.00;
|
|
//float newPW = s_fsp*pw + (1.0-s_fsp)*1.00;
|
|
// limit how much we promote a word because it
|
|
// may occur 30 times total, but have a phrase count
|
|
// of only 1. however, the other 29 times it occurs it
|
|
// is in the same phrase, just not this particular
|
|
// phrase.
|
|
//if ( ww > 2.0 ) ww = 2.0;
|
|
g_wtab[i][k] = ww;
|
|
//g_ptab[i][k] = newPW;
|
|
//logf(LOG_DEBUG,"build: wc=%" INT32 " pc=%" INT32 " ww=%.2f "
|
|
//"pw=%.2f",i,k,g_wtab[i][k],g_ptab[i][k]);
|
|
}
|
|
}
|
|
}
|
|
|
|
int32_t phrcount1 = 0;
|
|
int32_t phrcount2 = 0;
|
|
int32_t wrdcount1 = 0;
|
|
int32_t wrdcount2 = 0;
|
|
if ( tt1->m_numSlotsUsed > 0 ) {
|
|
if (pid1) phrcount1 = tt1->getScore(&pid1);
|
|
if (pid2) phrcount2 = tt1->getScore(&pid2);
|
|
if (wid1) wrdcount1 = tt1->getScore(&wid1);
|
|
if (wid2) wrdcount2 = tt1->getScore(&wid2);
|
|
}
|
|
// if we are always ending the same phrase, like "Mexico"
|
|
// in "New Mexico"... get the most popular phrase this word is
|
|
// in...
|
|
int32_t phrcountMax = phrcount1;
|
|
int32_t wrdcountMin = wrdcount1;
|
|
// these must actually exist to be part of the selection
|
|
if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2;
|
|
if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2;
|
|
|
|
|
|
// . but if we are 'beds' and in a popular phrase like 'dog beds'
|
|
// there maybe a lot of other phrases mentioned that have 'beds'
|
|
// in them like 'pillow beds', 'pet beds', but we need to assume
|
|
// that is phrcountMax is high enough, do not give much weight to
|
|
// the word... otherwise you can subvert this algorithm by just
|
|
// adding other random phrases with the word 'bed' in them.
|
|
// . BUT, if a page has 'X beds' with a lot of different X's then you
|
|
// still want to index 'beds' with a high score!!! we are trying to
|
|
// balance those 2 things.
|
|
// . do this up here before you truncate phrcountMax below!!
|
|
float mod = 1.0;
|
|
if ( phrcountMax <= 6 ) mod = 0.50;
|
|
else if ( phrcountMax <= 8 ) mod = 0.20;
|
|
else if ( phrcountMax <= 10 ) mod = 0.05;
|
|
else if ( phrcountMax <= 15 ) mod = 0.03;
|
|
else mod = 0.01;
|
|
|
|
// scale wrdcount1/phrcountMax down for the g_wtab table
|
|
if ( wrdcount1 > 29 ) {
|
|
float ratio = (float)phrcountMax / (float)wrdcount1;
|
|
phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
|
|
wrdcount1 = 29;
|
|
}
|
|
if ( phrcountMax > 29 ) {
|
|
float ratio = (float)wrdcount1 / (float)phrcountMax;
|
|
wrdcount1 = (int32_t)((29.0 * ratio) + 0.5);
|
|
phrcountMax = 29;
|
|
}
|
|
|
|
// . sanity check
|
|
// . neighborhood.cpp does not always have wid/pid pairs
|
|
// that match up right for some reason... so we can't do this
|
|
//if ( phrcount1 > wrdcount1 ) { char *xx = NULL; *xx = 0; }
|
|
//if ( phrcount2 > wrdcount2 ) { char *xx = NULL; *xx = 0; }
|
|
|
|
// apply the weights from the table we computed above
|
|
*retww = mod * g_wtab[wrdcount1][phrcountMax];
|
|
|
|
// slide it
|
|
*retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00;
|
|
|
|
// ensure we do not punish too hard
|
|
if ( *retww <= 0.0 ) *retww = 0.01;
|
|
|
|
if ( *retww > 1.0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
if ( phrcountMax >= 0 ) {
|
|
int64_t sh = getPrefixHash ( (char *)NULL , 0 , NULL , 0 );
|
|
int64_t tid = g_indexdb.getTermId ( sh , wid1 );
|
|
logf(LOG_DEBUG,"build: phrcountMax=%" INT32 " wrdCount1=%" INT32 " "
|
|
"*ww=%.4f for word with tid=%" UINT64 "",
|
|
phrcountMax,wrdcount1,(float)*ww,tid);
|
|
//if ( phrcountMax < 10 && tid == 16944700235015LL )
|
|
// log("hey");
|
|
}
|
|
*/
|
|
|
|
// sanity check
|
|
//if ( *ww == 0.0 ) { char *xx = NULL; *xx = 0; }
|
|
|
|
/*
|
|
// scale wrdcountMin/phrcount down for the g_ptab table
|
|
if ( wrdcountMin > 29 ) {
|
|
float ratio = (float)phrcount2 / (float)wrdcountMin;
|
|
phrcount2 = (int32_t)((29.0 * ratio) + 0.5);
|
|
wrdcountMin = 29;
|
|
}
|
|
if ( phrcount2 > 29 ) {
|
|
float ratio = (float)wrdcountMin / (float)phrcount2;
|
|
wrdcountMin = (int32_t)((29.0 * ratio) + 0.5);
|
|
phrcount2 = 29;
|
|
}
|
|
*/
|
|
// . if the word is Mexico in 'New Mexico good times' then
|
|
// phrase term #i which is, say, "Mexico good" needs to
|
|
// get the min word count when doings its word to phrase
|
|
// ratio.
|
|
// . it has two choices, it can use the word count of
|
|
// "Mexico" or it can use the word count of "good".
|
|
// . say, each is pretty high in the document so the phrase
|
|
// ends up getting penalized heavily, which is good because
|
|
// it is a nonsense phrase.
|
|
// . if we had "united socialist soviet republic" repeated
|
|
// a lot, the phrase "socialist soviet" would score high
|
|
// and the individual words would score low. that is good.
|
|
// . try to seek the highest weight possible for this phrase
|
|
// by choosing the lowest word count possible
|
|
// . NO LONGER AFFECT phrase weights because just because the
|
|
// words occur a lot in the document and this may be the only
|
|
// occurrence of this phrase, does not mean we should punish
|
|
// the phrase. -- MDW
|
|
//*retpw = 1.0;
|
|
return;
|
|
|
|
// do it the old way...
|
|
//*pw = g_ptab[wrdcountMin][phrcount2];
|
|
|
|
// sanity check
|
|
//if ( *pw == 0.0 ) { char *xx = NULL; *xx = 0; }
|
|
}
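// Worked example for the table lookup above (made-up counts): a word that
// occurs 10 times (wrdcount1 = 10) whose most frequent containing phrase
// occurs 8 times (phrcountMax = 8) gives a ratio of 0.8, so
// g_wtab[10][8] = 1.0 - 0.8 = 0.2. The phrase-count modifier for
// phrcountMax <= 8 is 0.20, so the raw word weight is 0.2 * 0.2 = 0.04
// before the sliderParm blend pulls it back toward 1.0 (it stays at 0.04
// only when sliderParm is 100).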
|
|
|
|
// for registerSleepCallback
|
|
static void clockSyncWaitWrapper ( int fd , void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . a special call
|
|
// . returns -1 if blocked, 1 otherwise, 0 on error
|
|
char XmlDoc::waitForTimeSync ( ) {
|
|
// unregister?
|
|
if ( isClockInSync() && m_alreadyRegistered ) {
|
|
// note it
|
|
log("build: clock now synced for %s",m_firstUrl.m_url);
|
|
g_loop.unregisterSleepCallback(m_masterState,
|
|
clockSyncWaitWrapper);
|
|
}
|
|
// return 1 if synced!
|
|
if ( isClockInSync() ) return 1;
|
|
// already registered? wait another 1000ms
|
|
if ( m_alreadyRegistered ) return -1;
|
|
// flag it
|
|
m_alreadyRegistered = true;
|
|
// note it
|
|
log("build: waiting for clock to sync for %s",m_firstUrl.m_url);
|
|
// this should mean it is re-called later
|
|
if ( g_loop.registerSleepCallback ( 1000 , // 1000 ms
|
|
m_masterState ,
|
|
clockSyncWaitWrapper ,
|
|
m_niceness ))
|
|
// wait for it, return -1 since we blocked
|
|
return -1;
|
|
// if was not able to register, ignore delay
|
|
log("doc: failed to register clock wait callback");
|
|
return 0;
|
|
}
|
|
|
|
////////////////////////////
//
// SCRAPING TOOLS
//
////////////////////////////
|
|
|
|
void doInjectLoopWrapper ( void *state ) {
|
|
XmlDoc *XD = (XmlDoc *)state;
|
|
// if it blocked, wait
|
|
if ( ! XD->doInjectLoop ( ) ) return;
|
|
// . if we did not inject any links, i guess we are done!
|
|
// . this happens if the ahrefs.com doc had the same outlinks
|
|
// as the ahrefs.com doc for another search result, they are all
|
|
// deduped and it does not block.
|
|
XD->m_finalCallback ( XD->m_finalState );
|
|
}
|
|
|
|
// . return false if blocks, true otherwise
// . return true and set g_errno on error, with no blocks outstanding
// . TODO: make this work for ahrefs.com list of links in xml feed
|
|
bool XmlDoc::injectLinks (HashTableX *linkDedupTablePtr ,
|
|
HashTableX *domDedupTablePtr,
|
|
void *finalState ,
|
|
void (* finalCallback)(void *)) {
|
|
|
|
// INJECT 10 at a time. xmldoc is 1MB.
|
|
int32_t i; for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
|
|
XmlDoc *nd;
|
|
// continue if already set it. this was overwriting it
|
|
// and causing a mem leak before
|
|
if ( m_xmlDocs[i] ) continue;
|
|
try { nd = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
break;
|
|
}
|
|
mnew ( nd , sizeof(XmlDoc),"xmldocarr");
|
|
m_xmlDocs[i] = nd;
|
|
}
|
|
|
|
// all null?
|
|
if ( i < (int32_t)MAX_XML_DOCS ) {
|
|
log("scrape: one xmldoc alloc failed");
|
|
return true;
|
|
}
|
|
|
|
m_masterLoop = doInjectLoopWrapper;
|
|
m_masterState = this;
|
|
|
|
m_finalState = finalState;
|
|
m_finalCallback = finalCallback;
|
|
|
|
// note it
|
|
//log("xmldoc: injecting outlinks of %s",m_firstUrl.getUrl());
|
|
|
|
m_linkDedupTablePtr = linkDedupTablePtr;
|
|
m_domDedupTablePtr = domDedupTablePtr;
|
|
|
|
// loop over all links
|
|
m_i = 0;
|
|
m_blocked = 0;
|
|
memset ( m_used , 0 , (int32_t)MAX_XML_DOCS );
|
|
|
|
return doInjectLoop();
|
|
}
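// Note on the pool pattern above: injectLinks() pre-allocates up to
// MAX_XML_DOCS child XmlDocs, and doInjectLoop() hands each outlink to the
// first slot whose m_used[] flag is clear; when every slot is busy the
// loop returns false (blocked) and is presumably re-entered via the master
// loop once an outstanding injection completes.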
|
|
|
|
|
|
void doneInjectingWrapper ( void *state ) {
|
|
XmlDoc *xd = (XmlDoc *)state;
|
|
XmlDoc *XD = (XmlDoc *)xd->m_hack;
|
|
XD->doneInjecting ( xd );
|
|
}
|
|
|
|
// . return false if blocks, true otherwise
|
|
// . return true and set error on error, with no blocks outstanding
|
|
bool XmlDoc::doInjectLoop ( ) {
|
|
|
|
setStatus("inject outlinks");
|
|
|
|
//Links *links = getLinks();
|
|
//if ( ! links ) return (m_blocked == 0);
|
|
//if ( links == (void *)-1 ) return false;
|
|
Sections *sections = getSections();
|
|
if ( ! sections ) return (m_blocked == 0);
|
|
if ( sections == (void *)-1 ) return false;
|
|
Links *links = getLinks();
|
|
if ( ! links ) return (m_blocked == 0);
|
|
if ( links == (void *)-1 ) return false;
|
|
Words *words = getWords();
|
|
if ( ! words ) return (m_blocked == 0);
|
|
if ( words == (void *)-1 ) return false;
|
|
Bits *bp = getBits();
|
|
if ( ! bp ) return (m_blocked == 0);
|
|
if ( bp == (void *)-1 ) return false;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
int32_t n = links->getNumLinks();
|
|
Url tmpUrl;
|
|
Section *prev = NULL;
|
|
// scan the links now
|
|
for ( ; m_i < n ; ) {
|
|
// get xml doc then
|
|
int32_t j; for ( j = 0 ; j < MAX_XML_DOCS ; j++ )
|
|
if ( ! m_used[j] ) break;
|
|
// none? return false if blocked.
|
|
if ( j >= MAX_XML_DOCS ) return false;
|
|
// get the m_ith link
|
|
char *link = links->getLink ( m_i );
|
|
int32_t linkLen = links->getLinkLen ( m_i );
|
|
// temp term
|
|
if ( link[linkLen] ) { char *xx=NULL;*xx=0; }
|
|
// skip to next link to index
|
|
m_i++;
|
|
// skip injecting if its an internal bing/google outlink
|
|
if ( strncmp(link,"http://www.bing.com/",20) == 0 )
|
|
continue;
|
|
// skip youtube query links. they contain our exact
|
|
// query!! so almost always come up #1
|
|
if ( strstr(link,".youtube.com/") && strstr(link,"&q="))
|
|
continue;
|
|
if ( strstr(link,".msn.com/") )
|
|
continue;
|
|
if ( strstr(link,".microsoft.com/") )
|
|
continue;
|
|
if ( strstr(link,".discoverbing.com/") )
|
|
continue;
|
|
if ( strstr(link,".googleusercontent.com/") )
|
|
continue;
|
|
//if(!strncmp(link,"http://webcache.googleusercontent.com/",38)
|
|
if(!strncmp(link,"http://www.google.com/url?q=http",32)){
|
|
// grab the real url from that
|
|
char *embed = strstr(link,"url?q=http");
|
|
if ( ! embed ) continue;
|
|
link = embed+6;
|
|
char *end = embed;
|
|
for ( ; *end && *end != '&' ; end++) {
|
|
// google appends query to url.. strange
|
|
//if ( end[0] == '%' &&
|
|
// end[1] == '2' &&
|
|
// to_lower_a(end[2]) == 'b' )
|
|
// break;
|
|
}
|
|
SafeBuf mbuf;
|
|
mbuf.reserve ( end - link + 100 );
|
|
int32_t dlen;
|
|
char *bs = mbuf.getBufStart();
|
|
dlen=urlDecode(bs,link , end - link );
|
|
bs[dlen] = '\0';
|
|
tmpUrl.set ( bs );
|
|
link = tmpUrl.getUrl();
|
|
linkLen = tmpUrl.getUrlLen();
|
|
}
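// The block above recovers the real landing page from a Google redirect
// of the form "http://www.google.com/url?q=<encoded-target>&...": skip
// past "url?q=", percent-decode up to the next '&', and use that as the
// link. A self-contained sketch of the same decode, for illustration
// only (urlDecode() above is the in-house routine that actually runs):
/*
#include <string>
#include <cstdlib>
static std::string extractRedirectTarget ( const std::string &link ) {
	// find the embedded, percent-encoded target
	size_t q = link.find ( "url?q=" );
	if ( q == std::string::npos ) return "";
	size_t start = q + 6;
	size_t end   = link.find ( '&' , start ); // target ends at next param
	if ( end == std::string::npos ) end = link.size();
	std::string out;
	for ( size_t i = start ; i < end ; i++ ) {
		if ( link[i] == '%' && i + 2 < end ) {
			// decode one %XX escape
			char hex[3] = { link[i+1] , link[i+2] , 0 };
			out += (char)strtol ( hex , NULL , 16 );
			i += 2;
		}
		else out += link[i];
	}
	return out; // e.g. a hypothetical "http://example.com/"
}
*/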
|
|
// skip maps.google.com etc.
|
|
if ( strstr(link,".google.com/") )
|
|
continue;
|
|
|
|
// ok, point to title and summary for this result!
|
|
// go up to prev node for first non-clickable text which
|
|
// should be summary
|
|
//Section **sp = sections->m_sectionPtrs;
|
|
// get the section
|
|
int32_t ln = links->getNodeNum(m_i-1);
|
|
// get node ptr
|
|
XmlNode *node = m_xml.getNodePtr(ln);
|
|
char *ptr = node->m_node;
|
|
// find section that contains it i guess
|
|
Section *sx = sections->m_rootSection;
|
|
Section *last = NULL;
|
|
char **wptrs = words->getWords();
|
|
//nodeid_t *tids = words->getTagIds();
|
|
for ( ; sx ; sx = sx->m_next ) {
|
|
// get section ptr
|
|
char *sw = wptrs[sx->m_b-1];
|
|
if ( sw < ptr ) continue;
|
|
// over?
|
|
sw = wptrs[sx->m_a];
|
|
if ( sw > ptr ) break;
|
|
last = sx;
|
|
}
|
|
// assign
|
|
sx = last;
|
|
// telescope section up one i guess
|
|
//sx = sx->m_parent;
|
|
// shortcut
|
|
wbit_t *bits = bp->m_bits;
|
|
// if still same first alnum, go another
|
|
//for ( ; sx ; sx = sx->m_parent ) {
|
|
// // skip if same word starts this section
|
|
// //if ( sx->m_firstWordPos == fa ) continue;
|
|
// // must have alnum
|
|
// if ( sx->m_firstWordPos <= 1 ) continue;
|
|
// // must be in link! should be the result TITLE
|
|
// if ( bits[sx->m_firstWordPos] & D_IN_LINK ) break;
|
|
// // word must not be "cached" or whatever...
|
|
//}
|
|
// if in bold tag, should telescope up some more
|
|
//if ( sx && sx->m_tagId == TAG_B ) sx = sx->m_parent;
|
|
//if ( sx && sx->m_tagId == TAG_STRONG ) sx = sx->m_parent;
|
|
// save
|
|
//int32_t fa = sx->m_firstWordPos;
|
|
// that's the title so telescope up as long as that is the
|
|
// first alnum!!!
|
|
for ( ; sx ; sx = sx->m_parent ) {
|
|
//Section *ps = sx->m_parent;
|
|
// do we have a next brother? stop then! that means
|
|
// we are in a list!
|
|
//if ( sx->m_nextBrother ) break;
|
|
//if ( ps->m_firstWordPos != fa ) break;
|
|
// stop when we hit a result delimiter!!
|
|
if ( sx->m_tagId == TAG_LI ) {
|
|
// bing...
|
|
if ( strncmp(wptrs[sx->m_a],
|
|
"<li class=\"sa_wr\">",
|
|
17) == 0 ) {
|
|
break;
|
|
}
|
|
// google...
|
|
if ( strncmp(wptrs[sx->m_a],
|
|
"<li class=\"g\">",
|
|
13) == 0 ) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
// if no indicator, bail
|
|
if ( ! sx ) continue;
|
|
// skip link if contained in prev section
|
|
if ( prev == sx )
|
|
continue;
|
|
// save it
|
|
prev = sx;
|
|
// record search result details
|
|
Section *title = NULL;
|
|
Section *cite = NULL;
|
|
Section *summary = NULL;
|
|
// . that is probably the full result then...
|
|
// . title is first sentence
|
|
for ( ; sx ; sx = sx->m_next ) {
|
|
// only sentences
|
|
if ( ! ( sx->m_flags & SEC_SENTENCE ) ) continue;
|
|
// grab it
|
|
if ( ! title ) {
|
|
title = sx;
|
|
continue;
|
|
}
|
|
// skip section if in link
|
|
if ( bits[sx->m_firstWordPos] & D_IN_LINK ) continue;
|
|
// we are sentence section so fix it so we are one
|
|
// above!
|
|
Section *rs = sx; // ->m_parent;
|
|
// telescope up to a div or whatever...
|
|
//for ( ; rs ; rs = rs->m_parent ) {
|
|
// if ( rs->m_tagId == TAG_DIV ) break;
|
|
// if ( rs->m_tagId == TAG_P ) break;
|
|
//}
|
|
// and out of bold
|
|
if ( rs && rs->m_tagId == TAG_B ) rs = rs->m_parent;
|
|
if ( rs && rs->m_tagId == TAG_STRONG) rs=rs->m_parent;
|
|
// bail if no good!
|
|
if ( ! rs ) continue;
|
|
// then site if google
|
|
if ( ! cite ) {
|
|
cite = rs;
|
|
continue;
|
|
}
|
|
// then summary
|
|
summary = rs;
|
|
break;
|
|
}
|
|
m_serpBuf.safePrintf("\t\t<result>\n");
|
|
// print <title> tag
|
|
if ( title ) printSerpFiltered(title,"title");
|
|
// print <sum> tag
|
|
if ( summary ) printSerpFiltered(summary,"sum");
|
|
m_serpBuf.safePrintf("\t\t\t<url>");
|
|
m_serpBuf.safeMemcpy ( link , linkLen );
|
|
m_serpBuf.safePrintf("</url>\n");
|
|
m_serpBuf.safePrintf("\t\t</result>\n");
|
|
|
|
|
|
// if not injecting, skip
|
|
//continue;
|
|
if ( ! m_reallyInjectLinks ) continue;
|
|
|
|
// dedup
|
|
int32_t linkHash32 = hash32 ( link , linkLen );
|
|
if ( m_linkDedupTablePtr &&
|
|
m_linkDedupTablePtr->isInTable (&linkHash32) ) continue;
|
|
// add it otherwise
|
|
if ( m_linkDedupTablePtr )
|
|
m_linkDedupTablePtr->addKey ( &linkHash32 );
|
|
|
|
// we use this when injecting ahrefs links
|
|
if ( m_domDedupTablePtr ) {
|
|
int32_t domLen;
|
|
char *dom = getDomFast ( link , &domLen );
|
|
int32_t dh32 = hash32 ( dom , domLen );
|
|
if ( m_domDedupTablePtr->isInTable (&dh32) ) continue;
|
|
m_domDedupTablePtr->addKey ( &dh32 );
|
|
}
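// The two tables above form a two-level dedup: drop the link if we have
// already queued this exact url, and (when a domain table is supplied,
// as for the ahrefs feed) drop it if we already queued something from
// the same domain. A rough sketch of the same idea with
// std::unordered_set standing in for HashTableX:
/*
#include <cstdint>
#include <unordered_set>
struct LinkDedup {
	std::unordered_set<uint32_t> seenUrls;    // keyed by hash32(url)
	std::unordered_set<uint32_t> seenDomains; // keyed by hash32(domain)
	// returns true if the link should be injected
	bool shouldInject ( uint32_t urlHash32 , uint32_t domHash32 ,
			    bool dedupByDomain ) {
		// insert() reports in .second whether the key was new
		if ( ! seenUrls.insert(urlHash32).second ) return false;
		if ( dedupByDomain &&
		     ! seenDomains.insert(domHash32).second ) return false;
		return true;
	}
};
*/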
|
|
|
|
// get it
|
|
XmlDoc *xd = m_xmlDocs[j];
|
|
|
|
if ( ! xd ) { char *xx=NULL;*xx=0; }
|
|
|
|
// add www to it
|
|
Url lu;
|
|
lu.set ( link , linkLen , true );
|
|
|
|
char *wwwLink = lu.getUrl();
|
|
|
|
// this can go on the stack since set4() copies it
|
|
SpiderRequest sreq;
|
|
sreq.reset();
|
|
// index this link!
|
|
strcpy(sreq.m_url,wwwLink);
|
|
// parentdocid of 0
|
|
int32_t firstIp = hash32n(wwwLink);
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
sreq.setKey( firstIp,0LL, false );
|
|
sreq.m_isInjecting = 1;
|
|
sreq.m_isPageInject = 1;
|
|
sreq.m_hopCount = 0;//hopCount;
|
|
sreq.m_hopCountValid = 1;
|
|
sreq.m_fakeFirstIp = 1;
|
|
sreq.m_firstIp = firstIp;
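// The SpiderRequest needs a "first IP" for its key, but resolving one
// per outlink would be slow, so a fake, nonzero value is derived from a
// hash of the url and flagged via m_fakeFirstIp. A minimal sketch of
// that derivation (hash32n() is the in-house hash; any 32-bit hash
// illustrates the idea):
/*
#include <cstdint>
static int32_t makeFakeFirstIp ( uint32_t urlHash32 ) {
	int32_t ip = (int32_t)urlHash32;
	// 0 and -1 are reserved/invalid, so nudge those to 1
	if ( ip == 0 || ip == -1 ) ip = 1;
	return ip;
}
*/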
|
|
|
|
setStatus("injecting an outlink");
|
|
|
|
// . use the enormous power of our new XmlDoc class
|
|
// . this returns false with g_errno set on error
|
|
if ( ! xd->set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// give it a niceness of 1, we have to be
|
|
// careful since we are a niceness of 0!!!!
|
|
m_niceness, // 1 ,
|
|
// inject this content
|
|
NULL, // content ,
|
|
false, // deleteFromIndex ,
|
|
0, // forcedIp ,
|
|
0, // contentType ,
|
|
0, // lastSpidered ,
|
|
false )) { // hasMime
|
|
// . g_errno should be set if that returned false
|
|
// . return true if does not need to block
|
|
log("xmldoc: outlink inject: %s",mstrerror(g_errno));
|
|
break;
|
|
}
|
|
|
|
xd->m_hack = this;
|
|
|
|
// make this our callback in case something blocks
|
|
xd->setCallback ( xd , doneInjectingWrapper );
|
|
// . set xd from the old title rec if recycle is true
|
|
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
|
xd->m_recycleContent = false;//true;
|
|
|
|
// avoid looking up ip of each outlink to add "firstip" tag to
|
|
// tagdb because that can be slow!!!!!!!
|
|
xd->m_spiderLinks = false;
|
|
xd->m_spiderLinks2 = false;
|
|
xd->m_spiderLinksValid = true;
|
|
|
|
// . newOnly is true --> do not inject if document is already
|
|
// indexed!
|
|
// . maybe just set indexCode
|
|
xd->m_newOnly = true;//false;//newOnly;
|
|
// need to refresh it!!
|
|
//xd->m_newOnly = false;//newOnly;
|
|
|
|
// turn off robots.txt lookups
|
|
xd->m_isAllowed = true;
|
|
xd->m_isAllowedValid = true;
|
|
xd->m_crawlDelay = -1; // unknown
|
|
xd->m_crawlDelayValid = true;
|
|
|
|
// log it now
|
|
log("inject: indexing outlink %s (hash=%" UINT32 ")",wwwLink,
|
|
(uint32_t)linkHash32);
|
|
|
|
// costs one API unit, which is one cent. but if we do
|
|
// top 50 on google, top 50 on procog, it can be like
|
|
// $1 every time we do this.
|
|
//xd->injectAhrefsLinks();
|
|
|
|
bool status = true;
|
|
|
|
// this will tell it to index ahrefs first before indexing
|
|
// the doc. but do NOT do this if we are from ahrefs.com
|
|
// ourselves to avoid recursive explosion!!
|
|
xd->m_downloadLevel = m_downloadLevel + 1;
|
|
xd->m_useAhrefs = m_useAhrefs;
|
|
|
|
// inherit dedup tables as well!
|
|
xd->m_linkDedupTablePtr = m_linkDedupTablePtr;
|
|
|
|
// . now tell it to index
|
|
// . this returns false if blocked
|
|
status = xd->indexDoc ( );
|
|
|
|
// log it. i guess only for errors when it does not block?
|
|
// because xmldoc.cpp::indexDoc calls logIt()
|
|
if ( status ) xd->logIt();
|
|
// otherwise, it blocks
|
|
else {
|
|
m_blocked++;
|
|
log("xmldoc: blockedout=%" INT32 " slotj=%" INT32 " "
|
|
"(this=0x%" PTRFMT ",xd=0x%" PTRFMT ")",
|
|
m_blocked,j,(PTRTYPE)this,(PTRTYPE)xd);
|
|
m_used[j] = true;
|
|
}
|
|
}
|
|
|
|
// return true if all done
|
|
return (m_blocked == 0);
|
|
}
|
|
|
|
void XmlDoc::doneInjecting ( XmlDoc *xd ) {
|
|
// find it in our list
|
|
int32_t i;
|
|
for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
|
|
if ( ! m_used[i] ) continue;
|
|
if ( m_xmlDocs[i] != xd ) continue;
|
|
break;
|
|
}
|
|
// core dump if not found in our list; it must be there
|
|
if ( i >= MAX_XML_DOCS ) { char *xx=NULL;*xx=0; }
|
|
// free it up now!
|
|
m_used[i] = 0;
|
|
// free it up
|
|
//mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
|
|
//delete ( m_xmlDocs[i] );
|
|
//m_xmlDocs[i] = NULL;
|
|
m_xmlDocs[i]->reset();
|
|
// uncount it as being outstanding
|
|
m_blocked--;
|
|
// log debug
|
|
log("xmldoc: blockedin=%" INT32 " (this=0x%" PTRFMT ")",
|
|
m_blocked,(PTRTYPE)this);
|
|
// return if still blocked
|
|
if ( ! doInjectLoop() ) return;
|
|
// log debug
|
|
log("xmldoc: final callback");
|
|
// ok, all have been indexed
|
|
m_finalCallback ( m_finalState );
|
|
}
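// injectLinks()/doInjectLoop()/doneInjecting() above follow a common
// async pattern: a fixed pool of worker slots plus a count of
// outstanding operations. Each blocking inject marks a slot used and
// bumps the count; each completion frees its slot, decrements the count
// and re-enters the loop; the final callback only fires once the loop
// finishes with nothing outstanding. A compact sketch of the pattern
// with hypothetical names (kSlots plays the role of MAX_XML_DOCS,
// outstanding the role of m_blocked):
/*
#include <cstdint>
struct InjectPool {
	static const int32_t kSlots = 10;
	bool    used [ kSlots ] = {};
	int32_t outstanding     = 0;
	void  (*finalCallback)(void *state);
	void   *finalState;

	// returns a free slot or -1 if all slots are busy (caller waits)
	int32_t getFreeSlot ( ) {
		for ( int32_t i = 0 ; i < kSlots ; i++ )
			if ( ! used[i] ) return i;
		return -1;
	}
	// called when the async operation in slot i completes.
	// pumpLoop() returns true only when the loop is done and nothing
	// is outstanding, mirroring doInjectLoop()'s return value.
	void onDone ( int32_t i , bool (*pumpLoop)(void *) , void *loopState ) {
		used[i] = false;
		outstanding--;
		if ( ! pumpLoop ( loopState ) ) return;
		finalCallback ( finalState );
	}
};
*/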
|
|
|
|
bool XmlDoc::injectAhrefsLinks ( ) {
|
|
|
|
setStatus("get inlinks from ahrefs.com");
|
|
|
|
// skip for now
|
|
//return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
|
|
// make the ahrefs urls
|
|
try { m_ahrefsDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return true;
|
|
}
|
|
mnew ( m_ahrefsDoc , sizeof(XmlDoc),"xmldocah");
|
|
// make the url
|
|
SafeBuf ubuf;
|
|
// turn count down to 10 for now
|
|
ubuf.safePrintf("http://api.ahrefs.com/get_backlinks.php?count=350&mode=exact&output=xml&AhrefsKey=0452f27fd5a7fec5e9702e23ba4af223&target=");
|
|
//ubuf.safePrintf("http://www.gigablast.com/?q=poo&u=");
|
|
ubuf.urlEncode (m_firstUrl.getUrl() );
|
|
Url url;
|
|
url.set ( ubuf.getBufStart() );
|
|
char *up = url.getUrl();
|
|
// set by url i guess
|
|
SpiderRequest sreq;
|
|
sreq.reset();
|
|
strcpy(sreq.m_url,up);
|
|
// parentdocid of 0
|
|
int32_t firstIp = hash32n(up);
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
sreq.setKey( firstIp,0LL, false );
|
|
sreq.m_isInjecting = 1;
|
|
sreq.m_isPageInject = 1;
|
|
sreq.m_hopCount = 0;//hopCount;
|
|
sreq.m_hopCountValid = 1;
|
|
sreq.m_fakeFirstIp = 1;
|
|
sreq.m_firstIp = firstIp;
|
|
// shortcut
|
|
XmlDoc *ah = m_ahrefsDoc;
|
|
|
|
// . use the enormous power of our new XmlDoc class
|
|
// . this returns false with g_errno set on error
|
|
if ( ! ah->set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// give it a niceness of 1, we have to be
|
|
// careful since we are a niceness of 0!!!!
|
|
m_niceness, // 1 ,
|
|
// inject this content
|
|
NULL, // content ,
|
|
false, // deleteFromIndex ,
|
|
0, // forcedIp ,
|
|
0, // contentType ,
|
|
0, // lastSpidered ,
|
|
false )) { // hasMime
|
|
log("xmldoc: ahref doc error %s",mstrerror(g_errno));
|
|
// g_errno should be set if that returned false
|
|
return true;
|
|
}
|
|
// do not re-call the set
|
|
//m_needsSet = false;
|
|
// make this our callback in case something blocks
|
|
//ah->setCallback ( state , callback );
|
|
// do not re-lookup the robots.txt
|
|
ah->m_isAllowed = true;
|
|
ah->m_isAllowedValid = true;
|
|
ah->m_crawlDelay = -1; // unknown
|
|
ah->m_crawlDelayValid = true;
|
|
|
|
ah->m_downloadLevel = m_downloadLevel + 1;
|
|
|
|
// reset domain table for deduping ahrefs' links by domain
|
|
// before injecting them... only inject one per domain
|
|
if ( ! m_domDedupTablePtr ) {
|
|
m_domDedupTable.set(4,0,512,NULL,0,false,m_niceness,"dmtab2");
|
|
m_domDedupTablePtr = &m_domDedupTable;
|
|
}
|
|
|
|
// log it now
|
|
//log("inject: indexing injected doc %s",url);
|
|
|
|
// if we are a url like api.ahrefs.com/get_backlinks... then
|
|
// our links can use our table for deduping based on domain, AND
|
|
// they can use our link dedup table in case one outlink is also
|
|
// a search result on google's page...
|
|
if ( ! ah->injectLinks ( m_linkDedupTablePtr,
|
|
m_domDedupTablePtr,
|
|
m_masterState ,
|
|
m_masterLoop ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
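// The ahrefs API url above embeds our page url as a query parameter, so
// it must be percent-encoded first (SafeBuf::urlEncode() above does the
// real work). A standalone sketch of that encoding step, illustrative
// only:
/*
#include <string>
#include <cstdio>
#include <cctype>
static std::string percentEncode ( const std::string &s ) {
	std::string out;
	for ( unsigned char c : s ) {
		// unreserved characters pass through untouched
		if ( isalnum(c) || c=='-' || c=='_' || c=='.' || c=='~' )
			out += (char)c;
		else {
			char buf[4];
			snprintf ( buf , sizeof(buf) , "%%%02X" , (unsigned)c );
			out += buf;
		}
	}
	return out;
}
*/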
|
|
|
|
|
|
bool XmlDoc::printSerpFiltered ( Section *sx , char *tagName ) {
|
|
//int64_t *wids = m_words.getWordIds();
|
|
char **wptrs = m_words.getWords();
|
|
int32_t *wlens = m_words.getWordLens();
|
|
int32_t fa = sx->m_firstWordPos;
|
|
nodeid_t *tids = m_words.getTagIds();
|
|
if ( fa > 0 && tids[fa-1] == TAG_B ) fa--;
|
|
if ( fa > 0 && tids[fa-1] == TAG_STRONG ) fa--;
|
|
int32_t la = sx->m_b;
|
|
int32_t nw = m_words.getNumWords();
|
|
if ( la+1 < nw && tids[la+1] == (TAG_B|BACKBIT) ) la++;
|
|
if ( la+1 < nw && tids[la+1] == (TAG_STRONG|BACKBIT) ) la++;
|
|
|
|
// advance la even more if regular words or br tags or b or strong tags
|
|
for ( ; la < nw ; la++ ) {
|
|
if ( ! tids[la] ) continue;
|
|
if ( (tids[la]&BACKBITCOMP) == TAG_BR ) continue;
|
|
if ( (tids[la]&BACKBITCOMP) == TAG_STRONG ) continue;
|
|
if ( tids[la] == TAG_BR ) continue;
|
|
break;
|
|
}
|
|
|
|
m_serpBuf.safePrintf("\t\t\t<%s>",tagName);
|
|
// cdata!
|
|
m_serpBuf.safePrintf("<![CDATA[");
|
|
// subtract 1 from sx->m_b to avoid ending tag
|
|
for ( int32_t i = fa ; i < la ; i++ ) {
|
|
// skip if br
|
|
if ( tids[i] == TAG_BR ) continue;
|
|
m_serpBuf.cdataEncode ( wptrs[i] , wlens[i] );
|
|
}
|
|
// cdata!
|
|
m_serpBuf.safePrintf("]]>");
|
|
m_serpBuf.safePrintf("</%s>\n",tagName);
|
|
return true;
|
|
}
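// The title/summary text above is wrapped in <![CDATA[...]]> so scraped
// markup cannot break the surrounding XML. The one sequence a CDATA
// section may not contain is "]]>", and the usual fix (roughly what
// cdataEncode() has to guard against) is to split it across two
// sections. A standalone sketch:
/*
#include <string>
static std::string cdataEscape ( const std::string &s ) {
	std::string out;
	size_t last = 0 , pos;
	while ( (pos = s.find ( "]]>" , last )) != std::string::npos ) {
		out += s.substr ( last , pos - last );
		out += "]]]]><![CDATA[>"; // close, re-open, keep the '>'
		last = pos + 3;
	}
	out += s.substr ( last );
	return out; // caller wraps the result in <![CDATA[ ... ]]>
}
*/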
|
|
|
|
//////////
|
|
//
|
|
// BEGIN NEW SEO MATCHING QUERIES TOOL CODE
|
|
//
|
|
//////////
|
|
|
|
|
|
static void loadTitleRecFromDiskOrSpiderWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
if ( ! THIS->loadTitleRecFromDiskOrSpider() ) return;
|
|
THIS->m_callback1 ( THIS->m_state );
|
|
}
|
|
|
|
// . if we can't load titlerec from titledb, spider it, index it and
|
|
// use that new titlerec
|
|
// . returns false if blocks
|
|
// . returns true and sets g_errno on error
|
|
bool XmlDoc::loadTitleRecFromDiskOrSpider() {
|
|
|
|
if ( ! m_masterLoop ) {
|
|
m_masterState = this;
|
|
m_masterLoop = loadTitleRecFromDiskOrSpiderWrapper;
|
|
}
|
|
|
|
// fix a core when getTermListBuf() calls getMetaList()
|
|
// which calls getNewSpiderReply() which calls
|
|
// getDownloadEndTime() and tries to download the page
|
|
// even though we have a valid titlerec!
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTimeValid = true;
|
|
m_downloadEndTime = 0;
|
|
}
|
|
|
|
// . try to recycle the content first
|
|
// . try to load it from title rec first
|
|
// . we have to do this otherwise our ptr_linkInfo link texts
|
|
// will be somewhat random and cause us to get different scores
|
|
// for the queries we match!!
|
|
// . so do this not just for speed, but to be consistent.
|
|
if ( ! loadFromOldTitleRec() ) return false;
|
|
|
|
// did that fail? i.e. not found!?!?! ignore and just index it
|
|
if ( m_oldTitleRecValid && m_oldTitleRec )
|
|
return true;
|
|
|
|
// ok, we gotta index it
|
|
if ( ! m_loggedMsg3 ) {
|
|
m_loggedMsg3 = true;
|
|
log("xmldoc: url %s not in titledb, spidering and indexing",
|
|
m_firstUrl.m_url);
|
|
}
|
|
|
|
// clear that
|
|
g_errno = 0;
|
|
|
|
// turn off recycling i guess since we don't have it
|
|
m_recycleContent = false;
|
|
|
|
// first index it, but only if not already indexed
|
|
// did it block?
|
|
// error indexing doc? indexCode should be set then
|
|
if ( ! indexDoc() ) return false;
|
|
|
|
// no blocking
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
void getSEOQueryInfoWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// note it
|
|
THIS->setStatus ( "seoqueryinfowrapper" );
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in seo query info wrapper" );
|
|
// return if it blocked
|
|
if ( THIS->getSEOQueryInfo( ) == (void *)-1 ) return;
|
|
// print any error
|
|
if ( g_errno )
|
|
log("seopipe: getSeoQueryInfo error: %s",mstrerror(g_errno));
|
|
// all done
|
|
else
|
|
log("seopipe: getSeoQueryInfo is done");
|
|
// show timing info
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - THIS->m_beginSEOTime;
|
|
log("seopipe: time: getSeoQueryInfo took %" INT64 "ms",took);
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
return;
|
|
}
|
|
|
|
void getSEOQueryInfoWrapper2 ( int fd , void *state ) {
|
|
// just pump! otherwise we might re-launch a msg3a request while
|
|
// one is outstanding causing a core in Multicast::reset()
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// debug log
|
|
THIS->setStatus ("getseoqueryinfowrapper2");
|
|
// if we are waiting just on the pump i guess we are all done!
|
|
if ( ! THIS->m_socketWriteBufValid ) {
|
|
log("seopipe: pumping socket");
|
|
THIS->pumpSocketWriteBuf();
|
|
return;
|
|
}
|
|
// not pumping?
|
|
log("seopipe: pumping socket ready wrapper");
|
|
// otherwise, let it call the callback
|
|
getSEOQueryInfoWrapper ( state );
|
|
}
|
|
|
|
// . return safebuf of xml containing matching and related queries and
|
|
// related urls/titles
|
|
// . this transmits the xml as it generates it to "m_seoSocket" if non-null
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . stores the xml in the m_socketWriteBuf SafeBuf
|
|
// . will keep blocking (returning -1) until the xml is delivered to socket
|
|
// if it is non-NULL
|
|
SafeBuf *XmlDoc::getSEOQueryInfo ( ) {
|
|
|
|
setStatus ( "seo query info" );
|
|
|
|
// only set to valid once it has been all written out!!
|
|
if ( m_socketWriteBufValid ) {
|
|
// all done?
|
|
if ( ! m_seoSocket ) return &m_socketWriteBuf;
|
|
// pump
|
|
pumpSocketWriteBuf();
|
|
// if socket not done being pumped... we block. it's
|
|
// ready wrappers should re-call our wrapper.
|
|
if ( m_socketWriteBufSent >= m_socketWriteBuf.length() )
|
|
return &m_socketWriteBuf;
|
|
// wait for write to finish
|
|
return (SafeBuf *)-1;
|
|
}
|
|
|
|
// the g_errno could be a title rec not found reply coming back
|
|
// so do not process that here! it needs to be processed
|
|
// by the function whose request resulted in an error reply.
|
|
// for instance, the getTitle() call below needs to set g_errno
|
|
// when we call it now, responding to its msg22 reply.
|
|
//if ( g_errno ) return NULL;
|
|
|
|
// a good place to init stuff we need here
|
|
if ( ! m_masterState ) {
|
|
m_printedQueries = false;
|
|
m_printedRelatedDocIds = false;
|
|
m_printedRelatedQueries = false;
|
|
m_printedRecommendedLinks = false;
|
|
m_printedScoredInsertableTerms = false;
|
|
//m_docIndexed = false;
|
|
// time it
|
|
m_beginSEOTime = gettimeofdayInMilliseconds();
|
|
// for our m_masterLoop function, it uses this as the state
|
|
m_masterState = this;
|
|
// this is a main entry point function so anything that blocks
|
|
// should re-call this function
|
|
m_masterLoop = getSEOQueryInfoWrapper;
|
|
// assume indexed
|
|
m_docIndexed = true;
|
|
// fix a core when getTermListBuf() calls getMetaList()
|
|
// which calls getNewSpiderReply() which calls
|
|
// getDownloadEndTime() and tries to download the page
|
|
// even though we have a valid titlerec!
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTimeValid = true;
|
|
m_downloadEndTime = 0;
|
|
}
|
|
}
|
|
|
|
// . try to load it from title rec first
|
|
// . we have to do this otherwise our ptr_linkInfo link texts
|
|
// will be somewhat random and cause us to get different scores
|
|
// for the queries we match!!
|
|
// . so do this not just for speed, but to be consistent.
|
|
if ( m_recycleContent && ! loadFromOldTitleRec()) return (SafeBuf *)-1;
|
|
|
|
// did that fail? i.e. not found!?!?! ignore and just index it
|
|
if ( m_oldTitleRecValid && ! m_oldTitleRec && m_recycleContent ) {
|
|
// just skip this url then
|
|
log("xmldoc: url %s load3 failed",m_firstUrl.m_url);
|
|
// clear that
|
|
g_errno = 0;
|
|
// need to index it
|
|
m_docIndexed = false;
|
|
}
|
|
|
|
// first index it, but only if not already indexed
|
|
if ( ! m_docIndexed ) {
|
|
// turn off recycling i guess since we don't have it
|
|
m_recycleContent = false;
|
|
// did it block?
|
|
// error indexing doc? indexCode should be set then
|
|
if ( ! indexDoc() ) return (SafeBuf *)-1;
|
|
// do not re-call
|
|
m_docIndexed = true;
|
|
}
|
|
|
|
|
|
// was indexing successful?
|
|
int32_t *indexCode = getIndexCode();
|
|
if ( ! indexCode || indexCode == (void *)-1 )
|
|
return (SafeBuf *)indexCode;
|
|
|
|
// if not successfully indexed send back error msg
|
|
if ( *indexCode && m_seoSocket ) {
|
|
m_socketWriteBuf.safePrintf(
|
|
"\t<errorMsg><![CDATA[%s]]>"
|
|
"</errorMsg>\n"
|
|
"</response>"
|
|
, mstrerror(*indexCode) );
|
|
// send on socket
|
|
pumpSocketWriteBuf();
|
|
// if socket not done being pumped... we block
|
|
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
|
|
return (SafeBuf *)-1;
|
|
// otherwise, we are done sending
|
|
return &m_socketWriteBuf;
|
|
}
|
|
|
|
|
|
// seo.cpp needs this in printDupSentences
|
|
Sections *sections = getSectionsWithDupStats();
|
|
if ( ! sections || sections == (void *)-1) return (SafeBuf *)sections;
|
|
|
|
// seo.cpp needs this now when it calls getSiteRank()
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (void *)-1 ) return (SafeBuf *)sni;
|
|
|
|
// . find all logged queries that this document matches
|
|
// . this will launch msg99 requests to each host in the network
|
|
// . then it scores them
|
|
// . don't worry about sending back in real-time for this since it
|
|
// should be fast
|
|
SafeBuf *qpbuf = getMatchingQueriesScored();
|
|
if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
|
|
|
|
// . how many queries do we have that match this url?
|
|
// . they should be sorted by our url's score
|
|
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
|
|
// shortcut
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
// cast the msg99 reply ptrs, i.e. query ptrs
|
|
Msg99Reply **queryPtrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
|
|
// store each one as xml then into m_headerBuf99
|
|
if ( ! m_printedQueries && m_seoSocket ) {
|
|
m_printedQueries = true;
|
|
// do not flood the socket! so limit to 1000 queries
|
|
// they should be sorted by queryImportance!
|
|
// cheatcodes.com has like 50,000 matching queries.
|
|
int32_t max = numQueryPtrs;
|
|
if ( max > 1000 ) max = 1000;
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
// shortcut
|
|
Msg99Reply *qp = queryPtrs[i];
|
|
// sometimes queries like 'gallery-view' are
|
|
// hard-phrased and do not show up for us, so skip.
|
|
// they should be at the very end so we should be
|
|
// trimming the tail for them, so don't worry about
|
|
// <queryNum> having holes in it.
|
|
if ( qp->m_myDocId == 0LL && qp->m_myScore == 0.0 )
|
|
continue;
|
|
// shortcut
|
|
QueryLogEntry *qe = &qp->m_queryLogEntry;
|
|
sb->safePrintf("\t<seoQuery>\n"
|
|
"\t\t<queryNum>%" INT32 "</queryNum>\n"
|
|
"\t\t<query><![CDATA[%s]]></query>\n"
|
|
"\t\t<queryTrafficPerDay>%" INT32 ""
|
|
"</queryTrafficPerDay>\n"
|
|
// our url's score
|
|
"\t\t<myDocId>%" INT64 "</myDocId>\n"
|
|
"\t\t<myScore>%f</myScore>\n"
|
|
//"\t\t<mySiteHash32>%" UINT32 ""
|
|
//"</mySiteHash32>\n"
|
|
"\t\t<queryImportance>%f"
|
|
"</queryImportance>\n"
|
|
|
|
|
|
"\t</seoQuery>\n"
|
|
, i
|
|
, qp->m_queryStr
|
|
// x 10 to estimate google?
|
|
, qe->m_gigablastTraffic *
|
|
GB_TRAFFIC_MODIFIER
|
|
, qp->m_myDocId
|
|
, qp->m_myScore
|
|
//, qp->m_mySiteHash32
|
|
, qp->m_queryImportance
|
|
//,qp->m_queryInfo.m_numUniqueWordForms
|
|
//,qp->m_queryInfo.m_numRepeatWordForms
|
|
//qp->m_queryInfo.m_smallestNormTermFreq
|
|
);
|
|
}
|
|
}
|
|
|
|
// pump it some. i.e. send m_socketWriteBuf contents back to
|
|
// m_seoSocket if it is non-NULL
|
|
pumpSocketWriteBuf();
|
|
|
|
// . now instead try getting the top "imax" queries scored on the
|
|
// whole index
|
|
// . transmit them back on m_seoSocket AS WE GET THEM by calling
|
|
// pumpSocketWriteBuf() function and storing into m_socketWriteBuf
|
|
//qpbuf = getMatchingQueriesScoredForFullQuery ( );
|
|
//if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
|
|
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
// how many related docids do we have?
|
|
int32_t nr = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
//
|
|
// print out the related urls
|
|
//
|
|
if ( ! m_printedRelatedDocIds && nr && m_seoSocket ) {
|
|
m_printedRelatedDocIds = true;
|
|
int32_t max = 200; // m_maxRelatedUrls;
|
|
if ( max == -1 ) max = nr;
|
|
if ( nr < max ) max = nr;
|
|
sb->safePrintf("\t<relatedUrls>\n");
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
RelatedDocId *rd = &rds[i];
|
|
// fix for titlerec not found errors
|
|
char *title = rd->ptr_rd_title;
|
|
char *url = rd->ptr_rd_url;
|
|
if ( ! title ) title = "";
|
|
if ( ! url ) url = "";
|
|
// print it out
|
|
sb->safePrintf("\t\t<relatedUrl>\n"
|
|
"\t\t\t<urlNum>%" INT32 "</urlNum>\n"
|
|
"\t\t\t<url><![CDATA[%s]]></url>\n"
|
|
"\t\t\t<docId>%" INT64 "</docId>\n"
|
|
"\t\t\t<siteHash32>%" UINT32 "</siteHash32>\n"
|
|
"\t\t\t<title><![CDATA["
|
|
, i
|
|
, url
|
|
, rd->m_docId
|
|
, rd->m_siteHash32
|
|
);
|
|
// encode CDATA stuff in title
|
|
sb->cdataEncode(title);
|
|
sb->safePrintf("]]></title>\n"
|
|
"\t\t\t<queriesInCommon>%" INT32 ""
|
|
"</queriesInCommon>\n"
|
|
"\t\t\t<similarityScore>%f"
|
|
"</similarityScore>\n"
|
|
, rd->m_numCommonQueries
|
|
, rd->m_dotProduct // similarityScore
|
|
);
|
|
// print the actually querynums in common
|
|
int32_t firstOff = rd->m_firstCommonQueryNumOff;
|
|
int32_t offset = firstOff;
|
|
sb->safePrintf("\t\t\t<queriesInCommon>\n");
|
|
for ( ; offset >= 0 ; ) {
|
|
// get that node
|
|
char *buf = m_commonQueryNumBuf.getBufStart();
|
|
// and offset
|
|
buf += offset;
|
|
// then cast
|
|
QueryNumLinkedNode *qn;
|
|
qn = (QueryNumLinkedNode *)buf;
|
|
// print that
|
|
sb->safePrintf("\t\t\t\t<queryNum>%" INT32 ""
|
|
"</queryNum>\n"
|
|
, qn->m_queryNum );
|
|
// advance. will be -1 when done
|
|
offset = qn->m_nextOff;
|
|
}
|
|
sb->safePrintf("\t\t\t</queriesInCommon>\n");
|
|
sb->safePrintf("\t\t</relatedUrl>\n");
|
|
}
|
|
sb->safePrintf("\t</relatedUrls>\n");
|
|
}
|
|
|
|
|
|
//
|
|
// recommended inlinks!
|
|
//
|
|
|
|
// pump it some. i.e. send m_socketWriteBuf contents back to
|
|
// m_seoSocket if it is non-NULL
|
|
pumpSocketWriteBuf();
|
|
|
|
SafeBuf *kbuf = getRecommendedLinksBuf();
|
|
if ( ! kbuf || kbuf == (void *)-1 ) return kbuf;
|
|
|
|
// print out the recommended links in xml
|
|
if ( ! m_printedRecommendedLinks && m_seoSocket ) {
|
|
sb->safePrintf("\t<recommendedLinks>\n");
|
|
char *p = kbuf->getBufStart();
|
|
char *pend = kbuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
// cast it
|
|
RecommendedLink *ri = (RecommendedLink *)p;
|
|
// skip it
|
|
p += ri->getSize();
|
|
// print it out
|
|
sb->safePrintf("\t\t<link>\n"
|
|
"\t\t\t<url><![CDATA[%s]]></url>\n"
|
|
"\t\t\t<title><![CDATA[%s]]></title>\n"
|
|
"\t\t\t<score>%f</score>\n"
|
|
"\t\t\t<siteRank>%" INT32 "</siteRanke>\n"
|
|
,ri->getUrl(kbuf)
|
|
,ri->getTitle(kbuf)
|
|
,ri->m_totalRecommendedScore
|
|
,(int32_t)ri->m_siteRank
|
|
);
|
|
}
|
|
sb->safePrintf("\t</recommendedLinks>\n");
|
|
m_printedRecommendedLinks = true;
|
|
}
|
|
|
|
|
|
//
|
|
// related queries
|
|
//
|
|
|
|
// write out
|
|
pumpSocketWriteBuf();
|
|
|
|
|
|
SafeBuf *relBuf = getRelatedQueryBuf();
|
|
if ( ! relBuf || relBuf == (void *)-1 ) return relBuf;
|
|
QueryRel **rels = (QueryRel **)relBuf->getBufStart();
|
|
int32_t numRels = relBuf->length() / sizeof(QueryRel *);
|
|
|
|
//
|
|
// print out the related queries
|
|
//
|
|
if ( ! m_printedRelatedQueries && numRels && m_seoSocket ) {
|
|
sb->safePrintf("\t<relatedQueries>\n");
|
|
int32_t max = 200; // m_maxRelatedQueries;
|
|
if ( max == -1 ) max = numRels;
|
|
if ( numRels < max ) max = numRels;
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
QueryRel *rel = rels[i];
|
|
// must be a first!
|
|
if ( ! rel->m_isFirst ) { char *xx=NULL;*xx=0; }
|
|
// shortcut
|
|
//QueryInfo *qi = &rel->m_queryInfo;
|
|
// print it out
|
|
sb->safePrintf("\t\t<relatedQuery>\n"
|
|
"\t\t\t<query><![CDATA[%s]]></query>\n"
|
|
"\t\t\t<relatedDocIdsInCommon>%" INT32 ""
|
|
"</relatedDocIdsInCommon>\n"
|
|
|
|
"\t\t\t<relatedQueryImportance>%f"
|
|
"</relatedQueryImportance>\n"
|
|
|
|
//"\t</relatedUrl>\n"
|
|
, rel->m_queryStr
|
|
, rel->m_docIdVotes
|
|
|
|
//, qi->m_numUniqueWordForms
|
|
//, qi->m_numRepeatWordForms
|
|
//, qi->m_smallestNormTermFreq
|
|
|
|
, rel->m_totalRelatedQueryImportance
|
|
//, qi->m_myScoreRelated
|
|
);
|
|
// print details!
|
|
sb->safePrintf("\t\t\t<matchingDocIds>\n");
|
|
// linked list of Msg99Replies for the related queries.
|
|
// all in linked list are for the same query but
|
|
// restricted to a different docid!
|
|
for ( ; rel ; rel = rel->m_next ) {
|
|
// get his related docid
|
|
RelatedDocId *rd = rel->m_relatedDocId;
|
|
// print that
|
|
sb->safePrintf("\t\t\t\t<match>\n"
|
|
"\t\t\t\t\t<relatedDocId>%" INT64 ""
|
|
"</relatedDocId>\n"
|
|
"\t\t\t\t\t<siteHash32>%" UINT32 ""
|
|
"</siteHash32>\n"
|
|
//"\t\t\t\t\t"
|
|
//"<queryImportance>%f"
|
|
//"</queryImportance>\n"
|
|
"\t\t\t\t\t<docIdSimilarity>%f"
|
|
"</docIdSimilarity>\n"
|
|
"\t\t\t\t\t<docIdScore>%f"
|
|
"</docIdScore>\n"
|
|
"\t\t\t\t</match>\n"
|
|
, rd->m_docId
|
|
, rd->m_siteHash32
|
|
//, rd->m_similarityScore
|
|
, rd->m_dotProduct
|
|
, rel->m_myScore
|
|
);
|
|
}
|
|
sb->safePrintf("\t\t\t</matchingDocIds>\n");
|
|
sb->safePrintf("\t\t</relatedQuery>\n");
|
|
|
|
}
|
|
sb->safePrintf("\t</relatedQueries>\n");
|
|
m_printedRelatedQueries = true;
|
|
}
|
|
|
|
|
|
// write out
|
|
pumpSocketWriteBuf();
|
|
|
|
// this is the Keyword Insertion Tool data (KIT data)
|
|
SafeBuf *sits = getScoredInsertableTerms();
|
|
if ( ! sits || sits == (void *)-1 ) return sits;
|
|
|
|
// try to store into cachedb in case user clicks a different
|
|
// insertable term and we have to update the wordposinfo::m_rankChange
|
|
// stuff in the html src display
|
|
//if ( ! storeIntoCachedb() )
|
|
// // return -1 if it blocked and wait for store to complete
|
|
// return (SafeBuf *)-1;
|
|
|
|
|
|
// print out query changes
|
|
if ( ! m_printedScoredInsertableTerms && m_seoSocket ) {
|
|
// dump out each insertable term and its corresponding
|
|
// QueryChanges
|
|
if ( ! printScoredInsertableTerms ( sb ) )
|
|
return NULL;
|
|
m_printedScoredInsertableTerms = true;
|
|
// end of xml response?
|
|
sb->safePrintf("</response>\n");
|
|
}
|
|
|
|
// even if not fully pumped, set it to valid here
|
|
m_socketWriteBufValid = true;
|
|
|
|
if ( ! m_seoSocket ) return &m_socketWriteBuf;
|
|
|
|
// write out
|
|
pumpSocketWriteBuf();
|
|
|
|
// if socket not done being pumped... we block
|
|
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
|
|
return (SafeBuf *)-1;
|
|
|
|
// ok, we are done
|
|
return &m_socketWriteBuf;
|
|
}
|
|
*/
|
|
|
|
// have the smallest twids on top!
|
|
int twidcmp ( const void *a, const void *b ) {
|
|
TermInfo *ua = (TermInfo *)a;
|
|
TermInfo *ub = (TermInfo *)b;
|
|
//uint32_t ua = *(uint32_t *)a;
|
|
//uint32_t ub = *(uint32_t *)b;
|
|
// HACKY: sort by lower 32 bits of the 64 bit termids so
|
|
// seo.cpp can use them with its QueryLogEntries which use 32 bit
|
|
// termids to save mem.
|
|
uint32_t ta = (uint32_t)ua->m_termId64;
|
|
uint32_t tb = (uint32_t)ub->m_termId64;
|
|
// lower first
|
|
if ( ta > tb ) return 1; // swap
|
|
if ( ta < tb ) return -1;
|
|
return 0;
|
|
}
|
|
|
|
|
|
// . 1. make a vector of the words in the title, headers, page-inlink-text,
|
|
// and site-inlink-text
|
|
//
|
|
// . 2. pass that word vector to every machine in network to see what queries
|
|
// in the query logs we match. use Msg99.cpp. it should initialize
|
|
// on startup and load in its share of the query logs. query log file
|
|
// should be sorted then sorted by filtered query then split. should also
|
|
// remove queries from the most aggressive IPs (bots). we would need
|
|
// a program, filterquerylog.cpp to do all that on gk37, our query log
|
|
// storage server. it needs to store # of times query was done, too.
|
|
// all queries should have back to back spaces removed and made lowercase.
|
|
// remove queries that have double quotes or colon operators in them.
|
|
// index each query term in the query log into HashTableX, which will
|
|
// point to the query in the buffer. then we just store the termlist
|
|
// in a SafeBuf that we save on disk. 40GB of queries split 256 ways
|
|
// is still like 175MB per server! (if one server is dead, skip it)
|
|
//
|
|
// . 3. merge all queries received from all hosts and sort by traffic.
|
|
//
|
|
// . 4. perform the queries on procog and cache the scores of the top 10
|
|
// results for each query. should be cached on machine that houses the
|
|
// query. try a 60-day cache max age.
|
|
//
|
|
// . 5. now redo the queries but with a "url:thisurl |" to get this page's
|
|
// score for each query. if the min score of the query on procog is
|
|
// well beyond our grasp, we could just skip it.
|
|
//
|
|
// . 6. then determine the # of inlinks we need to add to get more traffic
|
|
// for each query. assume siterank of 0 per inlink. if that would be
|
|
// impossible then increment the siterank until it gets us in the top 10.
|
|
//
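// Step 2 above calls for normalizing the query log before it is split
// across hosts: lowercase everything, collapse runs of spaces, and drop
// queries containing double quotes or colon operators. A standalone
// sketch of that filter (illustrative only, not the actual
// filterquerylog.cpp):
/*
#include <string>
#include <cctype>
// returns false if the query should be dropped entirely
static bool normalizeQuery ( const std::string &in , std::string *out ) {
	out->clear();
	bool lastWasSpace = false;
	for ( unsigned char c : in ) {
		if ( c == '"' || c == ':' ) return false; // operator query
		if ( isspace(c) ) {
			if ( ! lastWasSpace && ! out->empty() ) *out += ' ';
			lastWasSpace = true;
			continue;
		}
		lastWasSpace = false;
		*out += (char)tolower(c);
	}
	// strip a single trailing space left by the loop
	if ( ! out->empty() && out->back() == ' ' ) out->pop_back();
	return true;
}
*/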
|
|
|
|
|
|
// just use getTopTermsVector
|
|
HashTableX *XmlDoc::getTermIdBufDedupTable32 ( ) {
|
|
SafeBuf *tiBuf = getTermInfoBuf();
|
|
if ( ! tiBuf || tiBuf == (void *)-1 ) return (HashTableX *)tiBuf;
|
|
return &m_tidTable32;
|
|
}
|
|
|
|
// . used by handleRequest8e() which uses msg20::getSummary() with
|
|
// m_getTermListBuf to call this in the local host msg20 handler.
|
|
// . this buf is used to determine what queries this document matches
|
|
SafeBuf *XmlDoc::getTermId32Buf() {
|
|
|
|
if ( m_termId32BufValid )
|
|
return &m_termId32Buf;
|
|
|
|
SafeBuf *tiBuf = getTermInfoBuf ();
|
|
if ( ! tiBuf || tiBuf == (void *) -1 ) return tiBuf;
|
|
|
|
int32_t need = 4 * (tiBuf->length() / sizeof(TermInfo));
|
|
if ( ! m_termId32Buf.reserve(need) ) return NULL;
|
|
|
|
// scan those
|
|
char *p = tiBuf->getBufStart();
|
|
char *pend = tiBuf->getBuf();
|
|
uint32_t last = 0;
|
|
for ( ; p < pend ; ) {
|
|
TermInfo *ti = (TermInfo *)p;
|
|
p += sizeof(TermInfo);
|
|
uint32_t tid32 = (uint32_t)(ti->m_termId64);
|
|
m_termId32Buf.pushLong(tid32);
|
|
// sanity
|
|
if ( last && tid32 <= last ) { char *xx=NULL;*xx=0; }
|
|
last = tid32;
|
|
}
|
|
|
|
m_termId32BufValid = true;
|
|
return &m_termId32Buf;
|
|
}
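// getTermId32Buf() emits the termids as an ascending array of 32-bit
// values so the matching-queries code can intersect them against a
// query's (also ascending) 32-bit termids with one linear merge rather
// than a hash lookup per term. A sketch of that intersection (the real
// compare lives in seo.cpp's handleRequest8e()):
/*
#include <cstdint>
// returns how many termids the two sorted lists share
static int32_t countCommonTermIds ( const uint32_t *a , int32_t na ,
				    const uint32_t *b , int32_t nb ) {
	int32_t i = 0 , j = 0 , common = 0;
	while ( i < na && j < nb ) {
		if      ( a[i] < b[j] ) i++;
		else if ( a[i] > b[j] ) j++;
		else { common++; i++; j++; }
	}
	return common;
}
*/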
|
|
|
|
// . used by getTermId32Buf() for getting this document's matching queries
|
|
// . serialize the words in the title and inlink text into a vector
|
|
// . SafeBuf is filled with class TermInfos! defined in seo.h. currently
|
|
// just an int64_t m_termId64 though!
|
|
// . get synonyms of each word too!
|
|
// . we sort them by the 32-bit termid so handleRequest8e() can do its fast
|
|
// compare algo to find matching queries which are also sorted by the lower
|
|
// 32 bits of terms in the query.
|
|
SafeBuf *XmlDoc::getTermInfoBuf ( ) {
|
|
|
|
setStatus ( "getterminfobuf" );
|
|
|
|
if ( m_termInfoBufValid ) return &m_termInfoBuf;
|
|
|
|
bool includeSynonyms = true;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (SafeBuf *)langId;
|
|
|
|
|
|
if (!m_tidTable32.set(4,0,16384,NULL,0,false,m_niceness,"twidtabl"))
|
|
return NULL;
|
|
|
|
//
|
|
// add document body words now to m_twbuf
|
|
//
|
|
|
|
if ( ! addUniqueWordsToBuf ( &m_termInfoBuf ,
|
|
&m_tidTable32 , // dedup table
|
|
NULL, // filter table
|
|
NULL, // mincounttable
|
|
false ,
|
|
ww ,
|
|
includeSynonyms) )
|
|
return NULL;
|
|
|
|
//
|
|
// store count of each term we hash after this into "TMP"
|
|
//
|
|
HashTableX TMP;
|
|
if(!TMP.set(4,4,4096,NULL,0,false,m_niceness,"tmttt") )
|
|
return NULL;
|
|
|
|
//
|
|
// hash meta desc into TMP table
|
|
//
|
|
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription( &mdlen );
|
|
if ( md ) {
|
|
Words ww3;
|
|
ww3.setx ( md , mdlen , m_niceness );
|
|
if (!addUniqueWordsToBuf(NULL,
|
|
NULL , // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable
|
|
true, // store counts?
|
|
&ww3,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
//
|
|
// hash meta keywords into TMP table
|
|
//
|
|
|
|
int32_t mklen;
|
|
char *mk = getMetaKeywords( &mklen );
|
|
if ( mk ) {
|
|
Words ww4;
|
|
ww4.setx ( mk , mklen , m_niceness );
|
|
if (!addUniqueWordsToBuf(NULL,
|
|
NULL, // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable
|
|
true, // store counts?
|
|
&ww4,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
//
|
|
// hash each link text into TMP table
|
|
//
|
|
|
|
// loop over every link text to this page
|
|
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the link text
|
|
if ( k->size_linkText <= 1 ) continue;
|
|
// set Url
|
|
Url u;
|
|
u.set ( k->getUrl() , k->size_urlBuf );
|
|
// do not allow anomalous link text to match query
|
|
//if ( k->m_isAnomaly ) continue;
|
|
char *p = k-> getLinkText();
|
|
int32_t plen = k->size_linkText - 1;
|
|
if ( ! verifyUtf8 ( p , plen ) ) {
|
|
log("title: set4 bad link text from url=%s",
|
|
k->getUrl());
|
|
continue;
|
|
}
|
|
// debug
|
|
//log("seo: counttable for link text '%s'",k->getLinkText());
|
|
// now the words.
|
|
Words ww2;
|
|
if ( ! ww2.set ( k->getLinkText() ,
|
|
k->size_linkText-1, // len
|
|
TITLEREC_CURRENT_VERSION ,
|
|
true , // computeIds
|
|
m_niceness ))// niceness
|
|
// g_errno set on error, return NULL
|
|
return NULL;
|
|
// shortcuts on link text
|
|
if ( ! addUniqueWordsToBuf( NULL,
|
|
NULL, // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable
|
|
true, // store counts?
|
|
&ww2,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
//
|
|
// now only add link text words to the main table and buffer if the word occurs
|
|
// already in the body, or occurs TWICE in "TMP"
|
|
//
|
|
|
|
|
|
// loop over every link text to this page
|
|
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the link text
|
|
if ( k->size_linkText <= 1 ) continue;
|
|
// set Url
|
|
Url u;
|
|
u.set ( k->getUrl() , k->size_urlBuf );
|
|
// do not allow anomalous link text to match query
|
|
//if ( k->m_isAnomaly ) continue;
|
|
char *p = k-> getLinkText();
|
|
int32_t plen = k->size_linkText - 1;
|
|
if ( ! verifyUtf8 ( p , plen ) ) {
|
|
log("title: set4 bad link text from url=%s",
|
|
k->getUrl());
|
|
continue;
|
|
}
|
|
// now the words.
|
|
Words ww2;
|
|
if ( ! ww2.set ( k->getLinkText() ,
|
|
k->size_linkText-1, // len
|
|
TITLEREC_CURRENT_VERSION ,
|
|
true , // computeIds
|
|
m_niceness ))// niceness
|
|
// g_errno set on error, return NULL
|
|
return NULL;
|
|
|
|
if ( !addUniqueWordsToBuf( &m_termInfoBuf,
|
|
&m_tidTable32, // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable, >=2 counts
|
|
false, // store counts?
|
|
&ww2,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
|
|
// how many 32-bit twids do we got?
|
|
//m_numTwids = m_twbuf.length() / 4;
|
|
//m_twids = (int32_t *)m_twbuf.getBufStart();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . sort that buf now
|
|
// . HACK: only sorts by last 32 bits of termid!!!!
|
|
qsort ( m_termInfoBuf.getBufStart(),
|
|
m_termInfoBuf.length() / sizeof(TermInfo),
|
|
sizeof(TermInfo), // 32-bit twids = 4 bytes
|
|
twidcmp );
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// if no twids then return a -2 ptr, not NULL, that means error
|
|
// not -1 that means blocked!
|
|
//if ( m_numTwids == 0 ) m_twids = (int32_t *)-2;
|
|
// do not repeat this logic
|
|
//m_twidsValid = true;
|
|
m_termInfoBufValid = true;
|
|
// return the vector
|
|
return &m_termInfoBuf;
|
|
}
|
|
|
|
// . just like getTermInfoBuf but also includes terms from related queries
|
|
// that our document does not have!
|
|
// . we do it this way because for seo.cpp::handleRequest95() it finds
|
|
// matching queries locally based on getNewTermInfoBuf()'s m_newTermInfoBuf.
|
|
SafeBuf *XmlDoc::getNewTermInfoBuf ( ) {
|
|
|
|
setStatus ( "getnewterminfobuf" );
|
|
|
|
if ( m_newTermInfoBufValid ) return &m_newTermInfoBuf;
|
|
|
|
SafeBuf *oldBuf = getTermInfoBuf ();
|
|
if ( ! oldBuf || oldBuf == (void *) -1 ) return oldBuf;
|
|
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
|
|
|
|
|
|
// this should be valid automatically
|
|
HashTableX *oldDedupTable = getTermIdBufDedupTable32 ( );
|
|
|
|
|
|
// get old guy
|
|
if ( ! m_newTermInfoBuf.safeMemcpy ( oldBuf ) )
|
|
return NULL;
|
|
|
|
// a dedup table on stack
|
|
HashTableX newDedup32;
|
|
if (! newDedup32.set(4,0,16384,NULL,0,false,m_niceness,"newdtabl"))
|
|
return NULL;
|
|
|
|
// now scan the insertable terms buf
|
|
char *p = itBuf->getBufStart();
|
|
char*pend = itBuf->getBuf();
|
|
// scan each "term" which might be one or more words
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
char *term = it->getTerm();
|
|
Words ww;
|
|
ww.set9 ( term , m_niceness );
|
|
// we add entries to the dedup table, "newDedup32",
|
|
// but only filter and not add to "oldDedupTable"
|
|
if ( ! addUniqueWordsToBuf ( &m_newTermInfoBuf,
|
|
&newDedup32 , // dedup table
|
|
oldDedupTable, // filter table
|
|
NULL, // mincounttable
|
|
false,
|
|
&ww ,
|
|
true ) )
|
|
return NULL;
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . sort that buf now.
|
|
// . HACK: only sorts by last 32 bits of termid!!!!
|
|
qsort ( m_newTermInfoBuf.getBufStart(),
|
|
m_newTermInfoBuf.length() / sizeof(TermInfo),
|
|
sizeof(TermInfo), // 32-bit twids = 4 bytes
|
|
twidcmp );
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
/*
|
|
// set the term freq of each one
|
|
p = m_newTermInfoBuf.getBufStart();
|
|
pend = m_newTermInfoBuf.getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
TermInfo *ti = (TermInfo *)p;
|
|
p += sizeof(TermInfo);
|
|
// look it up
|
|
int64_t tf = g_posdb.getTermFreq (cr->m_coll,ti->m_termId64);
|
|
// store it
|
|
ti->m_termFreq64 = tf;
|
|
}
|
|
*/
|
|
|
|
// do not repeat this logic
|
|
m_newTermInfoBufValid = true;
|
|
// return the vector
|
|
return &m_newTermInfoBuf;
|
|
}
|
|
|
|
bool XmlDoc::addUniqueWordsToBuf ( SafeBuf *termInfoBuf ,
|
|
HashTableX *dedupTable ,
|
|
HashTableX *filterTable ,
|
|
HashTableX *minCountTable ,
|
|
bool storeCounts,
|
|
Words *ww ,
|
|
bool getSynonyms ) {
|
|
|
|
int32_t nw = ww->getNumWords ();
|
|
uint64_t *wids = (uint64_t *)ww->getWordIds ();
|
|
//nodeid_t *tids = ww->getTagIds ();
|
|
uint8_t *langId = getLangId();
|
|
// this should have been set by parent caller
|
|
if ( ! langId || langId == (uint8_t *)-1 ) {char *xx=NULL;*xx=0; }
|
|
// store the langId here
|
|
uint8_t useLangId = *langId;
|
|
// default that to english i guess if unknown
|
|
if ( useLangId == langUnknown ) {
|
|
static XmlDoc *s_lastPrint = NULL;
|
|
if ( s_lastPrint != this ) {
|
|
log("seopipe: langid of page is unknown for twid "
|
|
"synonyms. assuming english.");
|
|
s_lastPrint = this;
|
|
}
|
|
useLangId = langEnglish;
|
|
}
|
|
|
|
Synonyms syn;
|
|
|
|
//bool inTitle = false;
|
|
|
|
// scan for title
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// out of a link
|
|
//if(tids && tids[i] == TAG_TITLE ) inTitle = true;
|
|
//if(tids && tids[i] == (TAG_TITLE | BACKBIT)) inTitle = false;
|
|
// count it, limit to 30
|
|
//if ( inTitle ) tw++;
|
|
// skip if not alnumword
|
|
if ( ! wids[i] ) continue;
|
|
// make it 32 bit
|
|
uint32_t wid32 = (uint32_t)wids[i];
|
|
// filter table
|
|
if ( filterTable && filterTable->isInTable(&wid32) ) continue;
|
|
/*
|
|
// debug
|
|
if ( minCountTable && storeCounts ) {
|
|
int32_t wlen = ww->m_wordLens[i];
|
|
char *wptr = ww->m_words[i];
|
|
char c= wptr[wlen];
|
|
wptr[wlen] = '\0';
|
|
log("seo: storecount wid=%" UINT32 " word=%s",
|
|
(uint32_t)((uint64_t)wids[i]),wptr);
|
|
wptr[wlen] = c;
|
|
}
|
|
*/
|
|
// to avoid link text anomalies, the word must have been
|
|
// repeated in another link text or a meta tag. should
|
|
// fix ibm.com from getting 'lincoln' or 'unc' as high-scoring
|
|
// matching queries. should fix artdaily.com from getting
|
|
// that foreign language phrase in danish. (bedste pa nettet)
|
|
// (best of the web)
|
|
if ( minCountTable &&
|
|
! storeCounts &&
|
|
minCountTable->getScore32(&wid32) <= 1 )
|
|
continue;
|
|
// get slot
|
|
if ( dedupTable && dedupTable->isInTable(&wid32) ) continue;
|
|
// count it!
|
|
if ( storeCounts && ! minCountTable->addTerm32(&wid32) )
|
|
return false;
|
|
// show it
|
|
//if ( wid32 == 1174583722 && storeCounts ) {
|
|
// log("seo: storing occurrence. current count=%" INT32 "",
|
|
// (int32_t)minCountTable->getScore32(&wid32) );
|
|
//}
|
|
// add it to vector
|
|
TermInfo ti;
|
|
ti.m_termId64 = wids[i];
|
|
//ti.m_termFreq64 = -1;
|
|
if ( termInfoBuf && !
|
|
termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
|
|
return false;
|
|
// add it then
|
|
if ( dedupTable && ! dedupTable->addKey ( &wid32 ) )
|
|
return false;
|
|
// do synonyms now?
|
|
if ( ! getSynonyms ) continue;
|
|
// get its synonyms into tmpBuf
|
|
char tmpBuf[TMPSYNBUFSIZE];
|
|
int32_t naids = syn.getSynonyms(ww,i,useLangId,tmpBuf,m_niceness);
|
|
for ( int32_t j = 0 ; j < naids ; j++ ) {
|
|
// get it
|
|
uint32_t aid32 = (uint32_t)syn.m_aids[j];
|
|
// get slot
|
|
if ( dedupTable && dedupTable->isInTable(&aid32) )
|
|
continue;
|
|
// add it to vector
|
|
TermInfo ti;
|
|
ti.m_termId64 = syn.m_aids[j]; // 64 bit version
|
|
//ti.m_termFreq64 = -1;
|
|
if ( termInfoBuf &&
|
|
! termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
|
|
return false;
|
|
// add it then
|
|
if ( dedupTable && ! dedupTable->addKey(&aid32) )
|
|
return false;
|
|
// count it!
|
|
if ( storeCounts && ! minCountTable->addTerm32(&aid32))
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
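// addUniqueWordsToBuf() is driven in two passes over the inlink texts
// and meta tags: a counting pass that tallies every word (and synonym)
// into TMP, then a filtering pass that only keeps words counted at
// least twice, so a word seen in a single stray link text cannot become
// a matching-query term on its own. A minimal two-pass sketch of that
// idea over plain word-id lists:
/*
#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
static std::vector<uint32_t>
keepRepeatedTerms ( const std::vector< std::vector<uint32_t> > &texts ) {
	std::unordered_map<uint32_t,int> counts;   // plays the role of TMP
	for ( const auto &text : texts )
		for ( uint32_t wid : text )
			counts[wid]++;
	std::unordered_set<uint32_t> dedup;        // like m_tidTable32
	std::vector<uint32_t> kept;                // like m_termInfoBuf
	for ( const auto &text : texts )
		for ( uint32_t wid : text ) {
			if ( counts[wid] < 2 )            continue;
			if ( ! dedup.insert(wid).second ) continue;
			kept.push_back ( wid );
		}
	return kept;
}
*/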
|
|
|
|
/*
|
|
static void gotMsg99ReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->gotMsg99Reply ( slot );
|
|
}
|
|
|
|
void XmlDoc::gotMsg99Reply ( UdpSlot *slot ) {
|
|
// get replying hostid
|
|
int32_t hostId = slot->m_hostId;
|
|
// log
|
|
setStatus ( "gotmsg99reply" );
|
|
// sanity
|
|
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
|
|
// save it
|
|
int32_t i = m_numMsg99Replies;
|
|
m_msg99ReplyPtrs [i] = slot->m_readBuf;
|
|
m_msg99ReplySizes[i] = slot->m_readBufSize;
|
|
m_msg99ReplyAlloc[i] = slot->m_readBufMaxSize;
|
|
m_msg99HostIds [i] = hostId;
|
|
// steal it so it doesn't free it
|
|
slot->m_readBuf = NULL;
|
|
// note it
|
|
//log("seopipe: got msg99 reply from host #%" INT32 " i=%" INT32 " alloc=%" INT32 "",
|
|
// hostId,i,slot->m_readBufMaxSize);
|
|
// inc the counter
|
|
m_numMsg99Replies++;
|
|
// sanity!
|
|
if ( m_numMsg99Replies > m_numMsg99Requests ) { char *xx=NULL;*xx=0; }
|
|
if ( m_numMsg99Replies > g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
|
|
// don't free the sendbuf, it is shared between all hosts UNLESS
|
|
// we are the last reply received!!!
|
|
if ( m_numMsg99Replies < g_hostdb.m_numHosts )
|
|
slot->m_sendBufAlloc = NULL;
|
|
// return control to transmit function. it will call m_callback1
|
|
// if the function is done. but if a different parent function than
|
|
// transmit called us then we call that. it just depends on the
|
|
// initial entry function that called getMatchingQueries()
|
|
m_masterLoop ( m_masterState );
|
|
}
|
|
*/
|
|
/*
|
|
float getQueryImportance2 ( QueryInfo *qi , float myScore ) {
|
|
// now divide by the top score (or 50th score) for the query
|
|
// so we can see how high we score relatively speaking...
|
|
// although, if all search results for this query have the
|
|
// same score this method kinda sux...
|
|
float imp = myScore / qe->m_minTop50Score;
|
|
return imp;
|
|
// mod because one word query terms get higher scores than
|
|
// multi-word queries because they are divided by distance in
|
|
// the search algo.
|
|
// this hurts 'gigablast' query.
|
|
if ( qi->m_numUniqueWordForms <= 1 ) score /= 10.0;
|
|
// multiply by it?
|
|
score *= qi->m_numUniqueWordForms;
|
|
// until we have the code to fix things like 'coast to coast'
|
|
// where the term is repeated, we have to punish...
|
|
if ( qi->m_numRepeatWordForms >= 1 ) score /= 30.0;
|
|
// kill 'search+engine+search+engine'
|
|
if ( qi->m_numRepeatWordForms >= 2 ) score /= 30.0;
|
|
// if every word in query is repeated... push it down
|
|
// try to fix 'bot+bot' and 'search+search' 'http+http'
|
|
if ( qi->m_numUniqueWordForms == qi->m_numRepeatWordForms )
|
|
score /= 2000.0;
|
|
// fix 'web search search'
|
|
if ( qi->m_numRepeatWordForms > 0 &&
|
|
qi->m_numUniqueWordForms == qi->m_numRepeatWordForms + 1 )
|
|
score /= 200.0;
|
|
// try to kill those queries that are just a single stop word
|
|
// or forms of stop words.
|
|
// this hurts 'gigablast' query, so make it > .9. no, then crap like
|
|
// 'web' and 'http' come up too high...
|
|
if ( qi->m_numUniqueWordForms == 1 ) {
|
|
score *= (1.1 - qi->m_smallestNormTermFreq);
|
|
score *= (1.1 - qi->m_smallestNormTermFreq);
|
|
}
|
|
// http is very common! so make the 'http' or 'http+http' queries
|
|
// very low importance
|
|
if ( qi->m_numControlWordForms == qi->m_numUniqueWordForms )
|
|
score /= 1000000.0;
|
|
// TODO: if query is a single term and it's exact syn min
|
|
// hash is that for 'and' then kill it. fix 'anding'
|
|
|
|
// boost it for more accuracy since we gotta make it into anint
|
|
//score *= 1000;
|
|
return score;
|
|
}
|
|
|
|
// set Msg99Reply::m_queryImportance for all msg99replies
|
|
void setQueryImportance ( Msg99Reply **qptrs , int32_t numQueryPtrs ) {
|
|
}
|
|
|
|
void setQueryImportanceRelated ( QueryRel **qptrs , int32_t numQueryPtrs ) {
|
|
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
|
|
QueryRel *qrel = qptrs[i];
|
|
float score = qrel->m_queryInfo.m_myScoreRelated;
|
|
QueryInfo *qi = &qrel->m_queryInfo;
|
|
float imp = getQueryImportance2 ( qi , score );
|
|
qi->m_queryImportance = imp;
|
|
}
|
|
}
|
|
*/
|
|
/*
|
|
int qp99cmp ( const void *a, const void *b ) {
|
|
Msg99Reply *qa = *(Msg99Reply **)a;
|
|
Msg99Reply *qb = *(Msg99Reply **)b;
|
|
// make sure manually added queries are on top
|
|
if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
|
|
if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
|
|
//QueryInfo *qia = &qa->m_queryInfo;
|
|
//QueryInfo *qib = &qb->m_queryInfo;
|
|
// get scores
|
|
float scorea = qa->m_queryImportance;
|
|
float scoreb = qb->m_queryImportance;
|
|
if ( scorea < scoreb ) return 1;
|
|
if ( scorea > scoreb ) return -1;
|
|
// fallback to traffic otherwise i guess
|
|
int32_t traffica = qa->m_queryLogEntry.m_gigablastTraffic;
|
|
int32_t trafficb = qb->m_queryLogEntry.m_gigablastTraffic;
|
|
if ( qa->m_queryLogEntry.m_googleTraffic != -1 )
|
|
traffica = qa->m_queryLogEntry.m_googleTraffic;
|
|
if ( qb->m_queryLogEntry.m_googleTraffic != -1 )
|
|
trafficb = qb->m_queryLogEntry.m_googleTraffic;
|
|
if ( traffica < trafficb ) return 1;
|
|
if ( traffica > trafficb ) return -1;
|
|
// fallback alphabetical otherwise?
|
|
char *qsa = qa->m_queryStr;
|
|
char *qsb = qb->m_queryStr;
|
|
if ( ! qsa ) return 0;
|
|
if ( ! qsb ) return 0;
|
|
return strcmp( qsa , qsb );
|
|
//return 0;
|
|
}
|
|
*/
|
|
|
|
#include "Cachedb.h"
|
|
|
|
// . only check cachedb once per url
|
|
// . return false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
bool XmlDoc::checkCachedb ( ) {
|
|
|
|
|
|
if ( ! m_readFromCachedb ) return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// already set?
|
|
//if ( m_seoInfoSetFromCache )
|
|
// return true;
|
|
|
|
// return -1 if this blocked
|
|
if ( ! m_checkedCachedb ) {
|
|
// we now use the contenthash as part of the key because the
|
|
// data we cache is dependent on the content. i guess we don't
|
|
// need to use the user id then...
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
// first check cachedb. enum type cr_MatchingQueries
|
|
int32_t uh32 ;
|
|
uh32 =(uint32_t)((uint64_t)getFirstUrlHash64());
|
|
key_t sk = g_cachedb.makeStartKey ( uh32 , ch32 );
|
|
key_t ek = g_cachedb.makeEndKey ( uh32 , ch32 );
|
|
// debug
|
|
log("seo: checking cachedb uh32=%" UINT32 " ch32=%" UINT32 "",
|
|
(uint32_t)uh32,
|
|
(uint32_t)ch32);
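// The cache key above is built from a 32-bit hash of the first url plus
// a 32-bit content hash (xor'd with the spider time), so any change to
// the indexed content naturally invalidates the cached SEO data. The
// sketch below shows the general start/end-key idea with a hypothetical
// 96-bit layout; the real layout is whatever g_cachedb.makeStartKey()
// and makeEndKey() build:
/*
#include <cstdint>
struct CacheKey { uint64_t n1; uint32_t n0; }; // hypothetical 96-bit key
static void makeKeyRange ( uint32_t urlHash32 , uint32_t contentHash32 ,
			   CacheKey *startKey , CacheKey *endKey ) {
	// high 64 bits identify the (url, content) pair; the low 32 bits
	// are left open so one range scan returns every record type
	// cached for that pair
	uint64_t prefix = ((uint64_t)urlHash32 << 32) | contentHash32;
	startKey->n1 = prefix; startKey->n0 = 0;
	endKey->n1   = prefix; endKey->n0   = 0xffffffff;
}
*/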
|
|
// do not repeat
|
|
m_checkedCachedb = true;
|
|
// . get it from the appropriate host
|
|
// . get cachedb rec for all types of safebufs for this
|
|
// url/content
|
|
// . then we will set safebufs based on what recs we find
|
|
// in the returned list
|
|
if ( ! m_msg0.getList ( -1, // hostid
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxcacheage
|
|
false, // addtocache?
|
|
RDB_CACHEDB,
|
|
cr->m_collnum ,
|
|
&m_cacheList,
|
|
(char *)&sk ,
|
|
(char *)&ek ,
|
|
30000000, // minrecsizes 30MB
|
|
m_masterState,
|
|
m_masterLoop,
|
|
m_niceness ) )
|
|
// return FALSE if this blocks
|
|
return false;
|
|
}
|
|
|
|
if ( m_processedCachedbReply ) return true;
|
|
|
|
// only scan list once
|
|
m_processedCachedbReply = true;
|
|
|
|
// if empty, that was easy
|
|
if ( m_cacheList.isEmpty() ) return true;
|
|
|
|
// we might have one rec set from cache and another not, and we
|
|
// still want to cache the one that is not in storeIntoCachedb()!
|
|
//m_seoInfoSetFromCache = true;
|
|
|
|
// otherwise, parse out the cache recs
|
|
for ( ; ! m_cacheList.isExhausted() ; m_cacheList.skipCurrentRec() ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
char *rec = m_cacheList.getCurrentRec();
|
|
// . get type of cached rec
|
|
// . enum types cr_MatchingQueries etc. as in Cachedb.h
|
|
char recType = g_cachedb.getTypeFromKey(rec);
|
|
int32_t dataSize = m_cacheList.getCurrentDataSize();
|
|
// sanity. must at least have the cached date
|
|
if ( dataSize < 4 ) { char *xx=NULL;*xx=0; }
|
|
char *data = m_cacheList.getCurrentData ();
|
|
// in data, first int32_t is the cached time in utc
|
|
//int32_t cachedDate = *(int32_t *)data;
|
|
// skip the TIMESTAMP!
|
|
//int32_t timestamp = *(int32_t *)data;
|
|
data += 4;
|
|
dataSize -= 4;
|
|
// and version
|
|
data += 4;
|
|
dataSize -= 4;
|
|
|
|
|
|
// . 1
|
|
// . is it a cached rec for matching queries?
|
|
// . getSeoQueryInfo() needs this
|
|
if (recType == cr_MatchingQueries && !m_matchingQueryBufValid){
|
|
// debug
|
|
log("seo: found matching queries");
|
|
// total size of the msg99replies (totalMsg99ReplySize)
|
|
int32_t size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// just point into the list itself. we will
|
|
// free m_cacheList on reset then.
|
|
m_matchingQueryBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
data += size1;
|
|
// now the m_queryLinkStringBuf
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_matchingQueryStringBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding
|
|
data += size1;
|
|
m_matchingQueryBufValid = true;
|
|
continue;
|
|
}
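// Each cached payload above is a 4-byte timestamp, a 4-byte version and
// then a sequence of [int32 length][bytes] sections (for matching
// queries: the reply block, then its string buffer), which the loop
// walks with pointer arithmetic. A small sketch of reading one section
// of that kind of length-prefixed layout with bounds checks:
/*
#include <cstdint>
#include <cstring>
// returns the section start and advances *p past it, or NULL on overrun
static const char *readSection ( const char **p , const char *end ,
				 int32_t *sizeOut ) {
	if ( end - *p < 4 ) return NULL;
	int32_t size;
	memcpy ( &size , *p , 4 ); // length prefix
	*p += 4;
	if ( size < 0 || end - *p < size ) return NULL;
	const char *section = *p;
	*p += size;
	*sizeOut = size;
	return section;
}
*/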
|
|
|
|
// . 2
|
|
// . is it a cached rec for related docis with titles?
|
|
// . getSeoQueryInfo() calls getRelatedDocIdsWithTitles()
|
|
// . m_relatedDocIds SafeBuf is buf if RelatedDocId classes
|
|
// . m_relatedTitleBuf is buf of titles and urls referenced
|
|
// by those classes
|
|
if ( recType == cr_RelatedDocIds &&
|
|
! m_relatedDocIdsWithTitlesValid ) {
|
|
// debug
|
|
log("seo: found related docids");
|
|
// first is the safebuf of RelatedDocId classes
|
|
int32_t size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// point into it
|
|
//char *p = data;
|
|
//char *pend = data + size1;
|
|
// just point into the list itself. we will
|
|
// free m_cacheList on reset then.
|
|
m_relatedDocIdBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// save this
|
|
//char *rtbuf = data;
|
|
// now the string buffer
|
|
m_relatedTitleBuf.setBuf ( data ,
|
|
size1 ,
|
|
size1 ,
|
|
false ,
|
|
0 );
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// now the string buffer
|
|
m_commonQueryNumBuf.setBuf ( data ,
|
|
size1 ,
|
|
size1 ,
|
|
false ,
|
|
0 );
|
|
|
|
// now the RelatedDocId::ptr_url/ptr_rd_title members
|
|
// were hacked to be offsets into this for storage
|
|
// into the cache!
|
|
/*
|
|
for ( ; p < pend ; p += sizeof(RelatedDocId) ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
RelatedDocId *rd = (RelatedDocId *)p;
|
|
// get offsets
|
|
int32_t off1 = (int32_t)rd->ptr_rd_title;
|
|
int32_t off2 = (int32_t)rd->ptr_rd_url;
|
|
int32_t off3 = (int32_t)rd->ptr_rd_site;
|
|
// normalize/store back
|
|
rd->ptr_rd_title = rtbuf + off1;
|
|
rd->ptr_rd_url = rtbuf + off2;
|
|
rd->ptr_rd_site = rtbuf + off3;
|
|
}
|
|
*/
|
|
m_relatedDocIdsWithTitlesValid = true;
|
|
m_relatedTitleBufValid = true;
|
|
m_relatedDocIdBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
		// . 3
		// . is it a cached rec for related queries?
		// . getSeoQueryInfo() calls getRelatedQueryBuf()
		if ( recType == cr_RelatedQueries && ! m_queryLinkBufValid ) {
|
|
// we changed the format of relatedquerystringbuf
|
|
// to be a bunch of QueryLogEntries now. so ignore
|
|
// if old format.
|
|
//if ( timestamp <= 1367704324 ) continue;
|
|
// debug
|
|
log("seo: found related queries");
|
|
int32_t size1;
|
|
// first is the safebuf m_relatedQueryBuf of QueryLinks
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_relatedQueryBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
data += size1;
|
|
// now the m_relatedQueryStringBuf
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_relatedQueryStringBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding
|
|
data += size1;
|
|
/*
|
|
// now the ptrs, sorted
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_relPtrs.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// test sorting
|
|
char *p = m_relPtrs.getBufStart();
|
|
char *pend = m_relPtrs.getBuf();
|
|
char *base = m_queryLinkBuf.getBufStart();
|
|
QueryLink *lastqr = NULL;
|
|
for ( ; p < pend ; p += 4 ) {
|
|
QUICKPOLL(m_niceness);
|
|
int32_t qkOff = *(int32_t *)p;
|
|
QueryLink *qr = (QueryRel *)(base+qkOff);
|
|
// no, longer, it is more complicated because
|
|
// if m_uniqueRound scoring addition
|
|
//if ( lastqr &&
|
|
// lastqr->m_totalRelatedQueryImportance <
|
|
// qr ->m_totalRelatedQueryImportance ) {
|
|
// char *xx=NULL;*xx=0;}
|
|
lastqr = qr;
|
|
}
|
|
*/
|
|
// validate
|
|
//m_relPtrsValid = true;
|
|
//m_queryLinkStringBufValid = true;
|
|
m_relatedQueryBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
// if it is debug and we are not, skip it!!
|
|
//if(recType == cr_ScoredInsertableTermsDebug && ! m_seoDebug )
|
|
// continue;
|
|
|
|
// or if we are debug and it is not, skip it!
|
|
//if (recType == cr_ScoredInsertableTerms && m_seoDebug )
|
|
// continue;
|
|
|
|
/*
|
|
if ( (recType == cr_MissingTermBuf ) &&
|
|
! m_missingTermBufValid ) {
|
|
// debug
|
|
log("seo: found missingtermbuf");
|
|
int32_t size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_missingTermBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
m_missingTermBufValid = true;
|
|
}
|
|
*/
|
|
|
|
		// 3b
		if ( (recType == cr_WordPosInfoBuf ) &&
		     ! m_wordPosInfoBufValid ) {
			// debug
			log("seo: found wordposinfo");
			int32_t size1;
			size1 = *(int32_t *)data;
			data += 4;
			m_wordPosInfoBuf.setBuf ( data ,
						  size1 , // size
						  size1 , // allocated
						  false , // owndata?
						  0 ); // encoding none
			// WordPosInfo::m_wordPtr was stored as an offset
			// relative to ptr_utf8Content, so restore the ptrs
			char *p = m_wordPosInfoBuf.getBufStart();
			char *pend = m_wordPosInfoBuf.getBuf();
			for ( ; p < pend ; p += sizeof(WordPosInfo) ) {
				QUICKPOLL(m_niceness);
				WordPosInfo *wp = (WordPosInfo *)p;
				int64_t off = (int64_t)wp->m_wordPtr;
				char *ptr = ptr_utf8Content + off;
				if ( off == -1 ) ptr = NULL;
				wp->m_wordPtr = ptr;
			}
			m_wordPosInfoBufValid = true;
		}
|
|
|
|
// . 4
|
|
// . and the insertable terms buffer with its querychanges
|
|
// linked lists!
|
|
if ( recType == cr_ScoredInsertableTerms &&
|
|
! m_scoredInsertableTermsBufValid ) {
|
|
// debug
|
|
log("seo: found scored insertable terms");
|
|
int32_t size1;
|
|
// first is the safebuf m_insertableTermsBuf of InsertableTerms
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// just point into the list itself. we will
|
|
// free m_cacheList on reset then.
|
|
m_insertableTermsBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// now the buffer of query changes
|
|
// these are normally just referenced by
|
|
// InsertableTerm and in the linked list directly
|
|
// into the Msg95Reply::ptr_queryChanges, but for
|
|
// caching we have to use a new safebuf
|
|
m_queryChangeBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_queryLogBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
/*
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_itStrBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
*/
|
|
/*
|
|
// debug scoring. QueryChange::m_debugScoreInfoOffset
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_debugScoreInfoBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// debug scoring. QueryChange::m_origScoreInfoOffset
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_origScoreInfoBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
*/
|
|
// insertable terms deserialization logic
|
|
char *p = m_insertableTermsBuf.getBufStart();
|
|
char *pend = m_insertableTermsBuf.getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// normalize m_firstQueryChange
|
|
int64_t off =(int64_t)(it->m_firstQueryChange);
|
|
// fix this
|
|
char *buf = m_queryChangeBuf.getBufStart();
|
|
// shortcut
|
|
QueryChange *fqc = (QueryChange *)(buf+off);
|
|
// -1 means NULL
|
|
if ( off == -1 ) fqc = NULL;
|
|
// put back
|
|
it->m_firstQueryChange = fqc;
|
|
// terms
|
|
//off = (int32_t)it->m_termStr;
|
|
// to this
|
|
//buf = m_itStrBuf.getBufStart();
|
|
// cast it
|
|
//it->m_termStr = (char *)(buf+off);
|
|
}
|
|
// . now we set QueryChange::m_next and
|
|
// InsertableTerm::m_firstQueryChange to be offsets
|
|
// into the new m_queryChangeBuf before we stored
|
|
// into the cache....
|
|
p = m_queryChangeBuf.getBufStart();
|
|
pend = m_queryChangeBuf.getBuf();
|
|
for ( ; p < pend ; p += sizeof(QueryChange) ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
QueryChange *qc = (QueryChange *)p;
|
|
// normalize m_next
|
|
int64_t off = (int64_t)qc->m_next;
|
|
// offset into this
|
|
char *buf = m_queryChangeBuf.getBufStart();
|
|
// put back
|
|
qc->m_next = (QueryChange *)(buf + off);
|
|
// -1 means NULL
|
|
if ( off == -1 ) qc->m_next = NULL;
|
|
}
|
|
// now all ptrs should be set correctly
|
|
m_scoredInsertableTermsBufValid = true;
|
|
m_insertableTermsBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
		// . 5
		// . is it a cached rec for the recommended links buf?
		if ( recType == cr_RecommendedLinks &&
		     ! m_recommendedLinksBufValid ) {
|
|
// debug
|
|
log("seo: found recommended links buf");
|
|
// first is the safebuf of RelatedDocId classes
|
|
int32_t size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// now the string buffer
|
|
m_recommendedLinksBuf.setBuf ( data ,
|
|
size1 ,
|
|
size1 ,
|
|
false ,
|
|
0 );
|
|
m_recommendedLinksBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
}
|
|
return true;
|
|
}
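
// Each cachedb rec parsed above (and written by the store*IntoCachedb()
// functions below) has the same layout:
//
//   key_t   key        (rec type is encoded in the key, see Cachedb.h)
//   int32_t dataSize   (size of everything after this field)
//   int32_t timestamp  (cached date, utc)
//   int32_t version    (CACHEDB_CURRENT_VERSION)
//   then one or more ( int32_t length , raw SafeBuf bytes ) pairs
//
// The reader never copies the SafeBuf bytes; it points each safebuf into
// m_cacheList with owndata=false, so m_cacheList must outlive those bufs.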
|
|
|
|
#define CACHEDB_CURRENT_VERSION 1
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
// . adds the list to cachedb with m_msg1; returns false until the add completes
|
|
bool XmlDoc::storeMatchingQueriesIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// all these things should already be validated so they should
|
|
// not block or have errors
|
|
//SafeBuf *qpbuf = getMatchingQueriesScored();
|
|
//SafeBuf *qpbuf = &m_queryPtrs;
|
|
if ( ! m_matchingQueryBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
//int32_t totalMsg99ReplySize = 0;
|
|
//int32_t numQueryPtrs = 0;
|
|
//Msg99Reply **qptrs = NULL;
|
|
|
|
// 1. msg99replies for matchingQueries
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize+cacheDate(now)+ver
|
|
need += 4 + m_matchingQueryBuf.length();
|
|
need += 4 + m_matchingQueryStringBuf.length();
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: mq listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
//
|
|
// 1. first add the matching queries, msg99 replies
|
|
//
|
|
k = g_cachedb.makeKey ( uh32, ch32 , cr_MatchingQueries );
|
|
|
|
// note it
|
|
log("seo: cachedb storing matchingqueries "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 ""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_matchingQueryBuf.length();
|
|
dataSize += 4 + m_matchingQueryStringBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
listBuf.pushLong ( m_matchingQueryBuf.length() );
|
|
listBuf.safeMemcpy ( &m_matchingQueryBuf );
|
|
listBuf.pushLong ( m_matchingQueryStringBuf.length() );
|
|
listBuf.safeMemcpy ( &m_matchingQueryStringBuf );
|
|
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding matching query list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
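
/*
// Minimal sketch (hypothetical helper, not used by this file) of the
// length-prefixed append pattern shared by the store*IntoCachedb()
// functions: each SafeBuf is written as a 4-byte length followed by its
// raw bytes, and checkCachedb() later reads it back by pointing a
// SafeBuf at that data in place.
static bool appendLengthPrefixed ( SafeBuf *listBuf , SafeBuf *src ) {
	// 4-byte length first, then the raw bytes
	if ( ! listBuf->pushLong ( src->length() ) ) return false;
	if ( ! listBuf->safeMemcpy ( src ) ) return false;
	return true;
}
*/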
|
|
|
|
bool XmlDoc::storeRelatedDocIdsIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_queryPtrsWholeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
if ( ! m_relatedDocIdsWithTitlesValid ) { char *xx=NULL;*xx=0;}
|
|
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0;}
|
|
|
|
// 2. related docids
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_relatedDocIdBuf.length();
|
|
need += 4 + m_relatedTitleBuf.length();
|
|
need += 4 + m_commonQueryNumBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: rd listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
char *p1;
|
|
char *p2;
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// 2. then add related docids
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedDocIds );
|
|
|
|
// note it
|
|
log("seo: cachedb storing relateddocids "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 ""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_relatedDocIdBuf.length();
|
|
dataSize += 4 + m_relatedTitleBuf.length();
|
|
dataSize += 4 + m_commonQueryNumBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
listBuf.pushLong ( m_relatedDocIdBuf.length() );
|
|
p1 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_relatedDocIdBuf );
|
|
p2 = listBuf.getBuf();
|
|
listBuf.pushLong ( m_relatedTitleBuf.length() );
|
|
listBuf.safeMemcpy ( &m_relatedTitleBuf );
|
|
//char *tbuf = m_relatedTitleBuf.getBufStart();
|
|
listBuf.pushLong ( m_commonQueryNumBuf.length() );
|
|
listBuf.safeMemcpy ( &m_commonQueryNumBuf );
|
|
|
|
// make ptrs into offsets into m_relatedTitleBuf
|
|
/*
|
|
for ( ; p1 < p2 ; p1 += sizeof(RelatedDocId )) {
|
|
QUICKPOLL(m_niceness);
|
|
RelatedDocId *rd = (RelatedDocId *)p1;
|
|
int32_t off;
|
|
off = rd->ptr_rd_url - tbuf;
|
|
rd->ptr_rd_url = (char *)off;
|
|
off = rd->ptr_rd_title - tbuf;
|
|
rd->ptr_rd_title = (char *)off;
|
|
off = rd->ptr_rd_site - tbuf;
|
|
rd->ptr_rd_site = (char *)off;
|
|
}
|
|
*/
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding related docids list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
bool XmlDoc::storeRecommendedLinksBuf ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
if ( ! m_recommendedLinksBufValid ) { char *xx=NULL;*xx=0;}
|
|
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_recommendedLinksBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: reclnx listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// then add the recommended links buf
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr_RecommendedLinks );
|
|
|
|
// note it
|
|
log("seo: cachedb storing recommendedlinksbuf "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 ""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_recommendedLinksBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_recommendedLinksBuf.length() );
|
|
listBuf.safeMemcpy ( &m_recommendedLinksBuf );
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding recommendedlinksbuf list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
bool XmlDoc::storeRelatedQueriesIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_relatedQueryBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
//SafeBuf *relBuf = NULL;
|
|
//if ( m_relPtrsValid ) relBuf = &m_relPtrs;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
// 3. related queries. buf of QueryLinks
|
|
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_relatedQueryBuf.length();
|
|
need += 4 + m_relatedQueryStringBuf.length();
|
|
//need += 4 + m_relPtrs.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: rq listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
//
|
|
// 3. then related queries (STORED by m_queryImportanceRelated)
|
|
//
|
|
//int32_t sizeRels = (m_relPtrs.length() / 4) * sizeof(QueryLink);
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedQueries );
|
|
|
|
// note it
|
|
log("seo: cachedb storing relatedqueries "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 ""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_relatedQueryBuf.length(); // sizeRels;
|
|
dataSize += 4 + m_relatedQueryStringBuf.length();
|
|
//dataSize += 4 + m_relPtrs.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_relatedQueryBuf.length() );
|
|
//char *p3 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_relatedQueryBuf );
|
|
//char *p4 = listBuf.getBuf();
|
|
listBuf.pushLong ( m_relatedQueryStringBuf.length() );
|
|
listBuf.safeMemcpy ( &m_relatedQueryStringBuf );
|
|
//listBuf.pushLong ( m_relPtrs.length() );
|
|
//char *p5 = listBuf.getBuf();
|
|
//listBuf.safeMemcpy ( &m_relPtrs );
|
|
// sanity tests
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding related queries list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::storeWordPosInfoBufIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_wordPosInfoBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_wordPosInfoBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: wpi listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
// 3b. then the word position info buf
|
|
uint8_t cr8 = cr_WordPosInfoBuf;
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
|
|
|
|
// note it
|
|
log("seo: cachedb storing wordposinfobuf "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 ""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_wordPosInfoBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_wordPosInfoBuf.length() );
|
|
char *p8 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_wordPosInfoBuf );
|
|
char *p9 = listBuf.getBuf();
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// WordPosInfo::m_term relative to html ptr_utf8Content!
|
|
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
|
|
QUICKPOLL(m_niceness);
|
|
WordPosInfo *wp = (WordPosInfo *)p8;
|
|
int64_t off = wp->m_wordPtr - ptr_utf8Content;
|
|
// if its a tag or fielded term it won't be in the
|
|
// html like ext:html or filetype:html
|
|
if ( wp->m_wordPtr< ptr_utf8Content )
|
|
off = -1;
|
|
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
|
|
off = -1;
|
|
wp->m_wordPtr = (char *)off;
|
|
}
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding wordposinfobuf list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
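
/*
// Minimal sketch (illustrative only, not called anywhere) of the
// pointer <-> offset round trip used for WordPosInfo::m_wordPtr: the
// store loop above rewrites each pointer as an offset relative to
// ptr_utf8Content (-1 when the term is not in the html, e.g. fielded
// terms like filetype:html), and checkCachedb() converts it back on load.
static void wordPtrToOffset ( WordPosInfo *wp ,
			      char *content , int32_t contentSize ) {
	int64_t off = wp->m_wordPtr - content;
	if ( wp->m_wordPtr <  content               ) off = -1;
	if ( wp->m_wordPtr >= content + contentSize ) off = -1;
	wp->m_wordPtr = (char *)off;
}
static void wordPtrFromOffset ( WordPosInfo *wp , char *content ) {
	int64_t off = (int64_t)wp->m_wordPtr;
	wp->m_wordPtr = ( off == -1 ) ? NULL : content + off;
}
*/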
|
|
|
|
|
|
/*
|
|
bool XmlDoc::storeMissingTermBufIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_missingTermBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_missingTermBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: wpi listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
// 4. then the insertable terms and their query changes and log buf
|
|
// mangle key a little if in debug mode because that is the only
|
|
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
|
|
uint8_t cr = cr_MissingTermBuf;
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr );
|
|
|
|
// note it
|
|
log("seo: cachedb storing missingtermbuf "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 "",uh32,ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_missingTermBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_missingTermBuf.length() );
|
|
listBuf.safeMemcpy ( &m_missingTermBuf );
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding missingtermbuf list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
// . adds the list to cachedb with m_msg1; returns false until the add completes
|
|
bool XmlDoc::storeScoredInsertableTermsIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_scoredInsertableTermsBufValid ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_insertableTermsBuf.length();
|
|
// InsertableTerm::m_firstQryChange:
|
|
need += 4 + m_queryChangeBuf.length();
|
|
//4 QueryChange::m_replyQueryOffset :
|
|
need += 4 + m_queryLogBuf.length();
|
|
//InsertableTerm::m_termStr reference
|
|
//need += 4 + m_itStrBuf.length();
|
|
//need += 4 + m_wordPosInfoBuf.length();
|
|
// TOO BIG to store into cachedb!
|
|
//need += 4 + m_debugScoreInfoBuf.length(); // debug only
|
|
//need += 4 + m_origScoreInfoBuf.length(); // debug only
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: listsize %" INT32 " too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
char *p1;
|
|
char *p2;
|
|
|
|
// 4. then the insertable terms and their query changes and log buf
|
|
// mangle key a little if in debug mode because that is the only
|
|
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
|
|
uint8_t cr8 = cr_ScoredInsertableTerms;
|
|
//if ( m_seoDebug ) cr = cr_ScoredInsertableTermsDebug;
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
|
|
|
|
// note it
|
|
log("seo: cachedb storing scoredinsertableterms "
|
|
"uh32=%" UINT32 " ch32=%" UINT32 ""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_insertableTermsBuf.length();
|
|
dataSize += 4 + m_queryChangeBuf.length();
|
|
dataSize += 4 + m_queryLogBuf.length();
|
|
//dataSize += 4 + m_itStrBuf.length();
|
|
//dataSize += 4 + m_wordPosInfoBuf.length();
|
|
//dataSize += 4 + m_debugScoreInfoBuf.length(); // debug only
|
|
//dataSize += 4 + m_origScoreInfoBuf .length(); // debug only
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
// m_insertableTermsBuf
|
|
listBuf.pushLong ( m_insertableTermsBuf.length() );
|
|
p1 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_insertableTermsBuf );
|
|
char *p1End = listBuf.getBuf();
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_queryChangeBuf
|
|
listBuf.pushLong ( m_queryChangeBuf.length() );
|
|
p2 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_queryChangeBuf );
|
|
char *p2End = listBuf.getBuf();
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_queryLogBuf
|
|
listBuf.pushLong ( m_queryLogBuf.length() );
|
|
listBuf.safeMemcpy ( &m_queryLogBuf );
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_itStrBuf referenced by InsertableTerm::m_termStr
|
|
//listBuf.pushLong ( m_itStrBuf.length() );
|
|
//listBuf.safeMemcpy ( &m_itStrBuf );
|
|
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_itStrBuf referenced by InsertableTerm::m_termStr
|
|
//listBuf.pushLong ( m_wordPosInfoBuf.length() );
|
|
//char *p8 = listBuf.getBuf();
|
|
//listBuf.safeMemcpy ( &m_wordPosInfoBuf );
|
|
//char *p9 = listBuf.getBuf();
|
|
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// debug buffers, QueryChange::m_*Offset parms ref them if
|
|
// m_seoDebug is true. TOO BIG TO STORE INTO CACHEDB!
|
|
//listBuf.pushLong ( m_debugScoreInfoBuf.length() );
|
|
//listBuf.safeMemcpy ( &m_debugScoreInfoBuf );
|
|
//listBuf.pushLong ( m_origScoreInfoBuf.length() );
|
|
//listBuf.safeMemcpy ( &m_origScoreInfoBuf );
|
|
// make the InsertableTerm::m_firstQueryChange parms into
|
|
// offsets
|
|
for ( ; p1 < p1End ; ) { // p1 += sizeof(InsertableTerm) ) {
|
|
QUICKPOLL(m_niceness);
|
|
InsertableTerm *it = (InsertableTerm *)p1;
|
|
p1 += it->getSize();
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
int64_t qoff =(char *)qc - m_queryChangeBuf.getBufStart();
|
|
if ( qc == NULL ) qoff = -1;
|
|
it->m_firstQueryChange = (QueryChange *)qoff;
|
|
// and m_termStr
|
|
//int32_t off = it->m_termStr - m_itStrBuf.getBufStart();
|
|
//it->m_termStr = (char *)off;
|
|
}
|
|
// make QueryChange::m_next ptrs into offsets as well
|
|
for ( ; p2 < p2End ; p2 += sizeof(QueryChange) ) {
|
|
QUICKPOLL(m_niceness);
|
|
QueryChange *qc = (QueryChange *)p2;
|
|
QueryChange *next = qc->m_next;
|
|
int64_t noff =(char *)next-m_queryChangeBuf.getBufStart();
|
|
if ( next == NULL ) noff = -1;
|
|
qc->m_next = (QueryChange *)noff;
|
|
}
|
|
// WordPosInfo::m_term relative to html ptr_utf8Content!
|
|
/*
|
|
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
|
|
QUICKPOLL(m_niceness);
|
|
WordPosInfo *wp = (WordPosInfo *)p8;
|
|
int32_t off = wp->m_wordPtr - ptr_utf8Content;
|
|
// if its a tag or fielded term it won't be in the
|
|
// html like ext:html or filetype:html
|
|
if ( wp->m_wordPtr< ptr_utf8Content )
|
|
off = -1;
|
|
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
|
|
off = -1;
|
|
wp->m_wordPtr = (char *)off;
|
|
}
|
|
*/
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding insertable terms list of %" INT32 " bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
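
/*
// Minimal sketch (illustrative only) of how the QueryChange::m_next
// linked list survives the cache round trip: before storing, each
// m_next pointer is rewritten as an offset into m_queryChangeBuf
// (-1 for NULL); after loading, checkCachedb() adds the new buffer's
// base address back in.
static void nextPtrToOffset ( QueryChange *qc , char *base ) {
	QueryChange *next = qc->m_next;
	int64_t off = (char *)next - base;
	if ( next == NULL ) off = -1;
	qc->m_next = (QueryChange *)off;
}
static void nextPtrFromOffset ( QueryChange *qc , char *base ) {
	int64_t off = (int64_t)qc->m_next;
	qc->m_next = ( off == -1 ) ? NULL : (QueryChange *)(base + off);
}
*/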
|
|
|
|
#define MAX_TOP_MATCHING_QUERIES 300
|
|
|
|
/*
|
|
// returns -1 if blocked, NULL with g_errno set on error
|
|
SafeBuf *XmlDoc::getMatchingQueriesScored ( ) {
|
|
|
|
setStatus ( "getmatchingqueriesscored" );
|
|
|
|
// try to set m_queryPtrs from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
// just re-use the same m_queryPtrs SafeBuf we used above but we
|
|
// set the Msg99Reply::m_myScore here and sort them by that
|
|
if ( m_queryPtrsSortedValid )
|
|
return &m_queryPtrs;
|
|
|
|
// get the queries from msg99 replies first
|
|
SafeBuf *mq = getMatchingQueries(false,-1);
|
|
if ( mq == NULL || mq == (void *)-1 ) return mq;
|
|
|
|
// time it
|
|
if ( ! m_beginTimeMatchUrl )
|
|
m_beginTimeMatchUrl = gettimeofdayInMilliseconds();
|
|
|
|
// i'm assuming this is quer ptrs!?!?!
|
|
int32_t numQueryPtrs = mq->length() / sizeof(Msg99Reply *);
|
|
|
|
// get the qptrs
|
|
Msg99Reply **qptrs = (Msg99Reply **)mq->getBufStart();
|
|
|
|
// score them in parallel over all hosts in network
|
|
if ( ! scoreDocIdRestrictedQueries ( qptrs,NULL,numQueryPtrs) )
|
|
return (SafeBuf *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
|
|
// total pages indexed!
|
|
int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
|
|
// take 25% of that. i think 'the', the most common term, is in about
|
|
// 25% of those pages
|
|
numPagesIndexed /= 4;
|
|
|
|
//
|
|
// SET QUERY IMPORTANCE
|
|
//
|
|
// . set the m_queryImportance float and sort by that
|
|
// . how important is the matching query for the main url?
|
|
// . just divide the main url's score by the
|
|
// QueryLogEntry::m_mintop50Score for the query to normalize it
|
|
// . however, when we compute RelatedDocId::m_dotProduct we normalize
|
|
// using the score of the #1 result because we executed the full
|
|
// query, so keep that in mind. we can't mix the two.
|
|
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
|
|
Msg99Reply *qp = qptrs[i];
|
|
// shortcut
|
|
QueryLogEntry *qe = &qp->m_queryLogEntry;
|
|
// get # results
|
|
int64_t numResults = qe->m_numTotalResultsInSlice;
|
|
// fix it to be global
|
|
numResults *= (int64_t)g_hostdb.getNumGroups();
|
|
// big indexes did the "slice logic" restricting docid
|
|
// range to MAX_DOCID * .10 when setting this!
|
|
if ( numPagesIndexed > 10000000 ) numResults *= 10;
|
|
// point to query
|
|
char *qstr = qp->m_queryStr;
|
|
// if not processed assume like 1M?
|
|
if ( numResults < 0 ) {
|
|
log("seo: guessing query importance for '%s' from "
|
|
"hostid #%" INT32 "",
|
|
qstr,(int32_t)qp->m_replyingHostId);
|
|
qp->m_queryImportance = 0.0;
|
|
continue;
|
|
}
|
|
// zero means make it 1 to avoid div by zero below
|
|
if ( numResults == 0 ) numResults = 1;
|
|
|
|
// and also weight by traffic! the more traffic the
|
|
// more important perhaps...
|
|
// NO! with this we get 'www' 'view' etc for
|
|
// jezebelgallery.com coming up in the top 50 matching
|
|
// queries by importance. crap, but it hurts cheatcodes.com
|
|
// then.
|
|
// fix
|
|
|
|
//if ( strcmp(qstr,"search engine") == 0 )
|
|
// log("poo");
|
|
|
|
// adjust since numPagesIndexed is actually a quarter of
|
|
// the # of pages indexed since 'the' is only in about
|
|
// 1/4 of the pages and it is the most common term
|
|
if ( numResults > numPagesIndexed )
|
|
numResults = numPagesIndexed;
|
|
|
|
// try doubling this to get rid of www problem for
|
|
// jezebelgallery.com. it put www and view down some more.
|
|
float popRatio = (float)numResults / (float)numPagesIndexed;
|
|
|
|
// stuff like 'www' and 'view' will be near 1.0
|
|
float weight = 1.0 - popRatio;//(popRatio * popRatio);
|
|
// go crazy
|
|
weight *= weight;
|
|
weight *= weight;
|
|
weight *= weight;
|
|
weight *= weight;
|
|
|
|
// do not let this be 1.0 because 'web page searching' is
|
|
// getting 1.0 for it and getting a weight of 0.0 and making
|
|
// it the same as the ignored matching queries for
|
|
// gigablast.com, so we end up using the ignored common
|
|
// word matching queries for getting competitor pages and it
|
|
// is bad! we need to fix that to not use such queries if
|
|
// their importance is 0!
|
|
if ( weight < .01 ) weight = .01;
|
|
|
|
|
|
// because you are in the top 50
|
|
//numResults = (int32_t)powf ( (float)numResults , .4 );
|
|
//if ( numResults == 0 )
|
|
// imp /= 1;
|
|
// otherwise, normalize by division
|
|
//else
|
|
// imp /= numResults;
|
|
// boost it!
|
|
//imp *= 10000;
|
|
//QueryInfo *qi = &qp->m_queryInfo;
|
|
//float imp = getQueryImportance2 ( qi , score );
|
|
|
|
// just try sorting by your serp score, hopefully we remove
|
|
// shit like 'www' because isCommonQueryWordInEnglish()
|
|
// takes care of it below.
|
|
// consider *= weight on this
|
|
|
|
// the idea is to ignore the top serp score because
|
|
// you do not want terms that you may be able to be #1
|
|
// for but are not really relevant for your doc. so for this
|
|
// let's focus on just getting the queries that best represent
|
|
// your doc...
|
|
double imp = qp->m_myScore * weight;
|
|
|
|
|
|
qp->m_queryImportance = (float)imp;
|
|
// just use this!!!
|
|
//qp->m_queryImportance = qp->m_myScore /
|
|
// (float)(numResults*numResults);
|
|
// set importance to 0 for queries with minus sign in them
|
|
// that indicates negative terms...
|
|
for ( char *p = qstr; *p ; p++ ) {
|
|
if ( *p != ' ' ) continue;
|
|
if ( p[1] != '-' ) continue;
|
|
// 'a - b' is ok
|
|
if ( p[2] == ' ' ) continue;
|
|
qp->m_queryImportance = 0.00;
|
|
log("seo: ignoring query '%s' with minus sign", qstr);
|
|
break;
|
|
}
|
|
// avoid common queries with just common words in them:
|
|
// http web www com org us we 1 2 3 by on i https one page
|
|
Words ww;
|
|
ww.set3 ( qstr );
|
|
int32_t i; for ( i = 0 ; i < ww.m_numWords ; i++ ) {
|
|
int64_t wid = ww.m_wordIds[i];
|
|
if ( wid == 0 ) continue;
|
|
if ( ! isCommonQueryWordInEnglish ( wid ) ) break;
|
|
}
|
|
if ( i >= ww.m_numWords ) {
|
|
qp->m_queryImportance = 0.00;
|
|
log("seo: ignoring common query '%s'", qstr);
|
|
}
|
|
// skip debug for now
|
|
if ( ! m_seoDebug ) continue;
|
|
// note it
|
|
log("seo: "
|
|
"imp=%f "
|
|
"numresults=%" INT64 " "
|
|
"numpagesindexed=%" INT64 " "
|
|
"popweight=%f "
|
|
"myscore=%f "
|
|
"topscore=%f "
|
|
"qstr=%s",
|
|
qp->m_queryImportance,
|
|
numResults,
|
|
numPagesIndexed,
|
|
weight,
|
|
qp->m_myScore,
|
|
qe->m_topSERPScore,
|
|
qstr);
|
|
}
|
|
|
|
|
|
// let's sort them first
|
|
qsort ( qptrs ,
|
|
numQueryPtrs ,
|
|
sizeof(Msg99Reply *),
|
|
qp99cmp );
|
|
|
|
|
|
|
|
// log for debug
|
|
int32_t maxk = numQueryPtrs;
|
|
// limit to logging 300 to avoid log spam
|
|
if ( maxk > MAX_TOP_MATCHING_QUERIES )
|
|
maxk = MAX_TOP_MATCHING_QUERIES; // 300;
|
|
|
|
// limit to top 300 dammit, otherwise we can't store all
|
|
// into cachedb!!!
|
|
int32_t newLen = maxk * sizeof(Msg99Reply *);
|
|
m_queryPtrs.setLength ( newLen );
|
|
|
|
for ( int32_t k = 0 ; k < maxk ; k++ ) {
|
|
Msg99Reply *kp = qptrs[k];
|
|
log("seopipe: newquery=\"%s\" myscore=%f imp=%f",
|
|
kp->m_queryStr,
|
|
kp->m_myScore,
|
|
kp->m_queryImportance);
|
|
}
|
|
|
|
// time it
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginTimeMatchUrl;
|
|
log("seopipe: time: matchingscoredqueries took %" INT64 " ms",took);
|
|
|
|
m_queryPtrsSortedValid = true;
|
|
|
|
if ( ! storeMatchingQueriesIntoCachedb() )
|
|
// return -1 if it blocked and wait for store to complete
|
|
return (SafeBuf *)-1;
|
|
|
|
return mq;
|
|
}
|
|
|
|
*/
|
|
|
|
static void gotMsg3aReplyForFullQueryWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->setStatus ( "gotmsg3areplyforfullquerywrapper" );
|
|
THIS->gotMsg3aReplyForFullQuery();
|
|
// . go back to the main entry function
|
|
// . make sure g_errno is clear from a msg3a g_errno before calling
|
|
// this lest it abandon the loop
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
/*
|
|
void XmlDoc::gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
|
|
Msg99Reply *qp ) {
|
|
|
|
// try again for next guy
|
|
m_triedCache = false;
|
|
|
|
char *p = cachedRec;
|
|
// # docids
|
|
int32_t numDocIds = *(int32_t *)p;
|
|
p += 4;
|
|
// total # results
|
|
int32_t numTotalResults = *(int32_t *)p;
|
|
p += 4;
|
|
// docids
|
|
int64_t *docIds = (int64_t *)p;
|
|
p += 8 * numDocIds;
|
|
// scores
|
|
float *scores = (float *)p;
|
|
p += sizeof(float) * numDocIds;
|
|
// site hashes
|
|
int32_t *siteHashes = (int32_t *)p;
|
|
p += 4 * numDocIds;
|
|
|
|
// store score info into this class
|
|
TopDocIds *td = qp->m_topDocIds;
|
|
|
|
// store reply info, like # docids, in the query ptr
|
|
int32_t max = numDocIds;
|
|
if ( max > (int32_t)NUM_TOP_RESULTS ) max = (int32_t)NUM_TOP_RESULTS;
|
|
td->m_numDocIds = max;
|
|
|
|
// count replies
|
|
m_numMsg3aReplies++;
|
|
|
|
// log to log as well
|
|
char tmp[50000];
|
|
p = tmp;
|
|
p += sprintf(p,
|
|
"seopipe: got full results CACHED "
|
|
"qrynum=%" INT32 "of%" INT32 " docids=%" INT32 " "
|
|
"query=\"%s\" ",
|
|
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
|
|
m_maxFullQueries ,
|
|
td->m_numDocIds,
|
|
qp->m_queryStr );
|
|
// log each docid
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
//float score = m_msg3a->getScores()[i];
|
|
int64_t d = docIds[i];
|
|
//int32_t sh32 = m_msg3a->getSiteHash32(i);
|
|
p += sprintf(p,"d%" INT32 "=%" INT64 " ",i,d);
|
|
}
|
|
log(tmp);
|
|
|
|
|
|
// shortcut. pumpSocket() sends the contents of this to m_seoSocket
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
|
|
sb->safePrintf(
|
|
"\t<seoQueryScoreInfo>\n"
|
|
"\t\t<queryNum>%" INT32 "</queryNum>\n"
|
|
"\t\t<numTotalEstimatedSearchResults>%" INT32 ""
|
|
"</numTotalEstimatedSearchResults>\n"
|
|
"\t\t<numDocIds>%" INT32 "</numDocIds>\n"
|
|
, m_msg3a->m_hackQNum
|
|
, numTotalResults
|
|
, numDocIds
|
|
);
|
|
// print the top 50 scores
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
float score = scores[i];
|
|
int64_t d = docIds[i];
|
|
int32_t sh32 = siteHashes[i];
|
|
sb->safePrintf("\t\t<searchResult>\n");
|
|
sb->safePrintf("\t\t\t<rank>%" INT32 "</rank>\n",i+1);
|
|
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
|
|
sb->safePrintf("\t\t\t<docId>%" INT64 "</docId>\n",d);
|
|
sb->safePrintf("\t\t\t<siteHash32>%" UINT32 "</siteHash32>\n",sh32);
|
|
sb->safePrintf("\t\t</searchResult>\n");
|
|
// store results for this Msg99Reply
|
|
td->m_topDocIds[i] = d;
|
|
td->m_topScores[i] = score;
|
|
td->m_topSiteHashes[i] = sh32;
|
|
}
|
|
// reset rest so it prints pretty on gdb debug print cmd
|
|
for ( int32_t i = max ; i < (int32_t)NUM_TOP_RESULTS ; i++ ) {
|
|
td->m_topDocIds[i] = 0LL;
|
|
td->m_topScores[i] = 0.0;
|
|
td->m_topSiteHashes[i] = 0;
|
|
}
|
|
|
|
sb->safePrintf("\t</seoQueryScoreInfo>\n");
|
|
|
|
// pump m_socketWriteBuf to m_seoSocket
|
|
pumpSocketWriteBuf ( );
|
|
}
|
|
*/
|
|
|
|
// . this is the msg3a reply for related docids only
|
|
// . the full replies we get for determining ranks from scores for the
|
|
// HTML simulator, are handled in seo.cpp using State95::m_msg3a.
|
|
void XmlDoc::gotMsg3aReplyForFullQuery ( ) {
|
|
|
|
int32_t err = g_errno;
|
|
|
|
// save it so we know related docid generation had an error...
|
|
if ( g_errno && ! m_msg3aErrno )
|
|
m_msg3aErrno = g_errno;
|
|
|
|
setStatus ( "gotmsg3areplyforfullquery" );
|
|
|
|
if ( g_errno ) {
|
|
log("seopipe: got msg3a reply error: %s",mstrerror(g_errno));
|
|
g_errno = 0;
|
|
}
|
|
|
|
// try again for next guy
|
|
//m_triedCache = false;
|
|
|
|
// how many docids in the search results were returned to us?
|
|
int32_t numDocIds = m_msg3a->getNumDocIds();
|
|
// total # search results estimated
|
|
//int32_t numTotalResults = m_msg3a->getNumTotalEstimatedHits();
|
|
// get the query as we received it in the msg99 reply
|
|
//Msg99Reply *qp = (Msg99Reply *)m_msg3a->m_hackQPtr;
|
|
int32_t queryNum = (int32_t)m_msg3a->m_hackQNum;
|
|
|
|
// . point to the empty class we reserved in the buf
|
|
// . store score info into this class
|
|
//TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBuf();//Start();
|
|
// ensure enough room
|
|
//if ( m_topDocIdsBuf.getAvail() < sizeof(TopDocIds) )
|
|
// m_topDocIdsBuf.reserve(sizeof(TopDocIds) )
|
|
|
|
// get next available spot to store this
|
|
TopDocIds *td = (TopDocIds *)m_topDocIdsBuf.getBuf();
|
|
int32_t tdnum = m_topDocIdsBuf.length() / sizeof(TopDocIds);
|
|
m_topDocIdsBuf.incrementLength(sizeof(TopDocIds));
|
|
if ( m_topDocIdsBuf.length() > m_topDocIdsBuf.m_capacity ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
QueryLink *qks = (QueryLink *)m_matchingQueryBuf.getBufStart();
|
|
QueryLink *qk = &qks[queryNum];
|
|
|
|
// the relateddocidnum hack
|
|
if ( tdnum > 32000 ) { char *xx=NULL;*xx=0; }
|
|
qk->m_relatedDocIdNum = tdnum;
|
|
|
|
// store reply info, like # docids, in the query ptr
|
|
int32_t max = numDocIds;
|
|
if ( max > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS )
|
|
max = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
|
|
td->m_numDocIds = max;
|
|
|
|
// QueryLink # in the m_matchingQueryBuf buffer we represent
|
|
td->m_queryNum = queryNum;
|
|
|
|
// keep it clean
|
|
//qp->m_docIdVotes = 0;
|
|
|
|
// get the query base hash and use that to
|
|
// dedup. the query base hash ignores common
|
|
// words and converts words to their synonym
|
|
// with the smallest hash
|
|
//int64_t qbh = getQueryBaseHash(qstr);
|
|
|
|
//m_msg3a->m_hackQNum = m_queryNum;
|
|
//m_msg3a->m_hackQPtr = (char *)qp;
|
|
|
|
// count replies
|
|
m_numMsg3aReplies++;
|
|
|
|
// log to log as well
|
|
//char tmp[50000];
|
|
SafeBuf tmp;
|
|
//char *p = tmp;
|
|
tmp.safePrintf(
|
|
"seopipe: got list of %" INT32 " related docids for "
|
|
"qrynum=%" INT32 " "
|
|
//"of%" INT32 ""
|
|
"numDocids=%" INT32 " "
|
|
"query=\"",
|
|
numDocIds,
|
|
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
|
|
//m_maxFullQueries ,
|
|
td->m_numDocIds);
|
|
char *qqq = qk->getQueryString(&m_matchingQueryStringBuf);
|
|
tmp.safeStrcpy(qqq);
|
|
tmp.safePrintf("\" (err=%s)",
|
|
mstrerror(err));
|
|
// log each docid
|
|
//for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
// //float score = m_msg3a->getScores()[i];
|
|
// int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
|
|
// //int32_t sh32 = m_msg3a->getSiteHash32(i);
|
|
// p += sprintf(p,"d%" INT32 "=%" INT64 " ",i,d);
|
|
//}
|
|
char *msg = tmp.getBufStart();
|
|
log("%s",msg);
|
|
|
|
/*
|
|
// int16_tcut. pumpSocket() sends the contents of this to m_seoSocket
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
|
|
sb->safePrintf(
|
|
"\t<seoQueryScoreInfo>\n"
|
|
"\t\t<queryNum>%" INT32 "</queryNum>\n"
|
|
"\t\t<numTotalEstimatedSearchResults>%" INT32 ""
|
|
"</numTotalEstimatedSearchResults>\n"
|
|
"\t\t<numDocIds>%" INT32 "</numDocIds>\n"
|
|
, m_msg3a->m_hackQNum
|
|
, numTotalResults
|
|
, numDocIds
|
|
);
|
|
*/
|
|
// print the top 50 scores
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
float score = m_msg3a->m_scores[i];//getScores()[i];
|
|
int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
|
|
int32_t sh26 = m_msg3a->getSiteHash26(i);
|
|
/*
|
|
sb->safePrintf("\t\t<searchResult>\n");
|
|
sb->safePrintf("\t\t\t<rank>%" INT32 "</rank>\n",i+1);
|
|
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
|
|
sb->safePrintf("\t\t\t<docId>%" INT64 "</docId>\n",d);
|
|
sb->safePrintf("\t\t\t<siteHash32>%" UINT32 "</siteHash32>\n",sh32);
|
|
sb->safePrintf("\t\t</searchResult>\n");
|
|
*/
|
|
// store results for this Msg99Reply
|
|
td->m_topDocIds[i] = d;
|
|
td->m_topScores[i] = score;
|
|
td->m_topSiteHashes26[i] = sh26;
|
|
}
|
|
// reset rest so it prints pretty on gdb debug print cmd
|
|
for ( int32_t i = max ; i < (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; i++ ) {
|
|
td->m_topDocIds[i] = 0LL;
|
|
td->m_topScores[i] = 0.0;
|
|
td->m_topSiteHashes26[i] = 0;
|
|
}
|
|
|
|
/*
|
|
sb->safePrintf("\t</seoQueryScoreInfo>\n");
|
|
*/
|
|
|
|
// give front-end the progress bar info
|
|
if ( m_seoSocket && m_progressBar ) {
|
|
// tmp buf
|
|
char tmp[16];
|
|
float percent = (float)m_numMsg3aReplies ;
|
|
//percent /= (float)m_maxFullQueries;
|
|
percent *= 100.0;
|
|
// these are 80% of the pipeline if getting competitor
|
|
// backlinks
|
|
if ( m_progressBar == 2 ) percent *= .80;
|
|
int32_t percentLong = (int32_t)percent;
|
|
if ( percentLong >= 100 ) percentLong = 99;
|
|
int32_t tmpLen = sprintf(tmp,"%02" INT32 "%%",percentLong);
|
|
if ( tmpLen !=3)log("seo: bad progress bar output %" INT32 "",tmpLen);
|
|
// try a send on non-blocking socket
|
|
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
|
|
if ( n != tmpLen ) log("seo: bad progress bar send %" INT32 "",n);
|
|
// forget error
|
|
errno = 0;
|
|
}
|
|
}
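
// Each full-query msg3a reply handled above appends one TopDocIds slot
// to m_topDocIdsBuf and records that slot's index in the matching
// QueryLink's m_relatedDocIdNum (the "relateddocidnum hack"),
// presumably so the related-docid logic can later map a matching query
// back to the top results it returned.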
|
|
|
|
bool XmlDoc::clientClosedConnection ( ) {
|
|
|
|
if ( ! m_seoSocket ) return false;
|
|
|
|
if ( m_clientClosed ) return true;
|
|
|
|
if ( g_now - m_lastCheckTime < 50 ) return m_clientClosed;
|
|
|
|
m_lastCheckTime = g_now;
|
|
|
|
char buffer[100];
|
|
if ( recv(m_seoSocket->m_sd,buffer,99,MSG_PEEK|MSG_DONTWAIT) == 0 ) {
|
|
m_clientClosed = true;
|
|
log("xmldoc: CLIENT CLOSED CONNECTION!!");
|
|
}
|
|
|
|
return m_clientClosed;
|
|
}
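
/*
// Minimal standalone sketch (illustrative only) of the peer-close
// detection used in clientClosedConnection() above: a non-blocking
// MSG_PEEK recv() returns 0 only when the client has closed its end of
// the TCP connection; -1 with EAGAIN just means no data is pending yet.
#include <sys/socket.h>
static bool peerClosed ( int sd ) {
	char buf[100];
	int n = recv ( sd , buf , sizeof(buf) , MSG_PEEK | MSG_DONTWAIT );
	return ( n == 0 );
}
*/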
|
|
|
|
// . returns -1 if blocked, NULL with g_errno set on error
|
|
// . we do this to get related docids
|
|
SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
|
|
|
|
setStatus ( "getmatchingqueriesscoredforfullquery" );
|
|
|
|
// just re-use the same m_queryPtrs SafeBuf we used above but we
|
|
// set the Msg99Reply::m_myScore here and sort them by that
|
|
if ( m_queryPtrsWholeValid )
|
|
return &m_matchingQueryBuf;
|
|
|
|
// get the queries sorted by the url: | scores for our main url
|
|
SafeBuf *mq = getMatchingQueryBuf();
|
|
if ( mq == NULL || mq == (void *)-1 ) return mq;
|
|
|
|
// setup timer
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
if ( ! m_beginTimeFullQueries )
|
|
m_beginTimeFullQueries = gettimeofdayInMilliseconds();
|
|
|
|
// this buffer holds a ptr to each query in each msg99 reply we
|
|
// received from all hosts in the network
|
|
QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
int32_t nks = mq->length()/sizeof(QueryLink);
|
|
|
|
int32_t maxFullQueries = 50;
|
|
int32_t tneed = maxFullQueries * sizeof(TopDocIds);
|
|
if ( m_topDocIdsBuf.length() == 0 && ! m_topDocIdsBuf.reserve(tneed) )
|
|
return NULL;
|
|
|
|
// . now launch msg3as at them
|
|
// . this is 60k so new it here
|
|
if ( ! m_msg3a ) {
|
|
// reset the query # we are processing
|
|
m_queryNum = 0;
|
|
m_numMsg3aRequests = 0;
|
|
m_numMsg3aReplies = 0;
|
|
if ( ! m_fullQueryDedup.set(8,0,256,NULL,0,
|
|
false,m_niceness,"fqdd"))
|
|
return NULL;
|
|
try { m_msg3a = new ( Msg3a ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_msg3a, sizeof(Msg3a),"xdmsg3a");
|
|
// need this too now i guess since it is 65k
|
|
try { m_query3a = new ( Query ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_query3a, sizeof(Query),"xdqry3a");
|
|
}
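// . the loop below launches the full queries one at a time through the
//   single m_msg3a; when a reply comes back, gotMsg3aReplyForFullQueryWrapper
//   presumably re-enters this function via the master loop and we fall
//   through to here again to launch the next one or, once all requests have
//   replied, free m_msg3a/m_query3a and set m_queryPtrsWholeValid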
|
|
|
|
|
|
loop:
|
|
|
|
// breathe in case we hit all cache
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// have we launched all the requests we need to
|
|
bool exhausted = false;
|
|
if ( m_queryNum >= nks ) exhausted = true;
|
|
if ( m_numMsg3aRequests >= maxFullQueries ) exhausted = true;
|
|
// if client closed browser connection by hitting the stop sign
|
|
// then stop here!
|
|
if ( clientClosedConnection() )
|
|
m_hadMatchError = (int)ESOCKETCLOSED;
|
|
|
|
if ( m_hadMatchError ) exhausted = true;
|
|
|
|
// if nothing to launch
|
|
if ( exhausted &&
|
|
// and all replies received
|
|
m_numMsg3aReplies >= m_numMsg3aRequests ) {
|
|
// nuke the msg3a to save mem
|
|
mdelete ( m_msg3a, sizeof(Msg3a) , "msg3a" );
|
|
delete ( m_msg3a );
|
|
m_msg3a = NULL;
|
|
mdelete ( m_query3a , sizeof(Query), "qry3a" );
|
|
delete ( m_query3a );
|
|
m_query3a = NULL;
|
|
// time it
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginTimeFullQueries;
|
|
log("seopipe: time: fullqueries took %" INT64 " ms",took);
|
|
// force closed?
|
|
if ( m_hadMatchError ) return NULL;
|
|
// we are done!
|
|
m_queryPtrsWholeValid = true;
|
|
return &m_matchingQueryBuf;//queryPtrs;
|
|
}
|
|
|
|
// if nothing to launch wait for all replies
|
|
if ( exhausted )
|
|
return (SafeBuf *)-1;
|
|
|
|
// get the current query to process
|
|
//Msg99Reply *qp = queryPtrs[m_queryNum];
|
|
QueryLink *qk = &qks[m_queryNum];
|
|
|
|
int32_t savedQueryNum = m_queryNum;
|
|
|
|
QueryLogEntry *qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
|
|
|
|
// shortcut
|
|
//int64_t h64 = qk->m_querySynBaseHash64;
|
|
int64_t h64 = getSynBaseHash64 ( qe->getQueryString(),qe->m_langId);
|
|
|
|
// . if we already did a similar query, then skip it
|
|
// . Msg99Reply::m_topDocIds will be NULL so getRelatedDocIds() will
|
|
// know we skipped this query and to ignore it
|
|
if ( m_fullQueryDedup.isInTable(&h64) ) {
|
|
m_queryNum++;
|
|
goto loop;
|
|
}
|
|
|
|
// or if importance is 0, which means to ignore!
|
|
if ( qk->m_queryImportance <= 0.0 ) {
|
|
m_queryNum++;
|
|
goto loop;
|
|
}
|
|
|
|
// shortcut
|
|
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
|
|
|
|
// sanity
|
|
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
|
|
// this is required for synonyms!
|
|
// TODO: use whatever language the query is!!!
|
|
uint8_t langId = langEnglish;
|
|
|
|
// shortcut
|
|
int32_t qlen = gbstrlen(qstr);
|
|
|
|
//int32_t collLen = gbstrlen(cr->m_coll);
|
|
// set the request
|
|
m_mr2.reset();
|
|
m_mr2.ptr_query = qstr;
|
|
m_mr2.size_query = qlen+1;
|
|
//m_mr2.ptr_coll = cr->m_coll;
|
|
//m_mr2.size_coll = collLen+1;
|
|
m_mr2.m_collnum = cr->m_collnum;
|
|
m_mr2.m_queryExpansion = 1;
|
|
m_mr2.m_language = langId;
|
|
m_mr2.m_niceness = m_niceness;
|
|
// . get top 50 results now
|
|
// . then related docids will have to be in there
|
|
m_mr2.m_docsToGet = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
|
|
m_mr2.m_useSeoResultsCache = true;
|
|
// we do not need this, we just want the related docids/scores
|
|
m_mr2.m_getDocIdScoringInfo = false;
|
|
// use cache for 7 days since it is just for getting related docids
|
|
// right now. make sure that that cache saves to disk.
|
|
// MDW: why is this not working?
|
|
//m_mr2.m_maxAge = 86400 * 7;
|
|
//m_mr2.m_addToCache = true;
|
|
//m_mr2.m_debug = 1;
|
|
// prepend to the query?
|
|
int32_t ulen = m_firstUrl.m_ulen;
|
|
// go to next guy if this query is too big already
|
|
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
|
|
m_queryNum++;
|
|
goto loop;
|
|
}
|
|
|
|
// support for the new TopDocIds class which holds detailed search
|
|
// results for selected matching queries QueryLinks
|
|
//int32_t maxt = numQueryPtrs;
|
|
//if ( maxt > m_maxQueries ) maxt = m_maxQueries;
|
|
//if ( ! maxt ) { char *xx=NULL;*xx=0; }
|
|
// we also need the top docids
|
|
//if ( ! m_topDocIdsBuf.m_capacity ) {
|
|
// int32_t need = sizeof(TopDocIds) * (int32_t)MAX_MATCHING_QUERIES;
|
|
// if ( ! m_topDocIdsBuf.reserve ( need ,"tdbuf" ) ) return NULL;
|
|
// //m_nextAvailTopDocIdsOffset = 0;// = m_topDocIdsBuf;
|
|
//}
|
|
// make matching query, "qk", point to the topdocids that we
|
|
// will fill in when we execute this query in full below
|
|
// sanity!
|
|
//int32_t off3 = m_nextAvailTopDocIdsOffset ;
|
|
//if ( off3/(int32_t)sizeof(TopDocIds)>=maxt){char *xx=NULL;*xx=0;}
|
|
// seo.cpp's handleRequest99() should have set it to -1
|
|
//if ( qp->m_topDocIdsBufOffset != -1 ) { char *xx=NULL;*xx=0; }
|
|
// assign this TopDocIds class to this query ptr now
|
|
//qp->m_topDocIdsBufOffset = m_nextAvailTopDocIdsOffset;
|
|
// get that ptr to reset its count to 0
|
|
//TopDocIds *ttt = qp->getTopDocIds(&m_topDocIdsBuf);
|
|
//ttt->m_numDocIds = 0;
|
|
// inc it
|
|
//m_nextAvailTopDocIdsOffset += sizeof(TopDocIds);
|
|
// update length since we store topdocids buf based on its m_length
|
|
//m_topDocIdsBuf.setLength ( m_nextAvailTopDocIdsOffset );
|
|
|
|
// advance for next guy
|
|
m_queryNum++;
|
|
|
|
// add it to dedup table
|
|
if ( ! m_fullQueryDedup.addKey(&h64) ) {
|
|
m_hadMatchError = g_errno;
|
|
goto loop;
|
|
}
|
|
|
|
// mark it out
|
|
m_numMsg3aRequests++;
|
|
|
|
// . set the query class for msg3a
|
|
// . queryExpansion = true
|
|
m_query3a->set2 ( qstr , langId , true );
|
|
|
|
// a debug thing
|
|
m_query3a->m_containingParent = (void *)this;
|
|
|
|
// secret variable latchon
|
|
m_msg3a->m_hack = this;
|
|
|
|
m_msg3a->m_hackQNum = savedQueryNum;
|
|
m_msg3a->m_hackQPtr = NULL;//(char *)qp;
|
|
|
|
// note it
|
|
setStatus("launching msg3a");
|
|
|
|
// . get the docIds
|
|
// . this sets m_msg3a.m_clusterLevels[] for us
|
|
// . it sends a msg39 request to each alive host in the network
|
|
bool status = m_msg3a->getDocIds ( &m_mr2,
|
|
m_query3a,
|
|
this,//m_msg3a , // this ,
|
|
gotMsg3aReplyForFullQueryWrapper);
|
|
// return false if msg3a blocked
|
|
if ( ! status ) return (SafeBuf *)-1;
|
|
// error?
|
|
if ( g_errno ) {
|
|
m_hadMatchError = g_errno;
|
|
m_numMsg3aReplies++;
|
|
goto loop;
|
|
}
|
|
// i guess did not block... can this happen? cached?
|
|
//log("xmldoc: msg3a did not block");
|
|
// not supported yet. we need to process reply.
|
|
//char *xx=NULL;*xx=0;
|
|
// yeah, msg17 in there can cache in seoresults cache now
|
|
gotMsg3aReplyForFullQuery();
|
|
// try looping
|
|
goto loop;
|
|
}
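// . rdCmp: qsort comparator for RelatedDocId, highest m_relatedWeight first
// . lkCmp: qsort comparator for QueryNumLinkedNode ptrs, ranked nodes first
//   (lowest m_relatedDocIdRank wins), unranked nodes fall back to highest
//   m_relatedDocIdSerpScore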
|
|
|
|
static int rdCmp ( const void *a, const void *b ) {
	RelatedDocId *da = (RelatedDocId *)a;
	RelatedDocId *db = (RelatedDocId *)b;
	// get scores
	float scorea = da->m_relatedWeight;//dotProduct;//similarityScore;
	float scoreb = db->m_relatedWeight;//dotProduct;//similarityScore;
	if ( scorea < scoreb ) return 1;
	if ( scorea > scoreb ) return -1;
	return 0;
}
|
|
|
|
static int lkCmp ( const void *a, const void *b ) {
	QueryNumLinkedNode *ka = *(QueryNumLinkedNode **)a;
	QueryNumLinkedNode *kb = *(QueryNumLinkedNode **)b;
	// get ranks
	int32_t ra = ka->m_relatedDocIdRank;
	int32_t rb = kb->m_relatedDocIdRank;
	if ( ra >= 0 && rb >= 0 ) {
		if ( ra < rb ) return -1;
		if ( ra > rb ) return 1; // swap
	}
	if ( ra >= 0 ) return -1;
	if ( rb >= 0 ) return 1; // swap
	// if neither ranked, go by serp score i guess
	float sa = ka->m_relatedDocIdSerpScore;
	float sb = kb->m_relatedDocIdSerpScore;
	if ( sa > sb ) return -1;
	if ( sa < sb ) return 1; // swap
	return 0;
}
|
|
|
|
// buf is an array of RelatedDocId members
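// . scans every TopDocIds entry collected above and folds each top-ranked
//   docid into m_relatedDocIdBuf via addRelatedDocIdInfo()
// . results from our own site (sitehash26) or domain (domhash26) are skipped
//   so a page never counts as its own competitor, and a few hardcoded docids
//   (twitter, facebook, cnn video) are skipped because they rank for
//   everything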
|
|
SafeBuf *XmlDoc::getRelatedDocIds ( ) {
|
|
|
|
setStatus ( "getrelateddocids" );
|
|
|
|
if ( m_relatedDocIdBufValid )
|
|
return &m_relatedDocIdBuf;
|
|
|
|
// get the full replies with the top 50 docids and scores listed
|
|
// for each query. should be sorted by m_myScore.
|
|
SafeBuf *mq = getMatchingQueriesScoredForFullQuery ( );
|
|
if ( ! mq || mq == (void *)-1 ) return mq;
|
|
|
|
// . how many queries do we have that match this url?
|
|
// . they should be sorted by our url's score
|
|
//QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
//int32_t nks = mq->length()/sizeof(QueryLink);
|
|
|
|
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SafeBuf *)sh32;
|
|
|
|
int32_t dh32 = getDomHash32();
|
|
|
|
//if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
int32_t ourSiteHash26 = *sh32 & 0x03ffffff;
|
|
int32_t ourDomHash26 = dh32 & 0x03ffffff;
|
|
|
|
// for deduping queries with the same "base hash" we do not want
|
|
// them to count twice for RelatedDocId::m_numCommonQueries
|
|
//HashTableX dedup;
|
|
//if ( ! dedup.set(8,0,1024,NULL,0,false,0,"dddtab"))
|
|
// return NULL;
|
|
|
|
// scan the top docids
|
|
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
|
|
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
|
|
for ( int32_t i = 0 ; i < ntds ; i++ ) {
|
|
TopDocIds *td = &tds[i];
|
|
int32_t queryNum = td->m_queryNum;
|
|
//QueryLink *qk = &qks[queryNum];
|
|
// sanity
|
|
int32_t nd = td->m_numDocIds;
|
|
if( nd < 0) { char *xx=NULL;*xx=0; }
|
|
if( nd > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS){
|
|
char *xx=NULL;*xx=0;}
|
|
// get main url score for query
|
|
//float ourScore = qp->m_myScore;
|
|
// and the score of the top result
|
|
//float normScore = td->m_topScores[0];
|
|
// norm main url score
|
|
//ourScore /= normScore;
|
|
// scan the top 50 (or more) docids for this query
|
|
for ( int32_t j = 0 ; j < nd ; j++ ) {
|
|
// . do not allow related docid (aka competitor page)
|
|
// to be from our site! will make sure we exclude
|
|
// our url itself, too. otherwise the competitor
// backlinks feature would mention links that already point to us, and
|
|
// we don't care about that, we already have the
|
|
// link. we just want to see recommended backlinks
|
|
// we do not yet have, so we can get them.
|
|
// . skip it if from our same sitehash26
|
|
if ( td->m_topSiteHashes26[j] == ourSiteHash26 )
|
|
continue;
|
|
// fix cheatcodes.com being a competitor page when
|
|
// our main url is www.cheatcodes.com
|
|
if ( td->m_topSiteHashes26[j] == ourDomHash26 )
|
|
continue;
|
|
// skip twitter facebook, etc
|
|
int64_t docId = td->m_topDocIds[j];
|
|
if ( docId == 114607849462LL || // https://www.twitter
|
|
docId == 273941610476LL || // twitter.com
|
|
docId == 1628437294LL || // facebook.com
|
|
docId == 146394931444LL ) // cnn.com/video/
|
|
continue;
|
|
// add RelatedDocId into m_relatedDocIdBuf and/or
|
|
// augment its linked list of query/score pairs
|
|
addRelatedDocIdInfo ( td->m_topDocIds[j],
|
|
queryNum ,
|
|
td->m_topScores[j], // score
|
|
j , // rank
|
|
td->m_topSiteHashes26[j] );
|
|
}
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// this is now in getRelatedDocIdsScored()!!!!!!!
|
|
/*
|
|
char *rdbuf = m_relatedDocIdBuf.getBufStart();
|
|
int32_t numDocIds = m_relatedDocIdBuf.length()/sizeof(RelatedDocId);
|
|
// now sort by RelatedDocId::m_relatedWeight
|
|
qsort ( rdbuf , numDocIds, sizeof(RelatedDocId),qp99docIdCmp );
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// limit to top MAX_RELATED_DOCIDS related docids
|
|
// will take longer to get titles/urls and related queries the
|
|
// higher this number is, but we will have more competitor backlinks
|
|
// and terms etc.
|
|
int32_t maxLen = sizeof(RelatedDocId) * MAX_RELATED_DOCIDS;
|
|
int32_t currentLen = m_relatedDocIdBuf.length();
|
|
if ( currentLen > maxLen ) currentLen = maxLen;
|
|
m_relatedDocIdBuf.setLength(currentLen);
|
|
numDocIds = currentLen / sizeof(RelatedDocId);
|
|
*/
|
|
|
|
int32_t numDocIds = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
|
|
/*
|
|
// log out for debug
|
|
char *rdbuf = m_relatedDocIdBuf.getBufStart();
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf;
|
|
for ( int32_t i = 0 ; g_conf.m_logDebugSEO && i < numDocIds ; i++ ) {
|
|
log("seopipe: related docId #%" INT32 " docid=%" INT64 " "
|
|
"score=?? common=%" INT32 "",
|
|
i,
|
|
rds[i].m_docId,
|
|
//rds[i].m_relatedWeight,//dotProduct, // similarityScore,
|
|
rds[i].m_numCommonQueries);
|
|
}
|
|
*/
|
|
|
|
log("seo: got %" INT32 " related docids in buf",numDocIds);
|
|
|
|
m_relatedDocIdBufValid = true;
|
|
return &m_relatedDocIdBuf;
|
|
}
|
|
|
|
|
|
// used as part of the msg4f request
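// . serializes the top matching queries as (int32_t queryNum, NUL-terminated
//   query string) pairs into m_topMatchingQueryBuf; handleRequest4f() on the
//   remote host presumably parses them back out in this same order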
|
|
SafeBuf *XmlDoc::getTopMatchingQueryBuf ( ) {
|
|
|
|
if ( m_topMatchingQueryBufValid )
|
|
return &m_topMatchingQueryBuf;
|
|
|
|
// scan matching queries that we evaluated fully using msg3a
|
|
SafeBuf *qkbuf = getMatchingQueriesScoredForFullQuery ( );
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
|
|
//int32_t nks = qkbuf->length()/sizeof(QueryLink);
|
|
|
|
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
|
|
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
|
|
for ( int32_t i = 0 ; i < ntds ; i++ ) {
|
|
TopDocIds *td = &tds[i];
|
|
int32_t queryNum = td->m_queryNum;
|
|
QueryLink *qk = &qks[queryNum];
|
|
// ok, get it
|
|
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
|
|
int32_t qlen = gbstrlen(qstr);
|
|
// store query #
|
|
if ( ! m_topMatchingQueryBuf.pushLong(queryNum) )
|
|
return NULL;
|
|
// then query
|
|
if ( ! m_topMatchingQueryBuf.safeMemcpy(qstr,qlen+1))
|
|
return NULL;
|
|
}
|
|
|
|
m_topMatchingQueryBufValid = true;
|
|
return &m_topMatchingQueryBuf;
|
|
}
|
|
|
|
|
|
|
|
static void gotMsg4fReplyWrapper ( void *state , UdpSlot *slot ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// a bit of a hack
	THIS->m_savedSlot = slot;
	// ultimately, getRelatedDocIdsScored() will be called from this
	THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
// . lets just put everything in this one function
|
|
// . launch a msg4f request for each relateddocid
|
|
// . get the msg4f reply back and add the positive scoring queries to the
|
|
// related docids linked list of QueryNumLinkedNodes in the
|
|
// m_commonQueryNumBuf, avoid dups.
|
|
// . then score each related docid by calling setRelatedDocIdScores()
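// . the msg4f reply parsed below is a flat array of fixed-size records:
//   int32_t queryNum, int64_t docId, float serp score (16 bytes per record)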
|
|
SafeBuf *XmlDoc::getRelatedDocIdsScored ( ) {
|
|
|
|
setStatus ( "getrelateddocidsscored");
|
|
|
|
if ( m_relatedDocIdsScoredBufValid ) {
|
|
// and return the buf of RelatedDocIds
|
|
return &m_relatedDocIdBuf;
|
|
}
|
|
|
|
// what docids share our TOP-scoring matching queries?
|
|
SafeBuf *rdbuf = getRelatedDocIds();
|
|
if ( ! rdbuf || rdbuf == (void *)-1) return (SafeBuf *) rdbuf;
|
|
|
|
SafeBuf *tmq = getTopMatchingQueryBuf();
|
|
if ( ! tmq || tmq == (void *)-1) return (SafeBuf *) tmq;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// the top 50 or so matching queries will each be scored for
|
|
// every related docid we have in m_relatedDocIdBuf. these are
|
|
// the same queries we got the full results for above!!!
|
|
// we have to score them for each related docid here because we only
|
|
// get the top 300 or so results above for each one. so if the
|
|
// related docid matched the query but was not in the top 300 results,
|
|
// it would have appeared to NOT match the query. bad. that was
|
|
// causing google to come up high in related docids because it
|
|
// ranked high for so many generic queries. and the other good
|
|
// related docids did not rank in the top 300 for those same
|
|
// generic queries. so at least this logic will show that the
|
|
// related docids do indeed match those generic queries, too.
|
|
// and they will get higher scores (RelatedDocId::m_relatedWeight)
|
|
|
|
// we must be an incoming reply if we already sent out all the requests
|
|
if ( m_numMsg4fRequests > 0 ) {
|
|
// increment our reply counter
|
|
m_numMsg4fReplies++;
|
|
// . m_savedSlot is a hack
|
|
// . now parse the reply and add QueryNumLinkedNode
|
|
// into m_commonQueryNumBuf.
|
|
char *p = m_savedSlot->m_readBuf;
|
|
char *pend = m_savedSlot->m_readBufSize + p;
|
|
// now scan the reply
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// the queryNum is relative to the m_queryPtrs array
|
|
// which has all the matching queries of this document,
|
|
// not just the "top" 50 matching queries by score.
|
|
int32_t queryNum = *(int32_t *)p;
|
|
// sanity
|
|
if ( queryNum<0 ) {char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
// then docid of related docid that had this score
|
|
int64_t docId = *(int64_t *)p;
|
|
p += 8;
|
|
// then score
|
|
float score = *(float *)p;
|
|
p += 4;
|
|
// this will add the query/score pair into the
|
|
// related docid buf. it will not add dups if already
|
|
// ranked!
|
|
addRelatedDocIdInfo ( docId ,
|
|
queryNum ,
|
|
score ,
|
|
-1 , // rank unknown
|
|
-1 ); // sitehash26 unknown
|
|
}
|
|
|
|
// return if awaiting more replies
|
|
if ( m_numMsg4fReplies < m_numMsg4fRequests )
|
|
return (SafeBuf *)-1;
|
|
|
|
// point to buffer of related docids
|
|
char *rdbuf = m_relatedDocIdBuf.getBufStart();
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf;
|
|
int32_t nr = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
|
|
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
|
// shortcut
|
|
RelatedDocId *rd = &rds[i];
|
|
// now score it since we have all the serpscores for
|
|
// all top matching queries.
|
|
setRelatedDocIdWeightAndRank(rd);
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// now sort by RelatedDocId::m_relatedWeight
|
|
qsort ( rdbuf , nr , sizeof(RelatedDocId),rdCmp );
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// limit to top MAX_RELATED_DOCIDS related docids
|
|
// will take longer to get titles/urls and related queries the
|
|
// higher this number is, but we will have more competitor
|
|
// backlinks and terms etc. less space in cachedb too!
|
|
int32_t maxLen = MAX_RELATED_DOCIDS * sizeof(RelatedDocId);
|
|
int32_t newLen = m_relatedDocIdBuf.length();
|
|
if ( newLen > maxLen ) newLen = maxLen;
|
|
m_relatedDocIdBuf.setLength(newLen);
|
|
|
|
//
|
|
// make a new buffer for m_commonQueryNumBuf just for the
|
|
// related docids we picked, and sort them by rel docid rank.
|
|
// so it will be smaller and sorted.
|
|
//
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( m_commonQueryNumBuf.length() ) )
|
|
return NULL;
|
|
// scan each related docid in the top 300 or so
|
|
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
|
// shortcut
|
|
RelatedDocId *rd = &rds[i];
|
|
// store ptrs to query nums so we can sort them
|
|
QueryNumLinkedNode *links[1024];
|
|
int32_t nn = 0;
|
|
int32_t fo = rd->m_firstCommonQueryNumOff;
|
|
char *base = m_commonQueryNumBuf.getBufStart();
|
|
// scan down the linked list and store ptrs to links[]
|
|
for ( ; fo >= 0 ; ) {
|
|
// cast it
|
|
QueryNumLinkedNode *qn;
|
|
qn = (QueryNumLinkedNode *)(base + fo);
|
|
// point to next
|
|
fo = qn->m_nextOff;
|
|
// store this guy for sorting
|
|
links[nn] = qn;
|
|
nn++;
|
|
if ( nn >= 1024 ) break;
|
|
}
|
|
// now sort them by m_relatedDocIdRank
|
|
qsort( links, nn,sizeof(QueryNumLinkedNode *),lkCmp);
|
|
// point to our new linked list in tmpBuf, we will
|
|
// store them here.
|
|
rd->m_firstCommonQueryNumOff = tmpBuf.length();
|
|
QueryNumLinkedNode *prev = NULL;
|
|
// now store into tmpbuf
|
|
for ( int32_t k = 0 ; k < nn ; k++ ) {
|
|
QueryNumLinkedNode *qn = links[k];
|
|
int32_t size = sizeof(QueryNumLinkedNode);
|
|
if ( !tmpBuf.reserve(size) ) return NULL;
|
|
QueryNumLinkedNode *nn ;
|
|
nn = (QueryNumLinkedNode *)tmpBuf.getBuf();
|
|
int32_t clen = tmpBuf.length();
|
|
tmpBuf.safeMemcpy(qn,size);
|
|
// we are the previous guy's next node
|
|
if ( prev ) prev->m_nextOff = clen;
|
|
// assume nobody follows us
|
|
nn->m_nextOff = -1;
|
|
// we are now next guy's prev
|
|
prev = nn;
|
|
}
|
|
}
|
|
|
|
// now steal tmpbuf, and free our old stuff
|
|
m_commonQueryNumBuf.stealBuf ( &tmpBuf );
|
|
|
|
// i guess we are done now!
|
|
m_relatedDocIdsScoredBufValid = true;
|
|
return &m_relatedDocIdBuf;
|
|
}
|
|
|
|
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
|
|
// . there's a massive # of related docids at this point
|
|
// . possibly 50 x 300 = 15,000
|
|
// . so launch one msg4f for each host in our network
|
|
// . just specify all the related docids in the msg4f request and have
|
|
// the handleRequest4f() function in seo.cpp get the title rec.
|
|
// . make sure all docids are local to that host
|
|
// . dispatch the msg4f request to the machine that has that docid
|
|
// local so it can just hit disk
|
|
// . handleRequest4f() can follow the same logic as in
|
|
// getRelatedQueryLinks() which make a new xmldoc. then it can
|
|
// call newxd->getTermListBuf() instead of us passing it in.
|
|
// . so each host has a bin, a host bin
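// . each bin's request layout, as built below: the NUL-terminated coll name,
//   then an int32_t holding the length of the top-matching-query buf, then
//   that buf itself, then one int64_t docid per related docid stored on that
//   host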
|
|
//#ifdef __APPLE__
|
|
SafeBuf hostBin[MAX_HOSTS];
|
|
//#else
|
|
//SafeBuf hostBin[g_hostdb.m_numHosts];
|
|
//#endif
|
|
|
|
// scan the related docids and send the requests if we have not already
|
|
for ( int32_t i = 0 ; ! m_sentMsg4fRequests && i < numRelated ; i++ ) {
|
|
RelatedDocId *rd = &rds[i];
|
|
//uint32_t gid=g_hostdb.getGroupIdFromDocId (rd->m_docId);
|
|
// pick host in that group
|
|
//Host *group = g_hostdb.getGroup ( gid );
|
|
int32_t shardNum = getShardNumFromDocId ( rd->m_docId );
|
|
Host *group = g_hostdb.getShard ( shardNum );
|
|
int32_t nh = g_hostdb.m_numHostsPerShard;
|
|
int32_t hostNum = rd->m_docId % nh;
|
|
Host *h = &group[hostNum];
|
|
int32_t hostId = h->m_hostId;
|
|
// skip if dead
|
|
int32_t count = 0;
|
|
if ( g_hostdb.isDead(hostId) && h->m_wasEverAlive ) {
|
|
// increment hostnum if that one is dead
|
|
if ( ++hostNum >= nh ) hostNum = 0;
|
|
// set these again
|
|
h = &group[hostNum];
|
|
hostId = h->m_hostId;
|
|
// if all dead, just pick this one i guess
|
|
if ( ++count >= nh ) break;
|
|
}
|
|
// shortcut
|
|
SafeBuf *hbin = &hostBin[hostId];
|
|
// if bin is empty initialize
|
|
if ( hbin->length() == 0 ) {
|
|
// provide only collection to handleRequest4f()
|
|
if ( ! hbin->safeMemcpy(cr->m_coll,
|
|
gbstrlen(cr->m_coll)+1) )
|
|
return NULL;
|
|
// . store the queries we want it to evaluate
|
|
// . these are null-terminated query strings preceded
|
|
// by their corresponding query number in our
|
|
// m_queryPtrs[] array which pts to a Msg99Reply
|
|
if ( ! hbin->pushLong(tmq->length()))
|
|
return NULL;
|
|
if ( ! hbin->safeMemcpy(tmq))
|
|
return NULL;
|
|
}
|
|
// store this new docid, which is local to this host
|
|
if ( ! hbin->pushLongLong(rd->m_docId) ) return NULL;
|
|
}
|
|
|
|
// shotgun out the msg4f requests now
|
|
for ( int32_t i = 0 ;
|
|
! m_sentMsg4fRequests && i < g_hostdb.getNumHosts() ; i++ ) {
|
|
// shortcut
|
|
SafeBuf *hbin = &hostBin[i];
|
|
// get that host
|
|
Host *host = g_hostdb.getHost(i);
|
|
// make a copy for sending out
|
|
SafeBuf copy;
|
|
if ( ! copy.safeMemcpy ( hbin ) ) continue;
|
|
// get the bin copy
|
|
char *req = copy.getBufStart();
|
|
int32_t reqSize = copy.length();
|
|
// detach it so udpserver can free it when done transmitting
|
|
copy.detachBuf ();
|
|
// free this guy now i guess
|
|
hbin->purge();
|
|
// count as launched
|
|
m_numMsg4fRequests++;
|
|
// launch it
|
|
if ( ! g_udpServer.sendRequest ( req ,
|
|
reqSize,
|
|
0x4f , // msgtype
|
|
host->m_ip , // ip
|
|
host->m_port , // port
|
|
host->m_hostId,
|
|
NULL, // retslot
|
|
this,
|
|
gotMsg4fReplyWrapper,
|
|
10000 , // timeout
|
|
-1 , // backoff
|
|
-1 , // maxwait
|
|
NULL, // replybuf
|
|
0, // replybufmaxsize
|
|
m_niceness // niceness
|
|
)) {
|
|
// let admin know about error
|
|
log("seopipe: sendRequest 4f had error: %s",
|
|
mstrerror(g_errno));
|
|
// count it as replied then
|
|
m_numMsg4fReplies++;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// do not re-send the requests
|
|
m_sentMsg4fRequests = true;
|
|
|
|
// wait for all replies to come in
|
|
if ( m_numMsg4fRequests > m_numMsg4fReplies ) return (SafeBuf *)-1;
|
|
|
|
// how can they all be done? all errors!
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
// remote host will alloc an xmldoc, about 1MB each...
|
|
#define MAX_OUT_MSG20S 30
|
|
|
|
// . like getRelatedDocIds() but with titles, etc.
|
|
// . return a list of competing docids/titles/etc.
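// . looks up each related docid with a Msg20 to get its url/title/site,
//   keeping at most MAX_OUT_MSG20S lookups in flight at once; m_rdCursor is
//   the next RelatedDocId to launch and m_numMsg20Replies counts completions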
|
|
SafeBuf *XmlDoc::getRelatedDocIdsWithTitles ( ) {
|
|
|
|
setStatus ( "getrelateddocidswithtitles" );
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_relatedDocIdsWithTitlesValid )
|
|
return &m_relatedDocIdBuf;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
SafeBuf *rdbuf = getRelatedDocIdsScored();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
// now look up each docid in titledb and store the url title
|
|
// into m_relatedTitleBuf safebuf and set the RelatedDocId::
|
|
// rd_title_off and rd_url_off into that when done. store offsets for
|
|
// now and make into full out ptrs when done in case the
|
|
// m_relatedTitleBuf reallocs.
|
|
|
|
if ( ! m_msg20Buf.length() ) {
|
|
int32_t need = sizeof(Msg20) * MAX_OUT_MSG20S ;
|
|
if ( ! m_msg20Buf.reserve ( need,"m20buf" ) ) return NULL;
|
|
// mark it all in use
|
|
m_msg20Buf.setLength(need);
|
|
// init them
|
|
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
|
|
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
|
|
// reset cursor to start with first related docid
|
|
m_rdCursor = 0;
|
|
m_relatedDocIdError = 0;
|
|
m_numMsg20Replies = 0;
|
|
}
|
|
|
|
// point to buffer of related docids
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
|
|
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
|
|
|
|
// scan the msg20s we allocated to see if any got a reply
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// shortcut
|
|
Msg20 *msg20 = &mp[i];
|
|
// skip if never launched
|
|
if ( ! msg20->m_launched ) continue;
|
|
// skip if it is in progress, awaiting its reply
|
|
if ( msg20->m_inProgress ) continue;
|
|
// get the reply from it (might be NULL iff g_errno is set)
|
|
Msg20Reply *reply = msg20->getReply(); // m_r
|
|
// get the corresponding related docid
|
|
int32_t hisCursor = msg20->m_hack2;
|
|
// shortcut
|
|
RelatedDocId *rd = &rds[hisCursor];
|
|
// ok, it has a reply. could be NULL if g_errno was set.
|
|
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , reply ) )
|
|
m_relatedDocIdError = g_errno;
|
|
// reset it for later use... or not...
|
|
msg20->reset();
|
|
// count reply as back now
|
|
m_numMsg20Replies++;
|
|
}
|
|
|
|
// launch more if we can. one launch per msg20.
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// no more related docids left to launch?
|
|
if ( m_rdCursor >= numRelated ) break;
|
|
// shortcut
|
|
Msg20 *msg20 = &mp[i];
|
|
// skip if already launched/inuse
|
|
if ( msg20->m_inProgress ) continue;
|
|
// get current related docid
|
|
RelatedDocId *rd = &rds[m_rdCursor];
|
|
// make the request
|
|
Msg20Request req;
|
|
//req.ptr_coll = cr->m_coll;
|
|
//req.size_coll = gbstrlen(cr->m_coll)+1;
|
|
req.m_collnum = cr->m_collnum;
|
|
req.m_docId = rd->m_docId;
|
|
req.m_expected = true;
|
|
req.m_niceness = m_niceness;
|
|
req.m_state = m_masterState;
|
|
req.m_callback2 = m_masterLoop;
|
|
// do not get summary stuff. too slow.
|
|
req.m_numSummaryLines = 0;
|
|
// if it has an outlink to our site/domain set
|
|
// Msg20Reply::m_hasLinkToOurDomOrHost
|
|
req.m_ourHostHash32 = getHostHash32a();
|
|
req.m_ourDomHash32 = getDomHash32();
|
|
// store cursor in msg20 itself so we know what rd it's using
|
|
msg20->m_hack2 = m_rdCursor;
|
|
// advance cursor!!!
|
|
m_rdCursor++;
|
|
// launch it
|
|
if ( ! msg20->getSummary ( &req ) ) continue;
|
|
// it did not block... wtf? g_errno might be set. ENOMEM?
|
|
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , NULL ) )
|
|
m_relatedDocIdError = g_errno;
|
|
// reset it
|
|
msg20->reset();
|
|
// count reply as back now
|
|
m_numMsg20Replies++;
|
|
// it is not launched
|
|
i--;
|
|
}
|
|
|
|
// wait for one reply per related docid
|
|
if ( m_numMsg20Replies < numRelated )
|
|
return (SafeBuf *)-1;
|
|
|
|
// call msg20 destructor
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
Msg20 *msg20 = &mp[i];
|
|
msg20->destructor();
|
|
}
|
|
// purge the mem they used
|
|
m_msg20Buf.purge();
|
|
|
|
// now we are done
|
|
m_relatedDocIdsWithTitlesValid = true;
|
|
m_relatedTitleBufValid = true;
|
|
|
|
// store it in cachedb
|
|
if ( ! storeRelatedDocIdsIntoCachedb( ))
|
|
return (SafeBuf *)-1;
|
|
|
|
return &m_relatedDocIdBuf;
|
|
}
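// . copies the url/title/site strings from a Msg20Reply into
//   m_relatedTitleBuf and records their offsets on the RelatedDocId
// . the related docid is dropped (offsets set to -1) if the lookup errored,
//   if it is one of the hardcoded too-popular root pages below, or if it
//   links back to our own domain or host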
|
|
|
|
|
|
bool XmlDoc::setRelatedDocIdInfoFromMsg20Reply ( RelatedDocId *rd ,
|
|
Msg20Reply *reply ) {
|
|
|
|
// get error. g_errno can be ENOTFOUND if titlerec not found
|
|
int32_t error = g_errno;
|
|
// . or could be EDOCBANNED/EDOCFILTERED etc.
|
|
// . if reply is NULL then g_errno MUST be set
|
|
if ( ! error ) error = reply->m_errno;
|
|
|
|
// shortcuts
|
|
char *urlStr = NULL;
|
|
char *titleStr = NULL;
|
|
char *siteStr = NULL;
|
|
|
|
if ( reply ) {
|
|
urlStr = reply->ptr_ubuf;
|
|
titleStr = reply->ptr_tbuf;
|
|
siteStr = reply->ptr_site;
|
|
}
|
|
|
|
// did that fail? i.e. docid not found!?!?!
|
|
if ( error ) {
|
|
// . just skip this asshole then
|
|
// . might be EDOCBANNED or EDOCFILTERED!
|
|
// . some are filtered because they are domain-only urls
|
|
// which should not be in the index because we force
|
|
// a "www." prepend on all urls now.
|
|
log("seo: msg20 reply for docid=%" INT64 " url=%s had "
|
|
"error: %s", rd->m_docId,urlStr,mstrerror(error));
|
|
// clear that
|
|
g_errno = 0;
|
|
ignoreRelatedDocId:
|
|
// mark them offsets as not-founds
|
|
rd->rd_title_off = -1;
|
|
rd->rd_url_off = -1;
|
|
rd->rd_site_off = -1;
|
|
return true;
|
|
}
|
|
|
|
// bar facebook.com and twitter.com roots... too popular for all!
|
|
// was coming up for jezebelgallery.com
|
|
if ( strcmp(urlStr,"http://www.twitter.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"https://www.twitter.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"http://www.facebook.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
// "/home.php?" or "home.*"
|
|
if ( strncmp(urlStr,"http://www.facebook.com/home.",29) == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"https://www.facebook.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"http://www.cnn.com/video/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
// fix robothits.com competitor pages
|
|
if ( strcmp(urlStr,"http://www.google.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
if ( strcmp(urlStr,"http://www.msn.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
// null means no title i guess
|
|
if ( ! titleStr ) titleStr = "";
|
|
|
|
// or if he links to us
|
|
if ( reply->m_hasLinkToOurDomOrHost ) {
|
|
log("seo: related docid=%" INT64 " url=%s links to our domain",
|
|
reply->m_docId,
|
|
urlStr);
|
|
goto ignoreRelatedDocId;
|
|
}
|
|
|
|
|
|
// store title
|
|
int32_t titleOffset = m_relatedTitleBuf.length();
|
|
if ( ! m_relatedTitleBuf.safeStrcpy ( titleStr ) ) return false;
|
|
m_relatedTitleBuf.pushChar('\0');
|
|
|
|
// then url
|
|
int32_t urlOffset = m_relatedTitleBuf.length();
|
|
if ( ! m_relatedTitleBuf.safeStrcpy ( urlStr ) ) return false;
|
|
m_relatedTitleBuf.pushChar('\0');
|
|
|
|
// then site
|
|
int32_t siteOffset = m_relatedTitleBuf.length();
|
|
if ( ! m_relatedTitleBuf.safeStrcpy ( siteStr ) ) return false;
|
|
m_relatedTitleBuf.pushChar('\0');
|
|
|
|
// then linkinfo
|
|
//int32_t linkInfo1Offset = m_relatedTitleBuf.length();
|
|
//if(!m_relatedTitleBuf.safeMemcpy(info1,info1->getSize()))return NULL;
|
|
|
|
// store as offset for easy serialization for storage into cachedb
|
|
//rd->m_linkInfo1Offset = linkInfo1Offset;
|
|
rd->m_relatedFirstIp = reply->m_firstIp;
|
|
rd->m_relatedCurrentIp = reply->m_ip;
|
|
rd->m_rd_siteRank = reply->m_siteRank;
|
|
rd->m_rd_langId = reply->m_language;
|
|
|
|
rd->m_rd_siteHash32 = 0;
|
|
if ( reply->ptr_site )
|
|
rd->m_rd_siteHash32 = hash32n ( reply->ptr_site );
|
|
|
|
// record the offsets of title/url/site in the m_relatedTitleBuf
|
|
rd->rd_title_off = titleOffset;
|
|
rd->rd_url_off = urlOffset;
|
|
rd->rd_site_off = siteOffset;
|
|
|
|
SafeBuf *rdbuf = getRelatedDocIds();
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
// log out for debug
|
|
log(LOG_DEBUG,
|
|
"seopipe: related docid (%" INT32 "of%" INT32 ") docid=%" INT64 " score=%f "
|
|
"title=\"%s\" url=\"%s\"",
|
|
m_numMsg20Replies,
|
|
numRelated-1,
|
|
rd->m_docId,
|
|
rd->m_relatedWeight,
|
|
titleStr,
|
|
urlStr);
|
|
|
|
return true;
|
|
}
|
|
/*
|
|
HashTableX *XmlDoc::getMatchingQueryHashTable ( ) {
|
|
|
|
setStatus ( "getmatchingqueryhashtable" );
|
|
|
|
if ( m_queryHashTableValid )
|
|
return &m_queryHashTable;
|
|
|
|
SafeBuf *qpbuf = getMatchingQueries(false);
|
|
if ( ! qpbuf || qpbuf == (void *)-1) return (HashTableX *)qpbuf;
|
|
|
|
// how many queries do we have that match this url?
|
|
Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
|
|
// init it
|
|
if ( ! m_queryHashTable.set(8,
|
|
0,
|
|
numQueryPtrs*4,
|
|
NULL,
|
|
0,
|
|
false,
|
|
m_niceness,
|
|
"qdht") )
|
|
return NULL;
|
|
|
|
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
|
|
// cast it
|
|
Msg99Reply *qp = qptrs[i];
|
|
// shortcut
|
|
int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
|
|
// hash it up
|
|
if ( ! m_queryHashTable.addKey ( &eh64 ) )
|
|
return NULL;
|
|
}
|
|
|
|
// all done
|
|
m_queryHashTableValid = true;
|
|
return &m_queryHashTable;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
HashTableX *XmlDoc::getMatchingQueryOffsetTable ( ) {
|
|
|
|
setStatus ( "getmatchingqueryoffsettable" );
|
|
|
|
if ( m_queryOffsetTableValid )
|
|
return &m_queryOffsetTable;
|
|
|
|
SafeBuf *qkbuf = getMatchingQueryBuf();
|
|
if ( ! qkbuf || qkbuf == (void *)-1) return (HashTableX *)qkbuf;
|
|
|
|
// how many queries do we have that match this url?
|
|
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
|
|
int32_t nks = qkbuf->length()/sizeof(QueryLink);
|
|
|
|
|
|
// init it
|
|
if ( ! m_queryOffsetTable.set(8,
|
|
0,
|
|
nks*4,
|
|
NULL,
|
|
0,
|
|
false,
|
|
m_niceness,
|
|
"qdot") )
|
|
return NULL;
|
|
|
|
for ( int32_t i = 0 ; i < nks ; i++ ) {
|
|
// cast it
|
|
QueryLink *qk = &qks[i];
|
|
// shortcut
|
|
//int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
|
|
int64_t eh64 = qk->m_replyingHostId;
eh64 <<= 32;
eh64 |= qk->m_qbufOffset;
|
|
// hash it up
|
|
if ( ! m_queryOffsetTable.addKey ( &eh64 ) )
|
|
return NULL;
|
|
}
|
|
|
|
// all done
|
|
m_queryOffsetTableValid = true;
|
|
return &m_queryOffsetTable;
|
|
}
|
|
|
|
//static char *s_base = NULL;
|
|
|
|
// related QUERY compare
|
|
int qp99relatedCmp ( const void *a, const void *b ) {
|
|
// these are offsets
|
|
//int32_t offa = *(int32_t *)a;
|
|
//int32_t offb = *(int32_t *)b;
|
|
QueryLink *qa = *(QueryLink **)a;
|
|
QueryLink *qb = *(QueryLink **)b;
|
|
// make sure manually added queries are on top
|
|
//if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
|
|
//if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
|
|
//QueryInfo *qia = &qa->m_queryInfo;
|
|
//QueryInfo *qib = &qb->m_queryInfo;
|
|
// get scores
|
|
float scorea = qa->m_rq_totalScore;
|
|
float scoreb = qb->m_rq_totalScore;
|
|
if ( scorea < scoreb ) return 1;
|
|
if ( scorea > scoreb ) return -1;
|
|
//return 0;
|
|
// let docidsincommon break ties
|
|
return qb->m_docIdVotes - qa->m_docIdVotes;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
static int qlCmp ( const void *a, const void *b ) {
|
|
QueryLink *qa = (QueryLink *)a;
|
|
QueryLink *qb = (QueryLink *)b;
|
|
|
|
// let docid break ties
|
|
int64_t da = qa->getRelatedDocId(s_rdBuf)->m_docId;
|
|
int64_t db = qb->getRelatedDocId(s_rdBuf)->m_docId;
|
|
|
|
//int64_t da = qa->m_relatedDocId->m_docId;
|
|
//int64_t db = qb->m_relatedDocId->m_docId;
|
|
|
|
// always niceness 1 i guess
|
|
QUICKPOLL(1);
|
|
|
|
if ( da > db )
|
|
return 1; // 1 means to swap!
|
|
if ( da < db )
|
|
return -1;
|
|
return 0;
|
|
}
|
|
*/
|
|
|
|
#include <math.h> // sqrtf()

// now we can do square roots in gdb by calling this
float gbsqrt ( float x ) {
	return sqrtf(x);
}
|
|
|
|
|
|
/*
|
|
// sort the related query links intersected buf by docid
|
|
QueryLink *ptrs;
|
|
ptrs = (QueryLink *)m_relatedQueryLinksIntersected.getBufStart();
|
|
int32_t nk = m_relatedQueryLinksIntersected.length() / sizeof(QueryLink);
|
|
qsort ( ptrs ,
|
|
nk,
|
|
sizeof(QueryLink),
|
|
qlCmp );
|
|
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - start;
|
|
log("seopipe: time: relatedqueryintersection took %" INT64 " ms",took);
|
|
*/
|
|
|
|
/*
|
|
void XmlDoc::gotMsg98Reply ( UdpSlot *slot ) {
|
|
// get replying hostid
|
|
int32_t hostId = slot->m_hostId;
|
|
// log
|
|
setStatus ( "gotmsg98reply" );
|
|
// sanity
|
|
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
|
|
// point to it
|
|
char *p = slot->m_readBuf;
|
|
char *pend = p + slot->m_readBufSize;
|
|
// shortcuts
|
|
QueryLink *qks = (QueryLink *)m_tmpBuf5.getBufStart();
|
|
// sanity, i guess if oom
|
|
int32_t maxLinkOff = m_tmpBuf5.length() ;
|
|
maxLinkOff /= sizeof(QueryLink);
|
|
// make some space
|
|
int32_t need = slot->m_readBufSize;
|
|
if ( ! m_tmpStringBuf5.reserve(need,"rqdbuf") ) {
|
|
m_msg98ReplyError = g_errno;
|
|
// do not bother scanning the reply
|
|
p = pend;
|
|
}
|
|
|
|
// init table
|
|
if ( m_qstringTable.m_numSlots == 0 ) {
|
|
// 1M slots!
|
|
if ( ! m_qstringTable.set(4,4,1000000,NULL,0,false,
|
|
m_niceness,"qstrtbl") ) {
|
|
m_msg98ReplyError = g_errno;
|
|
// do not bother scanning the reply
|
|
p = pend;
|
|
}
|
|
}
|
|
|
|
|
|
//int32_t numQueryLinks = m_relatedQueryLinksIntersected.length() ;
|
|
//numQueryLinks /= sizeof(QueryLink);
|
|
// put strings into m_tmpStringBuf5
|
|
// parse these strings
|
|
// maybe index so we can assign to QueryLinks::m_queryStringOffset
|
|
// maybe include querylink # so we can assign quickly!
|
|
QueryLink *qk;
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// offset of query link
|
|
int32_t queryLinkOff = *(int32_t *)p;
|
|
p += 4;
|
|
// crazy? maybe we went oom on m_relatedQueryLinksIntersected
|
|
if ( queryLinkOff >= maxLinkOff ) {
|
|
log("seopipe: msg98 reply link off breach %" INT32 ">=%" INT32 "",
|
|
queryLinkOff,maxLinkOff);
|
|
m_msg98ReplyError = ENOMEM;
|
|
break;
|
|
}
|
|
|
|
// get that
|
|
QueryLogEntry *qe = (QueryLogEntry *)p;
|
|
// skip it
|
|
p += qe->getSize();
|
|
|
|
// point to it
|
|
qk = &qks[queryLinkOff];
|
|
|
|
// do not duplicate query strings!
|
|
int32_t qh32 = hash32n ( qe->getQueryString() );
|
|
int32_t slot = m_qstringTable.getSlot ( &qh32 );
|
|
if ( slot >= 0 ) {
|
|
int32_t qeOff;
|
|
qeOff =*(int32_t *)m_qstringTable.getValueFromSlot(slot);
|
|
qk->m_queryStringOffset = qeOff;
|
|
qk->m_queryHostId = -1;
|
|
continue;
|
|
}
|
|
|
|
// get offset of string in string buf
|
|
int32_t stringOff = m_tmpStringBuf5.length();
|
|
// store good serp score
|
|
if ( ! m_tmpStringBuf5.safeMemcpy(qe,qe->getSize() ) ) {
|
|
m_msg98ReplyError = g_errno;
|
|
break;
|
|
}
|
|
|
|
// add to table
|
|
if ( ! m_qstringTable.addKey(&qh32,&stringOff) ) {
|
|
m_msg98ReplyError = g_errno;
|
|
break;
|
|
}
|
|
|
|
|
|
// show it
|
|
//log("seopipe: DEBUG. mapped remote off %" INT32 " (hostid%" INT32 ") to "
|
|
// "local off %" INT32 " (%s)"
|
|
// ,qk->m_queryStringOffset,qk->m_queryHostId,stringOff,qstr);
|
|
// . save string offset
|
|
// . THIS OVERWRITES the g_qbuf offset that was in there!!!
|
|
qk->m_queryStringOffset = stringOff;
|
|
// to indicate that this QueryLink::m_queryStringOffset is now
|
|
// an offset into m_relatedQueryStringBuf and no longer an
|
|
// offset into g_qbuf of the specific hostid, we set hostid
|
|
// to -1
|
|
qk->m_queryHostId = -1;
|
|
}
|
|
// steal it so it doesn't free it
|
|
//slot->m_readBuf = NULL;
|
|
// inc the counter
|
|
m_numMsg98Replies++;
|
|
// return control to transmit function. it will call m_callback1
|
|
// if the function is done. but if a different parent function than
|
|
// transmit called us then we call that. it just depends on the
|
|
// initial entry function that called getMatchingQueries()
|
|
m_masterLoop ( m_masterState );
|
|
}
|
|
|
|
|
|
|
|
|
|
static void gotMsg3fReplyWrapper ( void *state , void *state2 ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
//Multicast *m = (Multicast *)state2;
|
|
Bin *bin = (Bin *)state2;
|
|
THIS->gotMsg3fReply ( bin ); // m
|
|
}
|
|
*/
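// . qsort comparator for MissingTerm ptrs: highest m_importance first, then
//   highest m_votes; the final address comparison only makes the ordering
//   deterministic when both are equal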
|
|
|
|
static int mtCmp ( const void *a, const void *b ) {
	MissingTerm *wa = *(MissingTerm **)a;
	MissingTerm *wb = *(MissingTerm **)b;
	if ( wb->m_importance > wa->m_importance ) return 1; // swap
	if ( wb->m_importance < wa->m_importance ) return -1;
	if ( wb->m_votes > wa->m_votes ) return 1; // swap
	if ( wb->m_votes < wa->m_votes ) return -1;
	if ( (int64_t)b < (int64_t)a ) return 1; // swap
	if ( (int64_t)b > (int64_t)a ) return -1;
	return 0;
}
|
|
|
|
// . called by getMissingTermBuf() and getMatchingTermBuf()
|
|
// . returns false and sets g_errno on error
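// . walks the words of qstr, grouping consecutive words in the same
//   wikipedia phrase into one term and XORing their wordids into a 64-bit
//   term hash
// . scoreTable maps that hash to a MissingTerm already stored in tmpBuf, so
//   the same term seen in another query accumulates m_importance, m_votes
//   and m_traffic instead of being stored twice
// . for related queries we also skip terms the doc already contains
//   (topTermsTable) and common english query words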
|
|
bool XmlDoc::addTermsFromQuery ( char *qstr,
|
|
uint8_t queryLangId,
|
|
int32_t gigablastTraffic,
|
|
int32_t googleTraffic2,
|
|
//QueryLogEntry *qe ,
|
|
int32_t hackqoff,
|
|
SafeBuf *tmpBuf ,
|
|
HashTableX *scoreTable ,
|
|
HashTableX *topTermsTable ,
|
|
float imp, // importance
|
|
bool isRelatedQuery ) {
|
|
|
|
// sanity
|
|
if ( hackqoff < 0 ) { char *xx=NULL;*xx=0; }
|
|
// print query but bold-face the terms our doc has not
|
|
Query qq;
|
|
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
|
|
// doQueryExpansion = false
|
|
//char *qstr = qe->getQueryString ( );
|
|
qq.set2 ( qstr , queryLangId , false );
|
|
int32_t lastStart = -1;
|
|
for ( int32_t k = 0 ; k < qq.m_numWords ; k++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
QueryWord *qw = &qq.m_qwords[k];
|
|
int32_t tid32 = qw->m_wordId & 0xffffffff;
|
|
// is it not contained by our doc
|
|
if ( ! tid32 ) continue;
|
|
// skip if we contain it already
|
|
if ( isRelatedQuery && topTermsTable->isInTable ( &tid32 ) )
|
|
continue;
|
|
// skip if common word like "on" "at" etc.
|
|
if ( isCommonQueryWordInEnglish(tid32) ) continue;
|
|
// get start of wikipedia phrase it is in
|
|
int32_t start = qw->m_wikiPhraseStart;
|
|
int32_t nwk = qw->m_numWordsInWikiPhrase;
|
|
// if not in wiki phrase at all, just use single word
|
|
if ( qw->m_wikiPhraseId == 0 ) {
|
|
start = k;
|
|
nwk = 1;
|
|
}
|
|
// do not re-do any words in here
|
|
if ( start == lastStart ) continue;
|
|
lastStart = start;
|
|
// hash each wordid in the term into the th64 hash
|
|
int64_t th64 = 0LL;
|
|
//int32_t alnumWordCount = 0;
|
|
for ( int32_t j = start ; j < start + nwk ; j++ ) {
|
|
// shortcut
|
|
QueryWord *qw = &qq.m_qwords[j];
|
|
// skip punct
|
|
if ( qw->m_wordId == 0 ) continue;
|
|
// hash otherwise
|
|
th64 ^= qw->m_wordId;
|
|
// count it
|
|
//alnumWordCount++;
|
|
}
|
|
|
|
// get traffic of related query
|
|
int32_t traffic = gigablastTraffic;
|
|
// make gb traffic into google monthly traffic
|
|
traffic *= GB_TRAFFIC_MODIFIER;
|
|
// use google numbers if we have them, more accurate
|
|
int32_t googleTraffic = googleTraffic2;
|
|
if ( googleTraffic >= 0 ) traffic = googleTraffic;
|
|
|
|
|
|
// now score that term
|
|
int32_t slot = scoreTable->getSlot ( &th64 );
|
|
if ( slot >= 0 ) {
|
|
int32_t off;
|
|
off=*(int32_t *)scoreTable->getValueFromSlot(slot);
|
|
char *base = tmpBuf->getBufStart();
|
|
MissingTerm *pt=(MissingTerm *)(base + off);
|
|
pt->m_importance += imp;
|
|
pt->m_votes++;
|
|
pt->m_traffic += traffic;
|
|
// store first 10 related query strings
|
|
// we got this term from
|
|
for ( int32_t x = 1 ; x < 10 ; x++ ) {
|
|
if ( pt->m_hackQueryOffsets[x] != -1 )
|
|
continue;
|
|
// grab it. querylogentry ptr!!
|
|
pt->m_hackQueryOffsets[x] = hackqoff;
|
|
break;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
|
|
// set a class to store in safebuf
|
|
MissingTerm mt;
|
|
mt.m_importance = imp;
|
|
//mt.m_numAlnumWords = alnumWordCount;
|
|
mt.m_synOf = NULL;
|
|
mt.m_votes = 1;
|
|
mt.m_traffic = traffic;
|
|
mt.m_hackQueryOffsets[0] = hackqoff;
|
|
// if not a missing term, we are a MATCHING term
|
|
mt.m_isMissingTerm = isRelatedQuery;
|
|
// invalidate the remaining 9 query offsets
|
|
for ( int32_t x = 1 ; x < 10 ; x++ )
|
|
mt.m_hackQueryOffsets[x] = -1;
|
|
int32_t offset = tmpBuf->length();
|
|
int32_t toCopy = sizeof(MissingTerm);
|
|
if ( ! tmpBuf->safeMemcpy(&mt,toCopy))
|
|
return false;
|
|
// for calculating length of stored term string
|
|
int32_t startLen = tmpBuf->length();
|
|
// . if first time in scoretable, add stuff
|
|
// . store the string, each word separately
|
|
for ( int32_t j = start ; j < start + nwk ; j++ ) {
|
|
// shortcut
|
|
QueryWord *qw = &qq.m_qwords[j];
|
|
// point to word as string
|
|
char *str = qw->m_word;
|
|
int32_t len = qw->m_wordLen;
|
|
// make all punct a space
|
|
if ( qw->m_wordId == 0 ) {
|
|
str = " ";
|
|
len = 1;
|
|
}
|
|
// store term string after MissingTerm class
|
|
if ( ! tmpBuf->safeMemcpy(str,len) )
|
|
return false;
|
|
}
|
|
tmpBuf->pushChar('\0');
|
|
// record MissingTerm::m_termSize
|
|
int32_t delta = tmpBuf->length() - startLen;
|
|
char *base = tmpBuf->getBufStart();
|
|
MissingTerm *pmt = (MissingTerm *)(base + offset);
|
|
pmt->m_termSize = delta;
|
|
// now score table entry
|
|
if ( ! scoreTable->addKey ( &th64 , &offset ) )
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// this is used to sort the MissingTerm instances in a safeBuf,
|
|
// missingTermBuf. it is also used to sort the Matching terms from
|
|
// getMatchingTermBuf() as well now!
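// . builds an array of MissingTerm ptrs over tmpBuf, gbqsort()s them with
//   mtCmp (importance, then votes), then copies the terms out in that order
//   into missingTermBuf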
|
|
bool XmlDoc::sortTermsIntoBuf ( HashTableX *scoreTable ,
|
|
SafeBuf *tmpBuf ,
|
|
SafeBuf *missingTermBuf ) {
|
|
|
|
// make ptrs for sorting
|
|
int32_t numTerms = scoreTable->getNumUsedSlots();
|
|
int32_t need = numTerms * sizeof(MissingTerm *);
|
|
SafeBuf ptrBuf;
|
|
if ( ! ptrBuf.reserve ( need ,"srtbuf") ) return false;
|
|
char *p = tmpBuf->getBufStart();
|
|
char *pend = tmpBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
MissingTerm *mt = (MissingTerm *)p;
|
|
p += mt->getSize();
|
|
ptrBuf.pushPtr ( mt );
|
|
}
|
|
gbqsort ( ptrBuf.getBufStart(),
|
|
numTerms,
|
|
sizeof(MissingTerm *),
|
|
mtCmp,
|
|
m_niceness);
|
|
|
|
// now write the missingTerm instances into m_missingTermBuf
|
|
int32_t need2 = tmpBuf->length();
|
|
if ( ! missingTermBuf->reserve ( need2 ,"mtbuf") ) return false;
|
|
// now write back into the real buf
|
|
MissingTerm **pp = (MissingTerm **)ptrBuf.getBufStart();
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
|
|
MissingTerm *mt = pp[i];
|
|
missingTermBuf->safeMemcpy ( mt , mt->getSize() );
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . now this uses the related queries
|
|
// . use logic from getInsertableTerms()!!!
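// . pulls terms from the RELATED queries (getRelatedQueryBuf()), i.e.
//   queries that similar pages match, keeping only terms this document does
//   NOT already contain; only the head QueryLink of each linked list
//   (m_isFirst) is used, weighted by its m_totalQueryImportance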
|
|
SafeBuf *XmlDoc::getMissingTermBuf ( ) {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_missingTermBufValid )
|
|
return &m_missingTermBuf;
|
|
|
|
SafeBuf *qkbuf = getRelatedQueryBuf ();
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
|
|
HashTableX *topTermsTable = getTermIdBufDedupTable32();
|
|
if ( ! topTermsTable || topTermsTable == (void *)-1 )
|
|
return (SafeBuf *)topTermsTable;
|
|
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( 100000 ,"t3buf" ) ) return NULL;
|
|
|
|
// maps 64-bit term hash (can be multiple words in a term) to
|
|
// an offset into tmpBuf.
|
|
HashTableX scoreTable;
|
|
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
|
|
return NULL;
|
|
|
|
//
|
|
// taken from seo.cpp's printRelatedQueries() function
|
|
//
|
|
//int32_t *qrOffs = (int32_t *)relBuf->getBufStart();
|
|
//int32_t numRels = relBuf->length() / sizeof(int32_t);
|
|
//char *base = m_queryRelBuf.getBufStart();
|
|
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
|
|
|
|
int32_t nks = qkbuf->length() / sizeof(QueryLink);
|
|
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
|
|
int32_t i;
|
|
for ( i = 0 ; i < nks ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
// stop at 300?
|
|
//if ( i >= 300 ) break;
|
|
QueryLink *qk = &qks[i];
|
|
int32_t qkOff = (char *)qk - qkbuf->getBufStart();
|
|
//int32_t relOff = qrOffs[i];
|
|
//QueryRel *rel = (QueryRel *)(base+relOff);
|
|
// skip if not head of a linked list
|
|
if ( ! qk->m_isFirst ) continue;
|
|
QueryLogEntry *qe ;
|
|
qe = qk->getQueryLogEntry(&m_relatedQueryStringBuf);
|
|
// relative to rqsb! m_relatedQueryStringBuf
|
|
float imp = qk->m_totalQueryImportance;
|
|
// modify by unique round? not yet...
|
|
//imp -= rel->m_uniqueRound * 1000;
|
|
// now use this function
|
|
if ( ! addTermsFromQuery ( qe->getQueryString() ,
|
|
qe->m_langId,
|
|
qe->m_gigablastTraffic,
|
|
qe->m_googleTraffic,
|
|
qkOff, // hackqoff
|
|
&tmpBuf ,
|
|
&scoreTable ,
|
|
topTermsTable ,
|
|
imp ,
|
|
true ) ) // is related query?
|
|
return NULL;
|
|
}
|
|
|
|
// sort MissingTerms from tmpBuf into m_missingTermBuf by
|
|
// MissingTerm::m_importance
|
|
if ( ! sortTermsIntoBuf ( &scoreTable,
|
|
&tmpBuf,
|
|
&m_missingTermBuf ) )
|
|
return NULL;
|
|
|
|
m_missingTermBufValid = true;
|
|
|
|
//m_numMissingTerms = i;
|
|
|
|
// store it
|
|
//if ( ! storeMissingTermBufIntoCachedb() )
|
|
// return (SafeBuf *)-1;
|
|
|
|
return &m_missingTermBuf;
|
|
}
|
|
|
|
|
|
|
|
// . now get the best terms from our matching queries
|
|
// . basically the exact same algo as getMissingTermBuf
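// . same machinery as getMissingTermBuf() but over the queries this doc
//   already MATCHES (getMatchingQueryBuf()), capped at the first 300
//   QueryLinks, weighted by QueryLink::m_queryImportance, and without the
//   "skip terms we already contain" filter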
|
|
SafeBuf *XmlDoc::getMatchingTermBuf ( ) {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_matchingTermBufValid )
|
|
return &m_matchingTermBuf;
|
|
|
|
SafeBuf *mq = getMatchingQueryBuf();
|
|
if ( mq == NULL || mq == (void *)-1 ) return mq;
|
|
|
|
|
|
HashTableX *topTermsTable = getTermIdBufDedupTable32();
|
|
if ( ! topTermsTable || topTermsTable == (void *)-1 )
|
|
return (SafeBuf *)topTermsTable;
|
|
|
|
// tmpBuf will hold the MissingTerms we add.
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( 100000 ,"t4buf") ) return NULL;
|
|
|
|
// maps 64-bit term hash (can be multiple words in a term) to
|
|
// an offset into tmpBuf. tmpBuf holds the missing terms, so we
|
|
// use scoreTable to accumulate MissingTerm::m_importance for
|
|
// the same term in different queries.
|
|
HashTableX scoreTable;
|
|
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
|
|
return NULL;
|
|
|
|
// scan the queries this doc matches and add MissingTerms for them
|
|
// into tmpBuf
|
|
int32_t nks = mq->length() / sizeof(QueryLink);
|
|
QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
|
|
|
|
int32_t i; for ( i = 0 ; i < nks ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
QueryLink *qk = &qks[i];
|
|
// stop at 300?
|
|
if ( i >= 300 ) break;
|
|
// "matching terms" have different hackqoff than missing terms
|
|
int32_t qkOff = (char *)qk - mq->getBufStart();
|
|
// relative to rqsb! m_relatedQueryStringBuf
|
|
float imp = qk->m_queryImportance;
|
|
// querylogentry does not have string info here! it is
|
|
// just the basic class
|
|
QueryLogEntry *qe ;
|
|
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
|
|
// . now use this function
|
|
if ( ! addTermsFromQuery ( qe->getQueryString(),
|
|
qe->m_langId,
|
|
qe->m_gigablastTraffic,
|
|
qe->m_googleTraffic,
|
|
qkOff, // hackqoff
|
|
&tmpBuf ,
|
|
&scoreTable ,
|
|
topTermsTable ,
|
|
imp ,
|
|
false ) ) // is related query?
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// sort MatchingTerms from tmpBuf into m_matchingTermBuf by
|
|
// MatchingTerm::m_importance
|
|
if ( ! sortTermsIntoBuf ( &scoreTable,
|
|
&tmpBuf,
|
|
&m_matchingTermBuf ) )
|
|
return NULL;
|
|
|
|
m_matchingTermBufValid = true;
|
|
|
|
//m_numMatchingTerms = i;
|
|
|
|
// store it
|
|
//if ( ! storeMatchingTermBufIntoCachedb() )
|
|
// return (SafeBuf *)-1;
|
|
|
|
return &m_matchingTermBuf;
|
|
}
|
|
/*
|
|
// . max # of outstanding msg3f requests we can send to one host
|
|
// . now just make it 1 since it is msg3f NOT msg39
|
|
#define MAXOUT 1
|
|
|
|
//#define BINSIZE 100000
|
|
|
|
class Bin {
|
|
public:
|
|
// the current position for adding queries into m_buf
|
|
int32_t m_cursor;
|
|
int32_t m_maxCursor;
|
|
int32_t m_allocSize;
|
|
// some hack storage
|
|
Host *m_hackHost;
|
|
bool m_hackIsMsg99ReplyPtr;
|
|
// for sending the m_buf to its host
|
|
Multicast m_mcast;
|
|
// allocates size of BINSIZE bytes
|
|
char m_buf[0];
|
|
};
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . returns true on successful launch of request, it will block always
|
|
bool XmlDoc::sendBin ( int32_t i ) {
|
|
|
|
Bin *bin = m_currentBinPtrs[i];
|
|
|
|
// get host
|
|
Host *h = g_hostdb.getHost(i);
|
|
|
|
// copy it
|
|
//int32_t reqSize = p - tmpBuf;
|
|
//char *req = mdup ( tmpBuf , reqSize , "3freq" );
|
|
//if ( ! req ) return true;
|
|
|
|
// increment outstanding requests he has
|
|
h->m_numOutstandingRequests++;
|
|
|
|
// this could be a ptr to a msg99reply or a querylink
|
|
Multicast *mcast = &bin->m_mcast;
|
|
|
|
//bin->m_hackxd = this;
|
|
//bin->m_hackPtrCursor = firstPtrCursor;
|
|
bin->m_hackHost = h;
|
|
|
|
// get his group id
|
|
uint32_t groupId = h->m_groupId;
|
|
|
|
char *req = bin->m_buf;
|
|
int32_t reqSize = bin->m_cursor;
|
|
|
|
// disown it so mcast can free it when its udpslot is destroyed
|
|
m_currentBinPtrs[i] = NULL;
|
|
|
|
// note that
|
|
setStatus("launching msg3f");
|
|
// log it too
|
|
//log("seopipe: launching msg3f request of %" INT32 " gbdocid queries to "
|
|
// "score to host %" INT32 "", queryCount,h->m_hostId);
|
|
// get the docIds for this query using msg3f.cpp's handleRequest3f()
|
|
bool status = mcast->send ( req ,
|
|
reqSize,
|
|
0x3f ,
|
|
false, // mcast frees request? no!!!
|
|
groupId, // group to send to
|
|
false, // send to whole group?
|
|
0 , // query hash for host in group select
|
|
this , // state1
|
|
bin,//mcast, // state2
|
|
gotMsg3fReplyWrapper,
|
|
86401, // timeout in seconds. LONG TIME!
|
|
m_niceness,
|
|
false, // realtime?
|
|
h->m_hostId // firsthostid to try
|
|
);
|
|
// mark it out
|
|
m_numMsg3fRequests++;
|
|
// if this is true then it was a success and we BLOCKED
|
|
if ( status ) {
|
|
// must BE IN USE!
|
|
if ( ! mcast->m_inUse ) { char *xx=NULL;*xx=0; }
|
|
// success
|
|
return true;
|
|
}
|
|
// it came back?
|
|
m_numMsg3fReplies++;
|
|
// undo this
|
|
h->m_numOutstandingRequests--;
|
|
// errno should be set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// set error
|
|
m_binError = g_errno;
|
|
// note it
|
|
log("seopipe: mcast had error: %s", mstrerror(g_errno));
|
|
// free that bin i guess
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
// return false on error
|
|
return false;
|
|
}
|
|
|
|
|
|
// . this is called from two places:
|
|
// 1. getMatchingQueriesScored() (Msg99Reply ptrs)
|
|
// 2. getRelatedQueryBuf() (QueryLink ptrs)
|
|
// . this can take Msg99Reply ptrs or it can take QueryLink ptrs
|
|
// . it will glean the docid from either of these two ptrs types as well
|
|
// as glean the pointer to the query string.
|
|
// . THEN it can create a 'gbdocid:xxxx | <queryString>' query which
|
|
// it will send to a host in the network.
|
|
// . it will try to keep each host in the network answering 5 such queries
|
|
// at any one time. bins are no longer used.
|
|
// . we need to implement heavy termlist caching remotely and locally to
|
|
// ensure optimal speed
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true with g_errno set on error
|
|
bool XmlDoc::scoreDocIdRestrictedQueries ( Msg99Reply **replyPtrs ,
|
|
QueryLink *linkPtrs ,
|
|
int32_t numPtrs ) {
|
|
|
|
//log("debug: entered scoredocidrestrictedqueries");
|
|
|
|
if ( numPtrs == 0 ) return true;
|
|
|
|
// . sanity check
|
|
// . you can only score your Msg99Replies or your QueryLinks
|
|
// . score your Msg99Replies for queries that match the main url
|
|
// . score your QueryLinks for queries that match a related docid
|
|
if ( ! replyPtrs && ! linkPtrs ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( replyPtrs && m_setForReplyPtrs ) return true;
|
|
if ( linkPtrs && m_setForLinkPtrs ) return true;
|
|
|
|
// we now send the termlistbuf to each host receiving a msg3f
|
|
// request so when it performs the msg39 on a query we provide it
|
|
// will set QueryTerm::m_posdbListPtr to point to the termlists we
|
|
// provided only, just for this docid
|
|
SafeBuf *termListBuf = NULL;
|
|
if ( ! linkPtrs ) {
|
|
termListBuf = getTermListBuf();
|
|
if ( ! termListBuf ) return true;
|
|
if ( termListBuf==(void *)-1 ) return false;
|
|
}
|
|
|
|
// force to ten for debug
|
|
//numPtrs = 20;
|
|
|
|
sendLoop:
|
|
|
|
//
|
|
// cleanup if got all replies we can
|
|
//
|
|
if ( m_numMsg3fReplies == m_numMsg3fRequests &&
|
|
((m_qcursor >= numPtrs) || m_binError) ) {
|
|
|
|
//log("debug: cleanup");
|
|
|
|
// there might be remnant bins if we stopped trying to
|
|
// call sendBin because we hit m_binError
|
|
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
|
// see if that bin is still around
|
|
Bin *bin = m_currentBinPtrs[i];
|
|
if ( ! bin ) continue;
|
|
// this will core if the multicast is in use
|
|
bin->m_mcast.destructor();
|
|
// now nuke it then
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
// now make it null
|
|
m_currentBinPtrs[i] = NULL;
|
|
}
|
|
// nuke this too!
|
|
if ( m_newxd2 ) {
|
|
mdelete ( m_newxd2 , sizeof(XmlDoc) , "newxd2" );
|
|
delete ( m_newxd2 );
|
|
m_newxd2 = NULL;
|
|
}
|
|
// free table's mem if used
|
|
m_tmpDupTable.reset();
|
|
// do not repeat this logic!
|
|
if ( replyPtrs ) {
|
|
m_setForReplyPtrs = true;
|
|
m_binErrorForReplyPtrs = m_binError;
|
|
}
|
|
if ( linkPtrs ) {
|
|
m_setForLinkPtrs = true;
|
|
m_binErrorForLinkPtrs = m_binError;
|
|
}
|
|
// inherit error? pass it on to caller
|
|
//if ( m_binError ) g_errno = m_binError;
|
|
// reset for another call to this function since we call
|
|
// if from two different places above
|
|
m_numMsg3fRequests = 0;
|
|
m_numMsg3fReplies = 0;
|
|
m_qcursor = 0;
|
|
m_binError = 0;
|
|
// all done!
|
|
g_errno = 0;
|
|
return true;
|
|
}
|
|
|
|
// shortcut
|
|
char *base = m_tmpStringBuf5.getBufStart();
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store the queries in our buffer into the various bins and send
|
|
// a bin off when it gets full
|
|
queryLoop:
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// nothing left to do except wait for replies?
|
|
if ( m_qcursor >= numPtrs ) return false;
|
|
|
|
// assume ptr is good
|
|
bool good = true;
|
|
// set these
|
|
int64_t docId;
|
|
// the query as a string
|
|
char *qstr = NULL;
|
|
// for passing to mcast::m_hackQPtrs
|
|
void *vptr;
|
|
// get the ith QueryLink?
|
|
if ( linkPtrs ) {
|
|
QueryLink *qk = &linkPtrs[m_qcursor];
|
|
// skip if was not successfully processed above
|
|
// because its hostid was dead perhaps?
|
|
if ( qk->m_queryHostId != -1 ) good = false;
|
|
// get from related docid in this case
|
|
SafeBuf *rdbuf = &m_relatedDocIdBuf;
|
|
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
|
|
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
|
|
docId = rd->m_docId;
|
|
// get it
|
|
QueryLogEntry *qe ;
|
|
qe = (QueryLogEntry *)(qk->m_queryStringOffset + base);
|
|
// and this. skip over goodserpscore, gigablastTraffic and
|
|
// googleTraffic
|
|
qstr = qe->getQueryString();
|
|
// save it
|
|
vptr = qk;
|
|
}
|
|
// make a new one for the first time
|
|
if ( linkPtrs && ! m_newxd2 ) {
|
|
try { m_newxd2 = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
mnew ( m_newxd2, sizeof(XmlDoc),"newxd2");
|
|
}
|
|
// set the xmldoc to this new docid, if it is new...
|
|
if ( linkPtrs && m_newxd2->m_docId != docId ) {
|
|
// a good stopping point?
|
|
if ( clientClosedConnection() ) {
|
|
m_binError = ESOCKETCLOSED;
|
|
goto sendLoop;
|
|
}
|
|
// set it from related doc's docid
|
|
if ( ! m_newxd2->set3 ( docId ,cr->m_coll, m_niceness ) ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// alloc space for tablen
|
|
if ( m_tmpDupTable.getNumSlots() <= 0 &&
|
|
! m_tmpDupTable.set ( 8,0,1024,NULL,0,false,m_niceness,
|
|
"tdtbl") ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// must not be in there already!
|
|
if ( m_tmpDupTable.isInTable ( &docId ) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// add it
|
|
if ( ! m_tmpDupTable.addKey ( &docId ) ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// ensure content is recycled from title rec
|
|
m_newxd2->m_recycleContent = true;
|
|
// newxd2 needs to use our master functions. so
|
|
// anytime one of its internal functions blocks, then
|
|
// our m_masterLoop will be called
|
|
// and we'll end up right here again!
|
|
m_newxd2->m_masterLoop = m_masterLoop;
|
|
m_newxd2->m_masterState = m_masterState;
|
|
// only get posdb keys really for this stuff
|
|
m_newxd2->m_useTitledb = false;
|
|
m_newxd2->m_useTagdb = false;
|
|
m_newxd2->m_useClusterdb = false;
|
|
m_newxd2->m_useSpiderdb = false;
|
|
m_newxd2->m_useLinkdb = false;
|
|
// debug
|
|
log("seopipe: setting newxd2 docid=%" INT64 "",docId);
|
|
}
|
|
// pump this
|
|
if ( linkPtrs && ! m_newxd2->m_loaded ) {
|
|
|
|
// . CRAP, blocking here sucks because when this function
|
|
// is re-entered it can also be from a Msg3f reply
|
|
// not because this document is back from msg22a...
|
|
|
|
//log("debug: loading newxd2");
|
|
|
|
// try to set from title rec first. return false if blocks.
|
|
if ( ! m_newxd2->loadFromOldTitleRec() ) {
|
|
m_newxd2Blocked = true;
|
|
//log("debug: newxd2 blocked");
|
|
return false;
|
|
}
|
|
}
|
|
// i guess no longer out
|
|
if ( linkPtrs && m_newxd2->m_loaded )
|
|
m_newxd2Blocked = false;
|
|
|
|
//if ( linkPtrs )
|
|
// log("debug: newxd2 loaded=%" INT32 "",(int32_t)m_newxd2->m_loaded);
|
|
|
|
// sanity check
|
|
if ( linkPtrs && ! m_newxd2->m_oldTitleRecValid ) {
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// . did that fail? i.e. docid not found!?!?!
|
|
// . do not increment m_qcursor if m_binError is set
|
|
if ( linkPtrs && ! m_newxd2->m_oldTitleRec && ! m_binError ) {
|
|
// just skip this asshole then
|
|
if ( m_lastPrintedDocId != docId ) {
|
|
log("seopipe: related docid %" INT64 " titlerec "
|
|
"load failed99",
|
|
docId);
|
|
}
|
|
m_lastPrintedDocId = docId;
|
|
// clear that
|
|
g_errno = 0;
|
|
// skip it
|
|
m_qcursor++;
|
|
// try the next one
|
|
goto queryLoop;
|
|
}
|
|
if ( linkPtrs ) {
|
|
|
|
// . CRAP, blocking here sucks because when this function
|
|
// is re-entered it can also be from a Msg3f reply
|
|
// not because it has the termlistbuf ready
|
|
|
|
// . use termlist buf of related docid
|
|
// . we need to ENSURE that the QueryLinks are clustered
|
|
// by related docid so this logic is efficient here
|
|
termListBuf = m_newxd2->getTermListBuf();
|
|
// return false if it blocked
|
|
if ( termListBuf == (void *)-1 ) {
|
|
//log("debug: newxd2 blocked in termlistbuf");
|
|
m_newxd2Blocked = true;
|
|
return false;
|
|
}
|
|
// this sucks. error!
|
|
if ( ! termListBuf ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
}
|
|
// i guess no longer out
|
|
if ( linkPtrs ) {
|
|
//log("debug: newxd2 UNblocked in termlistbuf");
|
|
m_newxd2Blocked = false;
|
|
}
|
|
|
|
// wait for replies to come in so we can stop even if m_qcursor
// did not complete its scan!
// but what if we are a msg22 coming in for m_newxd2? that
// is why i moved this check down here, so we can set m_newxd2Blocked
// to false and allow the msg3f replies to come back in and free
// all the bins. this is awkward because everything is
// asynchronous.
|
|
if ( m_binError ) return false;
|
|
|
|
// otherwise the Msg99Reply
|
|
if ( ! linkPtrs ) {
|
|
Msg99Reply *qp = replyPtrs[m_qcursor];
|
|
// tis us!
|
|
docId = m_docId;
|
|
// sanity
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// and query string
|
|
qstr = qp->m_queryStr;
|
|
// save it
|
|
vptr = qp;
|
|
}
|
|
|
|
int32_t qlen = gbstrlen(qstr);
|
|
|
|
// mark as bad if this query is too big already
|
|
if ( m_firstUrl.m_ulen + qlen + 10 > MAX_QUERY_LEN )
|
|
good = false;
|
|
|
|
// if ptr was bad, do not evaluate at all
|
|
if ( ! good ) {
|
|
m_qcursor++;
|
|
goto queryLoop;
|
|
}
|
|
|
|
// sanity
|
|
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . get hash of query to determine bin
|
|
// . this keeps our term freqs consistent since every query goes
|
|
// back TO THE SAME HOST!!! thus our scores remain consistent.
|
|
// each host has a slightly different TermFreq/Weight for the
|
|
// exact same query because the termfreq is based on the termlist
|
|
// length for that termid. and each host has a different set of
|
|
// docids in its index for the most part.
|
|
uint32_t h32 = hash32n ( qstr );
|
|
int32_t numHosts = g_hostdb.getNumHosts();
|
|
// do not send to host #0 if we got a lot of hosts
|
|
if ( g_hostdb.getNumHosts() >= 8 ) numHosts--;
|
|
int32_t hostNum = h32 % numHosts;
|
|
// skip host #0 which is us i guess!
|
|
if ( g_hostdb.getNumHosts() >= 8 ) hostNum++;
|
|
// sanity for that
|
|
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get the current bin for that host
|
|
Bin *bin = m_currentBinPtrs [ hostNum ];
|
|
|
|
// alloc on demand
|
|
if ( ! bin ) {
|
|
// how big is the termlistbuf?
|
|
int32_t tsize = termListBuf->length();
|
|
int32_t collLen = gbstrlen(cr->m_coll);
|
|
// how much space do we need for a good bin?
|
|
int32_t alloc = sizeof(Bin) + 8 +1+ collLen + 1 + tsize + 100000;
|
|
// make that
|
|
char *mem = (char *)mmalloc ( alloc ,"binreq" );
|
|
if ( ! mem ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// cast it
|
|
bin = (Bin *)mem;
|
|
// store it
|
|
m_currentBinPtrs [ hostNum ] = bin;
|
|
// this includes a Multicast in the Bin
|
|
bin->m_mcast.constructor();
|
|
// for freeing
|
|
bin->m_allocSize = alloc;
|
|
// the end of it
|
|
char *memEnd = mem + alloc;
|
|
// reset offset into Bin::m_buf
|
|
bin->m_cursor = 0;
|
|
// is it to a msg99reply? so the reply handler knows how to
|
|
// handle mcast::m_hackQPtr and what action to take. it is
|
|
// slightly different.
|
|
if ( linkPtrs ) bin->m_hackIsMsg99ReplyPtr = 0;
|
|
else bin->m_hackIsMsg99ReplyPtr = 1;
|
|
// . before we add any queries, store langid of QUERY
|
|
// . crap just use doc langid for now
|
|
char *bp = bin->m_buf;
|
|
// first is docid. if doing QueryLinks this is the docid
|
|
// of the related docid, otherwise, it is that of our main doc
|
|
*(int64_t *)bp = docId; bp += 8;
|
|
// then langid
|
|
*bp = m_langId; bp++;
|
|
// then the coll
|
|
gbmemcpy ( bp , cr->m_coll , collLen );
|
|
bp += collLen;
|
|
*bp++ = '\0';
|
|
// sanity!
|
|
if ( bp >= memEnd ) { char *xx=NULL;*xx=0; }
|
|
// the size of the termlist buf
|
|
*(int32_t *)bp = tsize; bp += 4;
|
|
// then the termlistbuf that has all the termlists forour docid
|
|
gbmemcpy ( bp , termListBuf->getBufStart(), tsize ); bp += tsize;
|
|
// update bin's cursor
|
|
bin->m_cursor = bp - bin->m_buf;
|
|
// for breach detection. send off Bin when breach happens.
|
|
bin->m_maxCursor = alloc - sizeof(Bin);
|
|
}
|
|
|
|
// can we store the current query into this bin?
|
|
bool storeInBin = true;
|
|
|
|
// is there enough room for this query in the bin?
|
|
int32_t need = qlen + 40;
|
|
if ( bin->m_cursor + need >= bin->m_maxCursor )
|
|
storeInBin = false;
|
|
|
|
// does docid of bin match?
|
|
int64_t binDocId = *(int64_t *)(bin->m_buf);
|
|
if ( docId != binDocId )
|
|
storeInBin = false;
|
|
|
|
// if we can't store this query into the bin, send it off now
|
|
if ( ! storeInBin ) {
|
|
// use its multicast to send this bin off if too full
|
|
if ( ! sendBin ( hostNum ) ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// . now the current bin should have been emptied
|
|
// . go back to top to realloc Bin::m_buf to hold this query
|
|
goto queryLoop;
|
|
}
|
|
|
|
char *p = bin->m_buf + bin->m_cursor;
|
|
|
|
// first store the offset from the buf so we can return it
|
|
// in the reply which is a list of scores basically and we know
|
|
// what score goes with what m_qcursor
|
|
*(int32_t *)p = m_qcursor;
|
|
p += 4;
|
|
|
|
// now store queries in the request buf for the msg3f
|
|
p += sprintf(p,"gbdocid:%" UINT64 " | %s",docId,qstr);
|
|
*p++ = '\0';
|
|
|
|
// update cursor
|
|
bin->m_cursor = p - bin->m_buf;
|
|
|
|
// skip to next query/docid to evaluate
|
|
m_qcursor++;
|
|
|
|
// if we have more queries left, add them to bins now
|
|
if ( m_qcursor < numPtrs ) goto queryLoop;
|
|
|
|
// now send every bin, we have no queries left.
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! m_currentBinPtrs[i] ) continue;
|
|
// this will transfer the request buffer over to mcast
|
|
// so it will be freed when mcast returns
|
|
sendBin ( i );
|
|
}
|
|
|
|
goto sendLoop;
|
|
}
|
|
|
|
// we got back the score for each query link in
|
|
// the bin that we sent out for the docid specified in the bin header request
|
|
void XmlDoc::gotMsg3fReply ( Bin *bin ) { // Multicast *mcast ) {
|
|
|
|
setStatus ( "gotmsg3freply" );
|
|
|
|
// do some housekeeping
|
|
Host *h = bin->m_hackHost;
|
|
h->m_numOutstandingRequests--;
|
|
|
|
m_numMsg3fReplies++;
|
|
|
|
// sanity
|
|
Multicast *mcast = &bin->m_mcast;
|
|
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get the reply
|
|
bool freeIt = false;
|
|
int32_t replySize = 0;
|
|
int32_t replyMaxSize;
|
|
char *rbuf = mcast->getBestReply ( &replySize ,
|
|
&replyMaxSize ,
|
|
&freeIt ,
|
|
true ); // steal it?
|
|
|
|
// log it too
|
|
//log("seopipe: got msg3f reply from host %" INT32 " size=%" INT32 " bytes",
|
|
// h->m_hostId,replySize);
|
|
|
|
// cast it
|
|
//Msg3fReply *mr = (Msg3fReply *)rbuf;
|
|
// in case of mem-leak this helps
|
|
//if ( rbuf ) relabel(rbuf,replyMaxSize,"xx-rb");
|
|
// . we must be able to free it... we must own it
|
|
// . this is true if we should free it, but we should not have
|
|
// to free it since it is owned by the slot?
|
|
if ( freeIt ) {
|
|
log(LOG_LOGIC,"query: msg3f: Steal failed.");
|
|
char *xx = NULL; *xx=0;
|
|
}
|
|
|
|
// if it failed for some reason i guess just bail
|
|
if ( ! rbuf ) {
|
|
// clean up the bin and the multicast and the request buffer
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
g_errno = EBADREPLYSIZE;
|
|
log(LOG_LOGIC,"seopipe: bad msg3f empty reply");
|
|
return;
|
|
}
|
|
|
|
// reply is just sequence of docid/score pairs
|
|
char *rp = rbuf;
|
|
char *rpEnd = rbuf + replySize;
|
|
|
|
//int32_t firstCursor = bin->m_hackPtrCursor;
|
|
|
|
// scan the msg99 replies and insert the scores we got for each
|
|
// query from the msg3f reply in "rbuf"
|
|
for ( ; rp < rpEnd ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . first is index, what query # in the request are we
|
|
// processing now, might not be in order because we launch
|
|
// a bunch of msg39s in parallel in handleRequest3f()'s call
|
|
// to processQueries()
|
|
// . but the corresponding msg99reply is reply # "qcursor"
|
|
int32_t qcursor = *(int32_t *)rp;
|
|
rp += 4;
|
|
int64_t docId = *(int64_t *)rp;
|
|
rp += 8;
|
|
float score = *(float *)rp;
|
|
rp += 4;
|
|
// . if this is true that means qcursor is referencing a
|
|
// msg99reply and we should set the score of that msg99
|
|
// reply to what the handlerequest3f provided
|
|
// . so store the docid and score for our url for this query
|
|
if ( bin->m_hackIsMsg99ReplyPtr ) {
|
|
SafeBuf *mqbuf = getMatchingQueries(false,-1);
|
|
Msg99Reply **qptrs=(Msg99Reply **)mqbuf->getBufStart();
|
|
Msg99Reply *qr = qptrs[qcursor];
|
|
qr->m_myScore = score;
|
|
qr->m_myDocId = docId;
|
|
int32_t numQueryPtrs=mqbuf->length()/sizeof(Msg99Reply *);
|
|
// if too many skip some
|
|
if ( numQueryPtrs > 1000 && (qcursor%1000)!=0)continue;
|
|
// if too many skip some
|
|
if ( numQueryPtrs > 400 && (qcursor%100) !=0)continue;
|
|
char *qstr = qr->m_queryStr;
|
|
log("seopipe: got query #%" INT32 "of%" INT32 " score=%f qstr=%s"
|
|
,qcursor+1
|
|
,numQueryPtrs
|
|
,score
|
|
,qstr
|
|
);
|
|
continue;
|
|
}
|
|
// might be storing in a QueryLink (doing related docids)
|
|
//SafeBuf *ibuf = getRelatedQueryLinksWithStrings();
|
|
QueryLink *qks =(QueryLink *)m_tmpBuf5.getBufStart();
|
|
//int32_t numQueryLinks = ibuf->length() / sizeof(QueryLink);
|
|
QueryLink *qk = &qks[qcursor];
|
|
// sanity. make sure qk->m_queryStringOffset is related to our
|
|
// local m_tmpStringBuf5 and not relative to the
|
|
// g_qbuf of the hostid that sent back the msg99 reply.
|
|
if ( qk->m_queryHostId != -1 ) { char *xx=NULL;*xx=0; }
|
|
// how many related query links do we got? for logging.
|
|
int32_t nks = m_tmpBuf5.length()/sizeof(QueryLink);
|
|
// shortcuts
|
|
char *base = m_tmpStringBuf5.getBufStart();
|
|
// skip over gigablastTraffic and googleTraffic
|
|
QueryLogEntry *qe;
|
|
qe = (QueryLogEntry *)(base + qk->m_queryStringOffset);
|
|
SafeBuf *rdbuf = &m_relatedDocIdBuf;
|
|
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
|
|
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
|
|
// note it
|
|
if ( (qcursor % 1000) == 0 ) // || qcursor < 100 )
|
|
log("seopipe: got msg3f reply for related query "
|
|
"#%" INT32 "of%" INT32 " "
|
|
"query \"gbdocid:%" INT64 " | %s\" gigablasttraffic=%" INT32 " "
|
|
"googletraffic=%" INT32 " serpscore=%f goodscore=%f"
|
|
,qcursor+1
|
|
,nks
|
|
,rd->m_docId
|
|
,qe->getQueryStr()
|
|
,qe->m_gigablastTraffic
|
|
,qe->m_googleTraffic
|
|
,score
|
|
,qe->m_topSERPScore // of a docid slice on 1 host
|
|
);
|
|
//
|
|
// no longer used queryrel!
|
|
//
|
|
// if we are scoring QueryLinks then we add a QueryRel
|
|
//QueryRel qr;
|
|
// clear that mem to zero
|
|
//memset ( &qr , 0 , sizeof(QueryRel));
|
|
// then add the info we know
|
|
//qr.m_relatedDocId = qk->m_relatedDocId;
|
|
//char *base2 = m_relatedDocIdBuf.getBufStart();
|
|
//int32_t rdOff = (char *)qk->m_relatedDocId - base2;
|
|
//qr.m_relatedDocIdOff = rdOff;
|
|
//qr.m_offsetIntoRelQStrBuf = qk->m_queryStringOffset;
|
|
//qr.m_myScore = score;
|
|
//qr.m_nextOff = -1;
|
|
//qr.m_tailOff = -1;
|
|
qk->m_serpScore = score;
|
|
// save that. WHAT IF THIS ERRORS?!?!?!
|
|
//if ( ! m_queryRelBuf.safeMemcpy(&qr,sizeof(QueryRel)) ) {
|
|
// m_binError = g_errno;
|
|
// log("xmldoc: panic. failed to store query rel");
|
|
// break;
|
|
//}
|
|
// debug test
|
|
//m_binError = EBADENGINEER;
|
|
//log("xmldoc: panic2. failed to store query rel");
|
|
//break;
|
|
}
|
|
|
|
// ok, we got the docid and score, now free it
|
|
mfree ( rbuf , replyMaxSize , "fmsg3f" );
|
|
|
|
// clean up the bin and the multicast and the request buffer
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
|
|
//if ( m_newxd2Blocked )
|
|
// log("debug: got reply, but returning because newxd2 "
|
|
// "had blocked");
|
|
|
|
// prevent double entry bug from entering scoreDocIdRestrictedQueries()
|
|
// from a newxd2 function blocking and coming in through msg22
|
|
// callback or whatever, vs. coming in from here
|
|
if ( m_newxd2Blocked ) return;
|
|
|
|
//log("debug: got reply and calling masterloop");
|
|
|
|
// go back to the transmit function
|
|
m_masterLoop ( m_masterState );
|
|
|
|
// if not done, just return... otherwise we double enter
|
|
// scoreDocIdRestrictedQueries() along with it's call to
|
|
// getTermListBuf()... and all hell breaks loose
|
|
return;
|
|
}
|
|
*/
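// The commented-out binning code above routes each query to one fixed
// host chosen by hashing the query string, so the term freqs (and thus
// the scores) it gets back are always computed by the same host. Below
// is a minimal illustrative sketch of that routing rule, kept out of
// the build; sketchHash32() stands in for hash32n() and the host-#0
// skip mirrors the assumption in the dead code above.
/*
#include <stdint.h>

// stable 32-bit FNV-1a hash; stands in for hash32n() in this sketch
static uint32_t sketchHash32 ( const char *s ) {
	uint32_t h = 2166136261u;
	for ( ; *s ; s++ ) { h ^= (uint8_t)*s; h *= 16777619u; }
	return h;
}

// pick the host that will score query "qstr"; the same query always
// maps to the same host, so its term freqs are computed consistently
static int32_t pickHostForQuery ( const char *qstr ,
				  int32_t numHosts ,
				  bool skipHostZero ) {
	int32_t n = numHosts;
	if ( skipHostZero && numHosts >= 8 ) n--;      // reserve host #0
	int32_t hostNum = (int32_t)(sketchHash32(qstr) % (uint32_t)n);
	if ( skipHostZero && numHosts >= 8 ) hostNum++; // shift past #0
	return hostNum;
}
*/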
|
|
|
|
/*
|
|
// send contents of m_socketWriteBuf to m_seoSocket
|
|
void XmlDoc::pumpSocketWriteBuf ( ) {
|
|
|
|
if ( ! m_seoSocket ) return;
|
|
|
|
setStatus ( "pumpsocketwritebuf" );
|
|
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
|
|
// insert http header into m_socketWriteBuf if not there
|
|
char *wbuf = sb->getBufStart();
|
|
bool insertIt = false;
|
|
if ( ! wbuf ) insertIt = true;
|
|
if ( wbuf && strncmp(wbuf,"HTTP/1.0 ",9 ) ) insertIt = true;
|
|
// add http header first
|
|
if ( insertIt ) {
|
|
// reset # bytes sent
|
|
m_socketWriteBufSent = 0;
|
|
m_registeredSocketCallback = false;
|
|
// xml-itize each query reply without scoring info
|
|
sb->insert("HTTP/1.0 200 OK\r\n"
|
|
"Content-Type: text/xml ; "
|
|
"charset=utf-8\r\n"
|
|
"\r\n"
|
|
"<response>\n",0);
|
|
}
|
|
|
|
// come back here to do another send
|
|
sendLoop:
|
|
|
|
// try sending out our xml buffer on the socket
|
|
// the very first things we do is send the queries over without
|
|
// the ranking info which we compute by calling msg39 on each query,
|
|
// so at least we can display something quite quickly.
|
|
if ( m_socketWriteBufSent < sb->length() ) {
|
|
int32_t sd = m_seoSocket->m_sd;
|
|
// just in case
|
|
if ( m_registeredSocketCallback ) {
|
|
g_loop.unregisterWriteCallback(sd,this,
|
|
getSEOQueryInfoWrapper2);
|
|
m_registeredSocketCallback = false;
|
|
}
|
|
// send that off
|
|
int32_t sendLen = sb->length();
|
|
char *sendStr = sb->getBufStart();
|
|
char *sendEnd = sendStr + sendLen;
|
|
// if we sent SOME last time, skip over that
|
|
sendStr += m_socketWriteBufSent;
|
|
// how much left?
|
|
int32_t remaining = sendEnd - sendStr;
|
|
// wtf?
|
|
if ( remaining <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// try a send on non-blocking socket
|
|
int32_t n = ::send ( sd , sendStr , remaining , 0 );
|
|
// did we send something?
|
|
if ( n > 0 ) {
|
|
m_socketWriteBufSent += n;
|
|
goto sendLoop;
|
|
}
|
|
// maybe it sent 0 because it was waiting for something
|
|
// so set our callback for when the socket is ready for
|
|
// writing again. try sending more later.
|
|
g_loop.registerWriteCallback ( sd ,
|
|
this ,
|
|
getSEOQueryInfoWrapper2,
|
|
0 ); // niceness = 0
|
|
// flag it so we don't leak these
|
|
m_registeredSocketCallback = true;
|
|
}
|
|
}
|
|
*/
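// The commented-out pumpSocketWriteBuf() above drains a write buffer
// over a non-blocking socket: it tracks how many bytes already went
// out, and when ::send() cannot make progress it registers a write
// callback and retries later. A minimal standalone sketch of that
// partial-send loop, kept out of the build and assuming a non-blocking
// fd; the callback registration itself is left to the caller.
/*
#include <errno.h>
#include <stdint.h>
#include <sys/socket.h>

// try to push buf[*sentSoFar..len) out of a non-blocking socket.
// returns true when everything was sent, false when the socket is
// full and the caller should register a write callback and retry.
static bool pumpNonBlockingSend ( int sd ,
				  const char *buf ,
				  int32_t len ,
				  int32_t *sentSoFar ) {
	while ( *sentSoFar < len ) {
		int32_t remaining = len - *sentSoFar;
		int32_t n = ::send ( sd , buf + *sentSoFar , remaining , 0 );
		if ( n > 0 ) { *sentSoFar += n; continue; }
		// would block: come back when the socket is writable again
		if ( n < 0 && ( errno == EAGAIN || errno == EWOULDBLOCK ) )
			return false;
		// treat 0 or any other error as "wait and retry" here;
		// real code would distinguish hard errors
		return false;
	}
	return true;
}
*/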
|
|
|
|
bool XmlDoc::getIsInjecting ( ) {
|
|
bool isInjecting = false;
|
|
//if ( g_inPageInject ) isInjecting = true;
|
|
if ( m_sreqValid && m_sreq.m_isInjecting ) isInjecting = true;
|
|
if ( m_isInjecting && m_isInjectingValid ) isInjecting = true;
|
|
return isInjecting;
|
|
}
|
|
|
|
|
|
int posdbKeyCmp ( const void *a, const void *b ) {
|
|
char *ka = (char *)a;
|
|
char *kb = (char *)b;
|
|
//int64_t tid64a = g_posdb.getTermId(ka);
|
|
//int64_t tid64b = g_posdb.getTermId(kb);
|
|
// a bit of a hack so handleRequest8e already has these
|
|
// guys sorted by their lower 32-bits of termids so it can
|
|
// match this doc to queries without having to sort first.
|
|
//uint32_t tid32a = (uint32_t)tid64a;
|
|
//uint32_t tid32b = (uint32_t)tid64b;
|
|
//if ( tid32a < tid32b ) return -1;
|
|
//if ( tid32a > tid32b ) return 1; // swap
|
|
//if ( tid64a < tid64b ) return -1;
|
|
//if ( tid64a > tid64b ) return 1; // swap
|
|
char val = KEYCMP(ka,kb,sizeof(POSDBKEY));
|
|
if ( val > 0 ) return 1;
|
|
if ( val < 0 ) return -1;
|
|
return 0;
|
|
}
|
|
|
|
|
|
// . used by XmlDoc::getTermListBuf() below
|
|
// . sorted by posdb key straight up
|
|
SafeBuf *XmlDoc::getTermIdSortedPosdbListBuf ( ) {
|
|
|
|
if ( m_sortedPosdbListBufValid )
|
|
return &m_sortedPosdbListBuf;
|
|
|
|
// get the lists. forDelete = false.
|
|
char *metaList = getMetaList ( false );
|
|
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
|
|
|
|
// sanity
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make a tmp buf to hold posdb keys
|
|
//SafeBuf tmp;
|
|
if ( ! m_sortedPosdbListBuf.reserve(m_metaListSize,"spbuf"))
|
|
return NULL;
|
|
// point into it
|
|
char *dst = m_sortedPosdbListBuf.getBufStart();
|
|
|
|
// debug test
|
|
//verifyMetaList ( m_metaList ,
|
|
// m_metaList + m_metaListSize ,
|
|
// false );
|
|
|
|
// scan the meta list for posdb keys
|
|
char *p = metaList;
|
|
char *pend = p + m_metaListSize;
|
|
// stole this loop from getMetaList()
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save it with the flag
|
|
char byte = *p;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// get key
|
|
char *key = p;
|
|
// skip that
|
|
p += ks;
|
|
// get datasize
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
// assume we do not store the datasize
|
|
//bool neg = false;
|
|
// . if key is negative, no data is present
|
|
// . the doledb key is negative for us here
|
|
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
|
|
// if datasize variable, read it in
|
|
if ( ds == -1 ) {
|
|
// get data size
|
|
ds = *(int32_t *)p;
|
|
// skip data size int32_t
|
|
p += 4;
|
|
}
|
|
// point to data
|
|
//char *data = p;
|
|
// skip data if not zero
|
|
p += ds;
|
|
// if not posdb skip rec
|
|
if ( rdbId != RDB_POSDB ) continue;
|
|
// skip negative keys
|
|
if ( (key[0] & 0x01) == 0x00 ) continue;
|
|
// add to new buf now
|
|
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
|
|
// advance
|
|
dst += sizeof(POSDBKEY);
|
|
}
|
|
char *start = m_sortedPosdbListBuf.getBufStart();
|
|
// update tmp
|
|
m_sortedPosdbListBuf.incrementLength ( dst - start );
|
|
// sanity
|
|
if ( m_sortedPosdbListBuf.length() > m_metaListSize ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// point
|
|
char *pbuf = m_sortedPosdbListBuf.getBufStart();
|
|
int32_t numKeys = m_sortedPosdbListBuf.length()/sizeof(POSDBKEY);
|
|
|
|
// sort keys by termid
|
|
gbqsort ( pbuf ,
|
|
numKeys,
|
|
sizeof(POSDBKEY),
|
|
posdbKeyCmp,
|
|
m_niceness );
|
|
|
|
m_sortedPosdbListBufValid = true;
|
|
return &m_sortedPosdbListBuf;
|
|
}
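// getTermIdSortedPosdbListBuf() above copies every positive posdb key
// out of the meta list into a flat buffer and then sorts the fixed-size
// keys in place. A minimal sketch of that collect-then-sort step, kept
// out of the build; KEY_SIZE and the plain byte-wise ordering are
// assumptions standing in for sizeof(POSDBKEY) and KEYCMP().
/*
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

enum { KEY_SIZE = 18 }; // stand-in for sizeof(POSDBKEY)

// byte-wise comparator over fixed-size keys, like posdbKeyCmp() above
static int sketchKeyCmp ( const void *a , const void *b ) {
	return memcmp ( a , b , KEY_SIZE );
}

// sort a flat buffer holding numKeys contiguous KEY_SIZE-byte keys
static void sortFlatKeyBuf ( char *buf , int32_t numKeys ) {
	qsort ( buf , numKeys , KEY_SIZE , sketchKeyCmp );
}
*/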
|
|
|
|
|
|
#define TLBUFSIZE 5000
|
|
|
|
// . used by the seo pipeline
|
|
// . this is a list of posdb termlists, one termlist per termid.
|
|
// . we store each termlist in this termlistbuf into g_termListCache
|
|
// . we use g_termListCache for evaluating gbdocid:xxx| restricted queries
|
|
// very quickly without having to hit disk because all the posdb termlists
|
|
// for that docid should be in g_termListCache
|
|
SafeBuf *XmlDoc::getTermListBuf ( ) {
|
|
|
|
if ( m_termListBufValid )
|
|
return &m_termListBuf;
|
|
|
|
// . ensure content is recycled from title rec
|
|
// . no, because if we had to download the doc fresh for the first
|
|
// time, this caused us headaches around line 30657 and we ended
|
|
// up setting m_docIndexed to false there and calling logIt() twice!
|
|
//m_recycleContent = true;
|
|
//m_recycleLinkInfo = true;
|
|
|
|
// try to set from title rec first. return false if it blocked.
|
|
//if ( ! loadFromOldTitleRec() ) return (SafeBuf *)-1;
|
|
|
|
// did that fail? i.e. docid not found!?!?!
|
|
//if ( m_oldTitleRecValid && ! m_oldTitleRec ) {
|
|
// g_errno = ENOTFOUND;
|
|
// return NULL;
|
|
//}
|
|
|
|
// only get posdb keys in getMetaList()
|
|
/*
|
|
m_useTitledb = false;
|
|
m_useTagdb = false;
|
|
m_useClusterdb = false;
|
|
m_useSpiderdb = false;
|
|
m_useLinkdb = false;
|
|
*/
|
|
|
|
// . these are FULL 18-byte keys, no compression
|
|
// . sorted by posdbkeys straight up, so by termid
|
|
SafeBuf *posdbBuf = getTermIdSortedPosdbListBuf ();
|
|
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
|
|
|
|
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
|
|
|
|
// . reserve mem for new termlistbuf
|
|
// . include 4 bytes for listsize
|
|
// . this buffer will be a list of lists
|
|
int32_t need = numKeys * (sizeof(POSDBKEY) + 4);
|
|
if ( ! m_termListBuf.reserve ( need ,"tlstbuf" ) )
|
|
return NULL;
|
|
|
|
|
|
int64_t lastTermId = -1LL;
|
|
/*
|
|
char tmpBuf[TLBUFSIZE];
|
|
// build termlists from the posdb records
|
|
RdbList termList;
|
|
// stolen from RdbList::set
|
|
termList.m_list = tmpBuf;
|
|
termList.m_listSize = 0;
|
|
termList.m_listEnd = tmpBuf;
|
|
termList.m_alloc = tmpBuf;
|
|
termList.m_allocSize = TLBUFSIZE;
|
|
termList.m_ownData = false;
|
|
termList.m_ks = sizeof(POSDBKEY);
|
|
termList.m_fixedDataSize = 0;
|
|
termList.m_ownData = false;
|
|
termList.m_useHalfKeys = true;
|
|
termList.resetListPtr();
|
|
bool breakOut = false;
|
|
*/
|
|
// start a size bookmark
|
|
int32_t *bookmark = NULL;
|
|
// scan all the sorted posdb keys and build posdb termlists and
|
|
// store the termlists into "m_termListBuf"
|
|
char *p = posdbBuf->getBufStart();
|
|
char *pend = p + posdbBuf->length();
|
|
for ( ; p < pend ; ) {
|
|
// get the key
|
|
char *key = p;
|
|
// must be full 18 byte keys!
|
|
if ( p[0] & 0x06 ) { char *xx=NULL;*xx=0; }
|
|
// skip it
|
|
p += sizeof(POSDBKEY);
|
|
// get key termid
|
|
int64_t termId = g_posdb.getTermId ( key );
|
|
// sanity
|
|
int64_t docId = g_posdb.getDocId ( key );
|
|
if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
|
|
// sanity. is it sorted by termid?
|
|
if ( termId < lastTermId && lastTermId != -1 ) {
	char *xx=NULL;*xx=0; }
|
|
// log it for debug
|
|
//if ( docId == 192304365235LL )
|
|
// log("tlist: docid=%" INT64 " termId=%" INT64 " wpos=%" INT32 "",
|
|
// docId,
|
|
// termId,
|
|
// g_posdb.getWordPos(key));
|
|
// . store size of keys following that have same termid
|
|
// . assume just one for now!
|
|
if ( termId != lastTermId ) {
|
|
bookmark = (int32_t *)m_termListBuf.getBuf();
|
|
m_termListBuf.pushLong(sizeof(POSDBKEY));
|
|
}
|
|
// store the key
|
|
m_termListBuf.safeMemcpy ( key , sizeof(POSDBKEY) );
|
|
// if not first in the list, update size
|
|
if ( termId == lastTermId ) *bookmark += sizeof(POSDBKEY);
|
|
// . cache currently made list then
|
|
// . set startkey/endkey
|
|
//char startKey[sizeof(POSDBKEY)];
|
|
//char endKey [sizeof(POSDBKEY)];
|
|
//g_posdb.makeStartKey(startKey,lastTermId,m_docId);
|
|
//g_posdb.makeEndKey (endKey,lastTermId,m_docId);
|
|
// update it for next list
|
|
lastTermId = termId;
|
|
// . add to ongoing list? will use compression bit.
|
|
// . return true with g_errno set on error
|
|
// . use g_termListCache in Msg0.cpp
|
|
//if(!addToTermListCache(cr->m_coll,startKey,endKey,&termList))
|
|
// return true;
|
|
// first store the list size
|
|
//m_termListBuf.pushLong(termList.m_listSize);
|
|
// then the list data itself
|
|
//m_termListBuf.safeMemcpy(termList.m_list,termList.m_listSize)
|
|
// now reset
|
|
//termList.m_listSize = 0;
|
|
//termList.m_list = tmpBuf;
|
|
//termList.m_listEnd = tmpBuf;//ermList.m_list;
|
|
//termList.resetListPtr();
|
|
// if we are a loopback, bail
|
|
//if ( breakOut ) break;
|
|
// are we the last record?
|
|
//if ( p >= pend ) breakOut = true;
|
|
// add fresh to the new termlist
|
|
//goto addIt;
|
|
}
|
|
|
|
// sanity
|
|
if ( m_termListBuf.length() &&
|
|
g_posdb.getDocId(m_termListBuf.getBufStart()+4) != m_docId ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
m_termListBufValid = true;
|
|
|
|
return &m_termListBuf;
|
|
// print timing
|
|
//int64_t now = gettimeofdayInMilliseconds();
|
|
//int64_t took = now - m_cacheStartTime;
|
|
//log("seopipe: took %" INT64 " ms to parse docid %" INT64 "",took,m_docId);
|
|
// . flag it as being completely cached now
|
|
// . returns false and sets g_errno on error
|
|
//return addDocIdToTermListCache ( m_docId , cr->m_coll );
|
|
}
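// getTermListBuf() above walks the termid-sorted keys and emits one
// length-prefixed termlist per termid: a 4-byte size followed by that
// termid's fixed-size keys. A minimal sketch of that grouping step,
// kept out of the build; getTermIdOf is a caller-supplied stand-in for
// g_posdb.getTermId() and KEY_BYTES for sizeof(POSDBKEY).
/*
#include <stdint.h>
#include <string.h>
#include <vector>

enum { KEY_BYTES = 18 }; // stand-in for sizeof(POSDBKEY)

// group termid-sorted keys into [int32 size][keys...] records
static void buildTermListBuf ( const char *keys , int32_t numKeys ,
			       int64_t (*getTermIdOf)(const char *key) ,
			       std::vector<char> *out ) {
	int64_t lastTermId = -1;
	size_t  sizeOff    = 0; // offset of current record's size field
	for ( int32_t i = 0 ; i < numKeys ; i++ ) {
		const char *key = keys + (size_t)i * KEY_BYTES;
		int64_t termId = getTermIdOf ( key );
		if ( termId != lastTermId ) {
			// start a new record with a zero size to patch
			sizeOff = out->size();
			int32_t zero = 0;
			out->insert ( out->end() ,
				      (char *)&zero , (char *)&zero + 4 );
			lastTermId = termId;
		}
		// append the key itself
		out->insert ( out->end() , key , key + KEY_BYTES );
		// bump the size field of the current record
		int32_t sz; memcpy ( &sz , &(*out)[sizeOff] , 4 );
		sz += KEY_BYTES;
		memcpy ( &(*out)[sizeOff] , &sz , 4 );
	}
}
*/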
|
|
|
|
|
|
//int32_t XmlDoc::getNumInsertableTerms ( ) {
|
|
// // make sure they called getInsertableTerms() first!
|
|
// if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0;}
|
|
// return m_insertableTermsBuf.length() / sizeof(InsertableTerm);
|
|
//}
|
|
|
|
// . return a list of InsertableTerms
|
|
// . these are just terms we will try to insert into the document in every
|
|
// possible place to see how they affect ranking of this document for
|
|
// all the applicable queries
|
|
// . then when we call getScoredInsertableTerms() it will fill in the
|
|
// m_queryChangeBuf array
|
|
SafeBuf *XmlDoc::getInsertableTerms ( ) {
|
|
|
|
if ( m_insertableTermsBufValid )
|
|
return &m_insertableTermsBuf;
|
|
|
|
// make sure related query string buf is valid
|
|
//SafeBuf *rrr = getRelatedQueryLinksWithStrings();
|
|
//if ( ! rrr || rrr == (void *)-1 ) return rrr;
|
|
|
|
// just use this now
|
|
SafeBuf *mtBuf = getMissingTermBuf();
|
|
if ( ! mtBuf || mtBuf == (void *)-1 ) return mtBuf;
|
|
|
|
// get buffer of ptrs to the msg99 replies for this url
|
|
//SafeBuf *mqbuf = getMatchingQueries ( false );
|
|
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
|
|
|
|
// just use the MissingTerm class for these as well!!
|
|
SafeBuf *maBuf = getMatchingTermBuf();
|
|
if ( ! maBuf || maBuf == (void *)-1 ) return maBuf;
|
|
|
|
|
|
|
|
//
|
|
// alloc space for the insertable terms in its safebuf
|
|
//
|
|
int32_t need = 0;
|
|
char *p;
|
|
char *pend;
|
|
p = mtBuf->getBufStart();
|
|
pend = mtBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
MissingTerm *mt = (MissingTerm *)p;
|
|
p += mt->getSize();
|
|
need += sizeof(InsertableTerm);
|
|
need += mt->getTermSize();
|
|
}
|
|
// these are the matching terms, but use the same MissingTerm class
|
|
p = maBuf->getBufStart();
|
|
pend = maBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
MissingTerm *mt = (MissingTerm *)p;
|
|
p += mt->getSize();
|
|
need += sizeof(InsertableTerm);
|
|
need += mt->getTermSize();
|
|
}
|
|
if ( ! m_insertableTermsBuf.reserve ( need ,"itblbuf" ) ) return NULL;
|
|
|
|
//
|
|
// now interleave the matching terms with the related terms
|
|
//
|
|
|
|
char *p1 = mtBuf->getBufStart();
|
|
char *p1End = mtBuf->getBuf();
|
|
|
|
char *p2 = maBuf->getBufStart();
|
|
char *p2End = maBuf->getBuf();
|
|
|
|
// shortcut
|
|
SafeBuf *ib = &m_insertableTermsBuf;
|
|
|
|
int32_t count; for ( count = 0 ; ; count++ ) {
|
|
// . just get top 50 insertable terms
|
|
// . use #define MAX_INSERTABLE_TERMS 50?
|
|
if ( count >= 50 ) break;
|
|
bool add1 = false;
|
|
bool add2 = false;
|
|
if ( ( count % 2 ) == 0 && p1 < p1End ) add1 = true;
|
|
if ( ( count % 2 ) == 1 && p2 < p2End ) add2 = true;
|
|
if ( ! add1 && ! add2 ) break;
|
|
MissingTerm *mt;
|
|
if ( add1 ) {
|
|
mt = (MissingTerm *)p1;
|
|
p1 += mt->getSize();
|
|
}
|
|
if ( add2 ) {
|
|
mt = (MissingTerm *)p2;
|
|
p2 += mt->getSize();
|
|
}
|
|
// make an insertable term
|
|
InsertableTerm it;
|
|
if ( add1 ) it.m_isRelatedTerm = true;
|
|
else it.m_isRelatedTerm = false;
|
|
// sum of traffic of the queries that contained this term
|
|
it.m_trafficSum = mt->m_traffic;
|
|
// hash it up
|
|
char *term = mt->getTerm();
|
|
int32_t termSize = mt->getTermSize();
|
|
it.m_termHash64 = hash64 ( term , termSize - 1 );
|
|
it.m_termSize = termSize;
|
|
// reset this for later use
|
|
it.m_bestTrafficGain = -1;
|
|
it.m_bestInsertPos = -1;
|
|
// store that insertable term
|
|
ib->safeMemcpy(&it,sizeof(InsertableTerm));
|
|
// then the term string itself follows for easy serialization
|
|
// into cachedb...
|
|
ib->safeMemcpy(term,termSize);
|
|
}
|
|
|
|
if ( ib->length() > need ) { char *xx=NULL;*xx=0; }
|
|
|
|
//m_numInsertableTerms = count;
|
|
|
|
m_insertableTermsBufValid = true;
|
|
return &m_insertableTermsBuf;
|
|
}
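// getInsertableTerms() above interleaves the missing-term list with the
// matching-term list, alternating turns and capping the result at 50
// entries; the loop stops as soon as the list whose turn it is has run
// dry. A minimal sketch of that round-robin merge over two plain
// vectors, kept out of the build; the element type and the cap are
// illustrative assumptions.
/*
#include <stdint.h>
#include <string>
#include <vector>

// alternate between two source lists, capped at maxOut entries; the
// merge ends when the list whose turn it is has nothing left
static std::vector<std::string> interleaveTerms (
	const std::vector<std::string> &missing ,
	const std::vector<std::string> &matching ,
	int32_t maxOut ) {
	std::vector<std::string> out;
	size_t i1 = 0 , i2 = 0;
	for ( int32_t count = 0 ; count < maxOut ; count++ ) {
		bool add1 = ( (count % 2) == 0 && i1 < missing .size() );
		bool add2 = ( (count % 2) == 1 && i2 < matching.size() );
		if ( ! add1 && ! add2 ) break;
		if ( add1 ) out.push_back ( missing [i1++] );
		if ( add2 ) out.push_back ( matching[i2++] );
	}
	return out;
}
*/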
|
|
|
|
|
|
static void gotMsg95ReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->gotMsg95Reply( slot );
|
|
}
|
|
|
|
void XmlDoc::gotMsg95Reply ( UdpSlot *slot ) {
|
|
// count it
|
|
m_numMsg95Replies++;
|
|
// return if still waiting
|
|
if ( m_numMsg95Replies < m_numMsg95Requests ) return;
|
|
// . store each msg95reply
|
|
// . TODO: do we need m_msg95ReplyAlloc[] like m_msg99 has?
|
|
m_msg95ReplyPtrs [slot->m_hostId] = slot->m_readBuf;
|
|
m_msg95ReplySizes[slot->m_hostId] = slot->m_readBufSize;
|
|
// do not let it free it, we will free it
|
|
slot->m_readBuf = NULL;
|
|
// all done! should call getScoredInsertableTerms() indirectly
|
|
m_masterLoop ( m_masterState );
|
|
}
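// gotMsg95Reply() above follows the scatter/gather pattern used all
// over this file: each reply bumps a counter, the reply buffer is
// parked per host, and only the final reply re-enters the master loop.
// A minimal generic sketch of that completion counter, kept out of the
// build; the per-slot buffer stealing is intentionally left out.
/*
#include <stdint.h>

// track an N-way fan-out; fires onAllDone(state) exactly once, when
// the last outstanding reply has come back
class ReplyGather {
public:
	int32_t m_numRequests;
	int32_t m_numReplies;
	void  (*m_onAllDone)(void *state);
	void   *m_state;

	void init ( int32_t numRequests ,
		    void (*onAllDone)(void *state) , void *state ) {
		m_numRequests = numRequests;
		m_numReplies  = 0;
		m_onAllDone   = onAllDone;
		m_state       = state;
	}

	// call once per reply (or per host we decided to skip)
	void noteReply ( ) {
		m_numReplies++;
		if ( m_numReplies < m_numRequests ) return;
		m_onAllDone ( m_state );
	}
};
*/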
|
|
|
|
#include "seo.h" // for Msg95Request class
|
|
|
|
/*
|
|
// return a buffer of WordFreqInfo instances for every word in the
|
|
// insertable terms buffer. we use this so the msg95 handler can get the
|
|
// term freqs of any term in any matching query consistently, because
|
|
// we are host #0 calling this presumably. msg95 handler will use these
|
|
// to set the termfreqs in the Msg39Request when calling msg39.
|
|
// TODO: run through related queries as well! why didn't insertable terms
|
|
// work!?!?! it should...
|
|
SafeBuf *XmlDoc::getInsertableWordFreqInfoBuf ( ) {
|
|
|
|
// must always be host 0 or it's twin! we have to ensure
|
|
// consistency always when calling getTermFreq()...
|
|
if ( g_hostdb.m_groupId != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( m_iwfiBufValid )
|
|
return &m_iwfiBuf;
|
|
|
|
// get the same top word ids we pass to the msg95 request,
|
|
// because handleRequest95() uses those to get the queries
|
|
// that we match, and it evaluates each of those queries on each
|
|
// insertion we do.
|
|
// So that is the ptr_twid32Buf, which MUST include all
|
|
// insertable terms as well, like those insertable terms that are
|
|
// new to us!!
|
|
|
|
// scan list of insertable terms
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
|
|
|
|
// . true means to get synonyms
|
|
// . itBuf non-null will append new insertable terms we don't have
|
|
int32_t *twids = getTopTermsVectorWithNewTerms ( true , itBuf );
|
|
if ( ! twids || twids==(void *)-1 ) return (SafeBuf *)twids;
|
|
|
|
// shortcut
|
|
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
|
|
//int32_t ni = itBuf->length() / sizeof(InsertableTerm);
|
|
|
|
// get buffer of ptrs to the msg99 replies for this url
|
|
//SafeBuf *mqbuf = getMatchingQueries ( false );
|
|
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
|
|
//Msg99Reply **mrp = (Msg99Reply **)mqbuf->getBufStart();
|
|
//int32_t nmrp = mqbuf->length() / 4;
|
|
|
|
|
|
// use table to dedup so we do not store dups
|
|
HashTableX dups;
|
|
if ( ! dups.set ( 8,0,8192,NULL,0,false,m_niceness,"iwfidup") )
|
|
return NULL;
|
|
|
|
// . first store the langid in the buf!!!
|
|
// . then the wordfreqinfos follow!
|
|
if ( ! m_iwfiBuf.safeMemcpy ( &docLangId , 1 ) )
|
|
return NULL;
|
|
|
|
char *p = itBuf->getBufStart();
|
|
char*pend = itBuf->getBuf();
|
|
|
|
// scan each "term" which might be one or more words
|
|
for ( ; p < pend ; ) {
|
|
//for ( int32_t i = 0 ; i < nmrp ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// add it in
|
|
if ( ! addTermFreqsForTerm ( it->getTerm() , &dups ) )
|
|
return NULL;
|
|
}
|
|
|
|
// do the same for all words and bigram terms in doc as well
|
|
|
|
|
|
m_iwfiBufValid = true;
|
|
return &m_iwfiBuf;
|
|
}
|
|
|
|
bool XmlDoc::addTermFreqsForTerm ( char *term , HashTableX *dups ) {
|
|
|
|
// we need this for synonyms
|
|
//uint8_t langId = langEnglish;
|
|
uint8_t *langIdPtr = getLangId();
|
|
// this should have been set by parent caller
|
|
if ( ! langIdPtr || langIdPtr == (uint8_t *)-1 ) {char *xx=NULL;*xx=0;}
|
|
// get the language this doc is in
|
|
uint8_t docLangId = *langIdPtr;
|
|
// if unknown, use english!
|
|
if ( docLangId == langUnknown ) docLangId = langEnglish;
|
|
|
|
|
|
//Msg99Reply *mr = mrp[i];
|
|
//Words ww;
|
|
//ww.set3 ( it->m_termStr );
|
|
//ww.set3(it->getTerm() );//mr->m_queryStr );//it->m_termStr );
|
|
Query qq;
|
|
// false = query expansion? i.e. use synonyms?
|
|
//qq.set2 ( it->getTerm(),docLangId,true);
|
|
qq.set2 ( term,docLangId,true);
|
|
//if ( strstr ( mr->m_queryStr, "bio wagner"))
|
|
// log("hey");
|
|
log("adding %s",term);
|
|
//int64_t *wids = ww.getWordIds();
|
|
// scan each word for term freq
|
|
for ( int32_t j = 0 ; j < qq.m_numTerms ; j++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qq.m_qterms[j];
|
|
// get the full 64-bit hash of the word
|
|
int64_t wid = qt->m_rawTermId;
|
|
// skip if punct
|
|
if ( ! wid ) continue;
|
|
// dup?
|
|
if ( dups->isInTable ( &wid ) ) continue;
|
|
// add it
|
|
int64_t tf = g_posdb.getTermFreq ( cr->m_coll, wid );
|
|
if ( ! dups->addKey ( &wid ) ) return NULL;
|
|
WordFreqInfo wfi;
|
|
wfi.m_wordId64 = wid;
|
|
wfi.m_wordFreq64 = tf;
|
|
// note it
|
|
SafeBuf bb;
|
|
bb.safePrintf("seo: tf for term=\"");
|
|
bb.safeMemcpy ( qt->m_term, qt->m_termLen);
|
|
bb.safePrintf("\" = %" INT64 "",tf);
|
|
log("seo: %s",bb.getBufStart());
|
|
// store it
|
|
if(!m_iwfiBuf.safeMemcpy(&wfi,sizeof(WordFreqInfo)))
|
|
return NULL;
|
|
}
|
|
return true;
|
|
}
|
|
*/
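// The commented-out addTermFreqsForTerm() above looks up each term
// frequency only once per distinct word id, using a hash table purely
// as a dedup set. A minimal sketch of that dedup step, kept out of the
// build; lookupTermFreq is a caller-supplied stand-in for
// g_posdb.getTermFreq() and std::set replaces HashTableX.
/*
#include <stdint.h>
#include <set>
#include <utility>
#include <vector>

// collect (wordId, termFreq) pairs, skipping word ids already seen
static void collectTermFreqs (
	const std::vector<int64_t> &wordIds ,
	int64_t (*lookupTermFreq)(int64_t wordId) ,
	std::set<int64_t> *seen ,
	std::vector< std::pair<int64_t,int64_t> > *out ) {
	for ( size_t i = 0 ; i < wordIds.size() ; i++ ) {
		int64_t wid = wordIds[i];
		if ( wid == 0 ) continue;              // punctuation
		if ( seen->count ( wid ) ) continue;   // already looked up
		seen->insert ( wid );
		out->push_back ( std::make_pair ( wid ,
						  lookupTermFreq(wid) ) );
	}
}
*/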
|
|
|
|
// 2. now transmit all the insertable terms to each host in the network. each
|
|
// host will evaluate each term in the list for every query that that
|
|
// host has in its memory for every new word position. kick this process
|
|
// off with the getNewRanks() function which returns a list of
|
|
// query terms where each query term has a wordposition/trafficgain
|
|
// array. [try to also insert entire phrases not just words]
|
|
// Each host will return an InsertedTerm class for each term. But then
|
|
// WE have to merge the InsertedTerm classes together for a particular
|
|
// term. That can be a bit tricky since we do not list a wordposition
|
|
// if it's traffic gain was the same as its previous wordposition.
|
|
// PASS in the entire doc's termlist with each request in case not in cache
|
|
// so it can evaluate each query's scores very quickly!
|
|
//
|
|
// . send a msg95 request to each host consisting of a list of terms to
|
|
// insert, and the entire termlists of this document.
|
|
// . then merge the replies into a final list of InsertedTerms.
|
|
// . returned is buffer of InsertableTerms
|
|
SafeBuf *XmlDoc::getScoredInsertableTerms ( ) {
|
|
|
|
setStatus ( "getscoredinsertableterms" );
|
|
|
|
if ( m_scoredInsertableTermsBufValid )
|
|
return &m_insertableTermsBuf;
|
|
|
|
uint8_t *langIdPtr = getLangId();
|
|
if ( ! langIdPtr || langIdPtr == (void *)-1 )
|
|
return (SafeBuf *)langIdPtr;
|
|
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
|
|
|
|
// these are the posdb keys of our document, makes it fast
|
|
// and easy for msg39 to return a serp score restricted to our docid
|
|
SafeBuf *termListBuf = getTermListBuf();
|
|
if ( ! termListBuf || termListBuf==(void *)-1 )
|
|
return termListBuf;
|
|
|
|
|
|
// this has all our documents terms and their synonyms in it,
|
|
// as well as the new terms we plan to insert that our doc does not
|
|
// have, from the getMissingTerms() buffer. in addition it
|
|
// has the term freq of each one!
|
|
SafeBuf *ntiBuf = getNewTermInfoBuf();
|
|
if ( ! ntiBuf || ntiBuf == (void *)-1 ) return (SafeBuf *)ntiBuf;
|
|
|
|
// get list of TermFreqInfo instances for all words in the
|
|
// list of insertable terms
|
|
//SafeBuf *wfib = getInsertableWordFreqInfoBuf ( );
|
|
//if ( ! wfib || wfib == (void *)-1 ) return wfib;
|
|
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) return wpib;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if still waiting for replies to come in, return -1
|
|
if ( m_numMsg95Requests > 0 && m_numMsg95Replies < m_numMsg95Requests )
|
|
return (SafeBuf *)-1;
|
|
|
|
top:
|
|
|
|
// otherwise, we are done!
|
|
if ( m_numMsg95Requests > 0 && m_numMsg95Replies >=m_numMsg95Requests){
|
|
// . calculate the best insertable position for each
|
|
// Insertable Term.
|
|
// . we get a QueryChange array back from each host for
|
|
// the same term, but for queries local on that host,
|
|
// so add them all up here and set
|
|
// InsertableTerm::m_bestTrafficGain/m_bestTermPosition
|
|
// . queries that did not have us in the top 50 will not
|
|
// be in the reply
|
|
processMsg95Replies();
|
|
// show how long it took
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginMsg95s;
|
|
log("seopipe: time: getscoredinsertableterms took %" INT64 " ms",
|
|
took);
|
|
// return the list of InsertableTerms, scored
|
|
m_scoredInsertableTermsBufValid = true;
|
|
// cache it! if it blocks that is ok, since it is valid now
|
|
// disable for debug... MDW!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
if ( ! storeScoredInsertableTermsIntoCachedb() )
|
|
return (SafeBuf *)-1;
|
|
return &m_insertableTermsBuf;
|
|
}
|
|
|
|
|
|
// now send every term in this list to every host in the
|
|
// network so it can evaluate with each of the queries it contains
|
|
// in memory from the query log for every position in the doc.
|
|
// then it will return InsertableTerm::m_wordPositions/m_trafficGain
|
|
// arrays for each InsertableTerm.
|
|
|
|
// time how long this whole thing takes
|
|
m_beginMsg95s = gettimeofdayInMilliseconds();
|
|
// reset this crap i guess
|
|
m_numMsg95Requests = 0;
|
|
m_numMsg95Replies = 0;
|
|
|
|
// from seo.h
|
|
Msg95Request mr;
|
|
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
mr.m_docId = m_docId;
|
|
mr.m_docLangId = *langIdPtr;
|
|
mr.m_seoDebug = m_seoDebug;
|
|
|
|
mr.ptr_posdbTermList = termListBuf->getBufStart();
|
|
// a buffer of TermInfos. used to set the termFreq of each term
|
|
// and used to determine what queries match the doc and should be
|
|
// evaluated for every insertion.
|
|
mr.ptr_termInfoBuf = ntiBuf->getBufStart();
|
|
mr.ptr_coll = cr->m_coll;
|
|
//mr.ptr_wordFreqInfoBuf = wfib->getBufStart();
|
|
mr.ptr_wordPosInfoBuf = wpib->getBufStart();
|
|
// why do we need this? doesn't termInfoBuf have all that? no,
|
|
// because we limit insertableterms to like the top 300 highest
|
|
// scoring, so they are separate. the termInfoBuf is sorted by
|
|
// termid (lower 32-bits) and has a termfreq and is used to
|
|
// get the matching queries in seo.cpp:handlerequest95()
|
|
mr.ptr_insertableTerms = m_insertableTermsBuf.getBufStart();
|
|
|
|
mr.size_posdbTermList = termListBuf->length();
|
|
mr.size_termInfoBuf = ntiBuf->length();//m_numTwids * 4;
|
|
mr.size_coll = gbstrlen(cr->m_coll)+1;
|
|
//mr.size_wordFreqInfoBuf = wfib->length();
|
|
mr.size_wordPosInfoBuf = wpib->length();
|
|
mr.size_insertableTerms = m_insertableTermsBuf.length();
|
|
|
|
int32_t requestSize;
|
|
char *req = serializeMsg ( sizeof(Msg95Request),
|
|
&mr.size_posdbTermList ,// firstSizeParm
|
|
&mr.size_insertableTerms,//lastSizeP
|
|
&mr.ptr_posdbTermList ,// firststrptr
|
|
&mr ,// thisPtr
|
|
&requestSize ,
|
|
NULL ,
|
|
0 ,
|
|
true );
|
|
|
|
if ( ! req ) return NULL;
|
|
|
|
int32_t numHosts = g_hostdb.m_numHosts;
|
|
// do not re-send if we already did this!
|
|
if ( m_numMsg95Requests > 0 ) numHosts = 0;
|
|
|
|
// send one msg95 request to each host. skip if dead.
|
|
for ( int32_t i = 0; i < numHosts ; i++ ) {
|
|
// get ptr to the host
|
|
Host *host = g_hostdb.getHost(i);
|
|
// get hostid of host #i
|
|
int32_t hostId = host->m_hostId;
|
|
// count it
|
|
m_numMsg95Requests++;
|
|
// skip if dead. i guess no queries from that guy. we can't
|
|
// send to a twin because the twin does not have the same
|
|
// queries in its in-memory query log. once we get more
|
|
// machines we should probably make the twin have the same
|
|
// copy so we can be redundant.
|
|
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive ) {
|
|
log("seo: warning. host %" INT32 " is dead so we could "
|
|
"not do the keyword tool right",hostId);
|
|
m_numMsg95Replies++;
|
|
continue;
|
|
}
|
|
// . send our posdb termlist to each host so it can
|
|
// call msg39 restricted to our docid very quickly
|
|
// . also send a ALL of the insertable terms to each
|
|
// host so they can evaluate the insertion for all of the
|
|
// relevant queries.
|
|
// . each host should be smart enough to realize that some
|
|
// queries need not be performed for an insertion because
|
|
// it is impossible to break the minimum score to be in the
|
|
// top 50 for that query. but we'll only have a minimum
|
|
// score for each query once we run a batch to eval
|
|
// each query at least partially to get a rough idea of
|
|
// the score needed to be in the top 50.
|
|
// . reply should be an array of QueryChanges for each
|
|
// insertable term for every query that matches this doc
|
|
// in the g_qlog buffer.
|
|
// . in most cases these arrays will be empty because we are
|
|
// not in the top 50 for that query
|
|
if ( ! g_udpServer.sendRequest ( req ,
|
|
requestSize ,
|
|
0x95 , // msgtype
|
|
host->m_ip , // ip
|
|
host->m_port , // port
|
|
hostId,
|
|
NULL, // retslot
|
|
this,
|
|
gotMsg95ReplyWrapper,
|
|
10000 , // timeout
|
|
-1 , // backoff
|
|
-1 , // maxwait
|
|
NULL, // replybuf
|
|
0, // replybufmaxsize
|
|
m_niceness // niceness
|
|
)) {
|
|
// let admin know about error
|
|
log("seopipe: sendRequest 95 had error: %s",
|
|
mstrerror(g_errno));
|
|
// count it as replied then
|
|
m_numMsg95Replies++;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// wait for all msg95 replies to come in
|
|
if ( m_numMsg95Requests > m_numMsg95Replies )
|
|
return (SafeBuf *)-1;
|
|
|
|
// somehow we finished without blocking
|
|
goto top;
|
|
|
|
// dummy return
|
|
return NULL;
|
|
}
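// getScoredInsertableTerms() above serializes one request and
// broadcasts it to every host, counting a dead host or a failed send as
// an already-received reply so the gather can still complete. A minimal
// sketch of that broadcast-with-skips accounting, kept out of the
// build; isHostDead and sendToHost are hypothetical stand-ins for
// g_hostdb.isDead() and g_udpServer.sendRequest().
/*
#include <stdint.h>

// broadcast one request to numHosts hosts; hosts we cannot reach are
// counted as replied immediately so the caller's gather terminates.
// returns the number of requests actually in flight.
static int32_t broadcastRequest (
	int32_t numHosts ,
	bool (*isHostDead)(int32_t hostId) ,
	bool (*sendToHost)(int32_t hostId) ,   // false = send failed
	int32_t *numRequests ,
	int32_t *numReplies ) {
	int32_t inFlight = 0;
	for ( int32_t i = 0 ; i < numHosts ; i++ ) {
		(*numRequests)++;
		if ( isHostDead ( i ) || ! sendToHost ( i ) ) {
			// nothing will come back from this host
			(*numReplies)++;
			continue;
		}
		inFlight++;
	}
	return inFlight;
}
*/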
|
|
|
|
|
|
// now sort the huge ptr buffer to QueryChanges first by:
|
|
// 1: QueryChange::m_termHash64
|
|
// 2: QueryChange::m_queryHash32
|
|
// 3: QueryChange::m_insertPos
|
|
int queryChangeCmp ( const void *a, const void *b ) {
|
|
QueryChange *qa = *(QueryChange **)a;
|
|
QueryChange *qb = *(QueryChange **)b;
|
|
// smallest term hash should be at the head of the list
|
|
if ( qa->m_termHash64 < qb->m_termHash64 ) return -1;
|
|
if ( qa->m_termHash64 > qb->m_termHash64 ) return 1;
|
|
if ( qa->m_queryHash32 < qb->m_queryHash32 ) return -1;
|
|
if ( qa->m_queryHash32 > qb->m_queryHash32 ) return 1;
|
|
if ( qa->m_insertPos < qb->m_insertPos ) return -1;
|
|
if ( qa->m_insertPos > qb->m_insertPos ) return 1;
|
|
return 0;
|
|
}
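// queryChangeCmp() above orders QueryChange pointers by
// (termHash64, queryHash32, insertPos), i.e. a lexicographic multi-key
// comparison. A minimal equivalent sketch using std::sort with a
// strict-weak-ordering predicate over a simplified struct, kept out of
// the build; the struct itself is illustrative only.
/*
#include <stdint.h>
#include <algorithm>
#include <vector>

struct SketchChange {
	int64_t m_termHash64;
	int32_t m_queryHash32;
	int32_t m_insertPos;
};

// "less than" by (termHash64, queryHash32, insertPos), ascending
static bool sketchChangeLt ( const SketchChange *a ,
			     const SketchChange *b ) {
	if ( a->m_termHash64  != b->m_termHash64  )
		return a->m_termHash64  < b->m_termHash64;
	if ( a->m_queryHash32 != b->m_queryHash32 )
		return a->m_queryHash32 < b->m_queryHash32;
	return a->m_insertPos < b->m_insertPos;
}

static void sortChanges ( std::vector<SketchChange *> *v ) {
	std::sort ( v->begin() , v->end() , sketchChangeLt );
}
*/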
|
|
|
|
|
|
// . make each InsertableTerm point to a linked list of QueryChanges for it.
|
|
// . each QueryChange is a word position and a rank change
|
|
// . the linked list will be sorted by QueryChange::m_insertPos
|
|
// . there can be multiple QueryChanges for a single m_insertPos, but
|
|
// they will be for different queries.
|
|
bool XmlDoc::processMsg95Replies() {
|
|
|
|
int32_t need = 0;
|
|
// each reply is a list of QueryChanges
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// get reply
|
|
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
|
|
// skip if empty, error?
|
|
if ( ! mr ) continue;
|
|
// deserialize the msg95replies first
|
|
deserializeMsg ( sizeof(Msg95Reply) ,
|
|
(int32_t *)&mr->size_queryChangeBuf,//1stszparm
|
|
(int32_t *)&mr->size_queryLogBuf,//lastszparm
|
|
(char **)&mr->ptr_queryChangeBuf,//1ststrptr
|
|
mr->m_buf );
|
|
// scan the QueryChanges
|
|
//QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
|
|
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
|
|
need += ncs * sizeof(QueryChange *); // one pointer per QueryChange
|
|
}
|
|
// alloc now
|
|
SafeBuf hugePtrBuf;
|
|
if ( ! hugePtrBuf.reserve ( need ,"hpbuf" ) ) return false;
|
|
|
|
// how big are all query log bufs?
|
|
int32_t sumTotal = 0;
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// get reply
|
|
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
|
|
// skip if empty, error?
|
|
if ( ! mr ) continue;
|
|
// how big
|
|
sumTotal += mr->size_queryLogBuf;
|
|
}
|
|
m_queryLogBuf.reset();
|
|
if ( ! m_queryLogBuf.reserve ( sumTotal ,"qlogbuf") ) return false;
|
|
char *orig = m_queryLogBuf.getBufStart();
|
|
|
|
int32_t ongoingOffset = 0;
|
|
int32_t ongoingDebugOffset = 0;
|
|
int32_t ongoingOrigOffset = 0;
|
|
|
|
// . fill up hugePtrBuf for sorting below
|
|
// . also fill up m_queryLogBuf now for store*IntoCachedb()
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// get reply
|
|
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
|
|
// skip if empty, error?
|
|
if ( ! mr ) continue;
|
|
// ref it
|
|
//char *ref = m_queryLogBuf.getBuf();
|
|
//int32_t ref = m_queryLogBuf.length();
|
|
// add to our big buffer
|
|
m_queryLogBuf.safeMemcpy ( mr->ptr_queryLogBuf ,
|
|
mr->size_queryLogBuf );
|
|
// debug scores. should be length 0 if not debugging.
|
|
m_debugScoreInfoBuf.safeMemcpy ( mr->ptr_debugScoreInfoBuf ,
|
|
mr->size_debugScoreInfoBuf );
|
|
// original scores buf
|
|
m_origScoreInfoBuf.safeMemcpy ( mr->ptr_origScoreInfoBuf ,
|
|
mr->size_origScoreInfoBuf );
|
|
// scan the QueryChanges
|
|
QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
|
|
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
|
|
for ( int32_t j = 0 ; j < ncs ; j++ ) {
|
|
QueryChange *qc = &qcs[j] ;
|
|
// this is relative to ptr_queryLogBuf
|
|
qc->m_replyQueryOffset += ongoingOffset;
|
|
// if we have debug score info
|
|
if ( m_seoDebug >= 2 ) {
|
|
if ( qc->m_debugScoreInfoOffset < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
if ( qc->m_origScoreInfoOffset < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
qc->m_debugScoreInfoOffset +=
|
|
ongoingDebugOffset;
|
|
qc->m_origScoreInfoOffset +=
|
|
ongoingOrigOffset;
|
|
}
|
|
// that's relative to the msg95reply's ptr_queruStrBuf
|
|
//QueryLogEntry *qe;
|
|
//qe = (QueryLogEntry *)(mr->ptr_queryLogBuf + qoff);
|
|
//qe = (QueryLogEntry *)(ref + qoff);
|
|
// HACK that in. RELATIVE to m_queryLogBuf!!!
|
|
//qc->m_queryOffset3 = ref;//(int32_t)qe;
|
|
// add ptr to our global buffer
|
|
hugePtrBuf.pushPtr ( qc );
|
|
}
|
|
// sum it up
|
|
ongoingOffset += mr->size_queryLogBuf;
|
|
ongoingDebugOffset += mr->size_debugScoreInfoBuf;
|
|
ongoingOrigOffset += mr->size_origScoreInfoBuf;
|
|
}
|
|
// sanity. make sure doesn't grow since we reference it
|
|
if ( m_queryLogBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now sort the huge ptr buffer to QueryChanges first by:
|
|
// 1: QueryChange::m_termHash64
|
|
// 2: QueryChange::m_queryHash32
|
|
// 3: QueryChange::m_insertPos
|
|
char *hhh = hugePtrBuf.getBufStart();
|
|
int32_t size = hugePtrBuf.length();
|
|
// this should breathe with niceness!!
|
|
gbqsort ( hhh ,
|
|
size/sizeof(QueryChange *),
|
|
sizeof(QueryChange *),
|
|
queryChangeCmp ,
|
|
m_niceness ) ;
|
|
|
|
// now store those sorted query changes into m_queryChangeBuf
|
|
// so we can cache them in store*IntoCached() easily
|
|
int32_t nqc = need / sizeof(QueryChange *);
|
|
if ( ! m_queryChangeBuf.reserve ( nqc * sizeof(QueryChange),"qcbuf") )
|
|
return false;
|
|
// for sanity check
|
|
char *orig2 = m_queryChangeBuf.getBufStart();
|
|
|
|
// copy over sorted into m_queryChangeBuf so we can cache it in cachedb
|
|
char *p = hhh;
|
|
char *pend = hhh + size;
|
|
for ( ; p < pend ; p += sizeof(QueryChange *) ) {
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)p;
|
|
// save ptr to it
|
|
char *ref = m_queryChangeBuf.getBuf();
|
|
// save it
|
|
m_queryChangeBuf.safeMemcpy ( qc , sizeof(QueryChange) );
|
|
// now ref that instead
|
|
*(QueryChange **)p = (QueryChange *)ref;
|
|
}
|
|
// sanity test
|
|
if ( m_queryChangeBuf.getBufStart() != orig2 ) { char *xx=NULL;*xx=0;}
|
|
|
|
// now we can free the replies since we stored the replies into
|
|
// m_queryLogBuf and m_queryChangeBuf for store*IntoCachedb()
|
|
for ( int32_t i = 0;i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg95ReplyPtrs[i] ) continue;
|
|
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
|
|
m_msg95ReplyPtrs[i] = NULL;
|
|
}
|
|
|
|
|
|
// . now set QueryChange::m_next to make our linked list
|
|
// . if it is for a different query or termhash then end the linked
|
|
// list by setting m_next to NULL
|
|
QueryChange *lastqc = NULL;
|
|
for ( p = hhh ; p < pend ; p += sizeof(QueryChange *) ) {
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)p;
|
|
// assume we are the last one in the linked list
|
|
qc->m_next = NULL;
|
|
// make linked list
|
|
if ( lastqc &&
|
|
// terms must match to be in same linked list
|
|
lastqc->m_termHash64 == qc->m_termHash64 )
|
|
// link them
|
|
lastqc->m_next = qc;
|
|
// set this for next qc
|
|
lastqc = qc;
|
|
}
|
|
|
|
// now set InsertableTerm::m_firstQueryChange to point to the head
|
|
// of the linked list for that term based on its m_termHash64.
|
|
// but the insertable terms are sorted by m_trafficSum.
|
|
// map a termHash64 to its corresponding first QueryChange.
|
|
HashTableX tit;
|
|
if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
|
|
if ( ! tit.set ( 8,4, ni*4,NULL,0,false,m_niceness,"tittbl") )
|
|
return false;
|
|
|
|
int64_t lastHash64 = 0LL;
|
|
// . store ptr to first querychange for each termhash64 into hash table
|
|
// . should be the head of the linked list for a termid
|
|
for ( p = hhh ; p < pend ; p += sizeof(QueryChange *) ) {
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)p;
|
|
// skip if not a new term hash
|
|
if ( qc->m_termHash64 == lastHash64 ) continue;
|
|
// update it
|
|
lastHash64 = qc->m_termHash64;
|
|
// . map it in the hash table then
|
|
// . it should be pre-allocated!
|
|
if (!tit.addKey(&qc->m_termHash64,&qc)){char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// now scan the insertable terms and set their
|
|
// InsertableTerm::m_firstQueryChange ptr. points to the head
|
|
// of the QueryChange linked list for this insertable term
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
p = itBuf->getBufStart();
|
|
pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// assume none
|
|
it->m_firstQueryChange = NULL;
|
|
char *val = (char *)tit.getValue(&it->m_termHash64);
|
|
// i guess there is none
|
|
if ( ! val ) continue;
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)val;
|
|
// and assign
|
|
it->m_firstQueryChange = qc;
|
|
}
|
|
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
|
|
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
|
|
|
|
// now set InsertableTerm::m_bestTrafficGain/m_bestInsertPos/
|
|
// m_bestQueryChange by scanning the linked list and scoring each
|
|
// QueryChange::m_insertPos to see which is the highest traffic gain.
|
|
// and in the case of ties prefer the lowest word position.
|
|
p = itBuf->getBufStart();
|
|
pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// . use this function now so seo.cpp can call it too!
|
|
// . sets WordPosInfo::m_trafficGain members
|
|
setWordPosInfosTrafficGain ( it );
|
|
// now find the insert position with the most traffic gain!
|
|
int32_t bestTrafficGain = -1;
|
|
int32_t bestInsertPos = -1;
|
|
for ( int32_t j = 0 ; j < nwpis ; j++ ) {
|
|
// skip if not the best scoring position
|
|
if ( wpis[j].m_trafficGain <= bestTrafficGain &&
|
|
// and if not first time!
|
|
bestInsertPos != -1 )
|
|
continue;
|
|
// we got a new winner
|
|
bestTrafficGain = wpis[j].m_trafficGain;
|
|
bestInsertPos = wpis[j].m_wordPos;//insertPos;
|
|
}
|
|
// set it
|
|
it->m_bestTrafficGain = bestTrafficGain;
|
|
it->m_bestInsertPos = bestInsertPos;
|
|
}
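// illustrative example (hypothetical numbers): if the WordPosInfo array
// ends up with m_trafficGain = { pos 5: +120, pos 34: +400, pos 61: +400 },
// the scan above sets bestTrafficGain = 400 and bestInsertPos = 34. a tie
// (<=) never replaces the current winner and the array is sorted by word
// position, so the earliest position among the top gains wins.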
|
|
|
|
return true;
|
|
}
|
|
|
|
void XmlDoc::setWordPosInfosTrafficGain ( InsertableTerm *it ) {
|
|
|
|
// get the wordposinfobuf!
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
|
|
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
|
|
|
|
// . use the wordposinfo array to accumulate traffic gains
|
|
// for each word position, WordPosInfo::m_insertPos.
|
|
// . TODO: ignore tags like gblangid:
|
|
// . so reset the traffic gains first
|
|
for ( int32_t j = 0 ; j < nwpis ; j++ )
|
|
wpis[j].m_trafficGain = 0;
|
|
|
|
|
|
if ( ! it ) return;
|
|
|
|
// head of the linked list of QueryChanges for this InsertableTerm
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
// skip if no list. leave traffic gains set to 0 for all
|
|
if ( ! qc ) return;
|
|
|
|
|
|
// accumulate traffic gains
|
|
int32_t k = 0;
|
|
int32_t lastQueryHash32 = 0;
|
|
//bool firstQueryChangeForQuery;
|
|
QueryChange *lastqc = NULL;
|
|
// . scan the linked list of query changes
|
|
// . this is sorted by query first then m_insertPos
|
|
for ( ; qc ; qc = qc->m_next ) {
|
|
// assume NOT the first QueryChange for this query
|
|
//firstQueryChangeForQuery = false;
|
|
// . reset stuff for each different query
|
|
// . QueryChanges are sorted by m_queryHash32 secondly
|
|
// and by m_insertPos thirdly now...
|
|
if ( qc->m_queryHash32 != lastQueryHash32 ) {
|
|
// reset our WordPosInfo cursor
|
|
k = 0;
|
|
// for detecting the next set of QueryChanges
|
|
// for a different query
|
|
lastQueryHash32 = qc->m_queryHash32;
|
|
//firstQueryChangeForQuery = true;
|
|
lastqc = NULL;
|
|
}
|
|
// sanity
|
|
if ( lastqc && lastqc->m_insertPos > qc->m_insertPos ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// compute the traffic in advance from the rank changes
|
|
int32_t trafficGain = getTrafficGain( qc );
|
|
// checkpoint
|
|
/*
|
|
if ( trafficGain > 0 )
|
|
log("got some traffic gain qh=%" UINT32 " "
|
|
"pos=%" INT32 " term=%s gain=%" INT32 "",
|
|
qc->m_queryHash32,
|
|
qc->m_insertPos,
|
|
it->m_termStr,
|
|
trafficGain);
|
|
*/
|
|
// get next query change
|
|
QueryChange *nqc = qc->m_next;
|
|
// make it NULL if for a different query
|
|
if ( nqc && nqc->m_queryHash32 != qc->m_queryHash32 )
|
|
nqc = NULL;
|
|
// . we use a compression where we only store a
|
|
// QueryChange if different than the last QueryChange
|
|
// . so advance the WordPosInfos cursor "k" until
|
|
// we catch up to the qc->m_insertPos.
|
|
for ( ; k < nwpis ; k++ ) {
|
|
// stop if we are caught up
|
|
if ( wpis[k].m_wordPos >= qc->m_insertPos )
|
|
break;
|
|
}
|
|
// now this position and up to next qc "nqc" gets the traffic
|
|
for ( ; k < nwpis ; k++ ) {
|
|
// stop if we are caught up
|
|
if ( nqc && wpis[k].m_wordPos >= nqc->m_insertPos )
|
|
break;
|
|
wpis[k].m_trafficGain += trafficGain;
|
|
}
|
|
}
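// illustrative example (hypothetical numbers): for one query, suppose the
// linked list has a QueryChange at m_insertPos=10 whose rank change works
// out to +200 clicks, and the next QueryChange for that same query is at
// m_insertPos=30. because identical consecutive results are not stored
// (the compression noted above), every WordPosInfo with m_wordPos in
// [10,30) gets +200 added, and accumulation resumes at the change at 30.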
|
|
|
|
/*
|
|
// print out positives - debug
|
|
for ( int32_t k = 0 ; k < nwpis ; k++ ) {
|
|
// stop if we are caught up
|
|
if ( ! wpis[k].m_trafficGain ) continue;
|
|
if ( wpis[k].m_trafficGain <= 0 ) continue;
|
|
// note it
|
|
log("seo: gain pos=%" INT32 " gain=%" INT32 "",
|
|
wpis[k].m_wordPos,
|
|
wpis[k].m_trafficGain);
|
|
}
|
|
*/
|
|
}
|
|
|
|
double getTrafficPercent ( int32_t rank ) {
|
|
// from aol's query logs from that same searchenginewatch.com url
|
|
static double s_posClicks[1000] = {
|
|
.4230, // #1
|
|
.1192,
|
|
.0844,
|
|
.0603,
|
|
.0486,
|
|
.0399,
|
|
.0337,
|
|
.0298,
|
|
.0283,
|
|
.0270 // #10 (was .297 but for our purposes, make it <)
|
|
};
|
|
|
|
//static float s_pageClicks[5];
|
|
|
|
// set total of clicks each page gets
|
|
static bool s_init = false;
|
|
if ( ! s_init ) {
|
|
s_init = true;
|
|
//float sum = 0.0;
|
|
//for ( int32_t i = 0 ; i < 10 ; i++ )
|
|
// sum += s_posClicks[i];
|
|
// this is about .11 or so
|
|
//float pageFactor = 1.0 - sum;
|
|
// HACK! make it pass the sanity check below!
|
|
//pageFactor *= .50;
|
|
// sanity. do not allow top result on 2nd page
|
|
// to rank higher!!
|
|
//if ( pageFactor * s_posClicks[0] > s_posClicks[9] ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// will be like .11 for second page, .01 for 3rd, etc.
|
|
//float pageMult = 1.0;
|
|
// fill in the rest
|
|
for ( int32_t i = 10 ; i < 1000 ; i++ ) {
|
|
// just make it linear past the first page since there is too much
// chaos in how our rankings differ from google's; a linear falloff
// is a reasonable estimate here...
|
|
s_posClicks[i] = .0270 - .0007 * i;
|
|
if ( s_posClicks[i] < 0 )
|
|
s_posClicks[i] = 0.0;
|
|
}
|
|
// sanity to make sure all in order
|
|
for ( int32_t i = 1 ; i < 1000 ; i++ ) {
|
|
if ( s_posClicks[i-1] < s_posClicks[i] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
if ( s_posClicks[i] < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
}
|
|
|
|
if ( rank >= 1000 ) rank = 999;
|
|
if ( rank < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
return s_posClicks[rank];
|
|
}
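// example values from the table and the linear tail above: rank 0 (the #1
// result) gets .4230 of the query's clicks, rank 4 gets .0486, rank 9 gets
// .0270, and past the first page rank i gets max(0, .0270 - .0007*i), so
// rank 20 gets .0130 and everything beyond roughly rank 38 gets 0.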
|
|
// . based on difference between m_oldRank and m_newRank
|
|
// . m_*Rank starts at 0 and goes to 9 for first page of results
|
|
int32_t XmlDoc::getTrafficGain ( QueryChange *qc ) {
|
|
|
|
// no rank change? note that both ranks can be -1 if this is a
// missing term that we are not actually inserting.
|
|
if ( qc->m_oldRank == qc->m_newRank ) return 0;
|
|
|
|
// get old clicks
|
|
int32_t oldRank = qc->m_oldRank;
|
|
double oldp;
|
|
// if not ranked before because this was inserting a brand new
|
|
// missing term, this will be -1
|
|
if ( oldRank == -1 ) oldp = 0.0;
|
|
else oldp = getTrafficPercent ( oldRank );
|
|
//if ( oldRank < 50 ) oldp = s_posClicks[oldRank];
|
|
|
|
// get new clicks
|
|
int32_t newRank = qc->m_newRank;
|
|
double newp = getTrafficPercent ( newRank );
|
|
//if ( newRank < 50 ) newp = s_posClicks[newRank];
|
|
|
|
// HACK
|
|
// we stored the entire querylogreply buf in here
|
|
char *ref = m_queryLogBuf.getBufStart();
|
|
// so we can use the replyqueryoffset then...
|
|
QueryLogEntry *qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
|
|
int32_t traffic = qe->m_gigablastTraffic;
|
|
traffic *= GB_TRAFFIC_MODIFIER;
|
|
|
|
int32_t trafficChange = (int32_t)((newp - oldp) * traffic);
|
|
|
|
// sanity.
|
|
if ( qc->m_oldRank > qc->m_newRank && trafficChange < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// ignore this sanity check if not ranked before. i.e. inserting
|
|
// a new missing term...
|
|
if ( qc->m_oldRank != -1 &&
|
|
qc->m_oldRank < qc->m_newRank && trafficChange > 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// return the change. it might be negative!
|
|
return trafficChange;
|
|
}
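// illustrative example (hypothetical traffic figure): for a query whose
// QueryLogEntry has m_gigablastTraffic = 1000, moving from oldRank 4
// (.0486 of clicks) to newRank 0 (.4230) yields roughly
//   (int32_t)((.4230 - .0486) * 1000 * GB_TRAFFIC_MODIFIER)
// i.e. about 374 * GB_TRAFFIC_MODIFIER extra clicks. the actual value of
// GB_TRAFFIC_MODIFIER is defined elsewhere and not assumed here.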
|
|
|
|
|
|
// 4. then we just dump out all the InsertedTerms into xml so they can be
|
|
// displayed on the front end.
|
|
|
|
// dump the list of InsertedTerms into "sbuf" as xml
|
|
bool XmlDoc::printScoredInsertableTerms ( SafeBuf *sbuf ) {
|
|
// print the header
|
|
sbuf->safePrintf("\t<insertableTerms>\n");
|
|
// scan each term
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
// has to be there
|
|
if ( ! itBuf || itBuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
|
|
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
|
|
|
|
// cast it
|
|
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
|
|
// how many terms do we have?
|
|
//int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
|
|
// dedup queries used in query changes
|
|
HashTableX qdups;
|
|
if ( ! qdups.set(4,0,32,NULL,0,false,m_niceness,"qddd") ) return false;
|
|
|
|
//
|
|
// . print query map
|
|
// . print all query ids we use and their strings
|
|
//
|
|
bool firstTime = true;
|
|
|
|
char *p = itBuf->getBufStart();
|
|
char *pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// scan its query changes
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
for ( ; qc ; qc = qc->m_next ) {
|
|
// skip if already printed
|
|
if ( qdups.isInTable(&qc->m_queryHash32) ) continue;
|
|
if ( firstTime ) {
|
|
sbuf->safePrintf("\t\t<queryMap>\n");
|
|
sbuf->safePrintf("\t\t\t<desc>"
|
|
"<![CDATA["
|
|
"32bitSignedQueryHash,"
|
|
"queryString"
|
|
"]]></desc>\n"
|
|
);
|
|
}
|
|
firstTime = false;
|
|
// HACK
|
|
char *ref = m_queryLogBuf.getBufStart();
|
|
QueryLogEntry *qe;
|
|
qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
|
|
// new query, print it. map the hash to the string
|
|
// so we can just show the hash when printing
|
|
// out all the QueryChanges below to save space
|
|
sbuf->safePrintf("\t\t\t<queryPoint>"
|
|
"<![CDATA[%" INT32 ",%s]]>"
|
|
"</queryPoint>\n"
|
|
, qc->m_queryHash32
|
|
// hack...
|
|
, qe->getQueryStr()
|
|
);
|
|
// do not re-print
|
|
if ( ! qdups.addKey(&qc->m_queryHash32) )return false;
|
|
}
|
|
}
|
|
if ( ! firstTime )
|
|
sbuf->safePrintf("\t\t</queryMap>\n");
|
|
|
|
// . now the word position map
|
|
// . we only provided querychange if it has a different score than
|
|
// the previously stored querychange. this is a kind of compression
|
|
// . so you need to know all the possible word positions we tried
|
|
// for each insertion we did
|
|
sbuf->safePrintf("\t\t<wordInsertionMap>\n");
|
|
sbuf->safePrintf("\t\t\t<desc>"
|
|
"<![CDATA["
|
|
"Describes all positions we attempt to insert each "
|
|
"insertable term into. The terms at that position "
|
|
"and up are pushed forward by the insertion. "
|
|
"<sent> is the sentence number."
|
|
"]]></desc>\n"
|
|
);
|
|
for ( int32_t i = 0 ; i < nwpis ; i++ ) {
|
|
WordPosInfo *wpi = &wpis[i];
|
|
sbuf->safePrintf("\t\t\t<word>\n"
|
|
"\t\t\t\t<pos>%" INT32 "</pos>\n"
|
|
"\t\t\t\t<sent>%" INT32 "</sent>\n"
|
|
"\t\t\t\t<hashGroup>%s</hashGroup>\n"
|
|
"\t\t\t\t<densityRank>%" INT32 "</densityRank>\n"
|
|
"\t\t\t\t<spamRank>%" INT32 "</spamRank>\n"
|
|
"\t\t\t</word>\n"
|
|
,wpi->m_wordPos
|
|
,wpi->m_sentNum
|
|
,getHashGroupString(wpi->m_hashGroup)
|
|
,(int32_t)wpi->m_densityRank
|
|
,(int32_t)wpi->m_wordSpamRank
|
|
);
|
|
|
|
}
|
|
sbuf->safePrintf("\t\t</wordInsertionMap>\n");
|
|
|
|
|
|
// scan all the insertable terms
|
|
p = itBuf->getBufStart();
|
|
pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// print the term
|
|
sbuf->safePrintf("\t\t<term>\n");
|
|
// the string
|
|
sbuf->safePrintf("\t\t\t<string><![CDATA[%s]]></string>\n",
|
|
it->getTerm());
|
|
// sum of traffic of all queries containing this term
|
|
sbuf->safePrintf("\t\t\t<importance>%" INT32 "</importance>\n",
|
|
it->m_trafficSum);
|
|
// is it contained in the doc/linktext or is it "related"
|
|
sbuf->safePrintf("\t\t\t<isRelatedTerm>%" INT32 "</isRelatedTerm>\n",
|
|
(int32_t)it->m_isRelatedTerm);
|
|
// get the first query change if any
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
// limit to fix firefox crash
|
|
//int32_t queryChangeLimit = 30;
|
|
// skip if no list
|
|
if ( ! qc ) goto skip;
|
|
// print the insert position that gives us the most traffic
|
|
sbuf->safePrintf("\t\t\t<bestInsertPosition>%" INT32 ""
|
|
"</bestInsertPosition>\n",
|
|
it->m_bestInsertPos);
|
|
sbuf->safePrintf("\t\t\t<bestTrafficGain>%" INT32 ""
|
|
"</bestTrafficGain>\n",
|
|
it->m_bestTrafficGain);
|
|
// print query changes
|
|
if ( it->m_firstQueryChange )
|
|
sbuf->safePrintf("\t\t\t<queryChanges><![CDATA["
|
|
);
|
|
// print out query changes for this term
|
|
for ( qc = it->m_firstQueryChange ; qc ; qc = qc->m_next ) {
|
|
// fix firefox crash for now
|
|
//if ( --queryChangeLimit <= 0 ) break;
|
|
// now store in binary
|
|
sbuf->pushLong(qc->m_insertPos);
|
|
sbuf->pushLong(qc->m_queryHash32);
|
|
sbuf->pushChar(qc->m_oldRank);
|
|
sbuf->pushChar(qc->m_newRank);
|
|
/*
|
|
// . TODO: make sure to remove QueryChanges that have
|
|
// the same old and new rank
|
|
// . print it
|
|
sbuf->safePrintf("\t\t\t<queryChange>\n");
|
|
sbuf->safePrintf("\t\t\t\t<insertPos>%" INT32 ""
|
|
"</insertPos>\n", qc->m_insertPos);
|
|
sbuf->safePrintf("\t\t\t\t<oldRank>%" INT32 ""
|
|
"</oldRank>\n",(int32_t)qc->m_oldRank);
|
|
sbuf->safePrintf("\t\t\t\t<newRank>%" INT32 ""
|
|
"</newRank>\n",(int32_t)qc->m_newRank);
|
|
sbuf->safePrintf("\t\t\t\t<queryId>%" INT32 ""
|
|
"</queryId>\n",
|
|
qc->m_queryHash32 );
|
|
sbuf->safePrintf("\t\t\t</queryChange>\n");
|
|
*/
|
|
}
|
|
if ( it->m_firstQueryChange )
|
|
sbuf->safePrintf("]]></queryChanges>\n");
|
|
|
|
skip:
|
|
// print the term end
|
|
sbuf->safePrintf("\t\t</term>\n");
|
|
}
|
|
sbuf->safePrintf("\t</insertableTerms>\n");
|
|
return true;
|
|
}
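// for reference, the xml emitted above has roughly this shape (element
// names taken from the safePrintf calls; contents abbreviated):
//
//   <insertableTerms>
//     <queryMap>
//       <desc><![CDATA[32bitSignedQueryHash,queryString]]></desc>
//       <queryPoint><![CDATA[-12345678,some query]]></queryPoint>
//     </queryMap>
//     <wordInsertionMap>
//       <desc>...</desc>
//       <word><pos>..</pos><sent>..</sent><hashGroup>..</hashGroup>
//             <densityRank>..</densityRank><spamRank>..</spamRank></word>
//     </wordInsertionMap>
//     <term>
//       <string><![CDATA[...]]></string>
//       <importance>..</importance>
//       <isRelatedTerm>..</isRelatedTerm>
//       <bestInsertPosition>..</bestInsertPosition>
//       <bestTrafficGain>..</bestTrafficGain>
//       <queryChanges><![CDATA[...binary records...]]></queryChanges>
//     </term>
//   </insertableTerms>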
|
|
|
|
/*
|
|
static int wordPosInfoCmp ( const void *a, const void *b ) {
|
|
WordPosInfo *wa = (WordPosInfo *)a;
|
|
WordPosInfo *wb = (WordPosInfo *)b;
|
|
// smallest word position should be at the head of the list
|
|
if ( wa->m_wordPos < wb->m_wordPos ) return -1;
|
|
if ( wa->m_wordPos > wb->m_wordPos ) return 1;
|
|
return 0;
|
|
}
|
|
*/
|
|
|
|
static int wpPosdbKeyCmp ( const void *a, const void *b ) {
|
|
int32_t wpa = g_posdb.getWordPos((char *)a);
|
|
int32_t wpb = g_posdb.getWordPos((char *)b);
|
|
return wpa - wpb;
|
|
}
|
|
|
|
SafeBuf *XmlDoc::getWordPosSortedPosdbListBuf ( ) {
|
|
|
|
if ( m_wpSortedPosdbListBufValid )
|
|
return &m_wpSortedPosdbListBuf;
|
|
|
|
// get the lists. forDelete = false.
|
|
char *metaList = getMetaList ( false );
|
|
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
|
|
|
|
// sanity
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make a tmp buf to hold posdb keys
|
|
//SafeBuf tmp;
|
|
if ( ! m_wpSortedPosdbListBuf.reserve ( m_metaListSize,"wpsbuf" ) )
|
|
return NULL;
|
|
// point into it
|
|
char *dst = m_wpSortedPosdbListBuf.getBufStart();
|
|
|
|
// scan the meta list for posdb keys
|
|
char *p = metaList;
|
|
char *pend = p + m_metaListSize;
|
|
// stole this loop from getMetaList()
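// each meta-list record parsed below is laid out as:
//   [1 byte : rdbId in the low 7 bits, plus a flag bit]
//   [key : getKeySizeFromRdbId(rdbId) bytes]
//   [4-byte dataSize : only present if the rdb has variable-size data]
//   [data : dataSize bytes]
// negative keys (low bit of key[0] clear) carry no data at all.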
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save it with the flag
|
|
char byte = *p;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// get key
|
|
char *key = p;
|
|
// skip that
|
|
p += ks;
|
|
// get datasize
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
// assume we do not store the datasize
|
|
//bool neg = false;
|
|
// . if key is negative, no data is present
|
|
// . the doledb key is negative for us here
|
|
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
|
|
// if datasize variable, read it in
|
|
if ( ds == -1 ) {
|
|
// get data size
|
|
ds = *(int32_t *)p;
|
|
// skip data size int32_t
|
|
p += 4;
|
|
}
|
|
// point to data
|
|
//char *data = p;
|
|
// skip data if not zero
|
|
p += ds;
|
|
// if not posdb skip rec
|
|
if ( rdbId != RDB_POSDB ) continue;
|
|
// skip negative keys
|
|
if ( (key[0] & 0x01) == 0x00 ) continue;
|
|
// add to new buf now
|
|
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
|
|
// advance
|
|
dst += sizeof(POSDBKEY);
|
|
}
|
|
char *start = m_wpSortedPosdbListBuf.getBufStart();
|
|
// update tmp
|
|
m_wpSortedPosdbListBuf.incrementLength ( dst - start );
|
|
// sanity
|
|
if ( m_wpSortedPosdbListBuf.length() > m_metaListSize ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// point
|
|
char *pbuf = m_wpSortedPosdbListBuf.getBufStart();
|
|
int32_t numKeys = m_wpSortedPosdbListBuf.length()/sizeof(POSDBKEY);
|
|
// sort keys by word position
|
|
gbqsort ( pbuf ,
|
|
numKeys,
|
|
sizeof(POSDBKEY),
|
|
wpPosdbKeyCmp ,
|
|
m_niceness );
|
|
|
|
m_wpSortedPosdbListBufValid = true;
|
|
return &m_wpSortedPosdbListBuf;
|
|
}
|
|
|
|
// now pass this into Msg95Request so we only try to insert right before
|
|
// or after m_wordPos values in this WordPosInfo vector.
|
|
SafeBuf *XmlDoc::getWordPosInfoBuf ( ) {
|
|
|
|
// if it is valid and we have not yet added to cachedb...
|
|
if ( m_wordPosInfoBufValid && ! m_triedToAddWordPosInfoToCachedb ) {
|
|
// only do this once
|
|
m_triedToAddWordPosInfoToCachedb = true;
|
|
// store the m_wordPosInfoBuf into cachedb
|
|
if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
|
|
return (SafeBuf *)-1;
|
|
}
|
|
|
|
|
|
if ( m_wordPosInfoBufValid )
|
|
return &m_wordPosInfoBuf;
|
|
|
|
// it should be valid now from our logic in hashWords3() if
|
|
// m_doingSEO is set to true
|
|
char *xx=NULL; *xx=0;
|
|
|
|
// these are FULL 18-byte keys, no compression, sorted by word pos
|
|
SafeBuf *posdbBuf = getWordPosSortedPosdbListBuf ();
|
|
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
|
|
|
|
// scan posdb keys
|
|
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
|
|
|
|
// . reserve mem for new buf
|
|
int32_t need = numKeys * sizeof(WordPosInfo);
|
|
if ( ! m_wordPosInfoBuf.reserve ( need ,"wpibuf" ) )
|
|
return NULL;
|
|
|
|
int32_t sentNum = 0;
|
|
int32_t lastWordPos = -1;
|
|
//int32_t lastwp = -1;
|
|
int32_t lastSentNum = -1;
|
|
|
|
// scan all the sorted posdb keys and build posdb termlists and
|
|
// store the termlists into "m_termListBuf"
|
|
char *p = posdbBuf->getBufStart();
|
|
char *pend = p + posdbBuf->length();
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the key
|
|
char *key = p;
|
|
// sanity
|
|
if ( g_posdb.getKeySize(p) != 18 ) { char *xx=NULL;*xx=0; }
|
|
// skip del keys
|
|
if ( (p[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
|
|
// skip it
|
|
p += sizeof(POSDBKEY);
|
|
// get key termid
|
|
//int64_t termId = g_posdb.getTermId ( key );
|
|
// sanity
|
|
//int64_t docId = g_posdb.getDocId ( key );
|
|
//if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
|
|
// log it for debug
|
|
//if ( docId == 192304365235LL )
|
|
// log("tlist: docid=%" INT64 " termId=%" INT64 " wpos=%" INT32 "",
|
|
// docId,
|
|
// termId,
|
|
// g_posdb.getWordPos(key));
|
|
WordPosInfo wpi;
|
|
int32_t wp = g_posdb.getWordPos(key);
|
|
// set "m_sentNum"
|
|
if ( wp >= lastWordPos + 50 ) sentNum++;
|
|
wpi.m_wordPos = wp;
|
|
wpi.m_sentNum = sentNum;
|
|
wpi.m_hashGroup = g_posdb.getHashGroup (key);
|
|
wpi.m_densityRank = g_posdb.getDensityRank (key);
|
|
wpi.m_wordSpamRank = g_posdb.getWordSpamRank (key);
|
|
wpi.m_trafficGain = 0;
|
|
// log it
|
|
/*
|
|
log("seopipe: term=%" INT64 " pos=%" INT32 " sent=%" INT32 " hg=%s dr=%" INT32 "",
|
|
g_posdb.getTermId(key),
|
|
(int32_t)wp,
|
|
sentNum,
|
|
getHashGroupString(wpi.m_hashGroup),
|
|
(int32_t)wpi.m_densityRank);
|
|
*/
|
|
// bigrams share the same word position as the single term.
|
|
// so ignore them. we only want unique insertion positions.
|
|
if ( wp == lastWordPos ) continue;
|
|
// . i thought sorted by word position??
|
|
// . word position 0 is used by generic terms, like tags
|
|
if ( wp < lastWordPos ) { char *xx=NULL;*xx=0; }
|
|
// additional position at the end of a sentence?
|
|
//if ( lastwp != wp && lastSentNum == sentNum )
|
|
// // store it
|
|
// m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
|
|
// to the right as well! so it can be in the same sentence, if this
// word is at the end of the sentence.
|
|
//wpi.m_wordPos = wp;// + 2;
|
|
// add it
|
|
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
|
|
|
|
int32_t nextSent = -1;
|
|
if ( p < pend ) {
|
|
// assume same as current sentence
|
|
nextSent = sentNum;
|
|
// get word position of next term
|
|
int32_t nextwp = g_posdb.getWordPos(p);
|
|
// same as us? then it is a bigram, so try the
|
|
// word after that!
|
|
if ( nextwp == wp && p+18<pend )
|
|
nextwp = g_posdb.getWordPos(p+18);
|
|
// if the following word position is in a new sentence
// it will be at least SENT_UNITS positions away. that is our
// basis for detecting a sentence boundary.
|
|
if ( nextwp >= wp + SENT_UNITS )
|
|
nextSent = sentNum+1;
|
|
}
|
|
|
|
// HACK. if next word starts a new sentence, add a WordPosInfo
|
|
// here so we can insert term at end of THIS sentence.
|
|
// otherwise we are inserted BEFORE the term whose position
|
|
// we use.
|
|
if ( nextSent != sentNum ) {
|
|
wpi.m_wordPos += 2;
|
|
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
|
|
}
|
|
|
|
// set these
|
|
lastWordPos = wp;
|
|
//lastwp = wp;// + 2;
|
|
lastSentNum = sentNum;
|
|
}
|
|
|
|
/*
|
|
// point to raw buf
|
|
char *raw = m_wordPosInfoBuf.getBufStart();
|
|
int32_t size = m_wordPosInfoBuf.length();
|
|
// this shit is sorted by termid then pos, so sort just by pos
|
|
// this should breath with niceness!!
|
|
gbqsort ( raw ,
|
|
size / sizeof(WordPosInfo),
|
|
sizeof(WordPosInfo) ,
|
|
wordPosInfoCmp ,
|
|
m_niceness ) ;
|
|
*/
|
|
|
|
m_wordPosInfoBufValid = true;
|
|
|
|
return &m_wordPosInfoBuf;
|
|
}
|
|
|
|
// . i made this easy to serialize by using offsets and not ptrs
|
|
// . so we can add to cachedb easily
|
|
// . and so it's immune to reallocs() on m_linkSourceBuf SafeBuf
|
|
class LinkSource {
|
|
public:
|
|
|
|
int32_t m_linkSiteRank;
|
|
|
|
// the actual url of the link, references into m_buf
|
|
int32_t m_linkUrlOffset;
|
|
// the title of the link, references into m_buf
|
|
int32_t m_linkTitleOffset;
|
|
|
|
// . we store the offsets of the RelatedDocIds in m_relatedDocIdBuf
|
|
// . these are the related docids that are linked to by this link src
|
|
int32_t m_offsetOfRelatedDocIdOffsets;
|
|
int32_t m_numRelatedDocIds;
|
|
|
|
char m_buf[0];
|
|
|
|
char *getLinkUrl ( SafeBuf *linkSourceBuf ) {
|
|
char *buf = linkSourceBuf->getBufStart();
|
|
buf += m_linkUrlOffset;
|
|
return buf;
|
|
};
|
|
|
|
char *getLinkTitle ( SafeBuf *linkSourceBuf ) {
|
|
char *buf = linkSourceBuf->getBufStart();
|
|
buf += m_linkTitleOffset;
|
|
return buf;
|
|
};
|
|
|
|
// crap, do we store RelatedDocIds into cachedb? we should
|
|
// make it use offsets and not ptrs too...
|
|
int32_t *getRelatedDocIdOffsets ( SafeBuf *linkSourceBuf ) {
|
|
// how can this be?
|
|
//if ( m_numRelatedDocIds == 0 ) return NULL;
|
|
char *buf = linkSourceBuf->getBufStart();
|
|
buf += m_offsetOfRelatedDocIdOffsets;
|
|
return (int32_t *)buf;
|
|
};
|
|
|
|
};
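/*
// a minimal sketch (hypothetical helper, illustration only) of how a
// LinkSource and its strings would be appended to a SafeBuf using offsets
// rather than pointers, per the rationale above. the real construction
// code lives elsewhere; this just shows why the layout survives reallocs
// and can be copied into cachedb verbatim:
static bool appendLinkSource ( SafeBuf *buf ,
                               int32_t siteRank ,
                               char *url , int32_t urlLen ,
                               char *title , int32_t titleLen ) {
        LinkSource ls;
        ls.m_linkSiteRank = siteRank;
        // strings follow the fixed-size header, so record their offsets
        // relative to the start of the SafeBuf, never raw pointers
        ls.m_linkUrlOffset   = buf->length() + sizeof(LinkSource);
        ls.m_linkTitleOffset = ls.m_linkUrlOffset + urlLen + 1;
        ls.m_offsetOfRelatedDocIdOffsets = -1;
        ls.m_numRelatedDocIds = 0;
        if ( ! buf->safeMemcpy ( &ls , sizeof(LinkSource) ) ) return false;
        if ( ! buf->safeMemcpy ( url , urlLen ) ) return false;
        if ( ! buf->pushChar ( '\0' ) ) return false;
        if ( ! buf->safeMemcpy ( title , titleLen ) ) return false;
        if ( ! buf->pushChar ( '\0' ) ) return false;
        return true;
}
*/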
|
|
|
|
|
|
/*
|
|
static void gotLinkInfoReplyWrapper ( void *state ) {
|
|
//XmlDoc *newxd = (XmlDoc *)state;
|
|
Msg25 *msg25 = (Msg25 *)state;
|
|
XmlDoc *xd = msg25->m_xd;
|
|
// count it as returned
|
|
xd->m_numLinkRequestsIn++;
|
|
// this will nuke the msg25 as well after copying its linkinfo
|
|
xd->processLinkInfoMsg20Reply ( msg25 );
|
|
// try to send out more requests or intersect them if done
|
|
xd->m_masterLoop ( xd->m_masterState );
|
|
}
|
|
|
|
// . before we were just looking at the LinkInfo the msg25 makes from
|
|
// all the Msg20Replies it gets, but let's keep the msg20 replies
|
|
// intact because they have the titles we need!
|
|
// . return false on error, true otherwise
|
|
bool XmlDoc::processLinkInfoMsg20Reply ( Msg25 *msg25 ) {
|
|
// shortcut
|
|
//LinkInfo *info = msg25->getLinkInfo ();
|
|
// store into our buffer
|
|
//bool status ;
|
|
// i guess info can be NULL on error
|
|
//if ( info )
|
|
// status = m_linkInfoReplyBuf.safeMemcpy (info, info->getSize());
|
|
|
|
// give front-end the progress bar info
|
|
if ( m_seoSocket && m_progressBar ) {
|
|
// tmp buf
|
|
char tmp[16];
|
|
float percent = (float)m_rdCursor;
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
percent /= (float)numRelated;
|
|
// 80% of the pipeline was doing the full queries
|
|
percent *= .20;
|
|
percent += .80;
|
|
percent *= 100.0;
|
|
int32_t percentLong = (int32_t)percent;
|
|
if ( percentLong >= 100 ) percentLong = 99;
|
|
int32_t tmpLen = sprintf(tmp,"%02" INT32 "%%",percentLong);
|
|
if ( tmpLen !=3)log("seo: bad progress bar output %" INT32 "",tmpLen);
|
|
// try a send on non-blocking socket
|
|
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
|
|
if ( n != tmpLen ) log("seo: bad progress bar send %" INT32 "",n);
|
|
// forget error
|
|
errno = 0;
|
|
}
|
|
|
|
// store this
|
|
int32_t nr = msg25->m_numReplyPtrs;
|
|
// reserve space
|
|
if ( ! m_msg20ReplyPtrBuf.reserve ( 8 + nr * 4 * 2 ) ) {
|
|
m_hadLinkInfoError = g_errno;
|
|
nr = 0;
|
|
}
|
|
// first store related docid ptr into m_relatedDocIdBuf safebuf
|
|
RelatedDocId *rd = (RelatedDocId *)msg25->m_hackrd;
|
|
m_msg20ReplyPtrBuf.pushLong((int32_t)rd);
|
|
// then store the # of msg20 replies
|
|
m_msg20ReplyPtrBuf.pushLong(nr);
|
|
// . scan each msg20reply it got, each msg20reply is an inlink
|
|
// for this docid
|
|
// . seems like they are only freed in Msg25::reset()
|
|
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
|
// get one
|
|
Msg20Reply *r = msg25->m_replyPtrs[i];
|
|
int32_t size = msg25->m_replySizes[i];
|
|
// steal it, we will free them ourselves below
|
|
m_msg20ReplyPtrBuf.pushLong((int32_t)r);
|
|
// we need this since we need to free it when done
|
|
m_msg20ReplyPtrBuf.pushLong(size);
|
|
}
|
|
// . do not allow Msg25 to free it, we will free it below
|
|
// . on OOM error above we set nr to 0 on error, so allow msg25
|
|
// to free the replies in that case
|
|
if ( nr ) msg25->m_numReplyPtrs = 0;
|
|
// nuke it
|
|
mdelete ( msg25 , sizeof(Msg25), "m25li" );
|
|
delete ( msg25 );
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
static int riCmp ( const void *a, const void *b ) {
|
|
RecommendedLink *wa = *(RecommendedLink **)a;
|
|
RecommendedLink *wb = *(RecommendedLink **)b;
|
|
int32_t diff = wb->m_votes - wa->m_votes;
|
|
if ( diff ) return diff;
|
|
if ( wb->m_totalRecommendedScore > wa->m_totalRecommendedScore )
|
|
return 1;
|
|
if ( wb->m_totalRecommendedScore < wa->m_totalRecommendedScore )
|
|
return -1;
|
|
// docid to break all ties
|
|
if ( wb->m_rl_docId > wa->m_rl_docId )
|
|
return 1;
|
|
if ( wb->m_rl_docId < wa->m_rl_docId )
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void gotLinkdbListWrapper ( void *state ) {
|
|
Msg0 *msg0 = (Msg0 *)state;
|
|
XmlDoc *xd = msg0->m_hackxd;
|
|
// free its memory here lest we have a leak
|
|
//msg0->reset();
|
|
xd->m_numLinkRequestsIn++;
|
|
xd->m_masterLoop ( xd->m_masterState );
|
|
}
|
|
|
|
|
|
#define MAX_RECOMMENDED_LINKS 300
|
|
|
|
// . returns safebuf of RecommendedLinks
|
|
// . use RecommendedLink::getSize() to skip over element in array/safebuf
|
|
// . these are the recommended link sources
|
|
// . these are the links that your relateddocids (i.e. competing pages) have
|
|
// in common the most
|
|
// . TODO: store the returned safebuf in cachedb as well!
|
|
SafeBuf *XmlDoc::getRecommendedLinksBuf ( ) {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_recommendedLinksBufValid )
|
|
return &m_recommendedLinksBuf;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// what docids share our matching queries?
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
// cast then
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
// how many related docids do we have?
|
|
int32_t numRelatedDocIds = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
if ( m_numLinkRequestsOut == 0 ) {
|
|
// reset these on first call
|
|
m_rdCursor = 0;
|
|
m_numLinkRequestsIn = 0;
|
|
m_hadLinkInfoError = 0;
|
|
m_numMsg20sIn = 0;
|
|
m_numMsg20sOut = 0;
|
|
m_numValidMsg20s = 0;
|
|
m_titleCursor = 0;
|
|
m_msg20Phase = 0;
|
|
m_recommendedLinkError = 0;
|
|
}
|
|
|
|
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if we are looking up the title/url of each docid in
|
|
// the m_recommendedLinksBuf now, go back there
|
|
if ( m_msg20Phase )
|
|
return lookupTitles();
|
|
|
|
for ( ; m_rdCursor < numRelatedDocIds ; m_rdCursor++ ) {
|
|
// wait if too many requests are outstanding. the check below caps how
// many of these 1MB linkdb lists we can have sitting in memory at once
|
|
if ( m_numLinkRequestsOut - m_numLinkRequestsIn > 60 )
|
|
// wait for 1 to come back
|
|
return (SafeBuf *)-1;
|
|
// skip the rest on error
|
|
if ( m_hadLinkInfoError ) continue;
|
|
// cast it
|
|
RelatedDocId *rd = &rds[m_rdCursor];
|
|
// bogus? a not found, EDOCBANNED/EDOCFILTERED or it
|
|
// linked to our domain
|
|
if ( rd->rd_url_off < 0 )
|
|
continue;
|
|
// bogus?
|
|
if ( ! rd->getUrl( &m_relatedTitleBuf ) ) {
|
|
log("seo: skipping null url");
|
|
continue;
|
|
}
|
|
if ( ! rd->getSite( &m_relatedTitleBuf ) ) {
|
|
log("seo: skipping null site");
|
|
continue;
|
|
}
|
|
|
|
// allocate msg0 array into m_tmpMsg0Buf safebuf
|
|
if ( ! m_tmpMsg0Buf.length() ) {
|
|
// fill tmpmsg0 buf
|
|
int32_t need = sizeof(Msg0) * numRelatedDocIds;
|
|
if ( ! m_tmpMsg0Buf.reserve ( need , "tmp20s" ) )
|
|
return NULL;
|
|
// do not re-call!
|
|
m_tmpMsg0Buf.setLength(need);
|
|
char *p = m_tmpMsg0Buf.getBufStart();
|
|
char *pend = p + need;
|
|
for ( ; p < pend ; p += sizeof(Msg0) ) {
|
|
Msg0 *msg0 = (Msg0 *)p;
|
|
msg0->constructor();
|
|
}
|
|
}
|
|
|
|
// debug it
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: getting inlinks to related docid=%" INT64 " "
|
|
"weight=%f "
|
|
"url=%s",
|
|
rd->m_docId,
|
|
rd->m_relatedWeight,
|
|
rd->getUrl(&m_relatedTitleBuf));
|
|
|
|
// just get his linkdb list!
|
|
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
|
|
Msg0 *msg0 = &array[m_rdCursor];
|
|
key224_t startKey;
|
|
key224_t endKey;
|
|
char *rdurl = rd->getUrl(&m_relatedTitleBuf);
|
|
// by default, just hash of hostname, unless overridden
|
|
// with "site" tag in tagdb, or has a path like /~mwells
|
|
int32_t siteHash32 = rd->m_rd_siteHash32;
|
|
int64_t linkHash64 = hash64n(rdurl);
|
|
startKey = g_linkdb.makeStartKey_uk (siteHash32,linkHash64 );
|
|
endKey = g_linkdb.makeEndKey_uk (siteHash32,linkHash64 );
|
|
|
|
// hack that thing
|
|
msg0->m_hackxd = this;
|
|
|
|
// consider it outstanding
|
|
m_numLinkRequestsOut++;
|
|
|
|
// shortcut, piggyback on the msg0
|
|
RdbList *list = &msg0->m_handyList;
|
|
//RdbList list2;
|
|
|
|
if ( ! msg0->getList ( -1 , // hostId, -1 if none
|
|
0 , // hostId ip
|
|
0 , // hostId port
|
|
0 , // max cache age -secs
|
|
false , // addToCache?
|
|
RDB_LINKDB ,
|
|
cr->m_collnum ,
|
|
list , // linkdb list to fill
|
|
(char*)&startKey,
|
|
(char*)&endKey ,
|
|
1000000 , // 1MB minrecsizes
|
|
msg0 ,
|
|
gotLinkdbListWrapper ,
|
|
m_niceness ,
|
|
true , // error correct?
|
|
true , // includeTree
|
|
true , // do merge
|
|
-1,//hostId
|
|
0 , // startFileNum
|
|
-1 , // numFiles
|
|
60*60*24*365 )){//timeout of one year
|
|
// blocked? keep chugging
|
|
continue;
|
|
}
|
|
|
|
// . maybe it was cached or something, or we had an error!
|
|
// . this will nuke the msg25
|
|
// . returns false and sets g_errno on error
|
|
//processLinkInfoMsg20Reply ( msg25 );
|
|
m_numLinkRequestsIn++;
|
|
// save g_errno
|
|
int32_t saved = g_errno;
|
|
// free its memory here lest we have a leak
|
|
//msg0->reset();
|
|
// error? it will not have blocked then
|
|
if ( ! saved ) continue;
|
|
// save error, and stop launching any more requests
|
|
m_hadLinkInfoError = saved;
|
|
log("xmldoc: linksrc error3 = %s",mstrerror(saved));
|
|
}
|
|
|
|
// return -1 if waiting for more requests to come in
|
|
if ( m_numLinkRequestsOut > m_numLinkRequestsIn )
|
|
return (SafeBuf *)-1;
|
|
|
|
// vote table to allow inlink voting
|
|
HashTableX riTable;
|
|
// do not return on error setting this table because we'd leave
// the linkdb lists (Msg0::m_handyList) unfreed!
|
|
if ( ! riTable.set ( 8,4,1024,NULL,0,false,m_niceness,"ritbl") )
|
|
m_hadLinkInfoError = g_errno;
|
|
|
|
RecommendedLink *ri;
|
|
|
|
HashTableX dedupVotesTable;
|
|
if ( ! dedupVotesTable.set(8,0,1024,NULL,0,false,m_niceness,"dvtt") )
|
|
return NULL;
|
|
|
|
// need this for computing rdOff
|
|
char *rdStart = m_relatedDocIdBuf.getBufStart();
|
|
|
|
// store recommended links bufs here temporarily
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( 10000000 ,"tt5buf" ) ) return NULL;
|
|
|
|
// all done. scan linkdb lists and intersect. there is one list
|
|
// per related docid.
|
|
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
|
|
// get related docid that had the following msg20replies
|
|
RelatedDocId *rd = &rds[i];
|
|
// his offset in his buf
|
|
int32_t rdOff = (char *)rd - rdStart;
|
|
// get linkdb list loaded from msg0 call above
|
|
Msg0 *msg0 = &((Msg0 *)m_tmpMsg0Buf.getBufStart())[i];
|
|
RdbList *list = &msg0->m_handyList;
|
|
list->resetListPtr();
|
|
// scan the docids in list
|
|
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
|
|
// get the current key if list has more left
|
|
key224_t key;
|
|
list->getCurrentKey( &key );
|
|
//int32_t itop = g_linkdb.getLinkerIp24_uk ( &key );
|
|
int32_t ip32 = g_linkdb.getLinkerIp_uk ( &key );
|
|
//bool isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
|
|
int64_t docId = g_linkdb.getLinkerDocId_uk ( &key );
|
|
//int32_t discovered = g_linkdb.getDiscoveryDate_uk(&key);
|
|
|
|
// skip if no longer there on page, we keep these
|
|
// only to graph lost links over time
|
|
int32_t lostDate = g_linkdb.getLostDate_uk ( &key );
|
|
if ( lostDate )
|
|
continue;
|
|
|
|
// if the inlink is from the same c-block IP as the
|
|
// related docid it links to, then do not consider.
|
|
// the ip used in linkdb is the current ip not the
|
|
// first ip actually.
|
|
if ( ipdom(ip32)==ipdom(rd->m_relatedCurrentIp))
|
|
continue;
|
|
if ( ipdom(ip32)==ipdom(rd->m_relatedFirstIp))
|
|
continue;
|
|
// if the linking document links to the same related
|
|
// docid multiple times, we need to dedup so m_votes
|
|
// is not incremented multiple times!
|
|
// actually make it use c-block not docid to fix
|
|
// links/pages getting two m_votes for linking to
|
|
// two competitors, where each competitor linked to
|
|
// is on the same c-block... kinda strange.
|
|
int64_t dkey = docId ^ ipdom(rd->m_relatedFirstIp);
|
|
if ( dedupVotesTable.isInTable(&dkey) )
|
|
continue;
|
|
if ( ! dedupVotesTable.addKey(&dkey) ) return NULL;
|
|
|
|
// now we associate a new class with each unique linker
|
|
int32_t *poff = (int32_t *)riTable.getValue ( &docId );
|
|
// if there, it will be an offset into the links buf
|
|
if ( poff ) {
|
|
char *ptr = tmpBuf.getBufStart();
|
|
ptr += *poff;
|
|
RecommendedLink *rip = (RecommendedLink *)ptr;
|
|
rip->m_totalRecommendedScore +=
|
|
rd->m_relatedWeight;
|
|
rip->m_votes++;
|
|
// add to array of rd offs
|
|
int32_t k; for ( k = 0 ; k < 10 ; k++ ) {
|
|
if ( rip->m_relatedDocIdOff[k]==-1)
|
|
break;
|
|
}
|
|
if ( k < 10 )
|
|
rip->m_relatedDocIdOff[k] = rdOff;
|
|
continue;
|
|
}
|
|
|
|
// reserve space
|
|
int32_t need = sizeof(RecommendedLink);
|
|
// reserve
|
|
if ( ! tmpBuf.reserve ( need , "tt5buf" ) ) {
|
|
m_hadLinkInfoError = g_errno;
|
|
continue;
|
|
}
|
|
|
|
// save this
|
|
int32_t firstOff = tmpBuf.length();
|
|
|
|
// ref it
|
|
char *buf = tmpBuf.getBuf();
|
|
ri = (RecommendedLink *)buf;
|
|
// advance over that
|
|
int32_t over = sizeof(RecommendedLink);
|
|
// increase buf length
|
|
tmpBuf.incrementLength(over);
|
|
|
|
// this is how similar the relatedDocId is to the
|
|
// main url. these dotproducts are all relative
|
|
// with the other relatedDocIds for this url.
|
|
// the dotproduct was basically a dotproduct
|
|
// of the score vector of "rd" with that of
|
|
// the main url for the same queries. and that
|
|
// was normalized by the score of the top result
|
|
// for each query they have in common. see
// the algo above for the "m_dotProduct" computation.
|
|
ri->m_totalRecommendedScore = rd->m_relatedWeight;
|
|
ri->m_votes = 1;
|
|
|
|
ri->m_rl_docId = docId;
|
|
|
|
// we do not know these things until we call msg20
|
|
// on the docid:
|
|
ri->m_rl_siteRank = -1;//reply->m_siteRank;
|
|
ri->m_rl_firstIp = 0;//reply->m_firstIp;
|
|
|
|
// each recommended link links to one or more
|
|
// related docids. so record them!
|
|
ri->m_relatedDocIdOff[0] = rdOff;
|
|
ri->m_relatedDocIdOff[1] = -1;
|
|
ri->m_relatedDocIdOff[2] = -1;
|
|
ri->m_relatedDocIdOff[3] = -1;
|
|
ri->m_relatedDocIdOff[4] = -1;
|
|
ri->m_relatedDocIdOff[5] = -1;
|
|
ri->m_relatedDocIdOff[6] = -1;
|
|
ri->m_relatedDocIdOff[7] = -1;
|
|
ri->m_relatedDocIdOff[8] = -1;
|
|
ri->m_relatedDocIdOff[9] = -1;
|
|
|
|
ri->m_urlSize = 0;
|
|
ri->m_titleSize = 0;
|
|
|
|
// store it in table then, pointing into the new buf
|
|
if ( ! riTable.addKey ( &docId, &firstOff ) )
|
|
m_hadLinkInfoError = g_errno;
|
|
}
|
|
// free that list now to save mem
|
|
list->freeList();
|
|
}
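// illustrative example (hypothetical numbers): if related docids A (weight
// 0.8) and B (weight 0.5) are both competitors of the main url, and linker
// docid D shows up in both of their linkdb lists from an unrelated IP
// c-block, then D's RecommendedLink ends up with m_votes = 2 and
// m_totalRecommendedScore = 1.3, and both A's and B's offsets are recorded
// in its m_relatedDocIdOff[] array.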
|
|
|
|
// free the msg0s now, including Msg0::m_handyList, what we used
|
|
// to hold the linkdb list
|
|
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
|
|
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
|
|
Msg0 *msg0 = &array[i];
|
|
// free the mem and the handylist now that we've processed them
|
|
msg0->reset();
|
|
}
|
|
// no longer need the msg0s and linkdb lists (Msg0::m_handyLists)
|
|
m_tmpMsg0Buf.purge();
|
|
|
|
|
|
//
|
|
// now sort RecommendedLinks in tmpBuf by their scores
|
|
//
|
|
// get the top 300 recommended links so we can save mem and
|
|
// store this beastie in cachedb
|
|
SafeBuf ptrBuf;
|
|
int32_t maxNumPtrs = tmpBuf.length() / sizeof(RecommendedLink);
|
|
if ( ! ptrBuf.reserve(maxNumPtrs *sizeof(RecommendedLink *),"ptrbuf"))
|
|
return NULL;
|
|
char *p = tmpBuf.getBufStart();
|
|
char *pend = tmpBuf.getBuf();
|
|
int32_t numPtrs = 0;
|
|
for ( ; p < pend ; ) {
|
|
RecommendedLink *ri = (RecommendedLink *)p;
|
|
ptrBuf.pushPtr ( ri );
|
|
p += sizeof(RecommendedLink);
|
|
// we have no title or url at this point...
|
|
if ( ri->getSize() != sizeof(RecommendedLink) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
numPtrs++;
|
|
}
|
|
// now sort!
|
|
RecommendedLink **ptrs = (RecommendedLink **)ptrBuf.getBufStart();
|
|
gbqsort ( ptrs ,
|
|
numPtrs ,
|
|
sizeof(RecommendedLink *),
|
|
riCmp,
|
|
m_niceness );
|
|
// copy over the top recommended links into permanent buffer in order
|
|
// of score
|
|
// only the fixed-size RecommendedLink headers are stored here for now;
// urls/titles get appended later when lookupTitles() runs its msg20s
int32_t need2 = numPtrs * sizeof(RecommendedLink);
|
|
// allocate that now
|
|
if ( ! m_recommendedLinksBuf.reserve ( need2 ,"rlkbuf") ) return NULL;
|
|
// and copy over from tmpBuf, sorted by the score
|
|
for ( int32_t i = 0 ; i < numPtrs ; i++ )
|
|
m_recommendedLinksBuf.safeMemcpy(ptrs[i],
|
|
sizeof(RecommendedLink));
|
|
// this can be really huge! > 30MB
|
|
tmpBuf.purge();
|
|
// free the ptrs too!
|
|
ptrBuf.purge();
|
|
|
|
|
|
//
|
|
// now m_recommendedLinksBuf is a bunch of RecommendedLinks sorted
|
|
// by score. now use msg20 to lookup the top 300 or so that
|
|
// do not link to our main doc
|
|
//
|
|
m_msg20Phase = true;
|
|
return lookupTitles ();
|
|
}
|
|
|
|
//static void gotLinkerTitleWrapper ( void *state ) {
|
|
// Msg20 *msg20 = (Msg20 *)state;
|
|
// XmlDoc *THIS = (XmlDoc *)msg20->m_state2;
|
|
// THIS->gotLinkerTitle ( msg20 );
|
|
// THIS->m_masterLoop ( THIS->m_masterState );
|
|
//}
|
|
|
|
SafeBuf *XmlDoc::lookupTitles ( ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// none have a title/url following them in here yet
|
|
int32_t numLinkers = m_recommendedLinksBuf.length();
|
|
numLinkers /= sizeof(RecommendedLink);
|
|
|
|
if ( ! m_msg20Array.length() ) {
|
|
int32_t need = numLinkers * sizeof(Msg20);
|
|
if ( ! m_msg20Array.reserve ( need,"m20arr" ) )
|
|
return (SafeBuf *)-1;
|
|
// do not re-call!
|
|
m_msg20Array.setLength(need);
|
|
char *p = m_msg20Array.getBufStart();
|
|
char *pend = p + need;
|
|
for ( ; p < pend ; p += sizeof(Msg20) )
|
|
((Msg20 *)p)->constructor();
|
|
}
|
|
|
|
Msg20 *msg20s = (Msg20 *)m_msg20Array.getBufStart();
|
|
// one per linker
|
|
int32_t numMsg20s = numLinkers;
|
|
|
|
// we can use the array model because each element is fixed size
|
|
// because they do not have the url/title string following them
|
|
// yet...
|
|
char *ppp = m_recommendedLinksBuf.getBufStart();
|
|
RecommendedLink *ptr = (RecommendedLink *)ppp;
|
|
|
|
// scan the msg20s we allocated to see if any got a reply
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// shortcut
|
|
Msg20 *msg20 = &msg20s[i];
|
|
// skip if never launched
|
|
if ( ! msg20->m_launched ) continue;
|
|
// skip if it is in progress, awaiting its reply
|
|
if ( msg20->m_inProgress ) continue;
|
|
// ok, it has a reply. could be NULL if g_errno was set.
|
|
if ( ! gotLinkerTitle ( msg20 ) )
|
|
m_recommendedLinkError = g_errno;
|
|
// reset it for later use... or not...
|
|
msg20->reset();
|
|
}
|
|
|
|
//
|
|
// call a msg20 on each recommendedlink to get url/title and
|
|
// see if it links to any url on our main url's site/domain
|
|
//
|
|
for ( ; m_titleCursor < numLinkers ; m_titleCursor++ ) {
|
|
// bail?
|
|
if ( m_numMsg20sOut - m_numMsg20sIn > 60 )
|
|
break;
|
|
// stop launching if got enough
|
|
if ( m_numValidMsg20s >= MAX_RECOMMENDED_LINKS )
|
|
break;
|
|
// cast it
|
|
RecommendedLink *rl = &ptr[m_titleCursor];
|
|
|
|
// get avail msg20
|
|
int32_t i; for ( i = 0 ; i < 100 ; i++ ) {
|
|
if ( msg20s[i].m_inProgress ) continue;
|
|
break;
|
|
}
|
|
// sanity!
|
|
if ( i >= 100 ) { char *xx=NULL;*xx=0; }
|
|
// look it up
|
|
Msg20 *msg20 = &msg20s[i];
|
|
// make request
|
|
Msg20Request req;
|
|
req.m_docId = rl->m_rl_docId;
|
|
//req.m_state = msg20;
|
|
req.m_state = m_masterState;//this;
|
|
req.m_callback2 = m_masterLoop;//gotLinkerTitleWrapper;
|
|
//req.ptr_coll = cr->m_coll;
|
|
//req.size_coll = gbstrlen(cr->m_coll)+1;
|
|
req.m_collnum = cr->m_collnum;
|
|
req.m_expected = true;
|
|
req.m_niceness = m_niceness;
|
|
// do not get summary stuff. too slow.
|
|
req.m_numSummaryLines = 0;
|
|
// if it has an outlink to our site/domain set
|
|
// Msg20Reply::m_hasLinkToOurDomOrHost
|
|
req.m_ourHostHash32 = getHostHash32a();
|
|
req.m_ourDomHash32 = getDomHash32();
|
|
|
|
// store cursor in msg20 itself so we know what rd it's using
|
|
msg20->m_hack2 = m_titleCursor;
|
|
|
|
// assume outstanding
|
|
m_numMsg20sOut++;
|
|
// debug
|
|
//log("seo: DEBUG: launching msg20 d=%" INT64 "",req.m_docId);
|
|
// get it. continue if blocked
|
|
if ( ! msg20->getSummary ( &req ) ) continue;
|
|
// error?
|
|
if ( ! gotLinkerTitle ( msg20 ) )
|
|
m_recommendedLinkError = g_errno;
|
|
// save mem
|
|
msg20->reset();
|
|
}
|
|
|
|
// wait for all to return?
|
|
if ( m_numMsg20sOut > m_numMsg20sIn )
|
|
return (SafeBuf *)-1;
|
|
|
|
|
|
// we called gotLinkerTitle() on all msg20s, so destroy them
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// shortcut
|
|
Msg20 *msg20 = &msg20s[i];
|
|
// free
|
|
msg20->destructor();
|
|
}
|
|
// and free the lot of them
|
|
m_msg20Array.purge();
|
|
|
|
|
|
// now swap in the rebuilt buffer that has the urls/titles appended
|
|
m_recommendedLinksBuf.stealBuf ( &m_newLinkerBuf );
|
|
|
|
// . this is an array of Inlinks
|
|
// . shit, but we need to add a count of how many related docids
|
|
// had the inlink, and what the weight or score of it was
|
|
// . it should be based on the weights/scores of the related docids
|
|
// . maybe just hijack "Inlink::m_numUniqueIPs" or something
|
|
// . crap, we also need to store the RelatedDocIds, i guess we
|
|
// could store a list of offsets to them in m_relatedDocIdBuf
|
|
m_recommendedLinksBufValid = true;
|
|
|
|
// store in cachedb. if it blocks return -1. bufvalid is set to
|
|
// true so when this function is re-entered it should return
|
|
// the safebuf ptr right away.
|
|
if ( ! storeRecommendedLinksBuf () )
|
|
return (SafeBuf *)-1;
|
|
|
|
return &m_recommendedLinksBuf;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::gotLinkerTitle ( Msg20 *msg20 ) {
|
|
// count it as returned
|
|
m_numMsg20sIn++;
|
|
|
|
// debug
|
|
//log("seo: DEBUG: got msg20 reply");
|
|
|
|
// get the recommendedlink for this (titleCursor)
|
|
char *vvv = m_recommendedLinksBuf.getBufStart();
|
|
RecommendedLink *rptrs = (RecommendedLink *)vvv;
|
|
int32_t titleCursor = msg20->m_hack2;
|
|
RecommendedLink *rl = &rptrs[titleCursor];
|
|
// sanity
|
|
if ( titleCursor < 0 ) {char *xx=NULL;*xx=0;}
|
|
|
|
// not found?
|
|
if ( g_errno ) {
|
|
log("seo: lookuptitles: %s",mstrerror(g_errno));
|
|
// ignore
|
|
g_errno = 0;
|
|
return true;
|
|
}
|
|
// get reply
|
|
Msg20Reply *reply = msg20->getReply();
|
|
// skip if linked to our site!
|
|
if ( reply->m_hasLinkToOurDomOrHost ) {
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: inlinker %s links to our "
|
|
"domain. ignoring.",
|
|
reply->ptr_ubuf);
|
|
return true;
|
|
}
|
|
// or if banned/filtered.. then skip
|
|
if ( reply->m_errno ) {
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: inlinker %s had error: %s",
|
|
reply->ptr_ubuf,
|
|
mstrerror(reply->m_errno));
|
|
return true;
|
|
}
|
|
// wtf?
|
|
if ( reply->size_ubuf <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
// set basic info
|
|
rl->m_rl_siteRank = reply->m_siteRank;
|
|
rl->m_rl_firstIp = reply->m_firstIp;
|
|
|
|
// sanity
|
|
if ( rl->m_rl_docId != reply->m_docId ) { char *xx=NULL;*xx=0; }
|
|
|
|
char *title = reply->ptr_tbuf;
|
|
int32_t titleSize = reply->size_tbuf;
|
|
if ( titleSize == 0 ) {
|
|
title = "\0";
|
|
titleSize = 1;
|
|
}
|
|
|
|
// debug
|
|
//log("seo: DEBUG: got VALID msg20 reply #%" INT32 "",m_numValidMsg20s);
|
|
|
|
// count as valid
|
|
m_numValidMsg20s++;
|
|
|
|
rl->m_urlSize = reply->size_ubuf;
|
|
rl->m_titleSize = titleSize;
|
|
|
|
|
|
if ( ! m_newLinkerBuf.safeMemcpy ( rl , sizeof(RecommendedLink) ) )
|
|
return false;
|
|
if ( ! m_newLinkerBuf.safeMemcpy ( reply->ptr_ubuf,reply->size_ubuf))
|
|
return false;
|
|
if ( ! m_newLinkerBuf.safeMemcpy ( title , titleSize ) )
|
|
return false;
|
|
|
|
|
|
// i guess we are done then
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
// returns false if blocked, true otherwise. sets g_errno on error
|
|
bool XmlDoc::printRecommendedLinksBuf ( SafeBuf *sb ) {
|
|
|
|
SafeBuf *recBuf = getRecommendedLinksBuf();
|
|
if ( ! recBuf ) return true;
|
|
if ( recBuf == (void *)-1 ) return false;
|
|
|
|
int32_t count = 1;
|
|
char *p = recBuf->getBufStart();
|
|
char *pend = recBuf->getBuf ();
|
|
for ( ; p < pend ; ) {
|
|
// cast it
|
|
RecommendedLink *ri = (RecommendedLink *)p;
|
|
// skip it
|
|
p += ri->getSize();
|
|
// print it out
|
|
sb->safePrintf("%" INT32 ") %.04f %s | %s<br>"
|
|
,count++
|
|
,ri->m_totalRecommendedScore
|
|
,ri->getUrl(recBuf)
|
|
,ri->getTitle(recBuf)
|
|
);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
|
|
// . use Msg25::m_numReplyPtrs and Msg25::m_replyPtrs[i] to access the
|
|
// Msg20s of the inlinks
|
|
// . NOT the same as getLinkInfo() because this does not filter out the
|
|
// "bad" inlinks, it gets everything and keeps the full Msg20Replies!!
|
|
Msg25 *XmlDoc::getAllInlinks ( bool forSite ) {
|
|
|
|
// if valid, return it now
|
|
if ( forSite && m_tempMsg25SiteValid )
|
|
return m_tempMsg25Site;
|
|
|
|
if ( ! forSite && m_tempMsg25PageValid )
|
|
return m_tempMsg25Page;
|
|
|
|
Msg25 *myMsg25 ;
|
|
if ( forSite ) myMsg25 = m_tempMsg25Site;
|
|
else myMsg25 = m_tempMsg25Page;
|
|
|
|
int32_t *ipp = getIp();
|
|
if ( ! ipp || ipp == (void *)-1 ) return (Msg25 *)ipp;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Msg25 *)d;
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (Msg25 *)site;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
Url *fu = getFirstUrl();
|
|
|
|
// make a new one
|
|
if ( ! myMsg25 ) {
|
|
Msg25 *msg25 = NULL;
|
|
try { msg25 = new ( Msg25 ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("xmldoc: linksrc error2 = %s",mstrerror(g_errno));
|
|
m_hadLinkInfoError = g_errno;
|
|
}
|
|
mnew ( msg25, sizeof(Msg25),"m25li");
|
|
// record it for freeing/deleting later
|
|
if ( forSite ) m_tempMsg25Site = msg25;
|
|
else m_tempMsg25Page = msg25;
|
|
// reference it
|
|
myMsg25 = msg25;
|
|
}
|
|
|
|
int32_t type ;
|
|
if ( forSite ) type = cr_Msg25SiteInfo;
|
|
else type = cr_Msg25PageInfo;
|
|
|
|
// get list
|
|
RdbList *myList;
|
|
if ( forSite ) myList = &m_siteReplyList;
|
|
else myList = &m_pageReplyList;
|
|
|
|
int32_t uh32 =(uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
// first check cachedb!
|
|
bool checkIt = false;
|
|
if ( forSite && ! m_checkedCachedbForSite ) checkIt = true;
|
|
if ( ! forSite && ! m_checkedCachedbForPage ) checkIt = true;
|
|
if ( checkIt ) {
|
|
// do not repeat
|
|
if ( forSite ) m_checkedCachedbForSite = true;
|
|
else m_checkedCachedbForPage = true;
|
|
// use 0 for content hash since the link info is independent
|
|
// of your page's or site's content
|
|
key_t sk = g_cachedb.makeStartKey2 ( uh32 , 0 , type );
|
|
key_t ek = g_cachedb.makeEndKey2 ( uh32 , 0 , type );
|
|
// . get it from the appropriate host
|
|
// . get cachedb rec for all types of safebufs for this
|
|
// url/content
|
|
// . then we will set safebufs based on what recs we find
|
|
// in the returned list
|
|
if ( ! m_msg0.getList ( -1, // hostid
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxcacheage
|
|
false, // addtocache?
|
|
RDB_CACHEDB,
|
|
cr->m_collnum ,
|
|
myList, // &m_cacheList,
|
|
(char *)&sk ,
|
|
(char *)&ek ,
|
|
30000000, // minrecsizes 30MB
|
|
m_masterState,
|
|
m_masterLoop,
|
|
m_niceness ) )
|
|
// blocked?
|
|
return (Msg25 *)-1;
|
|
}
|
|
|
|
Msg20Reply *reply;
|
|
|
|
// even if it had 0 msg20replies, list should be non-zero length
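// the cachedb record parsed here (and built further below before the
// m_msg1.addList() call) is laid out as:
//   [12-byte cachedb key][4-byte dataSize][4-byte # of msg20 replies]
//   then per reply: [4-byte replySize][serialized Msg20Reply]
// the reply ptrs handed to the Msg25 below point directly into this list,
// which is why m_ownReplies is set false and the replies are never freed
// individually.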
|
|
if ( ! myList->isEmpty() ) {
|
|
// get # replies
|
|
char *p = myList->getList();
|
|
// first is key
|
|
p += 12;
|
|
// then datasize
|
|
p += 4;
|
|
// then # msg20 replies
|
|
int32_t numReplies = *(int32_t *)p;
|
|
p += 4;
|
|
myMsg25->m_numReplyPtrs = numReplies;
|
|
// do not free any replies, they reference into myList (the cachedb list)
|
|
myMsg25->m_ownReplies = false;
|
|
// loop over replies
|
|
for ( int32_t i = 0 ; i < numReplies ; i++ ) {
|
|
// get reply size
|
|
int32_t replySize = *(int32_t *)p;
|
|
p += 4;
|
|
// reply itself
|
|
reply = (Msg20Reply *)p;
|
|
// reconstruct ptrs from the offsets relative
|
|
// to start of "reply"
|
|
int32_t used = reply->deserialize();
|
|
if ( used < 0 ) {
|
|
log("xmldoc: reply deserialize error");
|
|
g_errno = ECORRUPTDATA;
|
|
return NULL;
|
|
}
|
|
// skip reply
|
|
p += replySize;
|
|
// store it
|
|
myMsg25->m_replyPtrs[i] = reply;
|
|
}
|
|
// validate!
|
|
if ( forSite ) m_tempMsg25SiteValid = true;
|
|
else m_tempMsg25PageValid = true;
|
|
// all done!
|
|
return myMsg25;
|
|
}
|
|
|
|
bool *calledItPtr ;
|
|
if ( forSite ) calledItPtr = &m_calledMsg25ForSite;
|
|
else calledItPtr = &m_calledMsg25ForPage;
|
|
|
|
|
|
// ok, get it the hard way
|
|
// send out the request now
|
|
if ( ! *calledItPtr ) {
|
|
// do not re-call!
|
|
*calledItPtr = true;
|
|
// call it now
|
|
if ( ! myMsg25->getLinkInfo2( site,
|
|
fu->getUrl() , // url
|
|
false , // isSiteLinkInfo?
|
|
*ipp,
|
|
*d, // docid
|
|
m_collnum,//cr->m_coll,
|
|
NULL, // qbuf
|
|
0, // qbufSize
|
|
m_masterState, // state
|
|
m_masterLoop, // callback
|
|
false, // isInjecting?
|
|
false, // pbuf (for printing)
|
|
//this, // xd holder (Msg25::m_xd
|
|
false, // printInXml
|
|
// this is irrelevant since we
|
|
// are getting all inlinks:
|
|
0, // siteNumInlinks, irrelevant
|
|
NULL, // oldlinkinfo
|
|
m_niceness,
|
|
true, // doLinkSpamCheck?
|
|
true, // onevoteperip. unused?
|
|
false,// can be cancelled?
|
|
0, // lastupdatetime
|
|
// !!!!!!!!!!
|
|
// we want all!!!!!!!!!!!!!!!!!!!
|
|
// !!!!!!!!!!
|
|
false ,//onlyneedgoodinlinks?
|
|
false,//getlinkertitles?
|
|
0, // ourhosthash32 (special)
|
|
0, // ourdomhash32 (special)
|
|
&m_myTempLinkInfoBuf ) )
|
|
// blocked?
|
|
return (Msg25 *)-1;
|
|
}
|
|
|
|
// validate it so when msg1 below returns and calls this function
|
|
// again at the top we return the ptr right away
|
|
if ( forSite ) m_tempMsg25SiteValid = true;
|
|
else m_tempMsg25PageValid = true;
|
|
|
|
// serialize the msg20 reply ptrs into a buf for list
|
|
SafeBuf listBuf;
|
|
// compute datasize
|
|
int32_t dataSize = 0;
|
|
// # of replies
|
|
dataSize += 4;
|
|
// each reply
|
|
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
|
|
// reply size
|
|
dataSize += 4;
|
|
// reply data
|
|
//dataSize += myMsg25->m_replySizes[i];
|
|
// we can't use replySizes[i] because Linkdb.cpp will
|
|
// MODIFY the msg20 replies to add ptr_note/size_note
|
|
reply = myMsg25->m_replyPtrs[i];
|
|
// so we have to calculate the new serialized size
|
|
dataSize += reply->getStoredSize();
|
|
}
|
|
// how much to reserve?
|
|
int32_t need = sizeof(key_t) + 4 + dataSize;
|
|
// reserve that space!
|
|
if ( ! listBuf.reserve ( need ,"listbuf" ) ) {
|
|
// just ignore error
|
|
g_errno = 0;
|
|
// and return
|
|
if ( forSite ) return m_tempMsg25Site;
|
|
else return m_tempMsg25Page;
|
|
}
|
|
// make key for it, contenthash is 0, since it is irrelevant
|
|
key_t kk = g_cachedb.makeKey ( uh32 , 0 , type );
|
|
// store key
|
|
listBuf.safeMemcpy ( &kk , sizeof(key_t) );
|
|
// store datasize
|
|
listBuf.pushLong ( dataSize );
|
|
// # of replies
|
|
listBuf.pushLong ( myMsg25->m_numReplyPtrs );
|
|
// store each reply then
|
|
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
|
|
// get reply
|
|
reply = myMsg25->m_replyPtrs[i];
|
|
// . how many bytes to store the MODIFIED msg20reply?
|
|
// . Linkdb.cpp adds the ptr_note AFTER it receives all replies
|
|
// so we can't just use Msg25::m_replySizes[i]
|
|
int32_t replySize = reply->getStoredSize();
|
|
listBuf.pushLong ( replySize );
|
|
// store that
|
|
int32_t stored = reply->serialize ( listBuf.getBuf() ,
|
|
listBuf.getAvail() );
|
|
// skip that
|
|
listBuf.incrementLength ( stored );
|
|
// sanity
|
|
if ( stored != replySize ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// sanity
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// make the list to add to cachedb
|
|
|
|
key_t startKey = g_cachedb.makeStartKey2 ( uh32, 0 , type );
|
|
key_t endKey = g_cachedb.makeEndKey2 ( uh32, 0 , type );
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
//m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
char *tt ;
|
|
if ( forSite ) tt = "site";
|
|
else tt = "page";
|
|
log("xmldoc: adding msg20%slinkreplies list of %" INT32 " bytes to cachedb",
|
|
tt,m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
// blocked?
|
|
return (Msg25 *)-1;
|
|
|
|
if ( forSite ) return m_tempMsg25Site;
|
|
else return m_tempMsg25Page;
|
|
}
|
|
|
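// Illustrative sketch (hypothetical helper, not called anywhere): the
// cachedb record built above has the layout
//   key_t | int32_t dataSize | int32_t numReplies | ( int32_t size , serialized reply )*
// so a consumer could walk it back like this.
static void walkCachedMsg20Replies ( char *rec ) {
	char *p = rec;
	p += sizeof(key_t);                      // the cachedb key
	int32_t dataSize   = *(int32_t *)p; p += 4;
	int32_t numReplies = *(int32_t *)p; p += 4;
	char *pend = rec + sizeof(key_t) + 4 + dataSize;
	for ( int32_t i = 0 ; i < numReplies && p < pend ; i++ ) {
		int32_t replySize = *(int32_t *)p; p += 4;
		// p now points at one serialized Msg20Reply of replySize bytes
		log("seo: cached link reply #%" INT32 " is %" INT32 " bytes",
		    i, replySize);
		p += replySize;
	}
}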
|
// . returns false and sets g_errno on error
// . sets RelatedDocId::m_relatedWeight
// . when printing the competitor pages, we sort by this, highest first
// 1. scan the list of queries for each related docid
// 2. determine each of those matching queries' weights
// 3. add up the weights and set RelatedDocId::m_relatedWeight to that
bool XmlDoc::setRelatedDocIdWeightAndRank ( RelatedDocId *rd ) {
|
|
|
|
// get our site hash
|
|
int32_t *shp = getSiteHash32();
|
|
if ( ! shp ) return false;
|
|
if ( shp == (int32_t *)-1 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
int32_t mainUrlSiteRank = getSiteRank();
|
|
|
|
// max queries
|
|
int32_t nc = rd->m_numCommonQueries;
|
|
int32_t unit = 0;
|
|
unit += sizeof(float);
|
|
//unit += sizeof(Msg99Reply *);
|
|
unit += sizeof(Query);
|
|
unit += sizeof(HashTableX);
|
|
unit += sizeof(QueryNumLinkedNode *);
|
|
int32_t need = nc * unit;
|
|
char *mem = (char *)mmalloc ( need , "qrybuf" );
|
|
if ( ! mem ) {
|
|
log("seo: failed to set related docid weight: %s",
|
|
mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
char *p = mem;
|
|
|
|
float *queryWeights = (float *)p;
|
|
p += nc * sizeof(float);
|
|
|
|
//Msg99Reply **replyPtrs = (Msg99Reply **)p;
|
|
//p += nc * sizeof(Msg99Reply *);
|
|
|
|
Query *queries = (Query *)p;
|
|
p += nc * sizeof(Query);
|
|
|
|
QueryNumLinkedNode **qnPtrs = (QueryNumLinkedNode **)p;
|
|
p += nc * sizeof(QueryNumLinkedNode *);
|
|
|
|
HashTableX *htables = (HashTableX *)p;
|
|
p += nc * sizeof(HashTableX);
|
|
|
|
// sanity
|
|
if ( p != mem + need ) { char *xx=NULL;*xx=0; }
|
|
// initialize the mem
|
|
for ( int32_t i = 0 ; i < nc ; i++ ) {
|
|
queryWeights[i] = 1.0;
|
|
qnPtrs[i] = NULL;
|
|
queries[i].constructor();
|
|
htables[i].constructor();
|
|
}
|
|
|
|
// total pages indexed!
|
|
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
|
|
|
|
|
|
float totalWeight;
|
|
|
|
// get matching queries
|
|
//SafeBuf *qpbuf = getMatchingQueriesScored();
|
|
//if ( ! qpbuf || qpbuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// cast it
|
|
//Msg99Reply **qptrs=(Msg99Reply **)qpbuf->getBufStart();
|
|
|
|
SafeBuf *mq = getMatchingQueryBuf();
|
|
if ( mq == NULL || mq == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
int32_t nks = mq->length() / sizeof(QueryLink);
|
|
QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
|
|
// print the queries in common!
|
|
int32_t firstOff = rd->m_firstCommonQueryNumOff;
|
|
int32_t offset = firstOff;
|
|
int32_t qc = 0;
|
|
|
|
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
|
|
// this is fixed at the time we set QueryLogEntry::m_numResultsInSlice
|
|
int64_t numPagesIndexed = 1114000000;
|
|
int64_t point0 = numPagesIndexed / 119LL;
|
|
int64_t point1 = numPagesIndexed / 15LL;
|
|
|
|
// loop over the query/score pairs this related docid matched
|
|
for ( ; offset >= 0 ; qc++ ) {
|
|
// get that node
|
|
char *buf = m_commonQueryNumBuf.getBufStart();
|
|
// and offset
|
|
buf += offset;
|
|
// then cast
|
|
QueryNumLinkedNode *qn;
|
|
qn = (QueryNumLinkedNode *)buf;
|
|
// advance. will be -1 when done
|
|
if ( qn ) offset = qn->m_nextOff;
|
|
else offset = -1;
|
|
// get #qn into there
|
|
//Msg99Reply *rp = qptrs[qn->m_queryNum];
|
|
if ( qn->m_queryNum < 0 || qn->m_queryNum >= nks ) {
|
|
char *xx=NULL;*xx=0; }
|
|
QueryLink *qk = &qks[qn->m_queryNum];
|
|
QueryLogEntry *qe ;
|
|
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
|
|
char *qstr = qe->getQueryString();
|
|
|
|
qnPtrs[qc] = qn;
|
|
|
|
// save ptrs too
|
|
//replyPtrs[qc] = rp;
|
|
|
|
// get main url score for query
|
|
//float mainUrlScore = rp->m_myScore;
|
|
int32_t mainUrlSiteHash26 = m_siteHash32;
|
|
// seems like clusterdb masks them a bit in
|
|
// Clusterdb::getSiteHash()
|
|
mainUrlSiteHash26 &= 0x03ffffff;
|
|
|
|
int32_t mainUrlRank = -1;
|
|
int32_t rdRank = -1;
|
|
//float mainUrlSerpScore = -1.0;
|
|
|
|
// . the relateddocidnumhack
|
|
// . this is used as the topdocidnum # in the case of
|
|
// m_matchingQueryBuf (doMatchingQueries)
|
|
int32_t tdnum = qk->m_relatedDocIdNum;
|
|
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
|
|
int32_t maxnum = m_topDocIdsBuf.length()/sizeof(TopDocIds);
|
|
if ( tdnum < 0 || tdnum >= maxnum ) { char *xx=NULL;*xx=0; }
|
|
TopDocIds *td = &tds[tdnum];
|
|
|
|
// assume none
|
|
//float rdScore = 0.0;
|
|
// find docid for this related docid
|
|
//TopDocIds *td = rp->getTopDocIds(&m_topDocIdsBuf);
|
|
|
|
int32_t nd = td->m_numDocIds;
|
|
for ( int32_t y = 0 ; y < nd ; y++ ) {
|
|
// if we first encounter a result from the same
|
|
// site as the main url then stop! you don't get
|
|
// the 10x bonus then!
|
|
if ( td->m_topSiteHashes26[y] == mainUrlSiteHash26 &&
|
|
mainUrlRank == -1 ) {
|
|
//mainUrlSerpScore = td->m_topScores[y];
|
|
mainUrlRank = y;
|
|
}
|
|
// set our score?
|
|
if ( td->m_topDocIds[y] == rd->m_docId ) {
|
|
//rdScore = td->m_topScores[y];
|
|
rdRank = y;
|
|
}
|
|
}
|
|
// these should always be set! even if not ranked in the
|
|
// top 300 because of our new logic using msg4f in
|
|
// getRelatedDocIdsScored()
|
|
float rdScore = qn->m_relatedDocIdSerpScore;
|
|
float mainUrlSerpScore = qk->m_serpScore;
|
|
|
|
bool better = false;
|
|
// give it a weight of 10 if higher-scoring!
|
|
//if ( rdRank < mainUrlRank ) better = true;
|
|
if ( rdScore >= mainUrlSerpScore ) better = true;
|
|
// if your site not in top 300 or so, and he is, he's better
|
|
//if ( mainUrlRank == -1 && rdRank >= 0 ) better = true;
|
|
|
|
// this is the specific url, not the SITE, like
|
|
// mainUrlRank is, for the entire site
|
|
//if ( rdScore > mainUrlScore ) better = true;
|
|
|
|
// how many search results does this query have total?
|
|
int64_t numResults = qe->m_numTotalResultsInSlice;
|
|
// fix it to be global
|
|
numResults *= (int64_t)g_hostdb.getNumShards();
|
|
// big indexes did the "slice logic" restricting docid
|
|
// range to MAX_DOCID * .10 when setting this!
|
|
if ( numPagesIndexed > 10000000 ) numResults *= 10;
|
|
|
|
////////////////////
|
|
//
|
|
// Scoring is what we do when the number of combinations
|
|
// is too high to effectively compute. - matt
|
|
//
|
|
////////////////////
|
|
|
|
|
|
// lower from 10 so google still won't dominate generic queries?
|
|
// crap, at 2.0 gigablast.com had bad competitors because
|
|
// they all match queries with gigablast in them.
|
|
// i put it down from 30.0 to 5.0 to fix chessusa.com
|
|
// who was getting bad competitor pages that had just
|
|
// 'ccc' matching non-generic queries having them come up too
|
|
// high of score.
|
|
//if ( better )
|
|
// queryWeights[qc] = 1.0;//30.0;//100.0; // 10.0;
|
|
|
|
//
|
|
// do not give related docid query that has YOUR brand in it
|
|
// much weight. we do not want it talking about you, because
|
|
// it is a competitor.
|
|
//
|
|
// PROBLEM: "cheatcodes.com"'s brand is descriptive!
|
|
//
|
|
// . if not generic and it beats YOU, give more!
|
|
// . try to fix ibm.com gigablast.com seomoz.org ahrefs.com
|
|
// that suffer because of matching their brand. actually
|
|
// maybe only do this if seomoz.org matches this query
|
|
// with their link text only...??? thus, pages that contain
|
|
// "seo moz" will match the "seo moz" query but will gain
|
|
// RELATIVELY little because they can't be seomoz.org on it.
|
|
// . crap though this will hurt chessusa.com right?? try again
|
|
// since algo changed a lot since then
|
|
bool isBrand = true;
|
|
// if other guy ranks better than you, probably not
|
|
// your brand, or if it is, it could be his brand too?
|
|
if ( better ) // && numResults < point0 )
|
|
isBrand = false;
|
|
// or if you are not in the top 100 it is probably not
|
|
// your brand name either!
|
|
if ( mainUrlRank == -1 )
|
|
isBrand = false;
|
|
// fix chessusa.com for 'chess' by lowering from 100 to 20...
|
|
if ( mainUrlRank >= 20 )
|
|
isBrand = false;
|
|
// fix 'corporation' for ibm.com. it is too generic to
|
|
// be a brand. on our 1.1B page index, point0 is like 9.3M.
|
|
// 'ibm' is 5.5M, 'corporation' is 25M,...
|
|
if ( numResults >= point0 )
|
|
isBrand = false;
|
|
// or for ibm.com ... or other pages with high siteranks,
|
|
// your brand queries should be in the top 10!! otherwise,
|
|
// ibm has so many other matching queries in the top 100 that
|
|
// are not brands for it because its siterank is so high.
|
|
if ( mainUrlSiteRank >= 10 && mainUrlRank >= 10 )
|
|
isBrand = false;
|
|
// top 5 for brands in siterank 11 sites
|
|
if ( mainUrlSiteRank >= 11 && mainUrlRank >= 5 )
|
|
isBrand = false;
|
|
|
|
// . good competitors will be in top 30 for a query
|
|
// . let's keep in mind though that we use these competitors
|
|
// to find backlinks AND to generate related terms, so
|
|
// it's not so important that they dominate a query, but
|
|
// rather that they match your content...
|
|
/*
|
|
if ( better &&
|
|
numResults < point0 &&
|
|
rdRank >= 0 &&
|
|
rdRank < 20 )
|
|
queryWeights[qc] *= 1.2;//50.0;
|
|
|
|
// top ten???
|
|
if ( better &&
|
|
numResults < point0 &&
|
|
rdRank >= 0 &&
|
|
rdRank < 10 )
|
|
queryWeights[qc] *= 1.3;//51.0;
|
|
|
|
// top 5?
|
|
if ( better &&
|
|
numResults < point0 &&
|
|
rdRank >= 0 &&
|
|
rdRank < 5 )
|
|
queryWeights[qc] *= 1.4;//52.0;
|
|
*/
|
|
|
|
// weight it by how relevant the query it matches is to us
|
|
//if ( better && numResults < point0 )
|
|
// queryWeights[qc] = (qk->m_serpScore / 1000000.0);
|
|
|
|
//
|
|
// generic query?
|
|
//
|
|
float weight = 1.0;
|
|
if ( numResults < point0 ) weight = 100.0;
|
|
else if ( numResults < point1 ) weight = 10.0;
|
|
queryWeights[qc] *= weight;
|
|
|
|
//
|
|
// weight by related docid's serp score
|
|
//
|
|
float ss = qk->m_serpScore;
|
|
float w2 = 1.0;
|
|
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
|
|
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
|
|
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
|
|
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
|
|
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
|
|
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
|
|
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
|
|
else if ( ss > 100.0 ) w2 = 3.0; // > 100
|
|
else if ( ss > 10.0 ) w2 = 2.0; // > 10
|
|
queryWeights[qc] *= w2;
|
|
|
|
|
|
//
|
|
// weight by main url's serp score as well!
|
|
//
|
|
ss = mainUrlSerpScore;//qk->m_serpScore;
|
|
w2 = 1.0;
|
|
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
|
|
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
|
|
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
|
|
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
|
|
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
|
|
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
|
|
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
|
|
else if ( ss > 100.0 ) w2 = 3.0; // > 100
|
|
else if ( ss > 10.0 ) w2 = 2.0; // > 10
|
|
queryWeights[qc] *= w2;
|
|
|
|
|
|
// punish query weight if it is your brand most likely
|
|
//if ( isBrand )
|
|
// queryWeights[qc] = 0.01;
|
|
|
|
// . store related docid rank and your rank
|
|
// . then we do not need cache m_topDocIdsBuf and seo.cpp
|
|
// has this info readily available.
|
|
qn->m_relatedDocIdRank = rdRank;
|
|
qn->m_mainUrlRank = mainUrlRank;
|
|
//qn->m_mainUrlSerpScore = mainUrlSerpScore;
|
|
|
|
/*
|
|
int64_t numResults = qe->m_numTotalResultsInSlice;
|
|
// fix it to be global
|
|
numResults *= (int64_t)g_hostdb.getNumGroups();
|
|
// big indexes did the "slice logic" restricting docid
|
|
// range to MAX_DOCID * .10 when setting this!
|
|
if ( numPagesIndexed > 10000000 ) numResults *= 10;
|
|
// fix divide by zero and make all rare queries similar weight
|
|
//if ( numResults < 1000 ) numResults = 1000;
|
|
// divide by # results query has so more generic stuff
|
|
// is down weighted
|
|
//queryWeights[qc] /= (float)numResults;
|
|
if ( numResults < 1000 )
|
|
queryWeights[qc] /= 1;
|
|
else if ( numResults < 10000 )
|
|
queryWeights[qc] /= 2;
|
|
else if ( numResults < 100000 )
|
|
queryWeights[qc] /= 4;
|
|
else if ( numResults < 1000000 ) // 1M
|
|
queryWeights[qc] /= 8;
|
|
else if ( numResults < 10000000 ) // 10M
|
|
queryWeights[qc] /= 16;
|
|
else if ( numResults < 10000000 ) // 100M
|
|
queryWeights[qc] /= 32;
|
|
else
|
|
queryWeights[qc] /= 64;
|
|
*/
|
|
|
|
//int32_t qlen = gbstrlen(qstr);
|
|
// shortcuts
|
|
Query *qp = &queries[qc];
|
|
HashTableX *ht = &htables[qc];
|
|
// this is currently an int64_t bit vector
|
|
int32_t vs = sizeof(qvec_t);
|
|
if ( ! ht->set ( 8,vs,128,NULL,0,false,m_niceness,"wbvbuf") )
|
|
// hopefully g_errno is preserved
|
|
goto done;
|
|
// if unknown use english so pandora's -> pandora,pandoras?
|
|
// because 'pandora's tower' was not matching
|
|
// 'pandoras tower' because both words could have been
|
|
// english or german, thus the queries were thought to be
|
|
// independent! giving rise to high-scoring competitive pages
|
|
// that matched only those two queries.
|
|
uint8_t qlangId = qe->m_langId;
|
|
if ( ! qlangId ) qlangId = langEnglish;
|
|
qp->set2 ( qstr , qlangId , true );
|
|
// hash it up
|
|
for ( int32_t i = 0 ; i < qp->m_numTerms ; i++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qp->m_qterms[i];
|
|
// bigrams imply 2 explicit bits, one from each term
|
|
// in the bigram. synonym terms should share the same
|
|
// bit as the term they are a synonym of
|
|
int64_t bits = qt->m_implicitBits;
|
|
// . add bit vec. use rawTermId?
|
|
// . hash to wordbit vector of query words contained
|
|
if ( ! ht->addKey ( &qt->m_termId , &bits ) )
|
|
goto done;
|
|
}
|
|
}
|
|
|
|
// . set the dup flags!
|
|
// . scan queries related docid matches
|
|
for ( int32_t i = 0 ; i < qc ; i++ ) {
|
|
// get it
|
|
Query *qpi = &queries[i];
|
|
HashTableX *hti = &htables[i];
|
|
// scan all queries above
|
|
for ( int32_t j = i+1 ; j < qc ; j++ ) {
|
|
// reset
|
|
bool jIsSubQueryOfi = false;
|
|
bool iIsSubQueryOfj = false;
|
|
// skip ourselves
|
|
//if ( j == i ) continue;
|
|
// get it
|
|
Query *qpj = &queries[j];
|
|
HashTableX *htj = &htables[j];
|
|
// scan every query term in query #j and map each
|
|
// termid to the term bit vector that indicates what
|
|
// terms query #j has in query #i.
|
|
qvec_t totalVec = 0LL;
|
|
// is it a dup?
|
|
for ( int32_t k = 0 ; k < qpj->m_numTerms ; k++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qpj->m_qterms[k];
|
|
// see if in there
|
|
char *val ;
|
|
val = (char *)hti->getValue(&qt->m_termId);
|
|
if ( ! val ) continue;
|
|
// get implied term bits
|
|
qvec_t vec = *(qvec_t *)val;
|
|
// this is the termbit vector for query #i.
|
|
// it tells us what terms query #j shares.
|
|
totalVec |= vec;
|
|
}
|
|
// we only care about "required" terms. i.e. bigrams
|
|
// are essentially ignored if not in quotes.
|
|
totalVec &= qpi->m_requiredBits;
|
|
// how many words do we match?
|
|
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
|
|
int32_t numSharedWithQueryi = getNumBitsOn64(totalVec);
|
|
// how many required bits does it have?
|
|
int32_t needi = getNumBitsOn64(qpi->m_requiredBits);
|
|
// if all terms in query #i are in query #j then subset
|
|
if ( numSharedWithQueryi == needi )
|
|
iIsSubQueryOfj = true;
|
|
|
|
//
|
|
// now go the other way
|
|
//
|
|
totalVec = 0LL;
|
|
// is it a dup?
|
|
for ( int32_t k = 0 ; k < qpi->m_numTerms ; k++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qpi->m_qterms[k];
|
|
// see if in there
|
|
char *val;
|
|
val = (char *)htj->getValue(&qt->m_termId);
|
|
if ( ! val ) continue;
|
|
// get implied term bits
|
|
qvec_t vec = *(qvec_t *)val;
|
|
// this is the termbit vector for query #j.
|
|
// it tells us what terms query #i shares.
|
|
totalVec |= vec;
|
|
}
|
|
// we only care about "required" terms. i.e. bigrams
|
|
// are essentially ignored if not in quotes.
|
|
totalVec &= qpj->m_requiredBits;
|
|
// how many words do we match?
|
|
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
|
|
int32_t numSharedWithQueryj = getNumBitsOn64(totalVec);
|
|
// how many required bits does it have?
|
|
int32_t needj = getNumBitsOn64(qpj->m_requiredBits);
|
|
// if all terms in query #i are in query #j then subset
|
|
if ( numSharedWithQueryj == needj )
|
|
jIsSubQueryOfi = true;
|
|
|
|
|
|
// now set dup bit if query #i is same as query #j
|
|
// taking into account "missing spaces" so that we
|
|
// have two terms in one query , and their bigram
|
|
// in the other query. OR we have synonyms. OR we
|
|
// have differences of "ignored" words.
|
|
// "leg" = "legs"
|
|
// "cheat code" = "cheatcodes"
|
|
// "the tigers" = "tigers"
|
|
if ( jIsSubQueryOfi &&
     iIsSubQueryOfj &&
     queryWeights[j] > .02 ) {
|
|
// debug?
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: %s ISDUPOF %s",
|
|
qpj->m_orig,
|
|
qpi->m_orig);
|
|
// the dup weight is .02
|
|
queryWeights[j] *= .1; // = .02
|
|
}
|
|
|
|
// proper subquery examples:
|
|
// "leg" is subquery of "nice legs"
|
|
else if ( jIsSubQueryOfi &&
|
|
! iIsSubQueryOfj &&
|
|
queryWeights[j] > .05 ) {
|
|
// debug?
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: %s SUBQUERYOF %s",
|
|
qpj->m_orig,
|
|
qpi->m_orig);
|
|
// the subquery weight is .05
|
|
queryWeights[j] *= 0.1; // = 5.0;//.05;
|
|
}
|
|
|
|
// is query #i a PROPER subquery of query #j
|
|
else if ( iIsSubQueryOfj &&
|
|
! jIsSubQueryOfi &&
|
|
queryWeights[i] > .05 ) {
|
|
// debug?
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: %s SUBQUERYOF %s",
|
|
qpi->m_orig,
|
|
qpj->m_orig);
|
|
// the subquery weight is .05
|
|
// increase to 5.0 to try to drown out the
|
|
// anomaly queries promoting poker sites
|
|
// for cheatcodes.com competitors
|
|
queryWeights[i] *= 0.1; // = 5.0;//.05;
|
|
}
|
|
|
|
else {
|
|
// debug?
|
|
//if ( debug )
|
|
//log("seo: %s UNRELATEDTO %s",
|
|
// qpi->m_orig,
|
|
// qpj->m_orig);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
// scan the queries again and add up their weights this time!
|
|
totalWeight = 0.0;
|
|
for ( int32_t i = 0 ; i < qc ; i++ ) {
|
|
totalWeight += queryWeights[i];
|
|
qnPtrs[i]->m_queryScoreWeight = queryWeights[i];
|
|
//Msg99Reply *ptr = replyPtrs[i];
|
|
Query *qp = &queries[i];
|
|
char *qstr = qp->m_orig;//ptr->m_queryStr;
|
|
// log it
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: docid=%" INT64 " weight=%f qry=%s",
|
|
rd->m_docId,
|
|
queryWeights[i],
|
|
qstr);
|
|
}
|
|
|
|
|
|
// that is the docid related weight now
|
|
rd->m_relatedWeight = totalWeight;
|
|
|
|
done:
|
|
for ( int32_t i = 0 ; i < nc ; i++ ) {
|
|
queries[i].destructor();
|
|
htables[i].destructor();
|
|
}
|
|
mfree ( mem , need , "qrybuf" );
|
|
return true;
|
|
}
|
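// Illustrative sketch (hypothetical helper, not called anywhere) of the
// numResults extrapolation inside setRelatedDocIdWeightAndRank() above:
// each host only counted hits for its own slice of the index, and on big
// indexes the count was taken over roughly a tenth of the docid range, so
// both factors get scaled back up.
static int64_t estimateGlobalResults ( int64_t resultsInSlice ,
				       int32_t numShards ,
				       int64_t numPagesIndexed ) {
	int64_t n = resultsInSlice * (int64_t)numShards;
	if ( numPagesIndexed > 10000000 ) n *= 10;
	return n;
}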
|
|
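// Illustrative sketch (hypothetical helpers, not called anywhere) of the two
// per-query weightings applied in setRelatedDocIdWeightAndRank() above:
// rarer ("less generic") queries get a large boost, and the serp score is
// folded in on a rough log10 scale (w2 = 2 above 10, 3 above 100, ... 10
// above 1B).
static float serpScoreWeight ( float ss ) {
	float w = 1.0;
	for ( float t = 10.0 ; t <= 1000000000.0 && ss > t ; t *= 10.0 )
		w += 1.0;
	return w;
}
static float genericnessWeight ( int64_t numResults ,
				 int64_t point0 ,
				 int64_t point1 ) {
	if ( numResults < point0 ) return 100.0; // rare query
	if ( numResults < point1 ) return  10.0; // somewhat rare
	return 1.0;                              // generic query
}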
|
|
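// Illustrative sketch (hypothetical helper, not called anywhere) of the
// sub-query test in setRelatedDocIdWeightAndRank() above, simplified to a
// linear scan instead of a HashTableX lookup. Each query maps a termId to
// the bit vector of query words that term covers; query A is a sub-query of
// query B if the terms they share cover every required word-bit of A.
static bool isSubQueryOf ( int64_t *termIdsA , qvec_t *bitsA , int32_t na ,
			   int64_t *termIdsB , int32_t nb ,
			   qvec_t requiredBitsA ) {
	qvec_t covered = 0;
	for ( int32_t i = 0 ; i < na ; i++ )
		for ( int32_t j = 0 ; j < nb ; j++ )
			if ( termIdsA[i] == termIdsB[j] )
				covered |= bitsA[i];
	// bigrams etc. only count if "required"
	covered &= requiredBitsA;
	return getNumBitsOn64(covered) == getNumBitsOn64(requiredBitsA);
}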
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::addRelatedDocIdInfo ( int64_t docId ,
|
|
int32_t queryNum ,
|
|
float score ,
|
|
int32_t rank ,
|
|
int32_t siteHash26 ) {
|
|
|
|
// do not add if does not match the query
|
|
if ( score <= 0.0 ) return true;
|
|
|
|
// alloc space if first time calling
|
|
if ( ! m_rdtab.m_numSlots ) {
|
|
if ( ! m_rdtab.set(8,sizeof(RelatedDocId),1024,NULL,0,
|
|
false,0,"rdtab"))
|
|
return false;
|
|
}
|
|
|
|
// get the related docid as it exists in m_relatedDocIdBuf
|
|
RelatedDocId *rd = NULL;
|
|
|
|
// now we also store these for intersecting
|
|
// in phase 2 to see what urls are most
|
|
// similar to us
|
|
int32_t slot = m_rdtab.getSlot(&docId);
|
|
// if not there, add it
|
|
if ( slot < 0 ) {
|
|
// make one
|
|
RelatedDocId rdx;
|
|
// the most important thing is the docid!
|
|
rdx.m_docId = docId;
|
|
// and now the 32-bit site hash
|
|
rdx.m_siteHash26 = siteHash26;
|
|
// how many search results we are in
|
|
rdx.m_numCommonQueries = 0;
|
|
// the queryImportance should be our score
|
|
// for this query divided by m_minTop50Score
|
|
// to normalize it.
|
|
//float qimp=qp->m_queryInfo.m_queryImportance;
|
|
// just add up the query importance for
|
|
// each query we share in common with main url
|
|
//rd.m_similarityScore = qip;
|
|
// now we do a dot product of this related
|
|
// docids score vector with the main url's
|
|
// score vector. both vector's are normalized
|
|
// using the score of the 1st result!
|
|
//rd.m_dotProduct = score;
|
|
// reset this
|
|
rdx.m_rd_siteRank = -1;
|
|
rdx.m_rd_langId = 255;
|
|
rdx.rd_title_off = -1;
|
|
rdx.rd_url_off = -1;
|
|
rdx.rd_site_off = -1;
|
|
// point to beginning of linked list of qrynums
|
|
rdx.m_firstCommonQueryNumOff = -1;//off;
|
|
//rdx.m_lastCommonQueryNumOff = -1;//off;
|
|
// remember offset
|
|
int32_t rdOff = m_relatedDocIdBuf.length();
|
|
// store it
|
|
m_relatedDocIdBuf.safeMemcpy ( &rdx , sizeof(RelatedDocId) );
|
|
// add OFFSET to table. data is 12 bytes
|
|
if(! m_rdtab.addKey(&docId,&rdOff)) return false;
|
|
// all done then
|
|
//continue;
|
|
// set this for adding to the linked list
|
|
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
|
|
// cast it
|
|
rd = (RelatedDocId *)p;
|
|
}
|
|
else {
|
|
// get the data
|
|
int32_t rdOff = *(int32_t *)m_rdtab.getValueFromSlot(slot);
|
|
// point to it
|
|
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
|
|
// cast it
|
|
rd = (RelatedDocId *)p;
|
|
}
|
|
|
|
// before we add the querynumlinkednode make sure not a dup!
|
|
char *qnbuf = m_commonQueryNumBuf.getBufStart();
|
|
// . offset of first node for this related docid
|
|
// . this is the start of his linked list of query/score nodes
|
|
int32_t firstOff = rd->m_firstCommonQueryNumOff;
|
|
|
|
// sanity
|
|
if ( firstOff == -1 && rd->m_numCommonQueries ) { char *xx=NULL;*xx=0;}
|
|
|
|
// assume no linked list
|
|
QueryNumLinkedNode *node = NULL;
|
|
// only a linked list if firstOff is not -1
|
|
if ( firstOff >= 0 ) node = (QueryNumLinkedNode *)(qnbuf + firstOff);
|
|
// scan the nodes (query/score pairs) we got for this related docid
|
|
for ( ; node ; ) {
|
|
// if this query is already in the linked list, stop! we
|
|
// do not want to add dup QueryNumLinkedNode nodes.
|
|
if ( node->m_queryNum == queryNum ) return true;
|
|
// end of linked list?
|
|
if ( node->m_nextOff == -1 ) break;
|
|
// advance to next node in linked list
|
|
node = (QueryNumLinkedNode *)(qnbuf+node->m_nextOff);
|
|
}
|
|
|
|
// store query num element in a linked list so
|
|
// we can print the actually queryNums a related
|
|
// docid has in common with the main url
|
|
int32_t nodeOff = m_commonQueryNumBuf.length();
|
|
// we can record our rank and your rank in this!
|
|
QueryNumLinkedNode qn;
|
|
qn.m_queryNum = queryNum; // qp->m_queryNum;
|
|
qn.m_nextOff = -1;
|
|
qn.m_relatedDocIdRank = rank;
|
|
qn.m_relatedDocIdSerpScore = score;
|
|
qn.m_mainUrlRank = -1;
|
|
//qn.m_mainUrlSerpScore = -1.0;
|
|
int32_t sq = sizeof(QueryNumLinkedNode);
|
|
// point to it
|
|
if ( ! m_commonQueryNumBuf.safeMemcpy(&qn,sq) )
|
|
return false;
|
|
// point to node we stored in the buf so we can adjust it below
|
|
QueryNumLinkedNode *stored ;
|
|
stored = (QueryNumLinkedNode *)(m_commonQueryNumBuf.getBuf() - sq);
|
|
|
|
|
|
// increment the count. the # of nodes in his linked list.
|
|
rd->m_numCommonQueries++;
|
|
|
|
// continue the linked list
|
|
qnbuf = m_commonQueryNumBuf.getBufStart();
|
|
|
|
// the first node?
|
|
if ( firstOff == -1 ) {
|
|
rd->m_firstCommonQueryNumOff = nodeOff;
|
|
//rd->m_lastCommonQueryNumOff = nodeOff;
|
|
return true;
|
|
}
|
|
|
|
// get the current first
|
|
int32_t oldFirstOff = rd->m_firstCommonQueryNumOff;
|
|
//char *vv = qnbuf + rd->m_firstCommonQueryNumOff;
|
|
//QueryNumLinkedNode *first = (QueryNumLinkedNode *)vv;
|
|
// we are the new first
|
|
rd->m_firstCommonQueryNumOff = nodeOff;
|
|
// we point to old first as our next
|
|
stored->m_nextOff = oldFirstOff;
|
|
// and update that node's next link
|
|
//last->m_nextOff = nodeOff;
|
|
// and our new tail
|
|
//rd->m_lastCommonQueryNumOff = nodeOff;
|
|
|
|
return true;
|
|
}
|
|
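// Illustrative sketch (hypothetical helper, not called anywhere): the
// pattern used by addRelatedDocIdInfo() above. QueryNumLinkedNodes live in
// a growable SafeBuf and link to each other by byte offset rather than by
// pointer, so the list survives buffer reallocations; new nodes are pushed
// onto the front of the list.
static int32_t countCommonQueryNums ( SafeBuf *buf , int32_t firstOff ) {
	int32_t count = 0;
	int32_t off = firstOff;
	while ( off >= 0 ) {
		QueryNumLinkedNode *node ;
		node = (QueryNumLinkedNode *)(buf->getBufStart() + off);
		count++;
		off = node->m_nextOff; // -1 terminates the list
	}
	return count;
}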
|
|
// . safebuf returned is a buffer of QueryLinks
|
|
// . use m_matchingQueryBuf/m_matchingQueryStringBuf
|
|
SafeBuf *XmlDoc::getMatchingQueryBuf ( ) {
|
|
|
|
setStatus ( "getmatchingqueries" );
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_matchingQueryBufValid )
|
|
return &m_matchingQueryBuf;
|
|
|
|
|
|
if ( ! m_beginTimeAllMatch )
|
|
m_beginTimeAllMatch = gettimeofdayInMilliseconds();
|
|
|
|
if ( m_docIdListBuf.length() == 0 )
|
|
m_docIdListBuf.pushLongLong(m_docId);
|
|
|
|
// true = doMatchingQueries?
|
|
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , true );
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
|
|
m_matchingQueryBuf .stealBuf ( qkbuf );
|
|
m_matchingQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
|
|
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginTimeAllMatch;
|
|
log("seopipe: time: getMatchingQueries took %" INT64 " ms",took);
|
|
|
|
m_matchingQueryBufValid = true;
|
|
|
|
// if getRelatedQueryBuf calls getQueryLinkBuf() it should
|
|
// do a recompute, so set this to false
|
|
m_queryLinkBufValid = false;
|
|
|
|
m_docIdListBuf.purge();
|
|
|
|
// store it
|
|
if ( ! storeMatchingQueriesIntoCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
return &m_matchingQueryBuf;
|
|
}
|
|
|
|
// . returns safebuf of QueryLinks, representing the intersected matching
|
|
// queries of all the related docids
|
|
SafeBuf *XmlDoc::getRelatedQueryBuf () {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_relatedQueryBufValid )
|
|
return &m_relatedQueryBuf;
|
|
|
|
// we need these
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
|
|
|
|
if ( ! m_beginRelatedQueries )
|
|
m_beginRelatedQueries = gettimeofdayInMilliseconds();
|
|
|
|
if ( m_docIdListBuf.length() == 0 ) {
|
|
int32_t numRelatedDocIds = rdbuf->length()/sizeof(RelatedDocId);
|
|
// just use the top 50 for related queries for speed!
|
|
if ( numRelatedDocIds > 50 ) numRelatedDocIds = 50;
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
|
|
RelatedDocId *rd = &rds[i];
|
|
m_docIdListBuf.pushLongLong(rd->m_docId);
|
|
}
|
|
}
|
|
|
|
// false = doMatchingQueries?
|
|
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , false );
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
|
|
m_relatedQueryBuf .stealBuf ( qkbuf );
|
|
m_relatedQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
|
|
|
|
m_relatedQueryBufValid = true;
|
|
m_queryLinkBufValid = false;
|
|
|
|
m_docIdListBuf.purge();
|
|
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginRelatedQueries;
|
|
log("seopipe: time: getRelatedQueries took %" INT64 " ms",took);
|
|
|
|
// store it
|
|
if ( ! storeRelatedQueriesIntoCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
|
|
return &m_relatedQueryBuf;
|
|
}
|
|
|
|
|
|
static void gotMsg8eReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
int32_t hostId = slot->m_hostId;
|
|
THIS->m_msg8eReply [hostId] = slot->m_readBuf;
|
|
THIS->m_msg8eReplySize[hostId] = slot->m_readBufSize;
|
|
// do not let udpserver.cpp free it, we will later
|
|
slot->m_readBuf = NULL;
|
|
|
|
log("seo: got msg8e reply #%" INT32 " of %" INT32 " from host #%" INT32 "",
|
|
(int32_t)THIS->m_numMsg8eReplies,
|
|
(int32_t)THIS->m_numMsg8eRequests,
|
|
(int32_t)hostId);
|
|
|
|
THIS->m_numMsg8eReplies++;
|
|
// do not free send buf until last reply!
|
|
if ( THIS->m_numMsg8eReplies < THIS->m_numMsg8eRequests ) {
|
|
slot->m_sendBufAlloc = NULL;
|
|
return;
|
|
}
|
|
// ok, sendBuf will auto free in UdpServer.cpp when we return from this
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
|
|
//static void gotMsg20ReplyWrapper ( void *state ) {
|
|
// XmlDoc *THIS = (XmlDoc *)state;
|
|
// THIS->m_numMsg20Replies++;
|
|
// if ( THIS->m_numMsg20Replies < THIS->m_numMsg20Requests )
|
|
// return;
|
|
// THIS->m_masterLoop ( THIS->m_masterState );
|
|
//}
|
|
|
|
|
|
// . returned safebuf is an array of QueryLinks
// . gets all matching queries from all related docids and stores them
//   compactly as QueryLinks, otherwise we'd run out of memory because
//   each docid has like 50,000 matching queries on avg.
// . we now get matching queries in modulus parts to avoid OOM, because
//   with my new changes i made we are getting like a few hundred thousand
//   matching queries per related docid.
// . we do not store the query string, etc, for the QueryLink,
//   just the query offset and the hostid that has the query in its
//   memory (g_qbuf). after we intersect the QueryLinks we will get the
//   query strings, etc. there will be a lot fewer in the intersection.
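// Illustrative sketch (hypothetical struct, NOT the real QueryLink defined
// in seo.h): the idea described above is to keep each candidate query as a
// few fixed-size fields that reference a query living in some host's
// in-memory query log, and to copy the query string into
// m_queryLinkStringBuf only for the links that survive the intersection.
struct QueryLinkSketch {
	int32_t m_queryStringOffset;    // offset in the owning host's query log
	int32_t m_hostId;               // host whose memory (g_qbuf) holds it
	float   m_totalQueryImportance; // used to rank links when merging
};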
SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
|
|
|
|
if ( m_queryLinkBufValid )
|
|
return &m_queryLinkBuf;
|
|
|
|
bool doRelatedQueries = true;
|
|
if ( doMatchingQueries ) doRelatedQueries = false;
|
|
|
|
// get the 32-bit terms the main doc matches, so we may determine
|
|
// what terms in a related query are novel to this document.
|
|
SafeBuf *mainUrlTwidBuf32 = NULL;
|
|
if ( doRelatedQueries ) {
|
|
mainUrlTwidBuf32 = getTermId32Buf() ;//InfoBuf();
|
|
if ( ! mainUrlTwidBuf32 || mainUrlTwidBuf32 == (void *)-1 )
|
|
return mainUrlTwidBuf32;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
//
|
|
// SHIT! we can't use the keys in the termlistbuf for dual purpose
|
|
// role as terms the doc contains, because they do not have the
|
|
// synonym forms!!! So we have to get this terminfobuf as well
|
|
// as the termlistbuf for each docid!!!!
|
|
//
|
|
// so we might as well not sort by the lower 32 bit hack as well
|
|
//
|
|
|
|
|
|
//
|
|
//
|
|
// 1. get termlistbuf for each docid possibly using msg20s
|
|
//
|
|
// we need this for getting the QueryLink::m_serpScores in
|
|
// handleRequest8e
|
|
//
|
|
//
|
|
//int32_t numDocIds = docIdList->length() / 8;
|
|
//int64_t *docIds = (int64_t *)docIdList->getBufStart();
|
|
|
|
|
|
//SafeBuf *tlistBuf = NULL;
|
|
//SafeBuf *twidBuf32 = NULL;
|
|
|
|
// . we just want the termlistbuf of each related docid
|
|
// . hack: it should be sorted by the LOWER 32 bits of termid
|
|
// so handlerequest8e does not need to sort its termid32/twid32 buf
|
|
//if ( doMatchingQueries ) {
|
|
// tlistBuf = getTermListBuf();
|
|
// if ( ! tlistBuf || tlistBuf == (void *)-1 ) return tlistBuf;
|
|
// twidBuf32 = getTermId32Buf();
|
|
// if ( ! twidBuf32 || twidBuf32 == (void *)-1 ) return twidBuf32;
|
|
//}
|
|
|
|
/*
|
|
if ( doRelatedQueries && ! m_launchedAll ) {
|
|
int32_t need = sizeof(Msg20) * numDocIds;
|
|
// we also use this same buf in getRelatedDocIdsWithTitles
|
|
if ( ! m_msg20Buf.reserve ( need,"m20buf3" ) ) return NULL;
|
|
// mark it all in use
|
|
m_msg20Buf.setLength(need);
|
|
// init them
|
|
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
|
|
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
|
|
// reset cursor to start with first related docid
|
|
m_numMsg20Replies = 0;
|
|
m_numMsg20Requests = 0;
|
|
// launch all!
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// int16_tcut
|
|
Msg20 *msg20 = &mp[i];
|
|
// get current related docid
|
|
//RelatedDocId *rd = &rds[i];
|
|
// make the request
|
|
Msg20Request req;
|
|
req.ptr_coll = cr->m_coll;
|
|
req.size_coll = gbstrlen(cr->m_coll)+1;
|
|
req.m_docId = docIds[i];
|
|
req.m_expected = true;
|
|
req.m_niceness = m_niceness;
|
|
//req.m_state = m_masterState;
|
|
//req.m_callback2 = m_masterLoop;
|
|
req.m_state = this;
|
|
req.m_callback2 = gotMsg20ReplyWrapper;
|
|
// do not get summary stuff. too slow.
|
|
req.m_numSummaryLines = 0;
|
|
// get this
|
|
req.m_getTermListBuf = true;
|
|
// count these!
|
|
m_numMsg20Requests++;
|
|
// store cursor in msg20 itself so we know the rd
|
|
//msg20->m_hack2 = i;
|
|
// launch it
|
|
if ( ! msg20->getSummary ( &req ) ) continue;
|
|
// error?
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("seo: error getting termlistbuf docid=%" INT64 "",
|
|
docIds[i]);
|
|
// reset it
|
|
//msg20->reset();
|
|
// count reply as back now
|
|
m_numMsg20Replies++;
|
|
}
|
|
m_launchedAll = true;
|
|
}
|
|
// wait for one reply per related docid
|
|
if ( doRelatedQueries && m_numMsg20Replies < m_numMsg20Requests )
|
|
return (SafeBuf *)-1;
|
|
*/
|
|
|
|
//
|
|
//
|
|
// 2. send one msg8e request to each host with those termlistbufs
|
|
//
|
|
// it has one termlistbuf per relateddocid, enough info
|
|
// for handlerequest8e to return the list of matching QueryLinks
|
|
// intersected for all related docids.
|
|
//
|
|
if ( m_numMsg8eRequests == 0 ) {
|
|
SafeBuf request;
|
|
// how big is the request?
|
|
int32_t need = 0;
|
|
need += 1; // for the byte flag
|
|
int32_t collLen = gbstrlen(cr->m_coll);
|
|
need += collLen + 1;
|
|
// list of docids (just one for matching queries)
|
|
need += 4;
|
|
need += docIdList->length();
|
|
|
|
// twidtable alloc
|
|
if ( doRelatedQueries ) {
|
|
need += 4;
|
|
need += mainUrlTwidBuf32->length();
|
|
}
|
|
|
|
//if ( doMatchingQueries ) {
|
|
// // just our main url's termlistbuf
|
|
// need += 4;
|
|
// need += tlistBuf->length();
|
|
// need += 4;
|
|
// need += twidBuf32->length();
|
|
//}
|
|
|
|
//
|
|
// make the 8e request
|
|
//
|
|
if ( ! request.reserve ( need ,"rep8ebuf" ) )
|
|
return NULL;
|
|
// first store flag to indicate if getting matching or
|
|
// related queries
|
|
if ( doMatchingQueries ) request.pushChar(1);
|
|
else request.pushChar(0);
|
|
// then coll\0
|
|
request.safeMemcpy ( cr->m_coll, collLen );
|
|
request.pushChar ( 0 );
|
|
// then docids after the collection name
|
|
request.pushLong ( docIdList->length() );
|
|
request.safeMemcpy ( docIdList );
|
|
|
|
// then if doing related queries we need to store our
|
|
// 32-bit twids of the main url for setting m_uniqueRound
|
|
if ( doRelatedQueries ) {
|
|
request.pushLong(mainUrlTwidBuf32->length());
|
|
request.safeMemcpy(mainUrlTwidBuf32->getBufStart(),
|
|
mainUrlTwidBuf32->length() );
|
|
}
|
|
/*
|
|
// then store each termlistbuf from each msg20
|
|
for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ) {
|
|
// int16_tcut
|
|
Msg20 *mp = &mps[i];
|
|
Msg20Reply *rep = mp->getReply();
|
|
if ( rep ) {
|
|
request.pushLong ( rep->size_tlistBuf );
|
|
request.safeMemcpy ( rep->ptr_tlistBuf ,
|
|
rep->size_tlistBuf );
|
|
// then the 32-bit termid buf with synonyms
|
|
// that the above posdblist termlists don't
|
|
// have so we can match queries
|
|
request.pushLong ( rep->size_tiBuf );
|
|
request.safeMemcpy ( rep->ptr_tiBuf,
|
|
rep->size_tiBuf );
|
|
}
|
|
// make them empty i guess
|
|
else {
|
|
request.pushLong ( 0 );
|
|
request.pushLong ( 0 );
|
|
}
|
|
}
|
|
*/
|
|
/*
|
|
// just our main url's termlistbuf
|
|
if ( doMatchingQueries ) {
|
|
request.pushLong (tlistBuf->length());
|
|
request.safeMemcpy (tlistBuf);
|
|
// then the 32-bit termid buf with synonyms that
|
|
// the above posdblist termlists don't have so
|
|
// we can match queries
|
|
request.pushLong (twidBuf32->length());
|
|
request.safeMemcpy (twidBuf32);
|
|
|
|
}
|
|
*/
|
|
// sanity
|
|
if ( request.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do not free it here, let udpserver free it
|
|
char *req = request.getBufStart();
|
|
int32_t reqSize = request.length();
|
|
request.detachBuf();
|
|
|
|
// we've formulated the 8e request, no need for msg20s anymore
|
|
//for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ){
|
|
// // int16_tcut
|
|
// Msg20 *mp = &mps[i];
|
|
// mp->destructor();
|
|
//}
|
|
// free the mem as well
|
|
//m_msg20Buf.purge();
|
|
|
|
// must be host #0 for this next algo to work
|
|
if ( g_hostdb.m_hostId != 0 ) { char *xx=NULL;*xx=0; }
|
|
//
|
|
// send msg8e request to each host. skip if dead.
|
|
//
|
|
for ( int32_t k = 1; k <= g_hostdb.m_numHosts ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// send to ourselves last so we can do all in parallel
|
|
int32_t hosti = k;
|
|
if ( k == g_hostdb.m_numHosts ) hosti = 0;
|
|
// get ptr to the host
|
|
Host *host = g_hostdb.getHost(hosti);
|
|
// get hostid of host #i
|
|
int32_t hostId = host->m_hostId;
|
|
if ( hostId != hosti ) { char *xx=NULL;*xx=0; }
|
|
// count it
|
|
m_numMsg8eRequests++;
|
|
// skip if dead. i guess no queries from that guy. we
|
|
// can't send to a twin because the twin does not have
|
|
// the same queries in its in-memory query log.
|
|
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive) {
|
|
log("seo: skipping msg8e to dead host %" INT32 "",
|
|
hostId);
|
|
m_msg8eReply [hostId] = NULL;
|
|
m_msg8eReplySize[hostId] = 0;
|
|
m_numMsg8eReplies++;
|
|
continue;
|
|
}
|
|
// . send request to him
|
|
// . reply is the query strings
|
|
// . when reply comes in we store it in the query
|
|
// string buf and make the QueryLinks reference it
|
|
// with their QueryLink::m_queryStringOffset
|
|
if ( ! g_udpServer.sendRequest ( req ,
|
|
reqSize ,
|
|
0x8e , // msgtype
|
|
host->m_ip , // ip
|
|
host->m_port , // port
|
|
hostId,
|
|
NULL, // retslot
|
|
this,
|
|
gotMsg8eReplyWrapper,
|
|
999999, // timeout
|
|
-1 , // backoff
|
|
-1 , // maxwait
|
|
NULL, // replybuf
|
|
0, // replybufmaxsize
|
|
m_niceness // niceness
|
|
)) {
|
|
// let admin know about error
|
|
log("seopipe: sendRequest 8e had error: %s",
|
|
mstrerror(g_errno));
|
|
// count it as replied then
|
|
m_numMsg8eReplies++;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// this should never happen now with our new wrapper
|
|
if ( m_numMsg8eReplies < m_numMsg8eRequests )
|
|
return (SafeBuf *)-1;
|
|
|
|
//
|
|
//
|
|
// 3. MERGE the msg8e replies from all hosts
|
|
//
|
|
//
|
|
|
|
// gotMsg8eReplyWrapper() should have recorded each reply into
// m_msg8eReply[i]. set up for merging.
|
|
char *bestPtr[MAX_HOSTS];
|
|
char *bufEnd [MAX_HOSTS];
|
|
for ( int32_t i = 0; i < g_hostdb.m_numHosts ; i++ ) {
|
|
char *reply = m_msg8eReply [i];
|
|
// this happens if host is dead...
|
|
if ( ! reply ) {
|
|
bestPtr[i] = NULL;
|
|
bufEnd [i] = NULL;
|
|
continue;
|
|
}
|
|
//int32_t replySize = m_msg8eReplySize [i];
|
|
// it should be a list of QueryLinks
|
|
char *p = reply;
|
|
int32_t queryLinkBufSize = *(int32_t *)p;
|
|
p += 4;
|
|
bestPtr[i] = p;
|
|
// bufEnd[i] also marks the start of the querystringbuf
|
|
bufEnd [i] = p + queryLinkBufSize;
|
|
}
|
|
int32_t count = 0;
|
|
int32_t maxQueryLinks = MAX_RELATED_QUERIES;
|
|
if ( doMatchingQueries ) maxQueryLinks = MAX_MATCHING_QUERIES;
|
|
|
|
// now merge the top "max" highest scoring
|
|
// QueryLinks and their corresponding QueryLogEntries into
|
|
// m_queryLinkBuf/m_queryLinkStringBuf
|
|
storeMore:
|
|
// get the max scoring QueryLink from the 8e replies
|
|
int32_t maxi = -1;
|
|
float maxScore = -1.0;
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// skip if exhausted
|
|
if ( bestPtr[i] >= bufEnd[i] ) continue;
|
|
// cast it
|
|
QueryLink *qk = (QueryLink *)bestPtr[i];
|
|
// sanity: core if this is not a list head
|
|
if ( ! qk->m_isFirst ) { char *xx=NULL;*xx=0; }
|
|
// skip if score is not the current maximum
|
|
if ( qk->m_totalQueryImportance < maxScore ) continue;
|
|
// we got a new max!
|
|
maxScore = qk->m_totalQueryImportance;
|
|
maxi = i;
|
|
}
|
|
|
|
// store max into m_queryLinkBuf and m_queryLinkStringBuf
|
|
if ( maxi >= 0 ) {
|
|
// shortcut
|
|
QueryLink *best = (QueryLink *)bestPtr[maxi];
|
|
// get # to copy
|
|
int32_t toCopy = sizeof(QueryLink);
|
|
if ( doRelatedQueries )
|
|
// how many querylinks in this list? i.e. those
|
|
// that all share the same query, but different
|
|
// relateddocid?
|
|
toCopy = best->m_numInList * sizeof(QueryLink);
|
|
// copy the querylink
|
|
if ( ! m_queryLinkBuf.reserve ( toCopy ) ) return NULL;
|
|
// point to it
|
|
QueryLink *qk = (QueryLink *)m_queryLinkBuf.getBuf();
|
|
// THEN store it
|
|
m_queryLinkBuf.safeMemcpy( best , toCopy );
|
|
// point to its querylogentry buf, it occurs right
|
|
// after the list of QueryLinks!
|
|
char *p = bufEnd[maxi];
|
|
// and the query it is for
|
|
p += qk->m_queryStringOffset;
|
|
// cast that
|
|
QueryLogEntry *qe = (QueryLogEntry *)p;
|
|
// ensure enough space
|
|
if ( ! m_queryLinkStringBuf.reserve(qe->getSize(),"rqbb" ) )
|
|
return NULL;
|
|
// we are moving it into the final buf
|
|
qk->m_queryStringOffset = m_queryLinkStringBuf.length();
|
|
// store query log entry here now
|
|
m_queryLinkStringBuf.safeMemcpy ( qe, qe->getSize() );
|
|
// advance
|
|
bestPtr[maxi] += toCopy;
|
|
}
|
|
// limit
|
|
if ( ++count < maxQueryLinks ) goto storeMore;
|
|
// liberate those msg20 reply buffers
|
|
for ( int32_t i = 0; i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg8eReply[i] ) continue;
|
|
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
|
|
m_msg8eReply[i] = NULL;
|
|
}
|
|
// reset our parms if we are re-called for related queries
|
|
m_numMsg8eReplies = 0;
|
|
m_numMsg8eRequests = 0;
|
|
m_queryLinkBufValid = true;
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginRelatedQueries;
|
|
log("seopipe: getrelatedquerybuftook %" INT64 " ms",took);
|
|
m_beginRelatedQueries = 0LL;
|
|
|
|
// validate
|
|
m_queryLinkBufValid = true;
|
|
|
|
/*
|
|
// log for debug
|
|
qks = (QueryLink *)m_queryLinkBuf->getBufStart();
|
|
nks = m_queryLinkBuf->length() / sizeof(QueryLink);
|
|
for ( int32_t k = 0 ; k < nks ; k++ ) {
|
|
// now we use offsets into m_relatedQueryBuf.m_buf[]
|
|
QueryRel *qk = &qks[k];
|
|
// skip if not a head
|
|
if ( ! qk->m_isFirst ) continue;
|
|
char *qstr = qk->getQueryString(&m_queryLinkStringBuf);
|
|
log("seopipe: relquery=\"%s\" imp=%f votes=%" INT32 "",
|
|
qstr,
|
|
qk->m_rq_totalScore,
|
|
qk->m_docIdVotes);
|
|
}
|
|
*/
|
|
|
|
return &m_queryLinkBuf;
|
|
}
|
|
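// Illustrative sketch (hypothetical helper, not called anywhere): the merge
// in getQueryLinkBuf() above is a k-way selection. Each host's msg8e reply
// is already ordered by importance, so every pass just takes the best list
// head among the per-host cursors, exactly like the maxi/maxScore scan.
static int32_t pickBestReplyCursor ( QueryLink **cursor , QueryLink **bufEnd ,
				     int32_t numHosts ) {
	int32_t best = -1;
	float bestScore = -1.0;
	for ( int32_t i = 0 ; i < numHosts ; i++ ) {
		// skip exhausted (or dead-host) lists
		if ( ! cursor[i] || cursor[i] >= bufEnd[i] ) continue;
		if ( cursor[i]->m_totalQueryImportance < bestScore ) continue;
		bestScore = cursor[i]->m_totalQueryImportance;
		best = i;
	}
	// -1 once every host's list is exhausted
	return best;
}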
|
|
// scan matches like XmlDoc::getSummary() does and get all sentences
|
|
// containing a query term...
|
|
//void XmlDoc::getGigabitExcerpts ( ) {
|
|
//}
|
|
|
|
|
|
// this is still used by Title.cpp to get the title: field quickly
|
|
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {
|
|
|
|
if ( ! json ) return NULL;
|
|
|
|
// get length
|
|
int32_t fieldLen = gbstrlen(field);
|
|
// keep track of in a quote or not
|
|
bool inQuotes = false;
|
|
char *stringStart = NULL;
|
|
char *p = json;
|
|
bool gotOne = false;
|
|
int32_t depth = 0;
|
|
// scan
|
|
for ( ; *p ; p++ ) {
|
|
// escaping a quote? ignore quote then.
|
|
if ( *p == '\\' && p[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
p++;
|
|
continue;
|
|
}
|
|
// count {} depth
|
|
if ( ! inQuotes ) {
|
|
if ( *p == '{' ) depth++;
|
|
if ( *p == '}' ) depth--;
|
|
}
|
|
// a quote?
|
|
if ( *p == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
// set start of the string if quote is beginning
|
|
if ( inQuotes ) stringStart = p + 1;
|
|
// if quote is ending and a colon follows then
|
|
// it was a json field name. so if it matches the
|
|
// field we want return the following field for it.
|
|
else if ( ! inQuotes &&
|
|
! gotOne &&
|
|
p[1] == ':' &&
|
|
// {"title":"whatever",...}
|
|
// could be product:{title:... depth=2
|
|
(depth == 1 ||depth==2) &&
|
|
stringStart &&
|
|
(p - stringStart) == fieldLen &&
|
|
strncmp(field,stringStart,fieldLen)==0 ) {
|
|
// now, the next time we set stringStart
|
|
// it will be set to the VALUE of this field
|
|
// assuming the field is a STRING!!!!
|
|
gotOne = true;
|
|
// return after the quote
|
|
//return p + 2;
|
|
}
|
|
// ok, we got the string after the field string...
|
|
else if ( ! inQuotes && gotOne ) {
|
|
if ( valueLen ) *valueLen = p - stringStart;
|
|
return stringStart;
|
|
}
|
|
// keep chugging
|
|
continue;
|
|
}
|
|
}
|
|
// done, not found
|
|
return NULL;
|
|
}
|
|
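// Illustrative usage of getJSONFieldValue() above (hypothetical input).
// Note the returned pointer is NOT NUL-terminated; *valueLen holds the
// length of the value string.
static void exampleGetJSONFieldValue ( ) {
	char json[] = "{\"title\":\"Cool Page\",\"url\":\"http://www.example.com/\"}";
	int32_t vlen = 0;
	char *v = getJSONFieldValue ( json , (char *)"title" , &vlen );
	if ( v ) log("seo: title=\"%.*s\"", (int)vlen, v); // title="Cool Page"
}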
|
|
|
|
Json *XmlDoc::getParsedJson ( ) {
|
|
|
|
if ( m_jpValid ) return &m_jp;
|
|
|
|
// core if not a json object
|
|
if ( m_contentTypeValid && m_contentType != CT_JSON &&
|
|
// spider status docs are now really json
|
|
m_contentType != CT_STATUS ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// \0 terminated
|
|
char **pp = getUtf8Content();
|
|
if ( ! pp || pp == (void *)-1 ) return (Json *)pp;
|
|
|
|
// point to the json
|
|
char *p = *pp;
|
|
|
|
// empty? all done then.
|
|
//if ( ! p ) return (char *)pp;
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . if p is NULL i guess this should still be ok and be empty
|
|
if ( ! m_jp.parseJsonStringIntoJsonItems ( p , m_niceness ) ) {
|
|
g_errno = EBADJSONPARSER;
|
|
return NULL;
|
|
}
|
|
|
|
m_jpValid = true;
|
|
return &m_jp;
|
|
}
|
|
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . hash each json VALUE (not FIELD) ... AND ... hash each json
|
|
// VALUE with its FIELD like "title:cool" or "description:whatever"
|
|
// . example:
|
|
// [{"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":1378322570280,"matched":64,"status":"Stopped","start":1378322184332,"token":"poo","parameterMap":{"token":"poo","seed":"www.alleyinsider.com","api":"article"},"crawled":64},{"id":"830e0584-7f69-4bdd-
|
|
|
|
#include "Json.h"
|
|
|
|
char *XmlDoc::hashJSONFields ( HashTableX *table ) {
|
|
|
|
setStatus ( "hashing json fields" );
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = table;
|
|
hi.m_desc = "json object";
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
|
|
|
|
return hashJSONFields2 ( table , &hi , jp , true );
|
|
}
|
|
|
|
|
|
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
|
|
HashInfo *hi , Json *jp ,
|
|
bool hashWithoutFieldNames ) {
|
|
|
|
JsonItem *ji = jp->getFirstItem();
|
|
|
|
char nb[1024];
|
|
SafeBuf nameBuf(nb,1024);
|
|
|
|
//int32_t totalHash32 = 0;
|
|
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not number or string
|
|
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
|
continue;
|
|
// reset, but don't free mem etc. just set m_length to 0
|
|
nameBuf.reset();
|
|
|
|
// get its full compound name like "meta.twitter.title"
|
|
JsonItem *p = ji;
|
|
char *lastName = NULL;
|
|
char *nameArray[20];
|
|
int32_t numNames = 0;
|
|
for ( ; p ; p = p->m_parent ) {
|
|
// empty name?
|
|
if ( ! p->m_name ) continue;
|
|
if ( ! p->m_name[0] ) continue;
|
|
// dup? can happen with arrays. parent of string
|
|
// in object, has same name as his parent, the
|
|
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
|
|
if ( p->m_name == lastName ) continue;
|
|
// update
|
|
lastName = p->m_name;
|
|
// add it up
|
|
nameArray[numNames++] = p->m_name;
|
|
// breach?
|
|
if ( numNames < 15 ) continue;
|
|
log("build: too many names in json tag");
|
|
break;
|
|
}
|
|
|
|
// if we are the diffbot reply "html" field do not hash this
|
|
// because it is redundant and it hashes html tags etc.!
|
|
// plus it slows us down a lot and bloats the index.
|
|
if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"html")==0)
|
|
continue;
|
|
|
|
// assemble the names in reverse order which is correct order
|
|
for ( int32_t i = 1 ; i <= numNames ; i++ ) {
|
|
// copy into our safebuf
|
|
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
|
|
return NULL;
|
|
// separate names with periods
|
|
if ( ! nameBuf.pushChar('.') ) return NULL;
|
|
}
|
|
// remove last period
|
|
nameBuf.removeLastChar('.');
|
|
// and null terminate
|
|
if ( ! nameBuf.nullTerm() ) return NULL;
|
|
// change all :'s in names to .'s since : is reserved!
|
|
char *px = nameBuf.getBufStart();
|
|
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
|
|
//for ( px = nameBuf.getBufStart(); *px ; px++ ) if ( *px == '-' ) *px = '_';
|
|
//
|
|
// DIFFBOT special field hacks
|
|
//
|
|
char *name = nameBuf.getBufStart();
|
|
hi->m_hashGroup = HASHGROUP_BODY;
|
|
if ( strstr(name,"title") )
|
|
hi->m_hashGroup = HASHGROUP_TITLE;
|
|
if ( strstr(name,"url") )
|
|
hi->m_hashGroup = HASHGROUP_INURL;
|
|
if ( strstr(name,"resolved_url") )
|
|
hi->m_hashGroup = HASHGROUP_INURL;
|
|
if ( strstr(name,"tags") )
|
|
hi->m_hashGroup = HASHGROUP_INTAG;
|
|
if ( strstr(name,"meta") )
|
|
hi->m_hashGroup = HASHGROUP_INMETATAG;
|
|
//
|
|
// now Json.cpp decodes and stores the value into
|
|
// a buffer, so ji->getValue() should be decoded completely
|
|
//
|
|
|
|
// . get the value of the json field
|
|
// . if it's a number or bool it converts into a string
|
|
int32_t vlen;
|
|
char *val = ji->getValueAsString( &vlen );
|
|
char tbuf[32];
|
|
|
|
// if the value is clearly a date, just hash it as
|
|
// a number, so use a temporary value that holds the
|
|
// time_t and hash with that... this will hash
|
|
// diffbot's article date field as a number so we can
|
|
// sortby and constrain by it in the search results
|
|
if ( name && (strcasecmp(name,"date") == 0 || strcasecmp(name,"estimatedDate") == 0)) {
|
|
// this is in HttpMime.cpp
|
|
int64_t tt = atotime1 ( val );
|
|
// we can't store 64-bit dates... so truncate to -2147483648
|
|
// which is Dec 13 1901. so we don't quite get the 1898 date
|
|
// for the new york times dbpedia entry. maybe if we added
|
|
// an extra termlist for more precision to indicate century or
|
|
// something.
|
|
if ( tt && tt < (int32_t)0x80000000 )
|
|
tt = (int32_t)0x80000000;
|
|
// likewise, we can't be too big, past 2038
|
|
if ( tt && tt > 0x7fffffff )
|
|
tt = (int32_t)0x7fffffff;
|
|
if ( tt ) {
|
|
// print out the time_t in ascii
|
|
vlen = sprintf(tbuf,"%" INT32 "",(int32_t)tt);
|
|
// and point to it for hashing/indexing
|
|
val = tbuf;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// for deduping search results we set m_contentHash32 here for
|
|
// diffbot json objects.
|
|
// we can't do this here anymore, we have to set the
|
|
// contenthash in ::getContentHash32() because we need it to
|
|
// set EDOCUNCHANGED in ::getIndexCode() above.
|
|
//
|
|
/*
|
|
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
|
|
// make the content hash so we can set m_contentHash32
|
|
// for deduping
|
|
int32_t nh32 = hash32n ( name );
|
|
// do an exact hash for now...
|
|
int32_t vh32 = hash32 ( val , vlen , m_niceness );
|
|
// accumulate, order independently
|
|
totalHash32 ^= nh32;
|
|
totalHash32 ^= vh32;
|
|
}
|
|
*/
|
|
|
|
// index like "title:whatever"
|
|
hi->m_prefix = name;
|
|
hashString ( val , vlen , hi );
|
|
|
|
//log("hashing json var as %s %s %d", name, val, vlen);
|
|
|
|
// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
|
|
if ( name )
|
|
hashFieldMatchTerm ( val , (int32_t)vlen , hi );
|
|
|
|
if ( ! hashWithoutFieldNames )
|
|
continue;
|
|
|
|
// hash without the field name as well
|
|
hi->m_prefix = NULL;
|
|
hashString ( val , vlen , hi );
|
|
|
|
/*
|
|
// a number? hash special then as well
|
|
if ( ji->m_type != JT_NUMBER ) continue;
|
|
|
|
// use prefix for this though
|
|
hi->m_prefix = name;
|
|
|
|
// hash as a number so we can sort search results by
|
|
// this number and do range constraints
|
|
float f = ji->m_valueDouble;
|
|
if ( ! hashNumber2 ( f , hi ) )
|
|
return NULL;
|
|
*/
|
|
}
|
|
|
|
//m_contentHash32 = totalHash32;
|
|
//m_contentHash32Valid = true;
|
|
|
|
return (char *)0x01;
|
|
}
|
|
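// Illustrative sketch (hypothetical helper, not called anywhere) of the
// naming scheme hashJSONFields2() above uses: parent names collected in
// child-to-parent order are joined with '.' in reverse, and any ':' is
// rewritten to '.' since ':' separates field from value in query syntax.
static void buildCompoundName ( char **names , int32_t numNames ,
				SafeBuf *out ) {
	// names[] is child-to-parent, so append in reverse order
	for ( int32_t i = numNames - 1 ; i >= 0 ; i-- ) {
		out->safeStrcpy ( names[i] );
		if ( i > 0 ) out->pushChar ('.');
	}
	out->nullTerm();
	for ( char *p = out->getBufStart() ; *p ; p++ )
		if ( *p == ':' ) *p = '.';
}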
|
|
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
|
|
|
|
setStatus ( "hashing xml fields" );
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = table;
|
|
hi.m_desc = "xml object";
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
|
|
Xml *xml = getXml();
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes ();
|
|
|
|
SafeBuf nameBuf;
|
|
|
|
// scan the xml nodes
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . skip if it's a tag not text node skip it
|
|
// . we just want the "text" nodes
|
|
if ( nodes[i].isTag() ) continue;
|
|
|
|
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
|
|
// log("hey:hy");
|
|
|
|
// assemble the full parent name
|
|
// like "tag1.tag2.tag3"
|
|
nameBuf.reset();
|
|
xml->getCompoundName ( i , &nameBuf );
|
|
|
|
// this is \0 terminated
|
|
char *tagName = nameBuf.getBufStart();
|
|
|
|
// get the utf8 text
|
|
char *val = nodes[i].m_node;
|
|
int32_t vlen = nodes[i].m_nodeLen;
|
|
|
|
// index like "title:whatever"
|
|
if ( tagName && tagName[0] ) {
|
|
hi.m_prefix = tagName;
|
|
hashString ( val , vlen , &hi );
|
|
}
|
|
|
|
// hash without the field name as well
|
|
hi.m_prefix = NULL;
|
|
hashString ( val , vlen , &hi );
|
|
}
|
|
|
|
return (char *)0x01;
|
|
}
|
|
|
|
// if our url is that of a subdoc, then get the url of the parent doc
|
|
// from which we were a subsection
|
|
char *XmlDoc::getDiffbotParentUrl( char *myUrl ) {
|
|
// remove -diffbotxyz
|
|
if ( ! m_kbuf.safeStrcpy( myUrl ) ) return NULL;
|
|
char *p = m_kbuf.getBufStart();
|
|
char *s = strstr(p,"-diffbotxyz");
|
|
if ( s ) { *s = '\0'; return p; }
|
|
// temporarily until we inject "diffbotreply" uncomment this
|
|
/*
|
|
// otherwise i guess we got dan's format of -article|%" INT32 "|%" INT32 "
|
|
char *e = m_kbuf.getBuf() - 1;
|
|
for ( ; *e && is_digit(*e) ; e-- );
|
|
if ( *e != '|' ) return NULL;
|
|
e--;
|
|
for ( ; *e && is_digit(*e) ; e-- );
|
|
if ( *e != '|' ) return NULL;
|
|
e--;
|
|
// now to hyphen
|
|
char *estart = m_kbuf.getBufStart();
|
|
for ( ; e>estart && *e !='-' ; e-- );
|
|
if ( *e != '-' ) return NULL;
|
|
*e = '\0';
|
|
return p;
|
|
*/
|
|
return NULL;
|
|
}
|
|
|
|
bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
|
|
|
|
// sanity
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
storeFacetValuesSite ( qs, sb, fvh );
|
|
|
|
if ( m_hasMetadata) {
|
|
Json jpMetadata;
|
|
if (jpMetadata.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
|
|
storeFacetValuesJSON ( qs, sb, fvh, &jpMetadata );
|
|
}
|
|
}
|
|
|
|
// if "qa" is a gbxpathsitehash123456 type of beastie then we
|
|
// gotta scan the sections
|
|
if ( strncasecmp(qs,"gbxpathsitehash",15) == 0 )
|
|
return storeFacetValuesSections ( qs , sb , fvh );
|
|
|
|
// if a json doc, get json field
|
|
// spider status docs are really json now
|
|
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
|
|
return storeFacetValuesJSON ( qs , sb , fvh, getParsedJson());
|
|
|
|
|
|
if ( m_contentType == CT_HTML )
|
|
return storeFacetValuesHtml ( qs , sb , fvh );
|
|
|
|
if ( m_contentType == CT_XML )
|
|
return storeFacetValuesXml ( qs , sb , fvh );
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|
// Store facet for site
|
|
bool XmlDoc::storeFacetValuesSite ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
|
|
|
|
char* val = getSite();
|
|
int vlen = gbstrlen(val);
|
|
FacetValHash_t val32 = hash32 ( val , vlen );
|
|
|
|
|
|
// skip if not for us
|
|
if ( fvh && val32 != fvh ) return false;
|
|
if ( strcmp("gbtagsite",qs) ) return false;
|
|
|
|
|
|
// otherwise add facet FIELD to our buf
|
|
if ( ! sb->safeStrcpy(qs) ) return false;
|
|
if ( ! sb->pushChar('\0') ) return false;
|
|
|
|
// then add facet VALUE
|
|
if ( !sb->safePrintf("%" UINT32 ",",(uint32_t)val32)) return false;
|
|
if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
|
|
if ( ! sb->pushChar('\0') ) return false;
|
|
|
|
return true;
|
|
}
|
|
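// Illustrative sketch (hypothetical helper, not called anywhere): each entry
// the storeFacetValues*() functions append to "sb" is a pair of
// NUL-terminated strings -- the facet field name, then the value (prefixed
// with its 32-bit hash and a comma for site/string facets). Walking a
// filled buffer back out looks like this.
static void printFacetEntries ( char *p , char *pend ) {
	while ( p < pend ) {
		char *field = p; p += gbstrlen(field) + 1;
		char *value = p; p += gbstrlen(value) + 1;
		log("seo: facet %s -> %s", field, value);
	}
}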
|
|
bool XmlDoc::storeFacetValuesSections ( char *qs , SafeBuf *sb ,
|
|
FacetValHash_t fvh ) {
|
|
|
|
// scan all sections
|
|
Sections *ss = getSections();
|
|
if ( ! ss ) return false;
|
|
if ( ss == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww ) return false;
|
|
if ( ww == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t siteHash32 = *getSiteHash32();
|
|
|
|
// qs is like gbxpathsitehash1234567
|
|
// so get the digit part
|
|
char *p = qs;
|
|
for ( ; *p && ! is_digit(*p); p++ );
|
|
uint64_t xsh = (uint64_t)atoll(p);
|
|
|
|
bool isString = false;
|
|
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
|
|
|
|
Section *si = ss->m_rootSection;
|
|
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// is it a match?
|
|
uint64_t mod;
|
|
mod = (uint32_t)si->m_turkTagHash32;
|
|
mod ^= (uint32_t)siteHash32;
|
|
if ( mod != xsh ) continue;
|
|
// . then add facet VALUE
|
|
// . hash of the innerhtml of sentence
|
|
// . get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
|
|
if ( ! val32 ) continue;
|
|
// if a facetvalhash was provided we must match
|
|
if ( fvh && val32 != fvh ) continue;
|
|
// got one print the facet field
|
|
if ( ! sb->safeStrcpy(qs) ) return false;
|
|
if ( ! sb->pushChar('\0') ) return false;
|
|
if ( isString && ! sb->safePrintf("%" UINT32 ",",val32) )
|
|
return false;
|
|
// put ALSO print the string somewhat
|
|
char *a = m_words.m_words[si->m_next->m_a];
|
|
char *b = m_words.m_words[si->m_next->m_b-1];
|
|
b += m_words.m_wordLens [si->m_next->m_b-1];
|
|
if ( ! sb->safeTruncateEllipsis (a,b-a,160) ) return false;
|
|
if ( ! sb->pushChar('\0') ) return false;
|
|
// if wanted a specific string, we are done
|
|
if ( fvh ) return true;
|
|
}
|
|
return true;
|
|
}
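
// Note on the matching above: the number embedded in qs (the "1234567" in
// "gbxpathsitehash1234567") is compared against
// (uint32_t)si->m_turkTagHash32 ^ (uint32_t)siteHash32, so a section matches
// only when that xpath/site hash combination equals the number the caller
// put into the facet term. The value stored for a match is the 32-bit
// truncation of si->m_indirectSentHash64 followed by up to 160 bytes of the
// section's text.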

bool XmlDoc::storeFacetValuesHtml ( char *qs, SafeBuf *sb, FacetValHash_t fvh ) {

	Xml *xml = getXml();

	int32_t qsLen = gbstrlen(qs);

	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	// check for gblang:en etc.
	// if ( isString && strncmp(qs,"gblang",6)==0 ) {
	//	if (!sb->safeStrcpy(qs) ) return false;
	//	if (!sb->pushChar('\0') ) return false;
	//	// find the lang that has that hash!
	//	if (!sb->safePrintf("%" UINT32 ",",(uint32_t)val32))return false;
	//	if (!sb->safeMemcpy(content,contentLen) ) return false;
	//	if (!sb->pushChar('\0') ) return false;
	// }

	char *content;
	int32_t contentLen;
	int32_t nameLen;
	char *s;
	int32_t i = 0;

	bool uniqueField = false;

	// a title tag can count now too
	if ( strcmp(qs,"title") == 0 ) {
		// skip leading spaces = false
		content = xml->getString ("title",&contentLen,false);
		uniqueField = true;
		goto skip;
	}

	// scan the meta nodes for one whose name matches "qs"
	for ( i = 0 ; i < xml->m_numNodes ; i++ ) {

		// continue if not a meta tag
		if ( xml->m_nodes[i].m_nodeId != TAG_META ) continue;
		// . does its name field match "qs"?
		// . <meta name=summary content="...">
		// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
		s = xml->getString ( i , "name", &nameLen );
		// "s" can be "summary","description","keywords",...
		if ( nameLen != qsLen ) continue;
		if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
		// point to the content itself
		content = xml->getString ( i , "content" , &contentLen );
		if ( ! content || contentLen <= 0 ) continue;

	skip:
		// hash it to match it if caller specified a particular hash
		// because they are coming from Msg40::lookUpFacets() function
		// to convert the hashes to strings, like for rendering in
		// the facets box to the left of the search results
		FacetValHash_t val32 = hash32 ( content, contentLen);
		if ( fvh && fvh != val32 ) continue;

		// otherwise add facet FIELD to our buf
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// then add facet VALUE
		if ( isString && !sb->safePrintf("%" UINT32 ",",(uint32_t)val32))
			return false;
		if ( !sb->safeMemcpy(content,contentLen) ) return false;
		if ( !sb->pushChar('\0') ) return false;

		// if only one was specified, we are done
		if ( fvh ) return true;

		if ( uniqueField ) return true;
	}

	return true;
}
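
// For example (illustrative markup only): with qs = "description", a node
// like <meta name="description" content="A page about foo"> makes the loop
// above append "description\0A page about foo\0" to "sb"; the "<hash32>,"
// prefix is only added when qs was preceded by "str:" in the facet term.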

bool XmlDoc::storeFacetValuesXml ( char *qs, SafeBuf *sb, FacetValHash_t fvh ) {

	Xml *xml = getXml();

	int32_t qsLen = gbstrlen(qs);

	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	int32_t i = 0;

	bool uniqueField = false;

	SafeBuf nameBuf;

	// scan all nodes for a tag whose compound name matches "qs"
	for ( i = 0 ; i < xml->m_numNodes ; i++ ) {

		// skip text nodes
		if ( xml->m_nodes[i].m_nodeId == 0 ) continue;

		// assemble the full parent name
		// like "tag1.tag2.tag3"
		nameBuf.reset();
		xml->getCompoundName ( i , &nameBuf );
		int32_t nameLen = nameBuf.length();
		char *s = nameBuf.getBufStart();
		//s = xml->getString ( i , "name", &nameLen );

		// "s" is the compound tag name
		if ( nameLen != qsLen ) continue;
		if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;

		// got it... but we need a following node to hold the text
		if ( i + 1 >= xml->m_numNodes ) continue;

		// the content should be in the following text node;
		// skip if not a text node, we don't return tag nodes
		if ( xml->m_nodes[i+1].m_nodeId ) continue;

		char *content = xml->m_nodes[i+1].m_node;
		int32_t contentLen = xml->m_nodes[i+1].m_nodeLen;

		// skip if empty
		if ( ! content || contentLen <= 0 ) continue;

		// skip common cases too, like a lone whitespace character
		if ( contentLen == 1 && is_wspace_a(content[0]) ) continue;

		// hash it to match it if caller specified a particular hash
		// because they are coming from Msg40::lookUpFacets() function
		// to convert the hashes to strings, like for rendering in
		// the facets box to the left of the search results
		FacetValHash_t val32 = hash32 ( content, contentLen);
		if ( fvh && fvh != val32 ) continue;

		// otherwise add facet FIELD to our buf
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// then add facet VALUE
		if ( isString && !sb->safePrintf("%" UINT32 ",",(uint32_t)val32))
			return false;
		if ( !sb->safeMemcpy(content,contentLen) ) return false;
		if ( !sb->pushChar('\0') ) return false;

		// if only one was specified, we are done
		if ( fvh ) return true;

		if ( uniqueField ) return true;
	}

	return true;
}
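
// For example (illustrative markup only): with qs = "channel.title", XML
// like <channel><title>My Feed</title></channel> presumably yields a
// compound tag name of "channel.title" whose following text node is
// "My Feed", so the loop above appends "channel.title\0My Feed\0" to "sb".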

bool XmlDoc::storeFacetValuesJSON ( char *qs,
				    SafeBuf *sb,
				    FacetValHash_t fvh,
				    Json *jp ) {

	JsonItem *ji = jp->getFirstItem();

	char nb[1024];
	SafeBuf nameBuf(nb,1024);

	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	for ( ; ji ; ji = ji->m_next ) {

		QUICKPOLL(m_niceness);

		// skip if not number or string
		if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
			continue;

		// reset, but don't free mem etc. just set m_length to 0
		nameBuf.reset();

		// get its full compound name like "meta.twitter.title"
		ji->getCompoundName ( nameBuf );

		// skip if not for us
		if ( strcmp(nameBuf.getBufStart(),qs) ) continue;

		//
		// now Json.cpp decodes and stores the value into
		// a buffer, so ji->getValue() should be decoded completely
		//
		int32_t vlen;
		char *val = ji->getValueAsString( &vlen );

		// hash it to match it if caller specified a particular hash
		// because they are coming from Msg40::lookUpFacets() function
		// to convert the hashes to strings, like for rendering in
		// the facets box to the left of the search results
		FacetValHash_t val32 = hash32 ( val , vlen );
		if ( fvh && val32 != fvh )
			continue;

		// otherwise add facet FIELD to our buf
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// then add facet VALUE
		if ( isString && !sb->safePrintf("%" UINT32 ",",(uint32_t)val32))
			return false;

		if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// if we wanted a specific string, then we are done
		if ( fvh ) return true;
	}

	return true;
}
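
// For example (illustrative JSON only): with qs = "meta.twitter.title", a
// doc like {"meta":{"twitter":{"title":"My Tweet"}}} has a JsonItem whose
// compound name is "meta.twitter.title", so the loop above appends
// "meta.twitter.title\0My Tweet\0" to "sb" (plus the "<hash32>," prefix
// when the facet term used "str:").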