xor the firstip into the doledb key this time. seems to avoid all collisions now so we don't overwrite nodes in the doledb tree.
53448 lines
1.6 MiB
53448 lines
1.6 MiB
//-*- coding: utf-8 -*-
|
|
|
|
#include "gb-include.h"
|
|
|
|
#include "hash.h"
|
|
#include "XmlDoc.h"
|
|
#include "Indexdb.h" // for TERMID_MASK definition and g_indexdb.getTermId()
|
|
#include "Conf.h"
|
|
#include "Query.h" // getFieldCode()
|
|
#include "Clusterdb.h" // g_clusterdb
|
|
#include "Categories.h" // g_categories
|
|
#include "iana_charset.h"
|
|
//#include "Checksumdb.h"
|
|
//#include "Msg24.h"
|
|
#include "Stats.h"
|
|
#include "Sanity.h"
|
|
#include "Speller.h"
|
|
#include "CountryCode.h"
|
|
//#include "SiteBonus.h"
|
|
#include "linkspam.h"
|
|
#include "Tagdb.h"
|
|
//#include "Dates.h"
|
|
#include "Repair.h"
|
|
//#include "Links.h"
|
|
#include "HashTableX.h"
|
|
#include "LanguageIdentifier.h" // g_langId
|
|
#include "CountryCode.h" // g_countryCode
|
|
#include "sort.h"
|
|
#include "Wiki.h"
|
|
#include "Speller.h"
|
|
#include "SiteGetter.h"
|
|
#include "Placedb.h"
|
|
#include "Test.h"
|
|
#include "Synonyms.h"
|
|
//#include "Revdb.h"
|
|
#include "Timedb.h"
|
|
#ifdef _USETURKS_
|
|
//#include "PageTurk.h"
|
|
#endif
|
|
#include "PageInject.h"
|
|
#include "HttpServer.h"
|
|
#include "Facebook.h"
|
|
#include "Posdb.h"
|
|
#include "Highlight.h"
|
|
#include "Wiktionary.h"
|
|
#include "seo.h" // Msg99Request etc.
|
|
//#include <regex.h>
|
|
#include "PingServer.h"
|
|
#include "Parms.h"
|
|
|
|
extern int g_inMemcpy;
|
|
|
|
//#define MAXDOCLEN (1024*1024 * 5)
|
|
//#define MAXDOCLEN (1024*1024)
|
|
|
|
HashTableX *g_ct = NULL;
|
|
XmlDoc *g_doc = NULL;
|
|
char *g_ptr = NULL;
|
|
int32_t *g_int32_t = NULL;
|
|
|
|
#define SENT_UNITS 30
|
|
|
|
static int32_t getIsContacty ( Url *url ,
|
|
LinkInfo *info1 ,
|
|
int32_t hops ,
|
|
uint8_t ct ,
|
|
bool isRoot ,
|
|
int32_t niceness );
|
|
|
|
|
|
static int32_t getTopGigabits ( HashTableX *ht ,
|
|
GigabitInfo **top ,
|
|
int32_t max ,
|
|
int32_t minDocCount ) ;
|
|
|
|
static void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
|
|
int64_t wid1 ,
|
|
int64_t pid2 ,
|
|
int64_t wid2 , // post word
|
|
float *ww ,
|
|
HashTableX *tt1 ,
|
|
int32_t titleRecVersion ) ;
|
|
|
|
static bool addGigabit ( HashTableX *ht ,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int64_t docId ,
|
|
Section *sp ,
|
|
bool singleWord ,
|
|
uint8_t langId ,
|
|
// starts with word #i
|
|
int32_t i ,
|
|
int32_t ptsArg = -1 ) ;
|
|
|
|
static bool getWordPosVec ( Words *words ,
|
|
Sections *sections,
|
|
//int32_t wordStart,
|
|
//int32_t wordEnd,
|
|
int32_t startDist,
|
|
char *fragVec,
|
|
int32_t niceness ,
|
|
SafeBuf *wpos ) ;
|
|
|
|
static void getMetaListWrapper ( void *state ) ;
|
|
|
|
char *getFirstJSONObject ( char *p ,
|
|
int32_t niceness ,
|
|
bool *isProduct ,
|
|
bool *isImage ) ;
|
|
char *getJSONObjectEnd ( char *p , int32_t niceness ) ;
|
|
|
|
void doneReadingArchiveFileWrapper ( int fd, void *state );
|
|
|
|
XmlDoc::XmlDoc() {
|
|
m_readThreadOut = false;
|
|
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) m_msg7s[i] = NULL;
|
|
m_esbuf.setLabel("exputfbuf");
|
|
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
|
|
m_freed = false;
|
|
m_contentInjected = false;
|
|
m_wasContentInjected = false;
|
|
|
|
// warc parsing stuff
|
|
m_msg7 = NULL;
|
|
m_warcError = 0;
|
|
m_arcError = 0;
|
|
m_doneInjectingWarc = false;
|
|
m_numInjectionsOut = 0;
|
|
m_fptr = NULL;
|
|
m_fptrEnd = NULL;
|
|
m_fileBuf = NULL;
|
|
m_warcContentPtr = NULL;
|
|
m_calledWgetThread = false;
|
|
|
|
//m_coll = NULL;
|
|
m_ubuf = NULL;
|
|
m_pbuf = NULL;
|
|
//m_contactDoc = NULL;
|
|
m_rootDoc = NULL;
|
|
m_oldDoc = NULL;
|
|
m_dx = NULL;
|
|
m_printedMenu = false;
|
|
// reset all *valid* flags to false
|
|
void *p = &m_VALIDSTART;
|
|
void *pend = &m_VALIDEND;
|
|
memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p
|
|
m_msg22Request.m_inUse = 0;
|
|
m_msg4Waiting = false;
|
|
m_msg4Launched = false;
|
|
//m_sectiondbData = NULL;
|
|
//m_placedbData = NULL;
|
|
m_dupTrPtr = NULL;
|
|
m_oldTitleRec = NULL;
|
|
m_filteredContent = NULL;
|
|
m_filteredContentAllocSize = 0;
|
|
m_metaList = NULL;
|
|
m_metaListSize = 0;
|
|
m_metaListAllocSize = 0;
|
|
//m_titleRec = NULL;
|
|
//m_freeTitleRec = true;
|
|
m_rootTitleRec = NULL;
|
|
m_outlinkHopCountVector = NULL;
|
|
//m_gsbuf = NULL;
|
|
m_extraDoc = NULL;
|
|
m_ahrefsDoc = NULL;
|
|
m_wikiqbuf = NULL;
|
|
//m_cr = NULL;
|
|
//m_msg3aArray = NULL;
|
|
m_msg3a = NULL;
|
|
m_query3a = NULL;
|
|
//m_numMsg99Replies = 0;
|
|
m_numMsg95Replies = 0;
|
|
m_seoSocket = NULL;
|
|
m_hackSocket = NULL;
|
|
m_doingSEO = false;
|
|
//m_newxd = NULL;
|
|
//m_newxd2 = NULL;
|
|
//m_newMsg20 = NULL;
|
|
m_registeredSocketCallback = false;
|
|
//m_numMsg98Requests = 0;
|
|
//m_numMsg98Replies = 0;
|
|
m_numMsg8eReplies = 0;
|
|
m_numMsg8eRequests = 0;
|
|
m_tempMsg25Page = NULL;
|
|
m_tempMsg25Site = NULL;
|
|
m_numLinkRequestsOut = 0;
|
|
m_numLinkRequestsIn = 0;
|
|
m_numMsg3fReplies = 0;
|
|
m_numMsg3fRequests = 0;
|
|
m_numMsg4fRequests = 0;
|
|
m_numMsg4fReplies = 0;
|
|
m_sentMsg4fRequests = false;
|
|
|
|
//m_notifyBlocked = 0;
|
|
//m_mcasts = NULL;
|
|
//for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ )
|
|
// m_currentBinPtrs[i] = NULL;
|
|
m_registeredWgetReadCallback = false;
|
|
m_pipe = NULL;
|
|
reset();
|
|
};
|
|
|
|
XmlDoc::~XmlDoc() {
|
|
setStatus("freeing this xmldoc");
|
|
reset();
|
|
m_freed = true;
|
|
};
|
|
|
|
static int64_t s_lastTimeStart = 0LL;
|
|
|
|
// for debugging
|
|
class XmlDoc *g_xd;
|
|
|
|
void XmlDoc::reset ( ) {
|
|
|
|
m_zeroedOut = false;
|
|
|
|
m_oldDocExistedButHadError = false;
|
|
|
|
m_addedStatusDocId = 0;
|
|
|
|
if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
|
|
mfree ( m_diffbotProxyReply , sizeof(ProxyReply) , "dprox" );
|
|
m_diffbotProxyReply = NULL;
|
|
}
|
|
|
|
if ( m_readThreadOut )
|
|
log("build: deleting xmldoc class that has a read thread out "
|
|
"on a warc file");
|
|
|
|
if ( m_fileValid ) {
|
|
m_file.close();
|
|
m_file.unlink();
|
|
}
|
|
|
|
if ( m_fileBuf )
|
|
mfree ( m_fileBuf , m_fileBufAllocSize , "fbdd");
|
|
|
|
for ( int i = 0 ; i < MAXMSG7S ; i++ ) {
|
|
Msg7 *msg7 = m_msg7s[i];
|
|
if ( ! msg7 ) continue;
|
|
if(msg7->m_inUse) {
|
|
log("build: archive: reseting xmldoc when msg7s are outstanding");
|
|
|
|
}
|
|
mdelete ( msg7 , sizeof(Msg7) , "xdmsg7" );
|
|
delete ( msg7 );
|
|
m_msg7s[i] = NULL;
|
|
}
|
|
|
|
if ( m_msg7 ) {
|
|
mdelete ( m_msg7 , sizeof(Msg7) , "xdmsg7" );
|
|
delete ( m_msg7 );
|
|
m_msg7 = NULL;
|
|
}
|
|
m_warcContentPtr = NULL;
|
|
m_arcContentPtr = NULL;
|
|
m_anyContentPtr = NULL;
|
|
m_savedChar = '\0';
|
|
m_contentDelim = NULL;
|
|
|
|
if(m_registeredWgetReadCallback && m_pipe) {
|
|
log("build:came back from sleep callback");
|
|
g_loop.unregisterReadCallback( fileno(m_pipe), this,doneReadingArchiveFileWrapper);
|
|
m_registeredWgetReadCallback = false;
|
|
}
|
|
|
|
if(m_pipe) {
|
|
int32_t retCode = fclose(m_pipe);
|
|
log("we closed the warc pipe on reset with error %s", mstrerror(retCode));
|
|
m_pipe = NULL;
|
|
}
|
|
|
|
|
|
|
|
m_redirUrl.reset();
|
|
|
|
m_updatedMetaData = false;
|
|
|
|
m_ipStartTime = 0;
|
|
m_ipEndTime = 0;
|
|
m_diffbotReplyRetries = 0;
|
|
|
|
m_isImporting = false;
|
|
|
|
m_printedMenu = false;
|
|
|
|
// for hashing CT_STATUS docs consistently, this might be invalid
|
|
// so call it 0
|
|
m_pubDate = 0;
|
|
|
|
m_tmpBuf2.purge();
|
|
m_gotFacets = false;
|
|
|
|
m_bodyStartPos = 0;
|
|
|
|
m_mcastArray = NULL;
|
|
|
|
m_skipIframeExpansion = false;
|
|
m_indexedTime = 0;
|
|
|
|
m_didDelete = false;
|
|
|
|
m_metaList2.purge();
|
|
m_zbuf.purge();
|
|
m_kbuf.purge();
|
|
|
|
m_mySiteLinkInfoBuf.purge();
|
|
m_myPageLinkInfoBuf.purge();
|
|
m_myTempLinkInfoBuf.purge();
|
|
|
|
// reset count for nukeJSONObjects() function
|
|
m_joc = 0;
|
|
|
|
// notifications pending?
|
|
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
|
|
|
m_sentToDiffbot = 0;
|
|
m_gotDiffbotSuccessfulReply = 0;
|
|
// we need to reset this to false
|
|
m_useTimeAxis = false;
|
|
|
|
m_sentToDiffbotThisTime = false;
|
|
|
|
m_loaded = false;
|
|
|
|
m_msg4Launched = false;
|
|
|
|
m_diffbotReplyError = 0;
|
|
m_diffbotJSONCount = 0;
|
|
//m_downloadAttempted = false;
|
|
m_incrementedAttemptsCount = false;
|
|
m_incrementedDownloadCount = false;
|
|
|
|
if ( m_dx ) {
|
|
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
|
|
delete ( m_dx );
|
|
m_dx = NULL;
|
|
//log("diffbot: deleting m_dx2");
|
|
}
|
|
|
|
m_isDiffbotJSONObject = false;
|
|
|
|
m_dmozBuf.purge();
|
|
m_fakeIpBuf.purge();
|
|
m_fakeTagRecPtrBuf.purge();
|
|
|
|
m_tlbufTimer = 0LL;
|
|
m_gsbuf.reset();
|
|
|
|
//m_launchedAll = false;
|
|
|
|
m_qstringTable.reset();
|
|
|
|
//m_setForReplyPtrs = false;
|
|
//m_setForLinkPtrs = false;
|
|
|
|
// must be none outstanding
|
|
if ( m_numMsg3fReplies != m_numMsg3fRequests ) { char *xx=NULL;*xx=0;}
|
|
if ( m_numMsg4fReplies != m_numMsg4fRequests ) { char *xx=NULL;*xx=0;}
|
|
|
|
m_numMsg4fRequests = 0;
|
|
m_numMsg4fReplies = 0;
|
|
m_sentMsg4fRequests = false;
|
|
|
|
// free table's mem if used
|
|
//m_tmpDupTable.reset();
|
|
|
|
//m_newxd2Blocked = false;
|
|
|
|
m_lastPrintedDocId = 0LL;
|
|
|
|
m_loggedMsg3 = false;
|
|
|
|
m_progressBar = 0;
|
|
|
|
m_triedToAddWordPosInfoToCachedb = false;
|
|
|
|
if ( m_numLinkRequestsOut > m_numLinkRequestsIn ){char *xx=NULL;*xx=0;}
|
|
|
|
m_doConsistencyTesting = g_conf.m_doConsistencyTesting;
|
|
|
|
m_computedMetaListCheckSum = false;
|
|
|
|
m_msg3aErrno = 0;
|
|
|
|
m_hadMatchError = 0;
|
|
m_clientClosed = false;
|
|
m_lastCheckTime = 0;
|
|
|
|
m_calledMsg25ForSite = false;
|
|
m_calledMsg25ForPage = false;
|
|
m_checkedCachedbForSite = false;
|
|
m_checkedCachedbForPage = false;
|
|
m_allHashed = false;
|
|
|
|
// nuke it
|
|
if ( m_tempMsg25Page ) {
|
|
mdelete ( m_tempMsg25Page , sizeof(Msg25), "m25li" );
|
|
delete ( m_tempMsg25Page );
|
|
m_tempMsg25Page = NULL;
|
|
}
|
|
|
|
if ( m_tempMsg25Site ) {
|
|
mdelete ( m_tempMsg25Site , sizeof(Msg25), "m25li" );
|
|
delete ( m_tempMsg25Site );
|
|
m_tempMsg25Site = NULL;
|
|
}
|
|
|
|
m_numLinkRequestsOut = 0;
|
|
m_seoDebug = 0;
|
|
//m_seoInfoSetFromCache = false;
|
|
m_checkedCachedb = false;
|
|
m_processedCachedbReply = false;
|
|
m_cacheList.freeList();
|
|
|
|
for ( int32_t i = 0; m_numMsg8eReplies && i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg8eReply[i] ) continue;
|
|
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
|
|
m_msg8eReply[i] = NULL;
|
|
}
|
|
m_numMsg8eRequests = 0;
|
|
m_numMsg8eReplies = 0;
|
|
|
|
|
|
for ( int32_t i = 0; m_numMsg95Replies && i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg95ReplyPtrs[i] ) continue;
|
|
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
|
|
m_msg95ReplyPtrs[i] = NULL;
|
|
}
|
|
m_numMsg95Replies = 0;
|
|
|
|
|
|
m_numMsg3fRequests = 0;
|
|
m_numMsg3fReplies = 0;
|
|
m_qcursor = 0;
|
|
//m_binError = 0;
|
|
//m_msg98ReplyError = 0;
|
|
//m_binErrorForReplyPtrs = 0;
|
|
//m_binErrorForLinkPtrs = 0;
|
|
|
|
//m_msg17.reset();
|
|
//m_triedCache = false;
|
|
//m_cacheRec = NULL;
|
|
//m_cacheRecSize = 0;
|
|
|
|
// reset this crap
|
|
m_beginTimeAllMatch = 0LL;
|
|
m_beginTimeMatchUrl = 0LL;
|
|
m_beginTimeFullQueries = 0LL;
|
|
m_beginTimeLinks = 0LL;
|
|
//m_beginMsg98s = 0LL;
|
|
m_beginRelatedQueries = 0LL;
|
|
|
|
m_doledbKey.n0 = 0LL;
|
|
m_doledbKey.n1 = 0;
|
|
|
|
// sanity check, any outstanding?
|
|
//if( m_numMsg98Requests != m_numMsg98Replies ) { char *xx=NULL;*xx=0;}
|
|
// reset them now
|
|
//m_numMsg98Requests = 0;
|
|
//m_numMsg98Replies = 0;
|
|
|
|
//if ( m_newxd ) {
|
|
// mdelete ( m_newxd , sizeof(XmlDoc),"newxd");
|
|
// delete ( m_newxd );
|
|
// m_newxd = NULL;
|
|
//}
|
|
|
|
//if ( m_newxd2 ) {
|
|
// mdelete ( m_newxd2 , sizeof(XmlDoc),"newxd2");
|
|
// delete ( m_newxd2 );
|
|
// m_newxd2 = NULL;
|
|
//}
|
|
|
|
/*
|
|
if ( m_newMsg20 ) {
|
|
mdelete ( m_newMsg20 , sizeof(Msg20),"newmsg20");
|
|
delete ( m_newMsg20 );
|
|
m_newMsg20 = NULL;
|
|
}*/
|
|
/*
|
|
|
|
NO! we use this for clientClosedConnection() function now
|
|
|
|
if ( m_seoSocket ) {
|
|
TcpServer *tcp = m_seoSocket->m_this;
|
|
// gotta set this so it can be destroyed and closed
|
|
m_seoSocket->m_waitingOnHandler = false;
|
|
tcp->destroySocket ( m_seoSocket );
|
|
m_seoSocket = NULL;
|
|
}
|
|
*/
|
|
if ( m_registeredSocketCallback ) { char *xx=NULL; *xx=0; }
|
|
//for ( int32_t i = 0 ; i < m_numMsg99Replies ; i++ ) {
|
|
// if ( ! m_msg99ReplyPtrs[i] ) continue;
|
|
// mfree ( m_msg99ReplyPtrs [i] ,
|
|
// m_msg99ReplyAlloc[i] ,
|
|
// "m99reply" );
|
|
//}
|
|
//m_numMsg99Replies = 0;
|
|
//m_sentMsg99Requests = false;
|
|
|
|
|
|
if ( m_msg3a ) {
|
|
mdelete ( m_msg3a , sizeof(Msg3a) , "xdmsg3a" );
|
|
delete ( m_msg3a );
|
|
m_msg3a = NULL;
|
|
}
|
|
|
|
if ( m_query3a ) {
|
|
mdelete ( m_query3a , sizeof(Query),"xdqry3a");
|
|
delete ( m_query3a );
|
|
m_query3a = NULL;
|
|
}
|
|
|
|
m_surroundingTextBuf.purge();
|
|
m_rssItemBuf.purge();
|
|
//m_twbuf.purge();
|
|
m_topMatchingQueryBuf.purge();
|
|
//m_queryPtrs.purge();
|
|
m_queryOffsets.purge();
|
|
m_extraQueryBuf.purge();
|
|
//m_socketWriteBuf.purge();
|
|
m_relatedDocIdBuf.purge();
|
|
m_relatedTitleBuf.purge();
|
|
m_commonQueryNumBuf.purge();
|
|
m_queryLinkBuf.purge();
|
|
//m_relatedQueryLinksIntersected.purge();
|
|
m_queryLinkStringBuf.purge();
|
|
//m_queryRelBuf.purge();
|
|
//m_relPtrs.purge();
|
|
m_sortedPosdbListBuf.purge();
|
|
m_wpSortedPosdbListBuf.purge();
|
|
m_termListBuf.purge();
|
|
m_insertableTermsBuf.purge();
|
|
//m_iwfiBuf.purge();
|
|
m_wordPosInfoBuf.purge();
|
|
//m_msg20ReplyPtrBuf.purge();
|
|
m_recommendedLinksBuf.purge();
|
|
m_tmpMsg0Buf.purge();
|
|
m_msg20Array.purge();
|
|
m_newLinkerBuf.purge();
|
|
|
|
//m_msg99ReplyBuf.purge();
|
|
m_matchingQueryBuf.purge();
|
|
m_relatedQueryBuf.purge();
|
|
m_queryLinkBuf.purge();
|
|
m_matchingQueryStringBuf.purge();
|
|
m_relatedQueryStringBuf.purge();
|
|
m_queryLinkStringBuf.purge();
|
|
m_docIdListBuf.purge();
|
|
|
|
m_queryChangeBuf.purge();
|
|
m_queryLogBuf.purge();
|
|
//m_itStrBuf.purge();
|
|
m_debugScoreInfoBuf.purge();
|
|
m_origScoreInfoBuf.purge();
|
|
m_msg20Buf.purge();
|
|
m_topDocIdsBuf.purge();
|
|
m_missingTermBuf.purge();
|
|
m_termInfoBuf.purge();
|
|
m_newTermInfoBuf.purge();
|
|
m_matchingTermBuf.purge();
|
|
m_termId32Buf.purge();
|
|
m_storeList.freeList();
|
|
|
|
//m_queryHashTable.reset();
|
|
m_tidTable32.reset();
|
|
m_queryOffsetTable.reset();
|
|
m_tmpTable.reset();
|
|
m_fullQueryDedup.reset();
|
|
//m_dupVotes.reset();
|
|
|
|
m_wordSpamBuf.purge();
|
|
m_fragBuf.purge();
|
|
|
|
m_downloadLevel = 0;
|
|
|
|
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) {
|
|
if ( ! m_xmlDocs[i] ) continue;
|
|
mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
|
|
delete ( m_xmlDocs[i] );
|
|
m_xmlDocs[i] = NULL;
|
|
}
|
|
|
|
s_lastTimeStart = 0LL;
|
|
|
|
m_req = NULL;
|
|
|
|
m_doneWithAhrefs = false;
|
|
m_useAhrefs = false;
|
|
m_linkDedupTablePtr = NULL;
|
|
m_domDedupTablePtr = NULL;
|
|
|
|
m_storeTermListInfo = false;
|
|
m_gotDupStats = false;
|
|
//m_nextSection = (Section *)-1;
|
|
m_si = (Section *)-1;
|
|
|
|
// for limiting # of iframe tag expansions
|
|
m_numExpansions = 0;
|
|
|
|
// . are not allowed to exit if waiting for msg4 to complete
|
|
// . yes we are, it should be saved as addsinprogress.dat
|
|
if ( m_msg4Waiting ) {
|
|
log("doc: resetting xmldoc with outstanding msg4. should "
|
|
"me saved in addsinprogress.dat. docid=%"UINT64"",m_docId);
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
m_ei = 0;
|
|
m_lastLaunch = -1;
|
|
|
|
m_pbuf = NULL;
|
|
m_wts = NULL;
|
|
|
|
m_deleteFromIndex = false;
|
|
|
|
//if ( m_contactDocValid ) nukeDoc ( m_contactDoc );
|
|
if ( m_rootDocValid ) nukeDoc ( m_rootDoc );
|
|
if ( m_oldDocValid ) nukeDoc ( m_oldDoc );
|
|
if ( m_extraDocValid ) nukeDoc ( m_extraDoc );
|
|
if ( m_ahrefsDocValid ) nukeDoc ( m_ahrefsDoc );
|
|
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 && m_freeLinkInfo1 ) {
|
|
// it now points into m_myPageLinkInfoBuf !
|
|
//mfree ( ptr_linkInfo1 , size_linkInfo1, "LinkInfo1");
|
|
ptr_linkInfo1 = NULL;
|
|
m_linkInfo1Valid = false;
|
|
}
|
|
if ( m_linkInfo2Valid && ptr_linkInfo2 && m_freeLinkInfo2 ) {
|
|
// should point into a safebuf as well
|
|
//mfree ( ptr_linkInfo2 , size_linkInfo2, "LinkInfo2");
|
|
ptr_linkInfo2 = NULL;
|
|
m_linkInfo2Valid = false;
|
|
}
|
|
if ( m_rawUtf8ContentValid && m_rawUtf8Content && !m_setFromTitleRec
|
|
// was content supplied by pageInject.cpp?
|
|
//! m_contentInjected ) {
|
|
) {
|
|
mfree ( m_rawUtf8Content, m_rawUtf8ContentAllocSize,"Xml3");
|
|
}
|
|
|
|
// reset this
|
|
m_contentInjected = false;
|
|
m_rawUtf8ContentValid = false;
|
|
m_wasContentInjected = false;
|
|
|
|
m_rootDoc = NULL;
|
|
|
|
// if this is true, then only index if new
|
|
m_newOnly = 0;
|
|
|
|
//if ( m_sectiondbData ) {
|
|
// mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" );
|
|
// m_sectiondbData = NULL;
|
|
//}
|
|
|
|
//if ( m_placedbData ) {
|
|
// mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" );
|
|
// m_placedbData = NULL;
|
|
//}
|
|
|
|
if ( m_httpReplyValid && m_httpReply ) {
|
|
mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
|
|
m_httpReply = NULL;
|
|
m_httpReplyValid = false;
|
|
}
|
|
|
|
if ( m_filteredContentAllocSize ) {
|
|
mfree (m_filteredContent,m_filteredContentAllocSize,"xdfc");
|
|
m_filteredContent = NULL;
|
|
m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
//if ( m_utf8ContentValid && ! m_setFromTitleRec && ptr_utf8Content )
|
|
// mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3");
|
|
|
|
|
|
if ( m_metaList ) { // m_metaListValid && m_metaList ) {
|
|
mfree ( m_metaList , m_metaListAllocSize , "metalist");
|
|
m_metaList = NULL;
|
|
m_metaListSize = 0;
|
|
m_metaListAllocSize = 0;
|
|
}
|
|
|
|
if ( m_ubuf ) {
|
|
// log("xmldoc: delete m_ubuf=%"PTRFMT" this=%"PTRFMT
|
|
// , (PTRTYPE) m_ubuf
|
|
// , (PTRTYPE) this
|
|
// );
|
|
mfree ( m_ubuf , m_ubufAlloc , "ubuf");
|
|
m_ubuf = NULL;
|
|
}
|
|
|
|
//if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) {
|
|
// mfree ( m_titleRec , m_titleRecAllocSize , "trec" );
|
|
//}
|
|
//m_titleRec = NULL;
|
|
m_titleRecBuf.purge();
|
|
|
|
if ( m_dupTrPtr ) {
|
|
mfree ( m_dupTrPtr , m_dupTrSize , "trecd" );
|
|
m_dupTrPtr = NULL;
|
|
}
|
|
|
|
if ( m_oldTitleRecValid && m_oldTitleRec ) {
|
|
mfree ( m_oldTitleRec , m_oldTitleRecSize , "treca" );
|
|
m_oldTitleRec = NULL;
|
|
m_oldTitleRecValid = false;
|
|
}
|
|
|
|
if ( m_rootTitleRecValid && m_rootTitleRec ) {
|
|
mfree ( m_rootTitleRec , m_rootTitleRecSize , "treca" );
|
|
m_rootTitleRec = NULL;
|
|
m_rootTitleRecValid = false;
|
|
}
|
|
|
|
|
|
if ( m_outlinkHopCountVectorValid && m_outlinkHopCountVector ) {
|
|
int32_t sz = m_outlinkHopCountVectorSize;
|
|
mfree ( m_outlinkHopCountVector,sz,"ohv");
|
|
}
|
|
m_outlinkHopCountVector = NULL;
|
|
|
|
//if ( m_gsbufValid && m_gsbuf ) {
|
|
// mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" );
|
|
//}
|
|
//m_gsbuf = NULL;
|
|
m_gsbuf.reset();
|
|
|
|
|
|
// reset all *valid* flags to false
|
|
void *p = &m_VALIDSTART;
|
|
void *pend = &m_VALIDEND;
|
|
memset ( p , 0 , (char *)pend - (char *)p );
|
|
|
|
m_hashedMetas = false;
|
|
|
|
m_mcastBuf.purge();
|
|
m_serpBuf.purge();
|
|
|
|
// Doc.cpp:
|
|
m_mime.reset();
|
|
m_words.reset();
|
|
m_phrases.reset();
|
|
m_bits.reset();
|
|
m_sections.reset();
|
|
//m_weights.reset();
|
|
m_countTable.reset();
|
|
m_dates.reset();
|
|
m_addresses.reset();
|
|
|
|
// other crap
|
|
m_xml.reset();
|
|
m_links.reset();
|
|
m_bits2.reset();
|
|
m_pos.reset();
|
|
//m_synonyms.reset();
|
|
m_synBuf.reset();
|
|
//m_nsvt.reset();
|
|
//m_osvt.reset();
|
|
m_turkVotingTable.reset();
|
|
m_turkBitsTable.reset();
|
|
m_vtr.reset();
|
|
m_rdtab.reset();
|
|
m_vctab.reset();
|
|
m_vcduptab.reset();
|
|
m_images.reset();
|
|
m_countTable.reset();
|
|
m_mime.reset();
|
|
m_tagRec.reset();
|
|
m_newTagBuf.reset();
|
|
m_catRec.reset();
|
|
//m_clockCandidatesTable.reset();
|
|
//m_cctbuf.reset();
|
|
m_dupList.reset();
|
|
//m_oldMetaList.reset();
|
|
m_msg8a.reset();
|
|
//m_siteLinkInfo.reset();
|
|
//m_msg25.reset();
|
|
//m_msgc.reset();
|
|
m_msg13.reset();
|
|
m_tmpsb1.reset();
|
|
m_tmpsb2.reset();
|
|
m_turkBuf.reset();
|
|
m_msg0b.reset();
|
|
//m_siteGetter.reset();
|
|
m_msge0.reset();
|
|
m_msge1.reset();
|
|
m_reply.reset();
|
|
// mroe stuff skipped
|
|
|
|
m_wtsTable.reset();
|
|
m_wbuf.reset();
|
|
m_pageLinkBuf.reset();
|
|
m_siteLinkBuf.reset();
|
|
m_esbuf.reset();
|
|
m_xbuf.reset();
|
|
m_tagRecBuf.reset();
|
|
|
|
//m_titleRec = NULL;
|
|
//m_titleRecSize = 0;
|
|
|
|
// origin of this XmlDoc
|
|
m_setFromTitleRec = false;
|
|
m_setFromUrl = false;
|
|
m_setFromDocId = false;
|
|
m_setFromSpiderRec = false;
|
|
m_freeLinkInfo1 = false;
|
|
m_freeLinkInfo2 = false;
|
|
|
|
m_checkedUrlFilters = false;
|
|
|
|
m_indexCode = 0;
|
|
m_masterLoop = NULL;
|
|
m_masterState = NULL;
|
|
|
|
//m_isAddUrl = false;
|
|
m_isInjecting = false;
|
|
m_useFakeMime = false;
|
|
m_useSiteLinkBuf = false;
|
|
m_usePageLinkBuf = false;
|
|
m_printInXml = false;
|
|
|
|
m_check1 = false;
|
|
m_check2 = false;
|
|
m_prepared = false;
|
|
|
|
// keep track of updates to the rdbs we have done, so we do not re-do
|
|
m_listAdded = false;
|
|
m_listFlushed = false;
|
|
m_updatedCounts = false;
|
|
m_updatedCounts2 = false;
|
|
//m_updatedTagdb1 = false;
|
|
//m_updatedTagdb2 = false;
|
|
//m_updatedTagdb3 = false;
|
|
//m_updatedTagdb4 = false;
|
|
//m_updatedTagdb5 = false;
|
|
m_copied1 = false;
|
|
m_updatingSiteLinkInfoTags = false;
|
|
m_addressSetCalled = false;
|
|
m_hashedTitle = false;
|
|
|
|
m_registeredSleepCallback = false;
|
|
m_addedNegativeDoledbRec = false;
|
|
|
|
m_numRedirects = 0;
|
|
m_numOutlinksAdded = 0;
|
|
// . use sameDomain and sameIp waits?
|
|
// . these may be bypassed in getContactDoc()
|
|
//m_throttleDownload = true;
|
|
m_spamCheckDisabled = false;
|
|
m_useRobotsTxt = true;
|
|
m_redirectFlag = false;
|
|
|
|
// Scraper.cpp sets this to true
|
|
//m_isScraping = false;
|
|
|
|
m_allowSimplifiedRedirs = false;
|
|
|
|
//m_calledMsg22a = false;
|
|
//m_calledMsg22b = false;
|
|
//m_calledMsg22c = false;
|
|
m_didDelay = false;
|
|
m_didDelayUnregister = false;
|
|
m_calledMsg22d = 0LL;
|
|
m_calledMsg22e = false;
|
|
m_calledMsg22f = false;
|
|
m_calledMsg25 = false;
|
|
m_calledMsg25b = false;
|
|
m_calledMsg40 = false;
|
|
m_calledSections = false;
|
|
m_calledThread = false;
|
|
m_alreadyRegistered = false;
|
|
m_loaded = false;
|
|
m_firstEntry = true;
|
|
m_firstEntry2 = true;
|
|
m_launchedSpecialMsg8a = false;
|
|
m_launchedMsg8a2 = false;
|
|
|
|
m_numSectiondbReads = 0;
|
|
m_numSectiondbNeeds = 0;
|
|
m_sectiondbRecall = 0;
|
|
|
|
//m_triedVoteCache = false;
|
|
//m_storedVoteCache = false;
|
|
|
|
m_setTr = false;
|
|
//m_checkedRobots = false;
|
|
m_triedTagRec = false;
|
|
m_didGatewayPage = false;
|
|
m_didQuickDupCheck = false;
|
|
m_calledMsg8b = false;
|
|
|
|
m_recycleContent = false;
|
|
//m_loadFromOldTitleRec = false;
|
|
m_callback1 = NULL;
|
|
m_callback2 = NULL;
|
|
m_state = NULL;
|
|
|
|
// used for getHasContactInfo()
|
|
m_processed0 = false;
|
|
m_hasContactInfo = false;
|
|
m_hasContactInfo2 = false;
|
|
|
|
|
|
//m_checkForRedir = true;
|
|
|
|
m_processedLang = false;
|
|
|
|
m_doingConsistencyCheck = false;
|
|
|
|
// used for getting contact info
|
|
//m_triedRoot = false;
|
|
//m_winner = -2;
|
|
|
|
// tell Msg13 to just call HttpServer::getDoc() and not to forward
|
|
// the download request to another host. although this does not
|
|
// exclude possible forwarding it to a compression proxy if
|
|
// g_conf.m_useCompressionProxy is set
|
|
m_forwardDownloadRequest = false;
|
|
|
|
m_isChildDoc = false;
|
|
m_parentDocPtr = NULL;
|
|
|
|
// for utf8 content functions
|
|
m_savedp = NULL;
|
|
m_oldp = NULL;
|
|
m_didExpansion = false;
|
|
|
|
// Repair.cpp now explicitly sets these to false if needs to
|
|
m_usePosdb = true;
|
|
//m_useDatedb = true;
|
|
m_useClusterdb = true;
|
|
m_useLinkdb = true;
|
|
m_useSpiderdb = true;
|
|
m_useTitledb = true;
|
|
m_useTagdb = true;
|
|
m_usePlacedb = true;
|
|
//m_useTimedb = true;
|
|
// only use for custom crawls for now to save disk space
|
|
m_useSectiondb = false;
|
|
//m_useRevdb = true;
|
|
m_useSecondaryRdbs = false;
|
|
|
|
//m_useIpsTxtFile = true;
|
|
|
|
// used by Msg13.cpp only. kinda a hack.
|
|
m_isSpiderProxy = false;
|
|
|
|
// do not cache the http reply in msg13 etc.
|
|
m_maxCacheAge = 0;
|
|
|
|
// reset these ptrs too!
|
|
void *px = &ptr_firstUrl;
|
|
void *pxend = &size_firstUrl;
|
|
memset ( px , 0 , (char *)pxend - (char *)px );
|
|
|
|
m_hasMetadata = false;
|
|
ptr_metadata = NULL;
|
|
size_metadata = 0;
|
|
}
|
|
|
|
// . set the url with the intention of adding it or deleting it from the index
|
|
// . Msg7 and Repair.cpp can also set other members of XmlDoc rather than just
|
|
// m_firstUrl. they can provide the ip, the http reply, content, filtered
|
|
// content, the forced next spider time and the forced first indexed date,
|
|
// the hop count
|
|
// . they might also want to skip deduping, or any algo deemed unnecessary
|
|
// by setting, for instance, m_isDupValid = true, or something
|
|
bool XmlDoc::set1 ( char *url ,
|
|
char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ) {
|
|
|
|
reset();
|
|
|
|
// this is true
|
|
m_setFromUrl = true;
|
|
|
|
//m_coll = coll;
|
|
m_pbuf = pbuf;
|
|
m_niceness = niceness;
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
m_versionValid = true;
|
|
|
|
// sanity check
|
|
if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// copy this in case collection gets deleted i guess...
|
|
//m_forceDelete = forceDelete;
|
|
// did we get this url from PageAddUrl?
|
|
//m_isAddUrl = isAddUrl;
|
|
// set m_indexCode so that XmlDoc::indexDoc() will delete it
|
|
//if ( forceDelete ) m_indexCode = EDOCFORCEDELETE;
|
|
|
|
// set this important member var
|
|
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
|
|
//if ( ! cr ) return false;
|
|
if ( ! setCollNum ( coll ) ) return false;
|
|
|
|
setFirstUrl ( url , false );
|
|
|
|
//setSpideredTime();
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
char *XmlDoc::getTestDir ( ) {
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// return NULL if we are not the "qatest123" collection
|
|
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
|
|
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
|
|
// then return "test-spider" otherwise...
|
|
//if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
|
|
// return "qa";//"test-spider";
|
|
// ... default to "test-parser"
|
|
//return "test-parser";
|
|
return "qa";
|
|
/*
|
|
if ( getIsPageParser() )
|
|
return "test-page-parser";
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting )
|
|
// return "test-page-inject";
|
|
else if ( g_conf.m_testParserEnabled )
|
|
return "test-parser";
|
|
else if ( g_conf.m_testSpiderEnabled )
|
|
return "test-spider";
|
|
// default to being from PageInject
|
|
return "test-page-inject";
|
|
*/
|
|
//else { char *xx=NULL;*xx=0; }
|
|
//return NULL;
|
|
}
|
|
|
|
int32_t XmlDoc::getSpideredTime ( ) {
|
|
// stop if already set
|
|
if ( m_spideredTimeValid ) return m_spideredTime;
|
|
|
|
// tmp var
|
|
int32_t date = 0;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return 0;
|
|
|
|
// if not test collection keep it simple
|
|
if ( strcmp(cr->m_coll,"qatest123") || cr->m_useTimeAxis) {
|
|
// . set spider time to current time
|
|
// . this might already be valid if we set it in
|
|
// getTestSpideredDate()
|
|
m_spideredTime = getTimeGlobal();
|
|
m_spideredTimeValid = true;
|
|
return m_spideredTime;
|
|
}
|
|
|
|
char *testDir = getTestDir();
|
|
|
|
// get url
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this returns false if not in there, in which case, add it
|
|
if ( ! getTestSpideredDate(cu,&date,testDir) ) {
|
|
m_spideredTime = getTimeGlobal();
|
|
m_spideredTimeValid = true;
|
|
addTestSpideredDate ( cu , m_spideredTime , testDir );
|
|
return m_spideredTime;
|
|
}
|
|
|
|
// if we are injecting into the test coll for the 2nd+ time
|
|
// we need to use the spidered date from the first time we
|
|
// injected the doc in order to ensure things are parsed
|
|
// exactly the same way since some things depend on the
|
|
// spideredTime, like Dates (for setting "in future"
|
|
// flags)
|
|
m_spideredTimeValid = true;
|
|
m_spideredTime = date;
|
|
// hack for test coll which has fake vals for these because
|
|
// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
|
|
//m_minPubDate = m_spideredTime - 48*3600;
|
|
//m_maxPubDate = m_spideredTime - 24*3600;
|
|
|
|
return m_spideredTime;
|
|
}
|
|
|
|
// . we need this so PageGet.cpp can get the cached web page
|
|
// . but not for Msg20::getSummary(), that uses XmlDoc::set(Msg20Request*)
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::set3 ( int64_t docId ,
|
|
char *coll ,
|
|
int32_t niceness ) {
|
|
|
|
reset();
|
|
|
|
// this is true
|
|
m_setFromDocId = true;
|
|
|
|
m_docId = docId;
|
|
m_docIdValid = true;
|
|
//m_coll = coll;
|
|
m_niceness = niceness;
|
|
|
|
// . sanity check
|
|
// . why can't we allow this??? MDW
|
|
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// set this important member var
|
|
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
|
|
//if ( ! cr ) { m_errno = ENOCOLLREC; return false; }
|
|
if ( ! setCollNum ( coll ) ) return false;
|
|
|
|
// solidify some parms
|
|
//m_eliminateMenus = cr->m_eliminateMenus;
|
|
//m_eliminateMenusValid = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
void loadFromOldTitleRecWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "loading from old title rec wrapper" );
|
|
// return if it blocked
|
|
if ( ! THIS->loadFromOldTitleRec ( ) ) return;
|
|
|
|
char *coll = "";
|
|
CollectionRec *cr = THIS->getCollRec();
|
|
if ( cr ) coll = cr->m_coll;
|
|
|
|
// error?
|
|
if ( g_errno ) log("doc: loadfromtitlerec coll=%s: %s",
|
|
coll,
|
|
mstrerror(g_errno));
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// returns false if blocked, returns true and sets g_errno on error otherwise
|
|
bool XmlDoc::loadFromOldTitleRec ( ) {
|
|
// . we are an entry point.
|
|
// . if anything blocks, this will be called when it comes back
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = loadFromOldTitleRecWrapper;
|
|
m_masterState = this;
|
|
}
|
|
// if we already loaded!
|
|
if ( m_loaded ) return true;
|
|
// if set from a docid, use msg22 for this!
|
|
char **otr = getOldTitleRec ( );
|
|
// error?
|
|
if ( ! otr ) return true;
|
|
// blocked?
|
|
if ( otr == (void *)-1 ) return false;
|
|
// this is a not found
|
|
if ( ! *otr ) {
|
|
// so we do not retry
|
|
m_loaded = true;
|
|
// make it an error
|
|
g_errno = ENOTFOUND;
|
|
return true;
|
|
}
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
// use that. decompress it! this will also set
|
|
// m_setFromTitleRec to true
|
|
if ( ! set2 ( m_oldTitleRec ,
|
|
m_oldTitleRecSize , // maxSize
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness )) {
|
|
// we are now loaded, do not re-call
|
|
m_loaded = true;
|
|
// return true with g_errno set on error uncompressing
|
|
return true;
|
|
}
|
|
// we are now loaded, do not re-call
|
|
m_loaded = true;
|
|
// sanity check
|
|
if ( ! m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
|
|
// good to go
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::setCollNum ( char *coll ) {
|
|
CollectionRec *cr;
|
|
cr = g_collectiondb.getRec ( coll , gbstrlen(coll) );
|
|
if ( ! cr ) {
|
|
g_errno = ENOCOLLREC;
|
|
return log("build: collrec not found for %s",coll);
|
|
}
|
|
// we can store this safely:
|
|
m_collnum = cr->m_collnum;
|
|
m_collnumValid = true;
|
|
// if user "resets" the collection we need to know
|
|
m_lastCollRecResetCount = cr->m_lastResetCount;
|
|
return true;
|
|
}
|
|
|
|
CollectionRec *XmlDoc::getCollRec ( ) {
|
|
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
|
|
if ( ! cr ) {
|
|
log("build: got NULL collection rec for collnum=%"INT32".",
|
|
(int32_t)m_collnum);
|
|
g_errno = ENOCOLLREC;
|
|
return NULL;
|
|
}
|
|
// was it reset since we started spidering this url?
|
|
// we don't do it this way, when resetting a coll when delete it and
|
|
// re-add under a different collnum to avoid getting msg4 adds to it.
|
|
//if ( cr->m_lastResetCount != m_lastCollRecResetCount ) {
|
|
// log("build: collection rec was reset. returning null.");
|
|
// g_errno = ENOCOLLREC;
|
|
// return NULL;
|
|
//}
|
|
return cr;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
|
key_t *doledbKey ,
|
|
char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ,
|
|
char *utf8ContentArg ,
|
|
bool deleteFromIndex ,
|
|
int32_t forcedIp ,
|
|
uint8_t contentType ,
|
|
uint32_t spideredTime ,
|
|
bool contentHasMimeArg ,
|
|
char *contentDelim,
|
|
char *metadata ,
|
|
uint32_t metadataLen,
|
|
int32_t payloadLen
|
|
) {
|
|
|
|
// sanity check
|
|
if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
reset();
|
|
|
|
if ( g_conf.m_logDebugSpider )
|
|
log("xmldoc: set4 uh48=%"UINT64" parentdocid=%"UINT64"",
|
|
sreq->getUrlHash48(),sreq->getParentDocId());
|
|
|
|
// used by PageSpiderdb.cpp
|
|
m_startTime = gettimeofdayInMilliseconds();
|
|
m_startTimeValid = true;
|
|
|
|
// this is true
|
|
m_setFromSpiderRec = true;
|
|
|
|
// did page inject (pageinject) request to delete it?
|
|
m_deleteFromIndex = deleteFromIndex;
|
|
|
|
// PageReindex.cpp will set this in the spider request
|
|
if ( sreq->m_forceDelete )
|
|
m_deleteFromIndex = true;
|
|
|
|
// if we are a container doc then we need the content delimeter,
|
|
// unless if we are a warc or arc, then we know how those delimit
|
|
// already.
|
|
m_contentDelim = contentDelim;
|
|
m_contentDelimValid = true;
|
|
|
|
bool contentHasMime = contentHasMimeArg;
|
|
// but if we are a container doc then this parm applies to each subdoc
|
|
// not to us, so turn it off for this part.
|
|
if ( isContainerDoc() ) {
|
|
contentHasMime = false;
|
|
m_subDocsHaveMime = contentHasMimeArg;
|
|
}
|
|
|
|
|
|
char *utf8Content = utf8ContentArg;
|
|
|
|
if ( contentHasMime && utf8Content ) {
|
|
// get length of it all
|
|
int32_t clen = gbstrlen(utf8Content);
|
|
// return true on error with g_errno set
|
|
if ( ! m_mime.set ( utf8ContentArg , clen , NULL ) ) {
|
|
if ( ! g_errno ) g_errno = EBADMIME;
|
|
log("xmldoc: could not set mime: %s",
|
|
mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// it's valid
|
|
m_mimeValid = true;
|
|
// advance
|
|
utf8Content = m_mime.getContent();
|
|
|
|
if(payloadLen != -1) {
|
|
payloadLen -= m_mime.getContent() - utf8ContentArg;
|
|
}
|
|
}
|
|
|
|
// use this to avoid ip lookup if it is not zero
|
|
if ( forcedIp ) {
|
|
m_ip = forcedIp;
|
|
m_ipValid = true;
|
|
}
|
|
|
|
// sometimes they supply the content they want! like when zaks'
|
|
// injects pages from PageInject.cpp
|
|
if ( utf8Content ) {
|
|
// . this is the most basic content from the http reply
|
|
// . only set this since sometimes it is facebook xml and
|
|
// contains encoded html which needs to be decoded.
|
|
// like <name>Ben & Jerry's</name> otherwise are
|
|
// sentence formation stops at the ';' in the "&" and
|
|
// we also index "amp" which is bad.
|
|
m_content = utf8Content;
|
|
if(payloadLen != -1) {
|
|
m_contentLen = payloadLen;
|
|
}
|
|
else if ( m_mimeValid && m_mime.m_contentLen > 0) {
|
|
m_contentLen = m_mime.m_contentLen;
|
|
} else {
|
|
m_contentLen = gbstrlen(utf8Content);
|
|
}
|
|
|
|
m_contentValid = true;
|
|
|
|
//m_rawUtf8Content = utf8Content;
|
|
//m_expandedUtf8Content = utf8Content;
|
|
|
|
//ptr_utf8Content = utf8Content;
|
|
//size_utf8Content = slen+1;
|
|
|
|
//m_rawUtf8ContentValid = true;
|
|
//m_expandedUtf8ContentValid = true;
|
|
//m_utf8ContentValid = true;
|
|
|
|
m_contentInjected = true;
|
|
m_wasContentInjected = true;
|
|
m_contentType = contentType;
|
|
m_contentTypeValid = true;
|
|
// use this ip as well for now to avoid ip lookup
|
|
//m_ip = atoip("127.0.0.1");
|
|
//m_ipValid = true;
|
|
// do not need robots.txt then
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
// nor mime
|
|
m_httpStatus = 200;
|
|
m_httpStatusValid = true;
|
|
// this too
|
|
m_downloadStatus = 0;
|
|
m_downloadStatusValid = true;
|
|
// assume this is the download time since the content
|
|
// was pushed/provided to us
|
|
if ( spideredTime )
|
|
m_downloadEndTime = spideredTime;
|
|
else
|
|
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
|
|
// either way, validate it
|
|
m_downloadEndTimeValid = true;
|
|
// and need a legit mime
|
|
if ( ! m_mimeValid ) {
|
|
m_mime.m_bufLen = 1;
|
|
m_mimeValid = true;
|
|
m_mime.m_contentType = contentType;
|
|
}
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// no redir
|
|
ptr_redirUrl = NULL;
|
|
size_redirUrl = 0;
|
|
m_redirUrl.reset();
|
|
m_redirUrlPtr = NULL;//&m_redirUrl;
|
|
m_redirUrlValid = true;
|
|
m_redirErrorValid = true;
|
|
m_redirError = 0;
|
|
m_crawlDelay = -1;
|
|
m_crawlDelayValid = true;
|
|
}
|
|
|
|
// override content type based on mime for application/json
|
|
if ( m_mimeValid ) {
|
|
m_contentType = m_mime.m_contentType;
|
|
m_contentTypeValid = true;
|
|
}
|
|
|
|
|
|
//m_coll = coll;
|
|
m_pbuf = pbuf;
|
|
m_niceness = niceness;
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
m_versionValid = true;
|
|
|
|
/*
|
|
// set min/max pub dates right away
|
|
m_minPubDate = -1;
|
|
m_maxPubDate = -1;
|
|
// parentPrevSpiderTime is 0 if that was the first time that the
|
|
// parent was spidered, in which case isNewOutlink will always be set
|
|
// for every outlink it had!
|
|
if ( sreq->m_isNewOutlink && sreq->m_parentPrevSpiderTime ) {
|
|
// sanity check
|
|
if ( ! sreq->m_parentPrevSpiderTime ) {char *xx=NULL;*xx=0;}
|
|
// pub date is somewhere between these two times
|
|
m_minPubDate = sreq->m_parentPrevSpiderTime;
|
|
m_maxPubDate = sreq->m_addedTime;
|
|
}
|
|
*/
|
|
|
|
// this is used to removing the rec from doledb after we spider it
|
|
m_doledbKey.setMin();
|
|
if ( doledbKey ) m_doledbKey = *doledbKey;
|
|
|
|
// . sanity check
|
|
// . we really don't want the parser holding up the query pipeline
|
|
// even if this page is being turked!
|
|
//if ( m_niceness == 0 &&
|
|
// // spider proxy uses xmldoc class to expand iframe tags and
|
|
// // sometimes the initiating msg13 class was re-niced to 0
|
|
// // in the niceness converstion logic.
|
|
// ! g_hostdb.m_myHost->m_isProxy ) {
|
|
// char *xx=NULL; *xx=0; }
|
|
|
|
if ( sreq->isCorrupt(m_collnum) )
|
|
return log("XmlDoc: set4() spider request is corrupt in coll "
|
|
"%s u=%s",coll,sreq->m_url);
|
|
|
|
m_sreqValid = true;
|
|
|
|
// store the whole rec, key+dataSize+data, in case it disappears.
|
|
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
|
|
|
|
// set m_collnum etc.
|
|
if ( ! setCollNum ( coll ) )
|
|
return log("XmlDoc: set4() coll %s invalid",coll);
|
|
|
|
// it should be valid since we just set it
|
|
CollectionRec *cr = getCollRec();
|
|
|
|
m_useRobotsTxt = cr->m_useRobotsTxt;
|
|
|
|
// solidify some parms
|
|
//m_eliminateMenus = cr->m_eliminateMenus;
|
|
//m_eliminateMenusValid = true;
|
|
|
|
// validate these here too
|
|
/*
|
|
m_titleWeight = cr->m_titleWeight;
|
|
m_headerWeight = cr->m_headerWeight;
|
|
m_urlPathWeight = cr->m_urlPathWeight;
|
|
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
|
|
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
|
|
m_conceptWeight = cr->m_conceptWeight;
|
|
|
|
m_titleWeightValid = true;
|
|
m_headerWeightValid = true;
|
|
m_urlPathWeightValid = true;
|
|
m_externalLinkTextWeightValid = true;
|
|
m_internalLinkTextWeightValid = true;
|
|
m_conceptWeightValid = true;
|
|
*/
|
|
|
|
// fix some corruption i've seen
|
|
if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
|
|
log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
|
|
m_sreq.m_urlIsDocId = 0;
|
|
}
|
|
|
|
// if url is a docid... we are from pagereindex.cpp
|
|
//if ( sreq->m_isPageReindex ) {
|
|
// now we can have url-based page reindex requests because
|
|
// if we have a diffbot json object fake url reindex request
|
|
// we add a spider request of the PARENT url for it as page reindex
|
|
//if ( is_digit ( sreq->m_url[0] ) ) {
|
|
// watch out for 0.r.msn.com!!
|
|
if ( m_sreq.m_urlIsDocId ) {
|
|
m_docId = atoll(m_sreq.m_url);
|
|
// assume its good
|
|
m_docIdValid = true;
|
|
// similar to set3() above
|
|
m_setFromDocId = true;
|
|
// use content and ip from old title rec to save time
|
|
// . crap this is making the query reindex not actually
|
|
// re-download the content.
|
|
// . we already check the m_deleteFromIndex flag below
|
|
// in getUtf8Content() and use the old content in that case
|
|
// so i'm not sure why we are recycling here, so take
|
|
// this out. MDW 9/25/2014.
|
|
//m_recycleContent = true;
|
|
// sanity
|
|
if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
// add www is now REQUIRED for all!
|
|
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
|
|
// www.tmblr.co has no IP
|
|
setFirstUrl ( m_sreq.m_url , false );//true ); // false );
|
|
// you can't call this from a docid based url until you
|
|
// know the uh48
|
|
//setSpideredTime();
|
|
}
|
|
|
|
// now query reindex can specify a recycle content option so it
|
|
// can replace the rebuild tool. try to recycle on global index.
|
|
if ( m_sreqValid )
|
|
m_recycleContent = m_sreq.m_recycleContent;
|
|
|
|
m_hasMetadata = (bool)metadata;
|
|
|
|
ptr_metadata = metadata;
|
|
size_metadata = metadataLen;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . set our stuff from the TitleRec (from titledb)
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::set2 ( char *titleRec ,
|
|
int32_t maxSize ,
|
|
char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ,
|
|
SpiderRequest *sreq ) {
|
|
|
|
// NO! can't do this. see below
|
|
//reset();
|
|
|
|
setStatus ( "setting xml doc from title rec");
|
|
|
|
// . it resets us, so save this
|
|
// . we only save these for set2() not the other sets()!
|
|
//void (*cb1)(void *state) = m_callback1;
|
|
//bool (*cb2)(void *state) = m_callback2;
|
|
//void *state = m_state;
|
|
|
|
// . clear it all out
|
|
// . no! this is clearing our msg20/msg22 reply...
|
|
// . ok, but repair.cpp needs it so do it there then
|
|
//reset();
|
|
|
|
// restore callbacks
|
|
//m_callback1 = cb1;
|
|
//m_callback2 = cb2;
|
|
//m_state = state;
|
|
|
|
// sanity check - since we do not reset
|
|
if ( m_contentValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this is true
|
|
m_setFromTitleRec = true;
|
|
|
|
// this is valid i guess. includes key, etc.
|
|
//m_titleRec = titleRec;
|
|
//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
|
|
//m_titleRecValid = true;
|
|
// . should we free m_cbuf on our reset/destruction?
|
|
// . no because doCOnsistencyCheck calls XmlDoc::set2 with a titleRec
|
|
// that should not be freed, besides the alloc size is not known!
|
|
//m_freeTitleRec = false;
|
|
|
|
int32_t titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
|
|
// . should we free m_cbuf on our reset/destruction?
|
|
// . no because doCOnsistencyCheck calls XmlDoc::set2 with a titleRec
|
|
// that should not be freed, besides the alloc size is not known!
|
|
m_titleRecBuf.setBuf ( titleRec ,
|
|
titleRecSize , // bufmax
|
|
titleRecSize , // bytes in use
|
|
false, // ownData?
|
|
csUTF8); // encoding
|
|
m_titleRecBufValid = true;
|
|
|
|
|
|
//m_coll = coll;
|
|
m_pbuf = pbuf;
|
|
m_niceness = niceness;
|
|
|
|
// . sanity check
|
|
// . NO! could be from XmlDoc::getMsg20Reply()!
|
|
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// it must be there!
|
|
if ( !titleRec||titleRecSize==0 ) {g_errno=ENOTFOUND; return false;}
|
|
|
|
// set our collection number
|
|
if ( ! setCollNum ( coll ) ) return false;
|
|
|
|
// store the whole rec, key+dataSize+data, in case it disappears.
|
|
if ( sreq ) {
|
|
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
|
|
m_sreqValid = true;
|
|
}
|
|
|
|
m_hashedTitle = false;
|
|
m_hashedMetas = false;
|
|
|
|
// save the compressed buffer in case we should free it when done
|
|
//m_titleRec = titleRec;
|
|
// should we free m_cbuf on our reset/destruction?
|
|
//m_freeTitleRec = true;
|
|
// our record may not occupy all of m_cbuf, careful
|
|
//m_titleRecAllocSize = maxSize;
|
|
|
|
// get a parse ptr
|
|
char *p = titleRec ;
|
|
// . this is just like a serialized RdbList key/dataSize/data of 1 rec
|
|
// . first thing is the key
|
|
// . key should have docId embedded in it
|
|
m_titleRecKey = *(key_t *) p ;
|
|
//m_titleRecKeyValid = true;
|
|
p += sizeof(key_t);
|
|
// bail on error
|
|
if ( (m_titleRecKey.n0 & 0x01) == 0x00 ) {
|
|
g_errno = EBADTITLEREC;
|
|
log("db: Titledb record is a negative key.");
|
|
char *xx=NULL; *xx=0;
|
|
return false;
|
|
}
|
|
// set m_docId from key
|
|
m_docId = g_titledb.getDocIdFromKey ( m_titleRecKey );
|
|
// validate that
|
|
m_docIdValid = true;
|
|
// then the size of the data that follows this
|
|
int32_t dataSize = *(int32_t *) p ;
|
|
p += 4;
|
|
// bail on error
|
|
if ( dataSize < 4 ) {
|
|
g_errno = EBADTITLEREC;
|
|
return log("db: Titledb record has size of %"INT32" which "
|
|
"is less then 4. Probable disk corruption in a "
|
|
"titledb file.",
|
|
dataSize);
|
|
}
|
|
// what is the size of cbuf/titleRec in bytes?
|
|
int32_t cbufSize = dataSize + 4 + sizeof(key_t);
|
|
// . the actual data follows "dataSize"
|
|
// . what's the size of the uncompressed compressed stuff below here?
|
|
m_ubufSize = *(int32_t *) p ; p += 4;
|
|
// . because of disk/network data corruption this may be wrong!
|
|
// . we can now have absolutely huge titlerecs...
|
|
if ( m_ubufSize <= 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
|
|
g_errno = EBADTITLEREC;
|
|
return log("db: TitleRec::set: uncompress uncompressed "
|
|
"size=%"INT32".",m_ubufSize );
|
|
}
|
|
// trying to uncompress corrupt titlerecs sometimes results in
|
|
// a seg fault... watch out
|
|
if ( m_ubufSize > 100*1024*1024 ) {
|
|
g_errno = EBADTITLEREC;
|
|
return log("db: TitleRec::set: uncompress uncompressed "
|
|
"size=%"INT32" > 100MB. unacceptable, probable "
|
|
"corruption.",m_ubufSize );
|
|
}
|
|
// make buf space for holding the uncompressed stuff
|
|
m_ubufAlloc = m_ubufSize;
|
|
m_ubuf = (char *) mmalloc ( m_ubufAlloc ,"TitleRecu1");
|
|
// log("xmldoc: m_ubuf=%"PTRFMT" this=%"PTRFMT
|
|
// , (PTRTYPE) m_ubuf
|
|
// , (PTRTYPE) this
|
|
// );
|
|
if ( ! m_ubuf ) {
|
|
// we had bad ubufsizes on gb6, like > 1GB print out key
|
|
// so we can manually make a titledb.dat file to delete these
|
|
// bad keys
|
|
log("build: alloc failed ubufsize=%"INT32" key.n1=%"UINT32" "
|
|
"n0=%"UINT64,
|
|
m_ubufAlloc,m_titleRecKey.n1,m_titleRecKey.n0);
|
|
return false;
|
|
}
|
|
// we need to loop since uncompress is wierd, sometimes it needs more
|
|
// space then it should. see how much it actually took.
|
|
int32_t realSize = m_ubufSize;
|
|
// time it
|
|
int64_t startTime = gettimeofdayInMilliseconds();
|
|
// debug msg
|
|
|
|
setStatus( "Uncompressing title rec." );
|
|
// . uncompress the data into m_ubuf
|
|
// . m_ubufSize should remain unchanged since we stored it
|
|
int err = gbuncompress ( (unsigned char *) m_ubuf ,
|
|
(uint32_t *) &realSize ,
|
|
(unsigned char *) p ,
|
|
(uint32_t ) (dataSize - 4) );
|
|
// hmmmm...
|
|
if ( err == Z_BUF_ERROR ) {
|
|
log("db: Buffer is too small to hold uncompressed "
|
|
"document. Probable disk corruption in a titledb file.");
|
|
g_errno = EUNCOMPRESSERROR;
|
|
return false;
|
|
}
|
|
// set g_errno and return false on error
|
|
if ( err != Z_OK ) {
|
|
g_errno = EUNCOMPRESSERROR;
|
|
return log("db: Uncompress of document failed. ZG_ERRNO=%i. "
|
|
"cbufSize=%"INT32" ubufsize=%"INT32" realSize=%"INT32"",
|
|
err , cbufSize , m_ubufSize , realSize );
|
|
}
|
|
if ( realSize != m_ubufSize ) {
|
|
g_errno = EBADENGINEER;
|
|
return log("db: Uncompressed document size is not what we "
|
|
"recorded it to be. Probable disk corruption in "
|
|
"a titledb file.");
|
|
}
|
|
// . add the stat
|
|
// . use white for the stat
|
|
g_stats.addStat_r ( 0 ,
|
|
startTime ,
|
|
gettimeofdayInMilliseconds(),
|
|
0x00ffffff );
|
|
|
|
// first 2 bytes in m_ubuf is the header size
|
|
int32_t headerSize = *(uint16_t *)m_ubuf;
|
|
|
|
int32_t shouldbe = (char *)&ptr_firstUrl - (char *)&m_headerSize;
|
|
|
|
if ( headerSize != shouldbe ) {
|
|
g_errno = ECORRUPTDATA;
|
|
return log("doc: bad header size in title rec");
|
|
}
|
|
|
|
// set our easy stuff
|
|
gbmemcpy ( (void *)this , m_ubuf , headerSize );
|
|
|
|
// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
|
|
// like in Msg.cpp and Msg20Reply.cpp
|
|
|
|
if ( m_pbuf ) {
|
|
int32_t crc = hash32(m_ubuf,headerSize);
|
|
m_pbuf->safePrintf("crchdr=0x%"XINT32" sizehdr=%"INT32", ",
|
|
crc,headerSize);
|
|
}
|
|
|
|
|
|
// point to the string data
|
|
char *up = m_ubuf + headerSize;
|
|
|
|
// end of the rec
|
|
char *upend = m_ubuf + m_ubufSize;
|
|
|
|
// how many XmlDoc::ptr_* members do we have? set "np" to that
|
|
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
|
|
np /= sizeof(char *);
|
|
|
|
// point to the first ptr
|
|
char **pd = (char **)&ptr_firstUrl;
|
|
// point to the first size
|
|
int32_t *ps = (int32_t *)&size_firstUrl;
|
|
|
|
// loop over them
|
|
for ( int32_t i = 0 ; i < np ; i++ , pd++ , ps++ ) {
|
|
// zero out the ith ptr_ and size_ member
|
|
*pd = 0;
|
|
*ps = 0;
|
|
// make the mask
|
|
uint32_t mask = 1 << i ;
|
|
// do we have this member? skip if not.
|
|
if ( ! (m_internalFlags1 & mask) ) continue;
|
|
// watch out for corruption
|
|
if ( up > upend ) {
|
|
g_errno = ECORRUPTDATA;
|
|
return log("doc: corrupt titlerec.");
|
|
}
|
|
// get the size
|
|
*ps = *(int32_t *)up;
|
|
// this should never be 0, otherwise, why was its flag set?
|
|
if ( *ps <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip over to point to data
|
|
up += 4;
|
|
// point to the data. could be 64-bit ptr.
|
|
*pd = up;//(int32_t)up;
|
|
// debug
|
|
if ( m_pbuf ) {
|
|
int32_t crc = hash32(up,*ps);
|
|
m_pbuf->safePrintf("crc%"INT32"=0x%"XINT32" size%"INT32"=%"INT32", ",
|
|
i,crc,i,*ps);
|
|
}
|
|
// skip over data
|
|
up += *ps;
|
|
// watch out for corruption
|
|
if ( up > upend ) {
|
|
g_errno = ECORRUPTDATA;
|
|
return log("doc: corrupt titlerec.");
|
|
}
|
|
}
|
|
// cap it
|
|
char *pend = m_ubuf + m_ubufSize;
|
|
// sanity check. must match exactly.
|
|
if ( up != pend ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set the urls i guess
|
|
m_firstUrl.set ( ptr_firstUrl );
|
|
if ( ptr_redirUrl ) {
|
|
m_redirUrl.set ( ptr_redirUrl );
|
|
m_currentUrl.set ( ptr_redirUrl );
|
|
m_currentUrlValid = true;
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
}
|
|
else {
|
|
m_currentUrl.set ( ptr_firstUrl );
|
|
m_currentUrlValid = true;
|
|
m_redirUrlPtr = NULL;
|
|
}
|
|
m_firstUrlValid = true;
|
|
m_redirUrlValid = true;
|
|
|
|
|
|
// convert 8 bit to a 32 bit
|
|
//m_numBannedOutlinks = score8to32 ( m_numBannedOutlinks8 );
|
|
|
|
// validate *shadow* members since bit flags cannot be returned
|
|
m_isRSS2 = m_isRSS;
|
|
m_isPermalink2 = m_isPermalink;
|
|
m_isAdult2 = m_isAdult;
|
|
m_spiderLinks2 = m_spiderLinks;
|
|
m_isContentTruncated2 = m_isContentTruncated;
|
|
m_isLinkSpam2 = m_isLinkSpam;
|
|
m_hasAddress2 = m_hasAddress;
|
|
m_hasTOD2 = m_hasTOD;
|
|
//m_hasSiteVenue2 = m_hasSiteVenue;
|
|
m_hasContactInfo2 = m_hasContactInfo;
|
|
//m_skipIndexingByte = m_skipIndexing;
|
|
m_isSiteRoot2 = m_isSiteRoot;
|
|
|
|
// these members are automatically validated
|
|
m_ipValid = true;
|
|
m_spideredTimeValid = true;
|
|
m_indexedTimeValid = true;
|
|
|
|
m_pubDateValid = true;
|
|
m_firstIndexedValid = true;
|
|
m_outlinksAddedDateValid = true;
|
|
m_charsetValid = true;
|
|
m_countryIdValid = true;
|
|
/*
|
|
m_titleWeightValid = true;
|
|
m_headerWeightValid = true;
|
|
m_urlPathWeightValid = true;
|
|
m_externalLinkTextWeightValid = true;
|
|
m_internalLinkTextWeightValid = true;
|
|
m_conceptWeightValid = true;
|
|
*/
|
|
|
|
// new stuff
|
|
m_siteNumInlinksValid = true;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
//m_sitePopValid = true;
|
|
m_rootLangIdValid = true;
|
|
m_hasContactInfoValid = true;
|
|
m_metaListCheckSum8Valid = true;
|
|
|
|
m_hopCountValid = true;
|
|
//m_numBannedOutlinksValid = true;
|
|
m_langIdValid = true;
|
|
m_contentTypeValid = true;
|
|
m_isRSSValid = true;
|
|
m_isPermalinkValid = true;
|
|
m_isAdultValid = true;
|
|
//m_eliminateMenusValid = true;
|
|
m_spiderLinksValid = true;
|
|
m_isContentTruncatedValid = true;
|
|
m_isLinkSpamValid = true;
|
|
m_hasAddressValid = true;
|
|
m_tagRecDataValid = true;
|
|
m_gigabitHashesValid = true;
|
|
m_contentHash32Valid = true;
|
|
//m_tagHash32Valid = true;
|
|
m_tagPairHash32Valid = true;
|
|
m_adVectorValid = true;
|
|
m_wikiDocIdsValid = true;
|
|
m_imageDataValid = true;
|
|
m_catIdsValid = true;
|
|
m_indCatIdsValid = true;
|
|
// ptr_dmozTitles/Summs/Anchors valid:
|
|
m_dmozInfoValid = true;
|
|
m_utf8ContentValid = true;
|
|
//m_sectionsReplyValid = true;
|
|
//m_sectionsVotesValid = true;
|
|
//m_addressReplyValid = true;
|
|
m_siteValid = true;
|
|
m_linkInfo1Valid = true;
|
|
m_linkInfo2Valid = true;
|
|
m_versionValid = true;
|
|
m_httpStatusValid = true;
|
|
m_crawlDelayValid = true;
|
|
//m_sectiondbDataValid = true;
|
|
//m_placedbDataValid = true;
|
|
//m_clockCandidatesDataValid = true;
|
|
//m_skipIndexingValid = true;
|
|
m_isSiteRootValid = true;
|
|
|
|
// ptr_linkInfo2 is valid. so getDiffbotTitleHashes() works.
|
|
m_diffbotTitleHashBufValid = true;
|
|
|
|
// set "m_oldTagRec" from ptr_tagRecData
|
|
//gbmemcpy ( &m_oldTagRec , ptr_tagRecData , size_tagRecData );
|
|
//m_oldTagRecValid = true;
|
|
|
|
// there was no issue indexing it...
|
|
m_indexCode = 0;
|
|
m_indexCodeValid = true;
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
|
|
// stop core when importing and calling getNewSpiderReply()
|
|
m_downloadEndTime = m_spideredTime;
|
|
m_downloadEndTimeValid = true;
|
|
|
|
// make a copy for new tag rec too, this one we modify
|
|
//gbmemcpy ( &m_newTagRec , ptr_tagRecData , size_tagRecData );
|
|
|
|
// set "m_siteNumInlinks" from m_oldTagRec
|
|
//Tag *tag = m_oldTagRec.getTag("sitenuminlinks");
|
|
// must always be there!
|
|
//if ( ! tag ) { char *xx=NULL;*xx=0; }
|
|
// must be null terminated
|
|
//if ( tag->getTagData()[tag->getTagData()Size-1] != 0 ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// grab that
|
|
//m_siteNumInlinks = atol(tag->getTagData());
|
|
//m_siteNumInlinksValid = true;
|
|
// must not be negative
|
|
if ( m_siteNumInlinks < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set m_hasContactInfo in case someone calls ::getHasContactInfo()
|
|
// which will do a bunch of parsing!!
|
|
//tag = m_oldTagRec.getTag ("hascontactinfo");
|
|
//if ( tag ) m_hasContactInfo = true;
|
|
//else m_hasContactInfo = false;
|
|
//m_hasContactInfoValid = true;
|
|
|
|
// sanity check. if m_siteValid is true, this must be there
|
|
if ( ! ptr_site ) {
|
|
log("set2: ptr_site is null for docid %"INT64"",m_docId);
|
|
//char *xx=NULL;*xx=0; }
|
|
g_errno = ECORRUPTDATA;
|
|
return false;
|
|
}
|
|
|
|
// lookup the tagdb rec fresh if setting for a summary. that way we
|
|
// can see if it is banned or not
|
|
//if ( m_req ) m_tagRecDataValid = false;
|
|
|
|
// debug thing
|
|
ptr_sectiondbData = NULL;
|
|
size_sectiondbData = 0;
|
|
|
|
// set m_sections.m_nsvt from data. ptr_sectiondbData is the m_osvt
|
|
// serialized, which is from our read of sectiondb at the time we
|
|
// indexed it. but now that we may have nulled out our content to
|
|
// save space in titledb because m_skipIndexing is true, then we have
|
|
// to save our votes as well, BUT, only if we skipped indexing.
|
|
// and not allowed to serialize UNLESS we skipped because
|
|
// that would waste space as well
|
|
//if (! m_skipIndexing && size_sectionsVotes ) { char *xx=NULL;*xx=0; }
|
|
|
|
// success, return true then
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::setFirstUrl ( char *u , bool addWWW , Url *baseUrl ) {
|
|
|
|
m_firstUrl.reset();
|
|
m_currentUrl.reset();
|
|
|
|
m_firstUrlValid = true;
|
|
|
|
// sanity check. "u" must be normalized
|
|
//if ( strncmp(u,"http",4 ) != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// assume the url is not in the correct format
|
|
ptr_firstUrl = NULL;
|
|
size_firstUrl = 0;
|
|
|
|
if ( ! u || ! u[0] ) {
|
|
//if ( ! m_indexCode ) m_indexCode = EBADURL;
|
|
return true;
|
|
}
|
|
|
|
//if ( gbstrlen (u) + 1 > MAX_URL_LEN )
|
|
// m_indexCode = EURLTOOLONG;
|
|
|
|
m_firstUrl.set ( baseUrl , u , gbstrlen(u) , addWWW ) ;
|
|
|
|
// it is the active url
|
|
m_currentUrl.set ( &m_firstUrl , false );
|
|
m_currentUrlValid = true;
|
|
|
|
// set this to the normalized url
|
|
ptr_firstUrl = m_firstUrl.getUrl();
|
|
size_firstUrl = m_firstUrl.getUrlLen() + 1;
|
|
|
|
// is it a link loop?
|
|
//if ( m_firstUrl.isLinkLoop() ) {
|
|
// if ( ! m_indexCode ) m_indexCode = ELINKLOOP;
|
|
// return true;
|
|
//}
|
|
// is it illegal?
|
|
//if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
|
|
// if ( ! m_indexCode ) m_indexCode = EBADURL;
|
|
// return true;
|
|
//}
|
|
|
|
// check if url has porn words in it
|
|
//if ( cr->m_doUrlSpamCheck && m_firstUrl.isSpam() ) {
|
|
// if ( ! m_indexCode ) m_indexCode = EDOCURLSPAM;
|
|
// return true;
|
|
//}
|
|
|
|
return true;
|
|
}
|
|
|
|
//CollectionRec *XmlDoc::getCollRec ( ) {
|
|
// return g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
|
|
//}
|
|
|
|
//bool XmlDoc::setRedirUrl ( char *u , bool addWWW ) {
|
|
// m_redirUrl.set ( u , gbstrlen(u) , addWWW );
|
|
// ptr_redirUrl = m_redirUrl.getUrl();
|
|
// size_redirUrl = m_redirUrl.getUrlLen()+1;
|
|
// return true;
|
|
//}
|
|
|
|
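// record a short human-readable status string for this doc. when build
// debug/time logging is enabled this also logs how long we spent in the
// previous status (tracked via s_lastTimeStart) to help spot slow phases.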
void XmlDoc::setStatus ( char *s ) {
|
|
m_statusMsg = s;
|
|
m_statusMsgValid = true;
|
|
static char *s_last = NULL;
|
|
|
|
if ( s == s_last ) return;
|
|
|
|
bool timeIt = false;
|
|
// if ( m_sreqValid &&
|
|
// m_sreq.m_isInjecting &&
|
|
// m_sreq.m_isPageInject )
|
|
// timeIt = true;
|
|
if ( g_conf.m_logDebugBuildTime )
|
|
timeIt = true;
|
|
|
|
// log times to detect slowness
|
|
if ( timeIt ) {
|
|
int64_t now = gettimeofdayInMillisecondsLocal();
|
|
if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now;
|
|
int32_t took = now - s_lastTimeStart;
|
|
//if ( took > 100 )
|
|
log("xmldoc: %s (xd=0x%"PTRFMT" "
|
|
"u=%s) took %"INT32"ms",
|
|
s_last ? s_last : "(none)",
|
|
(PTRTYPE)this,
|
|
m_firstUrl.m_url,
|
|
took);
|
|
s_lastTimeStart = now;
|
|
}
|
|
|
|
s_last = s;
|
|
|
|
bool logIt = g_conf.m_logDebugBuild;
|
|
// CollectionRec *cr = NULL;
|
|
// if ( m_collnumValid )
|
|
// cr = g_collectiondb.m_recs[m_collnum];
|
|
// if ( cr &&
|
|
// cr->m_coll &&
|
|
// cr->m_coll[0] == 'c' &&
|
|
// cr->m_coll[1] == 'r' &&
|
|
// strncmp(cr->m_coll,"crawlbottesting-",16) == 0 )
|
|
// logIt = true;
|
|
|
|
if ( ! logIt ) return;
|
|
//return;
|
|
if ( m_firstUrlValid )
|
|
logf(LOG_DEBUG,"build: status = %s for %s (this=0x%"PTRFMT")",
|
|
s,m_firstUrl.m_url,(PTRTYPE)this);
|
|
else
|
|
logf(LOG_DEBUG,"build: status = %s for docId %"INT64" "
|
|
"(this=0x%"PTRFMT")",
|
|
s,m_docId, (PTRTYPE)this);
|
|
}
|
|
|
|
// caller must now call XmlDoc::setCallback()
|
|
void XmlDoc::setCallback ( void *state, void (* callback) (void *state) ) {
|
|
m_state = state;
|
|
m_callback1 = callback;
|
|
// add this additional state==this constraint to prevent core when
|
|
// doing a page parser
|
|
if ( state == this &&
|
|
// i don't remember why i added this sanity check...
|
|
callback == getMetaListWrapper ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) {
|
|
m_state = state;
|
|
m_callback2 = callback;
|
|
}
|
|
|
|
// . similar to XmlDoc::indexDoc() but just adds m_firstUrl to spiderdb
|
|
// . used by PageAddUrl.cpp
|
|
/*
|
|
bool XmlDoc::addToSpiderdb ( ) {
|
|
// set a flag
|
|
m_isAddUrl = true;
|
|
// url must be valid
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
// do not add if something wrong with url
|
|
if ( m_indexCode ) return true;
|
|
// this should just add to spiderdb because m_isAddUrl is true
|
|
return indexDoc(false,false,false,false,true,false);
|
|
}
|
|
*/
|
|
|
|
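// callback used when indexDoc() blocks: re-enter indexDoc() and, once it
// finally completes, fire the caller's callback (m_callback1/m_callback2)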
void indexDocWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in index doc wrapper" );
|
|
// return if it blocked
|
|
if ( ! THIS->indexDoc( ) ) return;
|
|
// otherwise, all done, call the caller callback
|
|
|
|
// g_statsdb.addStat ( MAX_NICENESS,
|
|
// "docs_indexed",
|
|
// 20,
|
|
// 21,
|
|
// );
|
|
|
|
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// for registerSleepCallback
|
|
void indexDocWrapper2 ( int fd , void *state ) {
|
|
indexDocWrapper ( state );
|
|
}
|
|
|
|
// . inject from http request
|
|
// . replace more of Msg7.cpp logic with this?
|
|
//bool XmlDoc::injectDoc ( HttpRequest *hr ) {
|
|
//}
|
|
|
|
// . the highest level function in here
|
|
// . user is requesting to inject this url
|
|
// . returns false if blocked and your callback will be called when done
|
|
// . returns true and sets g_errno on error
|
|
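// a rough usage sketch (names and argument values below are illustrative
// only, not canonical defaults):
//
//   XmlDoc *xd = new XmlDoc();
//   xd->injectDoc ( "http://example.com/" , cr , htmlContent ,
//                   NULL ,            // diffbotReply
//                   false ,           // contentHasMimeArg
//                   0 ,               // hopCount
//                   -1 ,              // charset (unknown)
//                   false ,           // deleteUrl
//                   "text/html" ,     // contentTypeStr
//                   false ,           // spiderLinks
//                   0 ,               // newOnly
//                   state , doneCallback ,
//                   0 , 0 ,           // firstIndexed , lastSpidered
//                   0 ,               // injectDocIp
//                   NULL , NULL , 0 , // contentDelim , metadata , metadataLen
//                   -1 );             // payloadLen
//   // returns false if it blocked; doneCallback(state) runs when finished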
bool XmlDoc::injectDoc ( char *url ,
|
|
CollectionRec *cr ,
|
|
char *content ,
|
|
char *diffbotReply, // usually null
|
|
bool contentHasMimeArg ,
|
|
int32_t hopCount,
|
|
int32_t charset,
|
|
|
|
bool deleteUrl,
|
|
char *contentTypeStr, // text/html application/json
|
|
bool spiderLinks ,
|
|
char newOnly, // index iff new
|
|
|
|
void *state,
|
|
void (*callback)(void *state) ,
|
|
|
|
uint32_t firstIndexed,
|
|
uint32_t lastSpidered ,
|
|
int32_t injectDocIp ,
|
|
char *contentDelim,
|
|
char *metadata,
|
|
uint32_t metadataLen,
|
|
int32_t payloadLen
|
|
) {
|
|
|
|
|
|
// wait until we are synced with host #0
|
|
if ( ! isClockInSync() ) {
|
|
log("xmldocl: got injection request but clock not yet "
|
|
"synced with host #0");
|
|
g_errno = ETRYAGAIN;//CLOCKNOTSYNCED;
|
|
return true;
|
|
}
|
|
|
|
// normalize url
|
|
Url uu;
|
|
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
|
|
// which has no www.tmblr.co IP!
|
|
uu.set(url,gbstrlen(url),false);//true);
|
|
|
|
// if (!strncmp(url , "http://www.focusinfo.com/products/mxprodv" ,40))
|
|
// log("hey");
|
|
|
|
|
|
// remove >'s i guess and store in st1->m_url[] buffer
|
|
char cleanUrl[MAX_URL_LEN+1];
|
|
cleanInput ( cleanUrl,
|
|
MAX_URL_LEN,
|
|
uu.getUrl(),
|
|
uu.getUrlLen() );
|
|
|
|
|
|
int32_t contentType = CT_UNKNOWN;
|
|
if ( contentTypeStr && contentTypeStr[0] )
|
|
contentType = getContentTypeFromStr(contentTypeStr);
|
|
|
|
// use CT_HTML if contentTypeStr is empty or blank. default
|
|
if ( ! contentTypeStr || ! contentTypeStr[0] )
|
|
contentType = CT_HTML;
|
|
|
|
// this can go on the stack since set4() copies it
|
|
SpiderRequest sreq;
|
|
sreq.setFromInject ( cleanUrl );
|
|
|
|
if ( lastSpidered )
|
|
sreq.m_addedTime = lastSpidered;
|
|
|
|
if ( deleteUrl )
|
|
sreq.m_forceDelete = 1;
|
|
|
|
//static char s_dummy[3];
|
|
// sometimes the content is indeed NULL...
|
|
//if ( newOnly && ! content ) {
|
|
// // don't let it be NULL because then xmldoc will
|
|
// // try to download the page!
|
|
// s_dummy[0] = '\0';
|
|
// content = s_dummy;
|
|
// //char *xx=NULL;*xx=0; }
|
|
//}
|
|
|
|
// . use the enormous power of our new XmlDoc class
|
|
// . this returns false with g_errno set on error
|
|
if ( ! set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// from PageInject.cpp:
|
|
// give it a niceness of 1, we have to be
|
|
// careful since we are a niceness of 0!!!!
|
|
1, // niceness, // 1 ,
|
|
// inject this content
|
|
content ,
|
|
deleteUrl, // false, // deleteFromIndex ,
|
|
injectDocIp, // 0,//forcedIp ,
|
|
contentType ,
|
|
lastSpidered,//lastSpidered override
|
|
contentHasMimeArg ,
|
|
contentDelim,
|
|
metadata,
|
|
metadataLen,
|
|
payloadLen
|
|
)) {
|
|
// g_errno should be set if that returned false
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// a diffbot reply? should be in json
|
|
if ( diffbotReply ) {
|
|
if ( ! m_diffbotReply.safeStrcpy(diffbotReply) )
|
|
return true;
|
|
// it was injected so assume no error
|
|
m_diffbotReplyError = 0;
|
|
m_diffbotReplyValid = true;
|
|
}
|
|
|
|
//m_doConsistencyTesting = doConsistencyTesting;
|
|
|
|
// . set xd from the old title rec if recycle is true
|
|
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
|
//if ( recycleContent ) m_recycleContent = true;
|
|
|
|
// othercrap. used for importing from titledb of another coll/cluster.
|
|
if ( firstIndexed ) {
|
|
m_firstIndexedDate = firstIndexed;
|
|
m_firstIndexedDateValid = true;
|
|
}
|
|
|
|
if ( lastSpidered ) {
|
|
m_spideredTime = lastSpidered;
|
|
m_spideredTimeValid = true;
|
|
}
|
|
|
|
if ( hopCount != -1 ) {
|
|
m_hopCount = hopCount;
|
|
m_hopCountValid = true;
|
|
}
|
|
|
|
// PageInject calls memset on gigablastrequest so add '!= 0' here
|
|
if ( charset != -1 && charset != csUnknown && charset != 0 ) {
|
|
m_charset = charset;
|
|
m_charsetValid = true;
|
|
}
|
|
|
|
// avoid looking up ip of each outlink to add "firstip" tag to tagdb
|
|
// because that can be slow!!!!!!!
|
|
m_spiderLinks = spiderLinks;
|
|
m_spiderLinks2 = spiderLinks;
|
|
m_spiderLinksValid = true;
|
|
|
|
// . newOnly is true --> do not inject if document is already indexed!
|
|
// . maybe just set indexCode
|
|
m_newOnly = newOnly;
|
|
|
|
// do not re-lookup the robots.txt
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
m_crawlDelay = -1; // unknown
|
|
m_crawlDelayValid = true;
|
|
|
|
|
|
m_isInjecting = true;
|
|
m_isInjectingValid = true;
|
|
|
|
// set this now
|
|
//g_inPageInject = true;
|
|
|
|
// log it now
|
|
//log("inject: indexing injected doc %s",cleanUrl);
|
|
|
|
// make this our callback in case something blocks
|
|
setCallback ( state , callback );
|
|
|
|
// . now tell it to index
|
|
// . this returns false if blocked
|
|
// . eventually it will call "callback" when done if it blocks
|
|
bool status = indexDoc ( );
|
|
if ( ! status ) return false;
|
|
|
|
// log it. i guess only for errors when it does not block?
|
|
// because xmldoc.cpp::indexDoc calls logIt()
|
|
if ( status ) logIt();
|
|
|
|
|
|
|
|
// undo it
|
|
//g_inPageInject = false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// XmlDoc::injectDoc uses a fake spider request so we have to add
|
|
// a real spider request into spiderdb so that the injected doc can
|
|
// be spidered again in the future by the spidering process, otherwise,
|
|
// injected docs can never be re-spidered. they would end up having
|
|
// a SpiderReply in spiderdb but no matching SpiderRequest to go with it.
|
|
void XmlDoc::getRevisedSpiderRequest ( SpiderRequest *revisedReq ) {
|
|
|
|
if ( ! m_sreqValid ) { char *xx=NULL; *xx=0; }
|
|
|
|
// we are doing this because it has a fake first ip
|
|
if ( ! m_sreq.m_fakeFirstIp ) { char *xx=NULL;*xx=0; }
|
|
|
|
// copy it over from our current spiderrequest
|
|
gbmemcpy ( revisedReq , &m_sreq , m_sreq.getRecSize() );
|
|
|
|
// this must be valid for us of course
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// wtf? it might be invalid!!! parent caller will handle it...
|
|
//if ( m_firstIp == 0 || m_firstIp == -1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store the real ip in there now
|
|
revisedReq->m_firstIp = m_firstIp;
|
|
|
|
// but turn off this flag! the whole point of all this...
|
|
revisedReq->m_fakeFirstIp = 0;
|
|
|
|
// re-make the key since it contains m_firstIp
|
|
int64_t uh48 = m_sreq.getUrlHash48();
|
|
int64_t parentDocId = m_sreq.getParentDocId();
|
|
|
|
// set the key properly to reflect the new "first ip" since
|
|
// we shard spiderdb by that.
|
|
revisedReq->m_key = g_spiderdb.makeKey ( m_firstIp,
|
|
uh48,
|
|
true, // is request?
|
|
parentDocId ,
|
|
false );// isDel );
|
|
revisedReq->setDataSize();
|
|
}
|
|
|
|
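// rebuild a SpiderRequest from scratch using this doc's current state
// (first ip, site inlink count, hop count, first url); parent-related
// fields are simply zeroed out since the original request is not available.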
void XmlDoc::getRebuiltSpiderRequest ( SpiderRequest *sreq ) {
|
|
|
|
// memset 0
|
|
sreq->reset();
|
|
|
|
// assume not valid
|
|
sreq->m_siteNumInlinks = -1;
|
|
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// how many site inlinks?
|
|
sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
sreq->m_siteNumInlinksValid = true;
|
|
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set other fields besides key
|
|
sreq->m_firstIp = m_firstIp;
|
|
sreq->m_hostHash32 = m_hostHash32a;
|
|
//sreq->m_domHash32 = m_domHash32;
|
|
//sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
//sreq->m_pageNumInlinks = m_pageNumInlinks;
|
|
sreq->m_hopCount = m_hopCount;
|
|
|
|
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
|
|
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
|
|
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
|
|
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
|
|
|
|
Url *fu = getFirstUrl();
|
|
|
|
sreq->m_isNewOutlink = 0;
|
|
sreq->m_isAddUrl = 0;//m_isAddUrl;
|
|
sreq->m_isPingServer = fu->isPingServer();
|
|
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
|
|
|
|
// transcribe from old spider rec, stuff should be the same
|
|
sreq->m_addedTime = m_firstIndexedDate;
|
|
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
|
|
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
|
|
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
|
|
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
|
|
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
|
|
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
|
|
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
|
|
|
|
// validate the stuff so getUrlFilterNum() acks it
|
|
sreq->m_hopCountValid = 1;
|
|
|
|
// we need this now for ucp ucr upp upr new url filters that do
|
|
// substring matching on the url
|
|
if ( m_firstUrlValid )
|
|
strcpy(sreq->m_url,m_firstUrl.m_url);
|
|
|
|
// re-make the key since it contains m_firstIp
|
|
int64_t uh48 = fu->getUrlHash48();
|
|
// set the key properly to reflect the new "first ip"
|
|
// since we shard spiderdb by that.
|
|
sreq->m_key = g_spiderdb.makeKey ( m_firstIp,//ip,
|
|
uh48,
|
|
true,//is req?
|
|
0LL, // parentDocId ,
|
|
false );//isDel
|
|
sreq->setDataSize();
|
|
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////
|
|
// THIS IS THE HEART OF HOW THE PARSER ADDS TO THE RDBS
|
|
////////////////////////////////////////////////////////////////////
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error and returns true
|
|
// . this is now a WRAPPER for indexDoc2() and it will deal with
|
|
// g_errnos by adding an error spider reply so we offload the
|
|
// logic to the url filters table
|
|
bool XmlDoc::indexDoc ( ) {
|
|
|
|
// return from the msg4.addMetaList() below?
|
|
if ( m_msg4Launched ) {
|
|
// must have been waiting
|
|
if ( ! m_msg4Waiting ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// return true with g_errno set on error
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = indexDocWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// do not index if already indexed and we are importing
|
|
// from the code in PageInject.cpp from a foreign titledb file
|
|
if ( m_isImporting && m_isImportingValid ) {
|
|
char *isIndexed = getIsIndexed();
|
|
if ( ! isIndexed ) {
|
|
log("import: import had error: %s",mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
if ( isIndexed == (char *)-1)
|
|
return false;
|
|
if ( *isIndexed ) {
|
|
log("import: skipping import for %s. already indexed.",
|
|
m_firstUrl.getUrl());
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// . even if not using diffbot, keep track of these counts
|
|
// . even if we had something like EFAKEFIRSTIP, OOM, or whatever
|
|
// it was an attempt we made to crawl this url
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
! m_incrementedAttemptsCount ) {
|
|
// do not repeat
|
|
m_incrementedAttemptsCount = true;
|
|
// log debug
|
|
//log("build: attempted %s count=%"INT64"",m_firstUrl.getUrl(),
|
|
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
|
|
// this is just how many urls we tried to index
|
|
//cr->m_localCrawlInfo.m_urlsConsidered++;
|
|
// avoid counting if it is a fake first ip
|
|
bool countIt = true;
|
|
// pagereindex.cpp sets this as does any add url (bulk job)
|
|
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
|
|
countIt = false;
|
|
if ( countIt ) {
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
}
|
|
// need to save collection rec now during auto save
|
|
cr->m_needsSave = true;
|
|
// update this just in case we are the last url crawled
|
|
//int64_t now = gettimeofdayInMillisecondsGlobal();
|
|
//cr->m_diffbotCrawlEndTime = now;
|
|
}
|
|
|
|
|
|
bool status = true;
|
|
|
|
if ( ! g_errno ) status = indexDoc2 ( );
|
|
|
|
// blocked?
|
|
if ( ! status ) return false;
|
|
|
|
// done with no error?
|
|
bool success = true;
|
|
if ( g_errno ) success = false;
|
|
// if we were trying to spider a fakefirstip request then
|
|
// pass through because we lookup the real firstip below and
|
|
// add a new request as well as a reply for this one
|
|
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) success = false;
|
|
|
|
if ( success ) return true;
|
|
|
|
// . ignore failed child docs like diffbot pages
|
|
// . they are getting EMALFORMEDSECTIONS
|
|
if ( m_isChildDoc ) {
|
|
log("build: done indexing child doc. error=%s. not adding "
|
|
"spider reply for %s",
|
|
mstrerror(g_errno),
|
|
m_firstUrl.m_url);
|
|
return true;
|
|
}
|
|
|
|
///
|
|
// otherwise, an internal error. we must add a SpiderReply
|
|
// to spiderdb to release the lock.
|
|
///
|
|
|
|
logErr:
|
|
|
|
if ( m_firstUrlValid && g_errno )
|
|
log("build: %s had internal error = %s. adding spider "
|
|
"error reply.",
|
|
m_firstUrl.m_url,mstrerror(g_errno));
|
|
else if ( g_errno )
|
|
log("build: docid=%"INT64" had internal error = %s. "
|
|
"adding spider error reply.",
|
|
m_docId,mstrerror(g_errno));
|
|
|
|
// seems like this was causing a core somehow...
|
|
if ( g_errno == ENOMEM )
|
|
return true;
|
|
|
|
// and do not add spider reply if shutting down the server
|
|
if ( g_errno == ESHUTTINGDOWN )
|
|
return true;
|
|
|
|
// i saw this on shard 9, how is it happening
|
|
if ( g_errno == EBADRDBID )
|
|
return true;
|
|
|
|
// if docid not found when trying to do a query reindex...
|
|
// this really shouldn't happen but i think we were adding
|
|
// additional SpiderRequests since we were using a fake first ip.
|
|
// but i have since fixed that code. so if the titlerec was not
|
|
// found when trying to do a force delete... it's not a temporary
|
|
// error and should not be retried. if we set indexCode to
|
|
// EINTERNALERROR it seems to be retried.
|
|
if ( g_errno == ENOTFOUND ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// this should not be retried either. i am seeing it excessively
|
|
// retried from a
|
|
// "TitleRec::set: uncompress uncompressed size=-2119348471"
|
|
// error condition. it also said
|
|
// "Error spidering for doc http://www.... : Bad cached document"
|
|
if ( g_errno == EBADTITLEREC ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// i've seen Multicast got error in reply from hostId 19 (msgType=0x22
|
|
// transId=496026 nice=1 net=default): Buf too small.
|
|
// so fix that with this
|
|
if ( g_errno == EBUFTOOSMALL ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
if ( g_errno == EBADURL ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
if ( g_errno == ENOTITLEREC ) {
|
|
m_indexCode = g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// default to internal error which will be retried forever otherwise
|
|
if ( ! m_indexCodeValid ) {
|
|
m_indexCode = EINTERNALERROR;//g_errno;
|
|
m_indexCodeValid = true;
|
|
}
|
|
|
|
// if our spiderrequest had a fake "firstip" so that it could be
|
|
// injected quickly into spiderdb, then do the firstip lookup here
|
|
// and re-add the new spider request with that, and add the reply
|
|
// to the fake firstip request below.
|
|
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) {
|
|
// at least get this if possible
|
|
int32_t *fip = getFirstIp();
|
|
if ( fip == (void *) -1 ) return false;
|
|
// error? g_errno will be changed if this is NULL
|
|
if ( ! fip ) {
|
|
log("build: error getting real firstip: %s",
|
|
mstrerror(g_errno));
|
|
m_indexCode = EINTERNALERROR;
|
|
m_indexCodeValid = true;
|
|
goto logErr;
|
|
}
|
|
// sanity log
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
// sanity log
|
|
if ( *fip == 0 || *fip == -1 ) {
|
|
//
|
|
// now add a spider status doc for this so we know
|
|
// why a crawl might have failed to start
|
|
//
|
|
SafeBuf *ssDocMetaList = NULL;
|
|
// save this
|
|
int32_t saved = m_indexCode;
|
|
// make it the real reason for the spider status doc
|
|
m_indexCode = EDNSERROR;
|
|
// get the spiderreply ready to be added. false=del
|
|
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
|
|
// revert
|
|
m_indexCode = saved;
|
|
// error?
|
|
if ( ! ssDocMetaList ) return true;
|
|
// blocked?
|
|
if ( ssDocMetaList == (void *)-1 ) return false;
|
|
// need to alloc space for it too
|
|
char *list = ssDocMetaList->getBufStart();
|
|
int32_t len = ssDocMetaList->length();
|
|
//needx += len;
|
|
// this too
|
|
m_addedStatusDocSize = len;
|
|
m_addedStatusDocSizeValid = true;
|
|
|
|
char *url = "unknown";
|
|
if ( m_sreqValid ) url = m_sreq.m_url;
|
|
log("build: error2 getting real firstip of "
|
|
"%"INT32" for "
|
|
"%s. Not adding new spider req. "
|
|
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
|
|
m_addedStatusDocSize);
|
|
// also count it as a crawl attempt
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
|
|
if ( ! m_metaList2.safeMemcpy ( list , len ) )
|
|
return true;
|
|
|
|
goto skipNewAdd1;
|
|
}
|
|
// store the new request (store reply for this below)
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
if ( ! m_metaList2.pushChar(rd) )
|
|
return true;
|
|
// store it here
|
|
SpiderRequest revisedReq;
|
|
// this fills it in
|
|
getRevisedSpiderRequest ( &revisedReq );
|
|
// and store that new request for adding
|
|
if ( ! m_metaList2.safeMemcpy (&revisedReq,
|
|
revisedReq.getRecSize()))
|
|
return true;
|
|
// make sure to log the size of the spider request
|
|
m_addedSpiderRequestSize = revisedReq.getRecSize();
|
|
m_addedSpiderRequestSizeValid = true;
|
|
}
|
|
|
|
skipNewAdd1:
|
|
|
|
SpiderReply *nsr = NULL;
|
|
|
|
// if only rebuilding posdb do not rebuild spiderdb
|
|
if ( m_useSpiderdb && ! m_addedSpiderReplySizeValid ) {
|
|
|
|
////
|
|
//
|
|
// make these fake so getNewSpiderReply() below does not block
|
|
//
|
|
////
|
|
nsr = getFakeSpiderReply ( );
|
|
// this can be NULL and g_errno set to ENOCOLLREC or something
|
|
if ( ! nsr )
|
|
return true;
|
|
|
|
//SafeBuf metaList;
|
|
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
if ( ! m_metaList2.pushChar( rd ) )
|
|
return true;
|
|
|
|
if ( ! m_metaList2.safeMemcpy ( (char *)nsr,nsr->getRecSize()))
|
|
return true;
|
|
|
|
m_addedSpiderReplySize = nsr->getRecSize();
|
|
m_addedSpiderReplySizeValid = true;
|
|
}
|
|
|
|
// for other errors like EBADTITLEREC we are not adding spider
|
|
// status docs, so add them here
|
|
/*
|
|
if ( ! m_addedStatusDocSizeValid ) {
|
|
SafeBuf *ssDocMetaList = NULL;
|
|
// if calling getSpiderStatusDocMetaList blocks then
|
|
// call addErrorStuffWrapper() to call msg4
|
|
//m_masterLoop = addErrorStuffWrapper();
|
|
//m_state = this;
|
|
// this uses m_indexCode to set it
|
|
// if this blocks it ends up calling m_masterLoop and
|
|
// re-entering this function with g_errno clear possibly
|
|
// so do we make it back here????? MDW
|
|
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);
|
|
// error?
|
|
if ( ! ssDocMetaList ) return true;
|
|
// blocked?
|
|
if ( ssDocMetaList == (void *)-1 ) return false;
|
|
// need to alloc space for it too
|
|
char *list = ssDocMetaList->getBufStart();
|
|
int32_t len = ssDocMetaList->length();
|
|
// this too
|
|
m_addedStatusDocSize = len;
|
|
m_addedStatusDocSizeValid = true;
|
|
// also count it as a crawl attempt
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
if ( ! m_metaList2.safeMemcpy ( list , len ) )
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
m_msg4Launched = true;
|
|
|
|
// display the url that had the error
|
|
logIt();
|
|
|
|
// log this for debug now
|
|
if ( nsr ) {
|
|
SafeBuf tmp;
|
|
nsr->print(&tmp);
|
|
log("xmldoc: added reply %s",tmp.getBufStart());
|
|
}
|
|
|
|
// clear g_errno
|
|
g_errno = 0;
|
|
|
|
// "cr" might have been deleted by calling indexDoc() above i think
|
|
// so use collnum here, not "cr"
|
|
if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() ,
|
|
m_metaList2.length() ,
|
|
m_collnum,//cr->m_coll ,
|
|
m_masterState , // state
|
|
m_masterLoop ,
|
|
m_niceness ) ) {
|
|
// spider hang bug
|
|
//if ( g_conf.m_testSpiderEnabled )
|
|
// logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
|
|
// "msg4=0x%"XINT32"" ,(int32_t)&m_msg4);
|
|
m_msg4Waiting = true;
|
|
return false;
|
|
}
|
|
|
|
//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );
|
|
|
|
m_msg4Launched = false;
|
|
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error and returns true
|
|
bool XmlDoc::indexDoc2 ( ) {
|
|
|
|
if ( g_isYippy ) return true;
|
|
|
|
// if anything blocks, this will be called when it comes back
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = indexDocWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
|
|
// do this before we increment pageDownloadAttempts below so that
|
|
// john's smoke tests, which use those counts, are not affected
|
|
if ( m_sreqValid &&
|
|
m_sreq.m_fakeFirstIp &&
|
|
// only do for add url, not for injects. injects expect
|
|
// the doc to be indexed while the browser waits. add url
|
|
// is really just adding the spider request and returning
|
|
// to the browser without delay.
|
|
! m_sreq.m_isInjecting &&
|
|
// not for page reindexes either!
|
|
! m_sreq.m_isPageReindex &&
|
|
// just add url
|
|
m_sreq.m_isAddUrl &&
|
|
// diffbot requests are ok though!
|
|
! strstr(m_sreq.m_url,"-diffbotxyz") ) {
|
|
m_indexCodeValid = true;
|
|
m_indexCode = EFAKEFIRSTIP;
|
|
return true;
|
|
}
|
|
|
|
|
|
// ensure that CollectionRec::m_globalCrawlInfo (spider stats)
|
|
// is at least 1 minute in sync with counts of
|
|
// all hosts in network. this returns false if it sent out requests
|
|
// to update the counts from all the hosts in the network, and
|
|
// when it updates CollectionRec::m_crawlInfoGlobal with all the
|
|
// requests from each hosts in the network it will call the
|
|
// specified callback, m_masterLoop with m_masterState. this code
|
|
// is all in Spider.cpp.
|
|
// this is now in a sleep wrapper in spider.cpp.
|
|
//setStatus ( "updating crawl info" );
|
|
//if ( ! g_errno &&
|
|
// ! updateCrawlInfo ( cr , m_masterState , m_masterLoop ) )
|
|
// return false;
|
|
|
|
|
|
// MDW: we do this in indexDoc() above why do we need it here?
|
|
/*
|
|
// even if not using diffbot, keep track of these counts
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
! m_incrementedAttemptsCount ) {
|
|
// do not repeat
|
|
m_incrementedAttemptsCount = true;
|
|
// this is just how many urls we tried to index
|
|
//cr->m_localCrawlInfo.m_urlsConsidered++;
|
|
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
|
// need to save collection rec now during auto save
|
|
cr->m_needsSave = true;
|
|
// update this just in case we are the last url crawled
|
|
int64_t now = gettimeofdayInMillisecondsGlobal();
|
|
cr->m_diffbotCrawlEndTime = now;
|
|
}
|
|
*/
|
|
/*
|
|
// if we are being called from Spider.cpp and we met our max
|
|
// to crawl requirement, then bail out on this. this might
|
|
// become true when we are in the middle of processing this url...
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
// this is just for this collection, from all hosts in network
|
|
cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >=
|
|
cr->m_diffbotMaxToCrawl ) {
|
|
// set the code to badness
|
|
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
|
|
m_indexCodeValid = true;
|
|
log("diffbot: abandoning url because we hit crawl limit "
|
|
"of %"INT64". downloaded %"INT64". Disabling spiders."
|
|
,cr->m_diffbotMaxToCrawl
|
|
,cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
|
);
|
|
g_errno = m_indexCode;
|
|
// if spiders already off..
|
|
if ( ! cr->m_spideringEnabled ) return true;
|
|
// do not repeat call sendNotification()
|
|
cr->m_spideringEnabled = false;
|
|
// set this
|
|
m_emailInfo.reset();
|
|
m_emailInfo.m_finalCallback = m_masterLoop;
|
|
m_emailInfo.m_finalState = m_masterState;
|
|
m_emailInfo.m_collnum = m_collnum;
|
|
// note it
|
|
setStatus("sending notification");
|
|
// this returns false if it would block, so we ret fals
|
|
if ( ! sendNotification ( &m_emailInfo ) ) return false;
|
|
// it didn't block
|
|
g_errno = m_indexCode;
|
|
return true;
|
|
}
|
|
|
|
// likewise if we hit the max processing limit...
|
|
if ( ! m_isDiffbotJSONObject &&
|
|
cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >=
|
|
cr->m_diffbotMaxToProcess ) {
|
|
// set the code to badness
|
|
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
|
|
m_indexCodeValid = true;
|
|
log("diffbot: abandoning url because we hit process limit "
|
|
"of %"INT64". processed %"INT64". Disabling spiders."
|
|
, cr->m_diffbotMaxToProcess
|
|
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
|
|
);
|
|
g_errno = m_indexCode;
|
|
// if spiders already off...
|
|
if ( ! cr->m_spideringEnabled ) return true;
|
|
// turn them off and send notification (email or url)
|
|
cr->m_spideringEnabled = false;
|
|
// set this
|
|
m_emailInfo.reset();
|
|
m_emailInfo.m_finalCallback = m_masterLoop;
|
|
m_emailInfo.m_finalState = m_masterState;
|
|
m_emailInfo.m_collnum = m_collnum;
|
|
// note it
|
|
setStatus("sending notification");
|
|
// . this returns false if it would block, so we ret fals
|
|
// . this is now in PingServer.cpp
|
|
if ( ! sendNotification( &m_emailInfo ) ) return false;
|
|
// it didn't block
|
|
g_errno = m_indexCode;
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
setStatus("indexing doc");
|
|
|
|
// maybe a callback had g_errno set?
|
|
if ( g_errno ) return true;
|
|
|
|
// before indexing this doc, index its inlinks it has according
|
|
// to ahrefs?
|
|
if ( m_downloadLevel == 1 && m_useAhrefs && ! m_doneWithAhrefs ) {
|
|
// do not repeat this call!
|
|
m_doneWithAhrefs = true;
|
|
// call it
|
|
if ( ! injectAhrefsLinks () ) return false;
|
|
}
|
|
|
|
|
|
|
|
if ( m_firstUrlValid && (m_firstUrl.isArc() || m_firstUrl.isWarc())) {
|
|
// this returns false if it would block and callback will be
|
|
// called
|
|
if ( ! indexWarcOrArc ( ) )
|
|
return false;
|
|
logIt();
|
|
// all done! no need to add the parent doc.
|
|
return true;
|
|
}
|
|
|
|
if ( isContainerDoc() ) {
|
|
// m_contentDelim should be set!
|
|
if ( ! indexContainerDoc () )
|
|
return false;
|
|
logIt();
|
|
// all done! no need to add the parent doc.
|
|
return true;
|
|
}
|
|
|
|
// . now get the meta list from it to add
|
|
// . returns NULL and sets g_errno on error
|
|
char *metaList = getMetaList ( );
|
|
|
|
// error?
|
|
if ( ! metaList ) {
|
|
// sanity check. g_errno must be set
|
|
if ( ! g_errno ) {
|
|
log("build: Error UNKNOWN error spidering. setting "
|
|
"to bad engineer.");
|
|
g_errno = EBADENGINEER;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
log("build: Error spidering for doc %s: %s",
|
|
m_firstUrl.m_url,mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
// did it block? return false if so, we will be recalled since
|
|
// we set m_masterLoop to indexDoc
|
|
if ( metaList == (char *) -1 ) return false;
|
|
|
|
// before we add the meta list let's updateTagdb()
|
|
//char *ret = updateTagdb();
|
|
// it returns NULL on error
|
|
//if ( ret == NULL ) return true;
|
|
// return false if it blocked
|
|
//if ( ret == (char *)-1 ) return false;
|
|
|
|
// . let's update tagdb's venue address default too
|
|
// . no. that is in getTitleRecBuf()
|
|
|
|
// must be valid
|
|
int32_t *indexCode = getIndexCode();
|
|
if (! indexCode || indexCode == (void *)-1) return (char *)indexCode;
|
|
|
|
// . check to make sure the parser is consistent so we can cleanly
|
|
// delete the various rdb records if we need to in the future solely
|
|
// based on the titleRec.
|
|
// . force = false
|
|
// . unless we force it, the test is only done at random intervals
|
|
// for performance reasons
|
|
if ( ! *indexCode ) doConsistencyTest ( false );
|
|
// ignore errors from that
|
|
g_errno = 0;
|
|
|
|
|
|
// unregister any sleep callback
|
|
if ( m_registeredSleepCallback ) {
|
|
g_loop.unregisterSleepCallback(m_masterState,indexDocWrapper2);
|
|
m_registeredSleepCallback = false;
|
|
}
|
|
|
|
//////////
|
|
// . add the doledb negative key quickly to our tree to avoid a
|
|
// respider because the msg4 doledb negative key is buffered by msg4
|
|
// . make it negative
|
|
// . well it should not be respidered because the lock is on it!!
|
|
// -- so let's comment this out
|
|
/////////
|
|
/*
|
|
key_t negative = m_doledbKey;
|
|
// make it negative
|
|
negative.n0 &= 0xfffffffffffffffeLL;
|
|
// . store it in our tree if we can
|
|
// . returns false and sets g_errno on error
|
|
// . i.e. g_errno == ETRYAGAIN
|
|
if ( ! m_addedNegativeDoledbRec &&
|
|
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
|
|
NULL,0,m_niceness)){
|
|
log("build: error trying to add to doledb: %s",
|
|
mstrerror(g_errno));
|
|
// set sleep wrapper
|
|
g_loop.registerSleepCallback(1000,m_masterState,
|
|
indexDocWrapper2,m_niceness);
|
|
// note it
|
|
m_registeredSleepCallback = true;
|
|
// sleep and retry
|
|
return false;
|
|
}
|
|
*/
|
|
// we did that
|
|
m_addedNegativeDoledbRec = true;
|
|
|
|
// now add it
|
|
if ( ! m_listAdded && m_metaListSize ) {
|
|
// only call this once
|
|
m_listAdded = true;
|
|
// show it for now
|
|
//printMetaList(m_metaList , m_metaList + m_metaListSize,NULL);
|
|
// test it
|
|
verifyMetaList ( m_metaList ,
|
|
m_metaList + m_metaListSize ,
|
|
false );
|
|
// do it
|
|
if ( ! m_msg4.addMetaList ( m_metaList ,
|
|
m_metaListSize ,
|
|
m_collnum,//cr->m_coll ,
|
|
m_masterState , // state
|
|
m_masterLoop ,
|
|
m_niceness ) ) {
|
|
// spider hang bug
|
|
if ( g_conf.m_testSpiderEnabled )
|
|
logf(LOG_DEBUG,"build: msg4 meta add blocked"
|
|
"msg4=0x%"PTRFMT"" ,(PTRTYPE)&m_msg4);
|
|
m_msg4Waiting = true;
|
|
return false;
|
|
}
|
|
// error with msg4? bail
|
|
if ( g_errno ) return logIt();
|
|
|
|
}
|
|
|
|
// make sure our msg4 is no longer in the linked list!
|
|
if (m_msg4Waiting && isInMsg4LinkedList(&m_msg4)){char *xx=NULL;*xx=0;}
|
|
|
|
if ( m_msg4Waiting && g_conf.m_testSpiderEnabled )
|
|
logf(LOG_DEBUG,"build: msg4=0x%"PTRFMT" returned"
|
|
,(PTRTYPE)&m_msg4);
|
|
|
|
// we are not waiting for the msg4 to return
|
|
m_msg4Waiting = false;
|
|
|
|
bool flush = false;
|
|
|
|
// no longer flush injections.
|
|
// TODO: pass in a flush flag with injection and flush in that
|
|
// case, but for now disable to make things faster. profiler
|
|
// indicates too much msg4 activity.
|
|
//if ( m_contentInjected ) flush = true;
|
|
//if ( m_sreqValid && m_sreq.m_isPageInject ) flush = true;
|
|
|
|
// to keep our qa runs consistent
|
|
if ( strcmp(cr->m_coll,"qatest123") == 0 ) flush = true;
|
|
|
|
if ( ! m_listAdded ) flush = false;
|
|
if ( m_listFlushed ) flush = false;
|
|
|
|
// HACK: flush it if we are injecting it in case the next thing we
|
|
// spider is dependent on this one
|
|
if ( flush ) {
|
|
// note it
|
|
setStatus ( "flushing msg4" );
|
|
// only do it once
|
|
m_listFlushed = true;
|
|
// do it
|
|
if ( ! flushMsg4Buffers ( m_masterState , m_masterLoop ) )
|
|
return false;
|
|
}
|
|
|
|
// . all done with that. core if we block i guess.
|
|
// . but what if we were not the function that set this to begin w/?
|
|
//m_masterLoop = NULL;
|
|
|
|
return logIt();
|
|
/*
|
|
// if not doing exact quotas, we're done
|
|
if ( ! cr->m_exactQuotas ) return logIt();
|
|
|
|
char *isIndexed = getIsIndexed();
|
|
// this means it blocked
|
|
if ( isIndexed == (char *)-1) { char *xx=NULL; *xx=0; }
|
|
// returns NULL with g_errno set
|
|
if ( isIndexed ) return logIt();
|
|
|
|
// otherwise, tell Msg36 to update our quota count for this site
|
|
// so we don't have to keep merging site: termlists
|
|
m_incCount = false;
|
|
m_decCount = false;
|
|
if ( m_indexCode ) m_decCount = true;
|
|
//if ( m_forceDelete ) m_decCount = true;
|
|
|
|
// fix for the exact quota bug found on eurekster collection. bug 229
|
|
// if we're not a new doc, then don't increment the count because
|
|
// we have been already counted as the old doc. MDW: i added the
|
|
// condition that if decCount is true we need to update the count!
|
|
if ( *isIndexed && ! m_decCount ) return logIt();
|
|
|
|
// if it is new and we are not adding it to the index then no need
|
|
// to update any quota count...
|
|
if ( ! *isIndexed && m_decCount ) return logIt();
|
|
|
|
// if not decrementing the count, must be incrementing it then!
|
|
if ( ! m_decCount ) m_incCount = true;
|
|
*/
|
|
// i am not using quotas, so disable this for now
|
|
|
|
/*
|
|
log(LOG_DEBUG,"build: inc'ing quota to REMOTE table "
|
|
"for termIdHost %"UINT64" termIdDom %"UINT64" for %s.",
|
|
m_msg16.m_termIdHost,m_msg16.m_termIdDom,m_url.getUrl());
|
|
|
|
setStatus ( "updating quota cache" );
|
|
|
|
// sanity checks
|
|
if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; }
|
|
if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; }
|
|
|
|
// . Msg36 gets the correct count from disk and puts it in cache. It
|
|
// doesn't try to increment or decrement the quotas in cache, because
|
|
// then it would have to be done on all twins, and also the correct
|
|
// split will have to be found.
|
|
// . Actually, we should only use the cache on one host to hold the
|
|
// sum of all splits. This will be the authority cache.
|
|
if ( ! m_updatedCounts ) {
|
|
// only call this once
|
|
m_updatedCounts = true;
|
|
// do it
|
|
if ( ! m_msg36.getTermFreq ( m_coll ,
|
|
0 , // maxAge
|
|
m_msg16.m_termIdHost ,
|
|
this ,
|
|
m_masterLoop ,
|
|
m_niceness ,
|
|
m_exactQuotas ,
|
|
m_incCount ,
|
|
m_decCount ,
|
|
false ))
|
|
// we blocked
|
|
return false;
|
|
// error?
|
|
if ( g_errno ) return logIt();
|
|
}
|
|
|
|
// add the second entry for domain
|
|
if ( ! m_updatedCounts2 ) {
|
|
// only call this once
|
|
m_updateCounts2 = true;
|
|
// do it
|
|
if ( ! m_msg36.getTermFreq ( m_coll ,
|
|
0 , // maxAge
|
|
m_msg16.m_termIdDom ,
|
|
this ,
|
|
doneAddingMsg36Entry2,
|
|
m_niceness ,
|
|
m_exactQuotas ,
|
|
m_incCount ,
|
|
m_decCount ,
|
|
false ))
|
|
// we blocked
|
|
return false;
|
|
// error?
|
|
if ( g_errno ) return logIt();
|
|
}
|
|
|
|
// that is it!
|
|
return logIt();
|
|
*/
|
|
}
|
|
|
|
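// true if the url "u" (of length ulen) ends in "/robots.txt"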
bool isRobotsTxtFile ( char *u , int32_t ulen ) {
|
|
if ( ulen > 12 && ! strncmp ( u + ulen - 11 , "/robots.txt" , 11 ) )
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// does this doc consist of a sequence of smaller sub-docs?
|
|
// if so we'll index the subdocs and not the container doc itself.
|
|
bool XmlDoc::isContainerDoc ( ) {
|
|
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
|
|
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
|
|
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( m_contentDelim ) return true;
|
|
if ( m_contentDelimValid && m_contentDelim ) return true;
|
|
return false;
|
|
}
|
|
|
|
// returns false if would block, true otherwise. returns true and sets g_errno on err
|
|
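// a container doc is a single downloaded blob holding many subdocs back to
// back. roughly (assuming, for illustration, m_contentDelim is "======"):
//
//   ======http://example.com/a
//   HTTP/1.0 200 OK ...mime... <html>...</html>
//   ======http://example.com/b
//   ...
//
// each chunk is carved out below and injected as its own doc via msg7. the
// url after the delimiter is optional; without it we synthesize one from
// the parent url plus a hash of the chunk's content.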
bool XmlDoc::indexContainerDoc ( ) {
|
|
|
|
if ( ! m_contentDelim ) {
|
|
log("build: can not index container doc. no delimeter.");
|
|
g_errno = EBADENGINEER;
|
|
return true;
|
|
}
|
|
|
|
// int8_t *hc = getHopCount();
|
|
// if ( ! hc ) return true; // error?
|
|
// if ( hc == (void *)-1 ) return false;
|
|
// first download
|
|
// in the case of a list of delimited http server replies let's
|
|
// not convert into utf8 here but just use as-is
|
|
char **cpp = getContent();//getUtf8Content();
|
|
// return true with g_errno set on error
|
|
if ( ! cpp ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
// would block? return false then
|
|
if ( cpp == (void *)-1 )
|
|
return false;
|
|
|
|
// need this. it is almost 1MB in size, so alloc it
|
|
if ( ! m_msg7 ) {
|
|
try { m_msg7 = new ( Msg7 ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return true;
|
|
}
|
|
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
|
|
}
|
|
|
|
// inject input parms:
|
|
InjectionRequest *ir = &m_msg7->m_injectionRequest;
|
|
// the cursor for scanning the subdocs
|
|
if ( ! m_anyContentPtr ) {
|
|
// init the content cursor to point to the first subdoc
|
|
m_anyContentPtr = *cpp;
|
|
// but skip over initial separator if there. that is a
|
|
// faux pas
|
|
int32_t dlen = gbstrlen(m_contentDelim);
|
|
if ( strncmp(m_anyContentPtr,m_contentDelim,dlen) == 0 )
|
|
m_anyContentPtr += dlen;
|
|
// init the input parms
|
|
memset ( ir , 0 , sizeof(InjectionRequest) );
|
|
// reset it
|
|
ir->m_spiderLinks = false;
|
|
ir->m_injectLinks = false;
|
|
ir->m_hopCount = 0;//*hc + 1;
|
|
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
|
ir->m_collnum = m_collnum;
|
|
// will this work on a content-delimited doc?
|
|
ir->m_deleteUrl = m_deleteFromIndex;
|
|
// each subdoc will have a mime since it is an arc
|
|
ir->m_hasMime = m_subDocsHaveMime;//true;
|
|
}
|
|
|
|
subdocLoop:
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// EOF?
|
|
if ( m_anyContentPtr == (char *)-1 ) {
|
|
m_indexCode = 0;//m_warcError;
|
|
m_indexCodeValid = true;
|
|
return true;
|
|
}
|
|
|
|
// we had \0-terminated the end of the previous record, so put the saved char back
|
|
if ( m_savedChar && ! *m_anyContentPtr ) {
|
|
*m_anyContentPtr = m_savedChar;
|
|
m_anyContentPtr += gbstrlen(m_contentDelim);
|
|
}
|
|
|
|
|
|
// index this subdoc
|
|
ir->ptr_content = m_anyContentPtr;
|
|
|
|
// . should have the url as well.
|
|
// . the url, ip etc. are on a single \n terminated line for an arc!
|
|
char *separator = strstr(m_anyContentPtr,m_contentDelim);
|
|
|
|
|
|
|
|
if ( separator ) {
|
|
m_savedChar = *separator;
|
|
m_anyContentPtr = separator;
|
|
*m_anyContentPtr = '\0';
|
|
//ir->size_content = separator - ir->ptr_content;
|
|
}
|
|
|
|
// if no separator found, this is our last injection
|
|
if ( ! separator ) {
|
|
m_anyContentPtr = (char *)-1;
|
|
}
|
|
|
|
|
|
// these are not defined. will be autoset in set4() i guess.
|
|
ir->m_firstIndexed = 0;
|
|
ir->m_lastSpidered = 0;
|
|
|
|
bool setUrl = false;
|
|
|
|
// HOWEVER, if an hasmime is true and an http:// follows
|
|
// the delimiter then use that as the url...
|
|
// this way we can specify our own urls.
|
|
if ( ir->m_hasMime ) {
|
|
char *du = ir->ptr_content;
|
|
//du += gbstrlen(delim);
|
|
if ( du && is_wspace_a ( *du ) ) du++;
|
|
if ( du && is_wspace_a ( *du ) ) du++;
|
|
if ( du && is_wspace_a ( *du ) ) du++;
|
|
if ( ir->m_hasMime &&
|
|
(strncasecmp( du,"http://",7) == 0 ||
|
|
strncasecmp( du,"https://",8) == 0 ) ) {
|
|
// flag it
|
|
setUrl = true;
|
|
// find end of it
|
|
char *uend = du + 7;
|
|
for ( ; *uend && ! is_wspace_a(*uend) ; uend++ );
|
|
// inject that then
|
|
m_injectUrlBuf.reset();
|
|
m_injectUrlBuf.safeMemcpy ( du , uend - du );
|
|
m_injectUrlBuf.nullTerm();
|
|
// and point to the actual http mime then
|
|
// well, skip that space, right
|
|
ir->ptr_content = uend + 1;
|
|
ir->ptr_url = m_injectUrlBuf.getBufStart();
|
|
ir->size_url = m_injectUrlBuf.length()+1; // include \0
|
|
// if (!strncmp(ir->ptr_url,"http://www.focusinfo.com/"
|
|
// "products/mxprodv" ,40) )
|
|
// log("hey");
|
|
}
|
|
}
|
|
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// make the url from parent url
|
|
// use hash of the content
|
|
int64_t ch64 = hash64n ( ir->ptr_content , 0LL );
|
|
|
|
// need this for an injection
|
|
ir->size_content = gbstrlen(ir->ptr_content) + 1;// improve this?
|
|
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( ! setUrl ) {
|
|
// reset it
|
|
m_injectUrlBuf.reset();
|
|
// by default append a -<ch64> to the provided url
|
|
m_injectUrlBuf.safePrintf("%s-%"UINT64"",
|
|
m_firstUrl.getUrl(),ch64);
|
|
ir->ptr_url = m_injectUrlBuf.getBufStart();
|
|
ir->size_url = m_injectUrlBuf.length()+1; // include \0
|
|
}
|
|
|
|
|
|
bool status = m_msg7->sendInjectionRequestToHost ( ir ,
|
|
m_masterState ,
|
|
m_masterLoop ) ;
|
|
|
|
// it would block, callback will be called later
|
|
if ( status )
|
|
return false;
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// error?
|
|
if ( g_errno ) {
|
|
log("build: index flatfile error %s",mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
else
|
|
log("build: index flatfile did not block");
|
|
|
|
// loop it up
|
|
goto subdocLoop;
|
|
|
|
}
|
|
|
|
|
|
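// msg7 completion callback for one injected archive record: mark the msg7
// free, drop the outstanding-injection count and resume the parent
// XmlDoc's master loop so it can carve out and inject the next record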
void doneInjectingArchiveRec ( void *state ) {
|
|
Msg7 *THIS = (Msg7 *)state;
|
|
THIS->m_inUse = false;
|
|
XmlDoc *xd = THIS->m_stashxd;
|
|
xd->m_numInjectionsOut--;
|
|
log("build: archive: injection thread returned. %"INT32" out now.",
|
|
xd->m_numInjectionsOut);
|
|
// reset g_errno so it doesn't error out in ::indexDoc() when
|
|
// we are injecting a ton of these msg7s and then xmldoc ends up
|
|
// getting reset and when a msg7 reply comes back in, we core
|
|
g_errno = 0;
|
|
xd->m_masterLoop ( xd );
|
|
}
|
|
|
|
void doneReadingArchiveFileWrapper ( int fd, void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// . go back to the main entry function
|
|
// . make sure g_errno is clear from a msg3a g_errno before calling
|
|
// this lest it abandon the loop
|
|
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
|
|
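// largest single warc/arc record we try to buffer in one piece; the
// streaming read buffer below is sized at 5x this (plus a terminating \0)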
#define MAXWARCRECSIZE 5000000
|
|
|
|
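// . top off m_fileBuf with more bytes from the wget pipe, scooting any
//   unprocessed tail up to the front of the buffer first
// . returns false in the normal cases (caller just parses what is buffered);
//   returns true only if the pipe itself could not be obtained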
bool XmlDoc::readMoreWarc() {
|
|
// We read everything we can off the pipe in a sleep timer.
|
|
// When we have enough to start processing, we call the
|
|
// processing function.
|
|
// If reading gets too far ahead of the processing and we can
|
|
// no longer buffer the read, then we save the offset of what
|
|
// we processed, free the readbuffer and restart the pipe and
|
|
// skip until the offset we last processed
|
|
|
|
if(!m_calledWgetThread) {
|
|
m_pipe = getUtf8ContentInFile();
|
|
}
|
|
|
|
// return true with g_errno set on error
|
|
if ( ! m_pipe ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
log("We don't have the warc pipe.");
|
|
return true;
|
|
}
|
|
|
|
int64_t leftOver = 0;
|
|
int64_t skipAhead = 0;
|
|
|
|
// How much is unprocessed
|
|
if(m_fptr != m_fptrEnd) {
|
|
leftOver = m_fptrEnd - m_fptr;
|
|
}
|
|
if(leftOver < 0) {
|
|
// Happens when we skip a record which is too big
|
|
skipAhead = - leftOver;
|
|
leftOver = 0;
|
|
m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fileBuf;
|
|
}
|
|
|
|
// We don't want to be memmoving the buffer up for every single
|
|
// document we process so only do it when we need it.
|
|
if(leftOver > MAXWARCRECSIZE) return false;
|
|
|
|
int64_t bytesRemaining = m_fileBufAllocSize - (m_fptrEnd - m_fileBuf) - 1;
|
|
// Scoot up everything we haven't processed
|
|
if(bytesRemaining < MAXWARCRECSIZE) {
|
|
//log("scooting up by left over %"INT64, leftOver);
|
|
// count everything we've processed
|
|
m_bytesStreamed += m_fptr - m_fileBuf;
|
|
memmove(m_fileBuf, m_fptr, leftOver);
|
|
m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fileBuf + leftOver;
|
|
*m_fptrEnd = '\0';
|
|
bytesRemaining += leftOver;
|
|
}
|
|
|
|
int64_t toRead = m_fileBufAllocSize - leftOver - 1;
|
|
if(toRead > bytesRemaining) toRead = bytesRemaining;
|
|
|
|
if(toRead == 0) {
|
|
//log("build: not enough room to read, lets process the buffer" );
|
|
return false;
|
|
}
|
|
|
|
|
|
g_loop.disableTimer();
|
|
errno = 0;
|
|
int bytesRead = fread(m_fptrEnd, 1, toRead, m_pipe);
|
|
g_loop.enableTimer();
|
|
|
|
// if(bytesRead > 0) {
|
|
// log("build: warc pipe read %"INT32" more bytes of the pipe. errno = %s, buf space = %"INT64 " processed = %"INT64 " skipAhead=%"INT64,
|
|
// bytesRead, mstrerror(errno),toRead, m_bytesStreamed, skipAhead);
|
|
// }
|
|
|
|
if(bytesRead <= 0 && errno != EAGAIN) {
|
|
// if(errno == EAGAIN){
|
|
// log("build: fd is not ready, lets process the buffer" );
|
|
// return false;
|
|
// } else {
|
|
if(m_registeredWgetReadCallback) {
|
|
//log("build:came back from read callback");
|
|
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
|
|
m_registeredWgetReadCallback = false;
|
|
}
|
|
|
|
if(m_pipe) {
|
|
int32_t retCode = fclose(m_pipe);
|
|
if(retCode) {
|
|
log("we closed the pipe with error %s", mstrerror(retCode));
|
|
}
|
|
m_pipe = NULL;
|
|
}
|
|
|
|
//log("build: warc problem pipe terminated %s", mstrerror(errno));
|
|
m_hasMoreToRead = false;
|
|
return false;
|
|
// }
|
|
}
|
|
//m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fptrEnd + bytesRead;
|
|
*m_fptrEnd = '\0';
|
|
m_fptr += skipAhead;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// . returns false if would block, true otherwise.
|
|
// . returns true and sets g_errno on err
|
|
// . injectwarc
|
|
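// stream the archive through m_fileBuf (refilled by readMoreWarc()), carve
// out one warc/arc record at a time and hand each usable record to a Msg7
// injection, keeping at most ~2 injections per host in flight (capped by
// MAXMSG7S)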
bool XmlDoc::indexWarcOrArc ( ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
if ( ! cr->m_indexWarcs ) {
|
|
g_errno = EDOCWARC;
|
|
return true;
|
|
}
|
|
|
|
// This can be a busy loop if we have max injections out but we
|
|
// are getting a read ready callback. Should we unregister
|
|
// when max injections are out and then reregister when we have room?
|
|
int32_t max = g_hostdb.m_numHosts * 2;
|
|
if ( max > MAXMSG7S ) max = MAXMSG7S;
|
|
if ( m_numInjectionsOut >= max ) return false;
|
|
|
|
char ctype;
|
|
if ( m_firstUrl.isWarc() ) {
|
|
ctype = CT_WARC;
|
|
} else {
|
|
ctype = CT_ARC;
|
|
}
|
|
|
|
int8_t *hc = getHopCount();
|
|
if ( ! hc ) return true; // error?
|
|
if ( hc == (void *)-1 ) return false;
|
|
|
|
if ( ! m_fileBuf ) {
|
|
// Do this exactly once.
|
|
m_fileBufAllocSize = (5 * MAXWARCRECSIZE) + 1;
|
|
m_fileBuf=(char *)mmalloc(m_fileBufAllocSize ,"sibuf");
|
|
m_fptr = m_fileBuf;
|
|
m_fptrEnd = m_fileBuf;
|
|
m_bytesStreamed = 0;
|
|
m_hasMoreToRead = true;
|
|
}
|
|
|
|
if ( ! m_fileBuf ) {
|
|
log("build: failed to alloc buf to read archive file %s",m_firstUrl.getUrl());
|
|
return true;
|
|
}
|
|
|
|
if(m_hasMoreToRead) readMoreWarc();
|
|
|
|
setStatus ("injecting archive records");
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// did an inject return?
|
|
if ( m_doneInjectingWarc ) {
|
|
warcDone:
|
|
// log("build: done parsing %"INT64" bytes of archive file %s. left over =%"INT32 "done injecting %"INT32 " hasmoretoread %"INT32,
|
|
// m_bytesStreamed + m_fptrEnd - m_fileBuf,
|
|
// m_firstUrl.getUrl(),
|
|
// (int32_t)(m_fptrEnd - m_fptr),
|
|
// (int32_t)m_doneInjectingWarc,
|
|
// (int32_t)m_hasMoreToRead);
|
|
|
|
m_doneInjectingWarc = true;
|
|
|
|
// return if all injects have returned.
|
|
if ( m_numInjectionsOut == 0) { // && !m_hasMoreToRead
|
|
g_errno = m_warcError;
|
|
m_indexCode = m_warcError;
|
|
m_indexCodeValid = true;
|
|
|
|
return true;
|
|
}
|
|
log("build: waiting for injection threads to return.");
|
|
// we would block
|
|
return false;
|
|
}
|
|
|
|
// Dup strings into here so we don't write nulls into our buffer; sometimes we have
|
|
// to rewind over a rec and we want the buf to be the same every time.
|
|
char scratchSpace[1024*10];
|
|
SafeBuf scratch(scratchSpace, 1024*10);
|
|
loop:
|
|
scratch.reset();
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( max > MAXMSG7S ) max = MAXMSG7S;
|
|
// wait for one to come back before launching another msg7
|
|
if ( m_numInjectionsOut >= max ) {
|
|
// Don't need to read anymore so don't call us
|
|
if(m_registeredWgetReadCallback && m_pipe && m_fptr < m_fptrEnd) {
|
|
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
|
|
m_registeredWgetReadCallback = false;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
char *realStart = m_fptr;
|
|
|
|
// need at least say 100k for warc header
|
|
if ( m_fptr + 100000 > m_fptrEnd && m_hasMoreToRead ) {
|
|
//log("build need more of the record to process so sleeping.");
|
|
|
|
if(!m_registeredWgetReadCallback) {
|
|
if(!g_loop.registerReadCallback ( fileno(m_pipe),
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return true;
|
|
}
|
|
log("build: reregistered the read callback. need more");
|
|
m_registeredWgetReadCallback = true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
int64_t recTime = 0;
|
|
char *recIp = NULL;
|
|
char *recUrl = NULL;
|
|
char *recContent = NULL;
|
|
int64_t recContentLen = 0;
|
|
// what we skip over
|
|
uint64_t recSize = 0;
|
|
|
|
//
|
|
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
|
|
//
|
|
//log("buf size is %"INT64 " four chars %c%c%c%c%c%c",
|
|
//m_fptrEnd-m_fptr, m_fptr[0], m_fptr[1], m_fptr[2], m_fptr[3],m_fptr[4],m_fptr[5]);
|
|
|
|
if ( ctype == CT_WARC ) {
|
|
// find "WARC/1.0" or whatever
|
|
char *whp = m_fptr;
|
|
if( ! whp ) {
|
|
// FIXME: shouldn't get here with a NULL
|
|
log("build: No buffer for file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
// we do terminate last warc rec with \0 so be aware of that...
|
|
int32_t maxCount = 10;
|
|
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++);
|
|
// none?
|
|
if ( ! *whp ) {
|
|
log("build: could not find WARC/1 header start for "
|
|
"file=%s", m_firstUrl.getUrl());
|
|
// we don't really need this and since we force the
|
|
// http reply to end in \0 before calling inject2() on
|
|
// it it gets messed up
|
|
goto warcDone;
|
|
}
|
|
|
|
char *warcHeader = whp;
|
|
|
|
// find end of warc mime HEADER not the content
|
|
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
|
|
if ( ! warcHeaderEnd ) {
|
|
log("build: could not find end of WARC header for "
|
|
"file=%s.",
|
|
m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
// \0 term for strstrs below
|
|
char tmp = *warcHeaderEnd;
|
|
*warcHeaderEnd = '\0';
|
|
|
|
char *warcLen = strstr(warcHeader,"Content-Length:");
|
|
char *warcUrl = strstr(warcHeader,"WARC-Target-URI:");
|
|
char *warcType = strstr(warcHeader,"WARC-Type:");
|
|
char *warcDate = strstr(warcHeader,"WARC-Date:");
|
|
char *warcIp = strstr(warcHeader,"WARC-IP-Address:");
|
|
char *warcCon = strstr(warcHeader,"Content-Type:");
|
|
|
|
|
|
// advance
|
|
if ( warcLen ) warcLen += 15;
|
|
if ( warcUrl ) warcUrl += 16;
|
|
if ( warcType ) warcType += 10;
|
|
if ( warcIp ) warcIp += 17;
|
|
if ( warcCon ) warcCon += 13;
|
|
if ( warcDate ) warcDate += 10;
|
|
|
|
// skip initial spaces
|
|
for ( ; warcUrl && is_wspace_a(*warcUrl ) ; warcUrl ++ );
|
|
for ( ; warcLen && is_wspace_a(*warcLen ) ; warcLen ++ );
|
|
for ( ; warcType && is_wspace_a(*warcType) ; warcType++ );
|
|
for ( ; warcDate && is_wspace_a(*warcDate) ; warcDate++ );
|
|
for ( ; warcIp && is_wspace_a(*warcIp ) ; warcIp ++ );
|
|
for ( ; warcCon && is_wspace_a(*warcCon ) ; warcCon ++ );
|
|
|
|
// get Content-Length: of WARC header for its content
|
|
if ( ! warcLen ) {
|
|
// this is a critical stop.
|
|
log("build: warc problem: could not find WARC Content-Length:");
|
|
goto warcDone;
|
|
}
|
|
|
|
//
|
|
// advance m_fptr to point to the next warc record in case we
|
|
// end up calling 'goto loop' below
|
|
//
|
|
recContent = warcHeaderEnd + 4;
|
|
recContentLen = atoll(warcLen);
|
|
|
|
//log("build content len was %"INT64, recContentLen);
|
|
char *warcContentEnd = recContent + recContentLen;
|
|
recSize = (warcContentEnd - realStart);
|
|
|
|
recUrl = warcUrl;
|
|
|
|
// point to the next warc record
|
|
m_fptr += recSize;
|
|
*warcHeaderEnd = tmp;
|
|
|
|
//log("skipping %"UINT64, recSize);
|
|
// advance the file offset to the next record as well
|
|
|
|
// get WARC-Type:
|
|
// revisit (if url was already done before)
|
|
// request (making a GET or DNS request)
|
|
// response (reponse to a GET or dns request)
|
|
// warcinfo (crawling parameters, robots: obey, etc)
|
|
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
|
|
if ( ! warcType ) {
|
|
log("build: could not find WARC-Type:");
|
|
goto loop;
|
|
}
|
|
//http://www.mpaa.org/Resources/5bec4ac9-a95e-443b-987b-bff6fb5455a9.pdf
|
|
// get Content-Type:
|
|
// application/warc-fields (fetch time, hops from seed)
|
|
// application/http; msgtype=request (the GET request)
|
|
// application/http; msgtype=response (the GET reply)
|
|
if ( ! warcCon ) {
|
|
log("build: could not find Content-Type:");
|
|
goto loop;
|
|
}
|
|
|
|
if ( ! warcUrl ) {
|
|
// no URI?
|
|
log("build: could not find url");
|
|
goto loop;
|
|
}
|
|
|
|
// if WARC-Type: is not response, skip it. so if it
|
|
// is a revisit then skip it i guess.
|
|
if ( strncmp ( warcType,"response", 8 ) != 0) {
|
|
//log("build: was not type response %s *****%s*****", warcUrl, warcType);
|
|
|
|
// read another warc record
|
|
goto loop;
|
|
}
|
|
|
|
// warcConType needs to be
|
|
// application/http; msgtype=response
|
|
if ( !(strncmp(warcCon,"application/http; msgtype=response",34) == 0 ||
|
|
strncmp(warcCon,"application/http;msgtype=response",33) == 0)) {
|
|
// read another warc record
|
|
//log("build: wrong content type %s ---%s---", warcUrl, warcCon);
|
|
goto loop;
|
|
}
|
|
|
|
recTime = 0;
|
|
if ( warcDate ) recTime = atotime ( warcDate );
|
|
recIp = warcIp;
|
|
}
|
|
// END WARC SPECIFIC PARSING
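// For reference, a record header like the one the code above expects
// (field order can vary; only the fields actually strstr()'d for above
// matter, and the values here are made-up examples):
//
//   WARC/1.0
//   WARC-Type: response
//   WARC-Target-URI: http://example.com/page.html
//   WARC-Date: 2014-03-01T12:34:56Z
//   WARC-IP-Address: 93.184.216.34
//   Content-Type: application/http; msgtype=response
//   Content-Length: 25523
//   <blank line ("\r\n\r\n"), then Content-Length bytes of http reply>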
|
|
|
|
//
|
|
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
|
|
//
|
|
if ( ctype == CT_ARC ) {
|
|
// find \n\nhttp://
|
|
char *whp = m_fptr;
|
|
for ( ; *whp ; whp++ ) {
|
|
if ( whp[0] != '\n' ) continue;
|
|
if ( strncmp(whp+1,"http://",7) == 0) break;
|
|
if ( strncmp(whp+1,"https://",8) == 0) break;
|
|
}
|
|
// none?
|
|
if ( ! *whp ) {
|
|
log("build: arc: could not find next \\nhttp:// in "
|
|
"arc file %s",m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
char *arcHeader = whp;
|
|
// find end of arc header not the content
|
|
char *arcHeaderEnd = strstr(arcHeader+1,"\n");
|
|
if ( ! arcHeaderEnd ) {
|
|
log("build: warc problem: could not find end of ARC header. file=%s",
|
|
m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
// \0 term for strstrs below
|
|
char tmp = *arcHeaderEnd;
|
|
*arcHeaderEnd = '\0';
|
|
char *arcContent = arcHeaderEnd + 1;
|
|
// parse arc header line
|
|
char *url = arcHeader + 1;
|
|
char *hp = url;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 1.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
url = scratch.pushStr(url, hp-url);
|
|
hp++;
|
|
|
|
char *ipStr = hp;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 2.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
ipStr = scratch.pushStr(ipStr, hp - ipStr);
|
|
hp++;
|
|
|
|
char *timeStr = hp;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 3.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
timeStr = scratch.pushStr(timeStr, hp - timeStr);
|
|
hp++;
|
|
|
|
char *arcConType = hp;
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {
|
|
log("build: warc problem: bad arc header 4.file=%s", m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
arcConType = scratch.pushStr(arcConType, hp - arcConType);
|
|
hp++;
|
|
|
|
char *arcContentLenStr = hp;
|
|
// get arc content len
|
|
int64_t arcContentLen = atoll(arcContentLenStr);
|
|
char *arcContentEnd = arcContent + arcContentLen;
|
|
//uint64_t oldOff = s_off;
|
|
recSize = (arcContentEnd - realStart);
|
|
// point to the next arc record
|
|
m_fptr += recSize;
|
|
*arcHeaderEnd = tmp;
|
|
// advance the file offset to the next record as well
|
|
// arcConType needs to be indexable
|
|
int32_t ct = getContentTypeFromStr ( arcConType );
|
|
if ( ct != CT_HTML &&
|
|
ct != CT_TEXT &&
|
|
ct != CT_XML &&
|
|
ct != CT_PDF &&
|
|
ct != CT_XLS &&
|
|
ct != CT_PPT &&
|
|
ct != CT_PS &&
|
|
ct != CT_DOC &&
|
|
ct != CT_JSON ) {
|
|
// read another arc record
|
|
log("build: was not indexable response %s", arcConType);
|
|
goto loop;
|
|
}
|
|
// convert to timestamp
|
|
// this time structure, once filled, will help yield a time_t
|
|
struct tm t;
|
|
// DAY OF MONTH
|
|
t.tm_mday = atol2 ( timeStr + 6 , 2 );
|
|
// MONTH
|
|
t.tm_mon = atol2 ( timeStr + 4 , 2 ) - 1; // tm_mon is 0-11
|
|
// YEAR - # of years since 1900
|
|
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
|
|
// TIME
|
|
t.tm_hour = atol2 ( timeStr + 8 , 2 );
|
|
t.tm_min = atol2 ( timeStr + 10 , 2 );
|
|
t.tm_sec = atol2 ( timeStr + 12 , 2 );
|
|
// unknown if we're in daylight savings time
|
|
t.tm_isdst = -1;
|
|
// translate using timegm() since the timestamp is UTC
|
|
recTime = timegm ( &t );
|
|
// set content as well
|
|
recContent = arcContent;
|
|
recContentLen = arcContentLen;
|
|
recUrl = url;
|
|
recIp = ipStr;
|
|
}
|
|
// END ARC SPECIFIC PARSING
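// For reference, the space-separated ARC header line parsed above looks
// roughly like this (made-up example values):
//
//   http://example.com/page.html 93.184.216.34 20140301123456 text/html 25523
//
// i.e. url, ip, a 14-digit UTC timestamp (YYYYMMDDHHMMSS, which is what
// the atol2() offsets above pull apart), content type and content length,
// followed on the next line by the raw http reply of that many bytes.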
|
|
|
|
|
|
|
|
// must be http not dns:
|
|
// url must start with http:// or https://
|
|
// it's probably like WARC-Target-URI: dns:www.xyz.com
|
|
// so it is a dns response
|
|
if ( strncmp(recUrl,"http://" ,7) != 0 &&
|
|
strncmp(recUrl,"https://",8) != 0 )
|
|
goto loop;
|
|
|
|
// get length of it, null term it
|
|
char *recUrlEnd = recUrl;
|
|
for ( ; *recUrlEnd && ! is_wspace_a(*recUrlEnd) ; recUrlEnd++ );
|
|
int32_t recUrlLen = recUrlEnd - recUrl;
|
|
//*recUrlEnd = '\0';
|
|
|
|
// skip if robots.txt
|
|
if ( isRobotsTxtFile( recUrl , recUrlLen ) )
|
|
goto loop;
|
|
|
|
// how can there be no more to read?
|
|
if ( m_fptr > m_fptrEnd && ! m_hasMoreToRead ) {
|
|
log("build: warc problem: archive file %s exceeded file length.",
|
|
m_firstUrl.getUrl());
|
|
goto warcDone;
|
|
}
|
|
|
|
// if the rec extends past the current read buf and is too big to ever fit, skip it
|
|
if ( m_fptr > m_fptrEnd && recSize > MAXWARCRECSIZE ) {
|
|
log("build: skipping archive file of %"INT64" "
|
|
"bytes which is too big",recSize);
|
|
|
|
if(!m_registeredWgetReadCallback) {
|
|
if(!g_loop.registerReadCallback ( fileno(m_pipe),
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return true;
|
|
}
|
|
log("build: reregistered the read callback. skip bigrec");
|
|
m_registeredWgetReadCallback = true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// don't read the next record, read THIS one again, we can fit it
|
|
if ( m_fptr > m_fptrEnd ) {
|
|
//log("build: record end is past the end of what we read by %"INT64 " %"UINT64, m_fptrEnd - m_fptr, recSize);
|
|
m_fptr -= recSize;
|
|
|
|
if(!m_registeredWgetReadCallback) {
|
|
if(!g_loop.registerReadCallback ( fileno(m_pipe),
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return true;
|
|
}
|
|
log("build: reregistered the read callback. reread this record");
|
|
m_registeredWgetReadCallback = true;
|
|
}
|
|
|
|
|
|
return false;
|
|
}
|
|
|
|
char *httpReply = recContent;
|
|
int64_t httpReplySize = recContentLen;
|
|
|
|
// should be an http reply mime (i.e. starts with "HTTP/1.")
|
|
HttpMime m;
|
|
if ( ! m.set ( httpReply , httpReplySize , NULL ) ) {
|
|
log("build: archive: failed to set http mime at in "
|
|
"file");
|
|
goto loop;
|
|
}
|
|
|
|
// check content type
|
|
int ct2 = m.getContentType();
|
|
if ( ct2 != CT_HTML &&
|
|
ct2 != CT_TEXT &&
|
|
ct2 != CT_XML &&
|
|
ct2 != CT_PDF &&
|
|
ct2 != CT_XLS &&
|
|
ct2 != CT_PPT &&
|
|
ct2 != CT_PS &&
|
|
ct2 != CT_DOC &&
|
|
ct2 != CT_JSON ) {
|
|
//log("build:got wrong type %"INT32, (int32_t)ct2);
|
|
goto loop;
|
|
}
|
|
|
|
// grab an available msg7
|
|
Msg7 *msg7 = NULL;
|
|
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) {
|
|
msg7 = m_msg7s[i];
|
|
// if we got an available one stop
|
|
if ( msg7 ) {
|
|
if( msg7->m_inUse ) continue;
|
|
break; // reuse this one.
|
|
}
|
|
// ok, create one, 1MB each about
|
|
try { msg7 = new ( Msg7 ); }
|
|
catch ( ... ) {g_errno=ENOMEM;m_warcError=g_errno;return true;}
|
|
mnew ( msg7 , sizeof(Msg7),"xdmsgs7");
|
|
|
|
// store it for re-use
|
|
m_msg7s[i] = msg7;
|
|
break;
|
|
}
|
|
|
|
if(!msg7 || msg7->m_inUse) {
|
|
// shouldn't happen, but it does... why?
|
|
log("build: archive: Ran out of msg7s to inject doc.");
|
|
return false;
|
|
}
|
|
|
|
// inject input parms:
|
|
InjectionRequest *ir = &msg7->m_injectionRequest;
|
|
// reset it
|
|
ir->m_hopCount = *hc + 1;
|
|
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
|
ir->m_collnum = m_collnum;
|
|
// will this work on a content-delimited doc?
|
|
ir->m_deleteUrl = m_deleteFromIndex;
|
|
// each subdoc will have a mime since it is a warc
|
|
ir->m_hasMime = true;
|
|
// it has a mime so we shouldn't need to set this
|
|
ir->ptr_contentTypeStr = NULL;
|
|
// we are injecting a single page, not a container file
|
|
ir->ptr_contentDelim = NULL;
|
|
// miscellaneous. faster than memsetting the whole gr class (32k)
|
|
ir->m_getSections = 0;
|
|
ir->m_gotSections = 0;
|
|
ir->m_spiderLinks = false;
|
|
ir->m_injectLinks = false;
|
|
ir->m_shortReply = false;
|
|
ir->m_newOnly = false;
|
|
ir->m_recycle = false;
|
|
ir->m_dedup = true;
|
|
ir->m_doConsistencyTesting = false;
|
|
ir->m_charset = 0;
|
|
|
|
ir->ptr_queryToScrape = NULL;
|
|
ir->ptr_contentFile = NULL;
|
|
ir->ptr_diffbotReply = NULL;
|
|
|
|
|
|
// Stick the capture date in the metadata
|
|
StackBuf(newKey);
|
|
newKey.safePrintf("\"gbcapturedate\":%"INT64, recTime);
|
|
SafeBuf newMetadata(newKey.length() * 2 + size_metadata, "ModifiedMetadata");
|
|
|
|
newMetadata.safeMemcpy(ptr_metadata, size_metadata);
|
|
Json::prependKey(newMetadata, newKey.getBufStart());
|
|
|
|
ir->ptr_metadata = newMetadata.getBufStart();
|
|
ir->size_metadata = newMetadata.length();
|
|
|
|
newMetadata.nullTerm();
|
|
// set 'timestamp' for injection
|
|
ir->m_firstIndexed = recTime;
|
|
ir->m_lastSpidered = recTime;
|
|
|
|
|
|
// set 'ip' for injection
|
|
|
|
ir->m_injectDocIp = 0;
|
|
// get the record IP address from the warc header if there
|
|
if ( recIp ) {
|
|
// get end of ip
|
|
char *ipEnd = recIp;
|
|
// skip digits and periods
|
|
while ( *ipEnd && ! is_wspace_a(*ipEnd) ) ipEnd++;
|
|
// we now have the ip address for doing ip: searches
|
|
// this func is in ip.h
|
|
ir->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
|
|
}
|
|
|
|
// we end up repopulating m_fileBuf to read the next warc sometimes
|
|
// so do not destroy the content we are injecting from the original
|
|
// m_fileBuf. so we have to copy it.
|
|
msg7->m_contentBuf.reset();
|
|
msg7->m_contentBuf.reserve ( httpReplySize + 5 );
|
|
msg7->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
|
|
msg7->m_contentBuf.nullTerm();
|
|
|
|
// set 'content' for injection
|
|
ir->ptr_content = msg7->m_contentBuf.getBufStart();
|
|
ir->size_content = msg7->m_contentBuf.getLength() + 1;
|
|
|
|
|
|
|
|
// set the rest of the injection parms
|
|
ir->m_hopCount = -1;
|
|
ir->m_newOnly = 0;
|
|
// all warc records have the http mime
|
|
ir->m_hasMime = true;
|
|
|
|
ir->ptr_url = recUrl;
|
|
ir->size_url = recUrlLen+1;
|
|
|
|
// stash this
|
|
msg7->m_stashxd = this;
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// log it
|
|
*recUrlEnd = '\0';
|
|
log("build: archive: injecting archive url %s",recUrl);
|
|
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if (msg7->sendInjectionRequestToHost(ir,msg7,doneInjectingArchiveRec)){
|
|
m_numInjectionsOut++;
|
|
msg7->m_inUse = true;
|
|
goto loop;
|
|
}
|
|
|
|
log("build: index archive: msg7 inject: %s",
|
|
mstrerror(g_errno));
|
|
|
|
goto loop;
|
|
}
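// A minimal, self-contained sketch (not referenced anywhere else in the
// build) of the WARC header field extraction done in the loop above, for
// readers who want the parsing logic without the buffering/injection
// machinery. The struct and function names here are hypothetical, and the
// header block is assumed to already be \0-terminated at the blank line,
// exactly as the loop does before calling strstr().
struct WarcHeaderSketch {
	char    *targetUri;  // after "WARC-Target-URI:", or NULL
	char    *warcType;   // after "WARC-Type:", or NULL
	int64_t  contentLen; // from "Content-Length:", or -1
};

static WarcHeaderSketch sketchParseWarcHeader ( char *hdr ) {
	WarcHeaderSketch f = { NULL , NULL , -1LL };
	char *p;
	// grab the target url
	if ( ( p = strstr ( hdr , "WARC-Target-URI:" ) ) ) {
		p += 16;
		while ( is_wspace_a(*p) ) p++;
		f.targetUri = p;
	}
	// grab the record type (we only inject "response" records)
	if ( ( p = strstr ( hdr , "WARC-Type:" ) ) ) {
		p += 10;
		while ( is_wspace_a(*p) ) p++;
		f.warcType = p;
	}
	// grab the content length of the record body
	if ( ( p = strstr ( hdr , "Content-Length:" ) ) ) {
		p += 15;
		while ( is_wspace_a(*p) ) p++;
		f.contentLen = atoll ( p );
	}
	return f;
}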
|
|
|
|
|
|
|
|
|
|
void getTitleRecBufWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in get title rec wrapper" );
|
|
// return if it blocked
|
|
if ( THIS->getTitleRecBuf() == (void *)-1 ) return;
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
key_t *XmlDoc::getTitleRecKey() {
|
|
if ( m_titleRecBufValid ) return &m_titleRecKey;
|
|
SafeBuf *tr = getTitleRecBuf();
|
|
if ( ! tr || tr == (void *)-1 ) return (key_t *)tr;
|
|
return &m_titleRecKey;
|
|
}
|
|
|
|
|
|
int32_t *XmlDoc::getIndexCode ( ) {
|
|
|
|
int32_t *indexCode = getIndexCode2();
|
|
if ( ! indexCode || indexCode == (void *)-1 ) return indexCode;
|
|
|
|
// if zero good!
|
|
if ( *indexCode == 0 ) return indexCode;
|
|
|
|
//
|
|
// should we neutralize it?
|
|
//
|
|
// in the case of indexing dmoz urls outputted from
|
|
// 'dmozparse urldump -s' it outputs a meta tag
|
|
// (<meta name=ignorelinksexternalerrors content=1>) that
|
|
// indicates to index the links even in the case of some errors,
|
|
// so that we can be assured to have exactly the same urls the dmoz
|
|
// has in our index. so when we do a gbcatid:xxx query we get the same
|
|
// urls in the search results that dmoz has for that category id.
|
|
if ( ! m_sreqValid || ! m_sreq.m_ignoreExternalErrors )
|
|
return indexCode;
|
|
|
|
// only neutralize certain errors
|
|
if ( * indexCode != EDNSTIMEDOUT
|
|
&& *indexCode != ETCPTIMEDOUT
|
|
&& *indexCode != EUDPTIMEDOUT
|
|
// from m_redirError
|
|
&& *indexCode != EDOCSIMPLIFIEDREDIR
|
|
&& *indexCode != EDOCNONCANONICAL
|
|
&& *indexCode != EDNSDEAD
|
|
&& *indexCode != ENETUNREACH
|
|
&& *indexCode != EHOSTUNREACH
|
|
&& *indexCode != EDOCFILTERED
|
|
&& *indexCode != EDOCREPEATSPAMMER
|
|
&& *indexCode != EDOCDUP
|
|
&& *indexCode != EDOCISERRPG
|
|
&& *indexCode != EDOCHIJACKED
|
|
&& *indexCode != EDOCBADHTTPSTATUS
|
|
&& *indexCode != EDOCDISALLOWED
|
|
&& *indexCode != EBADCHARSET
|
|
&& *indexCode != EDOCDUPWWW
|
|
&& *indexCode != EBADIP
|
|
&& *indexCode != EDOCEVILREDIRECT // fix video.google.com dmoz
|
|
&& *indexCode != EBADMIME
|
|
// index.t and .exe files are in dmoz but those
|
|
// extensions are "bad" according to Url::isBadExtension()
|
|
&& *indexCode != EDOCBADCONTENTTYPE
|
|
// repeat url path components are ok:
|
|
&& *indexCode != ELINKLOOP
|
|
&& *indexCode != ECONNREFUSED
|
|
// malformed sections:
|
|
&& *indexCode != EDOCBADSECTIONS
|
|
&& *indexCode != ECORRUPTHTTPGZIP
|
|
)
|
|
return indexCode;
|
|
|
|
// ok, neutralize it
|
|
*indexCode = 0;
|
|
|
|
// if we could not get an ip we need to make a fake one
|
|
if ( ! m_ipValid || m_ip == 0 || m_ip == -1 ) {
|
|
log("build: ip unattainable. forcing ip address of %s "
|
|
"to 10.5.123.45",m_firstUrl.m_url);
|
|
m_ip = atoip("10.5.123.45");
|
|
m_ipValid = true;
|
|
}
|
|
|
|
// make certain things valid to avoid core in getNewSpiderReply()
|
|
if ( ! m_crawlDelayValid ) {
|
|
m_crawlDelayValid = true;
|
|
m_crawlDelay = -1;
|
|
}
|
|
|
|
return indexCode;
|
|
}
|
|
|
|
|
|
// . return NULL and sets g_errno on error
|
|
// . returns -1 if blocked
|
|
int32_t *XmlDoc::getIndexCode2 ( ) {
|
|
|
|
// return it now if we got it already
|
|
if ( m_indexCodeValid ) return &m_indexCode;
|
|
|
|
setStatus ( "getting index code");
|
|
|
|
// page inject can set deletefromindex to true
|
|
if ( m_deleteFromIndex ) {
|
|
m_indexCode = EDOCFORCEDELETE;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block and return -1, we will be re-called from the top
|
|
//if ( ! m_masterLoop ) {
|
|
// m_masterLoop = getTitleRecWrapper;
|
|
// m_masterState = this;
|
|
//}
|
|
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( m_firstUrl.m_ulen <= 5 ) {
|
|
m_indexCode = EBADURL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
if ( m_firstUrl.m_ulen + 1 >= MAX_URL_LEN ) {
|
|
m_indexCode = EURLTOOLONG;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// "url is repeating path components" error?
|
|
if ( ! m_check1 ) {
|
|
m_check1 = true;
|
|
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
|
|
m_indexCode = ELINKLOOP;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// fix for "http://.xyz.com/...."
|
|
if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
|
|
m_indexCode = EBADURL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
|
|
m_check2 = true;
|
|
if ( m_firstUrl.isSpam() ) {
|
|
m_indexCode = EDOCURLSPAM;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// . don't spider robots.txt urls for indexing!
|
|
// . quickly see if we are a robots.txt url originally
|
|
int32_t fulen = getFirstUrl()->getUrlLen();
|
|
char *fu = getFirstUrl()->getUrl();
|
|
char *fp = fu + fulen - 11;
|
|
if ( fulen > 12 &&
|
|
fp[1] == 'r' &&
|
|
! strncmp ( fu + fulen - 11 , "/robots.txt" , 11 )) {
|
|
m_indexCode = EBADURL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// if this is an injection and "newonly" is not zero then we
|
|
// only want to do the injection if the url is "new", meaning not
|
|
// already indexed. "m_wasContentInjected" will be true if this is
|
|
// an injection. "m_newOnly" will be true if the injector only
|
|
// wants to proceed with the injection if this url is not already
|
|
// indexed.
|
|
if ( m_wasContentInjected && m_newOnly ) {
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
|
|
XmlDoc *od = *pod;
|
|
// if the old doc does exist and WAS NOT INJECTED itself
|
|
// then abandon this injection. it was spidered the old
|
|
// fashioned way and we want to preserve it and NOT overwrite
|
|
// it with this injection.
|
|
if ( od && ! od->m_wasContentInjected ) {
|
|
m_indexCode = EABANDONED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
// if it was injected itself, only abandon this injection
|
|
// in the special case that m_newOnly is "1". otherwise
|
|
// if m_newOnly is 2 then we will overwrite any existing
|
|
// titlerecs that were not injected themselves.
|
|
if ( od && od->m_wasContentInjected && m_newOnly == 1 ) {
|
|
m_indexCode = EABANDONED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
}
|
|
|
|
// need tagrec to see if banned
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// this is an automatic ban!
|
|
if ( gr->getLong("manualban",0) ) {
|
|
m_indexCode = EDOCBANNED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
|
|
// get the ip of the current url
|
|
int32_t *ip = getIp ( );
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
|
|
if ( *ip == 0 ) {
|
|
m_indexCode = EBADIP;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . check robots.txt
|
|
// . uses the current url
|
|
// . if we end in /robots.txt then this quickly returns true
|
|
// . no, we still might want to index if we got link text, so just
|
|
// check this again below
|
|
bool *isAllowed = getIsAllowed();
|
|
if ( ! isAllowed || isAllowed == (void *)-1) return (int32_t *)isAllowed;
|
|
/*
|
|
if ( ! *isAllowed ) {
|
|
m_indexCode = EDOCDISALLOWED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
*/
|
|
|
|
// . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc.
|
|
// . this will be the reply from diffbot.com if using diffbot
|
|
int32_t *dstatus = getDownloadStatus();
|
|
if ( ! dstatus || dstatus == (void *)-1 ) return (int32_t *)dstatus;
|
|
if ( *dstatus ) {
|
|
m_indexCode = *dstatus;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// check the mime
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (int32_t *)mime;
|
|
// no, now the smart compression will nuke a reply if it has
|
|
// no good date or for other reasons...
|
|
// if empty, bad mime
|
|
//if ( mime->getMimeLen() <= 0 && ! m_recycleContent ) {
|
|
// m_indexCode = EBADMIME;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
//}
|
|
|
|
// check redir url
|
|
Url **redirp = getRedirUrl();
|
|
if ( ! redirp || redirp == (void *)-1 ) return (int32_t *)redirp;
|
|
// this must be valid now
|
|
if ( ! m_redirErrorValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_redirError ) {
|
|
m_indexCode = m_redirError;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (void *)-1 ) return (int32_t *)d;
|
|
if ( *d == 0LL ) {
|
|
m_indexCode = ENODOCID;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . is the same url but with a www. present already in titledb?
|
|
// . example: if we are xyz.com and www.xyz.com is already in titledb
|
|
// then nuke ourselves by setting m_indexCode to EDOCDUPWWW
|
|
char *isWWWDup = getIsWWWDup ();
|
|
if ( ! isWWWDup || isWWWDup == (char *)-1) return (int32_t *)isWWWDup;
|
|
if ( *isWWWDup ) {
|
|
m_indexCode = EDOCDUPWWW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
|
|
uint16_t *charset = getCharset();
|
|
if ( ! charset && g_errno == EBADCHARSET ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBADCHARSET;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if ( ! charset || charset == (void *)-1) return (int32_t *)charset;
|
|
// we had a 2024 for charset come back and that had a NULL
|
|
// get_charset_str() but it was not supported
|
|
if ( ! supportedCharset(*charset) ) { //&&get_charset_str(*charset) ) {
|
|
m_indexCode = EBADCHARSET;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// get local link info
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int32_t *)info1;
|
|
// get remote link info
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (int32_t *)pinfo2;
|
|
LinkInfo *info2 = *pinfo2;
|
|
|
|
// if robots.txt said no, and if we had no link text, then give up
|
|
bool disallowed = true;
|
|
if ( *isAllowed ) disallowed = false;
|
|
if ( info1 && info1->hasLinkText() ) disallowed = false;
|
|
if ( info2 && info2->hasLinkText() ) disallowed = false;
|
|
// if we generated a new sitenuminlinks to store in tagdb, we might
|
|
// want to add this for that only reason... consider!
|
|
if ( disallowed ) {
|
|
m_indexCode = EDOCDISALLOWED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// check for bad url extension, like .jpg
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (int32_t *)cu;
|
|
|
|
// take this check out because it is hurting
|
|
// http://community.spiceworks.com/profile/show/Mr.T
|
|
// because 't' was in the list of bad extensions.
|
|
// now we use the url filters table to exclude the extensions we want.
|
|
// and we use the 'ismedia' directive to exclude common media
|
|
// extensions. having this check here is no longer needed and confusing
|
|
// BUT on the other hand stuff like .exe .rpm .deb is good to avoid!
|
|
// so i'll just edit the list to remove more ambiguous extensions
|
|
// like .f and .t
|
|
bool badExt = cu->isBadExtension ( m_version );
|
|
if ( badExt && ! info1->hasLinkText() &&
|
|
( ! info2 || ! info2->hasLinkText() ) ) {
|
|
m_indexCode = EDOCBADCONTENTTYPE;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
int16_t *hstatus = getHttpStatus();
|
|
if ( ! hstatus || hstatus == (void *)-1 ) return (int32_t *)hstatus;
|
|
if ( *hstatus != 200 ) {
|
|
m_indexCode = EDOCBADHTTPSTATUS;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// debug point
|
|
//if ( cr->m_localCrawlInfo.m_pageDownloadAttempts >= 2 ) {
|
|
// m_indexCode = ETCPTIMEDOUT;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
//}
|
|
|
|
// if this page is hijacked, toss it!
|
|
char *hj = getIsHijacked();
|
|
if ( ! hj || hj == (char *)-1 ) return (int32_t *)hj;
|
|
// if not allowed m_indexCode will be set
|
|
if ( *hj ) {
|
|
m_indexCode = EDOCHIJACKED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// check for EDOCISERRPG (custom error pages)
|
|
char *isErrorPage = getIsErrorPage();
|
|
if ( !isErrorPage||isErrorPage==(void *)-1) return (int32_t *)isErrorPage;
|
|
if ( *isErrorPage ) {
|
|
m_indexCode = EDOCISERRPG;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . i moved this up to perhaps fix problems of two dup pages being
|
|
// downloaded at about the same time
|
|
// . are we a dup of another doc from any other site already indexed?
|
|
char *isDup = getIsDup();
|
|
if ( ! isDup || isDup == (char *)-1 ) return (int32_t *)isDup;
|
|
if ( *isDup ) {
|
|
m_indexCode = EDOCDUP;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// . is a non-canonical page that has <link href=xxx rel=canonical>
|
|
// . also sets the canonical redir url to it if we are not
|
|
// . returns NULL if we are the canonical url
|
|
// . do not do this check if the page was injected
|
|
bool checkCanonical = true;
|
|
if ( m_wasContentInjected ) checkCanonical = false;
|
|
if ( m_isInjecting && m_isInjectingValid ) checkCanonical = false;
|
|
// do not do canonical deletion if recycling content either i guess
|
|
if ( m_sreqValid && m_sreq.m_recycleContent ) checkCanonical = false;
|
|
// do not delete from being canonical if doing a query reindex
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex ) checkCanonical = false;
|
|
if ( checkCanonical ) {
|
|
Url **canon = getCanonicalRedirUrl();
|
|
if ( ! canon || canon == (void *)-1 ) return (int32_t *)canon;
|
|
// if there is one then we are its leaf, it is the primary
|
|
// page so we should not index ourselves
|
|
if ( *canon ) {
|
|
m_indexCode = EDOCNONCANONICAL;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// was page unchanged since last time we downloaded it?
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
|
|
XmlDoc *od = NULL;
|
|
if ( *pod ) od = *pod;
|
|
|
|
// if recycling content is true you gotta have an old title rec.
|
|
if ( ! od && m_recycleContent ) {
|
|
m_indexCode = ENOTITLEREC;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
bool check = true;
|
|
if ( ! od ) check = false;
|
|
// do not do this logic for diffbot because it might want to get
|
|
// the diffbot reply even if page content is the same, because it
|
|
// might have an ajax call that updates the product price.
|
|
// onlyProcessIfNewUrl defaults to true, so typically even diffbot
|
|
// crawls will do this check.
|
|
if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl &&
|
|
// but allow urls like *-diffbotxyz2445187448 to be deduped,
|
|
// that is the whole point of this line
|
|
! m_isDiffbotJSONObject )
|
|
check = false;
|
|
if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError )
|
|
check = false;
|
|
// or if recycling content turn this off as well! otherwise
|
|
// it will always be 100% the same
|
|
if ( m_recycleContent )
|
|
check = false;
|
|
// never check for a bulk job
|
|
if ( cr->m_isCustomCrawl == 2 )
|
|
check = false;
|
|
|
|
if ( check ) {
|
|
// check inlinks now too!
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 )
|
|
return (int32_t *)info1;
|
|
LinkInfo *info2 = od->getLinkInfo1 ();
|
|
if ( ! info2 || info2 == (LinkInfo *)-1 )
|
|
return (int32_t *)info2;
|
|
Inlink *k1 = NULL;
|
|
Inlink *k2 = NULL;
|
|
char *s1, *s2;
|
|
int32_t len1,len2;
|
|
if ( info1->getNumGoodInlinks() !=
|
|
info2->getNumGoodInlinks() )
|
|
goto changed;
|
|
for ( ; k1=info1->getNextInlink(k1) ,
|
|
k2=info2->getNextInlink(k2); ) {
|
|
if ( ! k1 )
|
|
break;
|
|
if ( ! k2 )
|
|
break;
|
|
if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks )
|
|
goto changed;
|
|
s1 = k1->getLinkText();
|
|
len1 = k1->size_linkText - 1; // exclude \0
|
|
s2 = k2->getLinkText();
|
|
len2 = k2->size_linkText - 1; // exclude \0
|
|
if ( len1 != len2 )
|
|
goto changed;
|
|
if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
|
|
goto changed;
|
|
}
|
|
// no change in link text, look for change in page content now
|
|
int32_t *ch32 = getContentHash32();
|
|
if ( ! ch32 || ch32 == (void *)-1 ) return (int32_t *)ch32;
|
|
if ( *ch32 == od->m_contentHash32 ) {
|
|
m_indexCode = EDOCUNCHANGED;
|
|
m_indexCodeValid = true;
|
|
// hack these values on or off.
|
|
// really should be function calls.
|
|
// but it never gets set when it should if the
|
|
// doc is unchanged.
|
|
m_sentToDiffbot = od->m_sentToDiffbot;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
changed:
|
|
// words
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (int32_t *)words;
|
|
|
|
// we set the D_IS_IN_DATE flag for these bits
|
|
Bits *bits = getBits(); if ( ! bits ) return NULL;
|
|
|
|
// . check for date buffer overflow before setting sections
|
|
// . returns false and sets g_errno on error
|
|
/*
|
|
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits )) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: parseDates: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
if ( g_errno != EBUFOVERFLOW ) return NULL;
|
|
g_errno = 0;
|
|
m_indexCode = EDOCBADDATES;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
*/
|
|
|
|
// bad sections? fixes http://www.beerexpedition.com/northamerica.shtml
|
|
// being continuously respidered when its lock expires every
|
|
// MAX_LOCK_AGE seconds
|
|
Sections *sections = getSections();
|
|
// on EBUFOVERFLOW we will NEVER be able to parse this url
|
|
// correctly so do not retry!
|
|
if ( ! sections && g_errno == EBUFOVERFLOW ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if (!sections||sections==(Sections *)-1) return (int32_t *)sections;
|
|
if ( sections->m_numSections == 0 && words->m_numWords > 0 ) {
|
|
m_indexCode = EDOCBADSECTIONS;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
// i think an oom error is not being caught by Sections.cpp properly
|
|
if ( g_errno ) { char *xx=NULL;*xx=0; }
|
|
|
|
Dates *dp = getDates();
|
|
if ( ! dp && g_errno == EBUFOVERFLOW ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
|
|
|
|
// make sure address buffers did not overflow
|
|
Addresses *aa = getAddresses ();
|
|
if ( (! aa && g_errno == EBUFOVERFLOW) ||
|
|
// it sets m_breached now if there's a problem
|
|
(aa && aa->m_breached) ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
if ( ! aa || aa == (void *)-1 ) return (int32_t *)aa;
|
|
|
|
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (int32_t *)isRoot;
|
|
|
|
// get the tag rec
|
|
//TagRec *gr = getTagRec ();
|
|
//if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
|
|
bool spamCheck = true;
|
|
// if we are a root, allow repeat spam
|
|
if ( *isRoot ) spamCheck = false;
|
|
// if we are being spidered deep, allow repeat spam
|
|
if ( gr->getLong("deep",0) ) spamCheck = false;
|
|
// not for crawlbot
|
|
if ( cr->m_isCustomCrawl ) spamCheck = false;
|
|
// only html for now
|
|
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
|
|
// turn this off for now
|
|
spamCheck = false;
|
|
// otherwise, check the weights
|
|
if ( spamCheck ) {
|
|
char *ws = getWordSpamVec();
|
|
if ( ! ws || ws == (void *)-1 ) return (int32_t *)ws;
|
|
if ( m_isRepeatSpammer ) {
|
|
m_indexCode = EDOCREPEATSPAMMER;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
}
|
|
|
|
// validate this here so getSpiderPriority(), which calls
|
|
// getUrlFilterNum(), which calls getNewSpiderReply(), which calls
|
|
// us, getIndexCode() does not repeat all this junk
|
|
//m_indexCodeValid = true;
|
|
//m_indexCode = 0;
|
|
|
|
// fix query reindex on global-index from coring because
|
|
// the spider request is null
|
|
if ( m_isDiffbotJSONObject ) {
|
|
m_indexCode = 0;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
|
|
|
|
// this needs to be last!
|
|
int32_t *priority = getSpiderPriority();
|
|
if ( ! priority || priority == (void *)-1) {
|
|
// allow this though
|
|
if ( g_errno == EBUFOVERFLOW ) {
|
|
g_errno = 0;
|
|
m_indexCode = EBUFOVERFLOW;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
// but if it blocked, then un-validate it
|
|
m_indexCodeValid = false;
|
|
// and return to be called again i hope
|
|
return (int32_t *)priority;
|
|
}
|
|
if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) {
|
|
m_indexCode = EDOCFILTERED;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
|
|
// if ( *priority == SPIDER_PRIORITY_BANNED ) {
|
|
// m_indexCode = EDOCBANNED;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
// }
|
|
|
|
// . if using diffbot and the diffbot reply had a time out error
|
|
// or otherwise... diffbot failure demands a re-try always i guess.
|
|
// put this above getSpiderPriority() call otherwise we end up in
|
|
// a recursive loop with getIndexCode() and getNewSpiderReply()
|
|
// . NO, don't do this anymore, however, if there is a diffbot
|
|
// reply error then record it in the spider reply BUT only if it is
|
|
// a diffbot reply error that warrants a retry. for instance,
|
|
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
|
|
// error trying to download the page so it probably should not
|
|
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
|
|
// SafeBuf *dbr = getDiffbotReply();
|
|
// if ( ! dbr || dbr == (void *)-1 ) return (int32_t *)dbr;
|
|
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
|
|
// m_indexCode= m_diffbotReplyError;
|
|
// m_indexCodeValid = true;
|
|
// return &m_indexCode;
|
|
// }
|
|
|
|
// no error otherwise
|
|
m_indexCode = 0;
|
|
m_indexCodeValid = true;
|
|
return &m_indexCode;
|
|
}
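// A note on the calling convention exercised so heavily above (and
// throughout this file): each get*() accessor returns NULL with g_errno
// set on error, (T *)-1 if it blocked and will resume via m_masterLoop,
// or a valid pointer once its value is computed and cached. So callers
// invariably look like this illustrative line before using the value:
//
//	int32_t *ic = getIndexCode();
//	if ( ! ic || ic == (int32_t *)-1 ) return (char *)ic; // error or blocked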
|
|
|
|
char *XmlDoc::prepareToMakeTitleRec ( ) {
|
|
// do not re-call this for speed
|
|
if ( m_prepared ) return (char *)1;
|
|
|
|
int32_t *indexCode = getIndexCode();
|
|
if (! indexCode || indexCode == (void *)-1) return (char *)indexCode;
|
|
if ( *indexCode ) { m_prepared = true; return (char *)1; }
|
|
|
|
//
|
|
// do all the sets here
|
|
//
|
|
|
|
// . this gets our old doc from titledb, if we got it
|
|
// . TODO: make sure this is cached in the event of a backoff, we
|
|
// will redo this again!!! IMPORTANT!!!
|
|
char *isIndexed = getIsIndexed();
|
|
if ( ! isIndexed || isIndexed == (char *)-1) return (char *)isIndexed;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if we are injecting into the "qatest123" coll, then we need to have
|
|
// m_spideredTimeValid be true before calling getIsSpam() which calls
|
|
// getSiteNumInlinks() which adds tags to tagdb using that date, but
|
|
// only for the "qatest123" coll!
|
|
// that keeps our parser output consistent across runs!
|
|
char **content = NULL;
|
|
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
|
|
content = getContent ( );
|
|
if ( ! content || content == (void *)-1 )
|
|
return (char *)content;
|
|
}
|
|
|
|
// get our site root
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (char *)mysite;
|
|
|
|
// if we are a root page, update tagdb with the root lang id
|
|
//bool *status1 = updateRootLangId();
|
|
//if ( ! status1 || status1 == (void *)-1 ) return (char *)status1;
|
|
|
|
// if we are a root page, update tagdb with the root lang id
|
|
//bool *status2 = updateSiteTitleBuf();
|
|
//if ( ! status2 || status2 == (void *)-1 ) return (char *)status2;
|
|
|
|
// if we found some default venue addresses on page, add to tagdb
|
|
//bool *status3 = updateVenueAddresses();
|
|
//if ( ! status3 || status3 == (void *)-1 ) return (char *)status3;
|
|
|
|
// add "firstip" to tag rec if we need to
|
|
//bool *status4 = updateFirstIp();
|
|
//if ( ! status4 || status4 == (void *)-1 ) return (char *)status4;
|
|
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
|
|
|
|
int32_t *datedbDate = getPubDate();
|
|
if ( ! datedbDate || datedbDate == (int32_t *)-1 )
|
|
return (char *)datedbDate;
|
|
|
|
getHostHash32a();
|
|
getContentHash32();
|
|
|
|
//Images *images = getImages();
|
|
//if ( ! images || images == (Images *)-1 ) return (char *)images;
|
|
|
|
char **id = getThumbnailData();
|
|
if ( ! id || id == (void *)-1 ) return (char *)id;
|
|
|
|
int8_t *hopCount = getHopCount();
|
|
if ( ! hopCount || hopCount == (void *)-1 ) return (char *)hopCount;
|
|
|
|
char *spiderLinks = getSpiderLinks();
|
|
if ( ! spiderLinks || spiderLinks == (char *)-1 )
|
|
return (char *)spiderLinks;
|
|
|
|
//int32_t *nextSpiderTime = getNextSpiderTime();
|
|
//if ( ! nextSpiderTime || nextSpiderTime == (int32_t *)-1 )
|
|
// return (char *)nextSpiderTime;
|
|
|
|
//int8_t *nextSpiderPriority = getNextSpiderPriority();
|
|
//if ( ! nextSpiderPriority || nextSpiderPriority == (void *)-1 )
|
|
// return (char *)nextSpiderPriority;
|
|
|
|
int32_t *firstIndexedDate = getFirstIndexedDate();
|
|
if ( ! firstIndexedDate || firstIndexedDate == (int32_t *)-1 )
|
|
return (char *)firstIndexedDate;
|
|
|
|
int32_t *outlinksAddedDate = getOutlinksAddedDate();
|
|
if ( ! outlinksAddedDate || outlinksAddedDate == (int32_t *)-1 )
|
|
return (char *)outlinksAddedDate;
|
|
|
|
uint16_t *countryId = getCountryId();
|
|
if ( ! countryId||countryId==(uint16_t *)-1) return (char *)countryId;
|
|
|
|
char *trunc = getIsContentTruncated();
|
|
if ( ! trunc || trunc == (char *)-1 ) return (char *)trunc;
|
|
|
|
char *pl = getIsPermalink();
|
|
if ( ! pl || pl == (char *)-1 ) return (char *)pl;
|
|
|
|
//int32_t *numBannedOutlinks = getNumBannedOutlinks();
|
|
// set this
|
|
//m_numBannedOutlinks8 = score32to8 ( *numBannedOutlinks );
|
|
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (char *)dp;
|
|
|
|
// . before storing this into title Rec, make sure all tags
|
|
// are valid and tagRec is up to date
|
|
// . like we might need to update the contact info, siteNumInlinks,
|
|
// or other tags because, for instance, contact info might not
|
|
// be in there because isSpam() never required it.
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
|
|
char *ict = getIsContentTruncated();
|
|
if ( ! ict || ict == (char *)-1 ) return (char *)ict;
|
|
int64_t **wd = getWikiDocIds();
|
|
if ( ! wd || wd == (void *)-1 ) return (char *)wd;
|
|
int64_t **avp = getAdVector();
|
|
if ( ! avp || avp == (void *)-1 ) return (char *)avp;
|
|
char *at = getIsAdult();
|
|
if ( ! at || at == (void *)-1 ) return (char *)at;
|
|
char *ls = getIsLinkSpam();
|
|
if ( ! ls || ls == (void *)-1 ) return (char *)ls;
|
|
uint32_t *tph = getTagPairHash32();
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
|
|
|
|
// sets the ptr_sectionsReply, that is all we need it to do
|
|
//char **sd = getSectionsReply ( ) ;
|
|
//if ( ! sd || sd == (void *)-1 ) return (char *)sd;
|
|
// sets the ptr_addressReply, that is all we need it to do
|
|
//char **ad = getAddressReply ( ) ;
|
|
//if ( ! ad || ad == (void *)-1 ) return (char *)ad;
|
|
uint8_t *rl = getRootLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (char *)rl;
|
|
int32_t **pcids = getCatIds();
|
|
if ( ! pcids || pcids == (void *)-1) return (char *)pcids;
|
|
// get dmoz ptr_dmozTitles, ptr_dmozSumms, ptr_dmozAnchors
|
|
if ( ! setDmozInfo() ) return (char *)-1;
|
|
|
|
m_prepared = true;
|
|
return (char *)1;
|
|
}
|
|
|
|
#define MAX_DMOZ_TITLES 10
|
|
|
|
int32_t *XmlDoc::getNumDmozEntries() {
|
|
// MDW: wth is this?
|
|
//int32_t **getDmozCatIds();
|
|
int32_t nc = size_catIds / 4;
|
|
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
|
|
m_numDmozEntries = nc;
|
|
return &m_numDmozEntries;
|
|
}
|
|
// list of \0 terminated titles, etc. use getNumDmozTitles() to get #
|
|
char **XmlDoc::getDmozTitles ( ) {
|
|
// returns false if blocked
|
|
if ( ! setDmozInfo() ) return (char **)-1;
|
|
if ( g_errno ) return NULL;
|
|
return &ptr_dmozTitles;
|
|
}
|
|
char **XmlDoc::getDmozSummaries ( ) {
|
|
// returns false if blocked
|
|
if ( ! setDmozInfo() ) return (char **)-1;
|
|
if ( g_errno ) return NULL;
|
|
return &ptr_dmozSumms;
|
|
}
|
|
char **XmlDoc::getDmozAnchors ( ) {
|
|
// returns false if blocked
|
|
if ( ! setDmozInfo() ) return (char **)-1;
|
|
if ( g_errno ) return NULL;
|
|
return &ptr_dmozAnchors;
|
|
}
|
|
|
|
|
|
// returns false if blocked, true otherwise. sets g_errno on error & rets true
|
|
bool XmlDoc::setDmozInfo () {
|
|
|
|
if ( m_dmozInfoValid ) return true;
|
|
|
|
g_errno = 0;
|
|
|
|
// return true and set g_errno on error
|
|
if ( ! m_dmozBuf.reserve(12000) ) {
|
|
log("xmldoc: error getting dmoz info: %s",mstrerror(g_errno));
|
|
// ensure log statement does not clear g_errno
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// start here
|
|
char *dmozBuf = m_dmozBuf.getBufStart();
|
|
|
|
char *titles = dmozBuf;
|
|
char *summs = dmozBuf+5000;
|
|
char *anchors = dmozBuf+10000;
|
|
// the end of it
|
|
char *dtend = dmozBuf + 5000;
|
|
char *dsend = dmozBuf + 10000;
|
|
char *daend = dmozBuf + 12000;
|
|
// point into those bufs
|
|
char *dt = titles;
|
|
char *ds = summs;
|
|
char *da = anchors;
|
|
// MDW: i limit this to 10 to save stack space!
|
|
int32_t nc = size_catIds / 4;
|
|
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
|
|
for (int32_t i = 0; i < nc ; i++) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// temp stuff
|
|
int32_t dtlen = 0;
|
|
int32_t dslen = 0;
|
|
unsigned char dalen = 0;
|
|
|
|
// . store all dmoz info separated by \0's into titles[] buffer
|
|
// . crap, this does a disk read and blocks on that
|
|
//
|
|
// . TODO: make it non-blocking!!!!
|
|
//
|
|
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
|
|
m_firstUrl.getUrlLen(),
|
|
ptr_catIds[i],
|
|
dt,//&titles[titlesLen],
|
|
&dtlen,//&titleLens[i],
|
|
dtend-dt,
|
|
ds,//&summs[summsLen],
|
|
&dslen,//&summLens[i],
|
|
dsend-ds,
|
|
da,//&anchors[anchorsLen],
|
|
&dalen,//&anchorLens[i],
|
|
daend-da,
|
|
m_niceness);
|
|
// advance ptrs
|
|
dt += dtlen;
|
|
ds += dslen;
|
|
da += dalen;
|
|
// null terminate
|
|
*dt++ = 0;
|
|
*ds++ = 0;
|
|
*da++ = 0;
|
|
}
|
|
|
|
// if empty, make it a \0 to keep in sync with the rest
|
|
if ( dt == titles ) *dt++ = '\0';
|
|
if ( ds == summs ) *ds++ = '\0';
|
|
if ( da == anchors ) *da++ = '\0';
|
|
|
|
// set these
|
|
ptr_dmozTitles = titles;
|
|
ptr_dmozSumms = summs;
|
|
ptr_dmozAnchors = anchors;
|
|
size_dmozTitles = dt - titles;
|
|
size_dmozSumms = ds - summs;
|
|
size_dmozAnchors = da - anchors;
|
|
|
|
m_dmozInfoValid = true;
|
|
return true;
|
|
}
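// For reference, setDmozInfo() above carves m_dmozBuf (12000 bytes) into
// three fixed regions: titles in [0,5000), summaries in [5000,10000) and
// anchors in [10000,12000). Each region holds up to MAX_DMOZ_TITLES
// \0-terminated strings laid end to end, one per entry in ptr_catIds, and
// ptr_dmozTitles/ptr_dmozSumms/ptr_dmozAnchors point at the start of each
// region with the corresponding size_* set to the bytes actually used.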
|
|
|
|
// . create and store the titlerec into "buf".
|
|
// . it is basically the header part of all the member vars in this XmlDoc.
|
|
// . it has a key,dataSize,compressedData so it can be a record in an Rdb
|
|
// . return true on success, false on failure
|
|
bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
|
|
|
|
//setStatus ( "making title rec");
|
|
|
|
// assume could not make one because we were banned or something
|
|
tbuf->purge(); // m_titleRec = NULL;
|
|
|
|
// start setting members in THIS's header before compression
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
|
|
// tag rec must have "sitenuminlinks" in it
|
|
//if (! m_newTagRec.getTag("sitenuminlinks") ) { char *xx=NULL;*xx=0; }
|
|
// we often update m_oldTagRec above by calling updateRootLangId(), etc
|
|
// so update the size our of tag rec here
|
|
//size_tagRecData = m_oldTagRec.getSize();
|
|
// and sanity check this
|
|
//if( ptr_tagRecData != (char *)&m_oldTagRec ) { char *xx=NULL;*xx=0; }
|
|
|
|
// lookup dmoz title and summary for this site
|
|
//int32_t titleLens [10];
|
|
//int32_t summLens [10];
|
|
//unsigned char anchorLens [10];
|
|
//int32_t titlesLen = 0;
|
|
//int32_t summsLen = 0;
|
|
//int32_t anchorsLen = 0;
|
|
//char titles [10*1024];
|
|
//char summs [10*4096];
|
|
//char anchors [10* 256];
|
|
|
|
/*
|
|
|
|
MDW oct 12 2013 -
|
|
why is this here? we should store this info at spider time?
|
|
|
|
char *titles = m_dmozBuf;
|
|
char *summs = m_dmozBuf+5000;
|
|
char *anchors = m_dmozBuf+10000;
|
|
// the end of it
|
|
char *dtend = m_dmozBuf + 5000;
|
|
char *dsend = m_dmozBuf + 10000;
|
|
char *daend = m_dmozBuf + 12000;
|
|
// point into those bufs
|
|
char *dt = titles;
|
|
char *ds = summs;
|
|
char *da = anchors;
|
|
// MDW: i limit this to 10 to save stack space!
|
|
int32_t nc = size_catIds / 4;
|
|
if ( nc > 10 ) nc = 10;
|
|
for (int32_t i = 0; i < nc ; i++) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// temp stuff
|
|
int32_t dtlen = 0;
|
|
int32_t dslen = 0;
|
|
unsigned char dalen = 0;
|
|
|
|
// . store all dmoz info separated by \0's into titles[] buffer
|
|
// . crap, this does a disk read and blocks on that
|
|
//
|
|
// . TODO: make it non-blocking!!!!
|
|
//
|
|
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
|
|
m_firstUrl.getUrlLen(),
|
|
ptr_catIds[i],
|
|
dt,//&titles[titlesLen],
|
|
&dtlen,//&titleLens[i],
|
|
dtend-dt,
|
|
ds,//&summs[summsLen],
|
|
&dslen,//&summLens[i],
|
|
dsend-ds,
|
|
da,//&anchors[anchorsLen],
|
|
&dalen,//&anchorLens[i],
|
|
daend-da,
|
|
m_niceness);
|
|
// advance ptrs
|
|
dt += dtlen;
|
|
ds += dslen;
|
|
da += dalen;
|
|
// null terminate
|
|
if ( dtlen>0 && dt[dtlen-1]!='\0' ) { *dt++=0; dtlen++; }
|
|
if ( dslen>0 && ds[dslen-1]!='\0' ) { *ds++=0; dslen++; }
|
|
if ( dalen>0 && da[dalen-1]!='\0' ) { *da++=0; dalen++; }
|
|
// must always be something!
|
|
if ( dtlen==0 ) {*dt++=0; dtlen++;}
|
|
if ( dslen==0 ) {*ds++=0; dslen++;}
|
|
if ( dalen==0 ) {*da++=0; dalen++;}
|
|
}
|
|
|
|
// set these
|
|
ptr_dmozTitles = titles;
|
|
ptr_dmozSumms = summs;
|
|
ptr_dmozAnchors = anchors;
|
|
size_dmozTitles = dt - titles;
|
|
size_dmozSumms = ds - summs;
|
|
size_dmozAnchors = da - anchors;
|
|
*/
|
|
|
|
// set our crap that is not necessarily set
|
|
//ptr_firstUrl = m_firstUrl.getUrl();
|
|
//ptr_redirUrl = m_redirUrl.getUrl();
|
|
//ptr_tagRecData = (char *)&m_oldTagRec;
|
|
|
|
// this must be valid now
|
|
//if ( ! m_skipIndexingValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// CT_STATUS docs do not have a valid XmlDoc really, it is
|
|
// just the first 2048 bytes, so there is no m_collnum member
|
|
// in the first 2048 bytes that is valid or even in legit memory.
|
|
// see 'char xdhead[2048];' below.
|
|
CollectionRec *cr = NULL;
|
|
if ( m_contentType != CT_STATUS ) {
|
|
cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
}
|
|
|
|
// zero out the content to save disk space if it is a custom crawl
|
|
// and the page was not processed (i.e. sent to diffbot).
|
|
// this will cause some undeletable data in the index, like for
|
|
// indexing meta tags perhaps, but in general we do not index
|
|
// most of the html document in custom crawls because we set
|
|
// 'indexBody/indexDoc' to false. but don't do this if we have
|
|
// ever sent this url to diffbot for processing before at any time.
|
|
// this may screw up content hash deduping, because the original
|
|
// hash will always be indexed, even if the doc changes or is
|
|
// deleted.
|
|
bool zeroOut = false;
|
|
if ( cr && cr->m_isCustomCrawl && ! m_sentToDiffbot ) zeroOut = true;
|
|
if ( zeroOut && m_isDiffbotJSONObject ) zeroOut = false;
|
|
if ( zeroOut && ! m_exactContentHash64Valid ) zeroOut = false;
|
|
// don't zero out spider status documents
|
|
if ( zeroOut && m_contentType == CT_STATUS ) zeroOut = false;
|
|
// disable for now. probably most disk space is from the spider status
|
|
// docs.
|
|
//zeroOut = false;
|
|
char *savedPtr = ptr_utf8Content;
|
|
int32_t savedSize = size_utf8Content;
|
|
if ( zeroOut ) {
|
|
// record the 64 bit content hash here and make
|
|
// getExactContentHash64() return it as a 64-bit binary number.
|
|
// that way we can preserve it.
|
|
sprintf(m_tmp9,"gbzeroedout:%"UINT64"",m_exactContentHash64);
|
|
ptr_utf8Content = m_tmp9;
|
|
size_utf8Content = gbstrlen(ptr_utf8Content) + 1;
|
|
m_zeroedOut = true;
|
|
}
|
|
|
|
// set this
|
|
m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
|
|
|
|
// add in variable length data
|
|
int32_t *ps = (int32_t *)&size_firstUrl;
|
|
// data ptr, consider a NULL to mean empty too!
|
|
char **pd = (char **)&ptr_firstUrl;
|
|
// how many XmlDoc::ptr_* members do we have? set "np" to that
|
|
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
|
|
np /= sizeof(char *);
|
|
// count up total we need to alloc
|
|
int32_t need1 = m_headerSize;
|
|
// clear these
|
|
m_internalFlags1 = 0;
|
|
// loop over em
|
|
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
|
|
// skip if empty
|
|
if ( *ps <= 0 ) continue;
|
|
// or empty string ptr
|
|
if ( ! *pd ) continue;
|
|
// skip utf8content if we should -- no events or addresses
|
|
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
|
|
// 4 bytes for the size
|
|
need1 += 4;
|
|
// add it up
|
|
need1 += *ps;
|
|
// make the mask
|
|
uint32_t mask = 1 << i ;
|
|
// add it in
|
|
m_internalFlags1 |= mask;
|
|
}
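// so after this loop, bit i of m_internalFlags1 is set if and only if
// ptr_* member #i is non-empty and will be serialized below. for example
// (made-up case) if only ptr_firstUrl (i=0) and ptr_utf8Content (at some
// bit k) were non-empty, m_internalFlags1 would be (1<<0)|(1<<k) and
// need1 would be m_headerSize + (4+size_firstUrl) + (4+size_utf8Content).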
|
|
// alloc the buffer
|
|
char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
|
|
// return NULL with g_errno set on error
|
|
if ( ! ubuf ) {
|
|
// restore if we were zeroed out
|
|
ptr_utf8Content = savedPtr;
|
|
size_utf8Content = savedSize;
|
|
return false;
|
|
}
|
|
// serialize into it
|
|
char *p = ubuf;
|
|
// copy our crap into there
|
|
gbmemcpy ( p , &m_headerSize , m_headerSize );
|
|
// skip it
|
|
p += m_headerSize;
|
|
// reset data ptrs
|
|
pd = (char **)&ptr_firstUrl;
|
|
// reset data sizes
|
|
ps = (int32_t *)&size_firstUrl;
|
|
|
|
// then variable length data
|
|
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
|
|
// skip if empty, do not serialize
|
|
if ( ! *ps ) continue;
|
|
// or empty string ptr
|
|
if ( ! *pd ) continue;
|
|
// skip utf8content if we should -- no events or addresses
|
|
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
|
|
// store size first
|
|
*(int32_t *)p = *ps;
|
|
p += 4;
|
|
// then the data
|
|
gbmemcpy ( p , *pd , *ps );
|
|
// skip *ps bytes we wrote. should include a \0
|
|
p += *ps;
|
|
}
|
|
// sanity check
|
|
if ( p != ubuf + need1 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// restore in case zeroOut was true
|
|
ptr_utf8Content = savedPtr;
|
|
size_utf8Content = savedSize;
|
|
|
|
// now restore it for other functions to use
|
|
//size_content = saved;
|
|
|
|
// . now compress our "title rec" data into a titleRec
|
|
// . cbuf should not be set
|
|
//if ( cbuf ) {
|
|
// log(LOG_LOGIC,"db: titlerec: compress: cbuf is set.");
|
|
// char *p = NULL; *p = 0; exit(-1);
|
|
//}
|
|
// should we free cbuf on our reset/destruction?
|
|
//m_owncbuf = ownCompressedData;
|
|
// . make a buf big enough to hold compressed, we'll realloc afterwards
|
|
// . according to zlib.h line 613 compress buffer must be .1% larger
|
|
// than source plus 12 bytes. (i add one for round off error)
|
|
// . now i added another extra 12 bytes cuz compress seemed to want it
|
|
int32_t need2 = ((int64_t)need1 * 1001LL) / 1000LL + 13 + 12;
|
|
// we also need to store a key then regular dataSize then
|
|
// the uncompressed size in cbuf before the compression of m_ubuf
|
|
int32_t hdrSize = sizeof(key_t) + 4 + 4;
|
|
// . now i add 12 bytes more so Msg14.cpp can also squeeze in a
|
|
// negative key to delete the old titleRec, cuz we use this cbuf
|
|
// to set our list that we add to our twins with
|
|
// . we now store the negative rec before the positive rec in Msg14.cpp
|
|
//hdrSize += sizeof(key_t) + 4;
|
|
need2 += hdrSize;
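// worked example (made-up numbers): if the serialized uncompressed buffer
// is need1 = 100000 bytes, then need2 = 100000*1001/1000 + 13 + 12 = 100125
// for the compressed data, plus hdrSize = sizeof(key_t) (12 bytes for
// titledb keys) + 4 + 4 = 20 bytes of uncompressed header, so we reserve
// 100145 bytes in tbuf below.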
|
|
// alloc what we need
|
|
//char *cbuf = (char *) mmalloc ( need2 ,"TitleRecc");
|
|
//if ( ! cbuf ) return false;
|
|
// return false on error
|
|
if ( ! tbuf->reserve ( need2 ,"titbuf" ) ) return false;
|
|
// shortcut
|
|
char *cbuf = tbuf->getBufStart();
|
|
// set cbuf sizes, we set cbufSize below to fit exactly used buf
|
|
//int32_t cbufMaxSize = need2;
|
|
// . how big is the buf we're passing to ::compress()?
|
|
// . don't include the last 12 byte, save for del key in Msg14.cpp
|
|
int32_t size = need2 - hdrSize ;
|
|
// . compress the data from ubuf into cbuf + hdrSize
|
|
// . this will reset cbufSize to a smaller value probably
|
|
// . "size" is set to how many bytes we wrote into "cbuf + hdrSize"
|
|
int err = gbcompress ( (unsigned char *)cbuf + hdrSize,
|
|
(uint32_t *)&size,
|
|
(unsigned char *)ubuf ,
|
|
(uint32_t )need1 );
|
|
// note it
|
|
//log("test: compressed %s from %"INT32" to %"INT32" bytes",
|
|
// m_firstUrl.m_url,need2-hdrSize,size);
|
|
// free the buf we were trying to compress now
|
|
mfree ( ubuf , need1 , "trub" );
|
|
// we should check ourselves
|
|
if ( err == Z_OK && size > (need2 - hdrSize ) ) {
|
|
//mfree ( cbuf , need2 ,"TitleRecc" );
|
|
tbuf->purge();
|
|
g_errno = ECOMPRESSFAILED;
|
|
log("db: Failed to compress document of %"INT32" bytes. "
|
|
"Provided buffer of %"INT32" bytes.",
|
|
size, (need2 - hdrSize ) );
|
|
return false;
|
|
}
|
|
// check for error
|
|
if ( err != Z_OK ) {
|
|
//mfree ( cbuf , need2 ,"TitleRecc" );
|
|
tbuf->purge();
|
|
g_errno = ECOMPRESSFAILED;
|
|
log("db: Failed to compress document.");
|
|
return false;
|
|
}
|
|
// calc cbufSize, the uncompressed header + compressed stuff
|
|
//cbufSize = hdrSize + size ;
|
|
|
|
//int64_t uh48 = getFirstUrlHash48();
|
|
// . make the key from docId
|
|
// . false = delkey?
|
|
//m_titleRecKey = g_titledb.makeKey (*getDocId(),uh48,false);//delkey?
|
|
key_t tkey = g_titledb.makeKey (docId,uh48,false);//delkey?
|
|
// validate it
|
|
//m_titleRecKeyValid = true;
|
|
|
|
// get a ptr to the Rdb record at start of the header
|
|
p = cbuf;
|
|
// skip over the negative rec reserved space for Msg14.cpp
|
|
//p += 12 + 4;
|
|
// . store key in header of cbuf
|
|
// . store in our host byte ordering so we can be a rec in an RdbList
|
|
*(key_t *) p = tkey;
|
|
p += sizeof(key_t);
|
|
// store total dataSize in header (excluding itself and key only)
|
|
int32_t dataSize = size + 4;
|
|
*(int32_t *) p = dataSize ;
|
|
p += 4;
|
|
// store uncompressed size in header
|
|
*(int32_t *) p = need1 ; p += 4;
|
|
// sanity check
|
|
if ( p != cbuf + hdrSize ) { char *xx = NULL; *xx = 0; }
|
|
// sanity check
|
|
if ( need1 <= 0 ) { char *xx = NULL; *xx = 0; }
|
|
// advance over data
|
|
p += size;
|
|
|
|
// update safebuf::m_length so it is correct
|
|
tbuf->setLength ( p - cbuf );
|
|
|
|
return true;
|
|
}
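// For reference, the record that setTitleRecBuf() leaves in tbuf is laid
// out like a normal Rdb record:
//
//   key_t   tkey       (made from docId + uh48)
//   int32_t dataSize   (= 4 + compressed size, excludes key and itself)
//   int32_t need1      (uncompressed size, needed for decompression)
//   <gbcompress()'d blob of need1 bytes: the fixed header (m_headerSize
//    bytes copied starting at &m_headerSize) followed by one
//    [int32_t size][data] pair for every ptr_* member whose bit is set
//    in m_internalFlags1>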
|
|
|
|
// . return NULL and sets g_errno on error
|
|
// . returns -1 if blocked
|
|
SafeBuf *XmlDoc::getTitleRecBuf ( ) {
|
|
|
|
// return it now if we got it already
|
|
if ( m_titleRecBufValid ) return &m_titleRecBuf;
|
|
|
|
setStatus ( "making title rec");
|
|
|
|
// did one of our many blocking function calls have an error?
|
|
if ( g_errno ) return NULL;
|
|
|
|
// . HACK so that TitleRec::isEmpty() returns true
|
|
// . faster than calling m_titleRec.reset()
|
|
//m_titleRec.m_url.m_ulen = 0;
|
|
|
|
int32_t *indexCode = getIndexCode();
|
|
// not allowed to block here
|
|
if ( indexCode == (void *)-1) { char *xx=NULL;*xx=0; }
|
|
// return on errors with g_errno set
|
|
if ( ! indexCode ) return NULL;
|
|
// force delete? EDOCFORCEDELETE
|
|
if ( *indexCode ) { m_titleRecBufValid = true; return &m_titleRecBuf; }
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block and return -1, we will be re-called from the top
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getTitleRecBufWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
/*
|
|
// parsing knobs
|
|
if ( ! m_titleWeightValid ) {
|
|
// TODO: watchout for overruns!! these are 16-bits only!
|
|
//m_eliminateMenus = cr->m_eliminateMenus;
|
|
m_titleWeight = cr->m_titleWeight;
|
|
m_headerWeight = cr->m_headerWeight;
|
|
m_urlPathWeight = cr->m_urlPathWeight;
|
|
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
|
|
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
|
|
m_conceptWeight = cr->m_conceptWeight;
|
|
//int32_t siteNumInlinksBoost = cr->m_siteNumInlinksBoost;
|
|
// validate these
|
|
//m_eliminateMenusValid = true;
|
|
m_titleWeightValid = true;
|
|
m_headerWeightValid = true;
|
|
m_urlPathWeightValid = true;
|
|
m_externalLinkTextWeightValid = true;
|
|
m_internalLinkTextWeightValid = true;
|
|
m_conceptWeightValid = true;
|
|
}
|
|
*/
|
|
|
|
/////////
|
|
//
|
|
// IF ANY of these validation sanity checks fail then update
|
|
// prepareToMakeTitleRec() so it makes them valid!!!
|
|
//
|
|
/////////
|
|
|
|
// verify key parts
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// verify record parts
|
|
//if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_firstIndexedDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_outlinksAddedDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_charsetValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_countryIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_httpStatusValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
if ( ! m_titleWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_headerWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_urlPathWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_externalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_internalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_conceptWeightValid ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
|
|
// if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
|
|
// if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_metaListCheckSum8Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_numBannedOutlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_eliminateMenusValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_spiderLinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isContentTruncatedValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isLinkSpamValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// buffers
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_metaRedirUrlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_adVectorValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_imageDataValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_catIdsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_indCatIdsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_dmozInfoValid ) { char *xx=NULL;*xx=0; }
|
|
// if m_recycleContent is true, these are not valid
|
|
if ( ! m_recycleContent ) {
|
|
if ( ! m_rawUtf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_expandedUtf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_datesValid ) { char *xx=NULL;*xx=0; }
|
|
// why do we need valid sections for a titlerec? we no longer use
|
|
// ptr_sectiondbData...
|
|
//if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sectionsReplyValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_addressReplyValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sectiondbDataValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_placedbDataValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_clockCandidatesDataValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do we need these?
|
|
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_contentHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_tagHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_tagPairHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
// sanity checks
|
|
if ( ! m_addressesValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
setStatus ( "compressing into final title rec");
|
|
|
|
int64_t uh48 = getFirstUrlHash48();
|
|
|
|
int64_t *docId = getDocId();
|
|
|
|
// time it
|
|
int64_t startTime = gettimeofdayInMilliseconds();
|
|
|
|
//////
|
|
//
|
|
// fill in m_titleRecBuf
|
|
//
|
|
//////
|
|
|
|
// we need docid and uh48 for making the key of the titleRec
|
|
if ( ! setTitleRecBuf ( &m_titleRecBuf , *docId , uh48 ) )
|
|
return NULL;
|
|
|
|
// set this member down here because we can't set it in "xd"
|
|
// because it is too short of an xmldoc stub
|
|
m_versionValid = true;
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// . add the stat
|
|
// . use white for the stat
|
|
g_stats.addStat_r ( 0 ,
|
|
startTime ,
|
|
gettimeofdayInMilliseconds(),
|
|
0x00ffffff );
|
|
|
|
QUICKPOLL( m_niceness );
|
|
|
|
char *cbuf = m_titleRecBuf.getBufStart();
|
|
m_titleRecKey = *(key_t *)cbuf;
|
|
m_titleRecKeyValid = true;
|
|
|
|
// we are legit
|
|
//m_freeTitleRec = true;
|
|
//m_titleRec = cbuf;
|
|
// key + dataSize + ubufSize + compressedData
|
|
//m_titleRecSize = sizeof(key_t)+ 4 + 4 + size;
|
|
//m_titleRecAllocSize = need2;
|
|
|
|
// now valid. congratulations!
|
|
m_titleRecBufValid = true;
|
|
return &m_titleRecBuf;
|
|
}
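// A sketch of the m_masterLoop re-entry pattern used above: when a nested
// accessor blocks, its completion callback simply calls the registered
// master loop, which re-runs the whole routine from the top; the per-member
// "valid" flags make the re-run cheap because every finished step returns
// its cached value immediately. All names here are illustrative only.
#if 0
struct MiniDoc {
	void (*m_masterLoop)(void *state) = nullptr;
	void  *m_masterState              = nullptr;
	bool   m_aValid = false , m_bValid = false;
	int    m_a = 0 , m_b = 0;

	int *getA ( ) {                 // pretend this one can block
		if ( m_aValid ) return &m_a;
		// ... start async work; on completion the callback would
		// call m_masterLoop ( m_masterState ) ...
		return (int *)-1;       // blocked
	}

	int *getB ( ) {                 // top-level routine, re-entrant
		if ( m_bValid ) return &m_b;
		if ( ! m_masterLoop ) {
			m_masterLoop  = wrapper;
			m_masterState = this;
		}
		int *a = getA();
		if ( a == (int *)-1 ) return (int *)-1;  // try again later
		if ( ! a )            return nullptr;    // error
		m_b = *a + 1;
		m_bValid = true;
		return &m_b;
	}

	static void wrapper ( void *state ) {
		((MiniDoc *)state)->getB();              // re-enter from the top
	}
};
#endif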
|
|
|
|
|
|
// . an "id" of 2 means very indicative of a dirty doc
|
|
// . an "id" of 1 means it must be joined with another dirty word to indicate
|
|
// . taken mostly from Url.cpp
|
|
// . see matches2.h for Needle class definition
|
|
static Needle s_dirtyWords [] = {
|
|
{"upskirt" ,0,2,0,0,NULL,0,NULL},
|
|
{"downblouse" ,0,2,0,0,NULL,0,NULL},
|
|
{"shemale" ,0,1,0,0,NULL,0,NULL},
|
|
{"spank" ,0,1,0,0,NULL,0,NULL},
|
|
{"dildo" ,0,2,0,0,NULL,0,NULL},
|
|
{"bdsm" ,0,2,0,0,NULL,0,NULL},
|
|
{"voyeur" ,0,2,0,0,NULL,0,NULL},
|
|
{"fisting" ,0,2,0,0,NULL,0,NULL},
|
|
{"vibrator" ,0,2,0,0,NULL,0,NULL},
|
|
{"ejaculat" ,0,2,0,0,NULL,0,NULL},
|
|
{"rgasm" ,0,2,0,0,NULL,0,NULL},
|
|
{"orgy" ,0,2,0,0,NULL,0,NULL},
|
|
{"orgies" ,0,2,0,0,NULL,0,NULL},
|
|
{"stripper" ,0,1,0,0,NULL,0,NULL},
|
|
{"softcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"whore" ,0,2,0,0,NULL,0,NULL},
|
|
// gary slutkin on ted.com. make this just 1 point.
|
|
{"slut" ,0,1,0,0,NULL,0,NULL},
|
|
{"smut" ,0,2,0,0,NULL,0,NULL},
|
|
{"tits" ,0,2,0,0,NULL,0,NULL},
|
|
{"lesbian" ,0,2,0,0,NULL,0,NULL},
|
|
{"swinger" ,0,2,0,0,NULL,0,NULL},
|
|
{"fetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"nude" ,0,1,0,0,NULL,0,NULL},
|
|
{"centerfold" ,0,2,0,0,NULL,0,NULL},
|
|
{"incest" ,0,2,0,0,NULL,0,NULL},
|
|
{"pedophil" ,0,2,0,0,NULL,0,NULL},
|
|
{"pedofil" ,0,2,0,0,NULL,0,NULL},
|
|
{"horny" ,0,2,0,0,NULL,0,NULL}, // horny toad
|
|
{"pussy" ,0,2,0,0,NULL,0,NULL}, // pussy willow pussy cat
|
|
{"pussies" ,0,2,0,0,NULL,0,NULL},
|
|
{"penis" ,0,2,0,0,NULL,0,NULL},
|
|
{"vagina" ,0,2,0,0,NULL,0,NULL},
|
|
{"phuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"blowjob" ,0,2,0,0,NULL,0,NULL},
|
|
{"blow job" ,0,2,0,0,NULL,0,NULL},
|
|
{"gangbang" ,0,2,0,0,NULL,0,NULL},
|
|
{"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
|
|
{"porn" ,0,2,0,0,NULL,0,NULL},
|
|
{"felch" ,0,2,0,0,NULL,0,NULL},
|
|
{"cunt" ,0,2,0,0,NULL,0,NULL},
|
|
{"bestial" ,0,2,0,0,NULL,0,NULL},
|
|
{"beastial" ,0,2,0,0,NULL,0,NULL},
|
|
{"kink" ,0,2,0,0,NULL,0,NULL},
|
|
// . "sex" is often substring in tagids.
|
|
// . too many false positives, make "1" not "2"
|
|
{"sex" ,0,1,0,0,NULL,0,NULL},
|
|
{"anal" ,0,2,0,0,NULL,0,NULL},
|
|
{"cum" ,0,2,0,0,NULL,0,NULL}, // often used for cumulative
|
|
{"clit" ,0,2,0,0,NULL,0,NULL},
|
|
{"fuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"boob" ,0,1,0,0,NULL,0,NULL},
|
|
{"wank" ,0,2,0,0,NULL,0,NULL},
|
|
{"fick" ,0,2,0,0,NULL,0,NULL},
|
|
{"eroti" ,0,2,0,0,NULL,0,NULL},
|
|
{"gay" ,0,1,0,0,NULL,0,NULL}, // make 1 pt. 'marvin gay'
|
|
// new stuff not in Url.cpp
|
|
{"thong" ,0,1,0,0,NULL,0,NULL},
|
|
{"masturbat" ,0,2,0,0,NULL,0,NULL},
|
|
{"bitch" ,0,1,0,0,NULL,0,NULL},
|
|
{"hell" ,0,1,0,0,NULL,0,NULL},
|
|
{"damn" ,0,1,0,0,NULL,0,NULL},
|
|
{"rimjob" ,0,2,0,0,NULL,0,NULL},
|
|
{"cunnilingu" ,0,2,0,0,NULL,0,NULL},
|
|
{"felatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"fellatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"dick" ,0,1,0,0,NULL,0,NULL},
|
|
{"cock" ,0,1,0,0,NULL,0,NULL},
|
|
{"rape" ,0,2,0,0,NULL,0,NULL},
|
|
{"raping" ,0,2,0,0,NULL,0,NULL},
|
|
{"bukake" ,0,2,0,0,NULL,0,NULL},
|
|
{"shit" ,0,2,0,0,NULL,0,NULL},
|
|
{"naked" ,0,1,0,0,NULL,0,NULL},
|
|
{"nympho" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcore" ,0,1,0,0,NULL,0,NULL}, // hardcore gamer, count as 1
|
|
{"sodom" ,0,2,0,0,NULL,0,NULL},
|
|
{"titties" ,0,2,0,0,NULL,0,NULL}, // re-do
|
|
{"twat" ,0,2,0,0,NULL,0,NULL},
|
|
{"bastard" ,0,1,0,0,NULL,0,NULL},
|
|
{"erotik" ,0,2,0,0,NULL,0,NULL},
|
|
|
|
// EXCEPTIONS
|
|
|
|
// smut
|
|
{"transmut" ,0,-2,0,0,NULL,0,NULL},
|
|
{"bismuth" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
// sex
|
|
{"middlesex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sussex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"essex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"deusex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexchange" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpress" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpert" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexcel" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexist" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexile" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexperi" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexual" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpose" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexclu" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexo" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexism" ,0,-1,0,0,NULL,0,NULL},
|
|
{"sexpan" ,0,-1,0,0,NULL,0,NULL}, // buttonsexpanion
|
|
{"same-sex" ,0,-1,0,0,NULL,0,NULL},
|
|
{"opposite sex",0,-1,0,0,NULL,0,NULL},
|
|
|
|
// anal
|
|
{"analog" ,0,-2,0,0,NULL,0,NULL},
|
|
{"analy" ,0,-2,0,0,NULL,0,NULL},
|
|
{"canal" ,0,-2,0,0,NULL,0,NULL},
|
|
{"kanal" ,0,-2,0,0,NULL,0,NULL},
|
|
{"banal" ,0,-2,0,0,NULL,0,NULL},
|
|
{"ianalbert" ,0,-2,0,0,NULL,0,NULL}, // ian albert
|
|
|
|
// cum
|
|
{"circum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cum laude" ,0,-2,0,0,NULL,0,NULL},
|
|
{"succum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cumber" ,0,-2,0,0,NULL,0,NULL},
|
|
{"docum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cumul" ,0,-2,0,0,NULL,0,NULL},
|
|
{"acumen" ,0,-2,0,0,NULL,0,NULL},
|
|
{"incum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"capsicum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"modicum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"locum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"accum" ,0,-2,0,0,NULL,0,NULL},
|
|
{"cumbre" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
{"swank" ,0,-2,0,0,NULL,0,NULL},
|
|
{"fickle" ,0,-2,0,0,NULL,0,NULL},
|
|
{"traffick" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scleroti" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gaylor" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gaynor" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gayner" ,0,-2,0,0,NULL,0,NULL},
|
|
{"gayton" ,0,-2,0,0,NULL,0,NULL},
|
|
{"dipthong" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// hell
|
|
{"hellen" ,0,-1,0,0,NULL,0,NULL},
|
|
{"hellman" ,0,-1,0,0,NULL,0,NULL},
|
|
{"shell" ,0,-1,0,0,NULL,0,NULL},
|
|
{"mitchell" ,0,-1,0,0,NULL,0,NULL},
|
|
{"chelle" ,0,-1,0,0,NULL,0,NULL}, // me/michelle
|
|
{"hello" ,0,-1,0,0,NULL,0,NULL},
|
|
{"moschella" ,0,-1,0,0,NULL,0,NULL},
|
|
{"othello" ,0,-1,0,0,NULL,0,NULL},
|
|
{"schelling" ,0,-1,0,0,NULL,0,NULL},
|
|
{"seychelles" ,0,-1,0,0,NULL,0,NULL},
|
|
{"wheller" ,0,-1,0,0,NULL,0,NULL},
|
|
{"winchell" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// dick
|
|
{"dicker" ,0,-1,0,0,NULL,0,NULL},
|
|
{"dickins" ,0,-1,0,0,NULL,0,NULL},
|
|
{"dickies" ,0,-1,0,0,NULL,0,NULL},
|
|
{"dickran" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// cock
|
|
{"babcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocked" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocking" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cockpit" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cockroach" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocktail" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cocky" ,0,-1,0,0,NULL,0,NULL},
|
|
{"hancock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"hitchcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"peacock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"shuttlecock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"stopcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"weathercock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"woodcock" ,0,-1,0,0,NULL,0,NULL},
|
|
{"cockburn" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// kink
|
|
{"kinko" ,0,-2,0,0,NULL,0,NULL},
|
|
{"ukink" ,0,-2,0,0,NULL,0,NULL}, // ink shop in uk
|
|
|
|
// naked
|
|
{"snaked" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// rape
|
|
{"drape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"grape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scrape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"therape" ,0,-2,0,0,NULL,0,NULL},
|
|
{"trapez" ,0,-2,0,0,NULL,0,NULL},
|
|
{"parapet" ,0,-2,0,0,NULL,0,NULL},
|
|
{"scraping" ,0,-2,0,0,NULL,0,NULL},
|
|
{"draping" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
// twat
|
|
{"twatch" ,0,-2,0,0,NULL,0,NULL}, // courtwatch -- cspan.org
|
|
|
|
// clit
|
|
{"heraclitus" ,0,-2,0,0,NULL,0,NULL},
|
|
|
|
// boob
|
|
{"booboo" ,0,-1,0,0,NULL,0,NULL},
|
|
|
|
// shit
|
|
{"shitak" ,0,-2,0,0,NULL,0,NULL}
|
|
};
|
|
|
|
////
|
|
//// New stuff from sex.com adult word list
|
|
////
|
|
////
|
|
//// make it a 2nd part because of performance limits on matches2.cpp algo
|
|
////
|
|
static Needle s_dirtyWordsPart2 [] = {
|
|
{"amateurfoto" ,0,2,0,0,NULL,0,NULL},
|
|
{"amateurhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"amateurindex" ,0,2,0,0,NULL,0,NULL},
|
|
{"amateurnaked" ,0,2,0,0,NULL,0,NULL},
|
|
{"amatuerhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"ampland" ,0,2,0,0,NULL,0,NULL},
|
|
//{"animehentai" ,0,2,0,0,NULL,0,NULL}, dup
|
|
{"anitablonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"asiacarrera" ,0,2,0,0,NULL,0,NULL},
|
|
{"asshole" ,0,2,0,0,NULL,0,NULL},
|
|
{"asslick" ,0,2,0,0,NULL,0,NULL},
|
|
{"asspic" ,0,2,0,0,NULL,0,NULL},
|
|
{"assworship" ,0,2,0,0,NULL,0,NULL},
|
|
//{"badgirl" ,0,2,0,0,NULL,0,NULL}, not necessarily bad
|
|
{"bareceleb" ,0,2,0,0,NULL,0,NULL},
|
|
{"barenaked" ,0,2,0,0,NULL,0,NULL},
|
|
{"beaverboy" ,0,2,0,0,NULL,0,NULL},
|
|
{"beavershot" ,0,2,0,0,NULL,0,NULL}, // was beavershots
|
|
//{"bigball" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
|
|
{"bigbreast" ,0,2,0,0,NULL,0,NULL},
|
|
//{"bigbutt" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
|
|
{"bigcock" ,0,2,0,0,NULL,0,NULL},
|
|
{"bigdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"biggestdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"biggesttit" ,0,2,0,0,NULL,0,NULL},
|
|
{"bighairyball" ,0,2,0,0,NULL,0,NULL},
|
|
{"bighooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"bignipple" ,0,2,0,0,NULL,0,NULL},
|
|
{"bigtit" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackbooty" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackbutt" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackcock" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"blackonblonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"blacksonblonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"blacktit" ,0,2,0,0,NULL,0,NULL},
|
|
{"blacktwat" ,0,2,0,0,NULL,0,NULL},
|
|
{"boner" ,0,1,0,0,NULL,0,NULL}, // softcore, someone's lastname?
|
|
{"bordello" ,0,2,0,0,NULL,0,NULL},
|
|
{"braless" ,0,2,0,0,NULL,0,NULL},
|
|
{"brothel" ,0,2,0,0,NULL,0,NULL},
|
|
{"bukake" ,0,2,0,0,NULL,0,NULL},
|
|
{"bukkake" ,0,2,0,0,NULL,0,NULL},
|
|
{"bustyblonde" ,0,2,0,0,NULL,0,NULL},
|
|
{"bustyceleb" ,0,2,0,0,NULL,0,NULL},
|
|
{"butthole" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttman" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttplug" ,0,2,0,0,NULL,0,NULL},
|
|
{"buttthumbnails" ,0,2,0,0,NULL,0,NULL},
|
|
{"callgirl" ,0,2,0,0,NULL,0,NULL},
|
|
{"celebritiesnaked" ,0,2,0,0,NULL,0,NULL},
|
|
{"celebritybush" ,0,2,0,0,NULL,0,NULL},
|
|
{"celebritybutt" ,0,2,0,0,NULL,0,NULL},
|
|
{"chaseylain" ,0,2,0,0,NULL,0,NULL},
|
|
{"chickswithdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"christycanyon" ,0,2,0,0,NULL,0,NULL},
|
|
{"cicciolina" ,0,2,0,0,NULL,0,NULL},
|
|
//{"cunilingus" ,0,2,0,0,NULL,0,NULL},
|
|
{"cunniling" ,0,2,0,0,NULL,0,NULL}, // abbreviate
|
|
{"cyberlust" ,0,2,0,0,NULL,0,NULL},
|
|
{"danniashe" ,0,2,0,0,NULL,0,NULL},
|
|
{"dicksuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"dirtymind" ,0,2,0,0,NULL,0,NULL},
|
|
{"dirtypicture" ,0,2,0,0,NULL,0,NULL},
|
|
{"doggiestyle" ,0,2,0,0,NULL,0,NULL},
|
|
{"doggystyle" ,0,2,0,0,NULL,0,NULL},
|
|
{"domatrix" ,0,2,0,0,NULL,0,NULL},
|
|
{"dominatrix" ,0,2,0,0,NULL,0,NULL},
|
|
//{"dyke" ,0,2,0,0,NULL,0,NULL}, // dick van dyke!
|
|
{"ejaculation" ,0,2,0,0,NULL,0,NULL},
|
|
{"erosvillage" ,0,2,0,0,NULL,0,NULL},
|
|
{"facesit" ,0,2,0,0,NULL,0,NULL},
|
|
{"fatass" ,0,2,0,0,NULL,0,NULL},
|
|
{"feetfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"felatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"fellatio" ,0,2,0,0,NULL,0,NULL},
|
|
{"femdom" ,0,2,0,0,NULL,0,NULL},
|
|
{"fetishwear" ,0,2,0,0,NULL,0,NULL},
|
|
{"fettegirl" ,0,2,0,0,NULL,0,NULL},
|
|
{"fingerbang" ,0,2,0,0,NULL,0,NULL},
|
|
{"fingering" ,0,1,0,0,NULL,0,NULL}, // fingering the keyboard? use 1
|
|
{"flesh4free" ,0,2,0,0,NULL,0,NULL},
|
|
{"footfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"footjob" ,0,2,0,0,NULL,0,NULL},
|
|
{"footlicking" ,0,2,0,0,NULL,0,NULL},
|
|
{"footworship" ,0,2,0,0,NULL,0,NULL},
|
|
{"fornication" ,0,2,0,0,NULL,0,NULL},
|
|
{"freeass" ,0,2,0,0,NULL,0,NULL},
|
|
{"freebigtit" ,0,2,0,0,NULL,0,NULL},
|
|
{"freedick" ,0,2,0,0,NULL,0,NULL},
|
|
{"freehardcore" ,0,2,0,0,NULL,0,NULL},
|
|
//{"freehentai" ,0,2,0,0,NULL,0,NULL}, dup
|
|
{"freehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"freelargehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"freenakedpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"freenakedwomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"freetit" ,0,2,0,0,NULL,0,NULL},
|
|
{"freevoyeur" ,0,2,0,0,NULL,0,NULL},
|
|
{"gratishardcoregalerie" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorecelebs" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorefree" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorejunkie" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorejunky" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcoremovie" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorepic" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorepix" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcoresample" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorestories" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorethumb" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardcorevideo" ,0,2,0,0,NULL,0,NULL},
|
|
{"harddick" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardnipple" ,0,2,0,0,NULL,0,NULL},
|
|
{"hardon" ,0,2,0,0,NULL,0,NULL},
|
|
{"hentai" ,0,2,0,0,NULL,0,NULL},
|
|
{"interacialhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"intercourseposition" ,0,2,0,0,NULL,0,NULL},
|
|
{"interracialhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"ittybittytitty" ,0,2,0,0,NULL,0,NULL},
|
|
{"jackoff" ,0,2,0,0,NULL,0,NULL},
|
|
{"jennajameson" ,0,2,0,0,NULL,0,NULL},
|
|
{"jennicam" ,0,2,0,0,NULL,0,NULL},
|
|
{"jerkoff" ,0,2,0,0,NULL,0,NULL},
|
|
{"jism" ,0,2,0,0,NULL,0,NULL},
|
|
{"jiz" ,0,2,0,0,NULL,0,NULL},
|
|
{"justhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"karasamateurs" ,0,2,0,0,NULL,0,NULL},
|
|
{"kascha" ,0,2,0,0,NULL,0,NULL},
|
|
{"kaylakleevage" ,0,2,0,0,NULL,0,NULL},
|
|
{"kobetai" ,0,2,0,0,NULL,0,NULL},
|
|
{"lapdance" ,0,2,0,0,NULL,0,NULL},
|
|
{"largedick" ,0,2,0,0,NULL,0,NULL},
|
|
{"largehooter" ,0,2,0,0,NULL,0,NULL},
|
|
{"largestbreast" ,0,2,0,0,NULL,0,NULL},
|
|
{"largetit" ,0,2,0,0,NULL,0,NULL},
|
|
{"lesben" ,0,2,0,0,NULL,0,NULL},
|
|
{"lesbo" ,0,2,0,0,NULL,0,NULL},
|
|
{"lickadick" ,0,2,0,0,NULL,0,NULL},
|
|
{"lindalovelace" ,0,2,0,0,NULL,0,NULL},
|
|
{"longdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"lovedoll" ,0,2,0,0,NULL,0,NULL},
|
|
{"makinglove" ,0,2,0,0,NULL,0,NULL},
|
|
{"mangax" ,0,2,0,0,NULL,0,NULL},
|
|
{"manpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"marilynchambers" ,0,2,0,0,NULL,0,NULL},
|
|
{"massivecock" ,0,2,0,0,NULL,0,NULL},
|
|
{"masterbating" ,0,2,0,0,NULL,0,NULL},
|
|
{"mensdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"milf" ,0,2,0,0,NULL,0,NULL},
|
|
{"minka" ,0,2,0,0,NULL,0,NULL},
|
|
{"monstercock" ,0,2,0,0,NULL,0,NULL},
|
|
{"monsterdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"muffdiving" ,0,2,0,0,NULL,0,NULL},
|
|
{"nacktfoto" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedblackwomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedceleb" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedcelebrity" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedcheerleader" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedchick" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedgirl" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedguy" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedladies" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedlady" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedman" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedmen" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedness" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedphoto" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedstar" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedwife" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedwoman" ,0,2,0,0,NULL,0,NULL},
|
|
{"nakedwomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"nastychat" ,0,2,0,0,NULL,0,NULL},
|
|
{"nastythumb" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtylink" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtylinx" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtylynx" ,0,2,0,0,NULL,0,NULL},
|
|
{"naughtynurse" ,0,2,0,0,NULL,0,NULL},
|
|
{"niceass" ,0,2,0,0,NULL,0,NULL},
|
|
{"nikkinova" ,0,2,0,0,NULL,0,NULL},
|
|
{"nikkityler" ,0,2,0,0,NULL,0,NULL},
|
|
{"nylonfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"nympho" ,0,2,0,0,NULL,0,NULL},
|
|
{"openleg" ,0,2,0,0,NULL,0,NULL},
|
|
{"oral4free" ,0,2,0,0,NULL,0,NULL},
|
|
{"pantyhosefetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"peepcam" ,0,2,0,0,NULL,0,NULL},
|
|
{"persiankitty" ,0,2,0,0,NULL,0,NULL},
|
|
{"perverted" ,0,2,0,0,NULL,0,NULL},
|
|
{"pimpserver" ,0,2,0,0,NULL,0,NULL},
|
|
{"pissing" ,0,2,0,0,NULL,0,NULL},
|
|
{"poontang" ,0,2,0,0,NULL,0,NULL},
|
|
{"privatex" ,0,2,0,0,NULL,0,NULL},
|
|
{"prono" ,0,2,0,0,NULL,0,NULL},
|
|
{"publicnudity" ,0,2,0,0,NULL,0,NULL},
|
|
{"puffynipple" ,0,2,0,0,NULL,0,NULL},
|
|
{"racqueldarrian" ,0,2,0,0,NULL,0,NULL},
|
|
//{"rape" ,0,2,0,0,NULL,0,NULL}, // dup!
|
|
{"rawlink" ,0,2,0,0,NULL,0,NULL},
|
|
{"realhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"rubberfetish" ,0,2,0,0,NULL,0,NULL},
|
|
{"seka" ,0,2,0,0,NULL,0,NULL},
|
|
{"sheboy" ,0,2,0,0,NULL,0,NULL},
|
|
{"showcam" ,0,2,0,0,NULL,0,NULL},
|
|
{"showercam" ,0,2,0,0,NULL,0,NULL},
|
|
{"smallbreast" ,0,2,0,0,NULL,0,NULL},
|
|
{"smalldick" ,0,2,0,0,NULL,0,NULL},
|
|
{"spycamadult" ,0,2,0,0,NULL,0,NULL},
|
|
{"strapon" ,0,2,0,0,NULL,0,NULL},
|
|
{"stripclub" ,0,2,0,0,NULL,0,NULL},
|
|
{"stripshow" ,0,2,0,0,NULL,0,NULL},
|
|
{"striptease" ,0,2,0,0,NULL,0,NULL},
|
|
{"strokeit" ,0,2,0,0,NULL,0,NULL},
|
|
{"strokeme" ,0,2,0,0,NULL,0,NULL},
|
|
{"suckdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"sylviasaint" ,0,2,0,0,NULL,0,NULL},
|
|
{"teenhardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"teenie" ,0,2,0,0,NULL,0,NULL},
|
|
{"teenpic" ,0,2,0,0,NULL,0,NULL},
|
|
{"teensuck" ,0,2,0,0,NULL,0,NULL},
|
|
{"tgp" ,0,2,0,0,NULL,0,NULL},
|
|
{"threesome" ,0,2,0,0,NULL,0,NULL},
|
|
{"thumblord" ,0,2,0,0,NULL,0,NULL},
|
|
{"thumbzilla" ,0,2,0,0,NULL,0,NULL},
|
|
{"tiffanytowers" ,0,2,0,0,NULL,0,NULL},
|
|
{"tinytitties" ,0,2,0,0,NULL,0,NULL},
|
|
//{"tities" ,0,2,0,0,NULL,0,NULL}, // entities
|
|
{"titman" ,0,2,0,0,NULL,0,NULL},
|
|
{"titsandass" ,0,2,0,0,NULL,0,NULL},
|
|
{"titties" ,0,2,0,0,NULL,0,NULL},
|
|
{"titts" ,0,2,0,0,NULL,0,NULL},
|
|
{"titty" ,0,2,0,0,NULL,0,NULL},
|
|
{"tokyotopless" ,0,2,0,0,NULL,0,NULL},
|
|
{"tommysbookmark" ,0,2,0,0,NULL,0,NULL},
|
|
{"toplesswomen" ,0,2,0,0,NULL,0,NULL},
|
|
{"trannies" ,0,2,0,0,NULL,0,NULL},
|
|
{"twinks" ,0,2,0,0,NULL,0,NULL},
|
|
{"ultradonkey" ,0,2,0,0,NULL,0,NULL},
|
|
{"ultrahardcore" ,0,2,0,0,NULL,0,NULL},
|
|
{"uncutcock" ,0,2,0,0,NULL,0,NULL},
|
|
{"vividtv" ,0,2,0,0,NULL,0,NULL},
|
|
{"wendywhoppers" ,0,2,0,0,NULL,0,NULL},
|
|
{"wetdick" ,0,2,0,0,NULL,0,NULL},
|
|
{"wetpanties" ,0,2,0,0,NULL,0,NULL},
|
|
{"wifesharing" ,0,2,0,0,NULL,0,NULL},
|
|
{"wifeswapping" ,0,2,0,0,NULL,0,NULL},
|
|
{"xrated" ,0,2,0,0,NULL,0,NULL}
|
|
};
|
|
|
|
|
|
// . store this in clusterdb rec so family filter works!
|
|
// . check content for adult words
|
|
char *XmlDoc::getIsAdult ( ) {
|
|
|
|
if ( m_isAdultValid ) return &m_isAdult2;
|
|
|
|
// call that
|
|
setStatus ("getting is adult bit");
|
|
|
|
int32_t **pici = getIndCatIds();
|
|
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
|
|
|
|
// check categories
|
|
for ( int32_t i = 0 ; i < size_indCatIds / 4 ; i++ ) {
|
|
int32_t ic = ptr_indCatIds[i];
|
|
// skip if not an adult category
|
|
if ( ! g_categories->isIdAdult ( ic ) ) continue;
|
|
// got it
|
|
m_isAdult = true;
|
|
m_isAdult2 = true;
|
|
m_isAdultValid = true;
|
|
return &m_isAdult2;
|
|
}
|
|
|
|
// . if any of the wiki docids we are in are adult.... then we are
|
|
// . we set the top bit of wiki docids to indicate if adult
|
|
//for ( int32_t i = 0 ; i < size_wikiDocIds / 8 ; i++ ) {
|
|
// int64_t d = ptr_wikiDocIds[i];
|
|
// if ( ! ( d & 0x8000000000000000 ) ) continue;
|
|
// // got it
|
|
// m_isAdult = true;
|
|
// m_isAdultValid = true;
|
|
// return &m_isAdult;
|
|
//}
|
|
|
|
// need the content
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1) return (char *)u8;
|
|
|
|
// time it
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
|
|
// score that up
|
|
int32_t total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
|
|
m_niceness , m_firstUrl.m_url );
|
|
|
|
// then the url
|
|
//char *u = getFirstUrl()->getUrl();
|
|
//total += getDirtyPoints ( u , gbstrlen(u) );
|
|
|
|
// and redir url
|
|
//char *r = getRedirUrl()->getUrl();
|
|
//total += getDirtyPoints ( r , gbstrlen(r) );
|
|
|
|
// debug msg
|
|
int64_t took = gettimeofdayInMilliseconds() - start;
|
|
if ( took > 10 )
|
|
logf(LOG_DEBUG,
|
|
"build: Took %"INT64" ms to check doc of %"INT32" bytes for "
|
|
"dirty words.",took,size_utf8Content-1);
|
|
|
|
m_isAdult = false;
|
|
// adult?
|
|
if ( total >= 2 ) m_isAdult = true;
|
|
// set shadow member
|
|
m_isAdult2 = (bool)m_isAdult;
|
|
// validate
|
|
m_isAdultValid = true;
|
|
|
|
// note it
|
|
if ( m_isAdult2 && g_conf.m_logDebugDirty )
|
|
log("dirty: %s points = %"INT32"",m_firstUrl.m_url,total);
|
|
|
|
// no dirty words found
|
|
return &m_isAdult2;
|
|
}
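// A small worked example of the scoring above: needle ids are summed, so a
// single "+2" word, or two "+1" words, is enough to cross the total >= 2
// threshold, while a negative exception needle (e.g. "analog" at -2) cancels
// the +2 that "anal" would otherwise contribute inside the word "analog".
// The input strings below are hypothetical, for illustration only.
#if 0
#include <cassert>

static void scoreExample ( ) {
	// "sussex beaches": "sex" (+1) and the exception "sussex" (-1)
	// both match, so the total is 0 and the doc is not flagged
	int coastal = +1 + -1;
	// "xxx porn": "xxx" (+1) and "porn" (+2) match, total is 3
	int dirty   = +1 + +2;
	assert ( coastal <  2 );   // not adult
	assert ( dirty   >= 2 );   // adult
}
#endif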
|
|
|
|
|
|
|
|
int32_t getDirtyPoints ( char *s , int32_t slen , int32_t niceness , char *url ) {
|
|
// . use the matches function to get all the matches
|
|
// . then check each match to see if it is actually a legit word
|
|
// . actually match the dirty words, then match the clean words
|
|
// then we can subtract counts.
|
|
int32_t numDirty = sizeof(s_dirtyWords) / sizeof(Needle);
|
|
getMatches2 ( s_dirtyWords ,
|
|
numDirty ,
|
|
s ,
|
|
slen ,
|
|
NULL , // linkPos
|
|
NULL , // needleNum
|
|
false , // stopAtFirstMatch?
|
|
NULL , // hadPreMatch ptr
|
|
true , // saveQuickTables?
|
|
niceness );
|
|
|
|
int32_t points = 0;
|
|
// each needle has an associated score
|
|
for ( int32_t i = 0 ; i < numDirty ; i++ ) {
|
|
// skip if no match
|
|
if ( s_dirtyWords[i].m_count <= 0 ) continue;
|
|
// . the "id", is positive for dirty words, - for clean
|
|
// . uses +2/-2 for really dirty words
|
|
// . uses +1/-1 for borderline dirty words
|
|
points += s_dirtyWords[i].m_id;
|
|
// log debug
|
|
if ( ! g_conf.m_logDebugDirty ) continue;
|
|
// show it in the log
|
|
log("dirty: %s %"INT32" %s"
|
|
,s_dirtyWords[i].m_string
|
|
,(int32_t)s_dirtyWords[i].m_id
|
|
,url
|
|
);
|
|
}
|
|
|
|
////
|
|
//
|
|
// repeat for part2
|
|
//
|
|
// we have to do two separate parts otherwise the algo in
|
|
// matches2.cpp gets really slow. it was not meant to match
|
|
// so many needles in one haystack.
|
|
//
|
|
///
|
|
int32_t numDirty2 = sizeof(s_dirtyWordsPart2) / sizeof(Needle);
|
|
|
|
// . disable this for now. most of these are phrases and they
|
|
// will not be detected.
|
|
// . TODO: hash the dirty words and phrases and just look up
// words in that table like we do for isStopWord(), but use
// isDirtyWord(). Then replace the code in Speller.cpp
// with isDirtyUrl() which will split the string into words
// and call isDirtyWord() on each one. also use bi and tri grams
// in the hash table. (see the sketch after this function)
|
|
numDirty2 = 0;
|
|
|
|
getMatches2 ( s_dirtyWordsPart2 ,
|
|
numDirty2 ,
|
|
s ,
|
|
slen ,
|
|
NULL , // linkPos
|
|
NULL , // needleNum
|
|
false , // stopAtFirstMatch?
|
|
NULL , // hadPreMatch ptr
|
|
true , // saveQuickTables?
|
|
niceness );
|
|
|
|
|
|
// each needle has an associated score
|
|
for ( int32_t i = 0 ; i < numDirty2 ; i++ ) {
|
|
// skip if no match
|
|
if ( s_dirtyWordsPart2[i].m_count <= 0 ) continue;
|
|
// . the "id", is positive for dirty words, - for clean
|
|
// . uses +2/-2 for really dirty words
|
|
// . uses +1/-1 for borderline dirty words
|
|
points += s_dirtyWordsPart2[i].m_id;
|
|
// log debug
|
|
if ( ! g_conf.m_logDebugDirty ) continue;
|
|
// show it in the log
|
|
log("dirty: %s %"INT32" %s"
|
|
,s_dirtyWordsPart2[i].m_string
|
|
,(int32_t)s_dirtyWordsPart2[i].m_id
|
|
,url
|
|
);
|
|
}
|
|
|
|
|
|
return points;
|
|
}
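// A rough sketch of the hash-table approach described in the TODO above:
// store each dirty word with its weight in a table, split the text into
// lowercase alnum words, look each one up and sum the weights. Note this
// does whole-word lookups rather than the substring scan actually used
// above, which is why the TODO also mentions adding bi- and tri-grams.
// This is an illustration of the proposed direction, not the current code.
#if 0
#include <cctype>
#include <string>
#include <unordered_map>

static int32_t dirtyPointsSketch ( const char *s , int32_t slen ) {
	static const std::unordered_map<std::string,int32_t> table = {
		{ "porn", 2 }, { "xxx", 1 }, { "sex", 1 } // etc...
	};
	int32_t points = 0;
	std::string w;
	for ( int32_t i = 0 ; i <= slen ; i++ ) {
		if ( i < slen && isalnum((unsigned char)s[i]) ) {
			w += (char)tolower((unsigned char)s[i]);
			continue;
		}
		auto it = table.find ( w );
		if ( it != table.end() ) points += it->second;
		w.clear();
	}
	return points;
}
#endif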
|
|
|
|
|
|
int32_t **XmlDoc::getIndCatIds ( ) {
|
|
// if XmlDoc was set from a titleRec it should validate this
|
|
if ( m_indCatIdsValid ) return &ptr_indCatIds;
|
|
// otherwise, we must compute them!
|
|
CatRec *cat = getCatRec ();
|
|
// blocked or error?
|
|
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
|
|
// set this
|
|
ptr_indCatIds = cat->m_indCatids;
|
|
size_indCatIds = cat->m_numIndCatids * 4;
|
|
m_indCatIdsValid = true;
|
|
// parse that up
|
|
return &ptr_indCatIds;
|
|
}
|
|
|
|
int32_t **XmlDoc::getCatIds ( ) {
|
|
// if XmlDoc was set from a titleRec it should validate this
|
|
if ( m_catIdsValid ) return &ptr_catIds;
|
|
// otherwise, we must compute them!
|
|
CatRec *cat = getCatRec ();
|
|
// blocked or error?
|
|
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
|
|
// set this
|
|
ptr_catIds = cat->m_catids;
|
|
size_catIds = cat->m_numCatids * 4;
|
|
m_catIdsValid = true;
|
|
// parse that up
|
|
return &ptr_catIds;
|
|
}
|
|
|
|
CatRec *XmlDoc::getCatRec ( ) {
|
|
// return what we got
|
|
if ( m_catRecValid ) return &m_catRec;
|
|
// call that
|
|
setStatus ("getting dmoz cat rec");
|
|
// callback?
|
|
if ( m_calledMsg8b ) {
|
|
// return NULL on error
|
|
if ( g_errno ) return NULL;
|
|
// otherwise, success
|
|
m_catRecValid = true;
|
|
return &m_catRec;
|
|
}
|
|
// consider it called
|
|
m_calledMsg8b = true;
|
|
// assume empty and skip the call for now
|
|
m_catRec.reset();
|
|
m_catRecValid = true;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// let's bring dmoz back
|
|
//return &m_catRec;
|
|
// compute it otherwise
|
|
if ( ! m_msg8b.getCatRec ( &m_firstUrl ,
|
|
cr->m_coll ,
|
|
gbstrlen(cr->m_coll) ,
|
|
true , // use canonical name?
|
|
m_niceness ,
|
|
&m_catRec , // store here
|
|
m_masterState , // state
|
|
m_masterLoop )) // callback
|
|
// return -1 if we blocked
|
|
return (CatRec *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// we got it somehow without blocking... local cached lookup?
|
|
m_catRecValid = true;
|
|
return &m_catRec;
|
|
}
|
|
|
|
void gotWikiResultsWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->gotWikiResults ( slot );
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . get the wiki pages that this page matches
|
|
// . use the docids of the wiki pages to represent them
|
|
// . use an independent 32-node cluster to index all of wikipedia so it is all
|
|
// in ram. do not need datedb, etc.
|
|
// . get the gigabits for this page, up to 50 of them, and use that as a rat=0
|
|
// query on the wiki cluster
|
|
// . score each wiki docid too, based on match
|
|
// . normalize scores so they range from 10% to 100%, based on # of gigabits
|
|
// that the wiki page matches
|
|
// . index these as gbwiki:<wikipagedocid> with the score given (8-bit) mapped
|
|
// to 32 bits using score8to32() so the score itself is preserved
|
|
// . WE CAN ALSO call this at QUERY TIME, using the actual query of the
|
|
// searcher instead of the string of gigabits
|
|
// . BUT i will probably just look at the wiki topics of the search results,
|
|
// that will be faster and maybe more accurate...
|
|
int64_t **XmlDoc::getWikiDocIds ( ) {
|
|
|
|
if ( m_wikiDocIdsValid ) return (int64_t **)&ptr_wikiDocIds;
|
|
|
|
setStatus ( "getting wiki docids" );
|
|
|
|
// . get our gigabit vector
|
|
// . consists of array of 32-bit hashes
|
|
// . followed by 1-1 array of 16-bit scores
|
|
// . TODO: restrict gigabits to capitalized words and phrases, and
|
|
// also to 2+ word wiki titles
|
|
char *gq = getGigabitQuery ( );
|
|
if ( ! gq || gq == (char *)-1 ) return (int64_t **)gq;
|
|
|
|
// empty? then no wiki match i guess
|
|
//logf(LOG_DEBUG,"FIX ME FIX ME - getWikiDocIds");
|
|
|
|
// MDW: for now bail here too!
|
|
if ( ! gq[0] || 1 == 1 ) {
|
|
ptr_wikiDocIds = m_wikiDocIds;
|
|
ptr_wikiScores = m_wikiScores;
|
|
size_wikiDocIds = 0;
|
|
size_wikiScores = 0;
|
|
m_wikiDocIdsValid = true;
|
|
return (int64_t **)&ptr_wikiDocIds;
|
|
}
|
|
|
|
// set our query to these gigabits
|
|
// re-enable this later
|
|
//if ( ! m_calledMsg40 ) m_wq.set ( gq );
|
|
|
|
int32_t need = 200 + gbstrlen(gq);
|
|
// make buf
|
|
m_wikiqbuf = (char *)mmalloc ( need , "wikiqbuf");
|
|
// error?
|
|
if ( ! m_wikiqbuf ) return NULL;
|
|
// save size
|
|
m_wikiqbufSize = need;
|
|
// use large single tier for speed
|
|
char *p = m_wikiqbuf;
|
|
p += sprintf ( p ,
|
|
"GET /search?raw=9&n=%"INT32"&sc=0&dr=0&"//dio=1&"
|
|
"t0=1000000&rat=0&"
|
|
"c=wiki&q=%s", (int32_t)MAX_WIKI_DOCIDS, gq );
|
|
// terminate it
|
|
*p++ = '\0';
|
|
// then put in the ip
|
|
*(int32_t *)p = g_hostdb.m_myHost->m_ip;
|
|
// skip over ip
|
|
p += 4;
|
|
// sanity check
|
|
if ( p - m_wikiqbuf > need ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t ip = g_conf.m_wikiProxyIp;
|
|
// if not given, make it gf1 for now
|
|
if ( ! ip ) ip = atoip ( "10.5.62.11" , 10 );
|
|
|
|
int32_t port = g_conf.m_wikiProxyPort;
|
|
// port default too to gf1
|
|
if ( ! port ) port = 9002;
|
|
|
|
// send it using msg 0xfd to the wiki cluster's proxy
|
|
if ( ! g_udpServer.sendRequest ( m_wikiqbuf ,
|
|
p - m_wikiqbuf ,
|
|
0xfd ,
|
|
ip ,
|
|
port ,
|
|
-1 , // hostId
|
|
NULL , // retSlot
|
|
this , // state
|
|
gotWikiResultsWrapper ,
|
|
1000 ) )
|
|
// we had an error, g_errno should be set
|
|
return NULL;
|
|
|
|
// got without blocking? no way!
|
|
return (int64_t **)-1;
|
|
}
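// A minimal sketch of the request buffer built above: the proxied query
// string, a NUL terminator, then the sender's raw 4-byte IP appended right
// after it, so the receiving proxy can find the IP just past the string.
// The function and parameter names here are illustrative only.
#if 0
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static std::vector<char> packWikiRequest ( const char *query ,
                                           int32_t maxDocIds ,
                                           uint32_t myIp ) {
	std::vector<char> buf ( 200 + strlen(query) );
	char *p = buf.data();
	p += sprintf ( p , "GET /search?raw=9&n=%d&c=wiki&q=%s" ,
	               (int)maxDocIds , query );
	*p++ = '\0';                        // terminate the string part
	memcpy ( p , &myIp , 4 );           // raw IP follows the NUL
	p += 4;
	buf.resize ( p - buf.data() );      // bytes actually used
	return buf;
}
#endif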
|
|
|
|
void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
|
|
|
|
setStatus ( "got wiki docids" );
|
|
|
|
// do not free our request in slot
|
|
slot->m_sendBufAlloc = NULL;
|
|
|
|
// free request buf
|
|
mfree ( m_wikiqbuf , m_wikiqbufSize , "wikiqbuf" );
|
|
|
|
// error getting the wiki results?
|
|
if ( g_errno ) return;
|
|
|
|
// TODO: normalize all scores with each other some how. i think
|
|
// they are fairly absolute, but not sure with a lot of rat=0 terms!
|
|
logf(LOG_DEBUG,"wiki: fix my scoring stuff. have a min score... "
|
|
" and somehow normalize scores to be in [0,1.0]");
|
|
|
|
// . force this reply to be NULL terminated
|
|
// . i can't fix in the code now because the reply is coming from
|
|
// a different cluster running an older version of gb
|
|
char *s = slot->m_readBuf;
|
|
char *end = s + slot->m_readBufSize - 1;
|
|
// overwrite the last '>', who cares!
|
|
*end = '\0';
|
|
// make our xml
|
|
Xml xml;
|
|
if ( ! xml.set ( s ,
|
|
end - s ,
|
|
false , // ownData?
|
|
0 ,
|
|
false ,
|
|
TITLEREC_CURRENT_VERSION ,
|
|
false , // setParents?
|
|
m_niceness ,
|
|
CT_HTML ))
|
|
// return if g_errno got set
|
|
return;
|
|
|
|
// grab docids
|
|
int32_t nd = 0;
|
|
int32_t nn = xml.getNumNodes();
|
|
XmlNode *nodes = xml.getNodes();
|
|
float score = 0.0;
|
|
int64_t docId = 0LL;
|
|
for ( int32_t i = 0 ; i + 1 < nn ; i++ ) {
|
|
if ( nodes[i].m_nodeId != 1 ) continue;
|
|
// tagname is <docid>?
|
|
if ( nodes[i].m_tagNameLen == 5 &&
|
|
nodes[i].m_tagName[0] == 'd' &&
|
|
! strncmp(nodes[i].m_tagName,"docId",5) )
|
|
// the value is in the text node after the <docId> tag
docId = atoll ( nodes[i+1].m_node );
|
|
// is <score>? (after docid tag)
|
|
if ( nodes[i].m_tagNameLen == 8 &&
|
|
nodes[i].m_tagName[0] == 'a' &&
|
|
! strncmp(nodes[i].m_tagName,"absScore",8) ) {
|
|
// likewise, the value is in the text node after the <absScore> tag
score = atof ( nodes[i+1].m_node );
|
|
// add it
|
|
m_wikiDocIds [ nd ] = docId;
|
|
m_wikiScores [ nd ] = score;
|
|
nd++;
|
|
// do not overflow
|
|
if ( nd >= MAX_WIKI_DOCIDS ) break;
|
|
}
|
|
}
|
|
// point to them
|
|
ptr_wikiDocIds = m_wikiDocIds;
|
|
ptr_wikiScores = m_wikiScores;
|
|
size_wikiDocIds = nd * 8;
|
|
size_wikiScores = nd * sizeof(rscore_t);
|
|
|
|
log ( LOG_DEBUG , "build: got %"INT32" wiki docids",nd);
|
|
|
|
m_wikiDocIdsValid = true;
|
|
}
|
|
|
|
int32_t *XmlDoc::getPubDate ( ) {
|
|
if ( m_pubDateValid ) return (int32_t *)&m_pubDate;
|
|
// get date parse
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
|
|
// got it
|
|
m_pubDateValid = true;
|
|
m_pubDate = dp->getPubDate();
|
|
// print it once for page parser. we now do this in XmlDoc::print()
|
|
//if ( m_pbuf ) m_dates.printPubDates ( m_pbuf );
|
|
// set m_ageInDays
|
|
if ( m_pubDate == (uint32_t)-1 ) return (int32_t *)&m_pubDate;
|
|
// for parsing date
|
|
//int32_t currentTime = getTimeGlobal();
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t spideredTime = getSpideredTime();
|
|
// get doc age
|
|
//float age = currentTime - m_pubDate;
|
|
float age = spideredTime - m_pubDate;
|
|
// convert to days (could be negative if in the future)
|
|
m_ageInDays = age / (3600*24.0);
|
|
// fix it if negative
|
|
if ( m_ageInDays < 0.0 ) m_ageInDays = 0.0;
|
|
return (int32_t *)&m_pubDate;
|
|
}
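// The age computation above, in isolation: seconds between the spider time
// and the extracted pub date, converted to (possibly fractional) days and
// clamped at zero when the pub date lies in the future.
#if 0
static float ageInDays ( int32_t spideredTime , int32_t pubDate ) {
	float age = (float)(spideredTime - pubDate) / (3600.0f * 24.0f);
	return ( age < 0.0f ) ? 0.0f : age;
}
#endif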
|
|
|
|
Dates *XmlDoc::getDates ( ) {
|
|
if ( m_datesValid ) return &m_dates;
|
|
// skip for now
|
|
m_datesValid = true;
|
|
return &m_dates;
|
|
|
|
// set status. we can time status changes with this routine!
|
|
setStatus ( "getting dates");
|
|
|
|
Dates *dd = getSimpleDates();
|
|
// bail on error
|
|
if ( ! dd ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return NULL;
|
|
}
|
|
|
|
// need addresses
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (void *)-1 ) return (Dates *)aa;
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (Dates *)isRoot;
|
|
|
|
// . get root doc, from titlerec is ok ( TODO: make sure from titlerec)
|
|
// . TODO: make sure to save in titledb too???
|
|
// . we need this now too
|
|
// . now set DF_IN_ROOTDOC on dates that were in the same section but
|
|
// in the root doc.
|
|
// . if we are not the root, we use the root title rec to see if
|
|
// the website repeats the store hours on every page. in that case
|
|
// . TODO: a special cache just for holding "svt" for root pages.
|
|
// should be highly efficient!!!
|
|
//XmlDoc *rd = NULL;
|
|
|
|
// setPart2() needs the implied sections set, so set them
|
|
Sections *sections = getSections();
|
|
if ( !sections ||sections==(Sections *)-1) return(Dates *)sections;
|
|
|
|
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
//if ( ! osvt || osvt == (void *)-1 ) return (Dates *)osvt;
|
|
|
|
// table should be empty if we are the root!
|
|
//HashTableX *rvt = getRootVotingTable();
|
|
//if ( ! rvt || rvt == (void *)-1 ) return (Dates *)rvt;
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS || isRSS == (void *)-1 ) return (Dates *)isRSS;
|
|
|
|
uint8_t *ctype = getContentType();
|
|
if ( ! ctype || ctype == (void *)-1 ) return (Dates *)ctype;
|
|
|
|
bool isXml = false;
|
|
if ( *isRSS ) isXml = true;
|
|
if ( *ctype == CT_XML ) isXml = true;
|
|
|
|
int32_t minPubDate = -1;
|
|
int32_t maxPubDate = -1;
|
|
// parentPrevSpiderTime is 0 if that was the first time that the
|
|
// parent was spidered, in which case isNewOutlink will always be set
|
|
// for every outlink it had!
|
|
if ( m_sreqValid &&
|
|
m_sreq.m_isNewOutlink &&
|
|
m_sreq.m_parentPrevSpiderTime ) {
|
|
// pub date is somewhere between these two times
|
|
minPubDate = m_sreq.m_parentPrevSpiderTime;
|
|
//maxPubDate = m_sreq.m_addedTime;
|
|
maxPubDate = m_sreq.m_discoveryTime;
|
|
}
|
|
|
|
// now set part2 , returns false and sets g_errno on error
|
|
if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
|
|
isXml , *isRoot )) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: dates2: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
|
|
// on all other errors, return NULL
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
// debug EBADENGINEER error
|
|
if ( g_errno ) { char *xx=NULL;*xx=0; }
|
|
|
|
// overflow? does not set g_errno. at least clear all so we do not
|
|
// get a messed up partial representation.
|
|
//if ( m_dates.m_overflowed ) {
|
|
// log("doc: date overflow for %s",m_firstUrl.m_url);
|
|
// m_dates.reset();
|
|
//}
|
|
|
|
// only call it once
|
|
m_datesValid = true;
|
|
// return it
|
|
return &m_dates;
|
|
}
|
|
|
|
Dates *XmlDoc::getSimpleDates ( ) {
|
|
|
|
if ( m_simpleDatesValid ) return &m_dates;
|
|
// note that
|
|
setStatus("get dates part 1");
|
|
// try the current url
|
|
Url *u = getCurrentUrl();
|
|
// and ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (Dates *)ip;
|
|
// the docid
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Dates *)d;
|
|
// the site hash
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Dates *)sh32;
|
|
// words
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (Dates *)words;
|
|
// we set the D_IS_IN_DATE flag for these bits
|
|
Bits *bits = getBits(); if ( ! bits ) return NULL;
|
|
// sections. is it ok that these do not include implied sections?
|
|
Sections *sections = getExplicitSections();
|
|
if (!sections||sections==(Sections *)-1) return (Dates *)sections;
|
|
// link info (this is what we had the problem with)
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Dates *)info1;
|
|
//int32_t *sv = getPageSampleVector();
|
|
//if ( ! sv || sv == (int32_t *)-1 ) return (Dates *)sv;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Dates *)xml;
|
|
// this must be valid, cuz Dates.cpp uses it!
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0;}
|
|
// . get the xml doc of the previously stored title rec
|
|
// . Dates will compare the two docs to check for clocks, etc.
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (Dates *)pod;
|
|
Url **redir = getRedirUrl();
|
|
if ( ! redir || redir == (Url **)-1 ) return (Dates *)redir;
|
|
//char *ru = NULL;
|
|
//if ( *redir ) ru = (*redir)->getUrl();
|
|
|
|
// this should deserialize from its title rec data
|
|
//Dates *odp = NULL;
|
|
//if ( *pod ) odp = (*pod)->getDates ();
|
|
// the key in this table is the date tagHash and occNum, and the
|
|
// value is the timestamp of the date. this is used by the clock
|
|
// detection algorithm to compare a date in the previous version
|
|
// of this web page to see if it changed and is therefore a clock then.
|
|
// HashTableX *cct = NULL;
|
|
// if ( *pod ) cct = (*pod)->getClockCandidatesTable();
|
|
// this should be valid
|
|
uint8_t ctype = *getContentType();
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// this now returns false and sets g_errno on error, true on success
|
|
if ( ! m_dates.setPart1 ( u , //->getUrl(),
|
|
*redir, // ru ,
|
|
ctype ,
|
|
*ip ,
|
|
*d ,
|
|
*sh32 ,
|
|
xml ,
|
|
words ,
|
|
// set D_IS_IN_DATE flag so Address.cpp
|
|
// can avoid such word in addresses!
|
|
bits ,
|
|
sections ,
|
|
info1 ,
|
|
//sv ,
|
|
//odp , // old dates
|
|
NULL , // cct ,
|
|
this , // us
|
|
*pod , // old XmlDoc
|
|
cr->m_coll ,
|
|
m_niceness )) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: dates1: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
|
|
// on all other errors, return NULL
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
// only call it once
|
|
m_simpleDatesValid = true;
|
|
// return it
|
|
return &m_dates;
|
|
}
|
|
|
|
/*
|
|
// returns NULL and sets g_errno on error, returns -1 if blocked
|
|
HashTableX *XmlDoc::getClockCandidatesTable ( ) {
|
|
// return if valid
|
|
if ( m_clockCandidatesTableValid ) return &m_clockCandidatesTable;
|
|
// otherwise, deserialize?
|
|
if ( m_clockCandidatesDataValid ) {
|
|
// and table is now valid
|
|
m_clockCandidatesTableValid = true;
|
|
// return empty table if ptr is NULL. take this out then.
|
|
if(!ptr_clockCandidatesData ) return &m_clockCandidatesTable;
|
|
// otherwise, deserialize
|
|
m_clockCandidatesTable.deserialize(ptr_clockCandidatesData ,
|
|
size_clockCandidatesData,
|
|
m_niceness );
|
|
// and return that
|
|
return &m_clockCandidatesTable;
|
|
}
|
|
|
|
// no longer using this since we got ptr_metadata
|
|
return &m_clockCandidatesTable;
|
|
|
|
// otherwise, get our dates
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (HashTableX *)dp;
|
|
// reset table just in case
|
|
m_clockCandidatesTable.reset();
|
|
// if no dates, bail
|
|
if ( dp->m_numDatePtrs == 0 ) {
|
|
m_clockCandidatesTableValid = true;
|
|
m_clockCandidatesDataValid = true;
|
|
// ptr_clockCandidatesData = NULL;
|
|
// size_clockCandidatesData = 0;
|
|
return &m_clockCandidatesTable;
|
|
}
|
|
// and set size to 32 buckets to start
|
|
if ( ! m_clockCandidatesTable.set (8,4,32,NULL,0,false,m_niceness,
|
|
"clockcands") )
|
|
return NULL;
|
|
// now stock the table
|
|
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get date
|
|
Date *di = dp->m_datePtrs[i];
|
|
// skip if got nuked
|
|
if ( ! di ) continue;
|
|
// make the key
|
|
int64_t key ;
|
|
// lower 32 bits is taghash
|
|
key = di->m_tagHash;
|
|
// upper 32 bits is occNum
|
|
key |= ((int64_t)(di->m_occNum)) << 32;
|
|
// timestamp is the val
|
|
int32_t val = di->m_timestamp;
|
|
// then store it
|
|
if ( ! m_clockCandidatesTable.addKey ( &key , &val ) )
|
|
return NULL;
|
|
}
|
|
// that is now valid
|
|
m_clockCandidatesTableValid = true;
|
|
// how many bytes to serialize?
|
|
int32_t need = m_clockCandidatesTable.getStoredSize();
|
|
// now make the ptr valid
|
|
if ( ! m_cctbuf.reserve ( need ) ) return NULL;
|
|
// store it in there
|
|
m_clockCandidatesTable.serialize ( &m_cctbuf );
|
|
// point to it
|
|
// ptr_clockCandidatesData = m_cctbuf.getBufStart();
|
|
// size_clockCandidatesData = need;
|
|
// that is valid now
|
|
m_clockCandidatesDataValid = true;
|
|
return &m_clockCandidatesTable;
|
|
}
|
|
*/
|
|
|
|
// a returned date of 0 means not found or unknown
|
|
int32_t XmlDoc::getUrlPubDate ( ) {
|
|
if ( m_urlPubDateValid ) return m_urlPubDate;
|
|
// need a first url. caller should have called setFirstUrl()
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
// use Dates
|
|
//Dates dp;
|
|
// -1 means unknown
|
|
m_urlPubDate = -1;
|
|
//m_urlAge = -1;
|
|
// try the FIRST url
|
|
Url *u = getFirstUrl();
|
|
// get last url we redirected to
|
|
Url **redir = getRedirUrl();
|
|
if ( ! redir || redir == (Url **)-1 ) {char *xx=NULL;*xx=0;}
|
|
|
|
subloop:
|
|
// . try to get the date just from the url
|
|
// . this will be zero if none found
|
|
m_urlPubDate = parseDateFromUrl ( u->getUrl() );
|
|
// we are kosher
|
|
m_urlPubDateValid = true;
|
|
// if we are unknown try last/redir url, if any
|
|
if ( m_urlPubDate == 0 && *redir && u != *redir ) {
|
|
u = *redir;
|
|
goto subloop;
|
|
}
|
|
// bail if no pub date was found in the url
|
|
if ( m_urlPubDate == 0 ) return m_urlPubDate;
|
|
// note it
|
|
log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"",
|
|
(uint32_t)m_urlPubDate );
|
|
// set the age
|
|
//m_urlAge = getTimeGlobal() - m_urlPubDate;
|
|
//if ( m_urlAge < 0 ) m_urlAge = 0;
|
|
return m_urlPubDate;
|
|
}
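// parseDateFromUrl() is defined elsewhere; as an illustration only, here is
// one simple way a /YYYY/MM/DD/ path segment could be turned into a unix
// timestamp, returning 0 when nothing date-like is found (the same "0 means
// unknown" convention used above). This is NOT the actual parser, and the
// sanity ranges are arbitrary. timegm() is a glibc/BSD extension.
#if 0
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cstdint>

static int32_t urlPubDateSketch ( const char *url ) {
	int year , month , day;
	for ( const char *p = url ; *p ; p++ ) {
		if ( *p != '/' ) continue;
		if ( sscanf ( p , "/%4d/%2d/%2d" , &year , &month , &day ) != 3 )
			continue;
		if ( year  < 1995 || year  > 2037 ) continue;
		if ( month < 1    || month > 12   ) continue;
		if ( day   < 1    || day   > 31   ) continue;
		struct tm t; memset ( &t , 0 , sizeof(t) );
		t.tm_year = year - 1900; t.tm_mon = month - 1; t.tm_mday = day;
		return (int32_t)timegm ( &t );
	}
	return 0; // unknown
}
#endif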
|
|
|
|
// . use Dates to extract pub date from the url itself if pub date exists
|
|
// . an age of "-1" means unknown
|
|
/*
|
|
int32_t XmlDoc::getOutlinkAge ( int32_t outlinkNum ) {
|
|
// use Dates
|
|
Dates dp;
|
|
// sanity
|
|
if ( outlinkNum < 0 ) { char *xx=NULL;*xx=0; }
|
|
// get it
|
|
char *us = m_links.getLinkPtr(outlinkNum);
|
|
// for now set this, until we mod Dates to use normalized
|
|
// string urls
|
|
Url u;
|
|
u.set ( us );
|
|
// try to get the date just from the url
|
|
if ( ! dp.set ( &u ,
|
|
0 , // ip
|
|
0LL , // m_newDocId
|
|
0 , // siteHash
|
|
NULL , // Xml
|
|
NULL , // Words
|
|
NULL , // Bits
|
|
NULL , // Sections
|
|
NULL , // LinkInfo
|
|
NULL , // pageSampleVec
|
|
NULL , // old date parse2
|
|
NULL , // m_newDoc
|
|
NULL , // m_oldDoc
|
|
m_coll ,
|
|
0 , // defaultTimeZone
|
|
m_niceness )){
|
|
// should never block!
|
|
char *xx=NULL; *xx= 0; }
|
|
// this will be -1 if no date was found in the url
|
|
int32_t urlPubDate = dp.getPubDate();
|
|
// if we got a valid pub date from the url, set "m_urlAge"
|
|
if ( urlPubDate == -1 ) return -1;
|
|
// note it
|
|
//log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"", m_urlDate );
|
|
// set the age
|
|
int32_t age = getTimeGlobal() - urlPubDate;
|
|
// keep positive
|
|
if ( age < 0 ) age = 0;
|
|
// return it
|
|
return age;
|
|
}
|
|
*/
|
|
|
|
|
|
// . sets g_errno on error and returns NULL
|
|
// . now returns a ptr to it so we can return NULL to signify error, that way
|
|
// all accessors have equivalent return values
|
|
// . an accessor function returns (char *)-1 if it blocked!
|
|
char *XmlDoc::getIsPermalink ( ) {
|
|
if ( m_isPermalinkValid ) return &m_isPermalink2;
|
|
Url *url = getCurrentUrl();
|
|
if ( ! url ) return NULL;
|
|
char *isRSS = getIsRSS();
|
|
// return NULL with g_errno set, -1 if blocked
|
|
if ( ! isRSS || isRSS == (char *)-1 ) return isRSS;
|
|
Links *links = getLinks();
|
|
// return NULL with g_errno set, -1 if blocked
|
|
if ( ! links || links == (Links *)-1 ) return (char *)links;
|
|
uint8_t *ct = getContentType();
|
|
// return NULL with g_errno set, -1 if blocked
|
|
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
|
|
// GUESS if it is a permalink by the format of the url
|
|
int32_t p = ::isPermalink ( links , // Links ptr
|
|
url ,
|
|
*ct , // CT_HTML default?
|
|
NULL , // LinkInfo ptr
|
|
*isRSS );// isRSS?
|
|
m_isPermalink = p;
|
|
m_isPermalink2 = p;
|
|
m_isPermalinkValid = true;
|
|
return &m_isPermalink2;
|
|
}
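// The tri-state accessor convention documented above getIsPermalink(), as a
// generic, self-contained sketch: NULL means error (with g_errno-style state
// set), a pointer value of -1 means the call blocked and will call back, and
// any other pointer means the value is computed and cached. Names here are
// illustrative only.
#if 0
#include <cstdint>

struct Thing { int32_t value; };

struct Doc {
	Thing m_thing;
	bool  m_thingValid = false;

	Thing *getThing ( ) {
		if ( m_thingValid ) return &m_thing;     // cached
		bool wouldBlock = false , error = false;
		// ... kick off or check async work here ...
		if ( error )      return NULL;           // error, errno-style
		if ( wouldBlock ) return (Thing *)-1;    // caller re-tries later
		m_thing.value = 42;
		m_thingValid  = true;
		return &m_thing;
	}
};
#endif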
|
|
|
|
// guess based on the format of the url if this is a permalink
|
|
char *XmlDoc::getIsUrlPermalinkFormat ( ) {
|
|
if ( m_isUrlPermalinkFormatValid ) return &m_isUrlPermalinkFormat;
|
|
|
|
setStatus ( "getting is url permalink format" );
|
|
|
|
Url *url = getCurrentUrl();
|
|
if ( ! url ) return NULL;
|
|
// just guess if we are rss here since we most likely do not have
|
|
// access to the url's content...
|
|
bool isRSS = false;
|
|
char *ext = url->getExtension();
|
|
if ( ext && strcasecmp(ext,"rss") == 0 ) isRSS = true;
|
|
// GUESS if it is a permalink by the format of the url
|
|
int32_t p = ::isPermalink ( NULL , // Links ptr
|
|
url ,
|
|
CT_HTML ,
|
|
NULL , // LinkInfo ptr
|
|
isRSS );// we guess this...
|
|
m_isUrlPermalinkFormat = p;
|
|
m_isUrlPermalinkFormatValid = true;
|
|
return &m_isUrlPermalinkFormat;
|
|
}
|
|
|
|
char *XmlDoc::getIsRSS ( ) {
|
|
if ( m_isRSSValid ) return &m_isRSS2;
|
|
// the xml tells us for sure
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
m_isRSS = xml->isRSSFeed();
|
|
m_isRSS2 = (bool)m_isRSS;
|
|
m_isRSSValid = true;
|
|
return &m_isRSS2;
|
|
}
|
|
|
|
char *XmlDoc::getIsSiteMap ( ) {
|
|
if ( m_isSiteMapValid ) return &m_isSiteMap;
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
|
|
char *uf = m_firstUrl.getFilename();
|
|
int32_t ulen = m_firstUrl.getFilenameLen();
|
|
// sitemap.xml
|
|
m_isSiteMap = false;
|
|
// must be xml to be a sitemap
|
|
if ( *ct == CT_XML &&
|
|
ulen == 11 &&
|
|
strncmp(uf,"sitemap.xml",11) == 0 )
|
|
m_isSiteMap = true;
|
|
m_isSiteMapValid = true;
|
|
return &m_isSiteMap;
|
|
}
|
|
|
|
// . this function should really be called getTagTokens() because it mostly
|
|
// works on HTML documents, not XML, and just sets an array of ptrs to
|
|
// the tags in the document, including ptrs to the text in between
|
|
// tags.
|
|
Xml *XmlDoc::getXml ( ) {
|
|
|
|
// return it if it is set
|
|
if ( m_xmlValid ) return &m_xml;
|
|
|
|
// note it
|
|
setStatus ( "parsing html");
|
|
|
|
// get the filtered content
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
|
|
int32_t u8len = size_utf8Content - 1;
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
|
|
|
|
// set it
|
|
if ( ! m_xml.set ( *u8 ,
|
|
u8len ,
|
|
false , // ownData?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
false , // setParentsArg?
|
|
m_niceness ,
|
|
*ct ) )
|
|
// return NULL on error with g_errno set
|
|
return NULL;
|
|
// set just once
|
|
m_xmlValid = true;
|
|
// all done
|
|
return &m_xml;
|
|
}
|
|
|
|
// Language support static stuff
|
|
enum {
|
|
METHOD_TAG = 0,
|
|
METHOD_DMOZ,
|
|
METHOD_URL,
|
|
METHOD_OUTLINKS,
|
|
METHOD_INLINKS,
|
|
METHOD_FREQ,
|
|
METHOD_DEFAULT,
|
|
METHOD_IP,
|
|
METHOD_ROOT,
|
|
METHOD_CAP
|
|
};
|
|
|
|
bool setLangVec ( Words *words ,
|
|
SafeBuf *langBuf ,
|
|
Sections *ss ,
|
|
int32_t niceness ) {
|
|
|
|
int64_t *wids = words->getWordIds ();
|
|
char **wptrs = words->m_words;
|
|
int32_t nw = words->getNumWords ();
|
|
|
|
// allocate
|
|
if ( ! langBuf->reserve ( nw ) ) return false;
|
|
|
|
uint8_t *langVector = (uint8_t *)langBuf->getBufStart();
|
|
|
|
// now set the langid
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// default
|
|
langVector[i] = langUnknown;
|
|
// add the word
|
|
if ( wids[i] == 0LL ) continue;
|
|
// skip if number
|
|
if ( is_digit(wptrs[i][0]) ) {
|
|
langVector[i] = langTranslingual;
|
|
continue;
|
|
}
|
|
// get the lang bits. does not include langTranslingual
|
|
// or langUnknown
|
|
int64_t bits = g_speller.getLangBits64 ( &wids[i] );
|
|
// skip if not unique
|
|
char count = getNumBitsOn64 ( bits ) ;
|
|
// if we only got one lang we could be, assume that
|
|
if ( count == 1 ) {
|
|
// get it. bit #0 is english, so add 1
|
|
char langId = getBitPosLL((uint8_t *)&bits) + 1;
|
|
//langVector[i] = g_wiktionary.getLangId(&wids[i]);
|
|
langVector[i] = langId;
|
|
continue;
|
|
}
|
|
// ambiguous? set it to unknown then
|
|
if ( count >= 2 ) {
|
|
langVector[i] = langUnknown;
|
|
continue;
|
|
}
|
|
// try setting based on script. greek. russian. etc.
|
|
// if the word was not in the wiktionary.
|
|
// this will be langUnknown if not definitive.
|
|
langVector[i] = getCharacterLanguage(wptrs[i]);
|
|
}
|
|
|
|
// . now go sentence by sentence
|
|
// . get the 64 bit vector for each word in the sentence
|
|
// . then intersect them all
|
|
// . if the result is a unique langid, assign that langid to
|
|
// all words in the sentence
|
|
|
|
// get first sentence in doc
|
|
Section *si = NULL;
|
|
if ( ss ) si = ss->m_firstSent;
|
|
// scan the sentence sections and or in the bits we should
|
|
for ( ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// reset vec
|
|
int64_t bits = LANG_BIT_MASK;
|
|
// get lang 64 bit vec for each wid in sentence
|
|
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[j] ) continue;
|
|
// skip if starts with digit
|
|
if ( is_digit(wptrs[j][0]) ) continue;
|
|
// get 64 bit lang vec. does not include
|
|
// langUnknown or langTranslingual bits
|
|
bits &= g_speller.getLangBits64 ( &wids[j] );
|
|
}
|
|
// bail if none
|
|
if ( ! bits ) continue;
|
|
// skip if more than one language in intersection
|
|
if ( getNumBitsOn64(bits) != 1 ) continue;
|
|
// get it. bit #0 is english, so add 1
|
|
char langId = getBitPosLL((uint8_t *)&bits) + 1;
|
|
// ok, must be this language i guess
|
|
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[j] ) continue;
|
|
// skip if starts with digit
|
|
if ( is_digit(wptrs[j][0]) ) continue;
|
|
// set it
|
|
langVector[j] = langId;
|
|
}
|
|
}
|
|
|
|
// try the same thing but do not use sentences. use windows of
|
|
// 5 words. this will pick up pages that have an english menu
|
|
// where each menu item is an individual sentence and only
|
|
// one word.
|
|
// http://www.topicexchange.com/
|
|
int64_t window[5];
|
|
int32_t wpos[5];
|
|
memset ( window , 0 , 8*5 );
|
|
int32_t wp = 0;
|
|
int32_t total = 0;
|
|
// now set the langid
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// must be alnum
|
|
if ( ! wids[i] ) continue;
|
|
// skip if starts with digit
|
|
if ( is_digit(wptrs[i][0]) ) continue;
|
|
// skip if lang already set to a language
|
|
//if ( langVector[i] != langUnknown &&
|
|
// langVector[i] != langTranslingual )
|
|
// continue;
|
|
// get last 5
|
|
window[wp] = g_speller.getLangBits64 ( &wids[i] );
|
|
// skip if not in dictionary!
|
|
if ( window[wp] == 0 ) continue;
|
|
// otherwise, store it
|
|
wpos [wp] = i;
|
|
if ( ++wp >= 5 ) wp = 0;
|
|
// need at least 3 samples
|
|
if ( ++total <= 2 ) continue;
|
|
// intersect them all together
|
|
int64_t bits = LANG_BIT_MASK;
|
|
for ( int32_t j = 0 ; j < 5 ; j++ ) {
|
|
// skip if uninitialized, like if we have 3
|
|
// or only 4 samples
|
|
if ( ! window[j] ) continue;
|
|
// otherwise, toss it in the intersection
|
|
bits &= window[j];
|
|
}
|
|
// skip if intersection empty
|
|
if ( ! bits ) continue;
|
|
// skip if more than one language in intersection
|
|
if ( getNumBitsOn64(bits) != 1 ) continue;
|
|
// get it. bit #0 is english, so add 1
|
|
char langId = getBitPosLL((uint8_t *)&bits) + 1;
|
|
// set all in window to this language
|
|
for ( int32_t j = 0 ; j < 5 ; j++ ) {
|
|
// skip if uninitialized
|
|
if ( ! window[j] ) continue;
|
|
// otherwise, set it
|
|
langVector[wpos[j]] = langId;
|
|
}
|
|
}
|
|
|
|
|
|
return true;
|
|
}
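// The two passes above (per-sentence and 5-word sliding window) rely on the
// same trick: AND together the 64-bit language bitmasks of nearby words and
// only commit to a language when exactly one bit survives. A minimal
// standalone sketch of that idea, assuming (like getLangBits64()) that bit
// (langId-1) is set for every language a word belongs to. Illustrative only,
// not compiled into the build:
#if 0
static int32_t intersectLangBits ( const uint64_t *masks , int32_t n ,
				   uint64_t fullMask ) {
	uint64_t bits = fullMask;		// start with every language
	for ( int32_t i = 0 ; i < n ; i++ ) {
		if ( ! masks[i] ) continue;	// word not in the dictionary
		bits &= masks[i];		// narrow the candidate set
	}
	if ( ! bits ) return 0;			// empty intersection
	if ( __builtin_popcountll(bits) != 1 ) return 0; // still ambiguous
	// lowest set bit is bit #0 = english, so langId = bit pos + 1
	return __builtin_ctzll(bits) + 1;
}
#endif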

// 1-1 with the words!
uint8_t *XmlDoc::getLangVector ( ) {

	if ( m_langVectorValid ) {
		// can't return NULL, that means error!
		uint8_t *v = (uint8_t *)m_langVec.getBufStart();
		if ( ! v ) return (uint8_t *)0x01;
		return v;
	}

	// words
	Words *words = getWords();
	if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;

	// get the sections without implied sections
	Sections *ss = getImpliedSections();
	if ( ! ss || ss==(void *)-1) return (uint8_t *)ss;

	if ( ! setLangVec ( words , &m_langVec , ss , m_niceness) )
		return NULL;

	m_langVectorValid = true;
	// can't return NULL, that means error!
	uint8_t *v = (uint8_t *)m_langVec.getBufStart();
	if ( ! v ) return (uint8_t *)0x01;
	return v;
}
|
|
|
|
// returns -1 and sets g_errno on error
|
|
uint8_t *XmlDoc::getLangId ( ) {
|
|
if ( m_langIdValid ) return &m_langId;
|
|
setStatus ( "getting lang id");
|
|
|
|
// debug hack
|
|
//m_langId = langRussian;
|
|
//m_langIdValid = true;
|
|
//return &m_langId;
|
|
|
|
// get the stuff we need
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (uint8_t *)ip;
|
|
|
|
// . if we got no ip, we can't get the page...
|
|
// . also getLinks() will call getSiteNumInlinks() which will
|
|
// call getSiteLinkInfo() and will core if ip is 0 or -1
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
m_langId = langUnknown;
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
//Xml *xml = getXml ();
|
|
//if ( ! xml || xml == (Xml *)-1 ) return (uint8_t *)xml;
|
|
Words *words = getWords ();
|
|
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
|
|
// do not get regular sections, getSections() which will call
|
|
// getImpliedSections(), because then that will need to set addresses
|
|
// and dates, etc. the addresses could return NULL with EBUFOVERFLOW
|
|
// from a static buffer overflow causing us some problems here and
|
|
// since that g_errno is only really handled well in getIndexCode()
|
|
// it will log that CRITICAL CRITICAL message. and we really only
|
|
// need the sections to avoid looking at script tag sections, etc.
|
|
// when calling Words::getLanguage()
|
|
Sections *sections = getExplicitSections();
|
|
// did it block?
|
|
if ( sections==(Sections *)-1) return(uint8_t *)sections;
|
|
// well, it still calls Dates::parseDates which can return g_errno
|
|
// set to EBUFOVERFLOW...
|
|
if ( ! sections && g_errno != EBUFOVERFLOW ) return NULL;
|
|
// if sections is still NULL - try lang id without sections then,
|
|
// reset g_errno
|
|
g_errno = 0;
|
|
//Links *links = getLinks();
|
|
//if ( ! links || links == (Links *)-1 ) return (uint8_t *)links;
|
|
//LinkInfo *info1 = getLinkInfo1();
|
|
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (uint8_t *)info1;
|
|
//CatRec *cat = getCatRec ();
|
|
//if ( ! cat || cat == (CatRec *)-1) return (uint8_t *)cat;
|
|
uint8_t *lv = getLangVector();
|
|
if ( ! lv || lv == (void *)-1 ) return (uint8_t *)lv;
|
|
|
|
setStatus ( "getting lang id");
|
|
|
|
// compute langid from vector
|
|
m_langId = computeLangId ( sections , words, (char *)lv );
|
|
if ( m_langId != langUnknown ) {
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
// . try the meta description i guess
|
|
// . 99% of the time we don't need this because the above code
|
|
// captures the language
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription( &mdlen );
|
|
Words mdw;
|
|
mdw.setx ( md , mdlen , m_niceness );
|
|
SafeBuf langBuf;
|
|
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
|
|
char *tmpLangVec = langBuf.getBufStart();
|
|
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
|
|
if ( m_langId != langUnknown ) {
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
// try meta keywords
|
|
md = getMetaKeywords( &mdlen );
|
|
mdw.setx ( md , mdlen , m_niceness );
|
|
langBuf.purge();
|
|
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
|
|
tmpLangVec = langBuf.getBufStart();
|
|
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
|
|
// lv = langVec
|
|
char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {
|
|
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
// this means null too
|
|
if ( sections && sections->m_numSections == 0 ) sp = NULL;
|
|
int32_t badFlags = SEC_SCRIPT|SEC_STYLE;//|SEC_SELECT;
|
|
|
|
int32_t counts [ MAX_LANGUAGES ];
|
|
memset ( counts , 0 , MAX_LANGUAGES * 4);
|
|
|
|
|
|
|
|
int32_t nw = words->getNumWords ();
|
|
char **wptrs = words->m_words;
|
|
int32_t *wlens = words->m_wordLens;
|
|
|
|
|
|
// now set the langid
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if in script or style section
|
|
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
|
//
|
|
// skip if in a url
|
|
//
|
|
// blah/
|
|
if ( wptrs[i][wlens[i]] == '/' ) continue;
|
|
// blah.blah or blah?blah
|
|
if ( (wptrs[i][wlens[i]] == '.' ||
|
|
wptrs[i][wlens[i]] == '?' ) &&
|
|
is_alnum_a(wptrs[i][wlens[i]+1]) )
|
|
continue;
|
|
// /blah or ?blah
|
|
if ( (i>0 && wptrs[i][-1] == '/') ||
|
|
(i>0 && wptrs[i][-1] == '?') )
|
|
continue;
|
|
// add it up
|
|
counts[(unsigned char)lv[i]]++;
|
|
}
|
|
|
|
// get the majority count
|
|
int32_t max = 0;
|
|
int32_t maxi = 0;
|
|
// skip langUnknown by starting at 1, langEnglish
|
|
for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
|
|
// skip translingual
|
|
if ( i == langTranslingual ) continue;
|
|
if ( counts[i] <= max ) continue;
|
|
max = counts[i];
|
|
maxi = i;
|
|
}
|
|
|
|
return maxi;
|
|
//m_langId = maxi;
|
|
//m_langIdValid = true;
|
|
//return &m_langId;
|
|
|
|
/*
|
|
int32_t freqScore = 0;
|
|
int32_t lang;
|
|
if ( ! m_processedLang ) {
|
|
// do not repeat this call for this document
|
|
m_processedLang = true;
|
|
lang = words->getLanguage( sections ,
|
|
1000 , // sampleSize ,
|
|
m_niceness,
|
|
&freqScore);
|
|
// return NULL on error with g_errno set
|
|
if ( lang == -1 ) return NULL;
|
|
// we got it from words, return
|
|
if ( lang != 0 ) {
|
|
m_langId = lang;
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
}
|
|
|
|
m_langId = 0;
|
|
// try from charset
|
|
uint16_t *charset = getCharset ( );
|
|
if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset;
|
|
// do based on charset
|
|
if ( *charset == csGB18030 ) m_langId = langChineseTrad;
|
|
if ( *charset == csGBK ) m_langId = langChineseSimp;
|
|
|
|
if ( m_langId ) {
|
|
m_langIdValid = true;
|
|
return &m_langId;
|
|
}
|
|
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
|
|
// this lookup here might be unnecessary
|
|
uint8_t *rl = NULL;
|
|
if ( ! *isRoot ) {
|
|
rl = getRootLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
|
|
}
|
|
|
|
//Url *u = getCurrentUrl();
|
|
Url *u = getFirstUrl();
|
|
uint8_t gs[METHOD_CAP];
|
|
// reset language method vector
|
|
memset( gs , 0, sizeof(uint8_t) * METHOD_CAP );
|
|
// Let the site tell us what language it's in
|
|
gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml );
|
|
// Guess from the FIRST URL (unredirected url)
|
|
gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() );
|
|
// Guess from the outlinks
|
|
gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links );
|
|
// Guess from the inlinks
|
|
gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip);
|
|
// root page's language, if there was one
|
|
if ( ! *isRoot ) gs [METHOD_ROOT] = *rl;
|
|
|
|
int32_t scores[MAX_LANGUAGES];
|
|
memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES );
|
|
// weights for the 10 methods
|
|
char cw[] = { 8,9,4,7,6,7,8,1,2};
|
|
// add up weighted scores
|
|
for(int i = 0; i < METHOD_CAP; i++ )
|
|
scores[gs[i]] += cw[i];
|
|
|
|
// reset the "lang" to langUnknown which is 0
|
|
lang = langUnknown ;
|
|
int max, oldmax;
|
|
max = oldmax = 0;
|
|
// find best language
|
|
for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) {
|
|
if ( scores[i] < max) continue;
|
|
oldmax = max;
|
|
max = scores[i];
|
|
lang = i;
|
|
}
|
|
// give up if not too conclusive
|
|
if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) {
|
|
//log(LOG_DEBUG, "build: Language: Threshold, score "
|
|
// "(%"INT32" - %"INT32") %"INT32" vs. %"INT32".\n",
|
|
// (int32_t)max,
|
|
// (int32_t)oldmax,
|
|
// (int32_t)max - oldmax,
|
|
// (int32_t)3);//(int32_t)cr->m_languageThreshold);
|
|
lang = langUnknown;
|
|
}
|
|
// Make sure we're over the bailout value, this
|
|
// keeps low scoring methods like TLD from being
|
|
// the decider if it was the only successful method.
|
|
if ( max < 5 ) { // cr->m_languageBailout ) {
|
|
//log(LOG_DEBUG, "build: Language: Bailout, "
|
|
// "score %"INT32" vs. %"INT32".",
|
|
// (int32_t)max, (int32_t)5);//cr->m_languageBailout);
|
|
lang = langUnknown;
|
|
}
|
|
// If the language is still not known,
|
|
// use the language detected from the frames.
|
|
//if(lang == langUnknown) lang = frameFoundLang;
|
|
// . try dmoz if still unknown
|
|
// . limit to 10 of them
|
|
// all done, do not repeat
|
|
m_langIdValid = true;
|
|
m_langId = lang;
|
|
m_langIdScore = max;
|
|
return &m_langId;
|
|
*/
|
|
}
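// computeLangId() above is just a majority vote over the per-word language
// vector, skipping script/style sections and URL-ish tokens, and ignoring
// langUnknown and langTranslingual in the tally. A standalone sketch of that
// tally (array size and enum ids here are illustrative; not compiled):
#if 0
static int32_t majorityLang ( const uint8_t *langVec , int32_t nw ,
			      int32_t numLangs , int32_t translingualId ) {
	int32_t counts[256];
	memset ( counts , 0 , sizeof(counts) );
	for ( int32_t i = 0 ; i < nw ; i++ )
		counts[langVec[i]]++;
	int32_t best = 0 , besti = 0;
	// slot 0 is langUnknown, so start the scan at 1 (langEnglish)
	for ( int32_t i = 1 ; i < numLangs && i < 256 ; i++ ) {
		if ( i == translingualId ) continue;
		if ( counts[i] <= best ) continue;
		best  = counts[i];
		besti = i;
	}
	return besti; // 0 means no language ever got a vote
}
#endif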


Words *XmlDoc::getWords ( ) {
	// return it if it is set
	if ( m_wordsValid ) return &m_words;
	// this will set it if necessary
	Xml *xml = getXml();
	// returns NULL on error, -1 if blocked
	if ( ! xml || xml == (Xml *)-1 ) return (Words *)xml;
	// note it
	setStatus ( "getting words");
	// now set what we need
	if ( ! m_words.set ( xml ,
			     true , // computeWordIds?
			     m_niceness ))
		return NULL;
	// we got it
	m_wordsValid = true;
	return &m_words;
}

Bits *XmlDoc::getBits ( ) {
	// return it if it is set
	if ( m_bitsValid ) return &m_bits;
	// this will set it if necessary
	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Bits *)words;
	// now set what we need
	if ( ! m_bits.set ( words , m_version , m_niceness ) )
		return NULL;
	// we got it
	m_bitsValid = true;
	return &m_bits;
}

Bits *XmlDoc::getBitsForSummary ( ) {
	// return it if it is set
	if ( m_bits2Valid ) return &m_bits2;
	// this will set it if necessary
	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Bits *)words;
	// now set what we need
	if ( ! m_bits2.setForSummary ( words ) ) return NULL;
	// we got it
	m_bits2Valid = true;
	return &m_bits2;
}

Pos *XmlDoc::getPos ( ) {
	// return it if it is set
	if ( m_posValid ) return &m_pos;
	// this will set it if necessary
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (Pos *)ww;
	//Sections *sections = getSections();
	//if ( !sections ||sections==(Sections *)-1) return(Pos *)sections;
	// now set what we need
	//if ( ! m_pos.set ( ww , sections ) ) return NULL;
	if ( ! m_pos.set ( ww , NULL ) ) return NULL;
	// we got it
	m_posValid = true;
	return &m_pos;
}

Phrases *XmlDoc::getPhrases ( ) {
	// return it if it is set
	if ( m_phrasesValid ) return &m_phrases;
	// this will set it if necessary
	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Phrases *)words;
	// get this
	Bits *bits = getBits();
	// bail on error
	if ( ! bits ) return NULL;
	// now set what we need
	if ( ! m_phrases.set ( words ,
			       bits ,
			       true , // use stop words
			       false , // use stems
			       m_version ,
			       m_niceness ) )
		return NULL;
	// we got it
	m_phrasesValid = true;
	return &m_phrases;
}
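// Every getter above follows the same tri-state convention: return the cached
// member if it is already valid, return NULL with g_errno set on error, and
// return -1 if the call blocked and m_masterLoop will be called back later.
// A minimal sketch of that idiom using a hypothetical Foo/Bar pair (the names
// and members are made up for illustration; not compiled):
#if 0
Foo *XmlDoc::getFoo ( ) {
	// already computed? hand back the cached copy
	if ( m_fooValid ) return &m_foo;
	// dependency first; propagate NULL (error) or -1 (blocked) as-is
	Bar *bar = getBar();
	if ( ! bar || bar == (Bar *)-1 ) return (Foo *)bar;
	// compute; on failure g_errno is set and we return NULL
	if ( ! m_foo.set ( bar , m_niceness ) ) return NULL;
	m_fooValid = true;
	return &m_foo;
}
#endif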
|
|
|
|
/*
|
|
Synonyms *XmlDoc::getSynonyms ( ) {
|
|
// return if already set
|
|
if ( m_synonymsValid ) return &m_synonyms;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (Synonyms *)words;
|
|
Phrases *phrases = getPhrases ();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (Synonyms *)phrases;
|
|
uint8_t *lv = getLangVector();
|
|
if ( ! lv || lv == (void *)-1 ) return (Synonyms *)lv;
|
|
// primary language of the document
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (Synonyms *) langId;
|
|
// . now set what we need
|
|
// . provide a buf for which synonyms can be stored if we need to
|
|
SafeBuf *synBuf = NULL;
|
|
if ( m_pbuf || m_storeTermListInfo ) synBuf = &m_synBuf;
|
|
|
|
// force on for printing out the synonyms in the loop below
|
|
//synBuf = &m_synBuf;
|
|
|
|
if ( ! m_synonyms.set ( words,
|
|
(char *)lv,
|
|
(char)*langId,phrases,
|
|
m_niceness,synBuf) )
|
|
return NULL;
|
|
|
|
// we got it
|
|
m_synonymsValid = true;
|
|
return &m_synonyms;
|
|
}
|
|
*/
|
|
|
|
Sections *XmlDoc::getExplicitSections ( ) {
|
|
// these sections might or might not have the implied sections in them
|
|
if ( m_explicitSectionsValid ) return &m_sections;
|
|
|
|
// if json forget this it is only html
|
|
//uint8_t *ct = getContentType();
|
|
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
|
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
|
// m_sectionsValid = true;
|
|
// return &m_sections;
|
|
//}
|
|
|
|
setStatus ( "getting explicit sections" );
|
|
// use the old title rec to make sure we parse consistently!
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;
|
|
// shortcut
|
|
//XmlDoc *od = *pod;
|
|
// if the serialized section is valid, use that
|
|
//char *sd = NULL;
|
|
//bool valid = false;
|
|
//if ( od && od->m_sectionsReplyValid ) valid = true;
|
|
//if ( valid ) sd = od->ptr_sectionsReply;
|
|
// shouldn't we use the section data in ptr_sections for this???
|
|
//bool valid = m_sectionsReplyValid ;
|
|
//char *sd = NULL;
|
|
//if ( valid ) sd = ptr_sectionsReply;
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
|
|
// need these too now
|
|
Phrases *phrases = getPhrases();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases;
|
|
// get this
|
|
Bits *bits = getBits();
|
|
// bail on error
|
|
if ( ! bits ) return NULL;
|
|
// the site hash
|
|
int64_t *sh64 = getSiteHash64();
|
|
// sanity check
|
|
if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; }
|
|
if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64;
|
|
// the docid
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
setStatus ( "getting sections");
|
|
|
|
//char *sv = NULL;
|
|
//if ( m_setFromTitleRec ) sv = ptr_sectionsVotes;
|
|
|
|
// debug time to find a slow url
|
|
int64_t start = gettimeofdayInMillisecondsLocal();
|
|
|
|
// this uses the sectionsReply to see which sections are "text", etc.
|
|
// rather than compute it expensively
|
|
if ( ! m_calledSections &&
|
|
// we get malformed sections error for some diffbot replies
|
|
//*ct != CT_JSON &&
|
|
! m_sections.set ( &m_words ,
|
|
&m_phrases ,
|
|
bits ,
|
|
getFirstUrl() ,
|
|
*d ,
|
|
*sh64 , // 64 bits
|
|
cr->m_coll ,
|
|
m_niceness ,
|
|
m_masterState , // state
|
|
m_masterLoop , // callback
|
|
*ct ,
|
|
&m_dates ,
|
|
NULL , // sd // sections data
|
|
true , // sections data valid?
|
|
NULL , // sv // for m_nsvt
|
|
//*tph ,
|
|
NULL , // buf
|
|
0 )) { // bufSize
|
|
m_calledSections = true;
|
|
// sanity check, this should not block, we are setting
|
|
// exclusively from the titleRec
|
|
//if ( sd ) { char *xx=NULL;*xx=0; }
|
|
// it blocked, return -1
|
|
return (Sections *) -1;
|
|
}
|
|
|
|
int64_t end = gettimeofdayInMillisecondsLocal();
|
|
|
|
if ( end - start > 1000 )
|
|
log("build: %s section set took %"INT64" ms",
|
|
m_firstUrl.m_url,end -start);
|
|
|
|
|
|
// error? ETAGBREACH for example... or maybe ENOMEM
|
|
if ( g_errno ) return NULL;
|
|
// set inlink bits
|
|
m_bits.setInLinkBits ( &m_sections );
|
|
// we got it
|
|
m_explicitSectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
Sections *XmlDoc::getImpliedSections ( ) {
|
|
if ( m_impliedSectionsValid ) return &m_sections;
|
|
|
|
// get the sections without implied sections
|
|
Sections *sections = getExplicitSections();
|
|
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
|
|
|
|
// just use that for now if not doing events to save time! because
|
|
// adding implied sections really sucks the resources.
|
|
m_impliedSectionsValid = true;
|
|
return &m_sections;
|
|
|
|
// this will set it if necessary
|
|
Words *words = getWords();
|
|
// returns NULL on error, -1 if blocked
|
|
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
|
|
// get this
|
|
Bits *bits = getBits();
|
|
// bail on error
|
|
if ( ! bits ) return NULL;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now we need basic date types to add implied sections that
|
|
// have a dow/dom header and tod brother sections
|
|
|
|
// THIS WAS in getExplicitSections() but now m_wids is NULL.
|
|
// m_wids is set in setPart1() called by XmlDoc::getSimpleDates(),
|
|
// which calls getExplicitSections().
|
|
// . This was called for the benefit of Sections::addImpliedSections()
|
|
// but now getAddresses() which we call below ends up calling
|
|
// getSimpleDates() which calls m_dates.setPart1() which calls
|
|
// m_dates.parseDates() so this is no longer needed i guess.
|
|
/*
|
|
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits,
|
|
sections, m_niceness , &m_firstUrl ,
|
|
*ct )) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("doc: dates3: %s",mstrerror(g_errno));
|
|
// this just means we ran out of stack space to parse
|
|
// out all the dates, so ignore and continue... that way
|
|
// Spider.cpp does not give up and keep retrying us over
|
|
// and over again
|
|
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
|
|
// on all other errors, return NULL
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
*/
|
|
|
|
// if we got no sections it was bad html. so don't go any further
|
|
// lest we core in other code..
|
|
// it might have also just been an empty doc.
|
|
// either way we'll core in getAddresses cuz it calls getSimpleDates
|
|
// which will core in Dates::setPart1() trying to use m_sectionPtrs
|
|
if ( sections->m_numSections == 0 ) {
|
|
m_impliedSectionsValid = true;
|
|
// hack to avoid core for empty docs like www.mini-polis.com
|
|
sections->m_addedImpliedSections = true;
|
|
return &m_sections;
|
|
}
|
|
// . now set addresses so we can use those to add implied sections
|
|
// . this calls getSimpleDates() which calls m_dates.setPart1()
|
|
// which calls parseDates again
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (void *)-1 ) return (Sections *)aa;
|
|
|
|
// . now add implied sections
|
|
// . return NULL with g_errno set on error
|
|
if ( ! m_sections.addImpliedSections ( aa ) ) return NULL;
|
|
|
|
// we got it
|
|
m_impliedSectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
// add in Section::m_sentFlags bits having to do with our voting tables
|
|
Sections *XmlDoc::getSections ( ) {
|
|
|
|
setStatus("getting sections");
|
|
|
|
// get the sections without implied sections
|
|
Sections *ss = getImpliedSections();
|
|
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
|
|
|
|
// hash the turk votes (each vote maps a contenthash or taghash to
|
|
// a value) and use these to set sections sentence flags, etc.
|
|
//HashTableX *tvt = getTurkVotingTable ();
|
|
//if ( ! tvt || tvt == (void *)-1 ) return (Sections *)tvt;
|
|
|
|
// returns NULL if our url is root!
|
|
//HashTableX *rvt = getRootVotingTable();
|
|
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
|
|
|
|
SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt;
|
|
|
|
uint32_t *tph = getTagPairHash32();
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph;
|
|
|
|
// need a getUseSectiondb() function...
|
|
|
|
if ( ! m_useSectiondb ) {
|
|
m_sectionsValid = true;
|
|
return &m_sections;
|
|
}
|
|
|
|
// start here
|
|
Section *si;
|
|
|
|
/*
|
|
// get first sentence in doc
|
|
si = ss->m_firstSent;
|
|
// do not bother scanning if no votes
|
|
if ( osvt->getNumVotes() <= 0 ) si = NULL;
|
|
// scan the sentence sections and or in the bits we should
|
|
for ( ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// combine section tagHash with contentHashAll to get
|
|
// the "modified tagHash"
|
|
int32_t modified = si->m_tagHash ^ si->m_contentHash;
|
|
// save this
|
|
float dups = osvt->getNumSampled (modified,SV_TAGCONTENTHASH);
|
|
// . getNumSampled() combines both m_nsvt and m_osvt so it
|
|
// includes ourselves... NO!... let's change this!
|
|
// the osvt should not include votes from us!
|
|
// it strips those out in SectionVotingTable::addListOfVotes()
|
|
// . if it is a print-friendly version of the same page then
|
|
// one of the two should have been deduped and not indexed,
|
|
// so be strict with adhering to no more than 1!
|
|
if ( dups > 0 ) si->m_flags |= SEC_DUP;
|
|
// . content hash must be unique!
|
|
// . can detect texty bios repeated throughout the site
|
|
// . this is the hash of the words directly in the section
|
|
// . HACK: the contentHash is the "tagHash" for this call
|
|
// . SectionVote::m_numSampled is how many sections over all
|
|
// docs we indexed from this site have this m_contentHash
|
|
// . note that it is not restricted to pages with the same
|
|
// tagPairHash as us (i.e. pages with similar layouts)
|
|
// therefore it is very flexible!!! it is only restricted
|
|
// to pages with our same site hash.
|
|
// . getNumSampled() combines both m_nsvt and m_osvt so it
|
|
// includes ourselves
|
|
// . if it is a print-friendly version of the same page then
|
|
// one of the two should have been deduped and not indexed,
|
|
// so be strict with adhering to no more than 1!
|
|
if ( dups > 0 ) continue;
|
|
// . must be in a unique section
|
|
// . if the section has siblings, skip it!
|
|
if ( si->m_numOccurences > 1 ) continue;
|
|
// . eliminate dynamic menus
|
|
// . like "related posts" menus
|
|
// . therefore require that we must be "texty" ...
|
|
// . i.e. be like 80% plain text and no more than 20% link text
|
|
// . vote on this since in some cases article may be mostly
|
|
// just all in anchor text on a few article pages, but on
|
|
// other pages it is well-behaved
|
|
if ( osvt->getScore ( si->m_tagHash, SV_TEXTY) < .80 )
|
|
continue;
|
|
// . check for comment sections
|
|
// . these are text and the content is unique
|
|
// . BUT the section tagHash is typically repeated at least
|
|
// once on some other pages (HOPEFULLY!!!!)
|
|
// . if we only require there be X other pages from this site
|
|
// with the same layout, we might get unlucky in that each
|
|
// page has 1 or less comments!!! how to fix???
|
|
// . anyway, we ask for the max # sampled from all of the votes
|
|
// here because if just one page has 2+ copies of this
|
|
// section enum tag hash, that is enough to be a comment
|
|
// section
|
|
// . SV_TEXTY_MAX_SAMPLED is a statistic compiled from the
|
|
// voters and does not actually exist in sectiondb per se.
|
|
// we add this statistic transparently in addVote() below
|
|
// . it just gets the num sampled from the voter that had the
|
|
// maximum m_numSampled value, because we don't want an
|
|
// average in this case
|
|
if ( osvt->getNumSampled(si->m_tagHash,SV_TEXTY_MAX_SAMPLED)>0)
|
|
continue;
|
|
// set it
|
|
si->m_flags |= SEC_ARTICLE;
|
|
// tally it up
|
|
//m_numAlnumWordsInArticle += si->m_exclusive;
|
|
// and another flag
|
|
//m_hadArticle = true;
|
|
}
|
|
*/
|
|
|
|
//
|
|
// . how many other pages from this site have our tagpairhash?
|
|
// . that is all the unique adjacent tag pair hashes xor'd together
|
|
// . kind of represents the template of the webpage, ideally
|
|
//
|
|
//int32_t numSimLayouts = osvt->getNumSampled ( *tph , SV_TAGPAIRHASH );
|
|
|
|
///////////////////////////////////////
|
|
//
|
|
// set m_dupVotes and m_notDupVotes for each section
|
|
//
|
|
// answers the question... out of all the pages with this taghash,
|
|
// from this site, how often is this content repeated?
|
|
//
|
|
// trumba.com often repeats an event on its various feeds, but
|
|
// not on EVERY page. so we should adjust the event title penalties
|
|
// based on the ratio of repeated to not-repeated from the various
|
|
// pages on the site that have the same *taghash*
|
|
//
|
|
///////////////////////////////////////
|
|
|
|
// get first sentence in doc
|
|
si = ss->m_firstSent;
|
|
// do not bother scanning if no votes
|
|
if ( osvt->getNumVotes() <= 0 ) si = NULL;
|
|
// assume no dups
|
|
m_maxVotesForDup = 0;
|
|
// scan the sentence sections and or in the bits we should
|
|
for ( ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// sanity check
|
|
if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
|
|
// how many pages from this site have this taghash for
|
|
// a sentence
|
|
float nt;
|
|
nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH);
|
|
// skip if nobody! (except us)
|
|
if ( nt <= 0.0 ) continue;
|
|
// . get out tag content hash
|
|
// . for some reason m_contentHash is 0 for like menu-y sectns
|
|
int32_t modified =si->m_turkTagHash32^si->m_sentenceContentHash64;
|
|
// . now how many pages also had same content in that tag?
|
|
// . TODO: make sure numsampled only counts a docid once!
|
|
// and this is not each time it occurs on that page.
|
|
float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH);
|
|
// cast it to a int32_t
|
|
int32_t votes1 = (int32_t)nsam;
|
|
// by default, complement
|
|
int32_t votes2 = (int32_t)nt - votes1;
|
|
// store votes
|
|
si->m_votesForDup = votes1;
|
|
si->m_votesForNotDup = votes2;
|
|
// what's the most dup votes we had...
|
|
if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1;
|
|
// set it
|
|
//if ( si->m_votesForDup > 2 * si->m_votesForNotDup &&
|
|
// si->m_votesForDup >= 1 &&
|
|
// ! (si->m_flags & SEC_HAS_NONFUZZYDATE) )
|
|
// si->m_sentFlags |= SENT_DUP_SECTION;
|
|
}
|
|
|
|
m_sectionsValid = true;
|
|
return &m_sections;
|
|
}
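// The loop above splits the pages on this site that share a sentence's xpath
// hash into "same content" (m_votesForDup) and "different content"
// (m_votesForNotDup). A small sketch of turning those two counts into the
// repeated-to-not-repeated ratio the comments describe (illustrative only,
// not compiled):
#if 0
static float dupRatio ( int32_t votesForDup , int32_t votesForNotDup ) {
	int32_t total = votesForDup + votesForNotDup;
	if ( total <= 0 ) return 0.0;	// no other page voted on this xpath
	// 1.0 = every page with this xpath repeated the same content,
	// 0.0 = the content was unique to this document
	return (float)votesForDup / (float)total;
}
#endif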
|
|
|
|
SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) {
|
|
if ( m_nsvtValid ) return &m_nsvt;
|
|
// need sections
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
|
|
// and dates
|
|
Dates *dp = getDates();
|
|
if ( ! dp || dp == (Dates *)-1 ) return (SectionVotingTable *)dp;
|
|
// hash of all adjacent tag pairs
|
|
uint32_t *tph = getTagPairHash32 ( ) ;
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
|
|
// are we a site root url?
|
|
//char *isRoot = getIsSiteRoot();
|
|
//if ( ! isRoot || isRoot == (char *)-1 )
|
|
// return (SectionVotingTable *)isRoot;
|
|
|
|
// init table
|
|
if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL;
|
|
// . tally the section votes from the sections class
|
|
// . only add the date votes, not the taghash/contenthash keys
|
|
// from the root, since we add those from the root voting table
|
|
// into m_osvt directly!
|
|
// . we no longer have root voting table!
|
|
// . this adds keys of the hash of each tag xpath
|
|
// . and it adds keys of the hash of each tag path PLUS its innerhtml
|
|
if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL;
|
|
// tally the section votes from the dates
|
|
if ( ! dp->addVotes ( &m_nsvt ) ) return NULL;
|
|
// our new section voting table is now valid, and ready to be added
|
|
// to sectiondb by calling SectionVotingTable::hash()
|
|
m_nsvtValid = true;
|
|
return &m_nsvt;
|
|
}
|
|
|
|
|
|
// . scan every section and look up its tag and content hashes in
|
|
// sectiondb to find out how many pages and sites have the same hash
|
|
// . use the secondary sectiondb key, key2
|
|
// . then store the stats in the Sections::m_stats class
|
|
Sections *XmlDoc::getSectionsWithDupStats ( ) {
|
|
|
|
Sections *ss = getSections();
|
|
if ( !ss ||ss==(Sections *)-1) return(Sections *)ss;
|
|
|
|
if ( m_gotDupStats ) return ss;
|
|
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32;
|
|
uint32_t siteHash32 = (uint32_t)*sh32;
|
|
|
|
//int64_t *shp64 = getSiteHash64();
|
|
//if ( ! shp64 || shp64 == (void *)-1 ) return (Sections *)shp64;
|
|
//int64_t siteHash48 = *shp64 & 0x0000ffffffffffffLL;
|
|
|
|
// first time called? then init m_nextSection.
|
|
//Section *si = m_si;
|
|
|
|
// if this is -1, we are called for the first time
|
|
if ( m_si == (void *)-1 ) {
|
|
m_si = ss->m_rootSection;
|
|
m_mcastRequestsIn = 0;
|
|
m_mcastRequestsOut = 0;
|
|
m_secStatsErrno = 0;
|
|
}
|
|
|
|
|
|
//sec_t menuFlags = SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ;
|
|
|
|
for ( ; m_si ; m_si = m_si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index.
|
|
if ( ! ( m_si->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
|
|
// skip if sentence, only hash tags now i guess for diffbot
|
|
//if ( m_si->m_sentenceContentHash64 )
|
|
// continue;
|
|
|
|
// get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64;
|
|
if ( ! val32 )
|
|
continue;
|
|
|
|
// skip if menu!
|
|
//if ( m_si->m_flags & menuFlags ) continue;
|
|
|
|
// get section xpath hash combined with sitehash
|
|
uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32;
|
|
|
|
// convert this to 32 bits
|
|
uint32_t innerHash32 ;
|
|
//sentHash32 = (uint32_t)m_si->m_sentenceContentHash64;
|
|
innerHash32 = (uint32_t)m_si->m_indirectSentHash64;
|
|
|
|
// save in case we need to read more than 5MB
|
|
//m_lastSection = si;
|
|
// . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32
|
|
// . we hack the "sentContentHash32" into each posdb key
|
|
// as the "value" so we can do a facet-like histogram
|
|
// over all the possible values this xpath has for this site
|
|
SectionStats *stats = getSectionStats ( secHash32,
|
|
innerHash32,
|
|
false ); // cache only?
|
|
// it returns -1 if would block
|
|
if ( stats == (void *)-1 ) {
|
|
// count it as outstanding
|
|
//m_mcastRequestsOut++;
|
|
// launch more if we have room
|
|
// UdpServer.cpp has a limit of 10 on 0x39 requests
|
|
if ( m_mcastRequestsOut - m_mcastRequestsIn < 10)
|
|
continue;
|
|
// advance m_si so we do not repeat
|
|
m_si = m_si->m_next;
|
|
// otherwise, return -1 to indicate blocked
|
|
return (Sections *)-1;
|
|
}
|
|
// NULL means g_errno
|
|
if ( ! stats ) {
|
|
// ensure g_errno is set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// save it
|
|
m_secStatsErrno = g_errno;
|
|
// clear it
|
|
g_errno = 0;
|
|
// if still waiting though return -1
|
|
if ( m_mcastRequestsOut > m_mcastRequestsIn )
|
|
return (Sections *)-1;
|
|
// otherwise, all done i guess
|
|
return NULL;
|
|
}
|
|
// if already in the table, skip it!
|
|
}
|
|
|
|
// waiting for more replies to come back?
|
|
if ( m_mcastRequestsOut > m_mcastRequestsIn )
|
|
return (Sections *) -1;
|
|
|
|
// now scan the sections and copy the stats from the table
|
|
// into Section::m_stats of each sentence section.
|
|
// use the key hash as the the hash of the tag/xpath and the innerhtml
|
|
// and the val instead of being site hash will be hash of the
|
|
// content. then we can get the histogram of our content hash
|
|
// for this xpath on our site.
|
|
Section *si = ss->m_rootSection;
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if no content to hash
|
|
//if ( ! si->m_sentenceContentHash64 ) continue;
|
|
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index
|
|
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
|
|
// skip if sentence, only hash tags now i guess for diffbot
|
|
//if ( si->m_sentenceContentHash64 )
|
|
// continue;
|
|
|
|
// get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
|
|
if ( ! val32 )
|
|
continue;
|
|
|
|
// skip if menu!
|
|
//if ( si->m_flags & menuFlags ) continue;
|
|
|
|
|
|
// get section xpath hash combined with sitehash
|
|
uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32;
|
|
|
|
// convert this to 32 bits
|
|
uint32_t innerHash32 ;
|
|
innerHash32 = (uint32_t)si->m_indirectSentHash64;
|
|
|
|
// the "stats" class should be in the table from
|
|
// the lookups above!!
|
|
SectionStats *stats = getSectionStats ( secHash32,
|
|
innerHash32,
|
|
true ); // cache only?
|
|
// sanity
|
|
//if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;}
|
|
// must have had a network error or something
|
|
if ( ! stats ) continue;
|
|
// copy
|
|
gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) );
|
|
}
|
|
|
|
//
|
|
// now if a section has no stats but has the same
|
|
// m_indirectSentHash64 as a kid, take his stats
|
|
//
|
|
Section *sx = ss->m_rootSection;
|
|
for ( ; sx ; sx = sx->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index
|
|
if ( ! ( sx->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
// scan up parents and set their stats to ours as int32_t as
|
|
// they have the same indirect sent hash64
|
|
Section *p = sx->m_parent;
|
|
for ( ; p ; p = p->m_parent ) {
|
|
|
|
// if parent is like an img tag, skip it
|
|
if ( p->m_tagId == TAG_IMG )
|
|
continue;
|
|
|
|
if ( p ->m_indirectSentHash64 !=
|
|
sx->m_indirectSentHash64 )
|
|
break;
|
|
|
|
// copy it to parent with the same inner html hash
|
|
gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats));
|
|
}
|
|
}
|
|
|
|
// now free the table's mem
|
|
m_sectionStatsTable.reset();
|
|
|
|
m_gotDupStats = true;
|
|
return ss;
|
|
}
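// The stats lookups above key everything off two 32-bit hashes: the xpath
// (turk tag) hash XOR'd with the site hash, which names the
// gbfacetstr:gbxpathsitehash<N> termlist, and the truncated inner-html hash,
// which is the facet value stored in the posdb keys. A sketch of building
// that query string from the two inputs (the field name is the real one used
// below in getSectionStats(); the helper itself is illustrative and not
// compiled):
#if 0
static void makeXpathFacetQuery ( uint32_t xpathHash32 , uint32_t siteHash32 ,
				  char *buf , int32_t bufSize ) {
	uint32_t secHash32 = xpathHash32 ^ siteHash32;
	snprintf ( buf , bufSize ,
		   "gbfacetstr:gbxpathsitehash%"UINT32"" ,
		   (uint32_t)secHash32 );
}
#endif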
|
|
|
|
static void gotReplyWrapper39 ( void *state1 , void *state2 ) {
|
|
//XmlDoc *THIS = (XmlDoc *)state;
|
|
XmlDoc *THIS = (XmlDoc *)state1;
|
|
Multicast *mcast = (Multicast *)state2;
|
|
THIS->gotSectionFacets ( mcast );
|
|
// this will end up calling getSectionsWithDupStats() again
|
|
// which will call getSectionStats() some more on new sections
|
|
// until m_gotDupStats is set to true.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
|
|
// . launch a single msg3a::getDocIds() for a section hash, secHash32
|
|
SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 ,
|
|
uint32_t innerHash32 ,
|
|
bool cacheOnly ) {
|
|
|
|
// init cache?
|
|
if ( m_sectionStatsTable.m_numSlots == 0 &&
|
|
! m_sectionStatsTable.set(4,
|
|
sizeof(SectionStats),
|
|
32,
|
|
NULL,
|
|
0,
|
|
false,
|
|
m_niceness,
|
|
"secstatsch"))
|
|
return NULL;
|
|
|
|
// check in cache...
|
|
SectionStats *stats ;
|
|
stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 );
|
|
// if there, return it
|
|
if ( stats ) return stats;
|
|
|
|
// if cache only do not launch
|
|
if ( cacheOnly ) return NULL;
|
|
|
|
//
|
|
// TODO: shard gbxpathsitehashxxxxx by termid
|
|
// and make sure msg3a only sends to that single shard and sends
|
|
// the stats back. should make us much faster to sectionize
|
|
// a web page. but for now try without it...
|
|
//
|
|
|
|
//int32_t *sh32 = getSiteHash32();
|
|
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32;
|
|
|
|
int32_t maxOut = 32;
|
|
|
|
// . need to make new msg39Request and a new Multicast arrays
|
|
// . only need multicast since these gbfacetstr:gbxpathsitehash123456
|
|
// terms are sharded by termid, otherwise we'd have to use msg3a
|
|
if ( ! m_mcastArray ) {
|
|
// how much mem to alloc?
|
|
int32_t need = 0;
|
|
need += sizeof(Multicast);
|
|
need += sizeof(Msg39Request);
|
|
// query buf str
|
|
need += 100;
|
|
need *= maxOut;
|
|
// a single query now to be shared
|
|
//need += sizeof(Query);
|
|
// just in case we are being re-used
|
|
m_mcastBuf.reset();
|
|
// alloc space
|
|
if ( ! m_mcastBuf.reserve(need) ) return NULL;
|
|
// point to buf
|
|
char *p = m_mcastBuf.getBufStart();
|
|
// set them up
|
|
m_mcastArray = (Multicast *)p;
|
|
p += sizeof(Multicast) * maxOut;
|
|
m_msg39RequestArray = (Msg39Request *)p;
|
|
p += sizeof(Msg39Request) * maxOut;
|
|
//m_queryArray = (Query *)p;
|
|
//p += sizeof(Query) * maxOut;
|
|
//m_sharedQuery = (Query *)p;
|
|
//p += sizeof(Query);
|
|
// for holding the query string
|
|
// assume query will not exceed 100 bytes including \0
|
|
m_queryBuf = p;
|
|
p += 100 * maxOut;
|
|
// initialize all!
|
|
for ( int32_t i = 0 ; i < maxOut ; i++ ) {
|
|
m_mcastArray [i].constructor();
|
|
m_msg39RequestArray[i].reset();//constructor();
|
|
//m_queryArray [i].constructor();
|
|
m_queryBuf[100*i] = '\0';
|
|
//m_inUse[i] = 0;
|
|
}
|
|
}
|
|
|
|
// get first available
|
|
int32_t i;
|
|
for ( i = 0 ; i < maxOut ; i++ )
|
|
if ( ! m_mcastArray[i].m_inUse ) break;
|
|
|
|
// wtf?
|
|
if ( i >= maxOut ) { char *xx=NULL;*xx=0; }
|
|
|
|
// and our vehicle
|
|
Multicast *mcast = &m_mcastArray[i];
|
|
|
|
// mark as in use up here in case we quickpoll into this same code?!
|
|
// yeah, i guess set2() calls quickpoll?
|
|
//mcast->m_inUse = 1;
|
|
|
|
// save this for reply
|
|
//mcast->m_hack = this;
|
|
|
|
char *qbuf = m_queryBuf + 100 * i;
|
|
|
|
// . hash this special term (was gbsectionhash)
|
|
// . the wordbits etc will be a number though, the hash of the content
|
|
// of the xpath, the inner html hash
|
|
// . preceding this term with gbfacet: will make gigablast return
|
|
// the statistics for all the values in the posdb keys of this
|
|
// termlist, which happen to be innerHTML hashes for all pages
|
|
// with this same xpath and on this same site.
|
|
sprintf(qbuf,"gbfacetstr:gbxpathsitehash%"UINT32"",
|
|
(uint32_t)secHash32);
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// set the msg39 request
|
|
Msg39Request *r = &m_msg39RequestArray[i];
|
|
|
|
// reset all to defaults
|
|
r->reset();
|
|
|
|
//r-> ptr_coll = cr->m_coll;
|
|
//r->size_coll = gbstrlen(cr->m_coll)+1;
|
|
r->m_collnum = cr->m_collnum;
|
|
r->m_maxAge = 60; // cache timeout?
|
|
r->m_addToCache = true;
|
|
r->m_docsToGet = 0; // just calc stats
|
|
r->m_niceness = m_niceness;
|
|
r->m_debug = 0;
|
|
r->m_doSiteClustering = false;
|
|
//r->m_doIpClustering = false;
|
|
r->m_doDupContentRemoval = false;
|
|
r->m_boolFlag = 2;
|
|
r->m_familyFilter = 0;
|
|
r->m_language = 0;
|
|
r->ptr_query = qbuf;//m_sectionHashQueryBuf;
|
|
r->size_query = gbstrlen(r->ptr_query)+1;
|
|
r->m_timeout = 3600; //-1;// auto-determine based on #terms
|
|
r->m_maxQueryTerms = 10;
|
|
|
|
// how much of each termlist to read in bytes
|
|
int32_t readList = 10000;
|
|
r-> ptr_readSizes = (char *)&readList;
|
|
r->size_readSizes = 4;
|
|
|
|
// term freqs
|
|
float tfw = 1.0;
|
|
r-> ptr_termFreqWeights = (char *)&tfw;
|
|
r->size_termFreqWeights = 4;
|
|
|
|
// speed it up some with this flag
|
|
r->m_forSectionStats = true;
|
|
|
|
// only do a single read of docids... do not split up
|
|
r->m_numDocIdSplits = 1;
|
|
|
|
// 1 query term
|
|
r->m_nqt = 1;
|
|
|
|
///////////////////////
|
|
//
|
|
// this tells msg3a/msg39/posdbtable it's a hack! no need to do this
|
|
// because it's implied by the query.
|
|
// BUT REALLY let's eliminate this and just make our queries like
|
|
// gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of
|
|
// the section's xpath with the site. the values of that term in
|
|
// the posdb key will be 32-bit hashes of the innerHtml for such
|
|
// sections from all pages with the same xpath on the same site.
|
|
// so no need for this now, comment out.
|
|
//
|
|
//r->m_getFacetStats = true;
|
|
//
|
|
/////////////////////////
|
|
|
|
|
|
// we need to know what site is the base site so the section stats
|
|
// can set m_onSiteDocIds and m_offSiteDocIds correctly
|
|
//r->m_siteHash32 = *sh32;
|
|
|
|
// . now we use the hash of the innerHtml of the xpath
|
|
// . this is our value for the facet field of gbxpathsitehash12345678
|
|
// which is the hash of the innerHTML for that xpath on this site.
|
|
// 12345678 is the hash of the xpath and the site.
|
|
//r->m_myFacetVal32 = sentHash32;
|
|
|
|
|
|
//Query *qq = &m_queryArray[i];
|
|
// set query for msg3a. queryExpansion=false
|
|
//qq->set2 ( r->ptr_query , langUnknown , false );
|
|
|
|
Query qq;
|
|
qq.set2 ( r->ptr_query , langUnknown , false );
|
|
|
|
// TODO: ensure this just hits the one host since it is sharded
|
|
// by termid...
|
|
|
|
// what shard owns this termlist. we shard these
|
|
// gbfacetstr:gbxpathsitehash123456 terms by termid.
|
|
int64_t termId = qq.getTermId(0);
|
|
int32_t shardNum = getShardNumFromTermId ( termId );
|
|
|
|
// hack in our inner html content hash for this xpath
|
|
mcast->m_hack32 = innerHash32;
|
|
mcast->m_hack64 = secHash32;
|
|
|
|
// malloc and store the request. mcast will free it when done.
|
|
int32_t reqSize;
|
|
char *req = serializeMsg ( sizeof(Msg39Request),
|
|
&r->size_readSizes,
|
|
&r->size_whiteList,
|
|
&r->ptr_readSizes,
|
|
r,
|
|
&reqSize,
|
|
NULL,
|
|
0,
|
|
false);
|
|
|
|
// . send out a msg39 request to each shard
|
|
// . multicasts to a host in group "groupId"
|
|
// . we always block waiting for the reply with a multicast
|
|
// . returns false and sets g_errno on error
|
|
// . sends the request to fastest host in group "groupId"
|
|
// . if that host takes more than about 5 secs then sends to
|
|
// next host
|
|
// . key should be largest termId in group we're sending to
|
|
bool status;
|
|
status = mcast->send ( req , // m_rbufPtr ,
|
|
reqSize , // request size
|
|
0x39 , // msgType 0x39
|
|
true , // mcast owns m_request?
|
|
shardNum , // group to send to
|
|
false , // send to whole group?
|
|
0,//(int32_t)qh , // 0 // startKey.n1
|
|
this , // state1 data
|
|
mcast , // state2 data
|
|
gotReplyWrapper39 ,
|
|
30 , //timeout in secs
|
|
m_niceness,//m_r->m_niceness ,
|
|
false , // realtime?
|
|
-1, // firstHostId, // -1// bestHandlingHostId ,
|
|
NULL , // m_replyBuf ,
|
|
0 , // MSG39REPLYSIZE,
|
|
// this is true if multicast should free the
|
|
// reply, otherwise caller is responsible
|
|
// for freeing it after calling
|
|
// getBestReply().
|
|
// actually, this should always be false,
|
|
// there is a bug in Multicast.cpp.
|
|
// no, if we error out and never steal
|
|
// the buffers then they will go unfreed
|
|
// so they are freed by multicast by default
|
|
// then we steal control explicitly
|
|
true );
|
|
|
|
m_mcastRequestsOut++;
|
|
|
|
// if successfully launch, wait...
|
|
if ( status ) return (SectionStats *) -1;
|
|
|
|
// error?
|
|
if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; }
|
|
|
|
// sets &m_sectionStats and adds to the table
|
|
gotSectionFacets ( mcast );
|
|
|
|
// i guess did not block...
|
|
//return &msg3a->m_sectionStats;
|
|
return &m_sectionStats;
|
|
}
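// getSectionStats() above keeps a fixed pool of Multicast/Msg39Request slots
// and throttles on m_mcastRequestsOut - m_mcastRequestsIn before picking the
// first free slot. The slot-picking reduces to the sketch below; the real
// caller avoids ever exhausting the pool by throttling first (illustrative
// only, not compiled):
#if 0
static int32_t pickFreeSlot ( const bool *inUse , int32_t maxOut ,
			      int32_t out , int32_t in , int32_t maxInFlight ) {
	// too many requests still outstanding? tell the caller to wait
	if ( out - in >= maxInFlight ) return -1;
	for ( int32_t i = 0 ; i < maxOut ; i++ )
		if ( ! inUse[i] ) return i;
	return -1;	// pool exhausted; also means "come back later"
}
#endif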
|
|
|
|
// . come here when msg39 got the ptr_faceHashList for our single
|
|
// gbfacet:gbxpathsitehash
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
|
|
//SectionStats *stats = &msg39->m_sectionStats;
|
|
|
|
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;}
|
|
|
|
// count it as returned
|
|
m_mcastRequestsIn++;
|
|
// mark it as available now
|
|
int32_t num = mcast - m_mcastArray;
|
|
// sanity
|
|
//if ( ! msg39->m_inUse ) { char *xx=NULL;*xx=0; }
|
|
|
|
// grab the xpath/site hash
|
|
uint32_t secHash32 = mcast->m_hack64;
|
|
|
|
// and our innher html for that xpath
|
|
int32_t myFacetVal32 = mcast->m_hack32;
|
|
|
|
// sanity. should only be a gbfacet:gbxpathsitehash12345567 term.
|
|
//if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// reset all counts to 0
|
|
m_sectionStats.reset();
|
|
|
|
//////
|
|
//
|
|
// compile m_sectionStats
|
|
//
|
|
///////
|
|
|
|
// set m_sectionStats from the list of facet values for this
|
|
// gbfacet:xpathsitehash term...
|
|
// Query::m_queryTerm.m_facetHashTable has the facets merged
|
|
// from all the shards. so now compute the stats from them.
|
|
// set the section stats.
|
|
//QueryTerm *qt = &msg3a->m_q->m_qterms[0];
|
|
//HashTableX *ft = &qt->m_facetHashTable;
|
|
|
|
// . get the list of facet field/value pairs.
|
|
// . see how Msg3a.cpp merges these to see how they are stored
|
|
Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply();
|
|
|
|
// this is NULL with g_errno set on error
|
|
if ( ! mr ) {
|
|
log("xmldoc: got error from sec stats mcast: %s",
|
|
mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
|
|
deserializeMsg ( sizeof(Msg39Reply) ,
|
|
&mr->size_docIds,
|
|
&mr->size_clusterRecs,
|
|
&mr->ptr_docIds,
|
|
mr->m_buf );
|
|
|
|
char *p = (char *)(mr->ptr_facetHashList);
|
|
//char *pfinal = p + mr->size_facetHashList;
|
|
|
|
//
|
|
// should only be one termid of facets in here, so no need to re-loop
|
|
//
|
|
int32_t nh = 0;
|
|
// "matches" is how many docids with this facet field had our facet val
|
|
int32_t matches = 0;
|
|
// "totalDocIds" is how many docids had this facet field
|
|
int32_t totalFields = 0;
|
|
|
|
if ( p ) {
|
|
// first is the termid
|
|
//int64_t termId = *(int64_t *)p;
|
|
// skip that
|
|
p += 8;
|
|
// the # of unique 32-bit facet values
|
|
nh = *(int32_t *)p;
|
|
p += 4;
|
|
// the end point
|
|
char *pend = p + (8 * nh);
|
|
// now compile the facet hash list into there
|
|
for ( ; p < pend ; ) {
|
|
// does this facet value match ours?
|
|
// (i.e. same inner html?)
|
|
if ( *(int32_t *)p == myFacetVal32 )
|
|
matches += *(int32_t *)(p+4);
|
|
p += 4;
|
|
// now how many docids had this facet value?
|
|
totalFields += *(int32_t *)p;
|
|
p += 4;
|
|
}
|
|
}
|
|
|
|
// how many unique inner html content hashes for this xpath/site
|
|
// hash were there?
|
|
m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed;
|
|
|
|
// how many xpaths existsed over all docs. doc can have multiple.
|
|
m_sectionStats.m_totalEntries = totalFields;
|
|
|
|
// total # unique docids that had this facet
|
|
m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits;
|
|
|
|
// how many had the same inner html content hash for
|
|
// this xpath/site as we did?
|
|
m_sectionStats.m_totalMatches = matches;
|
|
|
|
////////
|
|
//
|
|
// store m_sectionStats in cache
|
|
//
|
|
////////
|
|
|
|
// cache them. this does a copy of m_sectionStats
|
|
if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) )
|
|
log("xmldoc: failed to add sections stats: %s",
|
|
mstrerror(g_errno));
|
|
|
|
// reset that msg39 to free its data
|
|
//msg39->reset();
|
|
|
|
if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . make it available again
|
|
// . do this after all in case we were in quickpoll interrupting
|
|
// the getSectionStats() function below
|
|
//mcast->m_inUse = 0;
|
|
|
|
// free query Query::m_qwords array etc. to stop mem leaks
|
|
m_mcastArray [num].reset();
|
|
m_msg39RequestArray[num].reset();
|
|
//m_queryArray [num].reset();
|
|
// now when the master loop calls getSectionsWithDupStats() it
|
|
// should find the stats class in the cache!
|
|
return true;
|
|
}
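// The facet list decoded above is a flat buffer: an 8-byte termId, a 4-byte
// count of unique facet values, then that many (value32,count32) pairs. A
// standalone restatement of that decode, returning how many docs matched our
// inner-html hash and how many had the xpath at all (illustrative only, not
// compiled):
#if 0
static void parseFacetList ( const char *p , uint32_t myFacetVal32 ,
			     int32_t *matches , int32_t *totalFields ) {
	*matches     = 0;
	*totalFields = 0;
	if ( ! p ) return;
	p += 8;					// skip the 8-byte termId
	int32_t nh; memcpy ( &nh , p , 4 ); p += 4;
	for ( int32_t i = 0 ; i < nh ; i++ ) {
		uint32_t val; memcpy ( &val , p , 4 ); p += 4;
		int32_t  cnt; memcpy ( &cnt , p , 4 ); p += 4;
		if ( val == myFacetVal32 ) *matches += cnt;
		*totalFields += cnt;
	}
}
#endif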
|
|
|
|
|
|
// . for all urls from this subdomain...
|
|
// . EXCEPT root url since we use msg17 to cache that, etc.
|
|
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
|
|
|
|
if ( m_osvtValid ) return &m_osvt;
|
|
|
|
// do not consult sectiondb if we are set from the title rec,
|
|
// that way we avoid parsing inconsistencies since sectiondb changes!
|
|
if ( m_setFromTitleRec ) {
|
|
char *p = ptr_sectiondbData;
|
|
m_osvtValid = true;
|
|
m_osvt.m_totalSiteVoters = 0;
|
|
if ( size_sectiondbData <= 4 ) return &m_osvt;
|
|
m_osvt.m_totalSiteVoters = *(int32_t *)p;
|
|
p += 4;
|
|
int32_t remaining = size_sectiondbData - 4;
|
|
m_osvt.m_svt.deserialize(p,remaining,m_niceness);
|
|
return &m_osvt;
|
|
}
|
|
|
|
// returns empty table if WE are the site root url!
|
|
//HashTableX *rvt = getRootVotingTable();
|
|
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
|
|
|
|
// need sections
|
|
//Sections *ss = getSections();
|
|
//if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
|
|
|
|
// hash of all adjacent tag pairs
|
|
uint32_t *tph = getTagPairHash32 ( ) ;
|
|
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
|
|
|
|
int64_t *siteHash64 = getSiteHash64();
|
|
if ( ! siteHash64 || siteHash64 == (void *)-1 )
|
|
return (SectionVotingTable *)siteHash64;
|
|
|
|
// the docid
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . for us, dates are really containers of the flags and tag hash
|
|
// . init this up here, it is re-set if we re-call getSectiondbList()
|
|
// because there were too many records in it to handle in one read
|
|
if ( m_numSectiondbReads == 0 ) {
|
|
// init table
|
|
if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL;
|
|
// use site hash as the main thing
|
|
int64_t termId = *siteHash64 & TERMID_MASK;
|
|
// . start key for reading list from sectiondb
|
|
// . read all the section votes for this site
|
|
m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff);
|
|
// how many reads we have to do...
|
|
m_numSectiondbNeeds = 1;
|
|
}
|
|
|
|
//bool skipRecall = false;
|
|
// always read 5MB at a time from sectiondb
|
|
int32_t minRecSizes = 5000000;
|
|
|
|
// crap! host #28 is being totally slammed!!!!!
|
|
// why?????? in the meantime do this
|
|
//minRecSizes = 100000;
|
|
//skipRecall = true;
|
|
|
|
// is it facebook?
|
|
bool limitSectiondb = false;
|
|
// limit now to speed up repair rebuild
|
|
// limit now to speed up injection!
|
|
limitSectiondb = true;
|
|
// facebook lists often clog the tree, and when we read 2MB worth of
|
|
// it, it takes 100ms, so reduce to 50k to so it takes 2.5ms...
|
|
// because facebook is a well structured xml feed so why read any
|
|
// really!
|
|
if ( limitSectiondb ) minRecSizes = 50000;
|
|
|
|
key128_t *lastKey = NULL;
|
|
|
|
// if msg0 blocked and came back with g_errno set, like
|
|
// in preparing to merge it got an OOM
|
|
if ( g_errno ) {
|
|
log("build: sectiondb read2: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
|
|
readLoop:
|
|
// before looking up TitleRecs using Msg20, let's first consult
|
|
// datedb to see if we got adequate data as to what sections
|
|
// are the article sections
|
|
|
|
// only get the list once
|
|
if ( m_numSectiondbReads < m_numSectiondbNeeds ) {
|
|
// only do this once
|
|
m_numSectiondbReads++;
|
|
// make the termid
|
|
uint64_t termId = *siteHash64 & TERMID_MASK;
|
|
// end key is always the same
|
|
key128_t end = g_datedb.makeEndKey ( termId , 0 );
|
|
// shortcut
|
|
Msg0 *m = &m_msg0;
|
|
// get the group this list is in (split = false)
|
|
uint32_t shardNum;
|
|
shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey);
|
|
// we need a group # from the groupId
|
|
//int32_t split = g_hostdb.getGroupNum ( gid );
|
|
// note it
|
|
//logf(LOG_DEBUG,"sections: "
|
|
// "reading list from sectiondb: "
|
|
// "sk.n1=0x%"XINT64" sk.n0=0x%"XINT64" "
|
|
// "ek.n1=0x%"XINT64" ek.n0=0x%"XINT64" "
|
|
// ,m_sectiondbStartKey.n1
|
|
// ,m_sectiondbStartKey.n0
|
|
// ,end.n1
|
|
// ,end.n0
|
|
// );
|
|
// . get the list
|
|
// . gets all votes for one particular site
|
|
if ( ! m->getList ( -1 , // hostId
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxCacheAge
|
|
false , // addToCache
|
|
RDB_SECTIONDB , // was RDB_DATEDB
|
|
cr->m_collnum ,
|
|
&m_secdbList ,
|
|
(char *)&m_sectiondbStartKey ,
|
|
(char *)&end ,
|
|
minRecSizes ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // MAX_NICENESS
|
|
// default parms follow
|
|
true , // doErrorCorrection?
|
|
true , // includeTree?
|
|
true , // doMerge?
|
|
-1 , // firstHostId
|
|
0 , // startFileNum
|
|
-1 , // numFiles
|
|
999995 , // timeout
|
|
-1 , // syncPoint
|
|
-1 , // preferLocalReads
|
|
NULL , // msg5
|
|
NULL , // msg5b
|
|
false , // isrealmerge?
|
|
true , // allowpagecache?
|
|
false , // forceLocalIndexdb?
|
|
false , // doIndexdbSplit?
|
|
shardNum ) )//split ))
|
|
// return -1 if blocks
|
|
return (SectionVotingTable *)-1;
|
|
// error?
|
|
if ( g_errno ) {
|
|
log("build: sectiondb read: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// it also returns the lastKey in the list so we can use that to
|
|
// set the startKey for a re-call if we read >= 5MB
|
|
lastKey = NULL;
|
|
|
|
//logf(LOG_DEBUG,"sections: read list of %"INT32" bytes",
|
|
// m_secdbList.m_listSize);
|
|
|
|
bool recall = true;
|
|
|
|
if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false;
|
|
|
|
// . unless it had special byte set in Msg0.cpp HACK
|
|
// . we send back a compressed list and tack on an extra 0 byte at
|
|
// the end so that we know we had a full list!
|
|
if ( (m_secdbList.m_listSize % 2) == 1 ) {
|
|
m_secdbList.m_listSize--;
|
|
m_secdbList.m_listEnd --;
|
|
recall = true;
|
|
}
|
|
|
|
// no longer bother re-calling, because facebook is way slow...
|
|
if ( limitSectiondb ) recall = false;
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . compile the votes from sectiondb for this site into a hashtable
|
|
// . m_osvt is a SectionVotingTable and each entry in the hashtable
|
|
// is a SectionVote class.
|
|
// . the taghash is the key of the vote and is a hash of all the
|
|
// nested tags the section is in.
|
|
// . another vote uses the tag hash hashed with the hash of the
|
|
// content contained by the section
|
|
// . using these two vote counts we set Section::m_votesForDup
|
|
// or Section::m_votesForNotDup counts which let us know how the
|
|
// section is repeated or not repeated on the site
|
|
// . SectionVote::m_score is always 1.0 from what i can tell
|
|
// cuz it seems like addVote*() always uses a score of 1.0
|
|
// . SectionVote::m_numSampled is how many times that tagHash
|
|
// occurs in the document.
|
|
if ( ! m_osvt.addListOfVotes(&m_secdbList,
|
|
&lastKey,
|
|
*tph,
|
|
*d , // docid
|
|
m_niceness))
|
|
return NULL;
|
|
|
|
// why is this always zero it seems?
|
|
if ( g_conf.m_logDebugBuild )
|
|
log("xmldoc: added sectiondblist size=%"INT32" recall=%"INT32"",
|
|
m_secdbList.m_listSize,(int32_t)recall);
|
|
|
|
// . recall? yes if we had to truncate our list...
|
|
// . we need to be able to scan all votes for the website... that is
|
|
// why we recall here
|
|
// . limit votes by a special sectiondb key then that is a vote...
|
|
if ( recall ) {
|
|
// another debug
|
|
//logf(LOG_DEBUG,"sections: recallling read");
|
|
// just note it for now
|
|
//if ( m_sectiondbRecall > 5 )
|
|
if ( m_numSectiondbNeeds > 5 )
|
|
logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%"INT32"",
|
|
m_sectiondbRecall++);
|
|
// we should really limit voting per site! we do now!
|
|
//if ( m_recall > 5 ) { char *xx=NULL;*xx=0; }
|
|
// update our start key
|
|
if ( lastKey ) m_sectiondbStartKey = *lastKey;
|
|
// inc by 2 since we already had this key
|
|
m_sectiondbStartKey += 2;
|
|
		// count this additional sectiondb read
|
|
m_numSectiondbNeeds++;
|
|
// and repeat
|
|
goto readLoop;
|
|
}
|
|
|
|
//
|
|
// set ptr_sectiondbData so this can be set from a title rec without
|
|
// having to lookup in sectiondb again which might have changed!
|
|
//
|
|
m_sectiondbData.purge();
|
|
// alloc
|
|
int32_t need = m_osvt.m_svt.getStoredSize() + 4;
|
|
if ( ! m_sectiondbData.reserve(need) )
|
|
// oom error?
|
|
return NULL;
|
|
// serialize this number
|
|
m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters);
|
|
// serialize the hashtablex
|
|
m_osvt.m_svt.serialize ( &m_sectiondbData );
|
|
// reference it for title rec serialization
|
|
ptr_sectiondbData = m_sectiondbData.getBufStart();
|
|
size_sectiondbData = m_sectiondbData.length();
|
|
|
|
m_osvtValid = true;
|
|
return &m_osvt;
|
|
}
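
// . note on the readLoop/recall pattern above: sectiondb is read in
//   chunks of up to minRecSizes bytes. per the Msg0.cpp hack noted above,
//   a tacked-on extra byte (odd m_listSize) marks a list that filled up,
//   so we strip the byte and force another read. the next read resumes at
//   lastKey incremented by 2 so the key we already processed is skipped.
// . the serialized m_osvt table (section vote counts keyed by tag hash)
//   is stored in ptr_sectiondbData so a later set-from-titlerec can skip
//   the sectiondb lookups entirely, which may have changed since then.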
|
|
|
|
int32_t *XmlDoc::getLinkSiteHashes ( ) {
|
|
if ( m_linkSiteHashesValid )
|
|
return (int32_t *)m_linkSiteHashBuf.getBufStart();
|
|
// get the outlinks
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
|
|
|
// . get the outlink tag rec vector
|
|
// . each link's tagrec may have a "site" tag that is basically
|
|
// the cached SiteGetter::getSite() computation
|
|
TagRec ***grv = NULL;
|
|
if ( ! m_setFromTitleRec ) {
|
|
grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (int32_t *)grv;
|
|
}
|
|
|
|
// how many outlinks do we have on this page?
|
|
int32_t n = links->getNumLinks();
|
|
|
|
// reserve space
|
|
m_linkSiteHashBuf.purge();
|
|
if ( ! m_linkSiteHashBuf.reserve ( n * 4 ) ) return NULL;
|
|
|
|
if ( n == 0 ) {
|
|
ptr_linkdbData = NULL;
|
|
size_linkdbData = 0;
|
|
return (int32_t *)0x1234;
|
|
}
|
|
|
|
// if set from titlerec then assume each site is the full hostname
|
|
	// of the link, unless it's specified explicitly in the hashtablex
|
|
// serialized in ptr_linkdbData
|
|
if ( m_setFromTitleRec ) {
|
|
// this holds the sites that are not just the hostname
|
|
int32_t *p = (int32_t *)ptr_linkdbData;
|
|
int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData);
|
|
// loop over links
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the link
|
|
char *u = links->getLinkPtr(i);
|
|
// assume site is just the host
|
|
int32_t hostLen = 0;
|
|
char *host = ::getHost ( u , &hostLen );
|
|
int32_t siteHash32 = hash32 ( host , hostLen , 0 );
|
|
			// unless given otherwise in ptr_linkdbData
|
|
if ( p < pend && *p == i ) {
|
|
p++;
|
|
siteHash32 = *p;
|
|
p++;
|
|
}
|
|
// store that then. should not fail since we allocated
|
|
// right above
|
|
if ( ! m_linkSiteHashBuf.pushLong(siteHash32) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
// return ptr of array, which is a safebuf
|
|
return (int32_t *)m_linkSiteHashBuf.getBufStart();
|
|
}
|
|
|
|
// ptr_linkdbData will point into this buf
|
|
m_linkdbDataBuf.purge();
|
|
|
|
// loop through them
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the link
|
|
char *u = links->getLinkPtr(i);
|
|
// get full host from link
|
|
int32_t hostLen = 0;
|
|
char *host = ::getHost ( u , &hostLen );
|
|
int32_t hostHash32 = hash32 ( host , hostLen , 0 );
|
|
// get the site
|
|
TagRec *gr = (*grv)[i];
|
|
char *site = NULL;
|
|
int32_t siteLen = 0;
|
|
if ( gr ) {
|
|
int32_t dataSize = 0;
|
|
site = gr->getString("site",NULL,&dataSize);
|
|
if ( dataSize ) siteLen = dataSize - 1;
|
|
}
|
|
// otherwise, make it the host or make it cut off at
|
|
// a "/user/" or "/~xxxx" or whatever path component
|
|
if ( ! site ) {
|
|
// GUESS link site... like /~xxx
|
|
site = host;
|
|
siteLen = hostLen;
|
|
}
|
|
int32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
|
|
		// only store if different from the host hash itself
|
|
if ( linkeeSiteHash32 != hostHash32 ) {
|
|
if ( ! m_linkdbDataBuf.pushLong(i) )
|
|
return NULL;
|
|
if ( ! m_linkdbDataBuf.pushLong(linkeeSiteHash32) )
|
|
return NULL;
|
|
}
|
|
// store it always in this buf
|
|
if ( ! m_linkSiteHashBuf.pushLong(linkeeSiteHash32) ) {
|
|
// space should have been reserved above!
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
// set ptr_linkdbData
|
|
ptr_linkdbData = m_linkdbDataBuf.getBufStart();
|
|
size_linkdbData = m_linkdbDataBuf.length();
|
|
m_linkSiteHashesValid = true;
|
|
|
|
return (int32_t *)m_linkSiteHashBuf.getBufStart();
|
|
}
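
// . layout note for getLinkSiteHashes() above: ptr_linkdbData is a
//   compact list of (int32_t linkIndex, int32_t siteHash32) pairs stored
//   only for outlinks whose site hash differs from the hash of their full
//   hostname; every other outlink's site hash is just
//   hash32(host,hostLen,0), which is what the set-from-titlerec branch
//   assumes.
// . minimal decode sketch (illustrative only, mirrors the loop above):
//     int32_t *p    = (int32_t *)ptr_linkdbData;
//     int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData);
//     int32_t siteHash32 = hash32 ( host , hostLen , 0 );
//     if ( p < pend && *p == i ) { p++; siteHash32 = *p; p++; }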
|
|
|
|
Links *XmlDoc::getLinks ( bool doQuickSet ) {
|
|
if ( m_linksValid ) return &m_links;
|
|
// set status
|
|
setStatus ( "getting outlinks");
|
|
|
|
// . add links from diffbot reply
|
|
// . get the reply of json objects from diffbot
|
|
// . this will be empty if we are a json object!
|
|
// . will also be empty if not meant to be sent to diffbot
|
|
// . the TOKENIZED reply consists of \0 separated json objects that
|
|
// we create from the original diffbot reply
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( ! dbr || dbr == (void *)-1 ) return (Links *)dbr;
|
|
|
|
// this will set it if necessary
|
|
Xml *xml = getXml();
|
|
// bail on error
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml;
|
|
// can't call getIsPermalink() here without entering a dependency loop
|
|
char *pp = getIsUrlPermalinkFormat();
|
|
if ( !pp || pp == (char *)-1 ) return (Links *)pp;
|
|
// use the old xml doc
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (Links *)od;
|
|
// get Links class of the old title rec
|
|
Links *oldLinks = NULL;
|
|
// if we were set from a title rec, do not do this
|
|
if ( *od ) {
|
|
oldLinks = (*od)->getLinks();
|
|
if (!oldLinks||oldLinks==(Links *)-1) return (Links *)oldLinks;
|
|
}
|
|
Url *baseUrl = getBaseUrl();
|
|
if ( ! baseUrl || baseUrl==(Url *)-1) return (Links *)baseUrl;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (Links *)ip;
|
|
// this ensures m_contentLen is set
|
|
//char **content = getContent();
|
|
//if ( ! content || content == (char **)-1 ) return (Links *)content;
|
|
// this will set ptr_indCatIds and size_indCatIds
|
|
int32_t **pici = getIndCatIds();
|
|
if ( ! pici || pici == (void *)-1 ) return (Links *)pici;
|
|
char *ict = getIsContentTruncated();
|
|
if ( ! ict || ict == (char *)-1 ) return (Links *)ict;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (Links *)sni;
|
|
// get the latest url we are on
|
|
Url *u = getCurrentUrl();
|
|
|
|
//
|
|
// if we had a EDOCSIMPLIFIEDREDIR error, pretend it is a link
|
|
// so addOutlinkSpiderRecsToMetaList() will add it to spiderdb
|
|
//
|
|
if ( m_indexCodeValid && m_indexCode == EDOCSIMPLIFIEDREDIR ) {
|
|
m_links.set ( m_redirUrl.getUrl(),m_redirUrl.getUrlLen() );
|
|
m_linksValid = true;
|
|
return &m_links;
|
|
}
|
|
|
|
if ( m_indexCodeValid && m_indexCode == EDOCNONCANONICAL ) {
|
|
m_links.set(m_canonicalRedirUrl.getUrl(),
|
|
m_canonicalRedirUrl.getUrlLen());
|
|
m_linksValid = true;
|
|
return &m_links;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
bool useRelNoFollow = true;
|
|
if ( ! cr->m_obeyRelNoFollowLinks ) useRelNoFollow = false;
|
|
// to keep things simple, for diffbot custom crawls, if robots.txt
|
|
// is not used then do not use rel no follow
|
|
if ( ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
|
|
useRelNoFollow = false;
|
|
|
|
// . set it
|
|
// . if parent is a permalink we can avoid its suburl outlinks
|
|
// containing "comment" from being classified as permalinks
|
|
if ( ! m_links.set ( useRelNoFollow ,
|
|
xml ,
|
|
u ,
|
|
true , // setLinkHashes?
|
|
baseUrl ,
|
|
m_version ,
|
|
m_niceness ,
|
|
*pp , // parent url in permalink format?
|
|
oldLinks ,// oldLinks, might be NULL!
|
|
doQuickSet ,
|
|
dbr ) )
|
|
return NULL;
|
|
|
|
m_linksValid = true;
|
|
|
|
// do not bother setting that bit if we are being called for link
|
|
// text because that bit was already in the linkdb key, and it
|
|
// was set to zero! so if getting msg20 reply.... bail now
|
|
if ( m_req ) return &m_links;
|
|
|
|
// . apply link spam settings
|
|
// . set the "spam bits" in the Links class
|
|
setLinkSpam ( *ip ,
|
|
ptr_indCatIds ,
|
|
size_indCatIds / 4 ,
|
|
u , // linker url
|
|
*sni ,
|
|
xml ,
|
|
&m_links ,
|
|
*ict ,
|
|
m_niceness );
|
|
// we got it
|
|
return &m_links;
|
|
}
|
|
|
|
|
|
HashTableX *XmlDoc::getCountTable ( ) {
|
|
// return it if we got it
|
|
if ( m_countTableValid ) return &m_countTable;
|
|
|
|
setStatus ("getting count table");
|
|
|
|
// get the stuff we need
|
|
Xml *xml = getXml ();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (HashTableX *)xml;
|
|
Words *words = getWords ();
|
|
if ( ! words || words == (Words *)-1 ) return (HashTableX *)words;
|
|
Phrases *phrases = getPhrases ();
|
|
if ( ! phrases || phrases==(Phrases *)-1) return (HashTableX *)phrases;
|
|
Bits *bits = getBits ();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (HashTableX *)bits;
|
|
Sections *sections = getSections();
|
|
if ( !sections||sections==(Sections *)-1) return(HashTableX *)sections;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (HashTableX *)info1;
|
|
|
|
// . reduce score of words in badly repeated fragments to 0 so we do
|
|
// not count them here!
|
|
// . ff[i] will have score of 0 if in repeated frag
|
|
// . make sure this is stored for whole doc... since we only use it
|
|
// for the body
|
|
char *fv = getFragVec();
|
|
if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv;
|
|
|
|
//LinkInfo *info2 = getLinkInfo2();
|
|
//if ( ! info2 || info2 == (LinkInfo *)-1 ) return (HashTableX *)info2;
|
|
|
|
// init our count table otherwise
|
|
//if(! m_countTable.set( 8,4,1024,NULL,0,false,m_niceness,"xmlcnttbl"))
|
|
// return NULL;
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
//
|
|
// this was in Weights.cpp, but now it is here...
|
|
//
|
|
|
|
	// shortcut
|
|
HashTableX *ct = &m_countTable;
|
|
|
|
// reset the counts, just in case set() below does not
|
|
//ct->reset();
|
|
|
|
// ez var
|
|
int64_t *wids = words->getWordIds ();
|
|
nodeid_t *tids = words->getTagIds ();
|
|
int32_t nw = words->getNumWords ();
|
|
char **wptrs = words->m_words;
|
|
int32_t *wlens = words->m_wordLens;
|
|
int64_t *pids = phrases->getPhraseIds2();
|
|
|
|
// add 5000 slots for inlink text in hashString_ct() calls below
|
|
int32_t numSlots = nw * 3 + 5000;
|
|
// only alloc for this one if not provided
|
|
if (!ct->set(8,4,numSlots,NULL,0,false,m_niceness,"xmlct"))
|
|
return (HashTableX *)NULL;
|
|
|
|
//char *ff = getFragVec ( ) ;
|
|
//if ( ! ff ) return false;
|
|
|
|
// . now hash all the phrase ids we have in order to see if the phrase
|
|
// is unique or not. if phrase is repeated a lot we punish the scores
|
|
// of the individual words in the phrase and boost the score of the
|
|
// phrase itself. We check for uniqueness down below.
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// add the word
|
|
if ( wids[i] == 0LL ) continue;
|
|
//if ( wids[i] == 708411945052722517LL )
|
|
// log("hey4 got new pid=%"INT64" i=%"INT32"",pids[i],i);
|
|
// . skip if in repeated fragment
|
|
// . unfortunately we truncate the frag vec to like
|
|
// the first 80,000 words for performance reasons
|
|
if ( i < MAXFRAGWORDS && fv[i] == 0 ) continue;
|
|
// accumulate the wid with a score of 1 each time it occurs
|
|
if ( ! ct->addTerm ( &wids[i] ) ) return (HashTableX *)NULL;
|
|
// skip if word #i does not start a phrase
|
|
if ( ! pids [i] ) continue;
|
|
// if phrase score is less than 100% do not consider as a
|
|
// phrase so that we do not phrase "albuquerque, NM" and stuff
|
|
// like that... in fact, we can only have a space here...
|
|
		// (guard i+1 and the next word's length like hashString_ct()
		// below does, so we never read past the last word)
		if ( i+1 < nw ) {
			if ( wptrs[i+1][0] == ',' ) continue;
			if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
			if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
		}
|
|
// put it in, accumulate, max score is 0x7fffffff
|
|
if ( ! ct->addTerm ( &pids[i] ) ) return (HashTableX *)NULL;
|
|
}
|
|
|
|
// now add each meta tag to the pot
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not a meta tag
|
|
if ( tids[i] != 68 ) continue;
|
|
// find the "content=" word
|
|
char *w = wptrs[i];
|
|
int32_t wlen = wlens[i];
|
|
char *wend = w + wlen;
|
|
char *p ;
|
|
p = strncasestr (w,wlen,"content=");
|
|
// skip if we did not have any content in this meta tag
|
|
if ( ! p ) continue;
|
|
// skip the "content="
|
|
p += 8;
|
|
// skip if empty meta content
|
|
if ( wend - p <= 0 ) continue;
|
|
		// do our own hash of the meta tag content
|
|
if ( ! hashString_ct ( ct , p , wend - p ) )
|
|
return (HashTableX *)NULL;
|
|
}
|
|
// add each incoming link text
|
|
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
		// shortcuts
|
|
char *p;
|
|
int32_t plen;
|
|
// hash link text (was hashPwids())
|
|
p = k-> getLinkText();
|
|
plen = k->size_linkText - 1;
|
|
if ( ! verifyUtf8 ( p , plen ) ) {
|
|
log("xmldoc: bad link text 3 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
if ( ! hashString_ct ( ct , p , plen ) )
|
|
return (HashTableX *)NULL;
|
|
// hash this stuff (was hashPwids())
|
|
p = k->getSurroundingText();
|
|
plen = k->size_surroundingText - 1;
|
|
if ( ! hashString_ct ( ct , p , plen ) )
|
|
return (HashTableX *)NULL;
|
|
}
|
|
|
|
// we got it
|
|
m_countTableValid = true;
|
|
return &m_countTable;
|
|
}
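
// . what the count table above ends up holding: one slot per 64-bit word
//   id and one per 2-word phrase id, each with an accumulated occurrence
//   count, gathered from the body (minus repeated fragments), meta tag
//   content and incoming link text.
// . these counts appear to feed the word-vs-phrase weighting (see the
//   getWordToPhraseRatioWeights() declaration at the top of this file),
//   so a word that almost always occurs inside the same phrase can be
//   scored mostly as that phrase.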
|
|
|
|
// . a special function used by XmlDoc::getCountTable() above
|
|
// . kinda similar to XmlDoc::hashString()
|
|
bool XmlDoc::hashString_ct ( HashTableX *ct , char *s , int32_t slen ) {
|
|
|
|
Words words;
|
|
Bits bits;
|
|
Phrases phrases;
|
|
if ( ! words.set ( s , slen , m_version , true , m_niceness ) )
|
|
return false;
|
|
if ( ! bits.set ( &words , m_version , m_niceness ) )
|
|
return false;
|
|
if ( ! phrases.set(&words,&bits,true,false,m_version,m_niceness))
|
|
return false;
|
|
int32_t nw = words.getNumWords();
|
|
int64_t *wids = words.getWordIds();
|
|
int64_t *pids = phrases.m_phraseIds2;
|
|
char **wptrs = words.m_words;
|
|
int32_t *wlens = words.m_wordLens;
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// add the word
|
|
if ( wids[i] == 0LL ) continue;
|
|
// skip if in repeated fragment
|
|
		// . NO, we do not use this for these short strings
|
|
//if ( ww[i] == 0 ) continue;
|
|
// accumulate the wid with a score of 1 each time it occurs
|
|
if ( ! ct->addTerm ( &wids[i] ) ) return false;
|
|
// skip if word #i does not start a phrase
|
|
if ( ! pids [i] ) continue;
|
|
// if phrase score is less than 100% do not consider as a
|
|
// phrase so that we do not phrase "albuquerque, NM" and stuff
|
|
// like that... in fact, we can only have a space here...
|
|
if ( i+1<nw ) {
|
|
if ( wptrs[i+1][0] == ',' ) continue;
|
|
if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
|
|
if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
|
|
}
|
|
// put it in, accumulate, max score is 0x7fffffff
|
|
if ( ! ct->addTerm ( &pids[i] ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
uint8_t *XmlDoc::getSummaryLangId ( ) {
|
|
// return if we got it already
|
|
if ( m_summaryLangIdValid ) return &m_summaryLangId;
|
|
Summary *s = getSummary();
|
|
if ( ! s || s == (void *)-1 ) return (uint8_t *)s;
|
|
char *sum = s->getSummary();
|
|
// now set the words class
|
|
Words ww;
|
|
if ( ! ww.set9 ( sum , m_niceness ) ) return NULL;
|
|
// check it out. 0 means langUnknown. -1 means error.
|
|
int32_t ret = ww.getLanguage ( NULL , 100 , m_niceness , NULL );
|
|
// -1 means error! g_errno should be set
|
|
if ( ret < 0 ) return NULL;
|
|
// set it
|
|
m_summaryLangId = (uint8_t)ret;
|
|
// assume valid
|
|
m_summaryLangIdValid = true;
|
|
// return it
|
|
return &m_summaryLangId;
|
|
}
|
|
|
|
int cmp ( const void *h1 , const void *h2 ) ;
|
|
|
|
// vector components are 32-bit hashes
|
|
int32_t *XmlDoc::getTagPairHashVector ( ) {
|
|
|
|
if ( m_tagPairHashVecValid ) return m_tagPairHashVec;
|
|
|
|
Xml *xml = getXml ();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
|
|
|
|
// store the hashes here
|
|
uint32_t hashes [ 2000 ];
|
|
int32_t nh = 0;
|
|
// go through each node
|
|
XmlNode *nodes = xml->getNodes ();
|
|
int32_t n = xml->getNumNodes ();
|
|
|
|
// start with the ith node
|
|
int32_t i = 0;
|
|
|
|
uint32_t saved = 0;
|
|
uint32_t lastHash = 0;
|
|
// loop over the nodes
|
|
for ( ; i < n ; i++ ) {
|
|
// breathe a little
|
|
QUICKPOLL ( m_niceness );
|
|
// skip NON tags
|
|
if ( ! nodes[i].isTag() ) continue;
|
|
// use the tag id as the hash, its unique
|
|
uint32_t h = hash32h ( nodes[i].getNodeId() , 0 );
|
|
// ensure hash is not 0, that has special meaning
|
|
if ( h == 0 ) h = 1;
|
|
// store in case we have only one hash
|
|
saved = h;
|
|
|
|
// if we are the first, set this
|
|
if ( ! lastHash ) {
|
|
lastHash = h;
|
|
continue;
|
|
}
|
|
|
|
// if they were the same do not xor, they will zero out
|
|
if ( h == lastHash ) hashes[nh++] = h;
|
|
// incorporate it into the last hash
|
|
else hashes[nh++] = h ^ lastHash;
|
|
|
|
// we are the new last hash
|
|
lastHash = h;
|
|
// bust out if no room
|
|
if ( nh >= 2000 ) break;
|
|
}
|
|
// if only had one tag after, use that
|
|
if ( nh == 0 && saved ) hashes[nh++] = saved;
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness ) ;
|
|
// . TODO: remove the link text hashes here?
|
|
// . because will probably be identical..
|
|
// . now sort hashes to get the top MAX_PAIR_HASHES
|
|
gbsort ( hashes , nh , 4 , cmp );
|
|
// breathe
|
|
QUICKPOLL ( m_niceness ) ;
|
|
// uniquify them
|
|
int32_t d = 0;
|
|
for ( int32_t j = 1 ; j < nh ; j++ ) {
|
|
if ( hashes[j] == hashes[d] ) continue;
|
|
hashes[++d] = hashes[j];
|
|
}
|
|
// breathe
|
|
QUICKPOLL ( m_niceness ) ;
|
|
// how many do we got?
|
|
nh = d;
|
|
// truncate to MAX_PAIR_HASHES MINUS 1 so we can put a 0 at the end
|
|
if ( nh > MAX_TAG_PAIR_HASHES-1 ) nh = MAX_TAG_PAIR_HASHES-1;
|
|
// store the top MAX_PAIR_HASHES
|
|
gbmemcpy ( m_tagPairHashVec , hashes , nh * 4 );
|
|
// null term it. all vectors need this so computeSimilarity() works
|
|
m_tagPairHashVec [ nh++ ] = 0;
|
|
m_tagPairHashVecValid = true;
|
|
m_tagPairHashVecSize = nh * 4;
|
|
return m_tagPairHashVec;
|
|
}
|
|
|
|
// sort in descending order
|
|
int cmp ( const void *h1 , const void *h2 ) {
	uint32_t a = *(uint32_t *)h1;
	uint32_t b = *(uint32_t *)h2;
	// compare explicitly rather than subtracting; the unsigned
	// difference can wrap and flip sign when converted to int
	if ( a < b ) return  1;
	if ( a > b ) return -1;
	return 0;
}
|
|
|
|
// . m_tagVector.setTagPairHashes(&m_xml, niceness);
|
|
// . Sections.cpp and getIsDup() both use this hash
|
|
// . returns NULL and sets g_errno on error
|
|
// . xors all the unique adjacent tag hashes together
|
|
// . kind of represents the template the web pages uses
|
|
// . we add this to sectiondb as a vote in Sections::addVotes()
|
|
uint32_t *XmlDoc::getTagPairHash32 ( ) {
|
|
|
|
// only compute once
|
|
if ( m_tagPairHash32Valid ) return &m_tagPairHash32;
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (uint32_t *)words;
|
|
|
|
	// shortcuts
|
|
//int64_t *wids = words->getWordIds ();
|
|
nodeid_t *tids = words->getTagIds ();
|
|
int32_t nw = words->getNumWords ();
|
|
int32_t nt = words->m_numTags;
|
|
|
|
// . get the hash of all the tag pair hashes!
|
|
// . we then combine that with our site hash to get our site specific
|
|
// html template termid
|
|
// . put all tag pairs into a hash table
|
|
// . similar to Vector::setTagPairHashes() but we do not compute a
|
|
// vector, just a single scalar/hash of 32 bits, m_termId
|
|
HashTableX tp; // T<int64_t,char> tp;
|
|
if ( ! tp.set ( 4 , 1 , nt * 4 , NULL , 0 , true,m_niceness,"xmltp"))
|
|
		return NULL;
|
|
uint32_t lastTid = 0;
|
|
char val = 1;
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// skip if not tag
|
|
if ( tids[i] == 0LL ) continue;
|
|
// skip if back tag
|
|
if ( tids[i] & BACKBIT ) continue;
|
|
// get last tid
|
|
uint32_t h = hash32h ( tids[i] , lastTid );
|
|
//logf(LOG_DEBUG,"build: tph %"INT32" h=%"UINT64"",i,(int64_t)h);
|
|
// . add to table (skip if 0, means empty bucket)
|
|
// . return NULL and set g_errno on error
|
|
if ( h && ! tp.addKey ( &h , &val ) ) return NULL;
|
|
// update this
|
|
lastTid = h;
|
|
}
|
|
// linear scan on hash table to get all the hash, XOR together
|
|
uint32_t hx = 0;
|
|
int32_t nb = tp.getNumSlots();
|
|
char *flags = tp.m_flags;
|
|
// get keys
|
|
uint32_t *keys = (uint32_t *)tp.m_keys;
|
|
for ( int32_t i = 0 ; i < nb ; i++ ) {
|
|
// skip if empty
|
|
if ( flags[i] == 0 ) continue;
|
|
// skip if empty
|
|
//if ( keys[i] == 0LL ) continue;
|
|
// incorporate
|
|
hx ^= keys[i];
|
|
}
|
|
// never return 0, make it 1. 0 means an error
|
|
if ( hx == 0 ) hx = 1;
|
|
// set the hash
|
|
m_tagPairHash32 = hx ;
|
|
// it is now valid
|
|
m_tagPairHash32Valid = true;
|
|
return &m_tagPairHash32;
|
|
}
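
// . quick illustration of getTagPairHash32() above, assuming a doc whose
//   front tags are <html><body><div>:
//     h1 = hash32h ( TAG_HTML , 0  )
//     h2 = hash32h ( TAG_BODY , h1 )
//     h3 = hash32h ( TAG_DIV  , h2 )
//   each h is added to the table and the occupied keys are XOR'd
//   together, so m_tagPairHash32 = h1 ^ h2 ^ h3 (with 0 remapped to 1).
// . since XOR is order independent over the stored keys, the result acts
//   like a rough per-template fingerprint of the page's html layout.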
|
|
|
|
// . used for deduping search results
|
|
// . also uses the title
|
|
int32_t *XmlDoc::getSummaryVector ( ) {
|
|
if ( m_summaryVecValid ) return (int32_t *)m_summaryVec;
|
|
Summary *s = getSummary();
|
|
if ( ! s || s == (Summary *)-1 ) return (int32_t *)s;
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (int32_t *)ti;
|
|
// store title and summary into "buf" so we can call words.set()
|
|
//char buf[5000];
|
|
SafeBuf sb;
|
|
//char *p = buf;
|
|
//int32_t avail = 5000;
|
|
//int32_t len;
|
|
// put title into there
|
|
int32_t tlen = ti->m_titleBytes - 1;
|
|
//if ( len > avail ) len = avail - 10;
|
|
if ( tlen < 0 ) tlen = 0;
|
|
|
|
// put summary into there
|
|
int32_t slen = s->m_summaryLen;
|
|
|
|
// allocate space
|
|
int32_t need = tlen + 1 + slen + 1;
|
|
if ( ! sb.reserve ( need ) ) return NULL;
|
|
|
|
//gbmemcpy ( p , ti->m_title , len );
|
|
//p += len;
|
|
sb.safeMemcpy ( ti->m_title , tlen );
|
|
	// space separating the title from the summary
|
|
if ( tlen > 0 ) sb.pushChar(' ');
|
|
|
|
//if ( len > avail ) len = avail - 10;
|
|
//gbmemcpy ( p , s->m_summary , len );
|
|
//p += len;
|
|
sb.safeMemcpy ( s->m_summary , slen );
|
|
// null terminate it
|
|
//*p = '\0';
|
|
sb.nullTerm();
|
|
// word-ify it
|
|
Words words;
|
|
if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL;
|
|
// . now set the dedup vector from big summary and title
|
|
// . store sample vector in here
|
|
// . returns size in bytes including null terminating int32_t
|
|
m_summaryVecSize = computeVector ( NULL , &words ,
|
|
(uint32_t *)m_summaryVec );
|
|
m_summaryVecValid = true;
|
|
return m_summaryVec;
|
|
}
|
|
|
|
|
|
bool getWordVector ( char *s ,
|
|
HashTableX *ht ,
|
|
uint32_t *d ,
|
|
int32_t *nd ,
|
|
int32_t ndmax ) {
|
|
// utf8 char size
|
|
char size;
|
|
// grab each word and hash it
|
|
for ( ; *s ; s += size ) {
|
|
// get size
|
|
size = getUtf8CharSize(s);
|
|
// skip if tag
|
|
if ( *s == '<' ) {
|
|
while ( *s && *s!='>' )
|
|
s += getUtf8CharSize(s);
|
|
continue;
|
|
}
|
|
// skip if other type of punct
|
|
if ( ! is_alnum_utf8(s) ) continue;
|
|
// ok, we got a word then
|
|
char *start = s;
|
|
// see how long the word is
|
|
for ( ; *s && is_alnum_utf8(s);s+=getUtf8CharSize(s));
|
|
// get wordid, a simple hash, just like Words.cpp does
|
|
uint64_t h = hash64Lower_utf8(start,s - start);
|
|
// do not inc this time
|
|
size = 0;
|
|
// breathe
|
|
//QUICKPOLL ( m_niceness );
|
|
// make 32 bit
|
|
uint32_t wid32 = (uint32_t)h;
|
|
//
|
|
// TODO: ignore if it is a day name or month name or
|
|
// number because those are like dates
|
|
//
|
|
if ( ht ) {
|
|
// do not add if we already got it
|
|
if ( ht->getSlot ( &wid32 ) >= 0 ) continue;
|
|
// add to hash table. return NULL and set g_errno onerr
|
|
if ( ! ht->addKey (&wid32 )) return false;
|
|
}
|
|
// add it to our vector
|
|
d[*nd] = (uint32_t)wid32;
|
|
// inc it
|
|
*nd = *nd + 1;
|
|
// stop after 3000 for sure
|
|
if ( *nd >= ndmax ) return true;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// used by getIsDup() and Dates.cpp for detecting dups and for
|
|
// seeing if the content changed respectively
|
|
int32_t *XmlDoc::getPageSampleVector ( ) {
|
|
if ( m_pageSampleVecValid ) return m_pageSampleVec;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
|
|
Sections *ss = NULL;
|
|
//if ( m_eliminateMenus ) {
|
|
//ss = getSections();
|
|
//if ( ! ss || ss == (Sections *)-1) return (int32_t *)ss;
|
|
//}
|
|
m_pageSampleVecSize = computeVector ( ss, ww,
|
|
(uint32_t *)m_pageSampleVec );
|
|
m_pageSampleVecValid = true;
|
|
return m_pageSampleVec;
|
|
}
|
|
|
|
// . this is the vector of the words right after the hypertext for the link
|
|
// we are voting on.
|
|
// . it is used to dedup voters in Msg25.cpp
|
|
int32_t *XmlDoc::getPostLinkTextVector ( int32_t linkNode ) {
|
|
|
|
if ( m_postVecValid ) return m_postVec;
|
|
// assume none
|
|
m_postVecSize = 0;
|
|
|
|
// set up
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
|
|
|
|
// sanity check
|
|
if ( linkNode < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// linkNode starts pointing to a <a> tag so skip over that!
|
|
linkNode++;
|
|
// limit
|
|
int32_t nn = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// and advance i to the next anchor tag thereafter, we do not
|
|
// want to include link text in this vector because it is usually
|
|
// repeated and will skew our "similarities"
|
|
for ( ; linkNode < nn ; linkNode++ ) {
|
|
// stop if we hit </a> or <a>
|
|
if ( (nodes[linkNode].m_nodeId & BACKBITCOMP) != 2 ) continue;
|
|
// advance over the </a> or <a>
|
|
linkNode++;
|
|
// then stop, we will start gathering link text here
|
|
break;
|
|
}
|
|
	// if we hit the end of the doc, we got no vector then
|
|
if ( linkNode >= nn ) return m_postVec;
|
|
|
|
// now convert the linkNode # to a word #, "start"
|
|
int32_t nw = ww->getNumWords ();
|
|
int64_t *wids = ww->getWordIds ();
|
|
nodeid_t *tids = ww->getTagIds ();
|
|
int32_t *wn = ww->m_nodes;
|
|
int32_t i = 0;
|
|
for ( ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// stop when we got the first word in this node #
|
|
if ( wn[i] == linkNode ) break;
|
|
}
|
|
// if none, bail now, size is 0
|
|
if ( i >= nw ) return m_postVec;
|
|
// save that
|
|
int32_t start = i;
|
|
|
|
// likewise, set the end of it
|
|
int32_t end = nw;
|
|
// count alnum words
|
|
int32_t count = 0;
|
|
// limit it
|
|
for ( i = start ; i < nw && count < 35 ; i++ ) {
|
|
// get tag id
|
|
nodeid_t tid = tids[i] & BACKBITCOMP;
|
|
// stop if certain ones
|
|
if ( tid == TAG_TABLE ) break;
|
|
if ( tid == TAG_UL ) break;
|
|
		// </a> is ok, but stop at the next <a>
|
|
if ( tids[i] == TAG_A ) break;
|
|
// only up to 35 words allowed in the hash
|
|
if ( wids[i] ) count++;
|
|
}
|
|
// set the end of the words to hash
|
|
end = i;
|
|
	// specify the starting and ending word #s now
|
|
m_postVecSize = computeVector(NULL,ww,(uint32_t *)m_postVec,start,end);
|
|
// return what we got
|
|
return m_postVec;
|
|
}
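
// . in short: the vector above hashes up to ~35 alnum words that follow
//   the anchor tag of the link being voted on, stopping early at the next
//   table, list or anchor tag. Msg25 compares these vectors with
//   computeSimilarity() so that many inlinks wrapped in the same
//   boilerplate blurb are deduped down to one voter.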
|
|
|
|
// . was kinda like "m_tagVector.setTagPairHashes(&m_xml, niceness);"
|
|
// . this is used by getIsDup() (below)
|
|
// . this is used by Dates.cpp to see how much a doc has changed
|
|
// . this is also now used for getting the title/summary vector for deduping
|
|
// search results
|
|
// . if we couldn't extract a good pub date for the doc, and it has changed
|
|
// since last spidered, use the bisection method to come up with our own
|
|
// "last modified date" which we use as the pub date.
|
|
// . this replaces the clusterdb.getSimilarity() logic in Msg14.cpp used
|
|
// to do the same thing. but we call Vector::setForDates() from
|
|
// Dates.cpp. that way the logic is more contained in Dates!
|
|
// . doesn't Msg14 already do that?
|
|
// . yes, but it uses two TermTables and calls Clusterdb::getSimilarity()
|
|
// . returns false and sets g_errno on error
|
|
// . these words classes should have been set by a call to Words::set(Xml *...)
|
|
// so that we have "tids1" and "tids2"
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . TODO: if our title rec is non-empty consider getting it from that
|
|
// . we use this vector to compare two docs to see how similar they are
|
|
int32_t XmlDoc::computeVector ( Sections *sections, Words *words, uint32_t *vec ,
|
|
int32_t start , int32_t end ) {
|
|
|
|
// assume empty vector
|
|
vec[0] = 0;
|
|
|
|
// skip if no article section. then we have no vector.
|
|
if ( sections && ! sections->m_hadArticle ) return 0;
|
|
|
|
	// shortcuts
|
|
int32_t nw = words->getNumWords();
|
|
//int32_t nt = words->m_numTags;
|
|
int64_t *wids = words->getWordIds();
|
|
|
|
// set the end to the real end if it was specified as less than zero
|
|
if ( end < 0 ) end = nw;
|
|
|
|
// # of alnum words, about... minus the tags, then the punct words
|
|
// are half of what remains...
|
|
int32_t count = words->m_numAlnumWords;
|
|
|
|
// if we got sections, how many good words?
|
|
if ( sections ) count = sections->m_numAlnumWordsInArticle;
|
|
|
|
// google seems to index SEC_MARQUEE so i took that out
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
|
|
|
|
// these Section ptrs are 1-1 with the words
|
|
Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs;
|
|
|
|
// . Get sample vector from content section only.
|
|
// . This helps remove duplicate menu/ad from vector
|
|
|
|
// 4 bytes per hash, save the last one for a NULL terminator, 0 hash
|
|
int32_t maxTerms = SAMPLE_VECTOR_SIZE / 4 - 1;
|
|
// what portion of them do we want to mask out from the rest?
|
|
int32_t ratio = count / maxTerms ;
|
|
// a mask of 0 means to get them all
|
|
unsigned char mask = 0x00;
|
|
// if we got twice as many terms as we need, then set mask to 0x01
|
|
// to filter out half of them! but actually, let's aim for twice
|
|
// as many as we need to ensure we really get as many as we need.
|
|
// so if we got 4 or more than we need then cut in half...
|
|
while ( ratio >= 4 ) {
|
|
// shift the mask down, ensure hi bit is set
|
|
mask >>= 1;
|
|
mask |= 0x80;
|
|
ratio >>= 1; // /2
|
|
}
|
|
|
|
// store vector into "d" for now. will sort below
|
|
uint32_t d [ 3000 ];
|
|
|
|
// dedup our vector using this hashtable, "ht"
|
|
char hbuf[3000*6*2];
|
|
HashTableX ht;
|
|
if ( ! ht.set(4,0,3000,hbuf,3000*6*2,false,m_niceness,"xmlvecdedup")){
|
|
char*xx=NULL;*xx=0;}
|
|
|
|
again:
|
|
// a buffer to hold the top termIds
|
|
int32_t nd = 0;
|
|
// count how many we mask out
|
|
int32_t mo = 0;
|
|
// . buffer should have at least "maxTerms" in it
|
|
// . these should all be 12 byte keys
|
|
for ( int32_t i = start ; i < end ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum word
|
|
if ( wids[i] == 0 ) continue;
|
|
// skip if mask filters it
|
|
if ( ((wids[i]>>(NUMTERMIDBITS-8)) & mask)!=0) {mo++;continue;}
|
|
// skip if in select, style, script or marquee tag section
|
|
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
|
|
// make 32 bit
|
|
uint32_t wid32 = (uint32_t)wids[i];
|
|
// do not add if we already got it
|
|
if ( ht.getSlot ( &wid32 ) >= 0 ) continue;
|
|
// add to hash table. return NULL and set g_errno on error
|
|
if ( ! ht.addKey (&wid32 )){char*xx=NULL;*xx=0; }
|
|
// add it to our vector
|
|
d[nd] = (uint32_t)wids[i];
|
|
// stop after 3000 for sure
|
|
if ( ++nd < 3000 ) continue;
|
|
// bitch and break out on error
|
|
log(LOG_INFO,"build: Sample vector overflow. Slight "
|
|
"performance hit.");
|
|
break;
|
|
}
|
|
|
|
// . if nd was too small, don't use a mask to save time
|
|
// . well just make the mask less restrictive
|
|
if ( nd < maxTerms && mask && mo ) {
|
|
// shift the mask UP, allow more termIds to pass through
|
|
mask <<= 1;
|
|
// reset hash table since we are starting over
|
|
ht.clear();
|
|
goto again;
|
|
}
|
|
|
|
// bubble sort them
|
|
bool flag = true;
|
|
while ( flag ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
flag = false;
|
|
for ( int32_t i = 1 ; i < nd ; i++ ) {
|
|
if ( d[i-1] <= d[i] ) continue;
|
|
uint32_t tmp = d[i-1];
|
|
d[i-1] = d[i];
|
|
d[i] = tmp;
|
|
flag = true;
|
|
}
|
|
}
|
|
|
|
// truncate
|
|
if ( nd > maxTerms ) nd = maxTerms;
|
|
// null terminate
|
|
d [ nd++ ] = 0;
|
|
// store in our sample vector
|
|
gbmemcpy ( vec , d , nd * 4 );
|
|
// return size in bytes
|
|
return nd * 4;
|
|
}
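
// . note on the mask logic above: the mask is AND'd against the top 8
//   bits of each 64-bit word id and a word survives only if the result is
//   zero, so each bit set in the mask roughly halves the surviving ids
//   (assuming uniformly distributed hashes).
// . hypothetical numbers: with count=3200 and maxTerms=100 the ratio
//   starts at 32, the while loop runs four times leaving mask=0xf0, and
//   about 3200/16 = 200 ids pass -- roughly twice maxTerms, the stated
//   goal. if too few pass, the mask is shifted back up and the scan is
//   redone from "again:".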
|
|
|
|
float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
|
|
int32_t *tv1 = getTagPairHashVector();
|
|
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
|
|
int32_t *tv2 = xd2->getTagPairHashVector();
|
|
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
|
|
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
|
|
m_niceness );
|
|
// this means error, g_errno should be set
|
|
if ( m_tagSimilarity == -1.0 ) return NULL;
|
|
return &m_tagSimilarity;
|
|
}
|
|
|
|
float *XmlDoc::getGigabitSimilarity ( XmlDoc *xd2 ) {
|
|
int32_t **gv1 = getGigabitHashes();
|
|
if ( ! gv1 || gv1 == (int32_t **)-1 ) return (float *)gv1;
|
|
int32_t **gv2 = xd2->getGigabitHashes();
|
|
if ( ! gv2 || gv2 == (int32_t **)-1 ) return (float *)gv2;
|
|
// *gv1 could be NULL if vec was empty in titlerec's ptr_gigabitHashes
|
|
m_gigabitSimilarity = computeSimilarity ( *gv1, *gv2, NULL, NULL, NULL,
|
|
m_niceness );
|
|
// this means error, g_errno should be set
|
|
if ( m_gigabitSimilarity == -1.0 ) return NULL;
|
|
return &m_gigabitSimilarity;
|
|
}
|
|
|
|
float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
|
|
int32_t *sv1 = getPageSampleVector();
|
|
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
|
|
int32_t *sv2 = xd2->getPageSampleVector();
|
|
if ( ! sv2 || sv2 == (int32_t *)-1 ) return (float *)sv2;
|
|
m_pageSimilarity = computeSimilarity ( sv1, sv2, NULL, NULL, NULL,
|
|
m_niceness );
|
|
// this means error, g_errno should be set
|
|
if ( m_pageSimilarity == -1.0 ) return NULL;
|
|
return &m_pageSimilarity;
|
|
}
|
|
|
|
// . compare old page vector with new
|
|
// . returns ptr to a float from 0.0 to 100.0
|
|
float *XmlDoc::getPercentChanged ( ) {
|
|
// if we got it
|
|
if ( m_percentChangedValid ) return &m_percentChanged;
|
|
// get the old doc
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (float *)od;
|
|
// if empty, assume 0% changed
|
|
if ( ! *od ) {
|
|
m_percentChanged = 0;
|
|
m_percentChangedValid = true;
|
|
return &m_percentChanged;
|
|
}
|
|
// get its page c
|
|
float *ps = getPageSimilarity ( *od );
|
|
if ( ! ps || ps == (float *)-1 ) return (float *)ps;
|
|
// got it
|
|
m_percentChanged = *ps;
|
|
m_percentChangedValid = true;
|
|
// just return it
|
|
return &m_percentChanged;
|
|
}
|
|
|
|
// . Address.cpp converts a place name into a vector for comparing via a
|
|
// call to computeSimilarity() below
|
|
// . returns -1 and set g_errno on error
|
|
// . "vbufSize" is in BYTES!
|
|
// . returns length of word vector in int32_ts (# components stored)
|
|
int32_t makeSimpleWordVector (char *s,int32_t *vbuf,int32_t vbufSize,int32_t niceness ) {
|
|
// nonsense?
|
|
if ( vbufSize < 4 ) { char *xx=NULL;*xx=0; }
|
|
// empty it
|
|
*vbuf = 0;
|
|
// no words, no vector
|
|
if ( ! s ) return 0;
|
|
// set them
|
|
Words w;
|
|
// return -1 with g_errno set on error
|
|
if ( ! w.set9 ( s , niceness ) ) return -1;
|
|
// skip if no words
|
|
if ( w.m_numWords == 0 ) return 0;
|
|
	// shortcut
|
|
int64_t *wids = w.m_wordIds;
|
|
int64_t pid = 0LL;
|
|
// count insertions
|
|
int32_t count = 0;
|
|
// ptr
|
|
int32_t *vbufPtr = vbuf;
|
|
int32_t *vbufEnd = vbuf + vbufSize/4;
|
|
// put words into a vector
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// if no room stop. need room for NULL terminator
|
|
if ( vbufPtr + 2 >= vbufEnd ) return count;
|
|
// put it in
|
|
//*vbufPtr = (int32_t)wids[i];
|
|
// . use the synonym instead if it had one
|
|
// . maps "theatre" to "theater", "4th" to "fourth", etc.
|
|
// . false = is street name?
|
|
int64_t *p = getSynonymWord ( &wids[i] , &pid , false );
|
|
// set this
|
|
pid = wids[i];
|
|
//int64_t *p = (int64_t *)synTable->getValue64( wids[i] );
|
|
// 0 means to ignore it
|
|
if ( *p == 0LL ) continue;
|
|
// otherwise add into our vector
|
|
*vbufPtr = *p;
|
|
// advance
|
|
vbufPtr++;
|
|
// NULL termination
|
|
*vbufPtr = 0;
|
|
// count it
|
|
count++;
|
|
}
|
|
// all done
|
|
return count;
|
|
}
|
|
|
|
// . compare two vectors
|
|
// . components in vectors are int32_ts
|
|
// . last component is a zero, to mark EOV = end of vector
|
|
// . discount any termIds that are in the query vector, qvec, which may be NULL
|
|
// . returns -1 and sets g_errno on error
|
|
// . vector components are 32-bit hashes of the words (hash32())???
|
|
// i would say they should be the lower 32 bits of the 64-bit hashes!
|
|
// . replaces:
|
|
// g_clusterdb.getGigabitSimilarity()
|
|
// m_tagVec->getLinkBrotherProbability()
|
|
// g_clusterdb.getSampleSimilarity()
|
|
float computeSimilarity ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
int32_t *s0 , // corresponding scores vector
|
|
int32_t *s1 , // corresponding scores vector
|
|
Query *q ,
|
|
int32_t niceness ,
|
|
bool dedupVectors ) {
|
|
static int32_t s_tmp = 0;
|
|
if ( ! vec0 ) vec0 = &s_tmp;
|
|
if ( ! vec1 ) vec1 = &s_tmp;
|
|
// if both empty, assume not similar at all
|
|
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
|
|
// if either is empty, return 0 to be on the safe side
|
|
if ( *vec0 == 0 ) return 0;
|
|
if ( *vec1 == 0 ) return 0;
|
|
|
|
|
|
// flag if from query vector
|
|
HashTableX qt;
|
|
char qbuf[5000];
|
|
if ( q ) {
|
|
// init hash table
|
|
if ( ! qt.set ( 4,0,512,qbuf,5000,false,niceness,"xmlqvtbl") )
|
|
return -1;
|
|
// . stock the query term hash table
|
|
// . use the lower 32 bits of the termids to make compatible
|
|
// with the other vectors we use
|
|
//int64_t *qtids = q->getTermIds ();
|
|
int32_t nt = q->getNumTerms();
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
// get query term
|
|
QueryTerm *QT = &q->m_qterms[i];
|
|
// get the termid
|
|
int64_t termId = QT->m_termId;
|
|
// get it
|
|
uint32_t h = (uint32_t)(termId & 0xffffffff);
|
|
// hash it
|
|
if ( ! qt.addKey ( &h ) ) return -1;
|
|
}
|
|
}
|
|
|
|
// if we ignore cardinality then it only matters if both vectors
|
|
// have a particular value, and not how many times they each have it.
|
|
// so we essentially dedup each vector if dedupVectors is true.
|
|
// but we do total up the score and put it behind the one unique
|
|
// occurence though. we do this only for
|
|
// Sections::addDateBasedImpliedSections() right now
|
|
bool allowDups = true;
|
|
if ( dedupVectors ) allowDups = false;
|
|
|
|
HashTableX ht;
|
|
char hbuf[10000];
|
|
if ( ! ht.set ( 4,4,-1,hbuf,10000,allowDups,niceness,"xmlqvtbl2"))
|
|
return -1;
|
|
|
|
bool useScores = (bool)s0;
|
|
|
|
int32_t matches = 0;
|
|
int32_t total = 0;
|
|
|
|
int32_t matchScore = 0;
|
|
int32_t totalScore = 0;
|
|
|
|
// hash first vector. accumulating score total and total count
|
|
for ( int32_t *p = vec0; *p ; p++ , s0++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip if matches a query term
|
|
if ( q && qt.getSlot ( p ) ) continue;
|
|
// count it
|
|
total++;
|
|
// get it
|
|
int32_t score = 1;
|
|
// get the score if valid
|
|
if ( useScores ) score = *s0;
|
|
// total it up
|
|
totalScore += score;
|
|
// add it
|
|
if ( dedupVectors ) {
|
|
// accumulate all the scores into this one bucket
|
|
// in the case of p being a dup
|
|
if ( ! ht.addTerm32 ( p , score ) ) return -1;
|
|
}
|
|
else {
|
|
// otherwise, add each into its own bucket since
|
|
// ht.m_allowDups should be true
|
|
if ( ! ht.addKey ( p , &score ) ) return -1;
|
|
}
|
|
}
|
|
|
|
int32_t zero = 0;
|
|
|
|
// see what components of this vector match
|
|
for ( int32_t *p = vec1; *p ; p++ , s1++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip if matches a query term
|
|
if ( q && qt.getSlot ( p ) ) continue;
|
|
// count it
|
|
total++;
|
|
// get it
|
|
int32_t score = 1;
|
|
// get the score if valid
|
|
if ( useScores ) score = *s1;
|
|
// and total scores
|
|
totalScore += score;
|
|
// is it in there?
|
|
int32_t slot = ht.getSlot ( p );
|
|
// skip if unmatched
|
|
if ( slot < 0 ) continue;
|
|
// otherwise, it is a match!
|
|
matches++;
|
|
// and scores
|
|
matchScore += score;
|
|
// and score of what we matched
|
|
uint32_t *val = (uint32_t *)ht.getValueFromSlot ( slot );
|
|
// he is hit too
|
|
matchScore += *val;
|
|
|
|
// remove it as we match it to deal with dups
|
|
if ( allowDups ) {
|
|
// once we match it once, do not match again, score was
|
|
// already accumulated
|
|
ht.setValue ( slot , &zero );
|
|
}
|
|
else {
|
|
// otherwise, remove this dup and try to match any
|
|
// remaining dups in the table
|
|
ht.removeSlot ( slot );
|
|
}
|
|
}
|
|
|
|
	// if after subtracting query terms we got no hits, return 0
|
|
if ( useScores && totalScore == 0 ) return 0;
|
|
if ( total == 0 ) return 0;
|
|
// . what is the max possible score we coulda had?
|
|
// . subtract the vector components that matched a query term
|
|
float percent = 100 * (float)matchScore / (float)totalScore;
|
|
//if ( useScores)percent = 100 * (float)matchScore / (float)totalScore;
|
|
//else percent = 100 * (float)matches / (float)total;
|
|
// sanity
|
|
//if ( percent > 100 ) percent = 100;
|
|
if ( percent > 100 ) { char *xx=NULL;*xx=0; }
|
|
|
|
return percent;
|
|
}
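
// . the returned percentage is score-weighted:
//       percent = 100 * matchScore / totalScore
//   where totalScore sums the scores of every component in both vectors
//   (query-term matches excluded) and matchScore adds both sides' scores
//   for each component present in both vectors.
// . tiny example with unit scores and no query: vec0={A,B,C} and
//   vec1={B,C,D} give totalScore=6; B and C each contribute 2 to
//   matchScore, so the result is 100*4/6 = 66.7.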
|
|
|
|
// this returns true if the two vecs are "percentSimilar" or more similar
|
|
bool isSimilar_sorted ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
int32_t nv0 , // how many int32_ts in vec?
|
|
int32_t nv1 , // how many int32_ts in vec?
|
|
// they must be this similar or more to return true
|
|
int32_t percentSimilar,
|
|
int32_t niceness ) {
|
|
// if both empty, assume not similar at all
|
|
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
|
|
// if either is empty, return 0 to be on the safe side
|
|
if ( *vec0 == 0 ) return 0;
|
|
if ( *vec1 == 0 ) return 0;
|
|
|
|
// do not include last 0
|
|
nv0--;
|
|
nv1--;
|
|
int32_t total = nv0 + nv1;
|
|
|
|
// so if the "noMatched" count ever EXCEEDS (not equals) this
|
|
// "brink" we can bail early because there's no chance of getting
|
|
// the similarity "percentSimilar" provided. should save some time.
|
|
int32_t brink = ((100-percentSimilar) * total) / 100;
|
|
|
|
// scan each like doing a merge
|
|
int32_t *p0 = vec0;
|
|
int32_t *p1 = vec1;
|
|
int32_t yesMatched = 0;
|
|
int32_t noMatched = 0;
|
|
|
|
mergeLoop:
|
|
|
|
// stop if both exhausted. we didn't bail on brink, so it's a match
|
|
if ( *p0 == 0 && *p1 == 0 )
|
|
return true;
|
|
|
|
if ( *p0 < *p1 || *p1 == 0 ) {
|
|
p0++;
|
|
if ( ++noMatched > brink ) return false;
|
|
goto mergeLoop;
|
|
}
|
|
|
|
if ( *p1 < *p0 || *p0 == 0 ) {
|
|
p1++;
|
|
if ( ++noMatched > brink ) return false;
|
|
goto mergeLoop;
|
|
}
|
|
|
|
yesMatched += 2;
|
|
p1++;
|
|
p0++;
|
|
goto mergeLoop;
|
|
}
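
// . the early-exit math above: total = nv0+nv1 (terminators excluded) and
//   brink = ((100-percentSimilar)*total)/100 is the most unmatched
//   components the two sorted vectors can have while still being
//   "percentSimilar" similar, so the merge bails as soon as noMatched
//   exceeds it. e.g. percentSimilar=80 with total=50 gives brink=10 and
//   the 11th miss returns false without finishing the merge.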
|
|
|
|
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
|
|
|
|
if ( m_dupHashValid ) return &m_dupHash;
|
|
uint32_t *h1 = getTagPairHash32();
|
|
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;
|
|
|
|
uint32_t *h2 = getGigabitVectorScorelessHash ( ) ;
|
|
if ( ! h2 || h2 == (uint32_t *)-1 ) return (uint64_t *)h2;
|
|
|
|
//uint64_t h2b = (uint64_t)*h2;
|
|
|
|
m_dupHash = hash64 ( (uint64_t)*h1 , (uint64_t)*h2 );
|
|
m_dupHashValid = true;
|
|
return &m_dupHash;
|
|
}
|
|
|
|
int64_t *XmlDoc::getExactContentHash64 ( ) {
|
|
|
|
if ( m_exactContentHash64Valid )
|
|
return &m_exactContentHash64;
|
|
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1) return (int64_t *)u8;
|
|
|
|
|
|
// if (m_docId==88581116800LL)
|
|
// log("got article1 diffbot");
|
|
// if (m_docId==201689682865LL)
|
|
// log("got article11 diffbot");
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if we are diffbot, then do not quite do an exact content hash.
|
|
// there is a "url:" field in the json that changes. so we have
|
|
// to exclude that field. otherwise getDupList() spider time dedup
|
|
// detection will fail the TestDuplicateContent.testDuplicate smoketest
|
|
if ( cr->m_isCustomCrawl == 1 && m_isDiffbotJSONObject ) {
|
|
int32_t *ch32 = getContentHashJson32();
|
|
if ( ! ch32 || ch32 == (void *)-1 ) return (int64_t *)ch32;
|
|
m_exactContentHash64Valid = true;
|
|
m_exactContentHash64 = (uint64_t)(uint32_t)*ch32;
|
|
return &m_exactContentHash64;
|
|
}
|
|
|
|
unsigned char *p = (unsigned char *)*u8;
|
|
|
|
int32_t plen = size_utf8Content;
|
|
if ( plen > 0 ) plen--;
|
|
|
|
// if we zeroed out this doc to save disk space, then we only
|
|
// record the exact 64-bit hash, so extract it here so that
|
|
// we can delete the gbcontenthash: term from the index if we are
|
|
// deleting this doc or updating it with a fresh copy.
|
|
if ( plen < 100 && p && plen > 12 &&
|
|
strncmp((char *)p,"gbzeroedout:",12) == 0 ) {
|
|
sscanf((char *)p+12,"%"UINT64,&m_exactContentHash64);
|
|
m_exactContentHash64Valid = true;
|
|
return &m_exactContentHash64;
|
|
}
|
|
|
|
|
|
// sanity
|
|
//if ( ! p ) return 0LL;
|
|
//if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
unsigned char *pend = (unsigned char *)p + plen;
|
|
uint64_t h64 = 0LL;
|
|
unsigned char pos = 0;
|
|
bool lastWasSpace = true;
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// treat sequences of white space as a single ' ' (space)
|
|
if ( is_wspace_a(*p) ) {
|
|
if ( lastWasSpace ) continue;
|
|
lastWasSpace = true;
|
|
// treat all white space as a space
|
|
h64 ^= g_hashtab[pos][(unsigned char)' '];
|
|
pos++;
|
|
continue;
|
|
}
|
|
lastWasSpace = false;
|
|
// xor this in right
|
|
h64 ^= g_hashtab[pos][p[0]];
|
|
pos++;
|
|
}
|
|
|
|
m_exactContentHash64Valid = true;
|
|
m_exactContentHash64 = h64;
|
|
return &m_exactContentHash64;
|
|
}
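
// . the hash above is a position-keyed XOR over the utf8 content with
//   runs of whitespace collapsed to a single space, so re-indenting or
//   reflowing a page does not change its exact-content hash but any real
//   text change does.
// . the "gbzeroedout:<hash>" form lets a doc whose stored content was
//   zeroed out to save disk still report the hash it was indexed under,
//   so its gbcontenthash: term can be removed on a reindex or delete.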
|
|
|
|
|
|
RdbList *XmlDoc::getDupList ( ) {
|
|
if ( m_dupListValid ) return &m_dupList;
|
|
|
|
// until we start using posdb and not indexdb, just return an
|
|
// empty list.
|
|
// TODO: MDW fix the deduping.
|
|
//m_dupList.reset();
|
|
//m_dupListValid = true;
|
|
//return &m_dupList;
|
|
//
|
|
// end temp hack
|
|
//
|
|
|
|
//uint64_t *dh = getDupHash ( );
|
|
//if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
int64_t *ph64 = getExactContentHash64();
|
|
//int64_t *ph64 = getLooseContentHash64();
|
|
if ( ! ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64;
|
|
|
|
// must match term in XmlDoc::hashVectors()
|
|
char qbuf[256];
|
|
snprintf(qbuf, 256, "%"UINT64"",*ph64);
|
|
int64_t pre = hash64b ( "gbcontenthash" , 0LL );
|
|
int64_t rawHash = hash64b ( qbuf , 0LL );
|
|
int64_t termId = hash64 ( rawHash , pre );
|
|
// get the startkey, endkey for termlist
|
|
key144_t sk ;
|
|
key144_t ek ;
|
|
g_posdb.makeStartKey ( &sk,termId ,0);
|
|
g_posdb.makeEndKey ( &ek,termId ,MAX_DOCID);
|
|
// note it
|
|
log(LOG_DEBUG,"build: check termid=%"UINT64" for docid %"UINT64""
|
|
,(uint64_t)(termId&TERMID_MASK)
|
|
,m_docId);
|
|
// assume valid now
|
|
m_dupListValid = true;
|
|
// this is a no-split lookup by default now
|
|
if ( ! m_msg0.getList ( -1 , // hostId
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxCacheAge
|
|
false , // add to cache?
|
|
RDB_POSDB, // INDEXDB ,
|
|
cr->m_collnum,
|
|
&m_dupList ,
|
|
(char *)&sk ,
|
|
(char *)&ek ,
|
|
606006 , // minRecSizes in bytes
|
|
m_masterState , // state
|
|
m_masterLoop ,
|
|
m_niceness ,
|
|
true , // error correction?
|
|
true , // include tree?
|
|
true , // domerge?
|
|
-1 , // firsthosti
|
|
0 , // startfilenum
|
|
-1, // # files
|
|
// never timeout when spidering in case
|
|
// a host is down.
|
|
9999977 , // timeout
|
|
-1 , // syncpoint
|
|
-1 , // preferlocal reads
|
|
NULL, // msg5
|
|
NULL, // msg5b
|
|
false , // isRealMerge
|
|
true , // allow page cache
|
|
false , // forcelocalindexdb
|
|
true ) ) // shardByTermId? THIS IS DIFFERENT!!!
|
|
// return -1 if this blocks
|
|
return (RdbList *)-1;
|
|
// assume valid!
|
|
m_dupListValid = true;
|
|
return &m_dupList;
|
|
}
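
// . the dup list above is the posdb termlist for the special term
//   gbcontenthash:<decimal hash>, built the same way it is hashed at
//   index time: termId = hash64 ( hash64b(valueString) ,
//   hash64b("gbcontenthash") ).
// . the lookup passes shardByTermId=true so the whole termlist lives on
//   one shard and a single Msg0 call sees every docid sharing this exact
//   content hash.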
|
|
|
|
|
|
// moved DupDetector.cpp into here...
|
|
char *XmlDoc::getIsDup ( ) {
|
|
if ( m_isDupValid ) return &m_isDup;
|
|
// assume we are not a dup
|
|
m_isDup = false;
|
|
// get it
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// skip if we should
|
|
if ( ! cr->m_dedupingEnabled ||
|
|
// bulk jobs never dedup
|
|
cr->m_isCustomCrawl == 2 ) {
|
|
m_isDupValid = true;
|
|
return &m_isDup;
|
|
}
|
|
|
|
// if &links was given in the diffbot api url then do not do
|
|
// spider time deduping because the pages are likely rendered using
|
|
// javascript, so they'd all seem to be dups of one another.
|
|
if ( cr->m_isCustomCrawl ) {
|
|
SafeBuf *au = getDiffbotApiUrl();
|
|
if ( ! au || au == (void *)-1 ) return (char *)au;
|
|
char *linksParm = NULL;
|
|
if ( au->length() > 0 )
|
|
linksParm = strstr ( au->getBufStart() , "&links");
|
|
if ( ! linksParm && au->length() > 0 )
|
|
linksParm = strstr ( au->getBufStart() , "?links");
|
|
if ( linksParm && linksParm[6] && linksParm[6] != '&' )
|
|
linksParm = NULL;
|
|
if ( linksParm ) {
|
|
m_isDupValid = true;
|
|
m_isDup = false;
|
|
return &m_isDup;
|
|
}
|
|
}
|
|
|
|
// do not dedup seeds
|
|
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
|
if ( cr->m_isCustomCrawl && isSeed ) {
|
|
m_isDupValid = true;
|
|
m_isDup = false;
|
|
return &m_isDup;
|
|
}
|
|
|
|
|
|
setStatus ( "checking for dups" );
|
|
|
|
	// BUT if we are already indexed and a crawlbot/bulk diffbot job
|
|
// then do not kick us out just because another indexed doc is
|
|
// a dup of us because it messes up the TestOnlyProcessIfNew smoketests
|
|
// because in the 2nd round we end up deleting article1.html after
|
|
// indexing it in the first round, then we add article11.html's
|
|
// diffbot reply in the 2nd round because article1.html and its
|
|
// diffbot reply was deleted. thereby giving it a new timestamp and
|
|
	// making the smoketest fail.
|
|
if ( cr->m_isCustomCrawl ) {
|
|
char *isIndexed = getIsIndexed();
|
|
if ( ! isIndexed || isIndexed == (char *)-1)
|
|
return (char *)isIndexed;
|
|
if ( *isIndexed ) {
|
|
m_isDupValid = true;
|
|
return &m_isDup;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//we need both vectors to be non-empty
|
|
//uint64_t *tv = getTagPairHash();
|
|
//if ( ! tv || tv == (uint64_t *)-1) return (char *)tv;
|
|
// get our docid
|
|
int64_t *mydocid = getDocId();
|
|
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
|
|
// get the duplist!
|
|
RdbList *list = getDupList();
|
|
if ( ! list || list == (RdbList *)-1 ) return (char *)list;
|
|
|
|
// sanity. must be posdb list.
|
|
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
|
|
|
|
// so getSiteRank() does not core
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
|
|
// . see if there are any pages that seem like they are dups of us
|
|
// . they must also have a HIGHER score than us, for us to be
|
|
// considered the dup
|
|
//if ( ! m_didQuickDupCheck ) {
|
|
// // do not repeat
|
|
// m_didQuickDupCheck = true;
|
|
|
|
|
|
int32_t myRank = getSiteRank ( );
|
|
|
|
// init
|
|
//uint8_t maxScore = 0;
|
|
//uint8_t myScore = 0;
|
|
//char maxSiteRank = -1;
|
|
//int64_t maxDocId = -1LL;
|
|
// assume not a dup
|
|
m_isDup = false;
|
|
// get the docid that we are a dup of
|
|
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
//int64_t d = list->getCurrentDocId();
|
|
char *rec = list->getCurrentRec();
|
|
// get the docid
|
|
int64_t d = g_posdb.getDocId ( rec );
|
|
// get the score
|
|
//uint8_t score = list->getCurrentScore();
|
|
// just let the best site rank win i guess?
|
|
// even though one page may have more inlinks???
|
|
char sr = (char )g_posdb.getSiteRank ( rec );
|
|
// skip if us!
|
|
//if ( d == *getDocId() ) {
|
|
// // record our score
|
|
// //myScore = score;
|
|
// mySiteRank = sr;
|
|
// continue;
|
|
//}
|
|
|
|
// skip if us
|
|
if ( d == m_docId ) continue;
|
|
|
|
// for debug
|
|
//if ( d != m_docId )
|
|
//log("build: doc %s is dup of docid %"INT64"",
|
|
// m_firstUrl.m_url,d);
|
|
|
|
		// if his rank is >= ours then he was here first and we
|
|
// are the dup i guess...
|
|
if ( sr >= myRank ) {
|
|
log("build: doc %s is dup of docid %"INT64"",
|
|
m_firstUrl.m_url,d);
|
|
m_isDup = true;
|
|
m_isDupValid = true;
|
|
m_docIdWeAreADupOf = d;
|
|
return &m_isDup;
|
|
}
|
|
|
|
// get the winner
|
|
//if ( score > maxScore ) maxScore = score;
|
|
//if ( sr > maxSiteRank || maxSiteRank == -1 ) {
|
|
// maxSiteRank = sr;
|
|
// maxDocId = d;
|
|
// continue;
|
|
//}
|
|
//if ( sr < maxSiteRank ) continue;
|
|
// fallback to docid?
|
|
		// do it first come, first served otherwise i guess
|
|
// this will prevent dups from existing in the index at least
|
|
// if they have the same siterank...
|
|
//if ( d < maxDocId ) {
|
|
// maxDocId = d;
|
|
// continue;
|
|
//}
|
|
}
|
|
// are we the highest scoring doc with this template?
|
|
// corollary: if all dups have equal scores they will be
|
|
// removed until there is only one doc that matches the pattern
|
|
//if ( myScore >= maxScore ) {
|
|
//if ( maxDocId >= 0 && maxDocId != *mydocid && out) {
|
|
// m_isDup = true;
|
|
// m_isDupValid = true;
|
|
// return &m_isDup;
|
|
//}
|
|
|
|
m_isDup = false;
|
|
m_isDupValid = true;
|
|
return &m_isDup;
|
|
|
|
/*
|
|
we now temporarily at least, do exact dup checking...
|
|
later we will bring in the fuzzy code...
|
|
|
|
// reset its ptr for stuff below
|
|
list->resetListPtr();
|
|
|
|
loop:
|
|
// . get a title rec for the current docid
|
|
// . but if exhausted, we are not a dup!
|
|
if ( list->isExhausted() ) { m_isDupValid = true; return &m_isDup; }
|
|
// get the docid
|
|
int64_t d = list->getCurrentDocId();
|
|
// continue if us!
|
|
if ( d == *mydocid ) { list->skipCurrentRecord(); goto loop; }
|
|
// is this a dup of us?
|
|
char *dup = isDupOfUs ( d );
|
|
if ( ! dup || dup == (char *)dup ) return (char *)dup;
|
|
// if dup of us, bail out
|
|
if ( *dup ) { m_isDup = true; m_isDupValid = true; return &m_isDup; }
|
|
// prepare for next
|
|
list->skipCurrentRecord();
|
|
// loop up
|
|
goto loop;
|
|
*/
|
|
}
|
|
|
|
char *XmlDoc::isDupOfUs ( int64_t d ) {
|
|
// sanity check
|
|
if ( d <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// get our current title rec
|
|
SafeBuf *tr = getTitleRecBuf();
|
|
if ( ! tr || tr == (void *)-1 ) return (char *)tr;
|
|
// we should not be here if we know we are a dup of another doc
|
|
if ( m_isDup ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// get the title rec for this docid if we haven't yet done so
|
|
if ( m_calledMsg22d != d ) { // .m_docId != d ) {
|
|
bool s;
|
|
// note it
|
|
setStatus ( "getting possible dup title rec" );
|
|
// do not re-call
|
|
m_calledMsg22d = d;
|
|
// get the guy that might be a dup of us
|
|
s = m_msg22d.getTitleRec ( &m_msg22Request ,
|
|
NULL ,
|
|
d ,
|
|
cr->m_coll ,
|
|
&m_dupTrPtr ,
|
|
&m_dupTrSize ,
|
|
false , // just check tfndb?
|
|
false , // getAvailDocIdOnly
|
|
m_masterState, // state
|
|
m_masterLoop , // callback
|
|
m_niceness ,
|
|
false , // add to cache
|
|
60*60*24 , // maxcacheage
|
|
999999 );// timeout
|
|
// we blocked
|
|
if ( ! s ) return (char *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
// if not there do not count as an error
|
|
if ( ! m_dupTrPtr ) { g_errno = 0; return &m_isDup; }
|
|
// ignore any errors too i guess...
|
|
if ( m_msg22d.m_errno ) {
|
|
log(LOG_WARN, "build: Dup Detection error with "
|
|
"titlerec fetch: %s",mstrerror(m_msg22d.m_errno));
|
|
g_errno = 0;
|
|
return &m_isDup;
|
|
}
|
|
// we need to parse this potential dup doc
|
|
XmlDoc dd;
|
|
// . parse the possible dup title rec into another XmlDoc class
|
|
// . it returns false and sets g_errno on error
|
|
if ( ! dd.set2 ( m_dupTrPtr ,
|
|
m_dupTrSize ,
|
|
cr->m_coll ,
|
|
NULL , // m_pbuf ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
|
|
LinkInfo *info1a = dd.getLinkInfo1();
|
|
LinkInfo *info1b = getLinkInfo1();
|
|
float pageNumInlinksA = info1a->m_numGoodInlinks;//getNumInlinksExtrapolated();
|
|
float pageNumInlinksB = info1b->m_numGoodInlinks;//getNumInlinksExtrapolated();
|
|
|
|
	// . if the old dup doc is of lower quality than the new doc that
	// we are checking, then that one should be removed, not us!
	// if they are equal, we keep the shorter url of the two
	// . dd was set from title rec so these numInlinks should be taken
	// from the TagRec in ptr_tagRecData, and therefore NOT BLOCK!
|
|
if ( *dd.getSiteNumInlinks() < *getSiteNumInlinks() )
|
|
return &m_isDup;
|
|
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
|
|
pageNumInlinksA < pageNumInlinksB )
|
|
return &m_isDup;
|
|
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
|
|
pageNumInlinksA == pageNumInlinksB &&
|
|
dd.getFirstUrl()->getUrlLen() > getFirstUrl()->getUrlLen())
|
|
return &m_isDup;
|
|
|
|
float *ts = getTagSimilarity ( &dd );
|
|
if ( ! ts || ts == (float *)-1 ) return (char *)ts;
|
|
float *gs = getGigabitSimilarity ( &dd );
|
|
if ( ! gs || gs == (float *)-1 ) return (char *)gs;
|
|
float *ps = getPageSimilarity ( &dd );
|
|
if ( ! ps || ps == (float *)-1 ) return (char *)ps;
|
|
|
|
int32_t gigabitVecSimilarity = (int32_t)*gs;
|
|
int32_t tagVecSimilarity = (int32_t)*ts;
|
|
int32_t sampleVecSimilarity = (int32_t)*ps;
|
|
|
|
int32_t notSimilarCount = 0;
|
|
if ( gigabitVecSimilarity < 80 ) {
|
|
notSimilarCount++;
|
|
if ( gigabitVecSimilarity < 50 ) return &m_isDup;
|
|
}
|
|
if ( tagVecSimilarity < 80 ) {
|
|
notSimilarCount++;
|
|
if ( tagVecSimilarity < 50 ) return &m_isDup;
|
|
}
|
|
if ( sampleVecSimilarity < 80 ) {
|
|
notSimilarCount++;
|
|
if ( sampleVecSimilarity < 50 ) return &m_isDup;
|
|
}
|
|
// if it is similar enough, we got a dup!
|
|
if ( notSimilarCount <= 0 ) { m_isDupValid = true; m_isDup = true; }
|
|
|
|
return &m_isDup;
|
|
}
|
|
|
|
// hash a gigabit hash vector without its scores, also order independent
|
|
uint32_t *XmlDoc::getGigabitVectorScorelessHash ( ) {
|
|
if ( m_gigabitVectorHashValid ) return &m_gigabitVectorHash;
|
|
int32_t **gbvec = getGigabitHashes();
|
|
if ( ! gbvec || gbvec == (int32_t **)-1 ) return (uint32_t *)gbvec;
|
|
uint32_t h = 0;
|
|
// this bad boy is NULL terminated
|
|
uint32_t *gbv = (uint32_t *)*gbvec;
|
|
// i guess zak likes the simple XOR'ing thing...
|
|
for ( int32_t i = 0; gbv && gbv[i] ; i++) h ^= gbv[i];
|
|
m_gigabitVectorHashValid = true;
|
|
m_gigabitVectorHash = h;
|
|
return &m_gigabitVectorHash;
|
|
}
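
// . side note on the XOR above: XOR is commutative and associative, so the
//   hash is order independent; e.g. with (hypothetical) components
//   0x11, 0x22, 0x33 any ordering gives 0x11 ^ 0x22 ^ 0x33 = 0x00, so
//   reordering the gigabits does not change m_gigabitVectorHash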
|
|
|
|
// . the original vector used for deduping similar search results is just from
|
|
// random sample of indexed terms, but gigabit vector is
|
|
// formed using the hashes of the top-scoring gigabits of the document, and
|
|
// therefore uses the words class
|
|
// . sets g_errno and returns NULL on error
|
|
// . ptr_gigabitHashes can be NULL...
|
|
int32_t **XmlDoc::getGigabitHashes ( ) {
|
|
// if it was already set, treat this as an accessor
|
|
if ( m_gigabitHashesValid ) return &ptr_gigabitHashes;
|
|
// this also sets the vector
|
|
char *gq = getGigabitQuery();
|
|
if ( ! gq || gq == (char *)-1) return (int32_t **)gq;
|
|
// it should be valid now!
|
|
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
|
|
return &ptr_gigabitHashes;
|
|
}
|
|
|
|
// . the new function to get gigabits
|
|
// . sets and validates m_gigabitQuery[] and m_gigabitHashes[] among others
|
|
// . candidates = capitalized word, capitalized sequence of words,
|
|
// uncapitalized 2+ word wikipedia phrase.
|
|
// . candidates exclude uncapitalized query stop words.
|
|
// . calls addGigabits() which is called by each doc in search results
|
|
// when we use this at query time.
|
|
// . separates gigabits with a comma (delimiter) in m_gigabitQuery[]
|
|
// . quotes multiple word gigabits
|
|
char *XmlDoc::getGigabitQuery ( ) {
|
|
|
|
if ( m_gigabitQueryValid ) return m_gigabitQuery;
|
|
|
|
setStatus ( "getting gigabit query" );
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss == (Sections *)-1 ) return (char *)ss;
|
|
//Weights *we = getWeights();
|
|
//if ( ! we || we == (Weights *)-1 ) return (char *)we;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
|
|
|
|
HashTableX ht;
|
|
char buf [ 200000 ];
|
|
// pass in niceness in case it has to grow really big and re-hash all!!
|
|
ht.set ( 8 , 4 , -1 , buf , 200000 , false, m_niceness,"xmlgbtbl");
|
|
|
|
// . add gigabits from our body words
|
|
// . includes title and header tags so pts can work well!
|
|
if ( ! addGigabits ( ww , *d , ss , *langId ) ) return NULL;
|
|
|
|
// add gigabits from link info
|
|
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
|
// sanity check
|
|
char *txt = k->getLinkText();
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 0 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
// add those in
|
|
if (!addGigabits(txt, *d, *langId ) ) return NULL;
|
|
// add in neighborhoods
|
|
if(!addGigabits(k->getSurroundingText(),*d,*langId))
|
|
return NULL;
|
|
}
|
|
|
|
	// add in gigabits for meta description
	int32_t mdlen;
	char *md = getMetaDescription( &mdlen );
	if ( ! addGigabits2 ( md , mdlen, *d , *langId ) ) return NULL;

	// add in gigabits for meta keywords
	int32_t mklen;
	char *mk = getMetaKeywords( &mklen );
	if ( ! addGigabits2 ( mk , mklen , *d , *langId ) ) return NULL;
|
|
|
|
// set m_gigabitQuery and m_gigabitScores
|
|
//GigabitInfo *top[100];
|
|
// fill in "top" in order of score
|
|
m_numTop = getTopGigabits ( &ht , m_top , 100 , 0 );
|
|
// error? then g_errno should be set
|
|
if ( m_numTop == -1 ) return NULL;
|
|
|
|
char *p = m_gigabitQuery;
|
|
char *pend = m_gigabitQuery + XD_GQ_MAX_SIZE - 1;
|
|
// reset count of vector components for setting gigabit vector
|
|
int32_t ng = 0;
|
|
// total score
|
|
//int32_t total = 0;
|
|
// . now set the gigabit query!
|
|
// . start with the highest scoring node first, the last node since
|
|
// nodes are ranked by lowest to highest key
|
|
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
|
|
// get the info
|
|
GigabitInfo *gi = m_top[i];
|
|
		// skip this gigabit if it won't fit in the buffer
|
|
if ( p + gi->m_len + 10 >= pend ) continue;
|
|
// get 32 bit hash
|
|
uint32_t h = gi->m_hash & 0xffffffff;
|
|
// never allow 0
|
|
if ( h == 0 ) h = 1;
|
|
// add to vector
|
|
if ( ng + 1 < XD_MAX_GIGABIT_HASHES ) {
|
|
// the term hash
|
|
m_gigabitHashes[ng] = (int32_t)h ;
|
|
// and the score
|
|
m_gigabitScores[ng] = gi->m_pts;
|
|
// point into it, where we will copy it to
|
|
m_gigabitPtrs [ng] = p + 1;
|
|
// advance
|
|
ng++;
|
|
}
|
|
// quote it
|
|
*p++ = '\"';
|
|
// write into buffer
|
|
gbmemcpy ( p , gi->m_ptr , gi->m_len );
|
|
// finish quote
|
|
*p++ = '\"';
|
|
// separate terms just in case
|
|
//gbmemcpy ( p , " , ", 4 );
|
|
//p += 4;
|
|
*p++ = ',';
|
|
}
|
|
// done
|
|
*p++ = '\0';
|
|
	// NULL terminate the vector to make it a legit vector
|
|
m_gigabitHashes [ ng ] = 0;
|
|
m_gigabitScores [ ng ] = 0;
|
|
|
|
// include the terminating 0
|
|
ng++;
|
|
// validate both the query and vector
|
|
m_gigabitQueryValid = true;
|
|
m_gigabitHashesValid = true;
|
|
// set this too
|
|
ptr_gigabitHashes = m_gigabitHashes;
|
|
ptr_gigabitScores = m_gigabitScores;
|
|
size_gigabitHashes = ng * 4 ; // 4 bytes each component
|
|
size_gigabitScores = ng * 4 ; // 4 bytes each score
|
|
return m_gigabitQuery;
|
|
}
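
// . layout sketch of the parallel vectors set by getGigabitQuery() above,
//   with hypothetical values; both are NULL terminated and the terminator is
//   counted in the sizes:
//     m_gigabitHashes: 0x1a2b3c4d , 0x00c0ffee , 0
//     m_gigabitScores: 1000       , 250        , 0
//     size_gigabitHashes = size_gigabitScores = 3 * 4 bytes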
|
|
|
|
|
|
// . fill in "top" in order of score
|
|
// . returns -1 and sets g_errno on error
|
|
int32_t getTopGigabits ( HashTableX *ht ,
|
|
GigabitInfo **top ,
|
|
int32_t max ,
|
|
int32_t minDocCount ) {
|
|
|
|
|
|
// store top 100 into this tree
|
|
RdbTree tree;
|
|
if ( ! tree.set ( 4 , // fixedDataSize
|
|
max+2 , // maxNumNodes
|
|
true , // balance?
|
|
-1 , // maxMem
|
|
true , // own data?
|
|
"tree-topgbits" ))
|
|
return -1;
|
|
|
|
int32_t ns = ht->getNumSlots();
|
|
key_t minKey;
|
|
bool minKeyValid = false;
|
|
for ( int32_t i = 0 ; i < ns ; i++ ) {
|
|
// skip if empty
|
|
if ( ht->isEmpty(i) ) continue;
|
|
// get his info
|
|
GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i);
|
|
// must be valid
|
|
if ( gi->m_count <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// must be in this many docs minimum
|
|
if ( gi->m_numDocs < minDocCount ) continue;
|
|
// make the key
|
|
key_t key;
|
|
key.n1 = gi->m_pts;
|
|
key.n0 = gi->m_hash;
|
|
// should we add it?
|
|
if ( minKeyValid && key <= minKey ) continue;
|
|
// we should add it. use points as the key. use PTR as data
|
|
int32_t node = tree.addNode(0,key,(char *)&gi,4);
|
|
// error? g_errno should be set
|
|
if ( node < 0 ) return -1;
|
|
		// if not full yet continue (compare against "max", not a
		// hardcoded 100, so the tree sized max+2 above cannot overflow)
		if ( tree.getNumUsedNodes() < max ) continue;
|
|
// get the smallest node
|
|
int32_t tn = tree.getLowestNode ( ) ;
|
|
// sanity check
|
|
if ( tn < 0 ) { char *xx=NULL;*xx=0; }
|
|
// kick out smallest
|
|
tree.deleteNode3 ( tn , false );
|
|
// get new smallest
|
|
tn = tree.getLowestNode();
|
|
// set the new minkey
|
|
minKey = *(key_t *)tree.getKey ( tn );
|
|
// validate it
|
|
minKeyValid = true;
|
|
}
|
|
int32_t count = 0;
|
|
// . now set the array
|
|
// . start with the highest scoring node first, the last node since
|
|
// nodes are ranked by lowest to highest key
|
|
for ( int32_t nn=tree.getLastNode() ; nn>=0 ; nn=tree.getPrevNode(nn) ){
|
|
// get the info
|
|
GigabitInfo *gi = (GigabitInfo *)tree.getData(nn);
|
|
// store it
|
|
top[count++] = gi;
|
|
// stop if we are full
|
|
if ( count >= max ) break;
|
|
}
|
|
return count;
|
|
}
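
/*
  illustrative only, not called anywhere: the loop above is a bounded top-N
  selection keyed on (m_pts,m_hash). a rough equivalent sketch with the STL
  (assumes <set> and <utility>) would be:

	std::set< std::pair<int32_t,uint64_t> > best; // sorted lowest first
	// for every GigabitInfo *gi that passes the minDocCount filter:
	//	best.insert ( std::make_pair ( gi->m_pts , gi->m_hash ) );
	//	if ( (int32_t)best.size() > max ) best.erase ( best.begin() );
	// then walk "best" in reverse to fill top[] highest score first
*/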
|
|
|
|
char *XmlDoc::getMetaDescription( int32_t *mdlen ) {
|
|
if ( m_metaDescValid ) {
|
|
*mdlen = m_metaDescLen;
|
|
return m_metaDesc;
|
|
}
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
//xml->getMetaContent ( m_metaDesc, 1024, "description", 11 );
|
|
// we need to point to it in the html source so our WordPosInfo
|
|
// algo works right.
|
|
m_metaDesc = xml->getMetaContentPointer("description",
|
|
11,
|
|
"name",
|
|
&m_metaDescLen);
|
|
*mdlen = m_metaDescLen;
|
|
m_metaDescValid = true;
|
|
return m_metaDesc;
|
|
}
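
// . e.g. for <meta name="description" content="A tiny example."> the pointer
//   returned above references "A tiny example." in place in the html source
//   (no copy is made) and m_metaDescLen is set to 15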
|
|
|
|
char *XmlDoc::getMetaSummary ( int32_t *mslen ) {
|
|
if ( m_metaSummaryValid ) {
|
|
*mslen = m_metaSummaryLen;
|
|
return m_metaSummary;
|
|
}
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
m_metaSummary = xml->getMetaContentPointer("summary",
|
|
7,
|
|
"name",
|
|
&m_metaSummaryLen);
|
|
*mslen = m_metaSummaryLen;
|
|
m_metaSummaryValid = true;
|
|
return m_metaSummary;
|
|
}
|
|
|
|
char *XmlDoc::getMetaKeywords( int32_t *mklen ) {
|
|
if ( m_metaKeywordsValid ) {
|
|
*mklen = m_metaKeywordsLen;
|
|
return m_metaKeywords;
|
|
}
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
//xml->getMetaContent ( m_metaKeywords, 1024, "keywords", 8 );
|
|
// we need to point to it in the html source so our WordPosInfo
|
|
// algo works right.
|
|
m_metaKeywords=xml->getMetaContentPointer("keywords",
|
|
8,
|
|
"name",
|
|
&m_metaKeywordsLen);
|
|
*mklen = m_metaKeywordsLen;
|
|
m_metaKeywordsValid = true;
|
|
return m_metaKeywords;
|
|
}
|
|
|
|
bool XmlDoc::addGigabits ( char *s ,
|
|
int64_t docId ,
|
|
uint8_t langId ) {
|
|
Words tmp;
|
|
// skip if none
|
|
if ( ! s ) return true;
|
|
	// returns false with g_errno set on error
|
|
if ( ! tmp.set9 ( s , m_niceness ) ) return false;
|
|
// and weights!
|
|
//Weights we;
|
|
//if ( ! we.set ( &tmp , )
|
|
// and so does this
|
|
return addGigabits ( &tmp , docId , NULL , langId );
|
|
}
|
|
|
|
bool XmlDoc::addGigabits2 ( char *s ,
|
|
int32_t slen,
|
|
int64_t docId ,
|
|
uint8_t langId ) {
|
|
Words tmp;
|
|
// skip if none
|
|
if ( ! s ) return true;
|
|
	// returns false with g_errno set on error
|
|
if ( ! tmp.setx ( s , slen , m_niceness ) ) return false;
|
|
// and weights!
|
|
//Weights we;
|
|
//if ( ! we.set ( &tmp , )
|
|
// and so does this
|
|
return addGigabits ( &tmp , docId , NULL , langId );
|
|
}
|
|
|
|
bool XmlDoc::addGigabits(Words *ww,int64_t docId,Sections *sections,
|
|
uint8_t langId ) {
|
|
// skip sections marked as these:
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
|
|
// get this
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
// not if we don't have any identified sections
|
|
if ( sections && sections->m_numSections <= 0 ) sp = NULL;
|
|
	// shortcuts
|
|
int64_t *wids = ww->m_wordIds;
|
|
char **wptrs = ww->m_words;
|
|
int32_t *wlens = ww->m_wordLens;
|
|
nodeid_t *tids = ww->m_tagIds;
|
|
int32_t nw = ww->getNumWords();
|
|
//int32_t flags;
|
|
	// initial # of slots
|
|
int32_t is = 0;
|
|
if ( m_wordsValid ) is = ww->m_numAlnumWords;
// put gigabits into this hash table
|
|
HashTableX ht;
|
|
if ( ! ht.set ( 8 , sizeof(GigabitInfo),is,NULL,0,false,m_niceness,
|
|
"gigabits") )
|
|
return false;
|
|
// scan through the words
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe if being called by spider
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
// get section
|
|
Section *sx = NULL;
|
|
// get flags
|
|
if ( sp ) sx = sp[i];//flags = sp[i]->m_flags;
|
|
//else flags = 0;
|
|
// skip if ignored. i.e. in the menu or not in the article text
|
|
//if ( flags & badFlags ) continue;
|
|
// are we capitalized?
|
|
bool cap = ww->isCapitalized(i);
|
|
// ignore lower case query stop words
|
|
if (!cap&&isQueryStopWord(wptrs[i],wlens[i],wids[i],langId))
|
|
continue;
|
|
// hash of word then the phrase
|
|
//uint32_t h = wids[i] & 0xffffffff;
|
|
//uint64_t h = wids[i];
|
|
		// add the word itself. returns false with g_errno set on error
|
|
if ( ! addGigabit (&ht,wptrs[i],wlens[i],docId,
|
|
sx,true,langId,i)) return false;
|
|
// save position
|
|
int32_t j = i + 1 ;
|
|
// check this far out
|
|
int32_t maxj = i + 12; if ( maxj > nw ) maxj = nw;
|
|
// do we got a cap phrase?
|
|
bool capPhrase = false;
|
|
// if capitalized look for sequence
|
|
for ( ; cap && j < maxj ; j++ ) {
|
|
// . stop on tags
|
|
// . tids is NULL if being set from meta tag...
|
|
if ( tids && tids[j] ) break;
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) {
|
|
// make sure it is like a single space or
|
|
// something we can "phrase across"
|
|
// TODO: can be like "capt. "
|
|
if ( wlens[j] == 1 ) continue;
|
|
// otherwise it stops the phrase
|
|
break;
|
|
}
|
|
// if not capitalized stop
|
|
if ( ! ww->isCapitalized(j) ) break;
|
|
// got one!
|
|
capPhrase = true;
|
|
// . hash it into the ongoing hash
|
|
// . Speller::getPopularity() should use this same
|
|
// method so we can get popularities of the gigabits!
|
|
//h = hash32Fast ( wids[j] & 0xffffffff , h );
|
|
//h = hash64Fast ( wids[j] , h );
|
|
}
|
|
// if we added something... skip whole phrase, if any
|
|
if ( capPhrase ) {
|
|
// get length of it
|
|
int32_t len = wptrs[j-1] + wlens[j-1] - wptrs[i];
|
|
// add that entire sequence, [i,j)
|
|
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,
|
|
false,langId,i)) return false;
|
|
// advance to end of phrase
|
|
i = j - 1;
|
|
continue;
|
|
}
|
|
// reset
|
|
j = i + 1;
|
|
// this must be true
|
|
// . ok, look for a wiki phrase then!
|
|
// . we can speed this up if too slow... using a crazy hash tbl
|
|
int32_t wikij = -1;
|
|
// init the hash for wiki lookup
|
|
uint32_t h = 0;
|
|
// loop over successive terms
|
|
for ( ; j < maxj ; j++ ) {
|
|
// . stop on tags
|
|
// . tids is NULL if being set from meta tag
|
|
if ( tids && tids[j] ) break;
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) {
|
|
// make sure it is like a single space or
|
|
// something we can "phrase across"
|
|
// TODO: can be like "capt. "
|
|
if ( wlens[j] == 1 ) continue;
|
|
// otherwise it stops the phrase
|
|
break;
|
|
}
|
|
// init it
|
|
if ( ! h ) h = hash32Fast ( wids[i] & 0xffffffff , 0 );
|
|
// hash it into the ongoing hash
|
|
h = hash32Fast ( wids[j] & 0xffffffff , h );
|
|
// is this in the wiki?
|
|
if ( ! g_wiki.isInWiki ( h ) ) continue;
|
|
// it is, mark it
|
|
wikij = j + 1;
|
|
}
|
|
|
|
// must be a 2+ word phrase in the wiki to be a gigabit
|
|
if ( wikij == -1 ) continue;
|
|
// bail if breach
|
|
if ( wikij >= nw ) continue;
|
|
// get len
|
|
int32_t len = wptrs[wikij] + wlens[wikij] - wptrs[i];
|
|
// add what we got
|
|
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,false,
|
|
langId,i) ) return false;
|
|
// advance to end of phrase
|
|
i = wikij - 1;
|
|
}
|
|
return true;
|
|
}
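
// . e.g. (hypothetical text) for "visit the Eiffel Tower in Paris" the loop
//   above skips the lowercase query stop words "the" and "in", adds "visit",
//   "Eiffel" and "Paris" as single-word gigabits, adds the capitalized
//   sequence "Eiffel Tower" and then jumps past it (so "Tower" is not also
//   added on its own); a lowercase 2+ word phrase is only added when its
//   running hash is found by g_wiki.isInWiki()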
|
|
|
|
|
|
char* XmlDoc::getMetadata(int32_t* retlen) {
|
|
if(!m_hasMetadata) {
|
|
*retlen = 0;
|
|
return NULL;
|
|
}
|
|
|
|
*retlen = size_metadata;
|
|
return ptr_metadata;
|
|
|
|
}
|
|
|
|
// . this is called by Msg40.cpp to intersect gigabits from multiple docs
|
|
// . returns -1 and sets g_errno on error
|
|
// . returns # of GigabitInfos stored into "top"
|
|
/*
|
|
int32_t intersectGigabits ( Msg20 **mp , // search results
|
|
int32_t n , // # of em
|
|
uint8_t langId , // searcher's langId
|
|
int32_t maxTop ,
|
|
int32_t docsToScan ,
|
|
int32_t minDocCount , // must be in this # docs
|
|
GigabitInfo *top ,
|
|
int32_t niceness ) {
|
|
|
|
// put gigabits into this hash table
|
|
HashTableX ht;
|
|
ht.set ( 8 , sizeof(GigabitInfo),0,NULL,0,false,niceness,"ginttbl");
|
|
|
|
for ( int32_t i = 0 ; i < n && i < docsToScan ; i++ ) {
|
|
// get the reply/searchResult
|
|
Msg20Reply *mr = mp[i]->m_r;
|
|
// sanity check
|
|
if ( ! mr && ! mp[i]->m_errno ) { char *xx=NULL;*xx=0; }
|
|
// this is NULL on error
|
|
if ( ! mr ) continue;
|
|
// count them
|
|
int32_t count = 0;
|
|
// add each gigabit for it
|
|
for ( char *p = mr->ptr_gigabitQuery ; p && *p ; count++ ) {
|
|
// skip the comma
|
|
p++;
|
|
// point to next
|
|
char *end = strchr ( p , ',' );
|
|
// do not allow NULLs
|
|
if ( ! end ) end = p + gbstrlen(p);
|
|
// get the score. aka GigabitInfo::m_pts
|
|
int32_t ptsArg = mr->ptr_gigabitScores[count];
|
|
// sanity check for bad scores
|
|
if ( ptsArg <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// add it in
|
|
if ( ! addGigabit ( &ht ,
|
|
p ,
|
|
				    end - p , // length
|
|
mr->m_docId ,
|
|
NULL ,// section ptr
|
|
false , // singleWrd? unused
|
|
langId ,
|
|
-1 , // word #i not used
|
|
ptsArg ) )
|
|
return -1;
|
|
// advance p
|
|
p = end;
|
|
// if not comma, all done
|
|
if ( *p != ',' ) break;
|
|
// skip comma
|
|
p++;
|
|
}
|
|
}
|
|
|
|
// . get up to the top 50 gigabits
|
|
GigabitInfo *array [ 50 ];
|
|
int32_t numTop = getTopGigabits ( &ht , array , 50 , minDocCount );
|
|
// error? g_errno should be set
|
|
if ( numTop == -1 ) return -1;
|
|
// sanity check
|
|
if ( numTop > maxTop ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now copy into our array
|
|
for ( int32_t i = 0 ; i < numTop ; i++ ) {
|
|
// get it
|
|
GigabitInfo *gi = array[i];
|
|
// copy it
|
|
gbmemcpy ( &top[i] , gi , sizeof(GigabitInfo) );
|
|
}
|
|
// return how many we copied
|
|
return numTop;
|
|
}
|
|
*/
|
|
|
|
// . "docId" is the document Id that "h" came from
|
|
// . if being called at query time we often get called on each search result!
|
|
// . if being called at parse/index time we are being called on a single docId
|
|
// . returns false and sets g_errno on error
|
|
bool addGigabit ( HashTableX *ht ,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int64_t docId ,
|
|
Section *sp ,
|
|
bool singleWord ,
|
|
uint8_t langId ,
|
|
// starts with word #i
|
|
int32_t i ,
|
|
int32_t ptsArg ) {
|
|
// get its hash
|
|
uint64_t h = hash64d ( s , slen );
|
|
// get the slot where its at
|
|
int32_t slot = ht->getSlot ( &h );
|
|
// info for this hash/gigabit in the doc
|
|
GigabitInfo *gi ;
|
|
// otherwise, init a new slot. set the key to h
|
|
if ( slot < 0 ) {
|
|
// . add key to a new slot, set "gi" to the value ptr
|
|
// . use NULL for the GigabitInfo ptr temporarily so it should
|
|
// not gbmemcpy into the slot
|
|
if ( ! ht->addKey ( &h , NULL , &slot ) ) return false;
|
|
// get data ptr to the bogus data
|
|
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
|
|
// . set all the stuff now. this way avoids a gbmemcpy...
|
|
// . every wiki title should have a popularity i guess...
|
|
// . "pop" is # of docs out of 10,000 that have this phrase?
|
|
int32_t pop = g_speller.getPhrasePopularity(s,h,true,langId);
|
|
gi->m_pop = pop;
|
|
gi->m_pts = 0;
|
|
gi->m_count = 0;
|
|
gi->m_numDocs = 0;
|
|
gi->m_lastDocId = 0LL;
|
|
gi->m_currentDocCount = 0; // a char
|
|
gi->m_ptr = s;
|
|
gi->m_len = slen;
|
|
gi->m_hash = h;
|
|
// sanity test
|
|
GigabitInfo *tt = (GigabitInfo *)ht->getValue ( &h );
|
|
if ( tt->m_pop != pop ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
|
|
// only allow up to 5 votes per document!
|
|
if ( gi->m_currentDocCount >= 5 ) return true;
|
|
}
|
|
	// inc the count, we got one more occurrence
|
|
gi->m_count++;
|
|
// doc count. how many docs have this gigabit? count it.
|
|
if ( docId != gi->m_lastDocId ) {
|
|
gi->m_numDocs++;
|
|
gi->m_lastDocId = docId;
|
|
gi->m_currentDocCount = 1;
|
|
}
|
|
else
|
|
gi->m_currentDocCount++;
|
|
|
|
// given?
|
|
if ( ptsArg != -1 ) {
|
|
gi->m_pts += ptsArg;
|
|
return true;
|
|
}
|
|
|
|
// base points on popularity
|
|
float pts = 1.0;
|
|
if ( gi->m_pop < 1 ) pts = 1000;
|
|
else if ( gi->m_pop < 2 ) pts = 500;
|
|
else if ( gi->m_pop < 3 ) pts = 250;
|
|
else if ( gi->m_pop < 4 ) pts = 200;
|
|
else if ( gi->m_pop < 5 ) pts = 150;
|
|
else if ( gi->m_pop < 6 ) pts = 100;
|
|
else if ( gi->m_pop < 7 ) pts = 20;
|
|
else if ( gi->m_pop < 8 ) pts = 10;
|
|
else if ( gi->m_pop < 10 ) pts = 5;
|
|
else if ( gi->m_pop < 15 ) pts = 3;
|
|
else if ( gi->m_pop < 20 ) pts = 2;
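
	// . i.e. the ladder above maps rarity to points: pop 0 -> 1000,
	//   pop 1 -> 500, pop 2 -> 250, pop 3 -> 200, pop 4 -> 150,
	//   pop 5 -> 100, pop 6 -> 20, pop 7 -> 10, pop 8-9 -> 5,
	//   pop 10-14 -> 3, pop 15-19 -> 2 and pop >= 20 keeps the base 1.0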
|
|
|
|
// . special boost if in title, header or anchor tag
|
|
// . the weights class ONLY boosts the first 20 or so words in
|
|
// header tags... how can we fix that??????????????????
|
|
// . TODO: FIX THAT!!!
|
|
//if ( flags & SEC_TITLE ) pts = pts * 6.0/(float)we->m_titleWeight;
|
|
//if ( flags & SEC_HEADER) pts = pts * 4.0/(float)we->m_headerWeight;
|
|
//if ( flags & SEC_A ) pts = pts * 4.0/(float)we->m_linkTextWeight;
|
|
if ( sp ) {
|
|
if ( sp->m_flags & SEC_IN_TITLE ) pts = pts * 6.0;
|
|
if ( sp->m_flags & SEC_IN_HEADER ) pts = pts * 4.0;
|
|
if ( sp->m_tagId == TAG_A ) pts = pts * 4.0;
|
|
}
|
|
|
|
// if for the query 'recreation' you get the phrase "park bench"
|
|
// 100 times and the word "bench" 100 times. the word weight
|
|
	// for "bench" should be very low! Weights.cpp also demotes repeated
|
|
// sentence fragments, etc. it is generally a really handy thing!
|
|
// and i think it already boosts scores for being in the title, etc.
|
|
// IF BEING called from meta tag, weights are NULL!
|
|
// TODO: we need to use the diversity vector here then...
|
|
//if ( we ) {
|
|
// if ( singleWord ) pts *= we->m_ww[i];
|
|
// else pts *= we->m_pw[i];
|
|
//}
|
|
|
|
// add them in
|
|
gi->m_pts += (int32_t)pts;
|
|
|
|
// good to go
|
|
return true;
|
|
}
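
/*
  illustrative call pattern with hypothetical values, not real code: the
  first call for a phrase creates its GigabitInfo slot, a repeat call from
  the same docId only bumps m_count and m_currentDocCount (capped at 5 votes
  per doc), and a call from a new docId bumps m_numDocs:

	addGigabit ( &ht , "free software" , 13 , 1001LL , NULL , false , langId , 0 );
	addGigabit ( &ht , "free software" , 13 , 1001LL , NULL , false , langId , 7 );
	addGigabit ( &ht , "free software" , 13 , 1002LL , NULL , false , langId , 3 );
*/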
|
|
|
|
|
|
/*
|
|
-- this will be a url filter var like "numindexed"
|
|
int32_t *XmlDoc::getSiteSpiderQuota ( ) {
|
|
if ( m_siteSpiderQuotaValid ) return &m_siteSpiderQuota;
|
|
int32_t *siteNumInlinks = getSiteNumInlinks();
|
|
if ( ! siteNumInlinks ) return NULL;
|
|
if ( siteNumInlinks == (int32_t *)-1 ) return (int32_t *)-1;
|
|
// get this fresh each time
|
|
int32_t *rn = getRegExpNum ( -1 );
|
|
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
|
|
// bail early? this happens if we match a banned/filtered rule in
|
|
// the url filters table
|
|
if ( m_indexCode ) return NULL;
|
|
// valid at this point
|
|
m_siteSpiderQuotaValid = true;
|
|
// if no match, or filtered or banned, assume no quota
|
|
if ( *rn == -1 ) m_siteSpiderQuota = -1;
|
|
else m_siteSpiderQuota = cr->m_spiderQuotas[*rn];
|
|
// get the quota, -1 means no limit
|
|
return &m_siteSpiderQuota;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
|
|
Url *XmlDoc::getCurrentUrl ( ) {
|
|
if ( m_currentUrlValid ) return &m_currentUrl;
|
|
// otherwise, get first url
|
|
Url *fu = getFirstUrl();
|
|
if ( ! fu || fu == (void *)-1 ) return (Url *)fu;
|
|
// make that current url
|
|
m_currentUrl.set ( &m_firstUrl , false );
|
|
m_currentUrlValid = true;
|
|
return &m_currentUrl;
|
|
/*
|
|
// need a valid url
|
|
Url *u = getFirstUrl();
|
|
if ( ! u ) return NULL;
|
|
// but use redir if we got that
|
|
Url *r = getRedirUrl();
|
|
if ( r && m_redirUrlValid ) return r;
|
|
return u;
|
|
*/
|
|
}
|
|
|
|
Url *XmlDoc::getFirstUrl() {
|
|
if ( m_firstUrlValid ) return &m_firstUrl;
|
|
// we might have a title rec
|
|
if ( m_setFromTitleRec ) {
|
|
setFirstUrl ( ptr_firstUrl , false );
|
|
m_firstUrlValid = true;
|
|
return &m_firstUrl;
|
|
}
|
|
// must be this otherwise
|
|
if ( ! m_setFromDocId ) { char *xx=NULL;*xx=0; }
|
|
// this must be valid
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (Url *)pod;
|
|
	// shortcut
|
|
XmlDoc *od = *pod;
|
|
// now set it
|
|
setFirstUrl ( od->ptr_firstUrl , false );
|
|
m_firstUrlValid = true;
|
|
return &m_firstUrl;
|
|
}
|
|
|
|
|
|
int64_t XmlDoc::getFirstUrlHash48() {
|
|
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
|
|
// this must work
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( getUseTimeAxis() ) {
|
|
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
|
|
m_firstUrlHash48Valid = true;
|
|
return m_firstUrlHash48;
|
|
}
|
|
|
|
m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL;
|
|
m_firstUrlHash48Valid = true;
|
|
return m_firstUrlHash48;
|
|
}
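
// . e.g. (hypothetical hash) if hash64b() returned 0x123456789abcdef0 the
//   mask above keeps only the low 48 bits, giving
//   m_firstUrlHash48 = 0x000056789abcdef0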
|
|
|
|
int64_t XmlDoc::getFirstUrlHash64() {
|
|
if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
|
|
// this must work
|
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( getUseTimeAxis() ) {
|
|
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
|
|
m_firstUrlHash64Valid = true;
|
|
return m_firstUrlHash64;
|
|
}
|
|
|
|
m_firstUrlHash64 = hash64b ( m_firstUrl.m_url );
|
|
m_firstUrlHash64Valid = true;
|
|
return m_firstUrlHash64;
|
|
}
|
|
|
|
Url **XmlDoc::getLastRedirUrl() {
|
|
|
|
Url **ru = getRedirUrl();
|
|
if ( ! ru || ru == (void *)-1 ) return ru;
|
|
|
|
// m_redirUrlPtr will be NULL in all cases, however, the
|
|
// last redir url we actually got will be set in
|
|
// m_redirUrl.m_url so return that.
|
|
m_lastRedirUrlPtr = &m_redirUrl;
|
|
return &m_lastRedirUrlPtr;
|
|
}
|
|
|
|
// . operates on the latest m_httpReply
|
|
Url **XmlDoc::getRedirUrl() {
|
|
if ( m_redirUrlValid ) return &m_redirUrlPtr;
|
|
|
|
setStatus ( "getting redir url" );
|
|
|
|
// assume no redirect
|
|
m_redirUrlPtr = NULL;
|
|
//ptr_redirUrl = NULL;
|
|
//size_redirUrl = 0;
|
|
// bail on this
|
|
//if ( ! m_checkForRedir ) {
|
|
// m_redirError = 0;
|
|
// m_redirErrorValid = true;
|
|
// return &m_redirUrlPtr;
|
|
//}
|
|
// we might have a title rec
|
|
if ( m_setFromTitleRec ) { char *xx=NULL;*xx=0; }
|
|
|
|
// or recycling content from old title rec
|
|
if ( m_recycleContent ) {
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
m_redirUrlValid = true;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
// get the current http reply, not the final http reply necessarily
|
|
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set a mime on the stack
|
|
HttpMime mime;
|
|
	// shortcut
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
// sanity check
|
|
if ( LEN > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
|
|
// empty reply, no redir
|
|
if ( LEN == 0 ) {
|
|
// bad mime, but i guess valid empty redir url
|
|
m_redirUrlValid = true;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
// return a fake thing. content length is 0.
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// set it. if 'connection refused' then LEN is -1.
|
|
if ( LEN<0 || ! mime.set ( m_httpReply, LEN, getCurrentUrl() ) ) {
|
|
// set this on mime error
|
|
//if ( ! m_indexCode ) m_indexCode = EBADMIME;
|
|
// bad mime, but i guess valid empty redir url
|
|
m_redirUrlValid = true;
|
|
// return nothing, no redirect url was there
|
|
m_redirUrlPtr = NULL;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
// return a fake thing. content length is 0.
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
int32_t httpStatus = mime.getHttpStatus() ;
|
|
|
|
|
|
Url *loc = NULL;
|
|
|
|
// quickly see if we are a robots.txt url originally
|
|
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
|
|
|
|
//
|
|
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
|
|
// if httpStatus is not a redirect
|
|
//
|
|
if ( httpStatus < 300 || httpStatus > 399 ) {
|
|
// ok, crap, i was getting the xml here to get the meta
|
|
// http-equiv refresh tag, but that added an element of
|
|
// recursion that is just too confusing to deal with. so
|
|
// let's just parse out the meta tag by hand
|
|
bool checkMeta = true;
|
|
if ( isRobotsTxt ) checkMeta = false;
|
|
// if we are a doc that consists of a sequence of sub-docs that
|
|
// we are indexing/injecting then don't do this check.
|
|
if ( isContainerDoc() ) checkMeta = false;
|
|
if ( checkMeta ) {
|
|
Url **mrup = getMetaRedirUrl();
|
|
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
|
|
// set it. might be NULL if not there.
|
|
loc = *mrup;
|
|
}
|
|
}
|
|
else
|
|
// get Location: url (the redirect url) from the http mime
|
|
loc = mime.getLocationUrl();
|
|
|
|
// get current url
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (Url **)cu;
|
|
|
|
// this call set size_catIds
|
|
int32_t **pcids = getCatIds();
|
|
if ( ! pcids || pcids == (void *)-1) return (Url **)pcids;
|
|
// get local link info
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// error or blocked
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
|
|
// get remote link info
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
// error or blocked
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (Url **)pinfo2;
|
|
// convenience
|
|
LinkInfo *info2 = *pinfo2;
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// did we send a cookie with our last request?
|
|
bool sentCookieLastTime = false;
|
|
if ( m_redirCookieBuf.length() )
|
|
sentCookieLastTime = true;
|
|
|
|
// get cookie for redirect to fix nyt.com/nytimes.com
|
|
// for gap.com it uses multiple Set-Cookie:\r\n lines so we have
|
|
// to accumulate all of them into a buffer now
|
|
m_redirCookieBuf.reset();
|
|
mime.addCookiesIntoBuffer ( &m_redirCookieBuf );
|
|
m_redirCookieBufValid = true;
|
|
|
|
/*
|
|
char *cookie = mime.getCookie();
|
|
// find end of cookie at the semicolon
|
|
char *s = cookie;
|
|
for ( ; s && *s && *s != ';' ; s++ );
|
|
if ( s && *s == ';' ) {
|
|
// do not include ;
|
|
int32_t clen = s - cookie;
|
|
m_redirCookieBuf.reset();
|
|
m_redirCookieBuf.safeMemcpy ( cookie , clen );
|
|
m_redirCookieBuf.nullTerm();
|
|
m_redirCookieBufValid = true;
|
|
}
|
|
*/
|
|
|
|
// mdw23
|
|
//log("http: reply=%s",m_httpReply);
|
|
|
|
// a hack for removing session ids already in there. for
|
|
// brilliantshopper's bs4 collection and gk0 cluster
|
|
//bool forceRedirect = false;
|
|
if ( size_catIds == 0 &&
|
|
// must not have an actual redirect url in there
|
|
! loc &&
|
|
// must be a valid http status
|
|
httpStatus == 200 &&
|
|
(gb_strcasestr( cu->getUrl(), "sessionid") ||
|
|
gb_strcasestr( cu->getUrl(), "oscsid") ) ) {
|
|
Url *tt = &m_redirUrl;
|
|
tt->set ( cu->getUrl() ,
|
|
cu->getUrlLen() ,
|
|
true , // addwww?
|
|
true ); // strip sessid?
|
|
// if it no longer has the session id, force redirect it
|
|
if ( ! gb_strcasestr( tt->getUrl(), "sessionid") &&
|
|
! gb_strcasestr( tt->getUrl(), "oscsid") ) {
|
|
m_redirUrlValid = true;
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
// TODO: log redir url in spider log output
|
|
//logf(LOG_INFO,"build: %s force redirected to %s",
|
|
// cu->getUrl(),m_redirUrl.getUrl());
|
|
m_redirUrlValid = true;
|
|
ptr_redirUrl = m_redirUrl.m_url;
|
|
size_redirUrl = m_redirUrl.m_ulen+1;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
	// if no location url, then there was no redirect, so return a NULL redir url
|
|
if ( ! loc || loc->m_url[0] == '\0' ) {
|
|
// validate it
|
|
m_redirUrlValid = true;
|
|
// no error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
// and return an empty one
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// this is handy
|
|
//Url tmp;
|
|
|
|
// TODO: make sure we got this logic elsewhere
|
|
// if robots.txt said no, and if we had no link text, then give up
|
|
//if(! *isAllowed && !info1->hasLinkText() && !info2->hasLinkText() ) {
|
|
// m_indexCode = EDOCDISALLOWED;
|
|
|
|
// set our redir url from the mime's Location: field. addWWW=false
|
|
//if ( loc != &tmp ) tmp.set ( loc , false );
|
|
|
|
bool keep = false;
|
|
if ( size_catIds > 0 ) keep = true;
|
|
if ( info1->hasLinkText() ) keep = true;
|
|
if ( info2 && info2->hasLinkText() ) keep = true;
|
|
|
|
// at this point we do not block anywhere
|
|
m_redirUrlValid = true;
|
|
|
|
// store the redir error
|
|
m_redirError = 0;
|
|
m_redirErrorValid = true;
|
|
|
|
// i've seen a "Location: 2010..." bogus url as well, so make sure
|
|
// we got a legit url
|
|
if ( ! loc->getDomain() || loc->getDomainLen() <= 0 ) {
|
|
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
//bool injected = false;
|
|
// get from spider request if there
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
|
|
|
// . if redirect url is nothing new, then bail (infinite loop)
|
|
// . www.xbox.com/SiteRequirements.htm redirects to itself
|
|
// until you send a cookie!!
|
|
// . www.twomileborris.com does the cookie thing, too
|
|
if ( strcmp ( cu->getUrl(), loc->getUrl() ) == 0 ) {
|
|
// try sending the cookie if we got one now and didn't have
|
|
// one for this last request
|
|
if ( ! sentCookieLastTime && m_redirCookieBuf.length() ) {
|
|
m_redirUrl.set ( loc->getUrl() );
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
if ( ! keep ) m_redirError = EDOCREDIRECTSTOSELF;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . don't allow redirects when injecting!
|
|
// . otherwise, we would mfree(m_buf) which would free our
|
|
// injected reply... yet m_injectedReplyLen would still be
|
|
// positive! can you say 'seg fault'?
|
|
// . hmmm... seems to have worked though
|
|
if ( cr->m_recycleContent || m_recycleContent ) { // || injected
|
|
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// . if we followed too many then bail
|
|
// . www.motorolamobility.com www.outlook.com ... failed when we
|
|
// had >= 4 here
|
|
if ( ++m_numRedirects >= 10 ) {
|
|
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// sometimes idiots don't supply us with a Location: mime
|
|
if ( loc->getUrlLen() == 0 ) {
|
|
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// . protocol of url must be http or https
|
|
// . we had one url redirect to an ihttp:// protocol and caused
|
|
// spider to core dump when it saw that SpiderRequest record
|
|
char *proto = loc->getScheme();
|
|
if ( strncmp(proto,"http://" ,7) && strncmp(proto,"https://",8) ) {
|
|
m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// do not allow redirects to evil-G or bing
|
|
//if ( strstr(loc->getUrl(),".google.com/") ||
|
|
// strstr(loc->getUrl(),".bing.com/") ) {
|
|
// m_redirError = EDOCEVILREDIRECT;
|
|
// return &m_redirUrlPtr;
|
|
//}
|
|
// log a msg
|
|
if ( g_conf.m_logSpideredUrls )
|
|
logf(LOG_INFO,"build: %s redirected to %s",
|
|
cu->getUrl(),loc->getUrl());
|
|
|
|
// if not same Domain, it is not a simplified redirect
|
|
bool sameDom = true;
|
|
int32_t dlen = loc->getDomainLen();
|
|
if ( cu->getDomainLen() != dlen ) sameDom=false;
|
|
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
|
|
if ( ! sameDom ) {
|
|
m_redirectFlag = true;
|
|
m_redirUrl.set ( loc , false ); // addWWW=false
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
ptr_redirUrl = m_redirUrl.m_url;
|
|
size_redirUrl = m_redirUrl.m_ulen+1;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
// if redirecting to the same domain, then do not add "www.".
|
|
// this way we can take care of slashdot.org, etc.
|
|
//bool addwww = false;
|
|
// but never modify if in dmoz, keep it pure
|
|
//if ( size_catIds > 0 ) addwww = false;
|
|
// debug msg
|
|
//if ( strcmp(m_redirUrl.getUrl(),url->getUrl())== 0 )
|
|
// log("Redirect error: same url");
|
|
//bool stripSessId = (size_catIds == 0);
|
|
// . reset m_redirUrl now (do not addWWW for slashdot.org, etc)
|
|
// . we now add "www." UNLESS it's a redirect from the same
|
|
// domain or firstUrl is in catdb
|
|
//tmp.set( loc->getUrl(),loc->getUrlLen(),addwww,stripSessId);
|
|
/*
|
|
// get this
|
|
bool sameHostLinks = false;
|
|
if ( *pi >= 0 ) sameHostLinks =cr->m_pq_spiderSameHostnameLinks[*pi];
|
|
// get first url ever
|
|
Url *f = getFirstUrl();
|
|
// . for same host links, addwww for comparing
|
|
// . so if we are doing google.com and it redirects to
|
|
// www.google.com then we will allow that... and vice versa
|
|
if ( sameHostLinks ) {
|
|
Url u1;
|
|
Url u2;
|
|
u1.set ( loc->getUrl () , loc->getUrlLen(), true ); // addwww?
|
|
u2.set ( f->getUrl() , f->getUrlLen () , true ); // addwww?
|
|
// host must match if we are restricted to a particular host
|
|
if ( u1.getHostLen() != u2.getHostLen() ||
|
|
strncmp ( u1.getHost() , u2.getHost() ,
|
|
u1.getHostLen () ) != 0 ) {
|
|
m_redirError = EDOCBADREDIRECTURL;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
}
|
|
*/
|
|
// get first url ever
|
|
Url *f = getFirstUrl();
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// set this to true if the redirected urls is much preferred
|
|
bool simplifiedRedir = false;
|
|
// . if it redirected to a simpler url then stop spidering now
|
|
// and add the simpler url to the spider queue
|
|
// . by simpler, i mean one w/ fewer path components
|
|
// . or one with a www for hostname
|
|
// . or could be same as firstUrl but with a / appended
|
|
char *r = loc->getUrl();
|
|
char *u = f->getUrl();
|
|
int32_t rlen = loc->getUrlLen();
|
|
int32_t ulen = f->getUrlLen();
|
|
	// simpler if new path depth is shorter
|
|
if ( loc->getPathDepth (true) < f->getPathDepth (true) )
|
|
simplifiedRedir = true;
|
|
// simpler if old has cgi and new does not
|
|
if ( f->isCgi() && ! loc->isCgi() )
|
|
simplifiedRedir = true;
|
|
// if we're a dmoz page, don't do this, unless just a / case,no
|
|
if ( size_catIds > 0 )
|
|
simplifiedRedir = false;
|
|
// simpler if new one is same as old but has a '/' at the end
|
|
if ( rlen == ulen+1 && r[rlen-1]=='/' && strncmp(r,u,ulen)==0)
|
|
simplifiedRedir = true;
|
|
// . if new url does not have semicolon but old one does
|
|
// . http://news.yahoo.com/i/738;_ylt=AoL4eFRYKEdXbfDh6W2cF
|
|
// redirected to http://news.yahoo.com/i/738
|
|
if ( strchr (u,';') && ! strchr (r,';') )
|
|
simplifiedRedir = true;
|
|
	// simpler if new host is www and old is not
|
|
if ( loc->isHostWWW() && ! f->isHostWWW() )
|
|
simplifiedRedir = true;
|
|
// if redirect is to different domain, set simplified
|
|
// this helps locks from bunching on one domain
|
|
if ( loc->getDomainLen()!=f->getDomainLen() ||
|
|
strncasecmp ( loc->getDomain(),
|
|
f->getDomain(),
|
|
loc->getDomainLen() ) != 0 )
|
|
// crap, but www.hotmail.com redirects to live.msn.com
|
|
// login page ... so add this check here
|
|
if ( ! f->isRoot() )
|
|
simplifiedRedir = true;
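
	// . examples (hypothetical urls) of redirects the checks above treat
	//   as simplified:
	//     http://example.com/a/b/c.html -> http://example.com/a/      (fewer path components)
	//     http://example.com/page?id=3  -> http://example.com/page    (cgi dropped)
	//     http://example.com/x          -> http://www.example.com/x   (www added)
	//     http://abc.com/y              -> http://xyz.com/y           (different domain, non-root first url)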
|
|
|
|
bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
|
|
|
|
// follow redirects if injecting so we do not return
|
|
// EDOCSIMPLIFIEDREDIR
|
|
if ( getIsInjecting ( ) )
|
|
allowSimplifiedRedirs = true;
|
|
|
|
// or if disabled then follow the redirect
|
|
if ( ! cr->m_useSimplifiedRedirects )
|
|
allowSimplifiedRedirs = true;
|
|
|
|
// . if the redir url is simpler, but has no hostname we
|
|
// prepend a "www." to it
|
|
	// . this should keep www.russ.ru and russ.ru from being
|
|
// in the index at the same time and causing url: collisions
|
|
/*
|
|
if ( size_catIds == 0 &&
|
|
simplifiedRedir &&
|
|
loc->getDomainLen() == loc->getHostLen () )
|
|
loc->set (loc->getUrl(),
|
|
loc->getUrlLen(),
|
|
true, //false, addwww?
|
|
stripSessId );
|
|
*/
|
|
// if not allow, do not do them... except for the two below
|
|
//if ( ! m_useSimplifiedRedirects || m_isDirColl )
|
|
// simplifiedRedir = false;
|
|
|
|
// special hack for nytimes.com. do not consider simplified redirs
|
|
// because it uses a cookie along with redirs to get to the final
|
|
// page.
|
|
char *dom2 = m_firstUrl.getDomain();
|
|
int32_t dlen2 = m_firstUrl.getDomainLen();
|
|
if ( dlen2 == 11 && strncmp(dom2,"nytimes.com",dlen2)==0 )
|
|
allowSimplifiedRedirs = true;
|
|
// same for bananarepublic.gap.com ?
|
|
// if ( dlen2 == 7 && strncmp(dom2,"gap.com",dlen2)==0 )
|
|
// allowSimplifiedRedirs = true;
|
|
|
|
// if redirect is setting cookies we have to follow the redirect
|
|
// all the way through so we can stop now.
|
|
if ( m_redirCookieBufValid && m_redirCookieBuf.getLength() )
|
|
allowSimplifiedRedirs = true;
|
|
|
|
// . don't bother indexing this url if the redir is better
|
|
// . 301 means moved PERMANENTLY...
|
|
// . many people use 301 on their root pages though, so treat
|
|
// it like a temporary redirect, like exclusivelyequine.com
|
|
if ( simplifiedRedir && ! allowSimplifiedRedirs &&
|
|
// for custom BULK clients don't like this i guess
|
|
// AND for custom crawl it was messing up the processing
|
|
// url format for a nytimes blog subsite which was redirecting
|
|
// to the proper nytimes.com site...
|
|
// ! cr->m_isCustomCrawl ) {
|
|
// no, we need this for custom crawls because otherwise we
|
|
// get too many dups in the index. so for nyt we need something
|
|
// else
|
|
cr->m_isCustomCrawl != 2 ) {
|
|
// returns false if blocked, true otherwise
|
|
//return addSimplifiedRedirect();
|
|
m_redirError = EDOCSIMPLIFIEDREDIR;
|
|
// set this because getLinks() treats this redirUrl
|
|
// as a link now, it will add a SpiderRequest for it:
|
|
m_redirUrl.set ( loc , false ); // addWWW=false
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
// mdw: let this path through so contactXmlDoc gets a proper
|
|
// redirect that we can follow. for the base xml doc at
|
|
// least the m_indexCode will be set
|
|
return &m_redirUrlPtr;
|
|
}
|
|
// good to go
|
|
m_redirectFlag = true;
|
|
m_redirUrl.set ( loc , false ); // addWWW=false
|
|
m_redirUrlPtr = &m_redirUrl;
|
|
ptr_redirUrl = m_redirUrl.m_url;
|
|
size_redirUrl = m_redirUrl.m_ulen+1;
|
|
return &m_redirUrlPtr;
|
|
}
|
|
|
|
int32_t *XmlDoc::getFirstIndexedDate ( ) {
|
|
if ( m_firstIndexedDateValid ) return (int32_t *)&m_firstIndexedDate;
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
|
|
// valid
|
|
m_firstIndexedDateValid = true;
|
|
// must be downloaded
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// assume now is the first time
|
|
m_firstIndexedDate = getSpideredTime();//m_spideredTime;
|
|
// inherit from our old title rec
|
|
if ( *od ) m_firstIndexedDate = (*od)->m_firstIndexedDate;
|
|
// return it
|
|
return (int32_t *)&m_firstIndexedDate;
|
|
}
|
|
|
|
int32_t *XmlDoc::getOutlinksAddedDate ( ) {
|
|
if ( m_outlinksAddedDateValid ) return (int32_t *)&m_outlinksAddedDate;
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
|
|
// valid
|
|
m_outlinksAddedDateValid = true;
|
|
// must be downloaded
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// assume we are doing it now
|
|
m_outlinksAddedDate = getSpideredTime();//m_spideredTime;
|
|
// get that
|
|
if ( *od ) m_outlinksAddedDate = (*od)->m_outlinksAddedDate;
|
|
// return it
|
|
return (int32_t *)&m_outlinksAddedDate;
|
|
}
|
|
|
|
/*
|
|
int32_t *XmlDoc::getNumBannedOutlinks ( ) {
|
|
if ( m_numBannedOutlinksValid ) return &m_numBannedOutlinks;
|
|
|
|
setStatus ( "getting num banned outlinks" );
|
|
|
|
// get the outlinks
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
|
// count em
|
|
int32_t n = links->getNumLinks();
|
|
// reset
|
|
m_numBannedOutlinks = 0;
|
|
// one vote per domain hash table
|
|
char buf[20000];
|
|
HashTableX ht; ht.set ( 4 , 0 , -1 , buf , 20000 ,false,m_niceness);
|
|
// loop through them
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the link
|
|
char *u = links->getLinkPtr(i);
|
|
// get domain of the link
|
|
int32_t dlen; char *dom = getDomFast ( u , &dlen , false );
|
|
// skip if bad domain
|
|
if ( ! dom || dlen <= 0 ) continue;
|
|
// get domHash
|
|
int32_t h = hash32 ( dom , dlen );
|
|
// one check per domain
|
|
if ( ht.getSlot ( &h ) >= 0 ) continue;
|
|
// add it, return NULL on error, g_errno should be set
|
|
if ( ! ht.addKey ( &h ) ) return NULL;
|
|
// . loop over all regular expression in the url filters table
|
|
// . stop at first regular expression it matches
|
|
int32_t *rn = getRegExpNum2 ( i );
|
|
// need to wait for a callback at this point
|
|
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
|
|
// skip if no match in url filters table
|
|
if ( *rn == -1 ) continue;
|
|
// get spider priority
|
|
int32_t pr = cr->m_spiderPriorities[*rn];
|
|
// skip if not banned
|
|
if ( pr != -2 ) continue;
|
|
// count it
|
|
m_numBannedOutlinks++;
|
|
}
|
|
// all done
|
|
m_numBannedOutlinksValid = true;
|
|
// convert this too!
|
|
//m_numBannedOutlinks8 = score32to8 ( m_numBannedOutlinks );
|
|
// sanity check on score32to8()
|
|
//if(m_numBannedOutlinks8>0&&!m_numBannedOutlinks){char*xx=NULL;*xx=0;}
|
|
|
|
return &m_numBannedOutlinks;
|
|
}
|
|
*/
|
|
|
|
uint16_t *XmlDoc::getCountryId ( ) {
|
|
if ( m_countryIdValid ) return &m_countryId;
|
|
|
|
setStatus ( "getting country id" );
|
|
|
|
// get it
|
|
CatRec *cat = getCatRec ();
|
|
if ( ! cat || cat == (CatRec *)-1) return (uint16_t *)cat;
|
|
// MDW: i limit this to 10 to save stack space!
|
|
Url *u = getCurrentUrl();
|
|
if ( ! u || u == (void *)-1) return (uint16_t *)u;
|
|
// use the url's tld to guess the country
|
|
uint16_t country = g_langId.guessCountryTLD ( u->getUrl ( ) );
|
|
// . 0 means no country i guess. try dmoz next.
|
|
// . limit to 10 of them
|
|
int32_t nc = cat->m_numCatids;
|
|
for ( int32_t i = 0; ! country && i < nc && i < 10 ; i++) {
|
|
int32_t catid = cat->m_catids[i];
|
|
country = g_countryCode.getCountryFromDMOZ ( catid );
|
|
}
|
|
m_countryIdValid = true;
|
|
m_countryId = country;
|
|
return &m_countryId;
|
|
}
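
// . e.g. (hypothetical url) http://example.de/foo gets its country guessed
//   straight from the ".de" tld by guessCountryTLD(); a url on a generic tld
//   that guesses to 0 falls through to the first of its (up to 10) dmoz
//   catids that getCountryFromDMOZ() maps to a country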
|
|
|
|
|
|
/*
|
|
XmlDoc *XmlDoc::getOldDoc ( ) {
|
|
if ( m_oldDocValid ) return &m_oldDoc;
|
|
// get current url
|
|
Url *u = getCurrentUrl();
|
|
// set its url otherwise
|
|
m_oldDoc.setFirstUrl ( u , false );
|
|
// get the old title rec
|
|
char *ret = getOldTitleRec();
|
|
if ( ! ret || ret == (char *)-1 ) return (XmlDoc *)ret;
|
|
// all done
|
|
m_oldDocValid = true;
|
|
// return it
|
|
return m_oldDoc;
|
|
}
|
|
*/
|
|
|
|
uint8_t *XmlDoc::getRootLangId ( ) {
|
|
|
|
// return it if we got it
|
|
if ( m_rootLangIdValid ) return &m_rootLangId;
|
|
// note it
|
|
setStatus ( "getting root lang id from tagdb");
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
|
|
// sanity check - should not be called on a root url
|
|
if ( *isRoot ) {
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 )
|
|
return (uint8_t *) langId;
|
|
m_rootLangId = *langId;
|
|
m_rootLangIdValid = true;
|
|
return &m_rootLangId;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
// get the tag rec
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
|
|
// just use one. there may be multiple ones!
|
|
Tag *tag = gr->getTag("rootlang");
|
|
// if there use that
|
|
if ( ! tag ) {
|
|
// . get the root doc
|
|
// . allow for a one hour cache of the titleRec
|
|
XmlDoc **prd = getRootXmlDoc( 3600 );
|
|
if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
|
|
		// shortcut
|
|
XmlDoc *rd = *prd;
|
|
// . if no root doc, then assume language unknown
|
|
// . this happens if we are injecting because we do not want
|
|
// to download the root page for speed purposes
|
|
if ( ! rd ) {
|
|
m_rootLangId = langUnknown;
|
|
m_rootLangIdValid = true;
|
|
return &m_rootLangId;
|
|
}
|
|
// . update tagdb rec
|
|
// . on root download error use language "xx" (unknown) to
|
|
// avoid hammering the root page
|
|
//bool *status = rd->updateRootLangId ();
|
|
//if (! status || status==(void *)-1) return (uint8_t *)status;
|
|
// update our tag rec now
|
|
//Tag *tt = rd->m_newTagRec.getTag("rootlang");
|
|
// must be there
|
|
//if ( ! tt ) { char *xx=NULL;*xx=0; }
|
|
// add it for us
|
|
//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
|
|
// get it
|
|
uint8_t *rl = rd->getLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
|
|
// must be legit now!
|
|
if ( ! rd->m_langIdValid ) { char *xx=NULL;*xx=0;}
|
|
// now validate our stuff
|
|
m_rootLangIdValid = true;
|
|
//m_rootLangIdScore = rd->m_langIdScore;
|
|
m_rootLangId = rd->m_langId;
|
|
return &m_rootLangId;
|
|
}
|
|
// sanity check ( must be like "en,50\0" or could be
|
|
	// "en_US,50\0" or "zh_cn,50" )
|
|
if ( tag->getTagDataSize() > 6 ) { char *xx=NULL;*xx=0; }
|
|
// point to 2 character language abbreviation
|
|
char *abbr = tag->getTagData();
|
|
/*
|
|
// find comma
|
|
char *comma = strchr(abbr,',' );
|
|
// sanity check
|
|
if ( ! comma ) { char *xx=NULL;*xx=0; }
|
|
// tmp NULL
|
|
*comma = '\0';
|
|
*/
|
|
// map it to an id
|
|
uint8_t langId = getLangIdFromAbbr( abbr );
|
|
/*
|
|
// put it back
|
|
*comma = ',';
|
|
// get score
|
|
int32_t score = atol(comma+1);
|
|
// sanity check
|
|
if ( score < 0 || score > 100 ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
// set that up
|
|
m_rootLangId = langId;
|
|
//m_rootLangIdScore = score;
|
|
m_rootLangIdValid = true;
|
|
return &m_rootLangId;
|
|
}
|
|
|
|
XmlDoc **XmlDoc::getOldXmlDoc ( ) {
|
|
|
|
if ( m_oldDocValid ) return &m_oldDoc;
|
|
|
|
// note it
|
|
setStatus ( "getting old xml doc");
|
|
|
|
// if we are set from a title rec, we are the old doc
|
|
if ( m_setFromTitleRec ) {
|
|
m_oldDocValid = true;
|
|
m_oldDoc = NULL;//this;
|
|
return &m_oldDoc;
|
|
}
|
|
|
|
// . cache age is 0... super fresh
|
|
// . returns NULL w/ g_errno if not found unless isIndexed is false
|
|
// and valid, and it is not valid for pagereindexes.
|
|
char **otr = getOldTitleRec ( );
|
|
if ( ! otr || otr == (char **)-1 ) return (XmlDoc **)otr;
|
|
// if no title rec, return ptr to a null
|
|
m_oldDoc = NULL;
|
|
if ( ! *otr ) { m_oldDocValid = true; return &m_oldDoc; }
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if provided title rec matches our docid but not uh48 then there
|
|
// was a docid collision and we should null out our title rec
|
|
// and return with an error and no index this puppy!
|
|
// crap, we can't call getFirstUrl() because it might not be
|
|
// valid if we are a docid based doc and THIS function was called
|
|
// from getFirstUrl() -- we end up in a recursive loop.
|
|
if ( ! m_setFromDocId ) {
|
|
//int64_t uh48 = getFirstUrl()->getUrlHash48();
|
|
int64_t uh48 = getFirstUrlHash48();
|
|
int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr );
|
|
if ( uh48 != tuh48 ) {
|
|
log("xmldoc: docid collision uh48 mismatch. cannot "
|
|
"index "
|
|
"%s",getFirstUrl()->getUrl() );
|
|
g_errno = EDOCIDCOLLISION;
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// . if *otr is NULL that means not found
|
|
// . return a NULL old XmlDoc in that case as well?
|
|
// . make a new one
|
|
// . this will uncompress it and set ourselves!
|
|
try { m_oldDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_oldDoc , sizeof(XmlDoc),"xmldoc1");
|
|
// debug the mem leak
|
|
// log("xmldoc: xmldoc1=%"PTRFMT" u=%s"
|
|
// ,(PTRTYPE)m_oldDoc
|
|
// ,m_firstUrl.getUrl());
|
|
// if title rec is corrupted data uncompress will fail and this
|
|
// will return false!
|
|
if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
|
|
m_oldTitleRecSize , // maxSize
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness ) ) {
|
|
log("build: failed to set old doc for %s",m_firstUrl.m_url);
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
int32_t saved = g_errno;
|
|
// ok, fix the memleak here
|
|
mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
|
|
delete ( m_oldDoc );
|
|
m_oldDocExistedButHadError = true;
|
|
//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
|
|
m_oldDoc = NULL;
|
|
g_errno = saved;
|
|
// MDW: i removed this on 2/8/2016 again so the code below
|
|
// would execute.
|
|
//return NULL; //mdwmdwmdw
|
|
// if it is data corruption, just assume empty so
|
|
// we don't stop spidering a url because of this. so we'll
|
|
// think this is the first time indexing it. otherwise
|
|
// we get "Bad cached document" in the logs and the
|
|
// SpiderReply and it never gets re-spidered because it is
|
|
// not a 'temporary' error according to the url filters.
|
|
log("build: treating corrupted titlerec as not found");
|
|
g_errno = 0;
|
|
m_oldDoc = NULL;
|
|
m_oldDocValid = true;
|
|
return &m_oldDoc;
|
|
}
|
|
m_oldDocValid = true;
|
|
// share our masterloop and state!
|
|
m_oldDoc->m_masterLoop = m_masterLoop;
|
|
m_oldDoc->m_masterState = m_masterState;
|
|
return &m_oldDoc;
|
|
}
|
|
|
|
void XmlDoc::nukeDoc ( XmlDoc *nd ) {
|
|
// skip if empty
|
|
if ( ! nd ) return;
|
|
// debug the mem leak
|
|
// if ( nd == m_oldDoc )
|
|
// log("xmldoc: nuke xmldoc1=%"PTRFMT" u=%s this=%"PTRFMT""
|
|
// ,(PTRTYPE)m_oldDoc
|
|
// ,m_firstUrl.getUrl()
|
|
// ,(PTRTYPE)this
|
|
// );
|
|
// do not nuke yerself!
|
|
if ( nd == this ) return;
|
|
// or root doc!
|
|
//if ( nd == m_rootDoc ) return;
|
|
// nuke it
|
|
mdelete ( nd , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( nd );
|
|
// invalidate
|
|
if ( nd == m_extraDoc ) {
|
|
m_extraDocValid = false;
|
|
m_extraDoc = NULL;
|
|
}
|
|
if ( nd == m_rootDoc ) {
|
|
m_rootDocValid = false;
|
|
m_rootDoc = NULL;
|
|
}
|
|
if ( nd == m_oldDoc ) {
|
|
m_oldDocValid = false;
|
|
m_oldDoc = NULL;
|
|
}
|
|
if ( nd == m_ahrefsDoc ) {
|
|
m_ahrefsDocValid = false;
|
|
m_ahrefsDoc = NULL;
|
|
}
|
|
}
|
|
|
|
static LinkInfo s_dummy;
|
|
|
|
XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
|
|
if ( m_extraDocValid ) return &m_extraDoc;
|
|
// note that
|
|
setStatus ( "getting new doc" );
|
|
// we need a valid first ip first!
|
|
//int32_t *pfip = getFirstIp();
|
|
//if ( ! pfip || pfip == (void *)-1 ) return (XmlDoc **)pfip;
|
|
// must be NULL
|
|
if ( m_extraDoc ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( ! u || ! u[0] ) { char *xx=NULL;*xx=0; }//return &m_extraDoc;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// . if *otr is NULL that means not found
|
|
// . return a NULL old XmlDoc in that case as well?
|
|
// . make a new one
|
|
// . this will uncompress it and set ourselves!
|
|
try { m_extraDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_extraDoc , sizeof(XmlDoc),"xmldoc2");
|
|
|
|
// . if we did not have it in titledb then download it!
|
|
// . or if titleRec was too old!
|
|
|
|
// a spider rec for the extra doc to use
|
|
SpiderRequest sreq;
|
|
// clear it
|
|
sreq.reset();
|
|
// spider the url "u"
|
|
strcpy ( sreq.m_url , u );
|
|
// inherit page parser
|
|
sreq.m_isPageParser = getIsPageParser();
|
|
// set the data size right
|
|
sreq.setDataSize();
|
|
// . prepare to download it, set it up
|
|
// . returns false and sets g_errno on error
|
|
if ( ! m_extraDoc->set4 ( &sreq ,
|
|
NULL , // doledbkey ptr
|
|
cr->m_coll ,
|
|
NULL , // SafeBuf
|
|
m_niceness ))
|
|
return NULL;
|
|
|
|
// share our masterloop and state!
|
|
m_extraDoc->m_masterLoop = m_masterLoop;
|
|
m_extraDoc->m_masterState = m_masterState;
|
|
|
|
// carry this forward always!
|
|
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
|
|
|
|
// disable spam check because that is not necessary for this doc!
|
|
m_extraDoc->m_spamCheckDisabled = true;
|
|
|
|
// tell msg13 to get this from its robots.txt cache if it can. it also
|
|
// keeps a separate html page cache for the root pages, etc. in case
|
|
m_extraDoc->m_maxCacheAge = maxCacheAge;
|
|
|
|
// a dummy thing
|
|
s_dummy.m_numStoredInlinks = 0;
|
|
s_dummy.m_numGoodInlinks = 0;
|
|
|
|
// we indirectly call m_extraDoc->getHttpReply() which calls
|
|
// m_extraDoc->getRedirectUrl(), which checks the linkInfo and
|
|
// dmoz catids of the original url to see if we should set m_indexCode
|
|
// to something bad or not. to avoid these unnecessary lookups we
|
|
// set these to NULL and validate them
|
|
m_extraDoc->ptr_catIds = NULL;
|
|
m_extraDoc->size_catIds = 0;
|
|
m_extraDoc->m_catIdsValid = true;
|
|
m_extraDoc->ptr_linkInfo1 = &s_dummy;
|
|
m_extraDoc->size_linkInfo1 = 0;
|
|
m_extraDoc->m_linkInfo1Valid = true;
|
|
m_extraDoc->ptr_linkInfo2 = &s_dummy;
|
|
m_extraDoc->size_linkInfo2 = 0;
|
|
m_extraDoc->m_linkInfo2Valid = true;
|
|
m_extraDoc->m_urlFilterNumValid = true;
|
|
m_extraDoc->m_urlFilterNum = 0;
|
|
// for redirects
|
|
m_extraDoc->m_allowSimplifiedRedirs = true;
|
|
// always forward the http download request so that Msg13.cpp's
|
|
// handleRequest13() can avoid this same page
|
|
// from being downloaded at the same time. also, if we are robots.txt
|
|
// this allows us to use the same cache since we select the host we
|
|
// forward to based on ip address.
|
|
m_extraDoc->m_forwardDownloadRequest = true;
|
|
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
|
|
m_extraDoc->m_isChildDoc = true;
|
|
m_extraDoc->m_parentDocPtr = this;
|
|
// debug it
|
|
//g_doc = this;
|
|
|
|
// and inherit test dir so getTestDir() doesn't core on us
|
|
bool isPageParser = getIsPageParser();
|
|
m_extraDoc->m_isPageParser = isPageParser;
|
|
m_extraDoc->m_isPageParserValid = true;
|
|
|
|
// without this we send all the msg13 requests to host #3! because
|
|
// Msg13 uses it to determine which host should handle the request
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
m_extraDoc->m_firstIp = m_firstIp;
|
|
m_extraDoc->m_firstIpValid = true;
|
|
|
|
// i guess we are valid now
|
|
m_extraDocValid = true;
|
|
return &m_extraDoc;
|
|
}
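// A note on the return convention used by getExtraDoc(), getRootXmlDoc(),
// getOldTitleRec() and the other getters in this file: NULL means error
// (g_errno is set), a pointer value of -1 means the call blocked and
// m_masterLoop will be re-invoked later, anything else is usable.
// A minimal caller sketch (url and cache age are hypothetical values):
//
//   XmlDoc **ped = getExtraDoc ( "http://example.com/robots.txt" , 3600 );
//   if ( ! ped )                return true;  // error, g_errno is set
//   if ( ped == (XmlDoc **)-1 ) return false; // blocked, callback re-calls us
//   XmlDoc *ed = *ped;                        // ready to use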
|
|
|
|
bool XmlDoc::getIsPageParser ( ) {
|
|
if ( m_isPageParserValid ) return m_isPageParser;
|
|
// assume not
|
|
m_isPageParser = false;
|
|
// and set otherwise
|
|
if ( m_sreqValid && m_sreq.m_isPageParser ) m_isPageParser = true;
|
|
// and validate
|
|
m_isPageParserValid = true;
|
|
return m_isPageParser;
|
|
}
|
|
|
|
XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
|
|
if ( m_rootDocValid ) return &m_rootDoc;
|
|
// help avoid mem leaks
|
|
if ( m_rootDoc ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
setStatus ( "getting root doc");
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (XmlDoc **)isRoot;
|
|
// if we are root use us!!!!!
|
|
if ( *isRoot ) {
|
|
m_rootDoc = this;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
// get our site root
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (XmlDoc **)mysite;
|
|
// otherwise, we gotta get it!
|
|
char **rtr = getRootTitleRec ( );
|
|
if ( ! rtr || rtr == (char **)-1 ) return (XmlDoc **)rtr;
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (XmlDoc **)cu;
|
|
// if no title rec, return ptr to a null
|
|
//m_rootDoc = NULL;
|
|
//if ( ! *rtr ) {
|
|
// // damn, not in titledb, i guess download it then
|
|
// m_rootDocValid = true; return &m_rootDoc; }
|
|
// note it
|
|
setStatus ( "getting root doc");
|
|
|
|
// to keep injections fast, do not download the root page!
|
|
if ( ! *rtr && m_contentInjected ) {
|
|
// assume none
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
// likewise, if doing a rebuild
|
|
if ( ! *rtr && m_useSecondaryRdbs ) {
|
|
// assume none
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
// or recycling content like for query reindex. keep it fast.
|
|
if ( ! *rtr && m_recycleContent ) {
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
|
|
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . if *otr is NULL that means not found
|
|
// . return a NULL root XmlDoc in that case as well?
|
|
// . make a new one
|
|
// . this will uncompress it and set ourselves!
|
|
try { m_rootDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
|
|
// if we had the title rec, set from that
|
|
if ( *rtr ) {
|
|
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
|
|
m_rootTitleRecSize , // maxSize ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness ) ) {
|
|
// it was corrupted... delete this
|
|
// possibly printed
|
|
// " uncompress uncompressed size=..." bad uncompress
|
|
log("build: rootdoc set2 failed");
|
|
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( m_rootDoc );
|
|
// call it empty for now, we don't want to return
|
|
// NULL with g_errno set because it could stop
|
|
// the whole indexing pipeline
|
|
m_rootDoc = NULL;
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
//return NULL;
|
|
}
|
|
}
|
|
// . otherwise, set the url and download it on demand
|
|
// . this junk copied from the contactDoc->* stuff below
|
|
else {
|
|
// a spider rec for the contact doc
|
|
SpiderRequest sreq;
|
|
// clear it
|
|
sreq.reset();
|
|
// spider the url "u"
|
|
char *p = sreq.m_url;
|
|
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
|
|
else p += sprintf ( p , "http://" );
|
|
strcpy ( p , mysite );
|
|
// set this
|
|
if ( m_sreqValid ) {
|
|
// this will avoid it adding to tagdb!
|
|
sreq.m_isPageParser = m_sreq.m_isPageParser;
|
|
}
|
|
// reset the data size
|
|
sreq.setDataSize ();
|
|
// . prepare to download it, set it up
|
|
// . returns false and sets g_errno on error
|
|
if ( ! m_rootDoc->set4 ( &sreq ,
|
|
NULL , // doledbkey ptr
|
|
cr->m_coll ,
|
|
NULL , // SafeBuf
|
|
m_niceness )) {
|
|
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( m_rootDoc );
|
|
m_rootDoc = NULL;
|
|
return NULL;
|
|
}
|
|
// do not throttle it!
|
|
//m_rootDoc->m_throttleDownload = false;
|
|
// . do not do robots check for it
|
|
// . no, we must do it, to avoid triggering a bot trap & getting banned
|
|
//m_rootDoc->m_isAllowed = m_isAllowed;
|
|
//m_rootDoc->m_isAllowedValid = true;
|
|
}
|
|
|
|
// share our masterloop and state!
|
|
m_rootDoc->m_masterLoop = m_masterLoop;
|
|
m_rootDoc->m_masterState = m_masterState;
|
|
|
|
// msg13 caches the pages it downloads
|
|
m_rootDoc->m_maxCacheAge = maxCacheAge;
|
|
|
|
// like m_contactDoc we avoid unnecessary lookups in call to
|
|
// getRedirUrl() by validating these empty members
|
|
m_rootDoc->ptr_catIds = NULL;
|
|
m_rootDoc->size_catIds = 0;
|
|
m_rootDoc->m_catIdsValid = true;
|
|
m_rootDoc->ptr_linkInfo1 = &s_dummy;
|
|
m_rootDoc->size_linkInfo1 = 0;
|
|
m_rootDoc->m_linkInfo1Valid = true;
|
|
m_rootDoc->ptr_linkInfo2 = &s_dummy;
|
|
m_rootDoc->size_linkInfo2 = 0;
|
|
m_rootDoc->m_linkInfo2Valid = true;
|
|
m_rootDoc->m_urlFilterNumValid = true;
|
|
m_rootDoc->m_urlFilterNum = 0;
|
|
// for redirects
|
|
m_rootDoc->m_allowSimplifiedRedirs = true;
|
|
// always forward the http download request so that Msg13.cpp's
|
|
// handleRequest13() can avoid the same root page or contact page
|
|
// from being downloaded at the same time. also, if we are robots.txt
|
|
// this allows us to use the same cache since we select the host we
|
|
// forward to based on ip address.
|
|
m_rootDoc->m_forwardDownloadRequest = true;
|
|
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
|
|
m_rootDoc->m_isChildDoc = true;
|
|
m_rootDoc->m_parentDocPtr = this;
|
|
|
|
// validate it
|
|
m_rootDocValid = true;
|
|
return &m_rootDoc;
|
|
}
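// Summary of the two construction paths above (descriptive comment only):
// if the root's title rec was found in titledb we call set2() to decompress
// and reconstruct the root doc from it; otherwise we build a bare
// SpiderRequest for "http://<mysite>/" (or https) and call set4() so the
// root page is downloaded on demand. In both cases the child-doc flags,
// dummy link info and cache age are applied afterward so the root doc
// never triggers extra tagdb/linkdb lookups of its own.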
|
|
|
|
/*
|
|
// no longer access Revdb to get the old metalist, now re-compute
|
|
RdbList *XmlDoc::getOldMetaList ( ) {
|
|
// if valid return that
|
|
if ( m_oldMetaListValid ) return &m_oldMetaList;
|
|
// update status msg
|
|
setStatus ( "getting old meta list");
|
|
// load the old title rec
|
|
XmlDoc **odp = getOldXmlDoc( );
|
|
if ( ! odp || odp == (XmlDoc **)-1 ) return (RdbList *)odp;
|
|
XmlDoc *od = *odp;
|
|
// empty old doc?
|
|
if ( ! od ) {
|
|
m_oldMetaList.reset();
|
|
m_oldMetaListValid = true;
|
|
return &m_oldMetaList;
|
|
}
|
|
// and use that. it has m_setFromTitleRec set to true.
|
|
char *old = od->getMetaList();
|
|
if ( ! old || old == (void *)-1 ) return (RdbList *)old;
|
|
// set it
|
|
m_oldMetaList.m_list = od->m_metaList; // old;
|
|
m_oldMetaList.m_listSize = od->m_metaListSize;
|
|
m_oldMetaList.m_ownData = false;
|
|
// assign it
|
|
m_oldMetaListValid = true;
|
|
return &m_oldMetaList;
|
|
}
|
|
*/
|
|
|
|
SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
|
|
if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
|
|
if ( m_setFromDocId ) return &m_timeAxisUrl;
|
|
m_timeAxisUrlValid = true;
|
|
Url *fu = getFirstUrl();
|
|
m_timeAxisUrl.reset();
|
|
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
|
|
return &m_timeAxisUrl;
|
|
}
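// The time-axis url is simply the first url with the 32-bit content hash
// appended after a '.', per the safePrintf() above. For example, with a
// made-up hash value, "http://example.com/page.html" would become
// "http://example.com/page.html.3405691582". getOldTitleRec() passes this
// form to Msg22 so different content versions of the same url can map to
// different docids.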
|
|
|
|
// . look up TitleRec using Msg22 if we need to
|
|
// . set our m_titleRec member from titledb
|
|
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
|
// from scratch. this loads it from titledb.
|
|
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
|
char **XmlDoc::getOldTitleRec ( ) {
|
|
// clear if we blocked
|
|
//if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
|
|
// g_errno = EBADTITLEREC;
|
|
// return NULL;
|
|
|
|
// if valid return that
|
|
if ( m_oldTitleRecValid ) return &m_oldTitleRec;
|
|
// update status msg
|
|
setStatus ( "getting old title rec");
|
|
// if we are set from a title rec, we are the old doc
|
|
if ( m_setFromTitleRec ) {
|
|
m_oldTitleRecValid = true;
|
|
m_oldTitleRec = NULL;//m_titleRec;
|
|
return &m_oldTitleRec;
|
|
}
|
|
// sanity check
|
|
if ( m_oldTitleRecValid && m_msg22a.m_outstanding ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// point to url
|
|
//char *u = getCurrentUrl()->getUrl();
|
|
//char *u = getFirstUrl()->getUrl();
|
|
|
|
// assume its valid
|
|
m_oldTitleRecValid = true;
|
|
// add it to the cache?
|
|
bool addToCache = false;
|
|
//if ( maxCacheAge > 0 ) addToCache = true;
|
|
|
|
// not if new! no, we need to do this so XmlDoc::getDocId() works!
|
|
// this logic prevents us from setting g_errno to ENOTFOUND
|
|
// when m_msg22a below calls indexDocWrapper(). however, for
|
|
// doing a query delete on a not found docid will succumb to
|
|
// the g_errno because m_isIndexed is not valid i think...
|
|
if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
|
|
m_oldTitleRec = NULL;
|
|
m_oldTitleRecValid = true;
|
|
return &m_oldTitleRec;
|
|
}
|
|
// sanity check. if we have no url or docid ...
|
|
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// use docid if first url not valid
|
|
int64_t docId = 0;
|
|
if ( ! m_firstUrlValid ) docId = m_docId;
|
|
// if url not valid, use NULL
|
|
char *u = NULL;
|
|
if ( docId == 0LL && ptr_firstUrl ) u = getFirstUrl()->getUrl();
|
|
// if both are not given that is a problem
|
|
if ( docId == 0LL && ! u ) {
|
|
log("doc: no url or docid provided to get old doc");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if using time axis then append the timestamp to the end of
|
|
// the url. this way Msg22::getAvailDocId() will return a docid
|
|
// based on that so we don't collide with other instances of this
|
|
// same url.
|
|
if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
|
|
SafeBuf *tau = getTimeAxisUrl();
|
|
u = tau->getBufStart();
|
|
}
|
|
|
|
// the title rec must be local since we're spidering it
|
|
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
|
|
u ,
|
|
docId , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
&m_oldTitleRec ,
|
|
&m_oldTitleRecSize ,
|
|
false , // just chk tfndb?
|
|
false , // getAvailDocIdOnly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
addToCache , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false ))// load balancing?
|
|
// return -1 if we blocked
|
|
return (char **)-1;
|
|
// not really an error
|
|
if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// got it
|
|
return &m_oldTitleRec;
|
|
}
|
|
|
|
// . look up TitleRec using Msg22 if we need to
|
|
// . set our m_titleRec member from titledb
|
|
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
|
// from scratch. this loads it from titledb.
|
|
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
|
char **XmlDoc::getRootTitleRec ( ) {
|
|
// if valid return that
|
|
if ( m_rootTitleRecValid ) return &m_rootTitleRec;
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
|
|
// if we are root use us!!!!! well, the old us...
|
|
if ( *isRoot ) {
|
|
char **otr = getOldTitleRec ( );
|
|
if ( ! otr || otr == (char **)-1 ) return (char **)otr;
|
|
m_rootTitleRec = m_oldTitleRec;
|
|
m_rootTitleRecSize = m_oldTitleRecSize;
|
|
return &m_rootTitleRec;
|
|
}
|
|
// get our site root
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// make it a url. keep it on stack since msg22 copies it into its
|
|
// url request buffer anyway! (m_msg22Request.m_url[])
|
|
Url site; site.set ( mysite );
|
|
// assume its valid
|
|
m_rootTitleRecValid = true;
|
|
// add it to the cache?
|
|
bool addToCache = false;
|
|
//if ( maxCacheAge > 0 ) addToCache = true;
|
|
// update status msg
|
|
setStatus ( "getting root title rec");
|
|
// the title rec must be local since we're spidering it
|
|
if ( ! m_msg22b.getTitleRec ( &m_msg22Request ,
|
|
site.getUrl() ,
|
|
0 , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
&m_rootTitleRec ,
|
|
&m_rootTitleRecSize ,
|
|
false , // just chk tfndb?
|
|
false , // getAvailDocIdOnly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
addToCache , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false ))// load balancing?
|
|
// return -1 if we blocked
|
|
return (char **)-1;
|
|
// not really an error
|
|
if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// got it
|
|
return &m_rootTitleRec;
|
|
}
|
|
|
|
/*
|
|
// . look up TitleRec using Msg22 if we need to
|
|
// . set our m_titleRec member from titledb
|
|
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
|
// from scratch. this loads it from titledb.
|
|
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
|
char **XmlDoc::getContactTitleRec ( char *u ) {
|
|
// clear if we blocked
|
|
//if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// if valid return that
|
|
if ( m_contactTitleRecValid ) return &m_contactTitleRec;
|
|
// fake
|
|
static char *s_fake = NULL;
|
|
// if no url, we got no contact title rec in titledb then!
|
|
if ( ! u || u[0] == '\0' ) return &s_fake;
|
|
// update status msg
|
|
setStatus ( "getting contact title rec");
|
|
// assume its valid
|
|
m_contactTitleRecValid = true;
|
|
// add it to the cache?
|
|
bool addToCache = false;
|
|
//if ( maxCacheAge > 0 ) addToCache = true;
|
|
// the title must be local since we're spidering it
|
|
if ( ! m_msg22c.getTitleRec ( &m_msg22Request ,
|
|
u ,
|
|
0 , // probable docid
|
|
m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
&m_contactTitleRec ,
|
|
&m_contactTitleRecSize ,
|
|
false , // just chk tfndb?
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
addToCache , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false ))// load balancing?
|
|
// return -1 if we blocked
|
|
return (char **)-1;
|
|
// not really an error
|
|
if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// got it
|
|
return &m_contactTitleRec;
|
|
}
|
|
*/
|
|
|
|
|
|
// used for indexing spider replies. we need a unique docid because it
|
|
// is treated as a different document even though its url will be the same.
|
|
// and there is never an "older" version of it because each reply is treated
|
|
// as a brand new document.
|
|
int64_t *XmlDoc::getAvailDocIdOnly ( int64_t preferredDocId ) {
|
|
if ( m_availDocIdValid && g_errno ) {
|
|
log("xmldoc: error getting availdocid: %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
if ( m_availDocIdValid )
|
|
// this is 0 or -1 if no avail docid was found
|
|
return &m_msg22c.m_availDocId;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// pre-validate it
|
|
m_availDocIdValid = true;
|
|
if ( ! m_msg22c.getAvailDocIdOnly ( &m_msg22Requestc ,
|
|
preferredDocId ,
|
|
cr->m_coll ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness ) )
|
|
return (int64_t *)-1;
|
|
// error?
|
|
log("xmldoc: error getting availdocid2: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
|
|
int64_t *XmlDoc::getDocId ( ) {
|
|
if ( m_docIdValid ) return &m_docId;
|
|
setStatus ("getting docid");
|
|
XmlDoc **od = getOldXmlDoc( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (int64_t *)od;
|
|
setStatus ("getting docid");
|
|
// . set our docid
|
|
// . *od is NULL if no title rec found with that docid in titledb
|
|
if ( *od ) {
|
|
m_docId = *(*od)->getDocId();
|
|
m_docIdValid = true;
|
|
return &m_docId;
|
|
}
|
|
|
|
m_docId = m_msg22a.getAvailDocId();
|
|
|
|
// if titlerec was there but not od it had an error uncompressing
|
|
// because of the corruption bug in RdbMem.cpp when dumping to disk.
|
|
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
|
|
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
|
|
log("build: salvaged docid %"INT64" from corrupt title rec "
|
|
"for %s",m_docId,m_firstUrl.m_url);
|
|
}
|
|
|
|
if ( m_docId == 0 ) {
|
|
log("build: docid is 0 for %s",m_firstUrl.m_url);
|
|
g_errno = ENODOCID;
|
|
return NULL;
|
|
}
|
|
|
|
// ensure it is within probable range
|
|
if ( ! getUseTimeAxis () ) {
|
|
char *u = getFirstUrl()->getUrl();
|
|
int64_t pd = g_titledb.getProbableDocId(u);
|
|
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
|
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
|
if ( m_docId < d1 || m_docId > d2 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// if docid is zero, none is available!!!
|
|
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
|
|
m_docIdValid = true;
|
|
return &m_docId;
|
|
}
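// Sketch of the probable-docid sanity check above (values hypothetical):
// g_titledb.getProbableDocId(u) hashes the url to a "probable" docid, and
// getFirstProbableDocId()/getLastProbableDocId() bound the small window
// that docid may be shifted within to resolve collisions. The assigned
// m_docId must land inside [d1,d2] unless the time axis is enabled, in
// which case the docid may be based on the time-axis url instead and the
// check is skipped.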
|
|
|
|
// . is our docid on disk? i.e. do we exist in the index already?
|
|
// . TODO: just check tfndb?
|
|
char *XmlDoc::getIsIndexed ( ) {
|
|
if ( m_isIndexedValid ) return &m_isIndexed;
|
|
|
|
setStatus ( "getting is indexed" );
|
|
|
|
// we must be old if this is true
|
|
//if ( m_setFromTitleRec ) {
|
|
// m_isNew = false;
|
|
// m_isNewValid = true;
|
|
// return &m_isNew;
|
|
//}
|
|
// get the url
|
|
//char *u = getFirstUrl()->getUrl();
|
|
|
|
if ( m_oldDocValid ) {
|
|
m_isIndexedValid = true;
|
|
if ( m_oldDoc ) m_isIndexed = true;
|
|
else m_isIndexed = false;
|
|
return &m_isIndexed;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// sanity check. if we have no url or docid ...
|
|
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// use docid if first url not valid
|
|
int64_t docId = 0;
|
|
char *url = NULL;
|
|
// use docid if it's valid, otherwise use url
|
|
if ( m_docIdValid ) docId = m_docId;
|
|
else url = ptr_firstUrl;
|
|
|
|
// note it
|
|
if ( ! m_calledMsg22e )
|
|
setStatus ( "checking titledb for old title rec");
|
|
else
|
|
setStatus ( "back from msg22e call");
|
|
|
|
// . consult the title rec tree!
|
|
// . "justCheckTfndb" is set to true here!
|
|
if ( ! m_calledMsg22e &&
|
|
! m_msg22e.getTitleRec ( &m_msg22Request ,
|
|
url ,
|
|
docId , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
NULL , // tr ptr
|
|
NULL , // tr size ptr
|
|
true , // just chk tfndb?
|
|
false, // getavaildocidonly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false )){//load balancing?
|
|
// validate
|
|
m_calledMsg22e = true;
|
|
// return -1 if we blocked
|
|
return (char *)-1;
|
|
}
|
|
// got it
|
|
m_calledMsg22e = true;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// get it
|
|
if ( m_msg22e.m_found ) m_isIndexed = true;
|
|
else m_isIndexed = false;
|
|
|
|
// validate
|
|
m_isIndexedValid = true;
|
|
return &m_isIndexed;
|
|
}
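// Note: the msg22e lookup above passes justCheckTfndb=true, so it only
// tests whether a title rec key exists for this url/docid rather than
// loading and decompressing the whole record; m_isIndexed is then just
// m_msg22e.m_found.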
|
|
|
|
void gotTagRecWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// note it
|
|
THIS->setStatus ( "in got tag rec wrapper" );
|
|
// set these
|
|
if ( ! g_errno ) {
|
|
THIS->m_tagRec.serialize ( THIS->m_tagRecBuf );
|
|
THIS->ptr_tagRecData = THIS->m_tagRecBuf.getBufStart();
|
|
THIS->size_tagRecData = THIS->m_tagRecBuf.length();
|
|
// validate
|
|
THIS->m_tagRecValid = true;
|
|
}
|
|
// continue
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
|
|
// if tagrec changed enough so that it would affect what we would index
|
|
// since last time we indexed this doc, we need to know that!
|
|
/*
|
|
int32_t *XmlDoc::getTagHash32 ( ) {
|
|
// make it valid
|
|
if ( m_tagHash32Valid ) return &m_tagHash32;
|
|
// compute it
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// init it
|
|
m_tagHash32 = 0;
|
|
// hash the values of all tags
|
|
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get data
|
|
uint32_t h = hash32(tag->getTagData(),tag->getTagDataSize(),0);
|
|
// skip if 0
|
|
if ( ! h ) continue;
|
|
// xor it up
|
|
m_tagHash32 = hash32h ( h , m_tagHash32 );
|
|
}
|
|
// validate
|
|
m_tagHash32Valid = true;
|
|
return &m_tagHash32;
|
|
}
|
|
*/
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
TagRec *XmlDoc::getTagRec ( ) {
|
|
// if we got it give it
|
|
if ( m_tagRecValid ) return &m_tagRec;
|
|
// do we got a title rec?
|
|
if ( m_setFromTitleRec && m_version >= 118 &&
|
|
// lookup up fresh from tagdb when doing a rebuild so we get
|
|
// the latest sitenuminlinks! nah, we set m_tagRecValid and
|
|
// m_tagRecDataValid to false in Repair.cpp iff rebuilding
|
|
// titledb!! otherwise, we have to use what is in titlerec
|
|
// to avoid parsing inconsistencies that would result in
|
|
// undeletable posdb data.
|
|
//! m_useSecondaryRdbs &&
|
|
// lookup the tagdb rec fresh if setting for a summary. that way
|
|
// we can see if it is banned or not
|
|
m_tagRecDataValid ) {
|
|
// all done
|
|
m_tagRecValid = true;
|
|
// assume null if old version
|
|
//if ( m_version <= 115 ) return &m_tagRec;
|
|
// just return empty otherwise
|
|
m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData );
|
|
return &m_tagRec;
|
|
}
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// get our site, usually the hostname, but can be like
|
|
// "www.last.fm/user/breendaxx/"
|
|
// we can't call this because it CALLS getTagRec()!!!
|
|
//char *mysite = getSite();
|
|
//if ( ! mysite || mysite == (char *)-1 ) return (TagRec *)mysite;
|
|
// update status msg
|
|
setStatus ( "getting tagdb record" );
|
|
// get the final redirected url
|
|
//Url *u = getCurrentUrl();
|
|
// nah, try this
|
|
Url *u = getFirstUrl();
|
|
// if we are docid based url this might block!
|
|
//if ( ! u || u == (void *)-1 ) return (TagRec *)u;
|
|
// good to go
|
|
//m_oldTagRecValid = true;
|
|
// get it, use our collection for lookups, not m_tagdbColl[] yet!
|
|
if ( ! m_msg8a.getTagRec ( u ,
|
|
// we have to guess the site because
|
|
// we can't hit tagdb to get it at this
|
|
// point!!!
|
|
NULL, // guess it! // mysite ,
|
|
cr->m_collnum ,
|
|
false, // skip domain lookup? // true
|
|
m_niceness ,
|
|
this ,
|
|
gotTagRecWrapper ,
|
|
&m_tagRec ) )
|
|
// we blocked, return -1
|
|
return (TagRec *)-1;
|
|
// error? ENOCOLLREC?
|
|
if ( g_errno ) return NULL;
|
|
// assign it
|
|
m_tagRec.serialize ( m_tagRecBuf );
|
|
ptr_tagRecData = m_tagRecBuf.getBufStart();
|
|
size_tagRecData = m_tagRecBuf.length();
|
|
// validate
|
|
m_tagRecValid = true;
|
|
// our tag rec should be all valid now
|
|
return &m_tagRec;
|
|
}
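// The tag rec arrives by one of two mirrored paths: if msg8a completes
// synchronously we serialize it into m_tagRecBuf right above, and if it
// blocks, gotTagRecWrapper() does the same serialization before resuming
// m_masterLoop. Either way ptr_tagRecData/size_tagRecData end up pointing
// into m_tagRecBuf.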
|
|
|
|
|
|
|
|
|
|
// this is only for purposes of setting the site's TagRec
|
|
char *XmlDoc::getHasContactInfo ( ) {
|
|
|
|
if ( m_hasContactInfoValid ) return &m_hasContactInfo2;
|
|
|
|
setStatus ( "getting has contact info" );
|
|
|
|
// get it from the tag rec if we can
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
|
|
|
|
char *ic = getIsThisDocContacty ( );
|
|
if ( ! ic || ic == (void *)-1 ) return (char *)ic;
|
|
|
|
// the current top ip address
|
|
//int32_t *ip = getIp();
|
|
//if ( ! ip || ip == (int32_t *)-1) return (char *)ip;
|
|
//int32_t top = *ip & 0x00ffffff;
|
|
|
|
// and should have a contact page tag
|
|
Tag *tag = gr->getTag ("hascontactinfo");
|
|
|
|
if ( tag ) m_hasContactInfo = true;
|
|
else m_hasContactInfo = false;
|
|
|
|
m_hasContactInfo2 = m_hasContactInfo;
|
|
|
|
// are we a "contact" link? i.e. about us, etc. that would contain
|
|
// the physical address of the entity responsible for this website
|
|
//bool isContacty = getIsContacty( fu ,
|
|
// info1 ,
|
|
// hops ,
|
|
// *ct ,
|
|
// *isRoot ,
|
|
// m_niceness );
|
|
|
|
// bail early if not a candidate for contact info
|
|
if ( ! *ic ) { // check ) {
|
|
m_hasContactInfoValid = true;
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
//
|
|
// TODO: did IP change?? invalidate it???
|
|
//
|
|
|
|
// set status. we can time status changes with this routine!
|
|
setStatus ( "getting contact info on just this page" );
|
|
|
|
int32_t *nca = getNumContactAddresses();
|
|
if ( ! nca || nca == (void *)-1 ) return (char *)nca;
|
|
|
|
// did we have a contact address?
|
|
if ( *nca ) {
|
|
m_hasContactInfo = true;
|
|
m_hasContactInfo2 = true;
|
|
m_hasContactInfoValid = true;
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
// get the email addresses
|
|
int32_t *numOfficial = getNumOfficialEmails ( );
|
|
if ( ! numOfficial || numOfficial == (void *)-1)
|
|
return (char *)numOfficial;
|
|
|
|
// did we get some?
|
|
if ( *numOfficial > 0 ) {
|
|
m_hasContactInfo = true;
|
|
m_hasContactInfo2 = true;
|
|
m_hasContactInfoValid = true;
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
// this should set m_hasContactInfo as well as m_contact*[] arrays
|
|
//TagRec *pcitr = getContactInfoTagRec ();
|
|
//if ( ! pcitr || pcitr == (void *)-1 ) return (char *)pcitr;
|
|
|
|
// do not repeat the above now
|
|
m_hasContactInfoValid = true;
|
|
|
|
return &m_hasContactInfo2;
|
|
}
|
|
|
|
// returns "type" of contact link, > 0
|
|
int32_t getIsContacty ( Url *url ,
|
|
LinkInfo *info1 ,
|
|
int32_t hops ,
|
|
uint8_t ct ,
|
|
bool isRoot ,
|
|
int32_t niceness ) {
|
|
|
|
static int64_t h_home ;
|
|
static int64_t h_site ;
|
|
static int64_t h_map ;
|
|
static int64_t h_sitemap ;
|
|
static int64_t h_contact ;
|
|
static int64_t h_about ;
|
|
static int64_t h_privacy ;
|
|
static int64_t h_policy ;
|
|
static int64_t h_statement ;
|
|
static int64_t h_terms ;
|
|
static int64_t h_of ;
|
|
static int64_t h_and ;
|
|
static int64_t h_service ;
|
|
static int64_t h_conditions ;
|
|
static int64_t h_use ;
|
|
static int64_t h_us ;
|
|
static int64_t h_help ;
|
|
static int64_t h_location ;
|
|
static int64_t h_faq ;
|
|
static int64_t h_faqs ;
|
|
static int64_t h_customer ;
|
|
static int64_t h_support ;
|
|
static int64_t h_advertise ;
|
|
static int64_t h_inquiry ;
|
|
static int64_t h_inquiries ;
|
|
static int64_t h_feedback ;
|
|
static int64_t h_company ;
|
|
static int64_t h_corporate ;
|
|
|
|
static bool s_inith = false;
|
|
if ( ! s_inith ) {
|
|
s_inith = true;
|
|
h_home = hash64n ("home");
|
|
h_site = hash64n ("site");
|
|
h_map = hash64n ("map");
|
|
h_sitemap = hash64n ("sitemap");
|
|
h_contact = hash64n ("contact");
|
|
h_about = hash64n ("about");
|
|
h_privacy = hash64n ("privacy");
|
|
h_policy = hash64n ("policy");
|
|
h_statement = hash64n ("statement");
|
|
h_terms = hash64n ("terms");
|
|
h_of = hash64n ("of");
|
|
h_and = hash64n ("and");
|
|
h_service = hash64n ("service");
|
|
h_conditions = hash64n ("conditions");
|
|
h_use = hash64n ("use");
|
|
h_us = hash64n ("us");
|
|
h_help = hash64n ("help");
|
|
h_location = hash64n ("location");
|
|
h_faq = hash64n ("faq");
|
|
h_faqs = hash64n ("faqs");
|
|
h_customer = hash64n ("customer");
|
|
h_support = hash64n ("support");
|
|
h_advertise = hash64n ("advertise");
|
|
h_inquiry = hash64n ("inquiry");
|
|
h_inquiries = hash64n ("inquiries");
|
|
h_feedback = hash64n ("feedback");
|
|
h_company = hash64n ("company");
|
|
h_corporate = hash64n ("corporate");
|
|
}
|
|
|
|
int32_t check = 0;
|
|
// loop over the link texts we got
|
|
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
|
|
// never do anything if hop count >= 3
|
|
if ( hops >= 3 ) break;
|
|
// javascript must be hopcount 1 only
|
|
if ( ct == CT_JS && hops != 1 ) break;
|
|
// is this inlinker internal?
|
|
//bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// skip if not local to site
|
|
//if ( ! internal ) continue;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// assume utf-8. so do a utf-8 sanity check so it doesn't
|
|
// break Words::countWords() by thinking a character is
|
|
// 2+ bytes and breaching the buffer
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 1 from url=%s for %s",
|
|
k->getUrl(),url->m_url);
|
|
continue;
|
|
}
|
|
// convert into words i guess
|
|
Words ww;
|
|
// . TODO: use alt text if only an image in the link!!!!!
|
|
// . return -1 if it fails with g_errno set
|
|
if ( ! ww.setx ( txt , tlen , niceness) ) return -1;
|
|
// shortcut
|
|
int32_t nw = ww.getNumWords();
|
|
// skip if too big
|
|
if ( nw >= 30 ) continue;
|
|
// shortcut
|
|
int64_t *wids = ww.getWordIds();
|
|
// reset alnumcount
|
|
int32_t count = 0;
|
|
// loop over its words
|
|
for ( int32_t j = 0 ; j < nw && ! check ; j++ ) {
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) continue;
|
|
// keep track of alnum word position
|
|
count++;
|
|
// "contact..." only good from root or root kid
|
|
if ( wids[j] == h_contact && hops >= 1 && count == 1 )
|
|
check = 1;
|
|
// "about..." only good from root or root kid
|
|
if ( wids[j] == h_about && hops >= 1 && count == 1 )
|
|
check = 2;
|
|
// "...privacy policy..."
|
|
if ( wids[j ] == h_privacy && j+2<nw &&
|
|
wids[j+2] == h_policy )
|
|
check = 3;
|
|
// "...privacy statement..."
|
|
if ( wids[j ] == h_privacy && j+2<nw &&
|
|
wids[j+2] == h_statement )
|
|
check = 4;
|
|
// "...terms of service..."
|
|
if ( wids[j ] == h_terms && j+4<nw &&
|
|
wids[j+2] == h_of &&
|
|
wids[j+4] == h_service )
|
|
check = 5;
|
|
// "...terms of use..."
|
|
if ( wids[j ] == h_terms && j+4<nw &&
|
|
wids[j+2] == h_of &&
|
|
wids[j+4] == h_use )
|
|
check = 6;
|
|
// "... terms & conditions ..."
|
|
if ( wids[j ] == h_terms && j+2<nw &&
|
|
wids[j+2] == h_conditions )
|
|
check = 7;
|
|
// "... terms and conditions ..."
|
|
if ( wids[j ] == h_terms && j+4<nw &&
|
|
wids[j+2] == h_and &&
|
|
wids[j+4] == h_conditions )
|
|
check = 8;
|
|
// "...site map ..."
|
|
if ( wids[j] == h_site && j+2<nw &&
|
|
wids[j+2] == h_map )
|
|
check = 9;
|
|
// "...about us..."
|
|
if ( wids[j] == h_about && j+2<nw &&
|
|
wids[j+2] == h_us )
|
|
check = 10;
|
|
// "...contact us..."
|
|
if ( wids[j] == h_contact && j+2<nw &&
|
|
wids[j+2] == h_us)
|
|
check = 11;
|
|
// "help..."
|
|
if ( wids[j] == h_help && count == 1 )
|
|
check = 12;
|
|
// "faq..."
|
|
if ( wids[j] == h_faq && count == 1 )
|
|
check = 13;
|
|
// "faqs..."
|
|
if ( wids[j] == h_faqs && count == 1 )
|
|
check = 14;
|
|
// "...customer support..."
|
|
if ( wids[j] == h_customer && j+2<nw &&
|
|
wids[j+2] == h_support )
|
|
check = 15;
|
|
// "advertise..."
|
|
if ( wids[j] == h_advertise && count == 1)
|
|
check = 16;
|
|
// "...inquiry..."
|
|
if ( wids[j] == h_inquiry )
|
|
check = 17;
|
|
// "...inquiries..."
|
|
if ( wids[j] == h_inquiries )
|
|
check = 18;
|
|
// one word only below here
|
|
if ( ww.getNumAlnumWords() != 1 ) continue;
|
|
if ( wids[j] == h_about ) check = 2;
|
|
if ( wids[j] == h_home ) check = 19;
|
|
if ( wids[j] == h_support ) check = 20;
|
|
if ( wids[j] == h_advertise ) check = 21;
|
|
if ( wids[j] == h_help ) check = 22;
|
|
if ( wids[j] == h_faq ) check = 23;
|
|
if ( wids[j] == h_faqs ) check = 24;
|
|
if ( wids[j] == h_contact ) check = 25;
|
|
if ( wids[j] == h_feedback ) check = 26;
|
|
if ( wids[j] == h_sitemap ) check = 27;
|
|
if ( wids[j] == h_company ) check = 28;
|
|
if ( wids[j] == h_corporate ) check = 29;
|
|
if ( wids[j] == h_privacy ) check = 30;
|
|
if ( wids[j] == h_terms ) check = 31;
|
|
// "location" fixes guildcinema.com
|
|
if ( wids[j] == h_location && isRoot ) check = 32;
|
|
}
|
|
}
|
|
|
|
|
|
// check for certain things in the url path that would indicate that
|
|
// this is a contact info page
|
|
//char *path = m_firstUrl.getPath();
|
|
char *path = url->getPath();
|
|
if ( gb_strcasestr(path,"contact" ) ) { check += 33; check *= 90; }
|
|
if ( gb_strcasestr(path,"/about" ) ) { check += 34; check *= 91; }
|
|
if ( gb_strcasestr(path,"/feedback") ) { check += 35; check *= 92; }
|
|
if ( gb_strcasestr(path,"/help" ) ) { check += 36; check *= 93; }
|
|
if ( gb_strcasestr(path,"/faq" ) ) { check += 37; check *= 94; }
|
|
if ( gb_strcasestr(path,"advertise") ) { check += 38; check *= 95; }
|
|
if ( gb_strcasestr(path,"inquir" ) ) { check += 39; check *= 96; }
|
|
|
|
return check;
|
|
}
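// Worked example of the scoring above (hypothetical page): an inlink with
// link text "contact us" at hop count 1 sets check = 11; if the url path
// also contains "contact", the path rule then makes check = (11+33)*90 =
// 3960. Any non-zero return value is treated as "contacty" by
// getIsThisDocContacty() below; the odd multiplies appear to exist only to
// keep combined signals distinguishable from the single codes.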
|
|
|
|
char *XmlDoc::getIsThisDocContacty() {
|
|
if ( m_isContactyValid ) return &m_isContacty;
|
|
setStatus ( "getting is contacty" );
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
|
|
int8_t *hc = getHopCount();
|
|
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
// get the first url
|
|
Url *fu = getFirstUrl();
|
|
// shortcut
|
|
int32_t hops = *hc;
|
|
// check it
|
|
m_isContacty = getIsContacty ( fu ,
|
|
info1 ,
|
|
hops ,
|
|
*ct ,
|
|
*isRoot ,
|
|
m_niceness );
|
|
m_isContactyValid = true;
|
|
return &m_isContacty;
|
|
}
|
|
|
|
|
|
int32_t *XmlDoc::getNumContactAddresses ( ) {
|
|
// process
|
|
Address **ca = getContactAddresses();
|
|
if ( ! ca || ca == (void *)-1 ) return (int32_t *)ca;
|
|
// now we are valid
|
|
return &m_numContactAddresses;
|
|
}
|
|
|
|
|
|
Address **XmlDoc::getContactAddresses ( ) {
|
|
// assume none
|
|
if ( m_contactAddressesValid ) return m_contactAddresses;
|
|
// need this of course
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (void *)-1 ) return (Address **)aa;
|
|
// assume none
|
|
m_contactAddressesValid = true;
|
|
m_numContactAddresses = 0;
|
|
// not if not contacty. we gotta be a url like ".../contact.asp"
|
|
char *ic = getIsThisDocContacty ( );
|
|
if ( ! ic || ic == (void *)-1 ) return (Address **)ic;
|
|
// if not of a contact url form, return none
|
|
if ( ! *ic )
|
|
return m_contactAddresses;
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (Address **)isRoot;
|
|
// do not do this for root if multiple addresses. this
|
|
// fixes http://obits.abqjournal.com/
|
|
if ( *isRoot && aa->m_uniqueStreetHashes > 1 )
|
|
return m_contactAddresses;
|
|
// reset count
|
|
int32_t nca = 0;
|
|
// number of addresses in this doc
|
|
int32_t na = aa->m_am.getNumPtrs();
|
|
// add all addresses then???
|
|
for ( int32_t i = 0 ; i < na ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *ai = (Address *)aa->m_am.getPtr(i);
|
|
// do not add this to tagdb if not inlined!
|
|
if ( ! ( ai->m_flags & AF_INLINED ) ) continue;
|
|
// store it
|
|
m_contactAddresses[nca++] = ai;
|
|
// stop before breach
|
|
if ( nca >= MAX_CONTACT_ADDRESSES ) break;
|
|
}
|
|
// update count
|
|
m_numContactAddresses = nca;
|
|
return m_contactAddresses;
|
|
}
|
|
|
|
int32_t *XmlDoc::getNumOfficialEmails ( ) {
|
|
char *eb = getEmailBuf();
|
|
if ( ! eb || eb == (void *)-1 ) return (int32_t *)eb;
|
|
return &m_numOfficialEmails;
|
|
}
|
|
|
|
// . add email addresses to tag rec
|
|
// . add up to 3 of same domain and different domain addresses
|
|
// . return # of *official* contact infos added to tag rec
|
|
// . this now includes submission forms!
|
|
// . returns -1 and sets g_errno on error
|
|
char *XmlDoc::getEmailBuf ( ) {
|
|
|
|
if ( m_emailBufValid ) return m_emailBuf;
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
|
|
|
|
// count # of official contacts we got
|
|
int32_t official = 0;
|
|
|
|
// shortcuts
|
|
int64_t *wids = ww->m_wordIds;
|
|
char **wptrs = ww->m_words;
|
|
int32_t *wlens = ww->m_wordLens;
|
|
nodeid_t *tids = ww->m_tagIds;
|
|
int32_t nw = ww->getNumWords();
|
|
|
|
// get our url
|
|
Url *f = getFirstUrl();
|
|
// get its domain len
|
|
char *myDom = f->getMidDomain();
|
|
int32_t myDomLen = f->getMidDomainLen();
|
|
|
|
|
|
// point here
|
|
char *eptr = m_emailBuf;
|
|
char *emax = m_emailBuf + EMAILBUFSIZE;
|
|
|
|
m_emailBufValid = true;
|
|
|
|
// reset
|
|
*eptr = '\0';
|
|
|
|
//
|
|
// ADD EMAIL ADDRESSES
|
|
//
|
|
|
|
// count how many we find
|
|
int32_t ne = 0;
|
|
// loop over all the words
|
|
for ( int32_t i = 1 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// . email address? look for the '@'
|
|
// . might also have <img src="at.gif"> (bot proof)
|
|
if ( wptrs[i][0] != '@' && tids[i] != TAG_IMG ) continue;
|
|
// . make sure any image has an "/at." in it!
|
|
// . "mail<img src="/common/images/at.gif">pipl.com"
|
|
if(tids[i]==TAG_IMG&&!gb_strncasestr(wptrs[i],wlens[i],"/at."))
|
|
continue;
|
|
// must be a single char
|
|
if ( ! tids[i] && wlens[i] != 1 ) continue;
|
|
// if i was the last word, give up!
|
|
if ( i + 1 >= nw ) break;
|
|
// back up i until we hit a non-email char
|
|
int32_t a ;
|
|
for ( a = i ; a - 1 > 0 ; a-- ) {
|
|
if (wids [a-1] ) continue;
|
|
if (wptrs[a-1][0]=='.'&&wlens[a-1]==1)continue;
|
|
if (wptrs[a-1][0]=='-'&&wlens[a-1]==1)continue;
|
|
break;
|
|
}
|
|
// must not start with '.'
|
|
if ( wptrs[a][0]=='.' ) a++;
|
|
// now get the end of it
|
|
int32_t b;
|
|
int32_t periodCount = 0;
|
|
for ( b = i ; b+1 < nw ; b++ ) {
|
|
if (wids[b+1]) continue;
|
|
// only punct we allow is a single period
|
|
if ( wptrs[b+1][0]!='.' ) break;
|
|
if ( wlens[b+1] != 1 ) break;
|
|
periodCount++;
|
|
}
|
|
// must have at least one!
|
|
if ( ! periodCount ) continue;
|
|
// must not end on '.'
|
|
if ( wptrs[b][0]=='.') b--;
|
|
// hostname must have a valid tld
|
|
char *host = wptrs[i+1];
|
|
char *hend = wptrs[b]+wlens[b];
|
|
// temp null term
|
|
char c = *hend;
|
|
*hend = '\0';
|
|
int32_t tldLen ; char *tld = getTLDFast ( host, &tldLen , false );
|
|
// ignore the rest of this line for addresses even
|
|
// if tld is bogus
|
|
//ignoreLine = true;
|
|
// must have a legit tld!
|
|
if ( ! tld ) { *hend = c; continue; }
|
|
// if not from our same domain, use "emailaddressoffsite"
|
|
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
|
|
// use mid domain. subtract '.'
|
|
//int32_t midlen = tld - dom - 1;
|
|
// undo the temp NULL thing
|
|
*hend = c;
|
|
if ( ! dom ) continue;
|
|
|
|
// include last word
|
|
b++;
|
|
// normal buffer
|
|
char buf[100];
|
|
char *p = buf;
|
|
char *pend = buf + 100;
|
|
// normalize it
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// include the at sign
|
|
if ( j == i ) {*p++ = '@'; continue;}
|
|
// skip tags
|
|
if ( tids[j] ) continue;
|
|
// skip punct
|
|
if ( ! wids[j] ) {*p++ ='.'; continue;}
|
|
// ensure minimal space
|
|
if ( p + wlens[j] + 1 >= pend ) break;
|
|
// write out wids
|
|
gbmemcpy ( p , wptrs[j] , wlens[j] );
|
|
p += wlens[j];
|
|
}
|
|
// NULL term it
|
|
*p = '\0';
|
|
|
|
// do we match domains?
|
|
//char *tn = "emailaddressoffsite";
|
|
// use this if we match domains
|
|
//if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
|
|
// tn = "emailaddressonsite";
|
|
// // this is an official contact method
|
|
// //official++;
|
|
//}
|
|
// we now count even offsite email addresses as official
|
|
// for addresses like @gmail.com etc. because we are now
|
|
// only checking "contact us" and "about us" and root pages,
|
|
// so they should never be email addresses of commenters.
|
|
// and often bloggers have external email addresses.
|
|
// http://www.christinesaari.com/html/about.php?psi=44
|
|
official++;
|
|
// store it
|
|
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
|
|
// return -1;
|
|
int32_t blen = gbstrlen(buf);
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// limit it
|
|
if ( ++ne >= 3 ) break;
|
|
}
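// Normalization example for the loop above (hypothetical text): for the
// words [ "john" ][ "." ][ "doe" ][ "@" ][ "example" ][ "." ][ "com" ],
// a backs up to "john", b runs forward past "com", and the rebuilt buffer
// is "john.doe@example.com", which is then appended (comma-separated) to
// m_emailBuf if it fits.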
|
|
|
|
//
|
|
// ADD BOT-PROOF EMAIL ADDRESSES (bot proof)
|
|
//
|
|
// super dot john at xyz dot com
|
|
//
|
|
|
|
int64_t h_at = hash64Lower_utf8("at");
|
|
int64_t h_dot = hash64Lower_utf8("dot");
|
|
// loop over all the words
|
|
for ( int32_t i = 1 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// email address? look for the " at "
|
|
if ( wids[i] != h_at ) continue;
|
|
// front name word count
|
|
int32_t nameCount = 0;
|
|
// back up i until we hit a non-email word
|
|
int32_t a ;
|
|
// do a loop
|
|
for ( a = i - 1 ; a > 0 ; ) {
|
|
// need a space/punct word
|
|
if ( wids[a] ) break;
|
|
if ( tids[a] ) break;
|
|
// skip it
|
|
a--;
|
|
// then need the "john" part
|
|
if ( ! wids[a] ) break;
|
|
if ( tids[a] ) break;
|
|
if ( wids[a] == h_dot ) break; // "dot" is bad
|
|
// count account name part
|
|
nameCount++;
|
|
// go back if like "mike dot smith"
|
|
if ( a - 4 >= 0 &&
|
|
! tids[a-1] &&
|
|
wids [a-2] == h_dot &&
|
|
! tids[a-3] &&
|
|
wids [a-4] != h_dot &&
|
|
wids [a-4] != h_at )
|
|
a -= 4;
|
|
// that is good enough
|
|
break;
|
|
}
|
|
// need a name at least one
|
|
if ( nameCount <= 0 ) continue;
|
|
// skip over that space/punct word
|
|
//a--;
|
|
// now must be regular word before that
|
|
//if ( tids[a-1] ) continue;
|
|
//if ( ! wids[a-1] ) continue;
|
|
// we got it
|
|
//a--;
|
|
// now get the end of it
|
|
int32_t b ;
|
|
// count the dots
|
|
int32_t dotCount = 0;
|
|
// make sure last word is a legit tld
|
|
int32_t tldLen = 0; char *tld = NULL;
|
|
// do a loop
|
|
for ( b = i + 1 ; b + 3 < nw ; b++ ) {
|
|
// need a space/punct word
|
|
if ( wids[b] ) break;
|
|
if ( tids[b] ) break;
|
|
// skip it
|
|
b++;
|
|
// then need the "xyz" part
|
|
if ( ! wids[b] ) break;
|
|
if ( tids[b] ) break;
|
|
if ( wids[b] == h_dot ) break; // "dot" is bad
|
|
// remember it for tld detection
|
|
tld = wptrs[b];
|
|
tldLen = wlens[b];
|
|
// skip it
|
|
b++;
|
|
// need another space/punct word
|
|
if ( wids[b] ) break;
|
|
if ( tids[b] ) break;
|
|
// skip it
|
|
b++;
|
|
// now we need a "dot"
|
|
if ( wids[b] != h_dot ) break;
|
|
// count the dots
|
|
dotCount++;
|
|
}
|
|
// need at least one "dot"
|
|
if ( dotCount < 1 ) continue;
|
|
// not too many!
|
|
if ( dotCount > 5 ) continue;
|
|
// must have legit tld
|
|
if ( tld && ! isTLD ( tld , tldLen ) ) continue;
|
|
// normal buffer
|
|
char buf[100];
|
|
char *p = buf;
|
|
char *pend = buf + 100;
|
|
// normalize it
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// skip tags
|
|
if ( tids[j] ) continue;
|
|
// skip punct
|
|
if ( ! wids[j] ) continue;
|
|
// ensure minimal space
|
|
if ( p + wlens[j] + 1 >= pend ) break;
|
|
// write out wids
|
|
if ( wids[j] == h_at ) {*p++ = '@'; continue;}
|
|
if ( wids[j] == h_dot ) {*p++ = '.'; continue;}
|
|
gbmemcpy ( p , wptrs[j] , wlens[j] );
|
|
p += wlens[j];
|
|
}
|
|
// NULL term it
|
|
*p = '\0';
|
|
// get the host
|
|
char *host = buf ; // wptrs[i+1]; ?? is this right?
|
|
// if not from our same domain, use "emailaddressoffsite"
|
|
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
|
|
if ( ! dom ) continue;
|
|
// use mid domain
|
|
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
|
|
// limit domain by that. subtract '.'
|
|
int32_t midlen = tld3 - dom - 1;
|
|
// do we match domains?
|
|
char *tn = "emailaddressoffsite";
|
|
// use this if we match domains
|
|
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
|
|
tn = "emailaddressonsite";
|
|
// this is an official contact method
|
|
//official++;
|
|
}
|
|
// we now count even offsite email addresses as official
|
|
// for addresses like @gmail.com etc. because we are now
|
|
// only checking "contact us" and "about us" and root pages,
|
|
// so they should never be email addresses of commenters
|
|
// and often bloggers have external email addresses.
|
|
// http://www.christinesaari.com/html/about.php?psi=44
|
|
official++;
|
|
// store that
|
|
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
|
|
// return -1;
|
|
int32_t blen = gbstrlen(buf);
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// limit it
|
|
if ( ++ne >= 3 ) break;
|
|
}
|
|
|
|
//
|
|
// ADD EMAIL ADDRESSES IN MAILTO TAGS
|
|
//
|
|
// <a href=mailto:steve@xyz.com>
|
|
// <a href=mailto:"steve at xyz dot com">
|
|
// now we check char by char since a website had it in the javascript:
|
|
// http://www.botanique.com/bincgi/stateprov.CFM?state=NM
|
|
//
|
|
char *m = xml->m_xml;
|
|
char *mend = m + xml->m_xmlLen - 4;
|
|
// empty?
|
|
if ( ! m ) mend = m;
|
|
// scan
|
|
for ( ; ; m++ ) {
|
|
// breach?
|
|
if ( m >= mend ) break;
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not possible mailto:
|
|
if ( *m != 'm' && *m !='M' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'a' && *m !='A' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'i' && *m !='I' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'l' && *m !='L' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 't' && *m !='T' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != 'o' && *m !='O' ) continue;
|
|
// skip
|
|
m++;
|
|
// skip?
|
|
if ( *m != ':' ) continue;
|
|
// skip
|
|
m++;
|
|
// set end
|
|
char *mend = m + 100;
|
|
// skip over the mailto:
|
|
//m += 7;
|
|
// that is the start of the email address then
|
|
char *start = m;
|
|
// skip til '@'
|
|
for ( ; *m && m < mend && *m != '@' ; m++ ) {
|
|
// but give up if we hit a non-email name char
|
|
if ( is_alnum_a(*m) ) continue;
|
|
if ( *m == '.' ) continue;
|
|
if ( *m == '-' ) continue;
|
|
break;
|
|
}
|
|
// bad if no @
|
|
if ( *m != '@' ) continue;
|
|
// skip the @
|
|
m++;
|
|
// . skip until alnum
|
|
// . fix parsing of "dsquires@ unimelb.edu.au" for
|
|
// http://www.marcom1.unimelb.edu.au/public/contact.html
|
|
for (;*m && is_wspace_utf8(m); m+=getUtf8CharSize(m) );
|
|
// get the host
|
|
char *host = m;
|
|
// skip till end of hostname
|
|
for (;*m && m<mend && (is_alnum_a(*m)||*m=='.'||*m=='-');m++ );
|
|
// null term
|
|
char c = *m; *m = '\0';
|
|
// if not from our same domain, use "emailaddressoffsite"
|
|
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
|
|
// skip if no valid domain
|
|
if ( ! dom ) { *m = c; continue; }
|
|
// use mid domain
|
|
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
|
|
// limit domain by that. subtract '.'
|
|
int32_t midlen = tld3 - dom - 1;
|
|
// put it back
|
|
*m = c;
|
|
// point "end" to end of the email address
|
|
char *end = dom + dlen;
|
|
// do we match domains?
|
|
char *tn = "emailaddressoffsite";
|
|
// use this if we match domains
|
|
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
|
|
tn = "emailaddressonsite";
|
|
// this is an official contact method
|
|
//official++;
|
|
}
|
|
// we now count even offsite email addresses as official
|
|
// for addresses like @gmail.com etc. because we are now
|
|
// only checking "contact us" and "about us" and root pages,
|
|
// so they should never be email addresses of commenters
|
|
// and often bloggers have external email addresses.
|
|
// http://www.christinesaari.com/html/about.php?psi=44
|
|
official++;
|
|
// store that
|
|
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,start,end-start) )
|
|
// return -1;
|
|
// cast it
|
|
char *buf = start;
|
|
int32_t blen = end - start;
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// limit it
|
|
if ( ++ne >= 3 ) break;
|
|
}
|
|
|
|
|
|
//
|
|
// ADD CONTACT FORM
|
|
//
|
|
|
|
bool gotEmailBox = false;
|
|
bool storedForm = false;
|
|
int32_t emailPos = -1;
|
|
int32_t alnumCount = 0;
|
|
// quick compares
|
|
int64_t he1 = hash64Lower_utf8 ( "email");
|
|
int64_t he2 = hash64Lower_utf8 ( "mail");
|
|
// loop over all words again
|
|
for ( int32_t i = 1 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get tag id if any
|
|
int32_t tid = tids[i] & BACKBITCOMP;
|
|
// . do we have a submit form?
|
|
// . first, do we have a text box for the sender's email?
|
|
if ( tid == TAG_INPUT ) {
|
|
int32_t ttlen;
|
|
// careful: i is a word #, not a node #, so map it to a node #
|
|
int32_t nn = ww->m_nodes[i];
|
|
// must be valid
|
|
char *tt = xml->getString(nn,"type",&ttlen);
|
|
if ( ! tt || ttlen <= 0 ) continue;
|
|
// must be of type text
|
|
if ( strncasecmp(tt,"text",4) ) continue;
|
|
// might have "email" or "e-mail" in the value
|
|
int32_t vlen;
|
|
char *val = xml->getString(nn,"value",&vlen);
|
|
// check that
|
|
if ( val ) {
|
|
if ( gb_strncasestr(val,vlen,"email") ||
|
|
gb_strncasestr(val,vlen,"e-mail") )
|
|
// flag it good
|
|
gotEmailBox = true;
|
|
}
|
|
// must have the word "email" or "e-mail" within
|
|
// a few words right before it!
|
|
if ( emailPos == -1 ) continue;
|
|
//if ( i - emailPos >= 7 ) continue;
|
|
if ( alnumCount > 7 ) continue;
|
|
// flag it
|
|
gotEmailBox = true;
|
|
}
|
|
// text area? must happen AFTER the email address box
|
|
if ( tid == TAG_TEXTAREA && gotEmailBox ) {
|
|
// must have had the form before us
|
|
// do not double store into tagdb rec
|
|
if ( storedForm ) continue;
|
|
// store this bad boy into the tagdb rec
|
|
//if ( ! gr->addTag("hascontactform",
|
|
// timestamp,
|
|
// "xmldoc",
|
|
// ip,
|
|
// "1" ,
|
|
// 1 ) )
|
|
// return -1;
|
|
// copy it
|
|
char *buf = "hascontactform";
|
|
int32_t blen = gbstrlen(buf);
|
|
// ignore if breach
|
|
if ( eptr + blen + 2 > emax ) continue;
|
|
// comma?
|
|
if ( eptr > m_emailBuf ) *eptr++ = ',';
|
|
// store it
|
|
gbmemcpy (eptr , buf , blen );
|
|
// advance
|
|
eptr += blen;
|
|
// do not double store
|
|
storedForm = true;
|
|
// this is an official contact method
|
|
official++;
|
|
// another contact method
|
|
ne++;
|
|
// that's enough!
|
|
break;
|
|
}
|
|
// alnum counter
|
|
if ( wids[i] ) alnumCount++;
|
|
// special counter
|
|
if ( wids[i] == he1 || wids[i] == he2 ) {
|
|
// mark it
|
|
emailPos = i;
|
|
// reset counter
|
|
alnumCount = 0;
|
|
}
|
|
}
|
|
|
|
// null term
|
|
*eptr = '\0';
|
|
|
|
m_numOfficialEmails = official;
|
|
|
|
// i guess that is it
|
|
return m_emailBuf;
|
|
}
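// What ends up in m_emailBuf (descriptive comment): a comma-separated list
// of the contact "methods" gathered above -- plain, obfuscated
// ("john at xyz dot com") and mailto: email addresses, capped at roughly
// three by the shared "ne" counter -- plus possibly the literal token
// "hascontactform" when an email text box and a textarea are found.
// m_numOfficialEmails counts how many of these were treated as official.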
|
|
|
|
// returns vector 1-1 with Words.m_words[] array
|
|
/*
|
|
Spam *XmlDoc::getSpam ( ) {
|
|
if ( m_spamValid ) return &m_spam;
|
|
// set it
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Spam *)ww;
|
|
Bits *bits = getBits ();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (Spam *)bits;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (Spam *)sni;
|
|
// if more than X% ("thresh") of words are spammed to some degree,
|
|
// index all words with a minimum score
|
|
int32_t thresh = 6;
|
|
if ( *sni > 10 ) thresh = 8;
|
|
if ( *sni > 30 ) thresh = 10;
|
|
if ( *sni > 100 ) thresh = 20;
|
|
if ( *sni > 500 ) thresh = 30;
|
|
//int64_t x[] = {30,40,50,70,90};
|
|
//int64_t y[] = {6,8,10,20,30};
|
|
//int32_t spamThresh = getY ( m_docQuality , x , y , 5 );
|
|
if ( ! m_spam.set ( ww ,
|
|
bits ,
|
|
m_version ,
|
|
thresh ,
|
|
20 ,
|
|
m_niceness ))
|
|
return NULL;
|
|
m_spamValid = true;
|
|
return &m_spam;
|
|
}
|
|
*/
|
|
|
|
// "tod" = time of day; this now means any tod date on the page
|
|
bool *XmlDoc::getHasTOD ( ) {
|
|
if ( m_hasTODValid ) return &m_hasTOD2;
|
|
// scan the dates
|
|
Dates *dp = getDates() ;
|
|
if ( ! dp || dp == (Dates *)-1 ) return (bool *)dp;
|
|
// assume not
|
|
m_hasTOD2 = false;
|
|
m_hasTOD = false;
|
|
// scan the dates
|
|
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get date
|
|
Date *di = dp->m_datePtrs[i];
|
|
// skip if got nuked
|
|
if ( ! di ) continue;
|
|
// tod?
|
|
if ( !(di->m_hasType & DT_TOD) ) continue;
|
|
// got one
|
|
m_hasTOD2 = true;
|
|
m_hasTOD = true;
|
|
}
|
|
// it is now valid
|
|
m_hasTODValid = true;
|
|
return &m_hasTOD2;
|
|
}
|
|
|
|
/*
|
|
bool *XmlDoc::getHasSiteVenue ( ) {
|
|
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
|
|
// get the tag rec
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (bool *)gr;
|
|
// get tag from it
|
|
Tag *sv = gr->getTag("venueaddress") ;
|
|
// from that
|
|
m_hasSiteVenue2 = (bool)sv;
|
|
m_hasSiteVenue = (bool)sv;
|
|
m_hasSiteVenueValid = true;
|
|
return &m_hasSiteVenue2;
|
|
}
|
|
*/
|
|
|
|
|
|
// do not include addresses that are always in the header/footer of every page!
|
|
bool *XmlDoc::getHasAddress ( ) {
|
|
if ( m_hasAddressValid ) return &m_hasAddress2;
|
|
// get the addresses
|
|
Addresses *aa = getAddresses();
|
|
if ( ! aa || aa == (void *)-1 ) return (bool *)aa;
|
|
// from that
|
|
m_hasAddress2 = (aa->getNumNonDupAddresses() > 0);
|
|
m_hasAddress = (aa->getNumNonDupAddresses() > 0);
|
|
m_hasAddressValid = true;
|
|
return &m_hasAddress2;
|
|
}
|
|
|
|
Addresses *XmlDoc::getAddresses ( ) {
|
|
if ( m_addressesValid ) {
|
|
// return error if buf was breached
|
|
//if ( m_addresses.m_breached ) {
|
|
// g_errno = EBUFOVERFLOW;
|
|
// return NULL;
|
|
//}
|
|
// otherwise, return it
|
|
return &m_addresses;
|
|
}
|
|
// skip for now
|
|
m_addressesValid = true;
|
|
return &m_addresses;
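// NOTE: because of the unconditional "skip for now" return above, the rest
// of this function (the full Addresses::set() path below) is currently
// unreachable; it is kept here for reference.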
|
|
// note it
|
|
setStatus ( "getting addresses");
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Addresses *)ww;
|
|
// we make sure that D_IS_IN_DATE is set by doing this
|
|
//Dates *dp = getDates();
|
|
//if ( ! dp || dp == (Dates *)-1) return (Addresses *)dp;
|
|
// we set the D_IS_IN_DATE flag for these bits
|
|
Bits *bits = getBits(); if ( ! bits ) return NULL;
|
|
Sections *sections = getExplicitSections();
|
|
if ( !sections||sections==(Sections *)-1) return (Addresses *)sections;
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (Addresses *)gr;
|
|
// the site hash
|
|
//int32_t *sh32 = getSiteHash32();
|
|
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Addresses *)sh32;
|
|
int32_t dh = getDomHash32();
|
|
// hash of all adjacent tag pairs
|
|
//uint32_t *tph = getTagPairHash32 ( ) ;
|
|
//if ( ! tph || tph == (void *)-1 ) return (Addresses *)tph;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Addresses *)d;
|
|
// get our ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1) return (Addresses *)ip;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
//char **stb = getSiteTitleBuf();
|
|
//if ( ! stb || stb == (void *)-1 ) return (Addresses *)stb;
|
|
// sanity check
|
|
//if ( ! m_siteTitleBufValid ) { char *xx=NULL;*xx=0; }
|
|
char **fbuf = getFilteredRootTitleBuf();
|
|
if ( ! fbuf || fbuf == (void *)-1 ) return (Addresses *)fbuf;
|
|
|
|
// this will set D_IS_IN_DATE in the Bits::m_bits[] array which
|
|
// Addresses::set() uses to avoid having addresses that are really
|
|
// just dates!
|
|
Dates *dd = getSimpleDates();
|
|
// return NULL on error
|
|
if ( ! dd ) return (Addresses *)NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if the serialized section is valid, use that
|
|
//char *sd = NULL;
|
|
//bool valid = false;
|
|
//if ( od && od->m_sectionsReplyValid ) valid = true;
|
|
//if ( valid ) sd = od->ptr_sectionsReply;
|
|
// assume valid, really only when it returns in case it blocked...
|
|
//m_addressesValid = true;
|
|
// this should not be outstanding!
|
|
if ( m_addressSetCalled ) { char *xx=NULL;*xx=0; }
|
|
// assume valid, really only when it returns in case it blocked...
|
|
m_addressesValid = true;
|
|
// set it
|
|
m_addressSetCalled = true;
|
|
// make a copy of the tag rec here in case it gets mangled later
|
|
// because the m_addresses class may reference its buffer
|
|
//m_savedTagRec1.copy ( gr );
|
|
// . this returns false if blocked
|
|
// . it uses the "venueaddress" from the tagrec, "gr", BUT if this
|
|
// page is the one that sets the venue address, it won't be able
|
|
// to use it as a default city/state thingy until next time it is
|
|
// spidered, since that info is in the tagrec
|
|
	// . PROBLEM: if the venue address is on this page, we can't take
	// advantage of it by using its city/state as a default for the
	// other addresses on this page
|
|
if ( ! m_addresses.set ( sections ,
|
|
ww ,
|
|
bits ,
|
|
&m_tagRec , // &m_savedTagRec1 , // gr
|
|
&m_firstUrl ,
|
|
*d ,
|
|
cr->m_collnum ,
|
|
dh , // *sh32
|
|
*ip ,
|
|
//(int32_t)*tph ,
|
|
m_niceness ,
|
|
m_pbuf ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
*ct ,
|
|
//ptr_addressReply ,
|
|
//size_addressReply ,
|
|
//m_addressReplyValid ,
|
|
m_filteredRootTitleBuf ,
|
|
m_filteredRootTitleBufSize ,
|
|
this ))
|
|
return (Addresses *)-1;
|
|
// sanity check
|
|
if ( m_addresses.m_msg2c &&
|
|
m_addresses.m_msg2c->m_requests !=
|
|
m_addresses.m_msg2c->m_replies) {
|
|
char *xx=NULL;*xx=0; }
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// return it if not breached
|
|
//if ( ! m_addresses.m_breached ) return &m_addresses;
|
|
// return that error otherwise
|
|
//g_errno = EBUFOVERFLOW;
|
|
//return NULL;
|
|
return &m_addresses;
|
|
}
|
|
|
|
/*
|
|
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
|
|
if ( m_siteNumInlinksUniqueIpValid )
|
|
return &m_siteNumInlinksUniqueIp;
|
|
// get our companion number
|
|
int32_t *ni = getSiteNumInlinks();
|
|
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
|
|
// sanity check
|
|
if ( ! m_siteNumInlinksUniqueIp ) { char *xx=NULL;*xx=0; }
|
|
// ok we must be valid
|
|
return &m_siteNumInlinksUniqueIp;
|
|
}
|
|
|
|
int32_t *XmlDoc::getSiteNumInlinksUniqueCBlock ( ) {
|
|
if ( m_siteNumInlinksUniqueCBlockValid )
|
|
return &m_siteNumInlinksUniqueCBlock;
|
|
// get our companion number
|
|
int32_t *ni = getSiteNumInlinks();
|
|
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
|
|
// sanity check
|
|
if ( ! m_siteNumInlinksUniqueCBlock ) { char *xx=NULL;*xx=0; }
|
|
// ok we must be valid
|
|
return &m_siteNumInlinksUniqueCBlock;
|
|
}
|
|
|
|
int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
|
|
if ( m_siteNumInlinksTotalValid )
|
|
return &m_siteNumInlinksTotal;
|
|
// get our companion number
|
|
int32_t *ni = getSiteNumInlinks();
|
|
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
|
|
// sanity check
|
|
if ( ! m_siteNumInlinksTotal ) { char *xx=NULL;*xx=0; }
|
|
// ok we must be valid
|
|
return &m_siteNumInlinksTotal;
|
|
}
|
|
*/
|
|
|
|
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
|
|
int32_t *XmlDoc::getFirstIp ( ) {
|
|
// return it if we got it
|
|
if ( m_firstIpValid ) return &m_firstIp;
|
|
// note it
|
|
setStatus ( "getting first ip");
|
|
// get tag rec
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// got it
|
|
Tag *tag = gr->getTag ( "firstip" );
|
|
// get from tag
|
|
m_firstIp = 0;
|
|
if ( tag ) m_firstIp = atoip(tag->getTagData());
|
|
// if no tag, or is bogus in tag... set from ip
|
|
if ( m_firstIp == 0 || m_firstIp == -1 ) {
|
|
// need ip then!
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
|
|
// set that
|
|
m_firstIp = *ip;
|
|
}
|
|
m_firstIpValid = true;
|
|
return &m_firstIp;
|
|
// must be 4 bytes - no now its a string
|
|
//if ( tag->getTagDataSize() != 4 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
uint8_t *XmlDoc::getSiteNumInlinks8 () {
|
|
if ( m_siteNumInlinks8Valid ) return &m_siteNumInlinks8;
|
|
// get the full count
|
|
int32_t *si = getSiteNumInlinks();
|
|
if ( ! si || si == (int32_t *)-1 ) return (uint8_t *)si;
|
|
// convert to 8
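	// score32to8() maps the full 32-bit inlink count down to a single
	// byte so it can be stored compactly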
|
|
m_siteNumInlinks8 = score32to8 ( *si );
|
|
// validate
|
|
m_siteNumInlinks8Valid = true;
|
|
return &m_siteNumInlinks8;
|
|
}
|
|
|
|
// this is the # of GOOD INLINKS to the site. so it is no more than
|
|
// 1 per c block, and it has to pass link spam detection. this is the
|
|
// highest-level count of inlinks to the site. use it a lot.
|
|
int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
|
|
|
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
|
|
|
|
// sanity check
|
|
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
	// hack for speed. computeSiteNumInlinks is true by default,
	// but if the user turns it off then we just use sitelinks.txt
|
|
if ( cr && ! cr->m_computeSiteNumInlinks ) {
|
|
int32_t hostHash32 = getHostHash32a();
|
|
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
|
// try with www if not there
|
|
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
|
|
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
|
|
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
|
}
|
|
// fix core by setting these
|
|
// m_siteNumInlinksUniqueIp = 0;
|
|
// m_siteNumInlinksUniqueCBlock = 0;
|
|
// m_siteNumInlinksTotal = 0;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
	// and this
|
|
m_siteNumInlinksValid = true;
|
|
m_siteNumInlinks = 0;
|
|
// if still not in sitelinks.txt, just use 0
|
|
if ( min < 0 ) {
|
|
return &m_siteNumInlinks;
|
|
}
|
|
m_siteNumInlinks = min;
|
|
return &m_siteNumInlinks;
|
|
}
|
|
|
|
setStatus ( "getting site num inlinks");
|
|
|
|
// get it from the tag rec if we can
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (void *)-1 ) return (int32_t *)gr;
|
|
|
|
// the current top ip address
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
|
|
//int32_t top = *ip & 0x00ffffff;
|
|
|
|
// this happens when its NXDOMAIN reply from dns so assume
|
|
// no site inlinks
|
|
if ( *ip == 0 ) {
|
|
m_siteNumInlinks = 0;
|
|
// m_siteNumInlinksUniqueIp = 0;
|
|
// m_siteNumInlinksUniqueCBlock = 0;
|
|
// m_siteNumInlinksTotal = 0;
|
|
m_siteNumInlinksValid = true;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
return &m_siteNumInlinks;
|
|
}
|
|
|
|
if ( *ip == -1 ) {
|
|
log("xmldoc: ip is %"INT32", can not get site inlinks",*ip);
|
|
g_errno = EBADIP;
|
|
return NULL;
|
|
}
|
|
|
|
// wait for clock to sync before calling getTimeGlobal
|
|
int32_t wfts = waitForTimeSync();
|
|
// 0 means error, i guess g_errno should be set, -1 means blocked
|
|
if ( ! wfts ) return NULL;
|
|
if ( wfts == -1 ) return (int32_t *)-1;
|
|
|
|
setStatus ( "getting site num inlinks");
|
|
// check the tag first
|
|
Tag *tag = gr->getTag ("sitenuminlinks");
|
|
// is it valid?
|
|
bool valid = true;
|
|
// current time
|
|
int32_t now = getTimeGlobal();
|
|
// use the spidered time for the test collection for consistency
|
|
if ( !strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
now = getSpideredTime();//m_spideredTime;
|
|
}
|
|
// get tag age in days
|
|
int32_t age = 0; if ( tag ) age = (now - tag->m_timestamp) ;
|
|
	// add in some flutter to avoid having all hosts in the network
	// calling msg25 for this site at the same time.
	// a jitter of up to 10,000 seconds, i.e. roughly 3 hours.
|
|
int32_t flutter = rand() % 10000;
|
|
// add it in
|
|
age += flutter;
|
|
// . if site changes ip then toss the contact info out the window,
|
|
// but give it a two week grace period
|
|
// . well now we use the "ownershipchanged" tag to indicate that
|
|
//if (tag && age>14*3600*24) valid=false;
|
|
// . we also expire it periodically to keep the info uptodate
|
|
// . the higher quality the site, the longer the expiration date
|
|
int32_t ns = 0;
|
|
int32_t maxAge = 0;
|
|
int32_t sni = -1;
|
|
if ( tag ) {
|
|
// how many site inlinks?
|
|
ns = atol(tag->getTagData());
|
|
// for less popular sites use smaller maxAges
|
|
maxAge = 90;
|
|
if ( ns < 10 ) maxAge = 10;
|
|
else if ( ns < 30 ) maxAge = 15;
|
|
else if ( ns < 50 ) maxAge = 30;
|
|
else if ( ns < 100 ) maxAge = 60;
|
|
// if index size is tiny then maybe we are just starting to
|
|
// build something massive, so reduce the cached max age
|
|
int64_t nt = g_titledb.m_rdb.getCollNumTotalRecs(m_collnum);
|
|
if ( nt < 100000000 ) //100M
|
|
maxAge = 3;
|
|
if ( nt < 10000000 ) //10M
|
|
maxAge = 1;
|
|
// for every 100 urls you already got, add a day!
|
|
sni = atol(tag->getTagData());
|
|
// double if repairing
|
|
//if ( m_useSecondaryRdbs ) maxAge = (maxAge+1) * 2;
|
|
// fix bug for rebuild. rebuild any tag before now because
|
|
// the MAX_LINKERS_IN_TERMLIST was too small in Linkdb.cpp
|
|
// and i raised from 1M to 3M. it was hurting mahalo.com.
|
|
if ( m_useSecondaryRdbs && tag->m_timestamp < 1345819704 )
|
|
valid = false;
|
|
// force another rebuild of siterank because i fixed
|
|
// the 'beds' query a little to use firstip, so recompute
|
|
// siterank for those spammers.
|
|
if ( m_useSecondaryRdbs && tag->m_timestamp < 1348257346 &&
|
|
		// leave really big guys intact
|
|
sni < 300 )
|
|
valid = false;
|
|
// convert into seconds
|
|
maxAge *= 3600*24;
|
|
// so youtube which has 2997 links will add an extra 29 days
|
|
maxAge += (sni / 100) * 86400;
|
|
// hack for global index. never affect siteinlinks i imported
|
|
if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0;
|
|
		// invalidate for that as well
|
|
if ( age > maxAge ) valid = false;
|
|
}
|
|
// our companion tags, sitePop and fresh inlinks
|
|
// Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
|
|
// Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
|
|
// Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
|
|
// if we are missing either of those, invalidate as well
|
|
// if ( ! tag2 ) valid = false;
|
|
// if ( ! tag3 ) valid = false;
|
|
// if ( ! tag4 ) valid = false;
|
|
// if we have already been through this
|
|
if ( m_updatingSiteLinkInfoTags ) valid = false;
|
|
// if rebuilding linkdb assume we have no links to sample from!
|
|
if ( tag && m_useSecondaryRdbs && g_repair.m_rebuildLinkdb )
|
|
valid = true;
|
|
|
|
// debug log
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: valid=%"INT32" "
|
|
"age=%"INT32" ns=%"INT32" sni=%"INT32" "
|
|
"maxage=%"INT32" "
|
|
"tag=%"PTRFMT" "
|
|
// "tag2=%"PTRFMT" "
|
|
// "tag3=%"PTRFMT" "
|
|
"url=%s",
|
|
(int32_t)valid,age,ns,sni,
|
|
maxAge,
|
|
(PTRTYPE)tag,
|
|
// (PTRTYPE)tag2,
|
|
// (PTRTYPE)tag3,
|
|
m_firstUrl.m_url);
|
|
|
|
LinkInfo *sinfo = NULL;
|
|
char *mysite = NULL;
|
|
|
|
// if we are good return it
|
|
if ( tag && valid ) {
|
|
// set it
|
|
m_siteNumInlinks = atol(tag->getTagData());
|
|
m_siteNumInlinksValid = true;
|
|
|
|
// companion tags
|
|
// if ( tag2 ) {
|
|
// m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// }
|
|
// if ( tag3 ) {
|
|
// m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// }
|
|
// if ( tag4 ) {
|
|
// m_siteNumInlinksTotal =atol(tag4->getTagData());
|
|
// m_siteNumInlinksTotalValid = true;
|
|
// }
|
|
|
|
// . consult our sitelinks.txt file
|
|
// . returns -1 if not found
|
|
goto updateToMin;
|
|
}
|
|
|
|
// set status. we can time status changes with this routine!
|
|
//setStatus ( "getting site link info");
|
|
|
|
|
|
// if ip is bad we can't do this. we need to have a legit ip
|
|
// so we know if a linker is internal or not
|
|
/*
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
log("gb: bad ip so we can't get site num inlinks right");
|
|
m_siteNumInlinks = 0;
|
|
m_sitePop = 0;
|
|
m_siteNumInlinksFresh = 0;
|
|
m_siteNumInlinksValid = true;
|
|
m_siteNumInlinksFreshValid = true;
|
|
m_sitePopValid = true;
|
|
return &m_siteNumInlinks;
|
|
}
|
|
*/
|
|
|
|
// set this flag so when we are re-called, "valid" will be set to false
|
|
// so we can come down here and continue this. "flutter" might
|
|
// otherwise cause us to not make it down here.
|
|
m_updatingSiteLinkInfoTags = true;
|
|
|
|
// we need to re-get both if either is NULL
|
|
sinfo = getSiteLinkInfo();
|
|
// block or error?
|
|
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
|
|
|
|
//
|
|
// now update tagdb!
|
|
//
|
|
|
|
// ok, get the sites of the external outlinks and they must
|
|
// also be NEW outlinks, added to the page since the last time
|
|
// we spidered it...
|
|
//Links *links = getLinks ();
|
|
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
|
|
|
mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
|
|
|
|
setStatus ( "adding site info tags to tagdb 1");
|
|
|
|
// why are we adding tag again! should already be in tagdb!!!
|
|
if ( m_doingConsistencyCheck ) {char*xx=NULL;*xx=0;}
|
|
|
|
// do not re-call at this point
|
|
//m_siteNumInlinks = sinfo->m_numInlinksExtrapolated;
|
|
m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks;
|
|
//m_siteNumInlinksFresh = sinfo->m_numInlinksFresh;
|
|
//m_sitePop = sinfo->m_pagePop;
|
|
// m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
|
|
// m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
|
|
// m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
|
|
|
|
m_siteNumInlinksValid = true;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
|
|
|
|
updateToMin:
|
|
|
|
// . consult our sitelinks.txt file
|
|
// . returns -1 if not found
|
|
int32_t hostHash32 = getHostHash32a();
|
|
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
|
|
|
// try with www if not there
|
|
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
|
|
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
|
|
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
|
}
|
|
|
|
if ( min >= 0 ) {
|
|
if ( m_siteNumInlinks < min ||
|
|
! m_siteNumInlinksValid ) {
|
|
m_siteNumInlinks = min;
|
|
m_siteNumInlinksValid = true;
|
|
}
|
|
// if ( ! m_siteNumInlinksUniqueIpValid ||
|
|
// m_siteNumInlinksUniqueIp < min ) {
|
|
// m_siteNumInlinksUniqueIp = min;
|
|
// m_siteNumInlinksUniqueIpValid = true;
|
|
// }
|
|
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
|
|
// m_siteNumInlinksUniqueCBlock < min ) {
|
|
// m_siteNumInlinksUniqueCBlock = min;
|
|
// m_siteNumInlinksUniqueCBlockValid = true;
|
|
// }
|
|
// if ( ! m_siteNumInlinksTotalValid ||
|
|
// m_siteNumInlinksTotal < min ) {
|
|
// m_siteNumInlinksTotal = min;
|
|
// m_siteNumInlinksTotalValid = true;
|
|
// }
|
|
}
|
|
|
|
|
|
|
|
// deal with it
|
|
return &m_siteNumInlinks;
|
|
}
|
|
|
|
// . do a 'site:xyz.com | gbnuminlinks' query to get the top docs
|
|
// from a site and get the gigabits from that query!
|
|
// . then store the resulting gigabits into tagdb for efficiency
|
|
// . recompute once per month or so ... or if ip changes i guess
|
|
// . we need the root title as a source for city and adm1's for
|
|
// Addresses::set() function
|
|
//char **XmlDoc::getSiteGigabits ( ) {
|
|
//}
|
|
|
|
// TODO: can we have a NULL LinkInfo without having had an error?
|
|
LinkInfo *XmlDoc::getSiteLinkInfo() {
|
|
// lookup problem?
|
|
if ( g_errno ) {
|
|
log("build: error getting link info: %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
setStatus ( "getting site link info" );
|
|
|
|
if ( m_siteLinkInfoValid )
|
|
//return msg25.m_linkInfo;
|
|
return (LinkInfo *)m_mySiteLinkInfoBuf.getBufStart();
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
|
|
int32_t *fip = getFirstIp();
|
|
if ( ! fip || fip == (int32_t *)-1) return (LinkInfo *)fip;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// can we be cancelled?
|
|
bool canBeCancelled = true;
|
|
// not if pageparser though
|
|
if ( m_pbuf ) canBeCancelled = false;
|
|
// not if injecting
|
|
if ( ! m_sreqValid ) canBeCancelled = false;
|
|
// assume valid when it returns
|
|
m_siteLinkInfoValid = true;
|
|
// use this buffer so XmlDoc::print() can display it where it wants
|
|
SafeBuf *sb = NULL;
|
|
if ( m_pbuf ) sb = &m_siteLinkBuf;
|
|
// only do this for showing them!!!
|
|
if ( m_useSiteLinkBuf ) sb = &m_siteLinkBuf;
|
|
//bool onlyGetGoodInlinks = true;
|
|
//if ( m_useSiteLinkBuf ) onlyGetGoodInlinks = false;
|
|
// get this
|
|
int32_t lastUpdateTime = getTimeGlobal();
|
|
// get from spider request if there
|
|
//bool injected = false;
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
|
// but be consistent if doing the "qatest123" collection
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
|
|
lastUpdateTime = getSpideredTime();//m_spideredTime;
|
|
}
|
|
|
|
bool onlyNeedGoodInlinks = true;
|
|
// so if steve wants to display all links then set this
|
|
// to false so we get titles of bad inlinks
|
|
// seems like pageparser.cpp just sets m_pbuf and not
|
|
// m_usePageLinkBuf any more
|
|
if ( sb ) onlyNeedGoodInlinks = false;
|
|
|
|
	// shortcut
|
|
//Msg25 *m = &m_msg25;
|
|
if ( ! getLinkInfo ( &m_tmpBuf11,
|
|
&m_mcast11,
|
|
mysite , // site
|
|
mysite , // url
|
|
true , // isSiteLinkInfo?
|
|
*fip ,
|
|
0 , // docId
|
|
cr->m_collnum , //linkInfoColl
|
|
NULL , // qbuf
|
|
0 , // qbufSize
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_contentInjected ,// isInjecting?
|
|
sb ,
|
|
m_printInXml ,
|
|
0 , // sitenuminlinks -- dunno!
|
|
//0 , // sitePop
|
|
NULL , // oldLinkInfo1 ,
|
|
m_niceness ,
|
|
cr->m_doLinkSpamCheck ,
|
|
cr->m_oneVotePerIpDom ,
|
|
canBeCancelled ,
|
|
lastUpdateTime ,
|
|
onlyNeedGoodInlinks ,
|
|
false,
|
|
0,
|
|
0,
|
|
// it will store the linkinfo into this safebuf
|
|
&m_mySiteLinkInfoBuf) )
|
|
// return -1 if it blocked
|
|
return (LinkInfo *)-1;
|
|
// sanity check
|
|
//if ( ! m_msg25.m_linkInfo ) {
|
|
// log("build: error making link info: %s",mstrerror(g_errno));
|
|
// return NULL;
|
|
//}
|
|
// we got it
|
|
//return m_msg25.m_linkInfo;
|
|
// getLinkInfo() now calls multicast so it returns true on errors only
|
|
log("build: error making link info: %s",mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
|
|
static void gotIpWrapper ( void *state , int32_t ip ) ;
|
|
|
|
static void delayWrapper ( int fd , void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
int32_t *XmlDoc::getIp ( ) {
|
|
// return if we got it
|
|
if ( m_ipValid ) return &m_ip;
|
|
// update status msg
|
|
setStatus ( "getting ip" );
|
|
|
|
m_ipStartTime = 0;
|
|
// assume the same in case we get it right away
|
|
m_ipEndTime = 0;
|
|
|
|
// if set from docid and recycling
|
|
if ( m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (int32_t *)pod;
|
|
		// shortcut
|
|
XmlDoc *od = *pod;
|
|
// set it
|
|
if ( od ) {
|
|
m_ip = od->m_ip;
|
|
m_ipValid = true;
|
|
return &m_ip;
|
|
}
|
|
}
|
|
|
|
|
|
// fakeit for now
|
|
//log("FAKING IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
|
|
//m_ip = atoip("74.201.80.152",13);
|
|
//m_ipValid = true;
|
|
//return &m_ip;
|
|
|
|
// get the best url
|
|
Url *u = getCurrentUrl();
|
|
if ( ! u || u == (void *)-1 ) return (int32_t *)u;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
bool useTestCache = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
|
// unless its the pagesubmit.cpp event submission tool
|
|
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
|
|
|
|
|
// when building the "qatest123" collection try to get the ip from
|
|
// "./test/ips.txt" so our injections are consistent every time
|
|
// Test.cpp runs its injection loop into the "qatest123" collection
|
|
if ( useTestCache ) { // && m_useIpsTxtFile ) {
|
|
// stolen from msgc.cpp:
|
|
// if url is already in a.b.c.d format return that
|
|
int32_t ip2 = 0;
|
|
char *host = u->getHost();
|
|
if ( host ) ip2 = atoip ( host,u->getHostLen() );
|
|
if ( ip2 != 0 ) {
|
|
m_ip = ip2;
|
|
m_ipValid = true;
|
|
return &m_ip;
|
|
}
|
|
// assume not found in our file
|
|
bool found = false;
|
|
// get test dir
|
|
char *testDir = getTestDir();
|
|
// get it from "./test/ips.txt"
|
|
getTestIp ( u->getUrl() , &m_ip , &found , m_niceness,testDir);
|
|
// if we found a match...
|
|
if ( found ) { // m_ip != 0 ) {
|
|
// we are valid now
|
|
return gotIp ( false );
|
|
//m_ipValid = true;
|
|
// return it
|
|
//return &m_ip;
|
|
}
|
|
}
|
|
|
|
// we need the ip before we download the page, but before we get
|
|
// the IP and download the page, wait for this many milliseconds.
|
|
// this basically slows the spider down.
|
|
int32_t delay = cr->m_spiderDelayInMilliseconds;
|
|
// ignore for testing
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
|
|
// injected?
|
|
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
|
|
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
|
|
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
|
|
// . don't do the delay when downloading extra doc, robots.txt etc.
|
|
// . this also reports a status msg of "getting new doc" when it
|
|
// really means "delaying spider"
|
|
if ( m_isChildDoc ) delay = 0;
|
|
|
|
if ( delay > 0 && ! m_didDelay ) {
|
|
// we did it
|
|
m_didDelay = true;
|
|
m_statusMsg = "delaying spider";
|
|
// random fuzz so we don't get everyone being unleashed at once
|
|
int32_t radius = (int32_t)(.20 * (double)delay);
|
|
int32_t fuzz = (rand() % (radius * 2)) - radius;
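		// fuzz is roughly uniform in [-radius, +radius), i.e. about
		// +/- 20% of the configured delay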
|
|
delay += fuzz;
|
|
// make a callback wrapper.
|
|
// this returns false and sets g_errno on error
|
|
if ( g_loop.registerSleepCallback ( delay ,
|
|
m_masterState ,
|
|
delayWrapper,//m_masterLoop
|
|
m_niceness ))
|
|
// wait for it, return -1 since we blocked
|
|
return (int32_t *)-1;
|
|
// if was not able to register, ignore delay
|
|
}
|
|
|
|
if ( m_didDelay && ! m_didDelayUnregister ) {
|
|
g_loop.unregisterSleepCallback(m_masterState,delayWrapper);
|
|
m_didDelayUnregister = true;
|
|
}
|
|
|
|
// update status msg
|
|
setStatus ( "getting ip" );
|
|
|
|
m_ipStartTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
// assume valid! if reply handler gets g_errno set then m_masterLoop
|
|
// should see that and call the final callback
|
|
//m_ipValid = true;
|
|
// get it
|
|
if ( ! m_msgc.getIp ( u->getHost () ,
|
|
u->getHostLen() ,
|
|
&m_ip ,
|
|
this ,
|
|
gotIpWrapper ))
|
|
// we blocked
|
|
return (int32_t *)-1;
|
|
// wrap it up
|
|
return gotIp ( true );
|
|
}
|
|
|
|
void gotIpWrapper ( void *state , int32_t ip ) {
|
|
// point to us
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
// wrap it up
|
|
THIS->gotIp ( true );
|
|
// . call the master callback
|
|
// . m_masterState usually equals THIS, unless THIS is the
|
|
// Xml::m_contactDoc or something...
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
int32_t *XmlDoc::gotIp ( bool save ) {
|
|
// return NULL on error
|
|
if ( g_errno ) return NULL;
|
|
// this is bad too
|
|
//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
|
|
//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
|
|
|
|
setStatus ("got ip");
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// note it for crawlbot
|
|
if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
|
|
log("db: got ip %"INT32" for %s",
|
|
m_ip,getCurrentUrl()->getUrl());
|
|
|
|
bool useTestCache = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
|
// unless its the pagesubmit.cpp event submission tool
|
|
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
|
|
|
|
|
// when building the "qatest123" collection try to get the ip from
|
|
// "./test/ips.txt" so our injections are consistent every time
|
|
// Test.cpp runs its injection loop into the "qatest123" collection
|
|
if ( save && useTestCache ) {
|
|
// ip of 0 means NXDOMAIN i think (-1 means error)
|
|
//if ( m_ip == 0 ) {
|
|
// log("waiting for debug break");
|
|
// sleep(3600);
|
|
//}
|
|
// get the best url
|
|
Url *u = getCurrentUrl();
|
|
if ( !u || u == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// . add it to "./test/ips.txt"
|
|
// . this function is in Msge1.cpp
|
|
addTestIp ( u->getHost() , u->getHostLen() , m_ip );
|
|
// get test dir
|
|
char *testDir = getTestDir();
|
|
// save it
|
|
saveTestBuf ( testDir );
|
|
}
|
|
|
|
// we got it
|
|
m_ipValid = true;
|
|
// give it to them
|
|
return &m_ip;
|
|
}
|
|
|
|
#include "Mime.h"
|
|
|
|
// taken from Robotdb.cpp
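// . scans the robots.txt content one mime-style line at a time, looking for
//   a "User-agent:" line that matches userAgent (the caller retries with
//   "*" if no explicit match was found), then walks the Disallow/Allow/
//   Crawl-delay/Sitemap lines that follow until the next user-agent block
// . also returns, via cacheStart/cacheLen, the byte range of the file that
//   applies to this user agent so only that portion needs to be cached
// . an illustrative example of the kind of file this parses:
//     User-agent: Gigabot
//     Crawl-delay: 1.5
//     Disallow: /cgi-bin/
//     User-agent: *
//     Disallow: /private/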
|
|
bool isAllowed2 ( Url *url ,
|
|
char *userAgent ,
|
|
char *file ,
|
|
int32_t fileLen ,
|
|
bool *userAgentFound ,
|
|
bool substringMatch ,
|
|
int32_t *crawlDelay ,
|
|
char **cacheStart ,
|
|
int32_t *cacheLen ,
|
|
bool *hadAllowOrDisallow ) {
|
|
// assume nothing to cache yet
|
|
*cacheLen = 0;
|
|
*cacheStart = file;
|
|
// assume user agent is not in the file
|
|
*userAgentFound = false;
|
|
*hadAllowOrDisallow = false;
|
|
// assume no crawl delay (-1)
|
|
// *crawlDelay = -1;
|
|
// if fileLen is 0 it is allowed
|
|
if ( fileLen <= 0 ) return true;
|
|
// get path from url, include cgi stuff
|
|
char *path = url->getPath();
|
|
int32_t pathLen = url->getPathLenWithCgi();
|
|
// set the Mime class to this Mime file
|
|
Mime mime;
|
|
mime.set ( file , fileLen );
|
|
// get a line of Mime
|
|
char *f , *v;
|
|
int32_t flen, vlen;
|
|
// user agent length
|
|
int32_t uaLen = gbstrlen (userAgent);
|
|
// ptr into "file"
|
|
char *p = file;
|
|
char flag;
|
|
bool allowed = true;
|
|
loop:
|
|
// if p is NULL now we're done
|
|
if ( ! p ) return allowed;
|
|
// get the next Mime line
|
|
p = mime.getLine ( p , &f , &flen , &v , &vlen );
|
|
// if this field is NOT "user-agent" skip it
|
|
if ( flen != 10 ) goto loop;
|
|
if ( strncasecmp ( f , "user-agent" , 10 ) != 0 ) goto loop;
|
|
gotAgent:
|
|
//some webmasters put comments at the end of their lines,
|
|
//because they think this is a shell script or something.
|
|
char* vv = v;
|
|
while(vv - v < vlen && *vv != '#') vv++;
|
|
vlen = vv - v;
|
|
// decrement vlen to hack off spaces after the user-agent so that vlen
|
|
// is really the length of the user agent
|
|
while ( vlen > 0 && is_wspace_a(v[vlen-1]) ) vlen--;
|
|
// now match the user agent
|
|
if ( ! substringMatch && vlen != uaLen ) goto loop;
|
|
// otherwise take the min of the lengths
|
|
if ( uaLen < vlen ) vlen = uaLen;
|
|
// is it the right user-agent?
|
|
if ( strncasecmp ( v , userAgent , vlen ) != 0 ) goto loop;
|
|
// we got it, if first instance start our cache here
|
|
if ( !*userAgentFound ) *cacheStart = f;
|
|
*userAgentFound = true;
|
|
flag = 0;
|
|
urlLoop:
|
|
// if p is NULL now there is no more lines
|
|
if ( ! p ) {
|
|
// set our cache stop to the end of the file
|
|
*cacheLen = (file + fileLen) - *cacheStart;
|
|
return allowed;
|
|
}
|
|
// now loop over lines until we hit another user-agent line
|
|
p = mime.getLine ( p , &f , &flen , &v , &vlen );
|
|
	// if it's another user-agent line ... ignore it unless we already
	// have seen a disallow line, in which case we got another set of
	// rules for a different user-agent, so close our cache and re-check
|
|
if ( flag && flen==10 && strncasecmp(f,"user-agent",10)==0) {
|
|
// set our cache stop here
|
|
*cacheLen = f - *cacheStart;
|
|
goto gotAgent;
|
|
}
|
|
// if a crawl delay, get the delay
|
|
if ( flen == 11 && strncasecmp ( f , "crawl-delay", 11 ) == 0 ) {
|
|
// set flag
|
|
flag = 1;
|
|
// skip if invalid. it could be ".5" seconds
|
|
if ( ! is_digit ( *v ) && *v != '.' ) goto urlLoop;
|
|
// get this. multiply crawl delay by x1000 to be in
|
|
// milliseconds/ms
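		// e.g. "Crawl-delay: .5" becomes 500 ms and
		// "Crawl-delay: 10" becomes 10000 ms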
|
|
int64_t vv = (int64_t)(atof(v) * 1000LL);
|
|
// truncate to 0x7fffffff
|
|
if ( vv > 0x7fffffff ) *crawlDelay = 0x7fffffff;
|
|
else if ( vv < 0 ) *crawlDelay = -1;
|
|
else *crawlDelay = (int32_t)vv;
|
|
// get the delay
|
|
//*crawlDelay = atol(v) * 1000;
|
|
goto urlLoop;
|
|
}
|
|
// if already disallowed, just goto the next line
|
|
if ( !allowed ) goto urlLoop;
|
|
// if we have an allow line or sitemap: line, then set flag to 1
|
|
// so we can go to another user-agent line.
|
|
// fixes romwebermarketplace.com/robots.txt
|
|
// (doc.156447320458030317.txt)
|
|
if ( flen==5 && strncasecmp(f,"allow" ,5)==0 ) {
|
|
*hadAllowOrDisallow = true;
|
|
flag = 1;
|
|
}
|
|
if ( flen==7 && strncasecmp(f,"sitemap",7)==0 ) {
|
|
flag = 1;
|
|
}
|
|
// if not disallow go to loop at top
|
|
if ( flen != 8 ) goto urlLoop;
|
|
if ( strncasecmp ( f , "disallow" , 8 ) != 0 ) {
|
|
goto urlLoop;
|
|
}
|
|
// we had a disallow
|
|
*hadAllowOrDisallow = true;
|
|
// set flag
|
|
flag = 1;
|
|
// . take off trailing chars from the banned path name
|
|
// . this is now done below
|
|
//while ( vlen > 0 && is_space(v[vlen-1]) ) vlen--;
|
|
// . skip leading spaces
|
|
// . this should be done in mime class
|
|
// while ( vlen > 0 && is_space(v[0]) ) { v++; vlen--; }
|
|
// now stop at first space after url or end of line
|
|
char *s = v;
|
|
char *send = v + vlen;
|
|
// skip all non-space chars
|
|
while ( s < send && ! is_wspace_a(*s) ) s++;
|
|
// stop there
|
|
vlen = s - v;
|
|
// check for match
|
|
char *tmpPath = path;
|
|
int32_t tmpPathLen = pathLen;
|
|
// assume path begins with /
|
|
if ( vlen > 0 && v[0] != '/'){tmpPath++;tmpPathLen--;}
|
|
if ( vlen > tmpPathLen ) goto urlLoop;
|
|
if ( strncasecmp(tmpPath,v,vlen) != 0 ) goto urlLoop;
|
|
// an exact match
|
|
if ( vlen == tmpPathLen ) {
|
|
//return false;
|
|
allowed = false;
|
|
goto urlLoop;
|
|
}
|
|
// must be something
|
|
if ( vlen <= 0 ) goto urlLoop;
|
|
// "v" may or may not end in a /, it really should end in a / though
|
|
if ( v[vlen-1] == '/' && tmpPath[vlen-1] == '/' ) {
|
|
//return false;
|
|
allowed = false;
|
|
goto urlLoop;
|
|
}
|
|
if ( v[vlen-1] != '/' && tmpPath[vlen ] == '/' ) {
|
|
//return false;
|
|
allowed = false;
|
|
goto urlLoop;
|
|
}
|
|
// let's be stronger. just do the substring match. if the webmaster
|
|
// does not want us splitting path or file names then they should end
|
|
// all of their robots.txt entries in a '/'. this also fixes the
|
|
// problem of the "Disallow: index.htm?" line.
|
|
//return false;
|
|
allowed = false;
|
|
// get another url path
|
|
goto urlLoop;
|
|
}
|
|
|
|
// when doing a custom crawl we have to decide between the provided crawl
|
|
// delay, and the one in the robots.txt...
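// in short: for non-custom crawls we take the robots.txt crawl delay,
// defaulting to 250 ms if robots.txt gave none; for custom crawls we
// reconcile the robots.txt value with the collection's manually
// configured collectiveCrawlDelay as laid out below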
|
|
int32_t *XmlDoc::getFinalCrawlDelay() {
|
|
|
|
if ( m_finalCrawlDelayValid )
|
|
return &m_finalCrawlDelay;
|
|
|
|
bool *isAllowed = getIsAllowed();
|
|
if ( ! isAllowed || isAllowed == (void *)-1 ) return (int32_t *)isAllowed;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
m_finalCrawlDelayValid = true;
|
|
|
|
// getIsAllowed already sets m_crawlDelayValid to true
|
|
if ( ! cr->m_isCustomCrawl ) {
|
|
m_finalCrawlDelay = m_crawlDelay;
|
|
// default to 250ms i guess if none specified in robots
|
|
// just to be somewhat nice by default
|
|
if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
|
|
return &m_finalCrawlDelay;
|
|
}
|
|
|
|
// get manually specified crawl delay in seconds. convert to ms.
|
|
int32_t manual = (int32_t)(cr->m_collectiveCrawlDelay * 1000.0);
|
|
// negative means -1 means unknown or not specified
|
|
if ( manual < 0 ) manual = -1;
|
|
|
|
// if both are unknown...
|
|
if ( m_crawlDelay == -1 && manual == -1 ) {
|
|
m_finalCrawlDelay = -1;
|
|
return &m_finalCrawlDelay;
|
|
}
|
|
|
|
// if not in robots.txt use manual
|
|
if ( m_crawlDelay == -1 ) {
|
|
m_finalCrawlDelay = manual;
|
|
return &m_finalCrawlDelay;
|
|
}
|
|
|
|
// if manually provided crawldelay is -1, use robots.txt then
|
|
if ( manual == -1 ) {
|
|
m_finalCrawlDelay = m_crawlDelay;
|
|
return &m_finalCrawlDelay;
|
|
}
|
|
|
|
// let robots.txt dictate if both are >= 0
|
|
if ( m_useRobotsTxt ) {
|
|
m_finalCrawlDelay = m_crawlDelay;
|
|
return &m_finalCrawlDelay;
|
|
}
|
|
|
|
// if not using robots.txt, pick the smallest
|
|
if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
|
|
else m_finalCrawlDelay = manual;
|
|
|
|
return &m_finalCrawlDelay;
|
|
}
|
|
|
|
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
|
|
if ( m_isRobotsTxtUrlValid )
|
|
return m_isRobotsTxtUrl;
|
|
Url *fu = getFirstUrl();
|
|
m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
|
|
m_isRobotsTxtUrlValid = true;
|
|
return m_isRobotsTxtUrl;
|
|
}
|
|
|
|
// . get the Robots.txt and see if we are allowed
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
// . getting a robots.txt is not trivial since we need to follow redirects,
|
|
// so we make use of the powerful XmlDoc class for this
|
|
bool *XmlDoc::getIsAllowed ( ) {
|
|
// return if we got it
|
|
if ( m_isAllowedValid ) return &m_isAllowed;
|
|
// could be turned off for everyone
|
|
if ( ! m_useRobotsTxt ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
m_crawlDelayValid = true;
|
|
m_crawlDelay = -1;
|
|
//log("xmldoc: skipping robots.txt lookup for %s",
|
|
// m_firstUrl.m_url);
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// . if setting from a title rec, assume allowed
|
|
// . this avoids doConsistencyCheck() from blocking and coring
|
|
if ( m_setFromTitleRec ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
if ( m_recycleContent ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// HACK: so we can spider archive.org warcs and arcs internally
|
|
if ( m_firstUrlValid &&
|
|
m_firstUrl.getDomainLen() == 11 &&
|
|
strncmp ( m_firstUrl.getDomain() , "archive.org" , 11 ) == 0 ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
|
|
// double get?
|
|
if ( m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// bulk jobs don't need this
|
|
CollectionRec *cr = getCollRec();
|
|
if ( cr && cr->m_isCustomCrawl == 2 ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// . if WE are robots.txt that is always allowed!!!
|
|
	// . check the *first* url since these often redirect to weird things
|
|
if ( isFirstUrlRobotsTxt() ) {
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
m_crawlDelayValid = true;
|
|
// make it super fast...
|
|
m_crawlDelay = 0;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// or if using the "qatest123" collection, assume yes!
|
|
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
|
|
// m_isAllowed = true;
|
|
// m_isAllowedValid = true;
|
|
// return &m_isAllowed;
|
|
//}
|
|
|
|
// update status msg
|
|
setStatus ( "getting robots.txt" );
|
|
// sanity
|
|
int32_t *ip = getIp ();
|
|
// error? or blocked?
|
|
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
|
|
Url *fu = getFirstUrl();
|
|
// if ip does not exist on the dns, do not try to download robots.txt
|
|
// it is pointless... this can happen in the dir coll and we basically
|
|
// have "m_siteInCatdb" set to true
|
|
if ( *ip == 1 || *ip == 0 || *ip == -1 ) {
|
|
// note this
|
|
log("build: robots.txt ip is %s for url=%s. allowing for now.",
|
|
fu->getUrl(),iptoa(*ip));
|
|
// just core for now
|
|
//char *xx=NULL;*xx=0;
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
// since ENOMIME is no longer causing the indexCode
|
|
// to be set, we are getting a core because crawlDelay
|
|
// is invalid in getNewSpiderReply()
|
|
m_crawlDelayValid = true;
|
|
m_crawlDelay = -1;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// we need this so getExtraDoc does not core
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (bool *)pfip;
|
|
|
|
// get the current url after redirects
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (bool *)cu;
|
|
|
|
// set m_extraUrl to the robots.txt url
|
|
char buf[MAX_URL_LEN+2];
|
|
char *p = buf;
|
|
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
|
|
else p += sprintf ( p , "http://" );
|
|
// sanity
|
|
if ( ! cu->getHost() ) { char *xx=NULL;*xx=0; }
|
|
gbmemcpy ( p , cu->getHost() , cu->getHostLen() );
|
|
p += cu->getHostLen();
|
|
int32_t port = cu->getPort();
|
|
// 80 is the default port
|
|
int32_t defPort = 80;
|
|
// is it https://?
|
|
if ( cu->m_url[4] == 's' ) defPort = 443;
|
|
if ( port != defPort ) p += sprintf ( p , ":%"INT32"",port );
|
|
p += sprintf ( p , "/robots.txt" );
|
|
m_extraUrl.set ( buf );
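	// e.g. this yields "http://www.example.com/robots.txt", or
	// "https://shop.example.com:8443/robots.txt" when the url is on a
	// non-default port (hostnames here are illustrative only)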
|
|
|
|
// . maxCacheAge = 3600 seconds = 1 hour for robots.txt
|
|
// . if this is non-zero then msg13 should store it as well!
|
|
// . for robots.txt it should only cache the portion of the doc
|
|
// relevant to our user agent!
|
|
// . getHttpReply() should use msg13 to get cached reply!
|
|
XmlDoc **ped = getExtraDoc ( m_extraUrl.getUrl() , 3600 );
|
|
if ( ! ped || ped == (void *)-1 ) return (bool *)ped;
|
|
// assign it
|
|
XmlDoc *ed = *ped;
|
|
// return NULL on error with g_errno set
|
|
if ( ! ed ) {
|
|
// sanity check, g_errno must be set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// log it -- should be rare?
|
|
log("doc: had error getting robots.txt: %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
// inherit this
|
|
//if ( ! m_useIpsTxtFile ) ed->m_useIpsTxtFile = false;
|
|
// . steal m_firstIp from us to avoid tag rec lookup
|
|
// . why was this commented out?
|
|
// . maybe because if we redirect, this is not the same!!!
|
|
//ed->m_firstIp = m_firstIp;
|
|
//ed->m_firstIpValid = m_firstIpValid;//true;
|
|
// also, steal our ip! neither is this!
|
|
//ed->m_ip = m_ip;
|
|
//ed->m_ipValid = m_ipValid;
|
|
// . now try the content
|
|
// . should call getHttpReply
|
|
char **pcontent = ed->getContent();
|
|
if ( ! pcontent || pcontent == (void *)-1 ) return (bool *)pcontent;
|
|
// get the mime
|
|
HttpMime *mime = ed->getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (bool *)mime;
|
|
// get this
|
|
int32_t contentLen = ed->m_contentLen;
|
|
// save this
|
|
m_robotsTxtLen = contentLen;
|
|
m_robotsTxtLenValid = true;
|
|
// get content
|
|
char *content = *pcontent;
|
|
// sanity check
|
|
if ( content && contentLen>0 && content[contentLen] != '\0'){
|
|
char*xx=NULL;*xx=0;}
|
|
|
|
// reset this. -1 means unknown or none found.
|
|
m_crawlDelay = -1;
|
|
m_crawlDelayValid = true;
|
|
|
|
// assume valid and ok to spider
|
|
m_isAllowed = true;
|
|
m_isAllowedValid = true;
|
|
|
|
// put in a crawldelay test for diffbot
|
|
/*
|
|
SafeBuf tmp;
|
|
if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) {
|
|
tmp.safePrintf("User-Agent: *\n"
|
|
"Crawl-Delay: 10.1\n"
|
|
);
|
|
content = tmp.getBufStart();
|
|
contentLen = tmp.getLength();
|
|
}
|
|
|
|
// if not success, assume no robots.txt
|
|
else*/
|
|
|
|
if ( mime->getHttpStatus() != 200 ) {
|
|
// nuke it to save mem
|
|
nukeDoc ( ed );
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
// get the url we lookup
|
|
//Url *cu = getCurrentUrl();
|
|
|
|
// this is set to true if our userAgent was found explicitly
|
|
bool uaFound;
|
|
bool allowed;
|
|
char *cacheStart;
|
|
int32_t cacheLen;
|
|
bool hadAllowOrDisallow;
|
|
int32_t savedCrawlDelay = -1;
|
|
// now use left-anchored substring match so we can match Gigabot/1.0
|
|
allowed = isAllowed2 ( cu ,
|
|
g_conf.m_spiderUserAgent ,
|
|
content ,
|
|
contentLen ,
|
|
&uaFound ,
|
|
true , // substrmatch?
|
|
&m_crawlDelay ,
|
|
&cacheStart ,
|
|
&cacheLen ,
|
|
&hadAllowOrDisallow );
|
|
// save it
|
|
savedCrawlDelay = m_crawlDelay;
|
|
	// . if we didn't find our user agent, check for * as a user-agent
|
|
// . www.wikihow.com/robots.txt just has "Gigabot: crawl-delay:10\n"
|
|
// and then a "User-Agent: *" after that with the disallows, so
|
|
// i added the hadAllowDisallow parm
|
|
if ( ! uaFound || ! hadAllowOrDisallow )
|
|
allowed = isAllowed2 ( cu ,
|
|
"*" ,
|
|
content ,
|
|
contentLen ,
|
|
&uaFound ,
|
|
false , // substrmatch?
|
|
&m_crawlDelay ,
|
|
&cacheStart ,
|
|
&cacheLen ,
|
|
&hadAllowOrDisallow );
|
|
// bring back?
|
|
if ( savedCrawlDelay != -1 ) m_crawlDelay = savedCrawlDelay;
|
|
// nuke it to save mem
|
|
nukeDoc ( ed );
|
|
// we are legit
|
|
m_isAllowed = allowed;
|
|
m_isAllowedValid = true;
|
|
return &m_isAllowed;
|
|
}
|
|
|
|
|
|
// . lookup the title rec with the "www." if we do not have that in the url
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
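// . only a domain-only url whose host starts with "www." is a candidate;
//   it is flagged as a dup when a title rec already exists for the same
//   domain WITHOUT the "www." prefix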
|
|
char *XmlDoc::getIsWWWDup ( ) {
|
|
// this is not a real error really
|
|
//if ( g_errno == ENOTFOUND ) g_errno = 0;
|
|
// return if we got it
|
|
if ( m_isWWWDupValid ) return &m_isWWWDup;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// could be turned off for everyone
|
|
if ( ! cr->m_dupCheckWWW ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
// get the FIRST URL... (no longer current url after redirects)
|
|
Url *u = getFirstUrl(); // CurrentUrl();
|
|
// if we are NOT a DOMAIN-ONLY url, then no need to do this dup check
|
|
if ( u->getDomainLen() != u->getHostLen() ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
|
|
// must NOT have a www
|
|
if ( ! u->isHostWWW() ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
|
|
// watch out for idiot urls like www.gov.uk and www.gov.za
|
|
// treat them as though the TLD is uk/za and the domain
|
|
// is gov.uk and gov.za
|
|
if ( u->getDomain() &&
|
|
strncmp ( u->getDomain() , "www." , 4 ) == 0 ) {
|
|
m_isWWWDup = false;
|
|
m_isWWWDupValid = true;
|
|
return &m_isWWWDup;
|
|
}
|
|
|
|
// make it without the www
|
|
char withoutWWW[MAX_URL_LEN+1];
|
|
char *proto = "http";
|
|
if ( u->isHttps() ) proto = "https";
|
|
sprintf(withoutWWW,"%s://%s",proto,u->getDomain());
|
|
|
|
// assume yes
|
|
m_isWWWDup = true;
|
|
|
|
if ( ! m_calledMsg22f )
|
|
setStatus ( "getting possible www dup title rec" );
|
|
|
|
// . does this title rec exist in titledb?
|
|
// . "justCheckTfndb" is set to true here!
|
|
if ( ! m_calledMsg22f &&
|
|
! m_msg22f.getTitleRec ( &m_msg22Request ,
|
|
withoutWWW ,
|
|
0 , // probable docid
|
|
cr->m_coll ,
|
|
// . msg22 will set this to point to it!
|
|
// . if NULL that means NOT FOUND
|
|
NULL , // tr ptr
|
|
NULL , // tr size ptr
|
|
true , // just chk tfndb?
|
|
false, // getavaildocidonly
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_niceness , // niceness
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
999999 , // timeout seconds
|
|
false )){//load balancing?
|
|
// validate
|
|
m_calledMsg22f = true;
|
|
// return -1 if we blocked
|
|
return (char *)-1;
|
|
}
|
|
// got it
|
|
m_calledMsg22f = true;
|
|
// valid now
|
|
m_isWWWDupValid = true;
|
|
// found?
|
|
if ( ! g_errno && m_msg22f.m_found ) {
|
|
// crap we are a dup
|
|
m_isWWWDup = true;
|
|
// set the index code
|
|
//m_indexCode = EDOCDUPWWW;
|
|
}
|
|
// return us
|
|
return &m_isWWWDup;
|
|
}
LinkInfo s_dummy2;
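// a shared, zeroed-out LinkInfo handed back when a doc has no real link
// info (e.g. custom crawls, or a bogus ip) so callers always get a usable
// empty LinkInfo instead of a NULL pointer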
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
|
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 )
|
|
return ptr_linkInfo1;
|
|
|
|
// do not generate in real-time from a msg20 request for a summary,
|
|
// because if this falls through then getFirstIp() below can return -1
|
|
// and we return -1, causing all kinds of bad things to happen for
|
|
// handling the msg20 request
|
|
if ( m_setFromTitleRec && m_req && ! ptr_linkInfo1 ) {
|
|
returnDummy:
|
|
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
|
s_dummy2.m_lisize = sizeof(LinkInfo);
|
|
ptr_linkInfo1 = &s_dummy2;
|
|
size_linkInfo1 = sizeof(LinkInfo);
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
// at least get our firstip so if cr->m_getLinkInfo is false
|
|
// then getRevisedSpiderReq() will not core because it is invalid
|
|
int32_t *ip = getFirstIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;
|
|
|
|
// just return nothing if not doing link voting
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// to keep things fast we avoid getting link info for some collections
|
|
if ( ! m_linkInfo1Valid && ! cr->m_getLinkInfo ) {
|
|
ptr_linkInfo1 = NULL;
|
|
m_linkInfo1Valid = true;
|
|
}
|
|
|
|
// sometimes it is NULL in title rec when setting from title rec
|
|
if ( m_linkInfo1Valid && ! ptr_linkInfo1 ) {
|
|
goto returnDummy;
|
|
}
|
|
|
|
// return if we got it
|
|
if ( m_linkInfo1Valid )
|
|
return ptr_linkInfo1;
|
|
|
|
// change status
|
|
setStatus ( "getting local inlinkers" );
|
|
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo *)od;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni;
|
|
//int32_t *fip = getFirstIp();
|
|
//if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d;
|
|
// sanity check. error?
|
|
	if ( *d == 0LL ) {
		log("xmldoc: got a docid of 0 but g_errno was not set");
		g_errno = EBADENGINEER;
		return NULL;
	}
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
|
|
|
|
// no linkinfo for diffbot custom crawls to speed up
|
|
if ( cr->m_isCustomCrawl ) {
|
|
m_linkInfo1Valid = true;
|
|
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
|
s_dummy2.m_lisize = sizeof(LinkInfo);
|
|
ptr_linkInfo1 = &s_dummy2;
|
|
size_linkInfo1 = sizeof(LinkInfo);
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
// grab a ptr to the LinkInfo contained in our Doc class
|
|
LinkInfo *oldLinkInfo1 = NULL;
|
|
if ( *od ) oldLinkInfo1 = (*od)->getLinkInfo1();
|
|
|
|
// if ip does not exist, make it 0
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
m_linkInfo1Valid = true;
|
|
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
|
s_dummy2.m_lisize = sizeof(LinkInfo);
|
|
ptr_linkInfo1 = &s_dummy2;
|
|
size_linkInfo1 = sizeof(LinkInfo);
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
//link info generation requires an IP for internal/external computation
|
|
// UNLESS we are from getSpiderStatusDocMetaList2() ... so handle
|
|
// -1 above!
|
|
//if ( *ip == -1 || *ip == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . error getting linkers?
|
|
// . on udp timeout we were coring below because msg25.m_linkInfo
|
|
// was NULL
|
|
if ( g_errno && m_calledMsg25 ) return NULL;
|
|
// prevent core as well
|
|
//if ( m_calledMsg25 && ! size_linkInfo1 ) { // m_msg25.m_linkInfo ) {
|
|
// log("xmldoc: msg25 had null link info");
|
|
// g_errno = EBADENGINEER;
|
|
// return NULL;
|
|
//}
|
|
|
|
// . now search for some link info for this url/doc
|
|
// . this queries the search engine to get linking docIds along
|
|
// with their termIds/scores from anchor text and then compiles
|
|
// it all into one IndexList
|
|
// . if we have no linkers to this url then we set siteHash, etc.
|
|
// for this linkInfo class
|
|
// . this is my google algorithm
|
|
// . let's use the first url (before redirects) for this
|
|
// . m_newDocId is used for classifying doc under predefined news topic
|
|
// . catSiteRec is used for classifying pages under a predefined
|
|
// newstopic. this is currently for news search only.
|
|
// . use the rootTitleRecPtr if there and we are doing our link info
|
|
// stuff in this collection, but if doing it in another collection
|
|
// the msg25 will look up the root in that collection...
|
|
if ( ! m_calledMsg25 ) {
|
|
// get this
|
|
int32_t lastUpdateTime = getTimeGlobal();
|
|
// but be consistent if doing the "qatest123" collection
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
|
|
lastUpdateTime = getSpideredTime();//m_spideredTime;
|
|
}
|
|
// do not redo it
|
|
m_calledMsg25 = true;
|
|
		// shortcut
|
|
//Msg25 *m = &m_msg25;
|
|
// can we be cancelled?
|
|
bool canBeCancelled = true;
|
|
// not if pageparser though
|
|
if ( m_pbuf ) canBeCancelled = false;
|
|
// not if injecting
|
|
if ( ! m_sreqValid ) canBeCancelled = false;
|
|
// use this buffer so XmlDoc::print() can display wherever
|
|
SafeBuf *sb = NULL;
|
|
if ( m_pbuf ) sb = &m_pageLinkBuf;
|
|
// only do this for showing them!!!
|
|
if ( m_usePageLinkBuf ) sb = &m_pageLinkBuf;
|
|
// get from spider request if there
|
|
//bool injected = false;
|
|
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
|
// we do not want to waste time computing the page title
|
|
// of bad inlinks if we only want the good inlinks, because
|
|
// as of oct 25, 2012 we only store the "good" inlinks
|
|
// in the titlerec
|
|
bool onlyNeedGoodInlinks = true;
|
|
// so if steve wants to display all links then set this
|
|
// to false so we get titles of bad inlinks
|
|
if ( m_usePageLinkBuf ) onlyNeedGoodInlinks = false;
|
|
// seems like pageparser.cpp just sets m_pbuf and not
|
|
// m_usePageLinkBuf any more
|
|
if ( m_pbuf ) onlyNeedGoodInlinks = false;
|
|
// status update
|
|
setStatus ( "calling msg25 for url" );
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// we want to get all inlinks if doing a custom crawlbot crawl
|
|
// because we need the anchor text to pass in to diffbot
|
|
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
|
|
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
|
|
// this seems to overdo it when we have a ton of linktext
|
|
// perhaps, so take this out...
|
|
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
|
|
// doLinkSpamCheck = false;
|
|
// oneVotePerIpDom = false;
|
|
// onlyNeedGoodInlinks = false;
|
|
//}
|
|
|
|
// call it. this is defined in Linkdb.cpp
|
|
char *url = getFirstUrl()->getUrl();
|
|
if ( ! getLinkInfo ( &m_tmpBuf12,
|
|
&m_mcast12,
|
|
mysite ,
|
|
url ,
|
|
false , // isSiteLinkInfo?
|
|
*ip ,
|
|
*d ,
|
|
cr->m_collnum , //linkInfoColl
|
|
NULL , // qbuf
|
|
0 , // qbufSize
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
m_contentInjected ,//m_injectedReply ,
|
|
sb ,
|
|
m_printInXml ,
|
|
*sni ,
|
|
//m_sitePop ,
|
|
oldLinkInfo1 ,
|
|
m_niceness ,
|
|
doLinkSpamCheck ,
|
|
oneVotePerIpDom ,
|
|
canBeCancelled ,
|
|
lastUpdateTime ,
|
|
onlyNeedGoodInlinks ,
|
|
false, // getlinkertitles
|
|
0, // ourhosthash32 (special)
|
|
0, // ourdomhash32 (special)
|
|
&m_myPageLinkInfoBuf
|
|
) )
|
|
// blocked
|
|
return (LinkInfo *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// panic! what the fuck? why did it return true and then
|
|
// call our callback???
|
|
//if ( g_conf.m_logDebugBuild ) {
|
|
log("build: xmldoc call to msg25 did not block");
|
|
// must now block since it uses multicast now to
|
|
// send the request onto the network
|
|
char *xx=NULL;*xx=0;
|
|
//}
|
|
}
|
|
|
|
// at this point assume its valid
|
|
m_linkInfo1Valid = true;
|
|
// . get the link info we got set
|
|
// . this ptr references into m_myPageLinkInfoBuf safebuf
|
|
//ptr_linkInfo1 = m_msg25.m_linkInfo;
|
|
//size_linkInfo1 = m_msg25.m_linkInfo->getSize();
|
|
ptr_linkInfo1 = (LinkInfo *)m_myPageLinkInfoBuf.getBufStart();
|
|
size_linkInfo1 = m_myPageLinkInfoBuf.length();
|
|
// we should free it
|
|
m_freeLinkInfo1 = true;
|
|
// this can not be NULL!
|
|
if ( ! ptr_linkInfo1 || size_linkInfo1 <= 0 ) {
|
|
log("build: error getting linkinfo1: %s",mstrerror(g_errno));
|
|
char *xx=NULL;*xx=0;
|
|
return NULL;
|
|
}
|
|
// take it from msg25 permanently
|
|
//m_msg25.m_linkInfo = NULL;
|
|
// set flag
|
|
m_linkInfo1Valid = true;
|
|
// . validate the hop count thing too
|
|
// . i took hopcount out of linkdb to put in lower ip byte for steve
|
|
//m_minInlinkerHopCount = -1;//m_msg25.getMinInlinkerHopCount();
|
|
// return it
|
|
return ptr_linkInfo1;
|
|
}
|
|
|
|
|
|
static void *s_null = NULL;
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . returns -1 if blocked, will re-call m_callback
|
|
LinkInfo **XmlDoc::getLinkInfo2 ( ) {
|
|
|
|
// this can now be title hashes for XmlDoc::m_diffbotTitleHashes
|
|
// but otherwise, we don't use it for link info from another cluster
|
|
// any more.
|
|
m_linkInfo2Valid = true;
|
|
return (LinkInfo **)&s_null;
|
|
|
|
// return if we got it
|
|
if ( m_linkInfo2Valid ) return &ptr_linkInfo2;
|
|
|
|
m_linkInfo2Valid = true;
|
|
ptr_linkInfo2 = NULL;
|
|
return &ptr_linkInfo2;
|
|
|
|
/*
|
|
if ( ! cr->m_importFromHosts2Conf ) {
|
|
m_linkInfo2Valid = true;
|
|
ptr_linkInfo2 = NULL;
|
|
return &ptr_linkInfo2;
|
|
}
|
|
|
|
// change status
|
|
setStatus ( "getting remote hosts2.conf inlinkers" );
|
|
|
|
XmlDoc **od = getOldXmlDoc ( );
|
|
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo **)od;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo **)sni;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo **)ip;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo **)d;
|
|
// grab a ptr to the LinkInfo contained in our Doc class
|
|
LinkInfo *oldLinkInfo2 = NULL;
|
|
if ( *od ) oldLinkInfo2 = *(*od)->getLinkInfo2();
|
|
|
|
// . now search for some link info for this url/doc
|
|
// . this queries the search engine to get linking docIds along
|
|
// with their termIds/scores from anchor text and then compiles
|
|
// it all into one IndexList
|
|
// . if we have no linkers to this url then we set siteHash, etc.
|
|
// for this linkInfo class
|
|
// . this is my google algorithm
|
|
// . let's use the first url (before redirects) for this
|
|
// . m_newDocId is used for classifying doc under predefined news topic
|
|
// . catSiteRec is used for classifying pages under a predefined
|
|
// newstopic. this is currently for news search only.
|
|
// . use the rootTitleRecPtr if there and we are doing our link info
|
|
// stuff in this collection, but if doing it in another collection
|
|
// the msg25 will look up the root in that collection...
|
|
if ( ! m_calledMsg25b ) {
|
|
// do not redo it
|
|
m_calledMsg25b = true;
|
|
		// shortcut
|
|
Msg25 *m = &m_msg25;
|
|
// can we be cancelled?
|
|
bool canBeCancelled = true;
|
|
// not if pageparser though
|
|
if ( m_pbuf ) canBeCancelled = false;
|
|
// not if injecting
|
|
if ( ! m_sreqValid ) canBeCancelled = false;
|
|
// use this buffer so XmlDoc::print() can display wherever
|
|
//SafeBuf *sb = NULL;
|
|
//if ( m_pbuf ) sb = &m_pageLinkBuf2;
|
|
// call it
|
|
if ( ! m->getPageLinkInfo2 ( getFirstUrl() ,
|
|
m_coll ,
|
|
cr->m_externalColl ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
cr->m_doLinkSpamCheck ,
|
|
cr->m_oneVotePerIpDom ,
|
|
canBeCancelled ) )
|
|
// blocked
|
|
return (LinkInfo **)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
}
|
|
|
|
// at this point assume its valid
|
|
m_linkInfo2Valid = true;
|
|
// get the link info we got set
|
|
ptr_linkInfo2 = m_msg25.m_linkInfo;
|
|
// we should free it
|
|
m_freeLinkInfo2 = true;
|
|
// take it from msg25 permanently
|
|
m_msg25.m_linkInfo = NULL;
|
|
// set flag
|
|
m_linkInfo2Valid = true;
|
|
// validate the hop count thing too
|
|
//m_minInlinkerHopCount = m_msg25.getMinInlinkerHopCount();
|
|
// return it
|
|
return &ptr_linkInfo2;
|
|
*/
|
|
}
static void gotSiteWrapper ( void *state ) ;
|
|
|
|
// . we should store the site in the title rec because site getter might
|
|
// change what it thinks the site is!
|
|
char *XmlDoc::getSite ( ) {
|
|
// was there a problem getting site?
|
|
if ( m_siteValid && m_siteGetter.m_errno ) {
|
|
g_errno = m_siteGetter.m_errno;
|
|
return NULL;
|
|
}
|
|
// ok, return it
|
|
if ( m_siteValid ) return ptr_site;//m_siteGetter.m_site;
|
|
// note it
|
|
setStatus ( "getting site");
|
|
// need this
|
|
TagRec *gr = getTagRec();
|
|
// sanity check
|
|
if ( ! gr && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// blocked or error?
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// get url
|
|
Url *f = getFirstUrl();
|
|
// bogus first url? prevent core in getIsSiteRoot().
|
|
if ( f->getUrlLen() <= 1 ) {
|
|
log("xmldoc: getSite: got bogus first url.");
|
|
g_errno = EBADURL;
|
|
return NULL;
|
|
}
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t timestamp = getSpideredTime();//m_spideredTime;
|
|
// add tags to tagdb?
|
|
//bool addTags = true;
|
|
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
|
|
//if ( getIsPageParser() ) addTags = false;
|
|
// do it
|
|
if ( ! m_siteGetter.getSite ( f->getUrl() ,
|
|
gr ,
|
|
timestamp ,
|
|
cr->m_collnum ,
|
|
m_niceness ,
|
|
//addTags ,
|
|
this , // state
|
|
gotSiteWrapper ))
|
|
// return -1 if we blocked
|
|
return (char *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// set these then
|
|
gotSite();
|
|
return ptr_site;//m_siteGetter.m_site;
|
|
}
|
|
|
|
// set it
|
|
void gotSiteWrapper ( void *state ) {
|
|
// point to us
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->gotSite ();
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
void XmlDoc::gotSite ( ) {
|
|
// sanity check
|
|
if ( ! m_siteGetter.m_allDone && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// this sets g_errno on error
|
|
ptr_site = m_siteGetter.m_site;
|
|
size_site = m_siteGetter.m_siteLen+1; // include \0
|
|
// sanity check -- must have a site
|
|
if ( ! g_errno && size_site <= 1 ) { char *xx=NULL;*xx=0; }
|
|
// sitegetter.m_errno might be set!
|
|
m_siteValid = true;
|
|
// must be valid
|
|
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
// add the sitepathdepth tag to our tagrec
|
|
//Tag *a = m_siteGetter.m_addedTag.getFirstTag();
|
|
//if ( a ) m_newTagRec.addTag ( a );
|
|
}
|
|
|
|
int64_t *XmlDoc::getSiteHash64 ( ) {
|
|
if ( m_siteHash64Valid ) return &m_siteHash64;
|
|
char *site = getSite();
|
|
// sanity check
|
|
if ( ! site && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
if ( ! site || site == (void *)-1) return (int64_t *)site;
|
|
m_siteHash64 = hash64 ( site , gbstrlen(site) );
|
|
m_siteHash64Valid = true;
|
|
return &m_siteHash64;
|
|
}
|
|
|
|
|
|
int32_t *XmlDoc::getSiteHash32 ( ) {
|
|
if ( m_siteHash32Valid ) return &m_siteHash32;
|
|
char *site = getSite();
|
|
if ( ! site || site == (void *)-1) return (int32_t *)site;
|
|
m_siteHash32 = hash32 ( site , gbstrlen(site) );
|
|
m_siteHash32Valid = true;
|
|
return &m_siteHash32;
|
|
}
|
|
|
|
|
|
|
|
|
|
void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
|
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
bool hadError = false;
|
|
|
|
THIS->setStatus("got diffbot reply");
|
|
|
|
// wha?
|
|
if ( g_errno ) {
|
|
log("diffbot: http error2 %s",mstrerror(g_errno));
|
|
THIS->m_diffbotReplyError = g_errno;
|
|
hadError = true;
|
|
}
|
|
|
|
// just retry if connection got reset by peer!
|
|
if ( g_errno == ECONNRESET ||
|
|
g_errno == ETIMEDOUT ) {
|
|
retry:
|
|
// reset error in case was set below before our retry.
|
|
// getDiffbotReply() will retry because we never set
|
|
// m_diffbotReplyValid to true, below.
|
|
THIS->m_diffbotReplyError = 0;
|
|
log("buld: retrying diffbot reply");
|
|
THIS->m_diffbotReplyRetries++;
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
return;
|
|
}
|
|
|
|
THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
//char *buf = s->m_readBuf;
|
|
// do not allow TcpServer.cpp to free it since m_diffbotReply
|
|
// is now responsible for that
|
|
//s->m_readBuf = NULL;
|
|
|
|
// set the mime
|
|
HttpMime mime;
|
|
if ( ! hadError && s && s->m_readOffset>0 &&
|
|
// set location url to "null"
|
|
! mime.set ( s->m_readBuf , s->m_readOffset , NULL ) ) {
|
|
// g_errno should be set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("build: error setting diffbot mime");
|
|
THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR;
|
|
hadError = true;
|
|
}
|
|
|
|
bool retryUrl = false;
|
|
|
|
// check the status
|
|
if ( ! hadError && mime.getHttpStatus() != 200 ) {
|
|
THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS;
|
|
log("xmldoc: diffbot reply mime was %"INT32"",
|
|
mime.getHttpStatus());
|
|
hadError = true;
|
|
// gateway timed out? then retry.
|
|
if ( mime.getHttpStatus() == 504 )
|
|
retryUrl = true;
|
|
}
|
|
|
|
if ( hadError )
|
|
log("build: diffbot error for url %s",
|
|
THIS->m_diffbotUrl.getBufStart());
|
|
|
|
|
|
CollectionRec *cr = THIS->getCollRec();
|
|
|
|
if ( cr && strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
|
|
log("build: diffbot reply for url %s = %s",
|
|
THIS->m_diffbotUrl.getBufStart(),
|
|
s->m_readBuf);
|
|
}
|
|
|
|
|
|
if ( retryUrl )
|
|
goto retry;
|
|
|
|
// get page content
|
|
char *page = NULL;
|
|
int32_t pageLen = 0;
|
|
if ( ! hadError && mime.getMimeLen() >= 0 ) {
|
|
page = s->m_readBuf + mime.getMimeLen();
|
|
char *end = s->m_readBuf + s->m_readOffset;
|
|
pageLen = end - page;
|
|
}
|
|
|
|
// "-1" means diffbot had an error
|
|
if ( page &&
|
|
page[0] == '-' &&
|
|
page[1] == '1' ) {
|
|
log("xmldoc: diffbot reply was -1");
|
|
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
|
|
}
|
|
|
|
|
|
// . verify that it contains legit json and has the last field
|
|
// b/c we saw a case where the diffbot reply was truncated
|
|
// somehow
|
|
// . check to make sure it has the "url": field as all diffbot
|
|
// json replies must
|
|
if ( ! THIS->m_diffbotReplyError ) {
|
|
char *ttt = strstr ( page , "\"url\":\"");
|
|
if ( ! ttt ) ttt = strstr ( page , "\"pageUrl\":\"");
|
|
if ( ! ttt ) {
|
|
log("xmldoc: diffbot reply for %s using %s is missing "
|
|
"the url: field in the json reply. reply=%s",
|
|
THIS->m_firstUrl.m_url,
|
|
THIS->m_diffbotUrl.getBufStart(),
|
|
page
|
|
);
|
|
// try to get the right error code
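// illustrative example (error text shapes taken from the checks below):
// a reply like {"error":"Could not parse page"} has no url:/pageUrl:
// field, so "err" points just past "error":" and the strncmp chain below
// maps it to EDIFFBOTCOULDNOTPARSE.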
|
|
char *err = strstr(page,"\"error\":\"");
|
|
if ( err ) err += 9;
|
|
int32_t code = EDIFFBOTUNKNOWNERROR;
|
|
if ( ! err &&
|
|
page[0]=='{' &&
|
|
page[1]=='}' )
|
|
code = EDIFFBOTCURLYREPLY;
|
|
if ( err && !strncmp(err,"Unable to apply rules",21))
|
|
code = EDIFFBOTUNABLETOAPPLYRULES;
|
|
// like .pdf pages get this error
|
|
if ( err && !strncmp(err,"Could not parse page",20))
|
|
code = EDIFFBOTCOULDNOTPARSE;
|
|
// if it is 404... 502, etc. any http status code
|
|
if ( err && !strncmp(err,"Could not download page",23))
|
|
code = EDIFFBOTCOULDNOTDOWNLOAD;
|
|
// custom api does not apply to the url
|
|
if ( err && !strncmp(err,"Invalid API",11))
|
|
code = EDIFFBOTINVALIDAPI;
|
|
if ( err && !strncmp(err,"Version required",16))
|
|
code = EDIFFBOTVERSIONREQ;
|
|
if ( err && !strncmp(err,"Empty content",13))
|
|
code = EDIFFBOTEMPTYCONTENT;
|
|
if ( err && !strncmp(err,"The selected pages contains too many TextNodes",46))
|
|
code = EDIFFBOTTOOMANYTEXTNODES;
|
|
if ( err && !strncmp(err,"No content received",19))
|
|
code = EDIFFBOTEMPTYCONTENT;
|
|
if ( err && !strncmp(err,"Request timed",13))
|
|
code = EDIFFBOTREQUESTTIMEDOUT;
|
|
if ( err &&!strncmp(err,"Request of third-party c",24))
|
|
code = EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY;
|
|
// error processing url
|
|
if ( err && !strncmp(err,"Error processing",16))
|
|
code = EDIFFBOTURLPROCESSERROR;
|
|
if ( err && !strncmp(err,"Your token has exp",18))
|
|
code = EDIFFBOTTOKENEXPIRED;
|
|
if ( err && !strncmp(err,"Not authorized API tok",22))
|
|
code = EDIFFBOTTOKENUNAUTHORIZED;
|
|
if ( err && !strncmp(err,"Error.",6) )
|
|
code = EDIFFBOTPLAINERROR;
|
|
THIS->m_diffbotReplyError = code;
|
|
}
|
|
// a hack for detecting if token is expired
|
|
if ( THIS->m_diffbotReplyError == EDIFFBOTTOKENEXPIRED ) {
|
|
// note it
|
|
log("xmldoc: pausing crawl %s (%"INT32") because "
|
|
"token is expired",cr->m_coll,
|
|
(int32_t)cr->m_collnum);
|
|
// pause the crawl
|
|
SafeBuf parmList;
|
|
// spidering enabled is the "cse" cgi parm in Parms.cpp
|
|
g_parms.addNewParmToList1 ( &parmList ,
|
|
cr->m_collnum,
|
|
"0", // val
|
|
-1 ,
|
|
"cse");
|
|
// this uses msg4 so parm ordering is guaranteed
|
|
g_parms.broadcastParmList ( &parmList , NULL , NULL );
|
|
}
|
|
}
|
|
|
|
// reply is now valid but might be empty
|
|
THIS->m_diffbotReplyValid = true;
|
|
|
|
// if json reply was truncated, that is an error as well.
|
|
// likewise we have to check if such bad json is in the serps
|
|
// when doing an icc=1 and print 'bad json' in json instead.
|
|
if ( ! THIS->m_diffbotReplyError && s->m_readOffset > 1 &&
|
|
// json must end with '}' (ignores trailing whitespace)
|
|
! endsInCurly ( s->m_readBuf , s->m_readOffset ) ) {
|
|
// hopefully this can be re-tried later.
|
|
THIS->m_diffbotReplyError = EJSONMISSINGLASTCURLY;
|
|
// make a note of it
|
|
log("build: got diffbot reply missing curly for %s",
|
|
THIS->m_firstUrl.m_url);
|
|
}
|
|
|
|
//if ( ! cr ) return;
|
|
|
|
bool countIt = true;
|
|
if ( ! cr ) countIt = false;
|
|
if ( THIS->m_diffbotReplyError ) countIt = false;
|
|
|
|
/*
|
|
|
|
// solution for bug #2092 but probably not really needed so
|
|
// commented out.
|
|
|
|
// if doing /vxxx/analyze?mode=xxxx then ensure matches
|
|
bool isAnalyze = false;
|
|
if ( countIt &&
|
|
THIS->m_diffbotApiUrlValid &&
|
|
strstr ( THIS->m_diffbotApiUrl.getBufStart(), "/analyze?") )
|
|
isAnalyze = true;
|
|
|
|
char *mode = NULL;
|
|
if ( isAnalyze ) {
|
|
mode = strstr (THIS->m_diffbotApiUrl.getBufStart(), "mode=");
|
|
if ( mode ) mode += 5;
|
|
// find end of it
|
|
}
|
|
|
|
char *pageType = NULL;
|
|
int32_t pageTypeLen;
|
|
if ( mode &&
|
|
THIS->m_diffbotReplyValid &&
|
|
THIS->m_diffbotReply.length() > 5 ) {
|
|
char *reply = THIS->m_diffbotReply.getBufStart();
|
|
pageType = strstr ( reply , "\"type\":\"" );
|
|
if ( pageType ) pageType += 8;
|
|
char *e = pageType;
|
|
for ( ; *e && *e != '\"' ; e++ );
|
|
pageTypeLen = e - pageType;
|
|
}
|
|
|
|
// if it does not match, do not count it
|
|
if ( mode && pageType && strncmp ( mode , pageType , pageTypeLen ) )
|
|
countIt = false;
|
|
*/
|
|
|
|
// increment this counter on a successful reply from diffbot
|
|
if ( countIt ) { // ! THIS->m_diffbotReplyError && cr ) {
|
|
// mark this flag
|
|
THIS->m_gotDiffbotSuccessfulReply = 1;
|
|
// count it for stats
|
|
cr->m_localCrawlInfo.m_pageProcessSuccesses++;
|
|
cr->m_globalCrawlInfo.m_pageProcessSuccesses++;
|
|
// per round as well
|
|
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound++;
|
|
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound++;
|
|
// log it
|
|
log(LOG_INFO,
|
|
"build: processed page %s (pageLen=%"INT32")",
|
|
THIS->m_firstUrl.m_url,
|
|
pageLen);
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
// sanity!
|
|
// crap, this can happen if we try to get the metalist
|
|
// of an old page for purposes of incremental indexing or
|
|
// deletion. we do not re-download it, but it seems we try
|
|
// to re-process it...
|
|
//if ( cr->m_localCrawlInfo.m_pageProcessAttempts >
|
|
// cr->m_localCrawlInfo.m_pageDownloadAttempts ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// need to save collection rec now during auto save
|
|
cr->m_needsSave = true;
|
|
// the diffbot api url we used
|
|
//SafeBuf *au = THIS->getDiffbotApiUrl();
|
|
//if ( ! au || au == (void *)-1 ) {char *xx=NULL;*xx=0;}
|
|
// set the reply properly
|
|
int32_t need = pageLen + 1;// + au->length() + 1;
|
|
if ( ! THIS->m_diffbotReply.reserve ( need ) )
|
|
goto skip;
|
|
// first store the url we used on first line
|
|
//THIS->m_diffbotReply.safeMemcpy ( au->getBufStart(),
|
|
// au->length() );
|
|
//THIS->m_diffbotReply.pushChar('\n');
|
|
// convert the \u1f23 to utf8 (\n and \r as well)
|
|
// crap, this decodes \\\\\" to \\" which is causing
|
|
// the json parser to believe it is an encoded \ then
|
|
// a REAL quote... but quote is contained...
|
|
//THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen ,
|
|
// THIS->m_niceness );
|
|
|
|
// do not do that any more then, jsonparse can call it
|
|
// on a per string basis
|
|
THIS->m_diffbotReply.safeMemcpy ( page , pageLen );
|
|
|
|
// convert embedded \0 to space
|
|
//char *p = THIS->m_diffbotReply.getBufStart();
|
|
//char *pend = p + THIS->m_diffbotReply.getLength();
|
|
// tack on a \0 but don't increment m_length
|
|
THIS->m_diffbotReply.nullTerm();
|
|
|
|
// any embedded \0's in the utf8?
|
|
int32_t testLen1 = THIS->m_diffbotReply.length();
|
|
int32_t testLen2 = gbstrlen(THIS->m_diffbotReply.getBufStart());
|
|
if ( testLen1 != testLen2 ) { char *xx=NULL;*xx=0; }
|
|
// convert the \u1f23 to utf8 (\n and \r as well)
|
|
//THIS->m_diffbotReply.decodeJSONToUtf8 ( THIS->m_niceness );
|
|
//THIS->m_diffbotReply.nullTerm();
|
|
}
|
|
|
|
skip:
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
|
|
|
|
if ( m_diffbotApiUrlValid )
|
|
return &m_diffbotApiUrl;
|
|
|
|
// if we are a diffbot json object, do not re-send to diffbot!
|
|
if ( m_isDiffbotJSONObject ) {
|
|
//m_diffbotApiNum = DBA_NONE;
|
|
m_diffbotApiUrlValid = true;
|
|
return &m_diffbotApiUrl;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
|
|
m_diffbotApiUrl.safeMemcpy ( &cr->m_diffbotApiUrl );
|
|
m_diffbotApiUrl.nullTerm();
|
|
m_diffbotApiUrlValid = true;
|
|
|
|
// this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid
|
|
// in case the url filters table changes while spidering this!!!
|
|
// gotta be careful of that.
|
|
//int32_t *ufn = getUrlFilterNum();
|
|
//if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
|
|
|
|
// ensure it does set it!
|
|
//if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
//m_diffbotApiNum = cr->m_spiderDiffbotApiNum[*ufn];
|
|
|
|
// sanity check
|
|
//if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
//m_diffbotApiNumValid = true;
|
|
return &m_diffbotApiUrl;
|
|
}
|
|
|
|
// if only processing NEW URLs is enabled, then do not get diffbot reply
|
|
// if we already got one before
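// decision order used below: page parser runs never recycle; otherwise we
// recycle only when "only process if new url" is on and the old doc already
// got a successful diffbot reply; an explicit page/query reindex turns
// recycling back off, unless its "recycle content" checkbox was set, which
// forces recycling on again.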
|
|
bool *XmlDoc::getRecycleDiffbotReply ( ) {
|
|
|
|
if ( m_recycleDiffbotReplyValid )
|
|
return &m_recycleDiffbotReply;
|
|
|
|
// if from pageparser.cpp re-call diffbot for debugging
|
|
if ( getIsPageParser() ) {
|
|
m_recycleDiffbotReply = false;
|
|
m_recycleDiffbotReplyValid = true;
|
|
return &m_recycleDiffbotReply;
|
|
}
|
|
|
|
XmlDoc **odp = getOldXmlDoc( );
|
|
if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp;
|
|
XmlDoc *od = *odp;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if doc has been successfully processed in the past then
|
|
// ***RECYCLE*** the diffbot reply!
|
|
m_recycleDiffbotReply = false;
|
|
|
|
if ( cr->m_diffbotOnlyProcessIfNewUrl &&
|
|
od && od->m_gotDiffbotSuccessfulReply )
|
|
m_recycleDiffbotReply = true;
|
|
|
|
// to fight off corrupted title recs just assume that even though
|
|
// we could not uncompress the title rec that it had a successful reply
|
|
// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
|
|
// m_oldDocExistedButHadError )
|
|
// m_recycleDiffbotReply = true;
|
|
|
|
// don't recycle if specifically asked to reindex though
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex )
|
|
m_recycleDiffbotReply = false;
|
|
|
|
// unless the 'recycle content' checkbox was checked when doing
|
|
// the query (page) reindex...
|
|
if ( m_sreqValid && m_sreq.m_recycleContent )
|
|
m_recycleDiffbotReply = true;
|
|
|
|
|
|
m_recycleDiffbotReplyValid = true;
|
|
|
|
return &m_recycleDiffbotReply;
|
|
}
|
|
|
|
// get hashes of the json objects in the diffbot reply
|
|
int32_t *XmlDoc::getDiffbotTitleHashes ( int32_t *numHashes ) {
|
|
|
|
*numHashes = size_linkInfo2 / 4;
|
|
|
|
if ( ! ptr_linkInfo2 ) *numHashes = 0;
|
|
|
|
// hack: use linkdbdata2 field
|
|
if ( m_diffbotTitleHashBufValid ) {
|
|
// do not return NULL without g_errno set
|
|
if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
|
|
return (int32_t *)ptr_linkInfo2;
|
|
}
|
|
|
|
SafeBuf *tdbr = getTokenizedDiffbotReply();
|
|
if ( ! tdbr || tdbr == (void *)-1 ) return (int32_t *)tdbr;
|
|
|
|
HashTableX dedup;
|
|
if ( ! dedup.set ( 4,0,1024,NULL,0,false,m_niceness,"ddthbuf") )
|
|
return NULL;
|
|
|
|
// parse out the json items in the reply
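// each tokenized item is a \0-terminated json object, so the loop advances
// by gbstrlen(p)+1. for each item we hash the "title" value if present,
// falling back to a hash of the whole object, map a hash of 0 to 1 or 2,
// and bump the hash by one while it collides with one already in the dedup
// table, so every item gets a unique nonzero 32-bit id.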
|
|
char *p = tdbr->getBufStart();
|
|
char *pend = p + tdbr->length();
|
|
|
|
int32_t plen;
|
|
|
|
for ( ; p < pend ; p += plen + 1 ) {
|
|
// breathe some in case diffbot reply is 250MB
|
|
QUICKPOLL(m_niceness);
|
|
// set this
|
|
plen = gbstrlen(p);
|
|
// get title from it
|
|
int32_t valLen;
|
|
char *val = getJSONFieldValue ( p , "title", &valLen );
|
|
int32_t th32 = 0;
|
|
// hash the title
|
|
if ( val && valLen ) {
|
|
th32 = hash32 ( val , valLen );
|
|
// avoid 0
|
|
if ( th32 == 0 ) th32 = 1;
|
|
}
|
|
// if no title, use hash of body
|
|
if ( th32 == 0 ) {
|
|
th32 = hash32 ( p , plen );
|
|
// avoid 0
|
|
if ( th32 == 0 ) th32 = 2;
|
|
}
|
|
// if our hash is duplicated then increment until unique
|
|
while ( dedup.isInTable ( &th32 ) ) th32++;
|
|
// store it for deduping
|
|
dedup.addKey ( &th32 );
|
|
// store it
|
|
m_diffbotTitleHashBuf.pushLong(th32);
|
|
}
|
|
|
|
ptr_linkInfo2 = (LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
|
|
size_linkInfo2 = m_diffbotTitleHashBuf.length();
|
|
*numHashes = size_linkInfo2 / 4;
|
|
m_diffbotTitleHashBufValid = true;
|
|
|
|
// if no hashes return 0x01 because NULL means g_errno
|
|
if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
|
|
|
|
return (int32_t *)ptr_linkInfo2;
|
|
}
|
|
|
|
// . we now get the TOKENIZED diffbot reply.
|
|
// . that converts a single diffbot reply into multiple \0 separated
|
|
// json objects.
|
|
// . for instance, the diffbot product api returns an array like
|
|
// "products":[{...},{...}],"url":... that consists of multiple
|
|
// json product items, but the json elements that are not in
|
|
// this array are description of the page itself, like url and title.
|
|
// so we need to carry over these outer json objects to each
|
|
// inner json object we tokenize.
|
|
// . in this fashion we'll have separate objects that can each be indexed
|
|
// as a single page, which is what we want for searching.
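// illustrative example (field values made up): a reply like
//   {"type":"product","url":"http://x.com/","products":[{"name":"A"},{"name":"B"}]}
// is tokenized into two \0-separated objects, each carrying the outer
// fields plus one array element:
//   {"type":"product","url":"http://x.com/","product":{"name":"A"}}
//   {"type":"product","url":"http://x.com/","product":{"name":"B"}}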
|
|
SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
|
|
|
|
if ( m_tokenizedDiffbotReplyValid )
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( ! dbr || dbr == (void *)-1 ) return dbr;
|
|
|
|
// empty? that's easy. might be just "{}\n" i guess
|
|
if ( dbr->length() <= 3 ) return dbr;
|
|
|
|
char *text = dbr->getBufStart();
|
|
|
|
Json jp;
|
|
if ( ! jp.parseJsonStringIntoJsonItems ( text , m_niceness ) ) {
|
|
g_errno = EBADJSONPARSER;
|
|
return NULL;
|
|
}
|
|
|
|
JsonItem *jsonItem = jp.getItem("objects");
|
|
char *array = NULL;
|
|
int32_t arrayLen = 0;
|
|
if ( jsonItem ) {
|
|
array = jsonItem->getArrayStart();
|
|
arrayLen = jsonItem->getArrayLen();
|
|
}
|
|
if ( array && arrayLen > 0 ) {
|
|
m_v3buf.safeMemcpy( array , arrayLen );
|
|
m_v3buf.nullTerm();
|
|
// trim off the enclosing []'s
|
|
char *p = m_v3buf.getBufStart();
|
|
for ( ; *p && is_wspace_a(*p) ; p++ );
|
|
if ( *p == '[') *p = ' ';
|
|
char *e = m_v3buf.getBuf()-1;
|
|
for ( ; e>p && is_wspace_a(*e) ;e--);
|
|
if ( *e ==']') *e=' ';
|
|
// replace top level commas with \0's
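// e.g. (illustration) an objects array body of
//   {"type":"article",...},{"type":"image",...}
// becomes {"type":"article",...}\0{"type":"image",...} -- only commas at
// curly depth 0 and outside quoted strings are rewritten, which is why the
// scan below tracks curlies, quotes and backslash escapes.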
|
|
int32_t curlies = 0;
|
|
char *x = p;
|
|
bool inQuotes = false;
|
|
// scan now
|
|
for ( ; *x ; x++ ) {
|
|
// escaping a backslash?
|
|
if ( *x == '\\' && x[1] == '\\' ) {
|
|
// skip two bytes then..
|
|
x++;
|
|
continue;
|
|
}
|
|
// escaping a quote? ignore quote then.
|
|
if ( *x == '\\' && x[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
x++;
|
|
continue;
|
|
}
|
|
if ( *x == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
continue;
|
|
}
|
|
// if in a quote, ignore {} in there
|
|
if ( inQuotes ) continue;
|
|
if ( *x== '{' ) {
|
|
curlies++;
|
|
continue;
|
|
}
|
|
if ( *x == '}' ) {
|
|
curlies--;
|
|
continue;
|
|
}
|
|
if ( curlies != 0 ) continue;
|
|
if ( *x == ',' ) *x = '\0';
|
|
}
|
|
m_tokenizedDiffbotReplyPtr = &m_v3buf;
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
|
|
// it must have \"type\":\"product or \"type\":\"image
|
|
// in order for us to do the array separation logic below.
|
|
// we don't want to do this logic for articles because they
|
|
// contain an image array!!!
|
|
|
|
// this must be on the FIRST level of the json object, otherwise
|
|
// we get errors because we got type:article and it
|
|
// contains an images array!
|
|
|
|
int32_t valLen;
|
|
char *val = getJSONFieldValue ( text , "type", &valLen );
|
|
|
|
bool isProduct = false;
|
|
bool isImage = false;
|
|
|
|
if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 )
|
|
isProduct = true;
|
|
|
|
if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 )
|
|
isImage = true;
|
|
|
|
if ( ! isProduct && ! isImage ) {
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
|
|
char *needle;
|
|
char *newTerm;
|
|
if ( isProduct ) {
|
|
needle = ",\"products\":[";
|
|
newTerm = "product";
|
|
}
|
|
else {
|
|
needle = ",\"images\":[";
|
|
newTerm = "image";
|
|
}
|
|
|
|
char *parray = strstr ( text , needle );
|
|
|
|
// if not found, no need to do anything...
|
|
if ( ! parray ) {
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
|
|
// point to [
|
|
char *pstart = parray + gbstrlen(needle) - 1;
|
|
|
|
//
|
|
// ok, now we have to do some json ju jitsu to fix it
|
|
//
|
|
|
|
// point to array. starting at the '['
|
|
char *p = pstart;
|
|
int32_t brackets = 0;
|
|
bool inQuotes = false;
|
|
for ( ; *p ; p++ ) {
|
|
// escaping a quote? ignore quote then.
|
|
if ( *p == '\\' && p[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
p++;
|
|
continue;
|
|
}
|
|
if ( *p == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
continue;
|
|
}
|
|
// if in a quote, ignore {} in there
|
|
if ( inQuotes ) continue;
|
|
if ( *p == '[' ) brackets++;
|
|
if ( *p != ']' ) continue;
|
|
brackets--;
|
|
// stop if array is done. p points to ']'
|
|
if ( brackets == 0 ) break;
|
|
}
|
|
|
|
// now point to outer items to the left of the ",\"products\":[...
|
|
char *left1 = dbr->getBufStart();
|
|
char *left2 = parray;
|
|
// then to the right. skip over the ending ']'
|
|
char *right1 = p + 1;
|
|
char *right2 = dbr->getBuf(); // end of the buffer
|
|
|
|
|
|
SafeBuf *tbuf = &m_tokenizedDiffbotReply;
|
|
|
|
// now scan the json products or images in the array
|
|
char *x = pstart;
|
|
// skip over [
|
|
x++;
|
|
// each product item in array is enclosed in {}'s
|
|
if ( *x != '{' ) {
|
|
log("build: something is wrong with diffbot reply");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
// reset CURLY bracket count
|
|
int32_t curlies = 0;
|
|
char *xstart = NULL;
|
|
inQuotes = false;
|
|
// scan now
|
|
for ( ; x < right1 ; x++ ) {
|
|
// escaping a quote? ignore quote then.
|
|
if ( *x == '\\' && x[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
x++;
|
|
continue;
|
|
}
|
|
if ( *x == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
continue;
|
|
}
|
|
// if in a quote, ignore {} in there
|
|
if ( inQuotes ) continue;
|
|
if ( *x== '{' ) {
|
|
if ( curlies == 0 ) xstart = x;
|
|
curlies++;
|
|
continue;
|
|
}
|
|
if ( *x == '}' ) {
|
|
curlies--;
|
|
if ( curlies != 0 ) continue;
|
|
// unreciprocated '{'? wtf???
|
|
if ( ! xstart ) continue;
|
|
// skip empty curlies
|
|
if ( x[-1] == '{' ) continue;
|
|
//
|
|
// ok, we got an item!
|
|
//
|
|
|
|
// left top items
|
|
if ( ! tbuf->safeMemcpy ( left1 , left2-left1 ) )
|
|
return NULL;
|
|
// use "product":
|
|
|
|
if ( ! tbuf->safePrintf(",\"%s\":" , newTerm ) )
|
|
return NULL;
|
|
// the item itself, including its curlies.
|
|
if ( ! tbuf->safeMemcpy ( xstart , x - xstart+1 ) )
|
|
return NULL;
|
|
// right top items
|
|
if ( ! tbuf->safeMemcpy ( right1 , right2-right1 ) )
|
|
return NULL;
|
|
// then a \0
|
|
if ( ! tbuf->pushChar('\0') )
|
|
return NULL;
|
|
// reset this!
|
|
xstart = NULL;
|
|
}
|
|
}
|
|
|
|
// now show the items. debug!
|
|
//p = tbuf->getBufStart();
|
|
//for ( ; p < tbuf->getBuf() ; p += gbstrlen(p) + 1 )
|
|
// fprintf(stderr,"ITEM\n%s\n\n",p);
|
|
|
|
|
|
m_tokenizedDiffbotReplyPtr = tbuf;
|
|
m_tokenizedDiffbotReplyValid = true;
|
|
return m_tokenizedDiffbotReplyPtr;
|
|
}
|
|
|
|
void gotDiffbotProxyReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->m_diffbotProxyReply = NULL;
|
|
// if a valid reply, then point to it
|
|
if ( slot->m_readBufSize == sizeof(ProxyReply) ) {
|
|
THIS->m_diffbotProxyReply = (ProxyReply *)slot->m_readBuf;
|
|
// steal it, we will free it in XmlDoc::reset()
|
|
slot->m_readBuf = NULL;
|
|
}
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . convert document into json representing multiple documents
|
|
// if it makes sense. sometimes a single url contains multiple
|
|
// subdocuments that each should have their own url, but do not,
|
|
// so we fix that here.
|
|
// . the diffbot reply will be a list of json objects we want to index
|
|
SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
|
|
|
// got reply of malformed json missing final '}'
|
|
if ( m_diffbotReplyValid &&
|
|
m_diffbotReplyError == EJSONMISSINGLASTCURLY ) {
|
|
// hopefully spider will retry later
|
|
g_errno = m_diffbotReplyError;
|
|
return NULL;
|
|
}
|
|
|
|
if ( m_diffbotReplyValid )
|
|
return &m_diffbotReply;
|
|
|
|
// . check the url filters table to see if diffbot api is specified
|
|
// . just return "\0" if none, but NULL means error i guess
|
|
SafeBuf *au = getDiffbotApiUrl();
|
|
if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
|
|
|
|
// if no url, assume do not access diffbot
|
|
if ( au->length() <= 0 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// if we are json do not send that to diffbot, like an injected
|
|
// json diffbot object. should fix json injections into global index
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
|
|
if ( *ct == CT_JSON ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// we make a "fake" url for the diffbot reply when indexing it
|
|
// by appending -diffbotxyz%"UINT32". see "fakeUrl" below.
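// e.g. (hypothetical url) http://example.com/page would be indexed as
// http://example.com/page-diffbotxyz1234567890 ; the suffix plus a 32-bit
// number needs up to ~22 bytes, hence the 24-byte headroom check here.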
|
|
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
|
|
if ( m_firstUrlValid )
|
|
log("build: diffbot url would be too long for "
|
|
"%s", m_firstUrl.getUrl() );
|
|
else
|
|
log("build: diffbot url would be too long for "
|
|
"%"INT64"", m_docId );
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// getIndexCode() calls getDiffbotReply(), so avoid a loop!
|
|
//if ( *getIndexCode() )
|
|
// return &m_diffbotReply;
|
|
if ( m_indexCodeValid && m_indexCode )
|
|
return &m_diffbotReply;
|
|
|
|
|
|
if ( m_isDiffbotJSONObject ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// if this is a robots.txt or a root page we are downloading
|
|
// separately to get the title for to compare to this page's title,
|
|
// or whatever, do not pass to diffbot
|
|
if ( m_isChildDoc ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// get list of substring patterns
|
|
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
|
|
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
|
|
if ( upp && ! upp[0] ) upp = NULL;
|
|
if ( ucp && ! ucp[0] ) ucp = NULL;
|
|
// do we match the url process pattern or regex?
|
|
// get the compiled regular expressions
|
|
//regex_t *ucr = &cr->m_ucr;
|
|
regex_t *upr = &cr->m_upr;
|
|
//if ( ! cr->m_hasucr ) ucr = NULL;
|
|
if ( ! cr->m_hasupr ) upr = NULL;
|
|
// get the url
|
|
Url *f = getFirstUrl();
|
|
char *url = f->getUrl();
|
|
// . "upp" is a ||-separated list of substrings
|
|
// . "upr" is a regex
|
|
// . regexec returns 0 for a match
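// e.g. (illustration) upp = "/product/||/item/" lets
// http://shop.example.com/product/123 through via substring match, while a
// compiled regex in upr takes precedence and must match (regexec() == 0)
// for the url to be sent to diffbot; otherwise we return the empty reply.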
|
|
if ( upr && regexec(upr,url,0,NULL,0) ) {
|
|
// return empty reply
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
if ( upp && !upr &&!doesStringContainPattern(url,upp)) {
|
|
// return empty reply
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
|
|
|
|
// if already processed and onlyprocessifnewurl is enabled then
|
|
// we recycle and do not bother with this, we also do not nuke
|
|
// the diffbot json objects we have already indexed by calling
|
|
// nukeJSONObjects()
|
|
bool *recycle = getRecycleDiffbotReply();
|
|
if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle;
|
|
if ( *recycle ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// if set from title rec, do not do it. we are possibly an "old doc"
|
|
// and we should only call diffbot.com with new docs
|
|
if ( m_setFromTitleRec ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// "none" means none too! Parms.cpp doesn't like &dapi1=& because
|
|
// it does not call setParm() on such things even though it probably
|
|
// should; it doesn't like empty values, so i put "none" in there.
|
|
if ( strncasecmp(au->getBufStart(),"none",4) == 0 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
if ( strncasecmp(au->getBufStart(),"donotprocess",12) == 0 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// invalid url?
|
|
Url apiUrl; apiUrl.set ( au->getBufStart() );
|
|
if (apiUrl.getUrlLen() <= 0 ||
|
|
apiUrl.getHostLen() <= 0 ||
|
|
apiUrl.getDomainLen() <= 0 ) {
|
|
log("build: invalid diffbot api url of \"%s\".",
|
|
au->getBufStart() );
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// when respidering an "old" doc, never call this. we already
|
|
// have the diffbot replies xyz.com/-diffbot-0 and xyz.com/-diffbot-1
|
|
// etc.
|
|
//if ( m_setFromTitleRec ) { char *xx = NULL; *xx = 0; }
|
|
|
|
// sanity check. no! barfs on legit url with -diffbot- in it
|
|
//if ( strstr(m_firstUrl.m_url,"-diffbot-") ) {
|
|
// char *xx=NULL; *xx = 0; }
|
|
|
|
// we should not "process" (i.e. send to diffbot) urls that do
|
|
// not match the supplied CollectionRec::m_diffbotUrlProcessPattern
|
|
// let's just put a checkbox in the url filters box for this!
|
|
// i.e. Send to Diffbot? [X]
|
|
//if ( m_useDiffbot && ! doesUrlMatchDiffbotProcessPattern() ) {
|
|
// m_diffbotReplyValid = true;
|
|
// return &m_diffbotReply;
|
|
//}
|
|
|
|
// empty content, do not send to diffbot then
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
|
|
if ( ! *u8 ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// do not send to diffbot if its binary!
|
|
char *ib = getIsBinary();
|
|
if ( ! ib || ib == (void *)-1 ) return (SafeBuf *)ib;
|
|
if ( *ib ) {
|
|
m_diffbotReplyValid = true;
|
|
log("diffbot: skipping binary page %s",m_firstUrl.m_url);
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
|
|
// or if original page content matches the page regex dont hit diffbot
|
|
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
|
|
m_diffbotReplyValid = true;
|
|
return &m_diffbotReply;
|
|
}
|
|
|
|
// now include referring link anchor text, etc.
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
|
|
|
|
|
|
setStatus("getting diffbot reply");
|
|
|
|
|
|
// set up dedup table for deduping on link text
|
|
HashTableX dedup;
|
|
char tmp[512];
|
|
if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") )
|
|
return NULL;
|
|
|
|
SafeBuf headers;
|
|
bool first = true;
|
|
|
|
// . make additional headers
|
|
// . add two headers for every "good" (non-dup) link
|
|
// . do NOT end headers in \r\n since HttpServer adds that!
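// e.g. (illustrative values) one good inlink contributes something like
//   X-referring-url: http://other.example.com/reviews\r\n
//   X-anchor-text: widget reviews\r\n
//   X-surrounding-text: see our full list of widget reviews
// with any \r or \n stripped out of the copied text so the mime stays
// well-formed, and no trailing \r\n on the whole buffer.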
|
|
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// sanity
|
|
if ( k->size_urlBuf <= 1 ) continue;
|
|
// skip if too long
|
|
if ( k->size_linkText > 1024 ) continue;
|
|
// or not enough! (size includes \0)
|
|
if ( k->size_linkText <= 1 ) continue;
|
|
// sanity check
|
|
char *txt = k->getLinkText();
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// this seems to happen sometimes..
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) continue;
|
|
// if anchor text has \0 skip it
|
|
if ( gbstrlen(txt) != tlen ) continue;
|
|
// or if surrounding text has \0 skip as well
|
|
char *surStr = k->getSurroundingText();
|
|
int32_t surLen = k->size_surroundingText;
|
|
if ( surLen > 0 ) surLen--;
|
|
if ( surStr && gbstrlen(surStr) != surLen ) continue;
|
|
// dedup on that
|
|
int32_t h32 = hash32 ( txt , tlen );
|
|
if ( dedup.isInTable ( &h32 ) ) continue;
|
|
if ( ! dedup.addKey ( &h32 ) ) return NULL;
|
|
// separate with \r\n
|
|
if ( ! first && ! headers.safePrintf("\r\n" ) )
|
|
return NULL;
|
|
first = false;
|
|
// add to http header
|
|
if ( ! headers.safePrintf("X-referring-url: ") )
|
|
return NULL;
|
|
// do not include the terminating \0, so -1
|
|
if ( ! headers.safeMemcpy(k->getUrl() , k->size_urlBuf-1 ))
|
|
return NULL;
|
|
// and link text
|
|
if ( ! headers.safePrintf("\r\nX-anchor-text: ") )
|
|
return NULL;
|
|
// store the anchor text without any \r or \n chars
|
|
if ( ! headers.reserve ( tlen ) ) return NULL;
|
|
char *p = txt;
|
|
char *pend = txt + tlen;
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( *p == '\r' ) continue;
|
|
if ( *p == '\n' ) continue;
|
|
headers.pushChar(*p);
|
|
}
|
|
// do not include it if more than 2000 chars big
|
|
if ( surLen > 0 && surLen < 2000 ) {
|
|
if ( ! headers.safePrintf("\r\nX-surrounding-text: ") )
|
|
return NULL;
|
|
// make room for copying the surrounding text
|
|
if ( ! headers.reserve ( surLen ) ) return NULL;
|
|
// copy minus any \r or \n so its mime header safe
|
|
p = surStr;
|
|
pend = surStr + surLen;
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( *p == '\r' ) continue;
|
|
if ( *p == '\n' ) continue;
|
|
headers.pushChar(*p);
|
|
}
|
|
}
|
|
}
|
|
|
|
// make sure to null term the headers
|
|
if ( headers.length() && ! headers.nullTerm() ) return NULL;
|
|
|
|
//char *path = "api";
|
|
//if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 )
|
|
// path = "v2";
|
|
|
|
//
|
|
// DIFFBOT injection interface TODO
|
|
//
|
|
// if we are intercepting a direct injection diffbot request
|
|
// then we will probably take the exact same parms provided and
|
|
// just relay them to diffbot here. maybe Diffbot.cpp can set
|
|
// the original diffbot.com request url in this xmldoc class that
|
|
// is being inject using the url encoded in that request.
|
|
//
|
|
|
|
// url can be on the stack since httpserver.cpp makes an http mime
|
|
// from this url
|
|
//SafeBuf diffbotUrl;
|
|
|
|
// TODO: make sure "api" works as hostname for not just product...
|
|
//diffbotUrl.safePrintf("http://www.diffbot.com/");
|
|
// skip extra '/'?
|
|
//char *api = au->getBufStart();
|
|
//int32_t apiLen = au->length();
|
|
//if ( api && api[0] == '/' ) { api++; apiLen--; }
|
|
// append the custom url. i.e. /api/analyze?mode=auto&u=
|
|
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
|
|
|
|
// reset it in case we are a re-call from gotDiffbotReplyWrapper()
|
|
// if g_errno == ECONNRESET
|
|
m_diffbotUrl.reset();
|
|
// store the api url into here
|
|
m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() );
|
|
|
|
// . m_diffbotApi Is like "article" or "product" etc.
|
|
// . if classify is true we always return the classification
|
|
// of the page in the JSON. like "type":"frontpage" regardless
|
|
// of the "api" specified.
|
|
// . otherwise, if classify is false empty json will be returned
|
|
// if there is no json objects of the specified page type, "api"
|
|
// . BUT if api is "all" return all types of json objects
|
|
// . SHOULD we return "type" in the json output?
|
|
/*
|
|
if ( *an == DBA_ALL )
|
|
diffbotUrl.safePrintf("analyze?mode=auto&" );
|
|
else if ( *an == DBA_ARTICLE_FORCE )
|
|
diffbotUrl.safePrintf("article?");
|
|
else if ( *an == DBA_ARTICLE_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=article&");
|
|
else if ( *an == DBA_PRODUCT_FORCE )
|
|
diffbotUrl.safePrintf("product?");
|
|
else if ( *an == DBA_PRODUCT_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=product&");
|
|
else if ( *an == DBA_IMAGE_FORCE )
|
|
diffbotUrl.safePrintf("image?");
|
|
else if ( *an == DBA_IMAGE_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=image&");
|
|
else if ( *an == DBA_FRONTPAGE_FORCE )
|
|
diffbotUrl.safePrintf("frontpage?");
|
|
else if ( *an == DBA_FRONTPAGE_AUTO )
|
|
diffbotUrl.safePrintf("analyze?mode=frontpage&");
|
|
else {
|
|
log("build: unknown diffbot api num = %"INT32". assuming all",*an );
|
|
diffbotUrl.safePrintf("analyze?mode=auto&" );
|
|
}
|
|
*/
|
|
|
|
//CollectionRec *cr = getCollRec();
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// add a '?' if none
|
|
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
|
|
m_diffbotUrl.pushChar('?');
|
|
else
|
|
m_diffbotUrl.pushChar('&');
|
|
|
|
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
|
|
// only print token if we have one, because if user provides their
|
|
// own diffbot url (apiUrl in Parms.cpp) then they might include
|
|
// the token in that for their non-custom crawl. m_customCrawl=0.
|
|
if ( cr->m_diffbotToken.length())
|
|
m_diffbotUrl.safePrintf("token=%s",
|
|
cr->m_diffbotToken.getBufStart());
|
|
|
|
bool useProxies = true;
|
|
// user can turn off proxy use with this switch
|
|
if ( ! g_conf.m_useProxyIps ) useProxies = false;
|
|
// did collection override?
|
|
if ( cr->m_forceUseFloaters ) useProxies = true;
|
|
// we gotta have some proxy ips that we can use
|
|
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
|
|
|
|
// until we fix https CONNECT support for https urls diffbot can't
|
|
// go through gb. we should fix that by downloading the whole page
|
|
// ourselves and sending it back, and tell diffbot's phantomjs not
|
|
// to do the certificate check.
|
|
//
|
|
// for now, allow http and NOT https urls through though.
|
|
// TODO: if the url redirects to an https url will this mess us up?
|
|
// if ( ! m_firstUrlValid )
|
|
// useProxies = false;
|
|
// if ( m_firstUrlValid && m_firstUrl.isHttps() )
|
|
// useProxies = false;
|
|
|
|
// turn off for now always
|
|
//useProxies = false;
|
|
|
|
if ( useProxies && ! m_diffbotProxyReplyValid && m_ipValid ) {
|
|
// a special opcode used in SpiderProxy.cpp
|
|
Msg13Request *r = &m_diffbotProxyRequest;
|
|
r->m_opCode = OP_GETPROXYFORDIFFBOT;
|
|
r->m_banProxyIp = 0;
|
|
r->m_urlIp = m_ip;
|
|
m_diffbotProxyReplyValid = true;
|
|
// get first alive host, usually host #0 but if he is dead then
|
|
// host #1 must take over! if all are dead, it returns host #0.
|
|
// so we are guaranteed "h" will be non-null
|
|
Host *h = g_hostdb.getFirstAliveHost();
|
|
// now ask that host for the best spider proxy to send to
|
|
if ( ! g_udpServer.sendRequest ( (char *)r,
|
|
// just the top part of the
|
|
// Msg13Request is sent to
|
|
// handleRequest54() now
|
|
r->getProxyRequestSize() ,
|
|
0x54 , // msgType 0x54
|
|
h->m_ip ,
|
|
h->m_port ,
|
|
-1 , // h->m_hostId ,
|
|
NULL ,
|
|
this , // state data
|
|
gotDiffbotProxyReplyWrapper,
|
|
9999999 )){ // effectively infinite timeout
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// report it
|
|
log("spider: msg54 request3: %s %s",
|
|
mstrerror(g_errno),r->ptr_url);
|
|
return NULL;
|
|
}
|
|
// wait for reply
|
|
return (SafeBuf *)-1;
|
|
}
|
|
|
|
|
|
// if we used a proxy to download the doc, then diffbot should too
|
|
// BUT tell diffbot to go through host #0 so we can send it to the
|
|
// correct proxy using our load balancing & backoff algos.
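// e.g. (illustrative values) this appends something like
//   &proxy=10.5.66.21:3128&proxyAuth=user%3Apass
// where ip, port and credentials come from the ProxyReply that
// handleRequest54() selected for this url's ip.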
|
|
if ( useProxies ) {
|
|
//Host *h0 = g_hostdb.getHost(0);
|
|
// use a random host now to avoid host #0 running
|
|
// out of sockets from diffbot trying to connect
|
|
// for downloading hundreds of urls from the same
|
|
// high crawl delay site.
|
|
// round robin over the hosts just to be more evenly
|
|
// distributed. it will likely get several http requests
|
|
// from diffbot.
|
|
// static int32_t s_lastHostId = -1;
|
|
// if ( s_lastHostId == -1 )
|
|
// s_lastHostId = g_hostdb.m_myHost->m_hostId;
|
|
// int32_t r = s_lastHostId;//rand() % g_hostdb.m_numHosts;
|
|
// if ( ++s_lastHostId >= g_hostdb.m_numHosts )
|
|
// s_lastHostId = 0;
|
|
// Host *h0 = g_hostdb.getHost(r);
|
|
// m_diffbotUrl.safePrintf("&proxy=%s:%"INT32"",
|
|
// iptoa(h0->m_ip),
|
|
// (int32_t)h0->m_httpPort);
|
|
ProxyReply *prep = m_diffbotProxyReply;
|
|
m_diffbotUrl.safePrintf("&proxy=%s:%"UINT32"",
|
|
iptoa(prep->m_proxyIp),
|
|
(uint32_t)prep->m_proxyPort);
|
|
m_diffbotUrl.safePrintf("&proxyAuth=");
|
|
m_diffbotUrl.urlEncode(prep->m_usernamePwd);
|
|
}
|
|
// char *p = g_conf.m_proxyAuth.getBufStart();
|
|
// if ( useProxies && p ) {
|
|
// char *p1 = p;
|
|
// for ( ; *p1 && is_wspace_a(*p1) ; p1++ );
|
|
// char *p2 = p1;
|
|
// for ( ; *p2 && ! is_wspace_a(*p2) ; p2++ );
|
|
// char c = *p2;
|
|
// *p2 = '\0';
|
|
// m_diffbotUrl.safePrintf("&proxyAuth=");
|
|
// m_diffbotUrl.urlEncode(p1);
|
|
// *p2 = c;
|
|
// }
|
|
|
|
// now so it works just give it a proxy directly, so it doesn't
|
|
// have to go through gb.
|
|
// if ( useProxies ) {
|
|
// // msg13 typically uses this to get an unbanned proxy
|
|
// getProxiesToUse();
|
|
// }
|
|
|
|
// if we use proxies then increase the timeout since proxies
|
|
// increase the crawl delay in hopes of backing off to discover
|
|
// the website's policy so we don't hit it too hard and get banned.
|
|
// so to avoid diffbot timing out tell it to wait up to a minute
|
|
// because the crawl delay can be as high as that, even higher
|
|
if ( useProxies )
|
|
m_diffbotUrl.safePrintf("&timeout=%"INT32"",
|
|
(int32_t)MAX_PROXYCRAWLDELAYMS+10000);
|
|
|
|
m_diffbotUrl.safePrintf("&url=");
|
|
// give diffbot the url to process
|
|
m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
|
|
// append this just in case the next thing doesn't have it.
|
|
//if ( cr->m_diffbotApiQueryString.length() &&
|
|
// cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
|
|
// diffbotUrl.pushChar('&');
|
|
// then user provided parms that are dependent on if it is an
|
|
// article, product, etc. like "&dontstripads=1" or whatever
|
|
//diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart());
|
|
|
|
// for analyze requests without mode=, make sure that diffbot expands all objects
|
|
// "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze
|
|
// null term it so that we can use strstr (shouldn't be necessary since safePrintf appears to do this already and is called above)
|
|
if (m_diffbotUrl.nullTerm()) {
|
|
char *u = m_diffbotUrl.getBufStart();
|
|
if (strstr(u, "/analyze") && !strstr(u, "mode=")) {
|
|
m_diffbotUrl.safePrintf("&expand");
|
|
}
|
|
}
|
|
|
|
// null term it
|
|
m_diffbotUrl.nullTerm();
|
|
|
|
// mark as tried
|
|
if ( m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
m_sentToDiffbotThisTime = true;
|
|
|
|
// might have been a recall if gotDiffbotReplyWrapper() sensed
|
|
// g_errno == ECONNRESET and it will retry
|
|
if ( ! m_sentToDiffbot ) {
|
|
|
|
m_sentToDiffbot = 1;
|
|
|
|
// count it for stats
|
|
cr->m_localCrawlInfo.m_pageProcessAttempts++;
|
|
cr->m_globalCrawlInfo.m_pageProcessAttempts++;
|
|
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
|
|
cr->m_needsSave = true;
|
|
}
|
|
|
|
char *additionalHeaders = NULL;
|
|
if ( headers.length() > 0 )
|
|
additionalHeaders = headers.getBufStart();
|
|
|
|
// if we did not get the web page first and we are crawling, not
|
|
// doing a bulk, then core. we need the webpage to harvest links
|
|
// and sometimes to check the pageprocesspattern to see if we should
|
|
// process.
|
|
if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
log(LOG_INFO,
|
|
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
|
|
additionalHeaders);
|
|
|
|
m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
|
|
0 , // ip
|
|
0 , // offset
|
|
-1 , // size
|
|
0 , // ifmodifiedsince
|
|
this , // state
|
|
gotDiffbotReplyWrapper ,
|
|
// MDW: boost timeout from 180 to 18000
|
|
// seconds so we can figure out why
|
|
// diffbot times out, etc. what is
|
|
// going on.
|
|
// this is slowing things too much
|
|
// so make it 240 seconds
|
|
240*1000, // 240 sec timeout
|
|
0,//proxyip
|
|
0,//proxyport
|
|
// unlimited replies i guess
|
|
-1,//maxtextdoclen unlimited
|
|
-1,//maxotherdoclen unlimited
|
|
g_conf.m_spiderUserAgent ,
|
|
"HTTP/1.0",
|
|
false, // do post?
|
|
NULL, // cookie
|
|
additionalHeaders ) )
|
|
// return -1 if blocked
|
|
return (SafeBuf *)-1;
|
|
// error?
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// wha?
|
|
log("diffbot: http error %s",mstrerror(g_errno));
|
|
// had an error!
|
|
return NULL;
|
|
}
|
|
|
|
char **XmlDoc::getHttpReply ( ) {
|
|
// both must be valid now
|
|
if ( m_redirUrlValid && m_httpReplyValid ) {
|
|
// might have been a download error of ECORRUPTDATA
|
|
if ( m_downloadStatus == ECORRUPTDATA ) {
|
|
// set g_errno so caller knows
|
|
g_errno = m_downloadStatus;
|
|
// null means error
|
|
return NULL;
|
|
}
|
|
// otherwise, assume reply is valid
|
|
return &m_httpReply;
|
|
}
|
|
|
|
setStatus("getting http reply");
|
|
|
|
// come back up here if a redirect invalidates it
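// each pass downloads the current url. if getRedirUrl() finds a redirect
// we free the reply, invalidate everything derived from it (xml, words,
// utf8 content, mime, ...), point m_currentUrl at the redirect target and
// loop. robots.txt is only re-checked when more than the scheme changed,
// and the ip lookup is only redone when the hostname changed.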
|
|
loop:
|
|
// sanity test -- only if not the test collection (NO, might be EBADIP)
|
|
//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
|
|
// get the http reply
|
|
char **replyPtr = getHttpReply2();
|
|
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
|
|
// . now if the reply was a redirect we should set m_redirUrl to it
|
|
// and re-do all this code
|
|
// . this often sets m_indexCode to stuff like ESIMPLIFIEDREDIR, etc.
|
|
Url **redirp = getRedirUrl();
|
|
// we often lookup the assocaited linkInfo on the original url to
|
|
// see if it is worth keeping and indexing just to take advantage of
|
|
// the incoming link text it has, so we may block on that!
|
|
// but in the case of a contactDoc, getContactDoc() sets these things
|
|
// to NULL to avoid unnecessary lookups.
|
|
if ( ! redirp || redirp == (void *)-1 ) return (char **)redirp;
|
|
// sanity check
|
|
if ( *redirp && ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
|
|
// if NULL, we are done
|
|
if ( ! *redirp ) return &m_httpReply;
|
|
// . also, hang it up if we got a simplified redir url now
|
|
// . we set m_redirUrl so that getLinks() can add a spiderRequest
|
|
// for it, but we do not want to actually redirect to it to get
|
|
// the content for THIS document
|
|
if ( m_redirError ) return &m_httpReply;
|
|
// and invalidate the redir url because we do not know if the
|
|
// current url will redirect or not (mdwmdw)
|
|
m_redirUrlValid = false;
|
|
m_metaRedirUrlValid = false;
|
|
// free it
|
|
mfree ( m_httpReply , m_httpReplyAllocSize, "freehr" );
|
|
// always nullify if we free so we do not re-use freed mem
|
|
m_httpReply = NULL;
|
|
// otherwise, we had a redirect, so invalidate what we had set
|
|
m_httpReplyValid = false;
|
|
// do not invalidate this any more, now it is when we STARTED spidering
|
|
// the document
|
|
//m_spideredTimeValid = false;
|
|
m_isContentTruncatedValid = false;
|
|
// do not redo robots.txt lookup if the redir url just changed from
|
|
// http to https or vice versa
|
|
Url *ru = *redirp;
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1) return (char **)cu;
|
|
if ( strcmp ( ru->getUrl() + ru->getSchemeLen() ,
|
|
cu->getUrl() + cu->getSchemeLen() ) ) {
|
|
// redo robots.txt lookup. might be cached.
|
|
m_isAllowedValid = false;
|
|
m_crawlDelayValid = false;
|
|
}
|
|
// keep the same ip if hostname is unchanged
|
|
if ( ru->getHostLen() != cu->getHostLen() ||
|
|
strncmp ( ru->getHost() , cu->getHost(), cu->getHostLen() ) )
|
|
// ip is supposed to be that of the current url, which changed
|
|
m_ipValid = false;
|
|
// we set our m_xml to the http reply to check for meta redirects
|
|
// in the html sometimes in getRedirUrl() so since we are redirecting,
|
|
// invalidate that xml
|
|
m_xmlValid = false;
|
|
m_wordsValid = false;
|
|
m_rawUtf8ContentValid = false;
|
|
m_expandedUtf8ContentValid= false;
|
|
m_utf8ContentValid = false;
|
|
m_filteredContentValid = false;
|
|
m_contentValid = false;
|
|
m_mimeValid = false;
|
|
// update our current url now to be the redirected url
|
|
m_currentUrl.set ( *redirp , false );
|
|
m_currentUrlValid = true;
|
|
// loop it
|
|
goto loop;
|
|
}
|
|
|
|
void gotHttpReplyWrapper ( void *state ) {
|
|
// point to us
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// this sets g_errno on error
|
|
THIS->gotHttpReply ( );
|
|
// resume. this checks g_errno for being set.
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// "NULL" can be a valid http reply (empty page) so we need to use "char **"
|
|
char **XmlDoc::getHttpReply2 ( ) {
|
|
if ( m_httpReplyValid ) return &m_httpReply;
|
|
|
|
setStatus("getting http reply2");
|
|
|
|
|
|
// if recycle is set then NEVER download if doing query reindex
|
|
// but if doing an injection then i guess we can download.
|
|
// do not even do ip lookup if no old titlerec, which is how we
|
|
// ended up here...
|
|
if ( m_recycleContent && m_sreqValid && m_sreq.m_isPageReindex ) {
|
|
g_errno = ENOTITLEREC;
|
|
return NULL;
|
|
}
|
|
|
|
// doing a query reindex on diffbot objects does not have a
|
|
// valid spider request, only sets m_recycleContent to true
|
|
// in reindexJSONObjects()/redoJSONObjects()
|
|
if ( m_recycleContent && m_isDiffbotJSONObject ) {
|
|
g_errno = ENOTITLEREC;
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// get ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (char **)ip;
|
|
|
|
// reset
|
|
m_httpReplySize = 0;
|
|
m_httpReply = NULL;
|
|
|
|
// if ip is bogus, we are done
|
|
if ( *ip == 0 || *ip == -1 ) {
|
|
log("xmldoc: ip is bogus 0 or -1 for %s. skipping download",
|
|
m_firstUrl.getUrl());
|
|
m_httpReplyValid = true;
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// need this now too, but don't clobber a nonzero value if we already have one
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
return &m_httpReply;
|
|
//return gotHttpReply ( );
|
|
}
|
|
|
|
// get this. should operate on current url (i.e. redir url if there)
|
|
bool *isAllowed = getIsAllowed();
|
|
// error or blocked
|
|
if ( ! isAllowed || isAllowed == (void *)-1) return (char **)isAllowed;
|
|
// this must be valid, since we share m_msg13 with it
|
|
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *cd = getFinalCrawlDelay();
|
|
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
|
|
|
|
// we might bail
|
|
if ( ! *isAllowed ) {
|
|
m_httpReplyValid = true;
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// need this now too, but don't clobber a nonzero value if we already have one
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
m_downloadStatusValid = true;
|
|
// forbidden? assume we downloaded it and it was empty
|
|
m_downloadStatus = 0; // EDOCDISALLOWED;//403;
|
|
return &m_httpReply;
|
|
//return gotHttpReply ( );
|
|
}
|
|
|
|
// are we site root page?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
|
|
//int8_t *hc = getHopCount();
|
|
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
|
|
|
|
XmlDoc *od = NULL;
|
|
if ( ! m_isSpiderProxy &&
|
|
// don't lookup xyz.com/robots.txt in titledb
|
|
! isFirstUrlRobotsTxt() ) {
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
|
|
// get ptr to old xml doc, could be NULL if non exists
|
|
od = *pod;
|
|
}
|
|
|
|
// sanity check
|
|
if ( od && m_recycleContent ) {char *xx=NULL;*xx=0; }
|
|
|
|
// validate m_firstIpValid
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// robots.txt and css files etc should have m_isChildDoc as true
|
|
//if ( ! m_downloadAttempted && ! m_isChildDoc )
|
|
// // keep track of spider stats
|
|
// cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
|
|
|
// we made an attempt to download, so mark it
|
|
//m_downloadAttempted = true;
|
|
|
|
// if we didn't block getting the lock, keep going
|
|
setStatus ( "getting web page" );
|
|
|
|
|
|
// sanity check
|
|
if ( ! m_masterLoop ) { char *xx=NULL;*xx=0; }
|
|
|
|
// shortcut. this will return the redirUrl if it is non-empty.
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
|
|
|
|
/*
|
|
// if on google, make it empty so we do not hit them
|
|
if ( strstr(cu->getUrl(),".google.com/") ) {
|
|
log("spider: encountered google.com url. emptying.");
|
|
m_httpReplyValid = true;
|
|
m_isContentTruncated = false;
|
|
m_isContentTruncatedValid = true;
|
|
// need this now too. but don't hurt a nonzero val if we have
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
return &m_httpReply;
|
|
}
|
|
*/
|
|
|
|
// no ip found means empty page i guess
|
|
//if ( *ip == 0 || *ip == -1 )
|
|
// return gotHttpReply ( );
|
|
|
|
bool useTestCache = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
|
// unless its the pagesubmit.cpp event submission tool
|
|
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
|
|
|
// sanity check
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set parms
|
|
Msg13Request *r = &m_msg13Request;
|
|
// clear it first
|
|
r->reset();
|
|
// and set the url
|
|
//strcpy ( r->m_url , cu->getUrl() );
|
|
r->ptr_url = cu->getUrl();
|
|
r->size_url = cu->getUrlLen()+1;
|
|
|
|
// caution: m_sreq.m_hopCountValid is false sometimes for page parser
|
|
// this is used for Msg13.cpp's ipWasBanned()
|
|
// we use hopcount now instead of isInSeedBuf(cr,r->ptr_url)
|
|
bool isInjecting = getIsInjecting();
|
|
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
|
|
r->m_isRootSeedUrl = 1;
|
|
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
|
|
r->m_isRootSeedUrl = 1;
|
|
|
|
// sanity check
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// r->m_maxTextDocLen = maxDownload;
|
|
// r->m_maxOtherDocLen = maxDownload;
|
|
r->m_maxTextDocLen = cr->m_maxTextDocLen;
|
|
r->m_maxOtherDocLen = cr->m_maxOtherDocLen;
|
|
|
|
// max to download in bytes. currently 1MB.
|
|
//int32_t maxDownload = (int32_t)MAXDOCLEN;
|
|
// but if url is http://127.0.0.1.... or local then
|
|
if ( m_ipValid ) {
|
|
// make into a string
|
|
char *ipStr = iptoa(m_ip);
|
|
// is it local?
|
|
bool isLocal = false;
|
|
if ( strncmp(ipStr,"192.168.",8) == 0) isLocal = true;
|
|
if ( strncmp(ipStr,"10." ,3) == 0) isLocal = true;
|
|
if ( m_ip == 16777343 ) isLocal = true; // 127.0.0.1 (0x0100007F read as a little-endian int)
|
|
// . if local then make web page download max size unlimited
|
|
// . this is for adding the gbdmoz.urls.txt.* files to
|
|
// populate dmoz. those files are about 25MB each.
|
|
if ( isLocal ) {
|
|
//maxDownload = -1;
|
|
r->m_maxTextDocLen = -1;
|
|
r->m_maxOtherDocLen = -1;
|
|
}
|
|
}
|
|
// m_maxCacheAge is set for getting contact or root docs in
|
|
// getContactDoc() and getRootDoc() and it only applies to
|
|
// titleRecs in titledb i guess... but still... for Msg13 it applies
|
|
// to its cache ... for robots.txt files too
|
|
r->m_maxCacheAge = m_maxCacheAge;
|
|
r->m_urlIp = *ip;
|
|
r->m_firstIp = m_firstIp;
|
|
r->m_urlHash48 = getFirstUrlHash48();
|
|
if ( r->m_maxTextDocLen < 100000 ) r->m_maxTextDocLen = 100000;
|
|
if ( r->m_maxOtherDocLen < 200000 ) r->m_maxOtherDocLen = 200000;
|
|
r->m_forwardDownloadRequest = (bool)m_forwardDownloadRequest;
|
|
r->m_useTestCache = (bool)useTestCache;
|
|
r->m_spideredTime = getSpideredTime();//m_spideredTime;
|
|
r->m_ifModifiedSince = 0;
|
|
r->m_skipHammerCheck = 0;
|
|
|
|
//if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
|
|
//else r->m_addToTestCache = false;
|
|
r->m_addToTestCache = (bool)useTestCache;
|
|
|
|
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
|
|
r->ptr_cookie = m_redirCookieBuf.getBufStart();
|
|
r->size_cookie = m_redirCookieBuf.length() + 1;
|
|
// . only do once per redirect
|
|
// . do not invalidate because we might have to carry it
|
|
// through to the next redir... unless we change domain
|
|
// . this fixes the nyt.com/nytimes.com bug some more
|
|
//m_redirCookieBufValid = false;
|
|
}
|
|
|
|
// . this is -1 if unknown. none found in robots.txt or provided
|
|
// in the custom crawl parms.
|
|
// . it should also be 0 for the robots.txt file itself
|
|
r->m_crawlDelayMS = *cd;
|
|
|
|
// let's time our crawl delay from the initiation of the download
|
|
// not from the end of the download. this will make things a little
|
|
// faster but could slam servers more.
|
|
r->m_crawlDelayFromEnd = false;
|
|
|
|
// need this in order to get all languages, etc. and avoid having
|
|
// to set words class at the spider compression proxy level
|
|
r->m_forEvents = 0;
|
|
// new stuff
|
|
r->m_contentHash32 = 0;
|
|
// if valid in SpiderRequest, use it. if spider compression proxy
|
|
// sees the content is unchanged it will not send it back! it will
|
|
// send back g_errno = EDOCUNCHANGED or something
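// (gotHttpReply() below turns EDOCUNCHANGED into m_recycleContent=true
// so we just reuse the content from the old title rec)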
|
|
if ( m_sreqValid )
|
|
r->m_contentHash32 = m_sreq.m_contentHash32;
|
|
|
|
// if we have the old doc already set use that
|
|
if ( od )
|
|
r->m_contentHash32 = od->m_contentHash32;
|
|
|
|
// force floater usage on even if "use spider proxies" parms is off
|
|
// if we're a diffbot crawl and use robots is off.
|
|
//if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
|
|
// r->m_forceUseFloaters = true;
|
|
|
|
// for beta testing, make it a collection specific parm for diffbot
|
|
// so we can turn on manually
|
|
if ( cr->m_forceUseFloaters )
|
|
r->m_forceUseFloaters = true;
|
|
|
|
// eventgurubot is the max
|
|
//char *userAgent = g_conf.m_spiderUserAgent;
|
|
// hardcode it
|
|
//char *userAgent = "EventGuruBot";
|
|
//int32_t uaLen = gbstrlen(userAgent);
|
|
//if ( uaLen > 12 ) {
|
|
// log("spider: user agent string too long");
|
|
// uaLen = 12;
|
|
//}
|
|
//strncpy(r->m_userAgent,userAgent,uaLen);
|
|
//r->m_userAgent[uaLen] = '\0';
|
|
|
|
// turn this off too
|
|
r->m_attemptedIframeExpansion = false;
|
|
|
|
r->m_collnum = (collnum_t)-1;
|
|
if ( m_collnumValid )r->m_collnum = m_collnum;
|
|
|
|
// turn off
|
|
r->m_useCompressionProxy = false;
|
|
r->m_compressReply = false;
|
|
r->m_isCustomCrawl = cr->m_isCustomCrawl;
|
|
|
|
// set it for this too
|
|
if ( g_conf.m_useCompressionProxy &&
|
|
// do not use for the test collection ever, that is qa'ing
|
|
strcmp(cr->m_coll,"qatest123") ) {
|
|
r->m_useCompressionProxy = true;
|
|
r->m_compressReply = true;
|
|
}
|
|
|
|
// are we a robots.txt file?
|
|
//bool isRobotsTxt = isRobotsTxtFile ( cu->getUrl() , cu->getUrlLen());
|
|
|
|
char *td = getTestDir();
|
|
if ( td ) strncpy ( r->m_testDir, td, 31);
|
|
|
|
//r->m_isPageParser = getIsPageParser();
|
|
//r->m_isPageInject = ( m_sreqValid && m_sreq.m_isInjecting );
|
|
|
|
// if current url IS NOT EQUAL to first url then set redir flag
|
|
if ( strcmp(cu->m_url,m_firstUrl.m_url) )
|
|
r->m_skipHammerCheck = 1;
|
|
// or if this an m_extraDoc or m_rootDoc for another url then
|
|
// do not bother printing the hammer ip msg in msg13.cpp either
|
|
if ( m_isChildDoc )
|
|
r->m_skipHammerCheck = 1;
|
|
|
|
if ( m_contentInjected ) // oldsrValid && m_sreq.m_isInjecting )
|
|
r->m_skipHammerCheck = 1;
|
|
|
|
// or if ahrefs
|
|
if ( strncmp(cu->m_url,"http://api.ahrefs.com/",22) == 0 )
|
|
r->m_skipHammerCheck = 1;
|
|
|
|
if ( r->m_skipHammerCheck )
|
|
log(LOG_DEBUG,"build: skipping hammer check");
|
|
|
|
// if we had already spidered it... try to save bandwidth and time
|
|
if ( od ) {
|
|
// sanity check
|
|
if ( ! od->m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// only get it if modified since last spider time
|
|
r->m_ifModifiedSince = od->m_spideredTime;
|
|
}
|
|
|
|
// tell msg13 he is scraping...
|
|
if ( m_sreqValid && m_sreq.m_isScraping )
|
|
r->m_isScraping = 1;
|
|
|
|
// if doing frame expansion on a doc we just downloaded as the
|
|
// spider proxy, we are asking ourselves now to download the url
|
|
// from an <iframe src=...> tag. so definitely use msg13 again
|
|
// so it can use the robots.txt cache, and regular html page cache.
|
|
if ( m_isSpiderProxy ) {
|
|
r->m_useCompressionProxy = false;
|
|
r->m_compressReply = false;
|
|
r->m_skipHammerCheck = 1;
|
|
//r->m_requireGoodDate = false;
|
|
// no frames within frames
|
|
r->m_attemptedIframeExpansion = 1;
|
|
log(LOG_DEBUG,"build: skipping hammer check 2");
|
|
|
|
}
|
|
|
|
// . use msg13 to download the file, robots.txt
|
|
// . msg13 will ensure only one download of that url w/ locks
|
|
// . msg13 can compress the http reply before
|
|
// sending it back to you via udp (compression proxy)
|
|
// . msg13 uses XmlDoc::getHttpReply() function to handle
|
|
// redirects, etc.? no...
|
|
bool isTestColl = false;
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
|
|
|
|
//if ( isTestColl && m_contentType == CT_IMAGE )
|
|
// isTestColl = false;
|
|
|
|
// sanity check. keep injections fast. no downloading!
|
|
if ( m_wasContentInjected ) {
|
|
log("xmldoc: url injection failed! error!");
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// sanity check
|
|
if ( m_deleteFromIndex ) {
|
|
log("xmldoc: trying to download page to delete");
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
m_downloadStartTimeValid = true;
|
|
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();
|
|
|
|
if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
|
|
// return -1 if blocked
|
|
return (char **)-1;
|
|
return gotHttpReply ( );
|
|
}
|
|
// . this returns false if blocked, true otherwise
|
|
// . sets g_errno on error
|
|
char **XmlDoc::gotHttpReply ( ) {
|
|
// save it
|
|
int32_t saved = g_errno;
|
|
// note it
|
|
setStatus ( "got web page" );
|
|
|
|
// sanity check. are we already valid?
|
|
if ( m_httpReply && m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do not re-call
|
|
m_httpReplyValid = true;
|
|
|
|
// assume none
|
|
m_httpReply = NULL;
|
|
|
|
// . get the HTTP reply
|
|
// . TODO: free it on reset/destruction, we own it now
|
|
// . this is now NULL terminated thanks to changes in
|
|
// Msg13.cpp, but watch the buf size, need to subtract 1
|
|
// . therefore, we can set the Xml class with it
|
|
m_httpReply = m_msg13.m_replyBuf;
|
|
m_httpReplySize = m_msg13.m_replyBufSize;
|
|
// how much to free?
|
|
m_httpReplyAllocSize = m_msg13.m_replyBufAllocSize;
|
|
|
|
// sanity check
|
|
if ( m_httpReplySize > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
|
|
// what is this for? that makes it into a length not a size!
|
|
//if ( m_httpReplySize > 0 ) m_httpReplySize--;
|
|
// . save entire reply length we read from the net so
|
|
// SpiderCache
|
|
// can use it for its m_avgReplyLen for throttling
|
|
// . m_bufLen may change due to filtering
|
|
//m_replyLen = m_bufLen;
|
|
// . don't let UdpServer free m_buf when socket is
|
|
// recycled/closed
|
|
// . we own it now and are responsible for freeing it
|
|
//slot->m_readBuf = NULL;
|
|
m_msg13.m_replyBuf = NULL;
|
|
// relabel mem so we know where it came from
|
|
relabel( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . sanity test -- only if not the test collection
|
|
// . i.e. what are you doing downloading the page if there was
|
|
// a problem with the page we already know about
|
|
if ( m_indexCode && m_indexCodeValid &&
|
|
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
|
|
|
|
// fix this
|
|
if ( saved == EDOCUNCHANGED ) {
|
|
// assign content from it since unchanged
|
|
m_recycleContent = true;
|
|
// clear the error
|
|
saved = 0;
|
|
g_errno = 0;
|
|
}
|
|
|
|
// . save the error in download status
|
|
// . could now be EDOCUNCHANGED or EDOCNOGOODDATE (w/ tod)
|
|
m_downloadStatus = saved; // g_errno;
|
|
// validate
|
|
m_downloadStatusValid = true;
|
|
|
|
// update m_downloadEndTime if we should, used for sameIpWait
|
|
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
|
|
m_downloadEndTimeValid = true;
|
|
|
|
// make it so
|
|
g_errno = saved;
|
|
|
|
bool doIncrement = true;
|
|
if ( m_isChildDoc ) doIncrement = false;
|
|
if ( m_incrementedDownloadCount ) doIncrement = false;
|
|
|
|
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
|
|
|
// if it doesn't match the crawl pattern, just the process pattern
|
|
// then do not increment download successes
|
|
if ( doIncrement &&
|
|
cr->m_isCustomCrawl == 1 &&
|
|
// allow seeds to be counted
|
|
! isSeed &&
|
|
//! sreq->m_isPageReindex &&
|
|
//! sreq->m_isInjecting &&
|
|
! doesUrlMatchDiffbotCrawlPattern() )
|
|
doIncrement = false;
|
|
|
|
|
|
|
|
// . do not count bad http status in mime as failure i guess
|
|
// . do not inc this count for robots.txt and root page downloads, etc.
|
|
if ( doIncrement ) {
|
|
cr->m_localCrawlInfo.m_pageDownloadSuccesses++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadSuccesses++;
|
|
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound++;
|
|
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound++;
|
|
m_incrementedDownloadCount = true;
|
|
cr->m_needsSave = true;
|
|
// changing status, resend local crawl info to all
|
|
cr->localCrawlInfoUpdate();
|
|
}
|
|
|
|
// this means the spider compression proxy's reply got corrupted
|
|
// over roadrunner's crappy wireless internet connection
|
|
if ( saved == ECORRUPTDATA ) return NULL;
|
|
// this one happens too! for the same reason...
|
|
if ( saved == EBADREPLYSIZE ) return NULL;
|
|
// might as well check this too while we're at it
|
|
if ( saved == ENOMEM ) return NULL;
|
|
|
|
// sanity check -- check after bailing on corruption because
|
|
// corrupted replies do not end in NULLs
|
|
if ( m_httpReplySize > 0 && m_httpReply[m_httpReplySize-1] ) {
|
|
log("http: httpReplySize=%"INT32" http reply does not end in \\0 "
|
|
"for %s in collnum=%"INT32". blanking out reply."
|
|
,m_httpReplySize
|
|
,m_firstUrl.m_url
|
|
,(int32_t)m_collnum
|
|
);
|
|
// free it i guess
|
|
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
|
// and reset it
|
|
m_httpReplySize = 0;
|
|
m_httpReply = NULL;
|
|
m_httpReplyAllocSize = 0;
|
|
// call it data corruption i guess for now
|
|
g_errno = ECORRUPTDATA;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// if its a bad gzip reply, a compressed http reply, then
|
|
// make the whole thing empty? some websites return compressed replies
|
|
// even though we do not ask for them. and then the compression
|
|
// is corrupt.
|
|
if ( saved == ECORRUPTHTTPGZIP ||
|
|
// if somehow we got a page too big for MAX_DGRAMS... treat
|
|
// it like an empty page...
|
|
saved == EMSGTOOBIG ) {
|
|
// free it i guess
|
|
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
|
// and reset it
|
|
m_httpReplySize = 0;
|
|
m_httpReply = NULL;
|
|
m_httpReplyAllocSize = 0;
|
|
}
|
|
|
|
// if errors were not local, reset g_errno and set m_indexCode
|
|
//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
|
|
//if ( g_errno == EBADMIME ) m_indexCode = EBADMIME;
|
|
// clear g_errno
|
|
//if ( m_indexCode ) g_errno = 0;
|
|
// return if cancelled, etc.
|
|
//if ( g_errno ) return NULL;
|
|
|
|
// clear this i guess
|
|
g_errno = 0;
|
|
|
|
/*
|
|
MDW: 2/8/16 this logic now below in getIsContentTruncated() function
|
|
|
|
// shortcut - convert size to length
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
|
|
m_isContentTruncated = false;
|
|
// was the content truncated? these might label a doc is truncated
|
|
// when it really is not... but we only use this for link spam stuff,
|
|
// so it should not matter too much. it should only happen rarely.
|
|
//if ( LEN >= cr->m_maxTextDocLen-1 ) m_isContentTruncated = true;
|
|
//if ( LEN >= cr->m_maxOtherDocLen-1 ) m_isContentTruncated = true;
|
|
if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
|
|
// set this
|
|
m_isContentTruncated2 = (bool)m_isContentTruncated;
|
|
// validate it
|
|
m_isContentTruncatedValid = true;
|
|
*/
|
|
|
|
return &m_httpReply;
|
|
}
|
|
|
|
char *XmlDoc::getIsContentTruncated ( ) {
|
|
if ( m_isContentTruncatedValid ) return &m_isContentTruncated2;
|
|
|
|
setStatus ( "getting is content truncated" );
|
|
|
|
// if recycling content use its download end time
|
|
if ( m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (char *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
m_isContentTruncated = od->m_isContentTruncated;
|
|
m_isContentTruncated2 = (bool)m_isContentTruncated;
|
|
m_isContentTruncatedValid = true;
|
|
return &m_isContentTruncated2;
|
|
}
|
|
}
|
|
|
|
// need a valid reply
|
|
char **replyPtr = getHttpReply ();
|
|
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char *)replyPtr;
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (char *)ct;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// shortcut - convert size to length
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
|
|
m_isContentTruncated = false;
|
|
// was the content truncated? these might label a doc as truncated
|
|
// when it really is not... but we only use this for link spam stuff,
|
|
// so it should not matter too much. it should only happen rarely.
|
|
if ( cr->m_maxTextDocLen >= 0 &&
|
|
LEN >= cr->m_maxTextDocLen-1 &&
|
|
*ct == CT_HTML )
|
|
m_isContentTruncated = true;
|
|
|
|
if ( cr->m_maxOtherDocLen >= 0 &&
|
|
LEN >= cr->m_maxOtherDocLen-1 &&
|
|
*ct != CT_HTML )
|
|
m_isContentTruncated = true;
|
|
|
|
//if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
|
|
// set this
|
|
m_isContentTruncated2 = (bool)m_isContentTruncated;
|
|
// validate it
|
|
m_isContentTruncatedValid = true;
|
|
|
|
return &m_isContentTruncated2;
|
|
}
|
|
|
|
int32_t *XmlDoc::getDownloadStatus ( ) {
|
|
if ( m_downloadStatusValid ) return &m_downloadStatus;
|
|
// log it
|
|
setStatus ( "getting download status");
|
|
// if recycling content, we're 200!
|
|
if ( m_recycleContent ) {
|
|
m_downloadStatus = 0;
|
|
m_downloadStatusValid = true;
|
|
return &m_downloadStatus;
|
|
}
|
|
// get ip
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
|
|
// . first try ip
|
|
// . this means the dns lookup timed out
|
|
if ( *ip == -1 ) {
|
|
m_downloadStatus = EDNSTIMEDOUT;
|
|
m_downloadStatusValid = true;
|
|
return &m_downloadStatus;
|
|
}
|
|
// this means ip does not exist
|
|
if ( *ip == 0 ) {
|
|
m_downloadStatus = EBADIP;
|
|
m_downloadStatusValid = true;
|
|
return &m_downloadStatus;
|
|
}
|
|
// need a valid reply
|
|
char **reply = getHttpReply ();
|
|
if ( ! reply || reply == (void *)-1 ) return (int32_t *)reply;
|
|
// must be valid now
|
|
if ( ! m_downloadStatusValid ) { char *xx=NULL;*xx=0; }
|
|
// return it
|
|
return &m_downloadStatus;
|
|
}
|
|
|
|
int64_t *XmlDoc::getDownloadEndTime ( ) {
|
|
if ( m_downloadEndTimeValid ) return &m_downloadEndTime;
|
|
// log it
|
|
setStatus ( "getting download end time");
|
|
|
|
// do not cause us to core in getHttpReply2() because m_deleteFromIndex
|
|
// is set to true...
|
|
if ( m_deleteFromIndex ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
return &m_downloadEndTime;
|
|
}
|
|
|
|
// if recycling content use its download end time
|
|
if ( m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (int64_t *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
m_downloadEndTime = od->m_downloadEndTime;
|
|
m_downloadEndTimeValid = true;
|
|
return &m_downloadEndTime;
|
|
}
|
|
}
|
|
|
|
// need a valid reply
|
|
char **reply = getHttpReply ();
|
|
if ( ! reply || reply == (void *)-1 ) return (int64_t *)reply;
|
|
// must be valid now
|
|
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0;}
|
|
// return it
|
|
return &m_downloadEndTime;
|
|
}
|
|
|
|
|
|
int16_t *XmlDoc::getHttpStatus ( ) {
|
|
// if we got a title rec then return that
|
|
if ( m_httpStatusValid ) return &m_httpStatus;
|
|
// get mime otherwise
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (int16_t *)mime;
|
|
// get from that
|
|
m_httpStatus = mime->getHttpStatus();
|
|
m_httpStatusValid = true;
|
|
return &m_httpStatus;
|
|
}
|
|
|
|
HttpMime *XmlDoc::getMime () {
|
|
if ( m_mimeValid ) return &m_mime;
|
|
|
|
// log debug
|
|
setStatus("getting http mime");
|
|
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1) return (HttpMime *)cu;
|
|
|
|
// injection from SpiderLoop.cpp sets this to true
|
|
if ( m_useFakeMime ) {
|
|
usefake:
|
|
m_mime.set ( NULL , 0 , cu );
|
|
m_mime.setHttpStatus ( 200 );
|
|
m_mime.setContentType ( CT_HTML );
|
|
m_mimeValid = true;
|
|
return &m_mime;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if recycling content, fake this mime
|
|
if ( cr->m_recycleContent || m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (HttpMime *)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// . this is non-NULL if it existed
|
|
// . fake it for now
|
|
if ( od ) goto usefake;
|
|
}
|
|
|
|
// need a valid reply
|
|
char **reply = getHttpReply ();
|
|
if ( ! reply || reply == (void *)-1 ) return (HttpMime *)reply;
|
|
|
|
// fake it for now
|
|
m_mime.set ( NULL , 0 , cu );
|
|
m_mime.setHttpStatus ( 200 );
|
|
m_mime.setContentType ( CT_HTML );
|
|
|
|
// shortcut
|
|
int32_t LEN = m_httpReplySize - 1;
|
|
|
|
// validate it
|
|
m_mimeValid = true;
|
|
|
|
// TODO: try again on failures because server may have been overloaded
|
|
// and closed the connection w/o sending anything
|
|
if ( LEN>0 && ! m_mime.set ( m_httpReply , LEN , cu ) ) {
|
|
// set this on mime error
|
|
//m_indexCode = EBADMIME;
|
|
// return a fake thing. content length is 0.
|
|
return &m_mime;
|
|
}
|
|
|
|
// . check the mime status, should be in the 200's for success
|
|
// . spider should redirect on 3xx codes
|
|
// . 404 means not found, etc.
|
|
// . 304 is not modified since
|
|
// . >= 300 should only happen if redirect chain was too long to follow
|
|
//int32_t httpStatus = m_mime.getHttpStatus();
|
|
// sanity check, these must be reserved! no longer, we have
|
|
// a separate m_httpStatus in the SpiderReply class now
|
|
//if ( mstrerror(httpStatus) ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
//if ( m_indexCode ) { char *xx=NULL;*xx=0; }
|
|
// set it
|
|
//m_indexCode = httpStatus;
|
|
// clear if it was ok though
|
|
//if ( m_indexCode == 200 ) m_indexCode = 0;
|
|
// bail out now
|
|
return &m_mime;
|
|
}
|
|
|
|
// need to use "char **" since content might be NULL itself, if none
|
|
char **XmlDoc::getContent ( ) {
|
|
if ( m_contentValid ) return &m_content;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// recycle?
|
|
if ( cr->m_recycleContent || m_recycleContent ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
|
|
// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
m_content = od-> ptr_utf8Content;
|
|
m_contentLen = od->size_utf8Content - 1;
|
|
m_contentValid = true;
|
|
return &m_content;
|
|
}
|
|
if ( m_recycleContent )
|
|
log("xmldoc: failed to load old title rec "
|
|
"when recycle content was true and url = "
|
|
"%s",ptr_firstUrl);
|
|
// if could not find title rec and we are docid-based then
|
|
// we can't go any further!!
|
|
if ( m_setFromDocId ) {
|
|
log("xmldoc: null content for docid-based titlerec "
|
|
"lookup which was not found");
|
|
m_content = NULL;
|
|
m_contentLen = 0;
|
|
m_contentValid = true;
|
|
return &m_content;
|
|
}
|
|
}
|
|
|
|
if ( m_recycleContent ) {
|
|
if ( m_firstUrlValid )
|
|
log("xmldoc: failed to recycle content for %s. could "
|
|
"not load title rec",m_firstUrl.m_url);
|
|
else if ( m_docIdValid )
|
|
log("xmldoc: failed to recycle content for %"UINT64". "
|
|
"could "
|
|
"not load title rec",m_docId );
|
|
else
|
|
log("xmldoc: failed to recycle content. "
|
|
"could not load title rec" );
|
|
// let's let it pass and just download i guess, then
|
|
// we can get page stats for urls not in the index
|
|
//g_errno = EBADENGINEER;
|
|
//return NULL;
|
|
}
|
|
|
|
|
|
// if we were set from a title rec then we do not have the original
|
|
// content, and caller should be calling getUtf8Content() anyway!!
|
|
if ( m_setFromTitleRec ) { char *xx=NULL; *xx=0; }
|
|
|
|
// query reindex has m_setFromDocId to true and we WANT to re-download
|
|
// the content... so why did i have this here? MDW 9/25/2014
|
|
//if ( m_setFromDocId ) { char *xx=NULL; *xx=0; }
|
|
|
|
// recycle?
|
|
//if ( m_recycleContent ) { char *xx=NULL; *xx=0; }
|
|
|
|
// get the mime first
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (char **)mime;
|
|
|
|
// http reply must be valid
|
|
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make it valid
|
|
m_contentValid = true;
|
|
|
|
// assume none
|
|
m_content = NULL;
|
|
m_contentLen = 0;
|
|
|
|
// all done if no reply
|
|
if ( ! m_httpReply ) return &m_content;
|
|
|
|
// set the content, account for mime header
|
|
m_content = m_httpReply + mime->getMimeLen() ;
|
|
m_contentLen = m_httpReplySize - mime->getMimeLen() ;
|
|
|
|
// watch out for this!
|
|
if ( m_useFakeMime ) {
|
|
m_content = m_httpReply;
|
|
m_contentLen = m_httpReplySize;
|
|
}
|
|
|
|
// why is this not really the size???
|
|
m_contentLen--;
|
|
|
|
// sanity check
|
|
if ( m_contentLen < 0 ) { char *xx = NULL; *xx = 0; }
|
|
return &m_content;
|
|
}
|
|
|
|
char getContentTypeFromContent ( char *p , int32_t niceness ) {
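// . only the first ~100 bytes are scanned for a <!doctype ...> tag
// . returns 0 if nothing recognizable is found so the caller keeps the
//   content type from the http mime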
|
|
char ctype = 0;
|
|
// max
|
|
char *pmax = p + 100;
|
|
// check that out
|
|
for ( ; p && *p && p < pmax ; p++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( p[0] != '<' ) continue;
|
|
if ( p[1] != '!' ) continue;
|
|
if ( to_lower_a(p[2]) != 'd' ) continue;
|
|
if ( strncasecmp(p,"<!doctype ",10) ) continue;
|
|
char *dt = p + 10;
|
|
// skip spaces
|
|
for ( ; *dt ; dt++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( ! is_wspace_a ( *dt ) ) break;
|
|
}
|
|
// point to that
|
|
if ( ! strncasecmp(dt,"html" ,4) ) ctype = CT_HTML;
|
|
if ( ! strncasecmp(dt,"xml" ,3) ) ctype = CT_XML;
|
|
if ( ! strncasecmp(dt,"text/html",9) ) ctype = CT_HTML;
|
|
if ( ! strncasecmp(dt,"text/xml" ,8) ) ctype = CT_XML;
|
|
break;
|
|
}
|
|
return ctype;
|
|
}
|
|
|
|
uint8_t *XmlDoc::getContentType ( ) {
|
|
if ( m_contentTypeValid ) return &m_contentType;
|
|
// log debug
|
|
setStatus("getting content type");
|
|
// get the mime first
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (HttpMime *)-1 ) return (uint8_t *)mime;
|
|
// then get mime
|
|
m_contentType = mime->getContentType();
|
|
// but if they specify <!DOCTYPE html> in the document that overrides
|
|
// the content type in the mime! fixes planet.mozilla.org
|
|
char **pp = getContent();
|
|
if ( ! pp || pp == (void *)-1 ) return (uint8_t *)pp;
|
|
char *p = *pp;
|
|
// scan content for content type. returns 0 if none found.
|
|
char ctype2 = getContentTypeFromContent ( p , m_niceness );
|
|
// valid?
|
|
if ( ctype2 != 0 ) m_contentType = ctype2;
|
|
// it is valid now
|
|
m_contentTypeValid = true;
|
|
// give it to them
|
|
return &m_contentType;
|
|
}
|
|
|
|
|
|
// . similar to getMetaRedirUrl but look for different strings
|
|
// . rel="canonical" or rel=canonical in a link tag.
|
|
Url **XmlDoc::getCanonicalRedirUrl ( ) {
|
|
// return if we got it
|
|
if ( m_canonicalRedirUrlValid ) return &m_canonicalRedirUrlPtr;
|
|
|
|
//if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// assume none in doc
|
|
m_canonicalRedirUrlPtr = NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// disable for crawlbot, not good really for deduping
|
|
if ( cr->m_isCustomCrawl ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
if ( ! cr->m_useCanonicalRedirects ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
|
|
// are we site root page? don't follow canonical url then.
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (Url **)isRoot;
|
|
if ( *isRoot ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
// if this page has an inlink, then let it stand
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
|
|
if ( info1->getNumGoodInlinks() > 0 ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
|
|
// these canonical links only supported in xml/html i think
|
|
if ( *ct != CT_HTML && *ct != CT_XML ) {
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Url **)xml;
|
|
|
|
// scan nodes looking for a <link> node. like getBaseUrl()
|
|
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
|
|
// breathe some
|
|
QUICKPOLL(m_niceness);
|
|
// 12 is the <base href> tag id
|
|
if ( xml->getNodeId ( i ) != TAG_LINK ) continue;
|
|
// get the href field of this base tag
|
|
int32_t linkLen;
|
|
char *link = (char *) xml->getString ( i, "href", &linkLen );
|
|
// skip if not valid
|
|
if ( ! link || linkLen == 0 ) continue;
|
|
// must also have rel=canonical
|
|
int32_t relLen;
|
|
char *rel = xml->getString(i,"rel",&relLen);
|
|
if ( ! rel ) continue;
|
|
// skip if does not match "canonical"
|
|
if ( strncasecmp(rel,"canonical",relLen) ) continue;
|
|
// allow for relative urls
|
|
Url *cu = getCurrentUrl();
|
|
// set base to it. addWWW=false
|
|
m_canonicalRedirUrl.set(cu,link,linkLen,false);//true
|
|
// assume it is not our url
|
|
bool isMe = false;
|
|
// if it is us, then skip!
|
|
if(strcmp(m_canonicalRedirUrl.getUrl(),m_firstUrl.getUrl())==0)
|
|
isMe = true;
|
|
// might also be our redir url i guess
|
|
if(strcmp(m_canonicalRedirUrl.getUrl(),m_redirUrl.getUrl())==0)
|
|
isMe = true;
|
|
// if it is us, keep it NULL, it's not a redirect. we are
|
|
// the canonical url.
|
|
if ( isMe ) break;
|
|
// ignore if in an expanded iframe (<gbframe>) tag
|
|
char *pstart = xml->m_xml;
|
|
char *p = link;
|
|
// scan backwards
|
|
if ( ! m_didExpansion ) p = pstart;
|
|
bool skip = false;
|
|
for ( ; p > pstart ; p-- ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( p[0] != '<' )
|
|
continue;
|
|
if ( p[1] == '/' &&
|
|
p[2] == 'g' &&
|
|
p[3] == 'b' &&
|
|
p[4] == 'f' &&
|
|
p[5] == 'r' &&
|
|
p[6] == 'a' &&
|
|
p[7] == 'm' &&
|
|
p[8] == 'e' &&
|
|
p[9] == '>' )
|
|
break;
|
|
if ( p[1] == 'g' &&
|
|
p[2] == 'b' &&
|
|
p[3] == 'f' &&
|
|
p[4] == 'r' &&
|
|
p[5] == 'a' &&
|
|
p[6] == 'm' &&
|
|
p[7] == 'e' &&
|
|
p[8] == '>' ) {
|
|
skip = true;
|
|
break;
|
|
}
|
|
}
|
|
if ( skip ) continue;
|
|
// otherwise, it is not us, we are NOT the canonical url
|
|
// and we should not be indexed, but just add the canonical
|
|
// url as a spiderrequest into spiderdb, just like
|
|
// simplified meta redirect does.
|
|
m_canonicalRedirUrlPtr = &m_canonicalRedirUrl;
|
|
break;
|
|
}
|
|
|
|
m_canonicalRedirUrlValid = true;
|
|
return &m_canonicalRedirUrlPtr;
|
|
}
|
|
|
|
// returns false if none found
|
|
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
|
|
Url *cu ) {
|
|
// limit scan
|
|
char *limit = p + 30;
|
|
// skip whitespace
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// must be a num
|
|
if ( ! is_digit(*p) ) return false;
|
|
// init delay
|
|
int32_t delay = atol ( p );
|
|
// ignore long delays
|
|
if ( delay >= 10 ) return false;
|
|
// now find the semicolon, if any
|
|
for ( ; *p && p < limit && *p != ';' ; p++ );
|
|
// must have semicolon
|
|
if ( *p != ';' ) return false;
|
|
// skip it
|
|
p++;
|
|
// skip whitespace some more
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// must have URL
|
|
if ( strncasecmp(p,"URL",3) ) return false;
|
|
// skip that
|
|
p += 3;
|
|
// skip white space
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// then an equal sign
|
|
if ( *p != '=' ) return false;
|
|
// skip equal sign
|
|
p++;
|
|
// then maybe more whitespace
|
|
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
|
// an optional quote
|
|
if ( *p == '\"' ) p++;
|
|
// can also be a single quote!
|
|
if ( *p == '\'' ) p++;
|
|
// set the url start
|
|
char *url = p;
|
|
// now advance to next quote or space or >
|
|
for ( ; *p && !is_wspace_a(*p) &&
|
|
*p !='\'' &&
|
|
*p !='\"' &&
|
|
*p !='>' ;
|
|
p++);
|
|
// that is the end
|
|
char *urlEnd = p;
|
|
// get size
|
|
int32_t usize = urlEnd - url;
|
|
// skip if too big
|
|
if ( usize > 1024 ) {
|
|
log("build: meta redirurl of %"INT32" bytes too big",usize);
|
|
return false;
|
|
}
|
|
// get our current url
|
|
//Url *cu = getCurrentUrl();
|
|
// decode what we got
|
|
char decoded[MAX_URL_LEN];
|
|
// convert &amp; to "&"
|
|
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
|
|
decoded[decBytes]='\0';
|
|
// . then the url
|
|
// . set the url to the one in the redirect tag
|
|
// . but if the http-equiv meta redirect url starts with a '?'
|
|
// then just replace our cgi with that one
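// . e.g. if the current url is http://example.com/dir/page?a=1 and the
//   tag says URL=?b=2 the redirect target should come out as
//   http://example.com/dir/page?b=2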
|
|
if ( *url == '?' ) {
|
|
char foob[MAX_URL_LEN*2];
|
|
char *pf = foob;
|
|
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
|
|
gbmemcpy(foob,cu->getUrl(),cuBytes);
|
|
pf += cuBytes;
|
|
gbmemcpy ( pf , decoded , decBytes );
|
|
pf += decBytes;
|
|
*pf = '\0';
|
|
metaRedirUrl->set(foob);
|
|
}
|
|
// . otherwise, append it right on
|
|
// . use "url" as the base Url
|
|
// . it may be the original url or the one we redirected to
|
|
// . redirUrl is set to the original at the top
|
|
else
|
|
// addWWW = false, stripSessId=true
|
|
metaRedirUrl->set(cu,decoded,decBytes,false,true);
|
|
return true;
|
|
}
|
|
|
|
|
|
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
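// . we do a quick raw string scan first and only pay for a full Xml
//   parse if that scan finds a candidate; the Xml pass is what keeps a
//   refresh tag inside a document.write() script string from counting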
|
|
Url **XmlDoc::getMetaRedirUrl ( ) {
|
|
if ( m_metaRedirUrlValid ) return &m_metaRedirUrlPtr;
|
|
// get ptr to utf8 content
|
|
//char **u8 = getHttpReply();
|
|
//if ( ! u8 || u8 == (void *)-1 ) return (Url **)u8;
|
|
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
char *p = m_httpReply;
|
|
// subtract one since this is a size not a length
|
|
char *pend = p + m_httpReplySize - 1;//size_utf8Content;
|
|
|
|
// assume no meta refresh url
|
|
m_metaRedirUrlPtr = NULL;
|
|
// make it valid regardless i guess
|
|
m_metaRedirUrlValid = true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if we are recycling or injecting, do not consider meta redirects
|
|
if ( cr->m_recycleContent || m_recycleContent )
|
|
return &m_metaRedirUrlPtr;
|
|
|
|
// will this work in here?
|
|
//uint8_t *ct = getContentType();
|
|
//if ( ! ct ) return NULL;
|
|
|
|
Url *cu = getCurrentUrl();
|
|
|
|
bool gotOne = false;
|
|
|
|
// advance a bit, we are initially looking for the 'v' char
|
|
p += 10;
|
|
// begin the string matching loop
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// fix <!--[if lte IE 6]>
|
|
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
|
|
if ( *p == '!' &&
|
|
p[-1]=='<' &&
|
|
p[1] == '-' &&
|
|
p[2] == '-' ) {
|
|
// find end of comment
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( p[0] == '-' &&
|
|
p[1] == '-' &&
|
|
p[2] == '>' )
|
|
break;
|
|
}
|
|
// if found no end of comment, then stop
|
|
if ( p >= pend )
|
|
break;
|
|
// resume looking for meta redirect tags
|
|
continue;
|
|
}
|
|
// base everything off the equal sign
|
|
if ( *p != '=' ) continue;
|
|
// did we match "http-equiv="?
|
|
if ( to_lower_a(p[-1]) != 'v' ) continue;
|
|
if ( to_lower_a(p[-2]) != 'i' ) continue;
|
|
if ( to_lower_a(p[-3]) != 'u' ) continue;
|
|
if ( to_lower_a(p[-4]) != 'q' ) continue;
|
|
if ( to_lower_a(p[-5]) != 'e' ) continue;
|
|
if ( p[-6] != '-' ) continue;
|
|
if ( to_lower_a(p[-7]) != 'p' ) continue;
|
|
if ( to_lower_a(p[-8]) != 't' ) continue;
|
|
if ( to_lower_a(p[-9]) != 't' ) continue;
|
|
if ( to_lower_a(p[-10])!= 'h' ) continue;
|
|
// skip the equal sign
|
|
p++;
|
|
// skip quote if there
|
|
if ( *p == '\"' ) p++;
|
|
// must be "refresh", continue if not
|
|
if ( strncasecmp(p,"refresh",7) ) continue;
|
|
// skip that
|
|
p += 7;
|
|
// skip another quote if there
|
|
if ( *p == '\"' ) p++;
|
|
// limit the # of white spaces
|
|
char *limit = p + 20;
|
|
// skip white spaces
|
|
while ( *p && p < limit && is_wspace_a(*p) ) p++;
|
|
// must be content now
|
|
if ( strncasecmp(p,"content=",8) ) continue;
|
|
// skip that
|
|
p += 8;
|
|
// skip possible quote
|
|
if ( *p == '\"' ) p++;
|
|
// PARSE OUT THE URL
|
|
Url dummy;
|
|
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
|
|
continue;
|
|
gotOne = true;
|
|
break;
|
|
}
|
|
|
|
if ( ! gotOne )
|
|
return &m_metaRedirUrlPtr;
|
|
|
|
// to fix issue with scripts containing
|
|
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
|
|
// we have to get the Xml. we can't call getXml() because of
|
|
// recursion bugs so just do it directly here
|
|
|
|
Xml xml;
|
|
if ( ! xml.set ( m_httpReply ,
|
|
m_httpReplySize - 1, // make it a length
|
|
false , // ownData?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
false , // setParentsArg?
|
|
m_niceness ,
|
|
// assume html since getContentType() is recursive
|
|
// on us.
|
|
CT_HTML ) ) // *ct ) )
|
|
// return NULL on error with g_errno set
|
|
return NULL;
|
|
|
|
XmlNode *nodes = xml.getNodes();
|
|
int32_t n = xml.getNumNodes();
|
|
// find the first meta summary node
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != 68 ) continue;
|
|
// only get content for <meta http-equiv=..>
|
|
int32_t tagLen;
|
|
char *tag ;
|
|
tag = xml.getString ( i , "http-equiv" , &tagLen );
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// if not a refresh, skip it
|
|
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
|
|
// get the content
|
|
tag = xml.getString ( i ,"content", &tagLen );
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// PARSE OUT THE URL
|
|
if (!setMetaRedirUrlFromTag(tag,&m_metaRedirUrl,m_niceness,cu) )
|
|
continue;
|
|
// set it
|
|
m_metaRedirUrlPtr = &m_metaRedirUrl;
|
|
// return it
|
|
return &m_metaRedirUrlPtr;
|
|
}
|
|
|
|
// nothing found
|
|
return &m_metaRedirUrlPtr;
|
|
}
|
|
|
|
uint16_t getCharsetFast ( HttpMime *mime,
|
|
char *url,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int32_t niceness ){
|
|
|
|
int16_t charset = csUnknown;
|
|
|
|
if ( slen < 0 ) slen = 0;
|
|
|
|
char *pstart = s;
|
|
char *pend = s + slen;
|
|
|
|
char *cs = mime->getCharset();
|
|
int32_t cslen = mime->getCharsetLen();
|
|
if ( cslen > 31 ) cslen = 31;
|
|
if ( cs && cslen > 0 ) {
|
|
char *p2 = cs + cslen ; char c = *p2; *p2 = '\0';
|
|
// get it
|
|
charset = get_iana_charset ( cs , gbstrlen(cs) );
|
|
// restore
|
|
*p2 = c;
|
|
}
|
|
|
|
// look for Unicode BOM first though
|
|
cs = ucDetectBOM ( pstart , pend - pstart );
|
|
if ( cs && charset == csUnknown ) {
|
|
log(LOG_DEBUG, "build: Unicode BOM signature detected: %s",cs);
|
|
int32_t len = gbstrlen(cs); if ( len > 31 ) len = 31;
|
|
charset = get_iana_charset ( cs , len );
|
|
}
|
|
|
|
// prepare to scan doc
|
|
char *p = pstart;
|
|
|
|
// if the doc claims it is utf-8 let's double check because
|
|
// newmexicomusic.org says it's utf-8 in the mime header and it says
|
|
// it is another charset in a meta content tag, and it is NOT in
|
|
// utf-8, so don't trust that!
|
|
if ( charset == csUTF8 ) {
|
|
// loop over every char
|
|
for ( char *s = pstart ; s < pend ; s += getUtf8CharSize(s) ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// sanity check
|
|
if ( ! isFirstUtf8Char ( s ) ) {
|
|
// note it
|
|
log(LOG_DEBUG,
|
|
"build: mime says UTF8 but does not "
|
|
"seem to be for url %s",url);
|
|
// reset it back to unknown then
|
|
charset = csUnknown;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// do not scan the doc if we already got it set
|
|
if ( charset != csUnknown ) p = pend;
|
|
|
|
//
|
|
// it is inefficient to set xml just to get the charset.
|
|
// so let's put in some quick string matching for this!
|
|
//
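// typical things this matches:
//   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
//   <meta charset="utf-8">
//   <?xml version="1.0" encoding="UTF-8"?>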
|
|
|
|
// . how big is one char? usually this is 1 unless we are in utf16...
|
|
// . if we are in utf16 natively then this code needs to know that and
|
|
// set oneChar to 2! TODO!!
|
|
//char oneChar = 1;
|
|
// advance a bit, we are initially looking for the = sign
|
|
if ( p ) p += 10;
|
|
// begin the string matching loop
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// base everything off the equal sign
|
|
if ( *p != '=' ) continue;
|
|
// must have a 't' or 'g' before the equal sign
|
|
char c = to_lower_a(p[-1]);
|
|
// did we match "charset="?
|
|
if ( c == 't' ) {
|
|
if ( to_lower_a(p[-2]) != 'e' ) continue;
|
|
if ( to_lower_a(p[-3]) != 's' ) continue;
|
|
if ( to_lower_a(p[-4]) != 'r' ) continue;
|
|
if ( to_lower_a(p[-5]) != 'a' ) continue;
|
|
if ( to_lower_a(p[-6]) != 'h' ) continue;
|
|
if ( to_lower_a(p[-7]) != 'c' ) continue;
|
|
}
|
|
// did we match "encoding="?
|
|
else if ( c == 'g' ) {
|
|
if ( to_lower_a(p[-2]) != 'n' ) continue;
|
|
if ( to_lower_a(p[-3]) != 'i' ) continue;
|
|
if ( to_lower_a(p[-4]) != 'd' ) continue;
|
|
if ( to_lower_a(p[-5]) != 'o' ) continue;
|
|
if ( to_lower_a(p[-6]) != 'c' ) continue;
|
|
if ( to_lower_a(p[-7]) != 'n' ) continue;
|
|
if ( to_lower_a(p[-8]) != 'e' ) continue;
|
|
}
|
|
// if not either, go to next char
|
|
else
|
|
continue;
|
|
// . make sure a <xml or a <meta precedes us
|
|
// . do not look back more than 500 chars
|
|
char *limit = p - 500;
|
|
// assume charset= or encoding= did NOT occur in a tag
|
|
bool inTag = false;
|
|
// check crazy wrap if m_content was close to a NULL ptr...
|
|
if ( limit >= pend ) limit = pstart;
|
|
if ( limit < pstart ) limit = pstart;
|
|
for ( char *s = p ; s >= limit ; s -= 1 ) { // oneChar ) {
|
|
// break at > or <
|
|
if ( *s == '>' ) break;
|
|
if ( *s != '<' ) continue;
|
|
// . TODO: this could be in a quoted string too! fix!!
|
|
// . is it in a <meta> tag?
|
|
if ( to_lower_a(s[1]) == 'm' &&
|
|
to_lower_a(s[2]) == 'e' &&
|
|
to_lower_a(s[3]) == 't' &&
|
|
to_lower_a(s[4]) == 'a' ) {
|
|
inTag = true;
|
|
break;
|
|
}
|
|
// is it in an <xml> tag?
|
|
if ( to_lower_a(s[1]) == 'x' &&
|
|
to_lower_a(s[2]) == 'm' &&
|
|
to_lower_a(s[3]) == 'l' ) {
|
|
inTag = true;
|
|
break;
|
|
}
|
|
// is it in an <?xml> tag?
|
|
if ( to_lower_a(s[1]) == '?' &&
|
|
to_lower_a(s[2]) == 'x' &&
|
|
to_lower_a(s[3]) == 'm' &&
|
|
to_lower_a(s[4]) == 'l' ) {
|
|
inTag = true;
|
|
break;
|
|
}
|
|
}
|
|
// if not in a tag proper, it is useless
|
|
if ( ! inTag ) continue;
|
|
// skip over equal sign
|
|
p += 1;//oneChar;
|
|
// skip over ' or "
|
|
if ( *p == '\'' ) p += 1;//oneChar;
|
|
if ( *p == '\"' ) p += 1;//oneChar;
|
|
// keep start ptr
|
|
char *csString = p;
|
|
// set a limit
|
|
limit = p + 50;
|
|
if ( limit > pend ) limit = pend;
|
|
if ( limit < p ) limit = pend;
|
|
// stop at first special character
|
|
while ( p < limit &&
|
|
*p &&
|
|
*p !='\"' &&
|
|
*p !='\'' &&
|
|
! is_wspace_a(*p) &&
|
|
*p !='>' &&
|
|
*p != '<' &&
|
|
*p !='?' &&
|
|
*p !='/' &&
|
|
// fix yaya.pro-street.us which has
|
|
// charset=windows-1251;charset=windows-1"
|
|
*p !=';' &&
|
|
*p !='\\' )
|
|
p += 1;//oneChar;
|
|
// save it
|
|
char d = *p;
|
|
// do the actual NULL termination
|
|
*p = 0;
|
|
// get the character set
|
|
int16_t metaCs = get_iana_charset(csString, gbstrlen(csString));
|
|
// put it back
|
|
*p = d;
|
|
// update "charset" to "metaCs" if known, it overrides all
|
|
if (metaCs != csUnknown ) charset = metaCs;
|
|
// all done, only if we got a known char set though!
|
|
if ( charset != csUnknown ) break;
|
|
}
|
|
|
|
// alias these charsets so iconv understands
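// (gb18030 is a superset of gb2312 and cp949/x-windows-949 is a
// superset of euc-kr, so decoding with the bigger set is safe)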
|
|
if ( charset == csISO58GB231280 ||
|
|
charset == csHZGB2312 ||
|
|
charset == csGB2312 )
|
|
charset = csGB18030;
|
|
|
|
if ( charset == csEUCKR )
|
|
charset = csKSC56011987; //x-windows-949
|
|
|
|
// use utf8 if still unknown
|
|
if ( charset == csUnknown ) {
|
|
if ( g_conf.m_logDebugSpider )
|
|
logf(LOG_DEBUG,"doc: forcing utf8 charset");
|
|
charset = csUTF8;
|
|
}
|
|
|
|
// once again, if the doc is claiming utf8 let's double check it!
|
|
if ( charset == csUTF8 ) {
|
|
// use this for iterating
|
|
char size;
|
|
// loop over every char
|
|
for ( char *s = pstart ; s < pend ; s += size ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// set
|
|
size = getUtf8CharSize(s);
|
|
// sanity check
|
|
if ( ! isFirstUtf8Char ( s ) ) {
|
|
// but let 0x80 slide? it is for the
|
|
// 0x80 0x99 apostrophe i've seen for
|
|
// eventvibe.com. it did have a first byte,
|
|
// 0xe2 that led that sequence but it was
|
|
// converted into â by something that
|
|
// thought it was a latin1 byte.
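// (the right single quote U+2019 is 0xe2 0x80 0x99 in utf8, so losing
// the leading 0xe2 leaves the 0x80 0x99 pair we let slide here)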
|
|
if ( s[0] == (char)0x80 &&
|
|
s[1] == (char)0x99 ) {
|
|
s += 2;
|
|
size = 0;
|
|
continue;
|
|
}
|
|
// note it
|
|
log(LOG_DEBUG,
|
|
"build: says UTF8 (2) but does not "
|
|
"seem to be for url %s"
|
|
" Resetting to ISOLatin1.",url);
|
|
// reset it to ISO then! that's pretty common
|
|
// no! was causing problems for
|
|
// eventvibe.com/...Yacht because it had
|
|
// some messed up utf8 in it but it really
|
|
// was utf8. CRAP, but really messes up
|
|
// sunsetpromotions.com and washingtonia
|
|
// if we do not have this here
|
|
charset = csISOLatin1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
|
|
//char *csName = get_charset_str(charset);
|
|
|
|
// breathe
|
|
//QUICKPOLL ( m_niceness );
|
|
|
|
// if we are not supported, set m_indexCode
|
|
//if ( csName && ! supportedCharset(charset) ) {
|
|
// log("build: xml: Unsupported charset: %s", csName);
|
|
// g_errno = EBADCHARSET;
|
|
// return NULL;
|
|
// //charset = csUnknown;
|
|
// // i guess do not risk it
|
|
// //m_indexCode = EBADCHARSET;
|
|
//}
|
|
|
|
// all done
|
|
return charset;
|
|
}
|
|
|
|
|
|
uint16_t *XmlDoc::getCharset ( ) {
|
|
if ( m_charsetValid ) return &m_charset;
|
|
|
|
// . get ptr to filtered content
|
|
// . we can't get utf8 content yet until we know what charset this
|
|
// junk is so we can convert it!
|
|
char **fc = getFilteredContent();
|
|
if ( ! fc || fc == (void *)-1 ) return (uint16_t *)fc;
|
|
|
|
// scan document for two things:
|
|
// 1. charset= (in a <meta> tag)
|
|
// 2. encoding= (in an <?xml> tag)
|
|
char *pstart = *fc;
|
|
//char *pend = *fc + m_filteredContentLen;
|
|
|
|
// assume known charset
|
|
m_charset = csUnknown;
|
|
// make it valid regardless i guess
|
|
m_charsetValid = true;
|
|
|
|
// check in http mime for charset
|
|
HttpMime *mime = getMime();
|
|
|
|
m_charset = getCharsetFast ( mime ,
|
|
m_firstUrl.getUrl(),
|
|
pstart ,
|
|
m_filteredContentLen,
|
|
m_niceness );
|
|
m_charsetValid = true;
|
|
return &m_charset;
|
|
}
|
|
|
|
char *XmlDoc::getIsBinary ( ) {
|
|
if ( m_isBinaryValid ) return &m_isBinary;
|
|
|
|
// get the content
|
|
char **u8 = getUtf8Content();
|
|
if ( ! u8 || u8 == (char **)-1 ) return (char *)u8;
|
|
|
|
//char *ctype = getContentType();
|
|
//if ( ! ctype || ctype == (void *)-1 ) return (char *)ctype;
|
|
//bool doBinaryCheck = false;
|
|
// the "abq-g" query gives a lot of binary content, use that
|
|
// as a testbed to make sure we filter it out!
|
|
//if ( *ctype == CT_TEXT ) doBinaryCheck = true;
|
|
//if ( *ctype == CT_UNKNOWN ) doBinaryCheck = true;
|
|
//if ( *ctype == CT_XML ) doBinaryCheck = true;
|
|
//if ( *ctype == CT_HTML ) doBinaryCheck = true;
|
|
//if ( csEnum == csUnknown ) doBinaryCheck = true;
|
|
//if ( csEnum == csASCII ) doBinaryCheck = true;
|
|
//if ( csEnum == csISOLatin1 ) doBinaryCheck = true;
|
|
//if ( slen <= 0 ) doBinaryCheck = false;
|
|
// why shouldn't we binary check everything? now that we are utf8...
|
|
//doBinaryCheck = true;
|
|
|
|
// assume not
|
|
m_isBinary = false;
|
|
m_isBinaryValid = true;
|
|
|
|
// if content is not identified as a type known to us, then check it
|
|
// for binary characters. yes, this can be utf8 or utf16 and then
|
|
// detected as binary i think, but it should really be identified as
|
|
// being html or txt or something...
|
|
//if ( ! doBinaryCheck ) return &m_isBinary;
|
|
|
|
// use a table
|
|
char table[256];
|
|
memset ( table , 0 , 256 );
|
|
// see if we had deceitful binary content
|
|
char *s = ptr_utf8Content;
|
|
char *send = s + size_utf8Content - 1;
|
|
// for now just count the binary chars
|
|
int32_t count = 0;
|
|
|
|
// no content?
|
|
if ( ! s ) return &m_isBinary;
|
|
|
|
for ( ; s < send ; s += getUtf8CharSize(s) ) {
|
|
// yield
|
|
QUICKPOLL(m_niceness);
|
|
// skip valid utf8 characters
|
|
if ( getUtf8CharSize(s) > 1 ) continue;
|
|
// . do not count \0's
|
|
// . the fctypes.cpp isBinary array takes into account
|
|
// that people mix windows 1254 characters into
|
|
// latin-1. windows 1254 is a superset of latin-1.
|
|
// so the more common quotes and dashes are no longer
|
|
// counted as binary characters, but some of the
|
|
// rarer ones are! however, the "diff" count
|
|
// constraint helps us make up for that.
|
|
// . the first char of a utf8 character sequence always has
|
|
// the high bit off, so just test that...
|
|
if ( ! is_binary_a(*s) || ! *s ) continue;
|
|
// count it up
|
|
count++;
|
|
table[(unsigned char)*s]++;
|
|
}
|
|
// how many DIFFERENT bin chars do we have?
|
|
int32_t diff = 0;
|
|
for ( int32_t i = 0 ; i < 256 ; i++ )
|
|
if ( table[i] ) diff++;
|
|
// . is binary if more than 10 bin chars and at least 5
|
|
// DIFFERENT binary chars
|
|
// . is binary if more than 6% of chars are binary
|
|
if ( (count > 10 && diff>=5) || ( 100 * count ) / size_utf8Content>6) {
|
|
// note it for now
|
|
logf(LOG_DEBUG,"build: Got binary content for %s. "
|
|
"Zeroing out content. (diff=%"INT32" count=%"INT32" "
|
|
"len=%"INT32")",
|
|
m_firstUrl.getUrl(),diff,count,size_utf8Content-1);
|
|
// do not try to index binary content, but keep it
|
|
// around for site: queries or in case we have
|
|
// inlink text for it!
|
|
ptr_utf8Content = NULL;
|
|
size_utf8Content = 0;
|
|
m_isBinary = true;
|
|
}
|
|
return &m_isBinary;
|
|
}
// declare these two routines for using threads
|
|
static void filterDoneWrapper ( void *state , ThreadEntry *te ) ;
|
|
static void *filterStartWrapper_r ( void *state , ThreadEntry *te ) ;
|
|
|
|
// filters m_content if its pdf, word doc, etc.
|
|
char **XmlDoc::getFilteredContent ( ) {
|
|
// return it if we got it already
|
|
if ( m_filteredContentValid ) return &m_filteredContent;
|
|
|
|
// this must be valid
|
|
char **content = getContent();
|
|
if ( ! content || content == (void *)-1 ) return content;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
// it needs this
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
|
|
|
|
// make sure NULL terminated always
|
|
// Why? pdfs can have nulls embedded
|
|
// if ( m_content &&
|
|
// m_contentValid &&
|
|
// m_content[m_contentLen] ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
|
|
int32_t max , max2;
|
|
CollectionRec *cr;
|
|
bool filterable = false;
|
|
|
|
if ( m_calledThread ) goto skip;
|
|
|
|
// assume we do not need filtering by default
|
|
m_filteredContent = m_content;
|
|
m_filteredContentLen = m_contentLen;
|
|
m_filteredContentValid = true;
|
|
m_filteredContentAllocSize = 0;
|
|
|
|
// empty content?
|
|
if ( ! m_content ) return &m_filteredContent;
|
|
|
|
if ( *ct == CT_HTML ) return &m_filteredContent;
|
|
if ( *ct == CT_TEXT ) return &m_filteredContent;
|
|
if ( *ct == CT_XML ) return &m_filteredContent;
|
|
// javascript - sometimes has address information in it, so keep it!
|
|
if ( *ct == CT_JS ) return &m_filteredContent;
|
|
if ( m_contentLen == 0 ) return &m_filteredContent;
|
|
|
|
// we now support JSON for diffbot
|
|
if ( *ct == CT_JSON ) return &m_filteredContent;
|
|
|
|
if ( *ct == CT_ARC ) return &m_filteredContent;
|
|
if ( *ct == CT_WARC ) return &m_filteredContent;
|
|
|
|
// unknown content types are 0 since it is probably binary... and
|
|
// we do not want to parse it!!
|
|
if ( *ct == CT_PDF ) filterable = true;
|
|
if ( *ct == CT_DOC ) filterable = true;
|
|
if ( *ct == CT_XLS ) filterable = true;
|
|
if ( *ct == CT_PPT ) filterable = true;
|
|
if ( *ct == CT_PS ) filterable = true;
|
|
|
|
// if its a jpeg, gif, text/css etc. bail now
|
|
if ( ! filterable ) {
|
|
m_filteredContent = NULL;
|
|
m_filteredContentLen = 0;
|
|
m_filteredContentValid = true;
|
|
return &m_filteredContent;
|
|
}
|
|
|
|
// invalidate
|
|
m_filteredContentValid = false;
|
|
|
|
cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . if we have no filter specified...
|
|
// . usually "gbfilter" and it is a script in the working directory
|
|
//if ( ! cr->m_filter[0] ) {
|
|
// m_indexCode = EDOCBADCONTENTTYPE;
|
|
// return &m_filteredContent;
|
|
//}
|
|
|
|
// if not text/html or text/plain, use the other max
|
|
//max = MAXDOCLEN; // cr->m_maxOtherDocLen;
|
|
max = cr->m_maxOtherDocLen;
|
|
// now we base this on the pre-filtered length to save memory because
|
|
// our maxOtherDocLen can be 30M and when we have a lot of injections
|
|
// at the same time we lose all our memory quickly
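// e.g. a 1MB pdf gets at most a ~5MB output buffer here even if
// maxOtherDocLen is set to 30MB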
|
|
max2 = 5 * m_contentLen + 10*1024;
|
|
if ( max > max2 ) max = max2;
|
|
// user uses -1 to specify no maxTextDocLen or maxOtherDocLen
|
|
if ( max < 0 ) max = max2;
|
|
// make a buf to hold filtered reply
|
|
m_filteredContentAllocSize = max;
|
|
m_filteredContent = (char *)mmalloc(m_filteredContentAllocSize,"xdfc");
|
|
if ( ! m_filteredContent ) {
|
|
log("build: Could not allocate %"INT32" bytes for call to "
|
|
"content filter.",m_filteredContentMaxSize);
|
|
return NULL;
|
|
}
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// reset this here in case thread gets killed by the kill() call below
|
|
m_filteredContentLen = 0;
|
|
// update status msg so its visible in the spider gui
|
|
setStatus ( "filtering content" );
|
|
// reset this... why?
|
|
g_errno = 0;
|
|
// . call thread to call popen
|
|
// . callThread returns true on success, in which case we block
|
|
// . do not repeat
|
|
m_calledThread = true;
|
|
// reset this since filterStart_r() will set it on error
|
|
m_errno = 0;
|
|
|
|
// how can this be? don't core like this in thread, because it
|
|
// does not save our files!!
|
|
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do it
|
|
if ( g_threads.call ( FILTER_THREAD ,
|
|
MAX_NICENESS ,
|
|
this ,
|
|
filterDoneWrapper ,
|
|
filterStartWrapper_r ) )
|
|
// return -1 if blocked
|
|
return (char **)-1;
|
|
// clear error!
|
|
g_errno = 0;
|
|
// note it
|
|
log("build: Could not spawn thread for call to "
|
|
"content filter.");
|
|
// get the data
|
|
filterStart_r ( false ); // am thread?
|
|
|
|
// skip down here if thread has returned and we got re-called
|
|
skip:
|
|
|
|
// if size is 0, free the buf
|
|
if ( m_filteredContentLen <= 0 ) {
|
|
mfree ( m_filteredContent ,
|
|
m_filteredContentAllocSize,"fcas");
|
|
m_filteredContent = NULL;
|
|
m_filteredContentLen = 0;
|
|
m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
// did we have an error from the thread?
|
|
if ( m_errno ) g_errno = m_errno;
|
|
// but bail out if it set g_errno
|
|
if ( g_errno ) return NULL;
|
|
// must be valid now - sanity check
|
|
if ( ! m_filteredContentValid ) { char *xx=NULL;*xx=0; }
|
|
// return it
|
|
return &m_filteredContent;
|
|
}
|
|
|
|
// come back here
|
|
void filterDoneWrapper ( void *state , ThreadEntry *te ) {
|
|
// jump back into the brawl
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
// if size is 0, free the buf. have to do this outside the thread
|
|
// since malloc/free cannot be called in thread
|
|
if ( THIS->m_filteredContentLen <= 0 ) {
|
|
mfree ( THIS->m_filteredContent ,
|
|
THIS->m_filteredContentAllocSize,"fcas");
|
|
THIS->m_filteredContent = NULL;
|
|
THIS->m_filteredContentLen = 0;
|
|
THIS->m_filteredContentAllocSize = 0;
|
|
}
|
|
|
|
// . call the master callback
|
|
// . it will ultimately re-call getFilteredContent()
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// thread starts here
|
|
void *filterStartWrapper_r ( void *state , ThreadEntry *te ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->filterStart_r ( true ); // am thread?
|
|
return NULL;
|
|
}
|
|
|
|
//int my_system_r ( char *cmd , int32_t timeout ) ;
|
|
|
|
// sets m_errno on error
|
|
void XmlDoc::filterStart_r ( bool amThread ) {
|
|
// get thread id
|
|
pthread_t id = getpidtid();
|
|
// sanity check
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
// shortcut
|
|
int32_t ctype = m_contentType;
|
|
|
|
// assume none
|
|
m_filteredContentLen = 0;
|
|
|
|
//if ( amThread ) id = pthread_self();
|
|
//else id = getpid();
|
|
// pass the input to the program through this file
|
|
// rather than a pipe, since popen() seems broken
|
|
char in[1024];
|
|
snprintf(in,1023,"%sin.%"INT64"", g_hostdb.m_dir , (int64_t)id );
|
|
unlink ( in );
|
|
// collect the output from the filter from this file
|
|
char out[1024];
|
|
snprintf ( out , 1023,"%sout.%"INT64"", g_hostdb.m_dir, (int64_t)id );
|
|
unlink ( out );
|
|
// ignore errno from those unlinks
|
|
errno = 0;
|
|
// open the input file
|
|
retry11:
|
|
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry11;
|
|
m_errno = errno;
|
|
log("build: Could not open file %s for writing: %s.",
|
|
in,mstrerror(m_errno));
|
|
return;
|
|
}
|
|
// we are in a thread, this must be valid!
|
|
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0;}
|
|
|
|
retry12:
|
|
// write the content into the input file
|
|
int32_t w = write ( fd , m_content , m_contentLen );
|
|
// valgrind
|
|
if ( w < 0 && errno == EINTR ) goto retry12;
|
|
// did we get an error
|
|
if ( w != m_contentLen ) {
|
|
//int32_t w = fwrite ( m_buf , 1 , m_bufLen , pd );
|
|
//if ( w != m_bufLen ) {
|
|
m_errno = errno;
|
|
log("build: Error writing to %s: %s.",in,
|
|
mstrerror(m_errno));
|
|
close(fd);
|
|
return;
|
|
}
|
|
// close the file
|
|
close ( fd );
|
|
|
|
// shortcut
|
|
char *wdir = g_hostdb.m_dir;
|
|
|
|
// . open a pipe to pdf2html program
|
|
// . the output will go to stdout
|
|
char cmd[2048];
|
|
// different commands to filter different ctypes
|
|
// -i : ignore images
|
|
// -stdout: send output to stdout
|
|
// -c : generate complex document
|
|
// Google generates complex docs, but the large ones are horribly slow
|
|
// in the browser, but docs with 2 cols don't display right w/o -c.
|
|
// damn, -stdout doesn't work when -c is specified.
|
|
// These ulimit sizes are max virtual memory in kilobytes. let's
|
|
// keep them to 25 Megabytes
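// for a pdf the final command looks roughly like this (paths and
// thread id are just illustrative):
//   ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 \
//     ./pdftohtml -q -i -noframes -stdout ./in.1234 > ./out.1234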
|
|
if ( ctype == CT_PDF )
|
|
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
|
|
else if ( ctype == CT_DOC )
|
|
// "wdir" include trailing '/'? not sure
|
|
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; timeout 30s nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
|
|
else if ( ctype == CT_XLS )
|
|
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
|
|
// this is too buggy for now... causes hanging threads because it
|
|
// hangs, so i added 'timeout 10s' but that only works on newer
|
|
// linux version, so it'll just error out otherwise.
|
|
else if ( ctype == CT_PPT )
|
|
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/ppthtml %s > %s" , wdir , in , out );
|
|
else if ( ctype == CT_PS )
|
|
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30; timeout 10s nice -n 19 %s/pstotext %s > %s" , wdir , in , out );
|
|
else { char *xx=NULL;*xx=0; }
|
|
|
|
// breach sanity check
|
|
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
|
|
|
|
	// execute it
|
|
int retVal = gbsystem ( cmd );
|
|
if ( retVal == -1 )
|
|
log("gb: system(%s) : %s",
|
|
cmd,mstrerror(g_errno));
|
|
|
|
// all done with input file
|
|
// clean up the binary input file from disk
|
|
if ( unlink ( in ) != 0 ) {
|
|
// log error
|
|
log("gbfilter: unlink (%s): %s\n",in, strerror(errno));
|
|
// ignore it, since it was not a processing error per se
|
|
errno = 0;
|
|
}
|
|
|
|
// don't use too much memory, i think xhtml uses so much that it
|
|
// swaps out all the gb processes?
|
|
//struct rlimit lim;
|
|
//lim.rlim_cur = lim.rlim_max = 24 * 1024 * 1024 ;
|
|
//if ( setrlimit ( RLIMIT_AS , &lim ) )
|
|
// fprintf (stderr,"gbfilter:setrlimit: %s", strerror(errno) );
|
|
|
|
retry13:
|
|
fd = open ( out , O_RDONLY );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry13;
|
|
m_errno = errno;
|
|
log("gbfilter: Could not open file %s for reading: %s.",
|
|
out,mstrerror(m_errno));
|
|
return;
|
|
}
|
|
// sanity -- need room to store a \0
|
|
if ( m_filteredContentAllocSize < 2 ) { char *xx=NULL;*xx=0; }
|
|
// to read - leave room for \0
|
|
int32_t toRead = m_filteredContentAllocSize - 1;
|
|
retry14:
|
|
// read right from pipe descriptor
|
|
int32_t r = read (fd, m_filteredContent,toRead);
|
|
// note errors
|
|
if ( r < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry14;
|
|
log("gbfilter: reading output: %s",mstrerror(errno));
|
|
// this is often bad fd from an oom error, so ignore it
|
|
//m_errno = errno;
|
|
errno = 0;
|
|
r = 0;
|
|
}
|
|
// clean up shop
|
|
close ( fd );
|
|
// delete output file
|
|
unlink ( out );
|
|
|
|
// validate now
|
|
m_filteredContentValid = 1;
|
|
// save the new buf len
|
|
m_filteredContentLen = r;
|
|
// ensure enough room for null term
|
|
if ( r >= m_filteredContentAllocSize ) { char *xx=NULL;*xx=0; }
|
|
// ensure filtered stuff is NULL terminated so we can set the Xml class
|
|
m_filteredContent [ m_filteredContentLen ] = '\0';
|
|
// it is good
|
|
m_filteredContentValid = true;
|
|
|
|
// . at this point we got the filtered content
|
|
// . bitch if we didn't allocate enough space
|
|
if ( r > 0 && r == toRead )
|
|
log(LOG_LOGIC,"build: Had to truncate document to %"INT32" bytes "
|
|
"because did not allocate enough space for filter. "
|
|
"This should never happen. It is a hack that should be "
|
|
"fixed right.", toRead );
|
|
|
|
// if we got something, then we're done
|
|
//if ( r > 0 ) return;
|
|
// otherwise, free it up
|
|
// . NO! not in a thread!!
|
|
//mfree ( m_filteredContent , m_filteredContentAllocSize, "fcas" );
|
|
//m_filteredContent = NULL;
|
|
//m_filteredContentLen = 0;
|
|
//m_filteredContentAllocSize = 0;
|
|
}
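
// A minimal sketch (a hypothetical helper, not compiled in; kept here as
// documentation) of the content-type to external-converter mapping that the
// snprintf() calls in filterStart_r() build above. The real commands also
// wrap each binary in "ulimit -v 25000 ; ulimit -t 30 ; timeout ... nice -n 19"
// and, for antiword, export ANTIWORDHOME first.
/*
static const char *getFilterCmdFmt ( int32_t ctype ) {
	// format args are: working dir , input file , output file
	switch ( ctype ) {
	case CT_PDF: return "%s/pdftohtml -q -i -noframes -stdout %s > %s";
	case CT_DOC: return "%s/antiword %s > %s";
	case CT_XLS: return "%s/xlhtml %s > %s";
	case CT_PPT: return "%s/ppthtml %s > %s";
	case CT_PS : return "%s/pstotext %s > %s";
	default    : return NULL; // caller hits the sanity check above
	}
}
*/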
|
|
|
|
pid_t g_pid = -1;
|
|
int32_t g_ticker = 0;
|
|
int32_t g_filterTimeout = -1;
|
|
|
|
/*
|
|
static int startUp ( void *cmd ) ;
|
|
#include <sys/types.h> // waitpid()
|
|
#include <sys/wait.h> // waitpid()
|
|
#include <sched.h> // clone()
|
|
|
|
static char cloneStack[250000];
|
|
|
|
int my_system_r ( char *cmd , int32_t timeout ) {
|
|
// bail if cmd empty
|
|
if ( ! cmd ) {
|
|
log(LOG_LOGIC,"build: empty command.");
|
|
return -1;
|
|
}
|
|
errno = 0;
|
|
// this gives out of memory on newer kernels, was that causing our
|
|
// older kernels to crash, too, in addition to the e1000 driver?
|
|
//pid_t pid = fork();
|
|
// let's use clone() instead now
|
|
// error forking?
|
|
pid_t pid = clone ( startUp ,
|
|
cloneStack + 250000 ,
|
|
CLONE_FS | CLONE_FILES | CLONE_VM | SIGCHLD ,
|
|
cmd );
|
|
if (pid == -1) {
|
|
log("build: fork: %s.",mstrerror(errno));
|
|
return -1;
|
|
}
|
|
// sanity check
|
|
if ( g_pid != -1 ) { char *xx = NULL; *xx = 0; }
|
|
// set the process group id of this guy to itself, so he becomes
|
|
// the process leader, so any processes he spawns should all receive
|
|
// the same HUP or kill signals he receives. uhhhh probably not...
|
|
//setpgid ( pid , pid );
|
|
// save the pid globally so Threads.cpp can kill(9,g_pid) it if it
|
|
// stalls too long. but to measure how long it is out for, keep a
|
|
// ticker count. this ticker count is incremented in the sleep wrapper
|
|
// in Threads.cpp.
|
|
g_ticker = 0;
|
|
g_pid = pid;
|
|
g_filterTimeout = timeout;
|
|
loop:
|
|
int status;
|
|
if ( waitpid ( pid , &status , 0 ) == -1 ) {
|
|
// reset g_pid so Threads.cpp's kill wrapper chills out
|
|
if ( errno != EINTR ) {
|
|
log("build: waitpid pid=%"INT32": %s.",
|
|
(int32_t)g_pid,mstrerror(errno));
|
|
g_pid = -1;
|
|
return -1;
|
|
}
|
|
// if we got interrupted by a different signal keep waiting
|
|
goto loop;
|
|
}
|
|
// reset g_pid so Threads.cpp's kill wrapper chills out
|
|
g_pid = -1;
|
|
if ( status < 0 ) log("build: Got bad status from child.");
|
|
// we got the signal
|
|
return status;
|
|
}
|
|
|
|
int startUp ( void *cmd ) {
|
|
char *argv[4];
|
|
argv[0] = "sh";
|
|
argv[1] = "-c";
|
|
argv[2] = (char *)cmd;
|
|
argv[3] = 0;
|
|
char *envp[2];
|
|
char buf[1024];
|
|
// antiword needs this environment var so it can find
|
|
// the .antiword/ dir , we should put it in gb's working dir
|
|
snprintf(buf,1023,"HOME=%s", g_hostdb.m_dir );
|
|
envp[0] = buf;
|
|
envp[1] = 0;
|
|
execve("/bin/sh", argv, envp );
|
|
//exit(127);
|
|
return 1;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
|
|
// return downloaded content as utf8
|
|
char **XmlDoc::getRawUtf8Content ( ) {
|
|
// if we already computed it, return that
|
|
if ( m_rawUtf8ContentValid ) return &m_rawUtf8Content;
|
|
|
|
// . get our characterset
|
|
// . crap! this can be recursive. it calls getXml() which calls
|
|
// getUtf8Content() which is us!
|
|
uint16_t *charset = getCharset ( );
|
|
if ( ! charset || charset == (uint16_t *)-1 ) return (char **)charset;
|
|
|
|
char *csName = get_charset_str(*charset);
|
|
|
|
// . if not supported fix that!
|
|
// . m_indexCode should be set to EBADCHARSET ultimately, but not here
|
|
if ( ! supportedCharset(*charset) && csName ) {
|
|
m_rawUtf8Content = NULL;
|
|
m_rawUtf8ContentSize = 0;
|
|
m_rawUtf8ContentAllocSize = 0;
|
|
m_rawUtf8ContentValid = true;
|
|
return &m_rawUtf8Content;
|
|
}
|
|
|
|
// get ptr to filtered content
|
|
char **fc = getFilteredContent();
|
|
if ( ! fc || fc == (void *)-1 ) return (char **)fc;
|
|
|
|
// make sure NULL terminated always
|
|
if ( m_filteredContent &&
|
|
m_filteredContentValid &&
|
|
m_filteredContent[m_filteredContentLen] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// NULL out if no content
|
|
if ( ! m_filteredContent ) {
|
|
m_rawUtf8Content = NULL;
|
|
m_rawUtf8ContentSize = 0;
|
|
m_rawUtf8ContentAllocSize = 0;
|
|
m_rawUtf8ContentValid = true;
|
|
return &m_rawUtf8Content;
|
|
}
|
|
|
|
// assume already utf8
|
|
m_rawUtf8Content = m_filteredContent;
|
|
m_rawUtf8ContentSize = m_filteredContentLen + 1;
|
|
m_rawUtf8ContentAllocSize = 0;
|
|
|
|
// if we are not ascii or utf8 already, encode it into utf8
|
|
if ( m_rawUtf8ContentSize > 1 &&
|
|
csName &&
|
|
*charset != csASCII &&
|
|
*charset != csUTF8 ) {
|
|
// ok, no-go
|
|
//ptr_utf8Content = NULL;
|
|
m_rawUtf8Content = NULL;
|
|
// assume utf8 will be twice the size ... then add a little
|
|
int32_t need = (m_filteredContentLen * 2) + 4096;
|
|
char *buf = (char *) mmalloc(need, "Xml3");
|
|
// log oom error
|
|
if ( ! buf ) {
|
|
log("build: xml: not enough memory for utf8 buffer");
|
|
return NULL;
|
|
}
|
|
// sanity check
|
|
if ( ! csName ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
setStatus ( "converting doc to utf8" );
|
|
// returns # of bytes i guess
|
|
int32_t used = ucToUtf8 ( buf ,
|
|
// fix core dump by subtracting 10!
|
|
need - 10,
|
|
m_filteredContent ,
|
|
m_filteredContentLen ,
|
|
csName ,
|
|
-1 ,//allowBadChars
|
|
m_niceness );
|
|
// clear this if successful, otherwise, it sets errno
|
|
if ( used > 0 ) g_errno = 0;
|
|
// unrecoverable error? bad charset is g_errno == 7
|
|
// which is like argument list too long or something
|
|
// error from Unicode.cpp's call to iconv()
|
|
if ( g_errno )
|
|
log(LOG_INFO, "build: xml: failed parsing buffer: %s "
|
|
"(cs=%d)", mstrerror(g_errno), *charset);
|
|
if ( g_errno && g_errno != 7 ) {
|
|
mfree ( buf, need, "Xml3");
|
|
// do not index this doc, delete from spiderdb/tfndb
|
|
//if ( g_errno != ENOMEM ) m_indexCode = g_errno;
|
|
// if conversion failed NOT because of bad charset
|
|
// then return NULL now and bail out. probably ENOMEM
|
|
return NULL;
|
|
}
|
|
// if bad charset... just make doc empty as a utf8 doc
|
|
if ( g_errno == 7 ) {
|
|
used = 0;
|
|
buf[0] = '\0';
|
|
buf[1] = '\0';
|
|
// clear g_errno
|
|
g_errno = 0;
|
|
// and make a note for getIndexCode() so it will not
|
|
// bother indexing the doc! nah, just index it
|
|
// but with no content...
|
|
}
|
|
// crazy? this is pretty important...
|
|
if ( used + 10 >= need )
|
|
log("build: utf8 using too much buf space!!! u=%s",
|
|
getFirstUrl()->getUrl());
|
|
// re-assign
|
|
//ptr_utf8Content = buf;
|
|
//size_utf8Content = used + 1;
|
|
//m_utf8ContentAllocSize = need;
|
|
m_rawUtf8Content = buf;
|
|
m_rawUtf8ContentSize = used + 1;
|
|
m_rawUtf8ContentAllocSize = need;
|
|
}
|
|
|
|
// convert \0's to spaces. why do we see these in some pages?
|
|
// http://www.golflink.com/golf-courses/ has one in the middle after
|
|
// about 32k of content.
|
|
char *p = m_rawUtf8Content;
|
|
char *pend = p + m_rawUtf8ContentSize - 1;
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ! *p ) *p = ' ';
|
|
}
|
|
|
|
|
|
//
|
|
// VALIDATE the UTF-8
|
|
//
|
|
|
|
// . make a buffer to hold the decoded content now
|
|
// . we were just using the m_expandedUtf8Content buf itself, but "n"
|
|
	// ended up equalling m_expandedUtf8ContentSize one time for a
|
|
// doc, http://ediso.net/, which probably had corrupt utf8 in it,
|
|
// and that breached our buffer! so verify that this is good
|
|
// utf8, and that we can parse it without breaching our buffer!
|
|
p = m_rawUtf8Content;
|
|
// make sure NULL terminated always
|
|
if ( p[m_rawUtf8ContentSize-1]) { char *xx=NULL;*xx=0;}
|
|
// make sure we don't breach the buffer when parsing it
|
|
char size;
|
|
char *lastp = NULL;
|
|
for ( ; ; p += size ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( p >= pend ) break;
|
|
lastp = p;
|
|
size = getUtf8CharSize(p);
|
|
}
|
|
// overflow?
|
|
if ( p > pend && lastp ) {
|
|
// back up to the bad utf8 char that made us overshoot
|
|
p = lastp;
|
|
// space it out
|
|
for ( ; p < pend ; p++ ) *p = ' ';
|
|
// log it maybe due to us not being keep alive http server?
|
|
log("doc: fix bad utf8 overflow (because we are not "
|
|
"keepalive?) in doc %s",m_firstUrl.m_url);
|
|
}
|
|
// overflow?
|
|
if ( p != pend ) { char *xx=NULL;*xx=0; }
|
|
// sanity check for breach. or underrun in case we encountered a
|
|
// premature \0
|
|
if (p-m_rawUtf8Content!=m_rawUtf8ContentSize-1) {char*xx=NULL;*xx=0;}
|
|
|
|
// sanity -- must be \0 terminated
|
|
if ( m_rawUtf8Content[m_rawUtf8ContentSize-1] ) {char *xx=NULL;*xx=0; }
|
|
|
|
// it might have shrunk us
|
|
//m_rawUtf8ContentSize = n + 1;
|
|
// we are good to go
|
|
m_rawUtf8ContentValid = true;
|
|
|
|
//return &ptr_utf8Content;
|
|
return &m_rawUtf8Content;
|
|
}
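
// A minimal standalone sketch (a hypothetical helper, illustrative only, not
// compiled) of the validation walk done at the end of getRawUtf8Content()
// above: a truncated multi-byte sequence near the end of the buffer can make
// the getUtf8CharSize() stride overshoot the terminating \0, which is why the
// overshot tail gets blanked with spaces instead of being trusted.
/*
static bool utf8WalkOvershoots ( char *buf , int32_t sizeWithNull ) {
	char *p    = buf;
	char *pend = buf + sizeWithNull - 1; // points at the final \0
	while ( p < pend ) p += getUtf8CharSize(p);
	// if the last leading byte claimed more bytes than remain, p lands
	// past pend and the caller must repair the tail
	return ( p > pend );
}
*/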
|
|
|
|
// this is so Msg13.cpp can call getExpandedUtf8Content() to do its
|
|
// iframe expansion logic
|
|
void getExpandedUtf8ContentWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
char **retVal = THIS->getExpandedUtf8Content();
|
|
// return if blocked again
|
|
if ( retVal == (void *)-1 ) return;
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// now if there are any <iframe> tags let's substitute them for
|
|
// the html source they represent here. that way we will get all the
|
|
// information you see on the page. this is somewhat critical since
|
|
// a lot of pages have their content in the frame.
|
|
char **XmlDoc::getExpandedUtf8Content ( ) {
|
|
// if we already computed it, return that
|
|
if ( m_expandedUtf8ContentValid ) return &m_expandedUtf8Content;
|
|
|
|
// if called from spider compression proxy we need to set
|
|
// masterLoop here now
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getExpandedUtf8ContentWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
	// get the unexpanded content first
|
|
char **up = getRawUtf8Content ();
|
|
if ( ! up || up == (void *)-1 ) return up;
|
|
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
|
|
|
|
// NULL out if no content
|
|
if ( ! *up ) {
|
|
m_expandedUtf8Content = NULL;
|
|
m_expandedUtf8ContentSize = 0;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
// do not do iframe expansion in order to keep injections fast
|
|
if ( m_wasContentInjected ) {
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
bool skip = m_skipIframeExpansion;
|
|
|
|
// if we are a warc, arc or doc that consists of a sequence of
|
|
// sub-docs that we are indexing/injecting then skip iframe expansion
|
|
if ( isContainerDoc() )
|
|
skip = true;
|
|
|
|
// or if this is set to true
|
|
if ( skip ) {
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
|
|
|
|
// if we have a json reply, leave it alone... do not expand iframes
|
|
// in json, it will mess up the json
|
|
if ( *ct == CT_JSON ) {
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
|
|
|
|
// we need this so getExtraDoc does not core
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
|
|
|
|
// point to it
|
|
char *p = *up;
|
|
char *pend = *up + m_rawUtf8ContentSize; // includes \0
|
|
// declare crap up here so we can jump into the for loop
|
|
int32_t urlLen;
|
|
char *url;
|
|
char *fend;
|
|
Url furl;
|
|
XmlDoc **ped;
|
|
XmlDoc *ed;
|
|
bool inScript = false;
|
|
bool match;
|
|
// assign saved value if we got that
|
|
if ( m_savedp ) {
|
|
// restore "p"
|
|
p = m_savedp;
|
|
// update this
|
|
ed = m_extraDoc;
|
|
// and see if we got the mime now
|
|
goto gotMime;
|
|
}
|
|
// now loop for frame and iframe tags
|
|
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// if never found a frame tag, just keep on chugging
|
|
if ( *p != '<' ) continue;
|
|
// <script>?
|
|
if ( to_lower_a(p[1]) == 's' &&
|
|
to_lower_a(p[2]) == 'c' &&
|
|
to_lower_a(p[3]) == 'r' &&
|
|
to_lower_a(p[4]) == 'i' &&
|
|
to_lower_a(p[5]) == 'p' &&
|
|
to_lower_a(p[6]) == 't' )
|
|
inScript = 1;
|
|
// </script>?
|
|
if ( p[1]=='/' &&
|
|
to_lower_a(p[2]) == 's' &&
|
|
to_lower_a(p[3]) == 'c' &&
|
|
to_lower_a(p[4]) == 'r' &&
|
|
to_lower_a(p[5]) == 'i' &&
|
|
to_lower_a(p[6]) == 'p' &&
|
|
to_lower_a(p[7]) == 't' )
|
|
inScript = 0;
|
|
// . skip if in script
|
|
// . fixes guysndollsllc.com which has an iframe tag in
|
|
// a script section, "document.write ('<iframe..."
|
|
if ( inScript ) continue;
|
|
// iframe or frame?
|
|
match = false;
|
|
if ( to_lower_a(p[1]) == 'f' &&
|
|
to_lower_a(p[2]) == 'r' &&
|
|
to_lower_a(p[3]) == 'a' &&
|
|
to_lower_a(p[4]) == 'm' &&
|
|
to_lower_a(p[5]) == 'e' )
|
|
match = true;
|
|
if ( to_lower_a(p[1]) == 'i' &&
|
|
to_lower_a(p[2]) == 'f' &&
|
|
to_lower_a(p[3]) == 'r' &&
|
|
to_lower_a(p[4]) == 'a' &&
|
|
to_lower_a(p[5]) == 'm' &&
|
|
to_lower_a(p[6]) == 'e' )
|
|
match = true;
|
|
// skip tag if not iframe or frame
|
|
if ( ! match ) continue;
|
|
// check for frame or iframe
|
|
//if ( strncasecmp(p+1,"frame " , 6) &&
|
|
// strncasecmp(p+1,"iframe ", 7) )
|
|
// continue;
|
|
// get src tag (function in Words.h)
|
|
url = getFieldValue ( p , pend - p ,"src" , &urlLen );
|
|
// needs a src field
|
|
if ( ! url ) continue;
|
|
// "" is not acceptable either. techcrunch.com has
|
|
// <iframe src=""> which ends up embedding the root url.
|
|
if ( urlLen == 0 )
|
|
continue;
|
|
// skip if "about:blank"
|
|
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
|
|
continue;
|
|
// get our current url
|
|
//cu = getCurrentUrl();
|
|
// set our frame url
|
|
furl.set ( cu , url , urlLen );
|
|
// no recursion
|
|
if ( strcmp(furl.getUrl(),m_firstUrl.getUrl()) == 0 )
|
|
continue;
|
|
// must be http or https, not ftp! ftp was causing us to
|
|
// core in Msg22.cpp where it checks the url's protocol
|
|
// when trying to lookup the old title rec.
|
|
// http://sweetaub.ipower.com/ had an iframe with a ftp url.
|
|
if ( ! furl.isHttp() && ! furl.isHttps() ) continue;
|
|
// ignore google.com/ assholes for now
|
|
if ( strstr(furl.getUrl(),"google.com/" ) ) continue;
|
|
// and bing just to be safe
|
|
if ( strstr(furl.getUrl(),"bing.com/" ) ) continue;
|
|
// save it in case we have to return and come back later
|
|
m_savedp = p;
|
|
// break here
|
|
//log("mdw: breakpoing here");
|
|
// . download that. get as a doc. use 0 for max cache time
|
|
// . no, use 5 seconds since we often have the same iframe
|
|
// in the root doc that we have in the main doc, like a
|
|
// facebook iframe or something.
|
|
// . use a m_maxCacheAge of 5 seconds now!
|
|
ped = getExtraDoc ( furl.m_url , 5 );
|
|
// should never block
|
|
if ( ! ped ) {
|
|
log("xmldoc: getExpandedutf8content = %s",
|
|
mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
// . return -1 if it blocked???
|
|
// . no, this is not supported right now
|
|
// . it will mess up our for loop
|
|
if ( ped == (void *)-1 ) {char *xx=NULL;*xx=0;}
|
|
// cast it
|
|
ed = *ped;
|
|
// sanity
|
|
if ( ! ed ) { char *xx=NULL;*xx=0; }
|
|
// jump in here from above
|
|
gotMime:
|
|
// make it not use the ips.txt cache
|
|
//ed->m_useIpsTxtFile = false;
|
|
//ed->m_readFromTestCache = false;
|
|
// get the mime
|
|
HttpMime *mime = ed->getMime();
|
|
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
|
|
// if not success, do not expand it i guess...
|
|
if ( mime->getHttpStatus() != 200 ) {
|
|
// free it
|
|
nukeDoc ( ed );
|
|
// and continue
|
|
continue;
|
|
}
|
|
// update m_downloadEndTime if we should
|
|
if ( ed->m_downloadEndTimeValid ) {
|
|
// we must already be valid
|
|
if ( ! m_downloadEndTimeValid ) {char *xx=NULL;*xx=0;}
|
|
// only replace it if it had ip and robots.txt allowed
|
|
if ( ed->m_downloadEndTime )
|
|
m_downloadEndTime = ed->m_downloadEndTime;
|
|
}
|
|
|
|
// re-write that extra doc into the content
|
|
char **puc = ed->getRawUtf8Content();
|
|
// this should not block
|
|
//if ( puc == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// it blocked before! because the charset was not known!
|
|
if ( puc == (void *)-1 ) return (char **)puc;
|
|
// error?
|
|
if ( ! puc ) return (char **)puc;
|
|
// cast it
|
|
char *uc = *puc;
|
|
// or if no content, and no mime (like if robots.txt disallows)
|
|
if ( ! uc || ed->m_rawUtf8ContentSize == 1 ) {
|
|
// free it
|
|
nukeDoc ( ed );
|
|
// and continue
|
|
continue;
|
|
}
|
|
// size includes terminating \0
|
|
if ( uc[ed->m_rawUtf8ContentSize-1] ) { char *xx=NULL;*xx=0;}
|
|
|
|
// if first time we are expanding, set this
|
|
if ( ! m_oldp ) m_oldp = *up;
|
|
|
|
// find end of frame tag
|
|
fend = p;
|
|
for ( ; fend < pend ; fend += getUtf8CharSize(fend) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// if never found a frame tag, just keep on chugging
|
|
if ( *fend == '>' ) break;
|
|
}
|
|
// if no end to the iframe tag was found, bail then...
|
|
if ( fend >= pend ) continue;
|
|
// skip the >
|
|
fend++;
|
|
|
|
// insert the non-frame crap first AND the frame/iframe tag
|
|
m_esbuf.safeMemcpy ( m_oldp , fend - m_oldp );
|
|
// end the frame
|
|
//m_esbuf.safeMemcpy ( "</iframe>", 9 );
|
|
// use our own special tag so Sections.cpp can set
|
|
// Section::m_gbFrameNum which it uses internally
|
|
m_esbuf.safePrintf("<gbframe>"); // gbiframe
|
|
// identify javascript
|
|
bool javascript = false;
|
|
if ( *ed->getContentType() == CT_JS ) javascript = true;
|
|
// so we do not mine javascript for cities and states etc.
|
|
// in Address.cpp
|
|
if ( javascript ) m_esbuf.safePrintf("<script>");
|
|
// store that
|
|
m_esbuf.safeMemcpy ( uc , ed->m_rawUtf8ContentSize - 1 );
|
|
// our special tag has an end tag as well
|
|
if ( javascript ) m_esbuf.safePrintf("</script>");
|
|
m_esbuf.safePrintf("</gbframe>");
|
|
// free up ed
|
|
nukeDoc ( ed );
|
|
|
|
// end of frame tag, skip over whole thing
|
|
m_oldp = fend ;
|
|
// sanity check
|
|
if ( m_oldp > pend ) { char *xx=NULL;*xx=0; }
|
|
// another flag
|
|
m_didExpansion = true;
|
|
// count how many we did
|
|
if ( ++m_numExpansions >= 5 ) break;
|
|
}
|
|
// default
|
|
m_expandedUtf8Content = m_rawUtf8Content;
|
|
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
|
|
// point to expansion buffer if we did any expanding
|
|
if ( m_didExpansion ) {
|
|
// copy over the rest
|
|
m_esbuf.safeMemcpy ( m_oldp , pend - m_oldp );
|
|
// null term it
|
|
m_esbuf.pushChar('\0');
|
|
// and point to that buffer
|
|
m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf;
|
|
// include the \0 as part of the size
|
|
m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
|
|
}
|
|
// sanity -- must be \0 terminated
|
|
if ( m_expandedUtf8Content[m_expandedUtf8ContentSize-1] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
m_expandedUtf8ContentValid = true;
|
|
return &m_expandedUtf8Content;
|
|
}
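
// Example of the rewrite done above (assuming the framed url downloads with a
// 200 and non-empty utf8 content): the opening tag is kept as-is and the
// fetched sub-document is appended right after it inside our private
// <gbframe> marker so Sections.cpp can set Section::m_gbFrameNum. Roughly:
//
//   before:  ...<iframe src="http://host/menu.html"></iframe>...
//   after:   ...<iframe src="http://host/menu.html"><gbframe>
//            (utf8 content of menu.html, wrapped in <script>...</script>
//            if its content type was CT_JS)
//            </gbframe></iframe>...
//
// At most 5 frames are expanded per document (m_numExpansions check above),
// and json docs, container docs and injected content are never expanded.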
|
|
|
|
static SafeBuf s_cookieBuf;
|
|
|
|
|
|
|
|
|
|
|
|
void *systemStartWrapper_r ( void *state , ThreadEntry *t ) {
|
|
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
|
|
char filename[2048];
|
|
snprintf(filename,2048,"%sgbarchivefile%"UINT32".gz",
|
|
g_hostdb.m_dir,
|
|
(int32_t)(int64_t)THIS);
|
|
|
|
char cmd[MAX_URL_LEN+256];
|
|
snprintf( cmd,
|
|
MAX_URL_LEN+256,
|
|
"wget -q --header=\"Cookie: %s\" \"%s\" -O %s" ,
|
|
s_cookieBuf.getBufStart() ,
|
|
THIS->m_firstUrl.getUrl() ,
|
|
filename );
|
|
|
|
log("build: wget: %s",cmd );
|
|
|
|
int ret;
|
|
|
|
ret = system(cmd);
|
|
if ( ret == -1 )
|
|
log("build: wget system failed: %s",mstrerror(errno));
|
|
else
|
|
log("build: wget system returned %"INT32"",ret);
|
|
|
|
// unzip it now
|
|
snprintf ( cmd , MAX_URL_LEN+256, "gunzip -f %s" , filename );
|
|
|
|
log("build: wget begin: %s",cmd );
|
|
|
|
ret = system(cmd);
|
|
if ( ret == -1 )
|
|
log("build: gunzip system failed: %s",mstrerror(errno));
|
|
else
|
|
log("build: gunzip system returned %"INT32"",ret);
|
|
|
|
|
|
log("build: done with gunzip");
|
|
|
|
return NULL;
|
|
}
|
|
|
|
// come back here
|
|
void systemDoneWrapper ( void *state , ThreadEntry *t ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// we download large files to a file on disk, like warcs and arcs
|
|
FILE *XmlDoc::getUtf8ContentInFile () {
|
|
|
|
setStatus ("wgetting archive file");
|
|
|
|
// if ( m_calledWgetThread ) {
|
|
|
|
// char filename[2048];
|
|
// snprintf ( filename,
|
|
// 2048,
|
|
// "gbarchivefile%"UINT32"",
|
|
// (int32_t)(int64_t)this);
|
|
|
|
// m_file.set ( g_hostdb.m_dir , filename );
|
|
// m_fileSize = m_file.getFileSize();
|
|
// m_fileValid = true;
|
|
// *fileSizeArg = m_fileSize;
|
|
// m_file.open(O_RDONLY);
|
|
// // explicitly set it to false now to make it harder for
|
|
// // it not to be true because that messes things up
|
|
// m_file.m_usePartFiles = false;
|
|
// return &m_file;
|
|
// }
|
|
|
|
// before calling the system wget thread we gotta set the cookiebuf
|
|
// HACK: for archive.org
|
|
// if getting a page from archive.org then append the cookie
|
|
// so we have the proper permissions
|
|
static bool s_triedToLoadCookie = false;
|
|
char *x = m_firstUrl.getUrl();
|
|
	// only go out 25 chars looking for the start of .archive.org/
|
|
char *xend = x + 25;
|
|
bool isArchiveOrg = false;
|
|
for ( ; x < xend && *x ; x++ ) {
|
|
if ( x[ 0] != '.' && x[0] != '/' ) continue; // /archive.org?
|
|
if ( x[ 1] != 'a' ) continue;
|
|
if ( x[ 2] != 'r' ) continue;
|
|
if ( x[ 3] != 'c' ) continue;
|
|
if ( x[ 4] != 'h' ) continue;
|
|
if ( x[ 5] != 'i' ) continue;
|
|
if ( x[ 6] != 'v' ) continue;
|
|
if ( x[ 7] != 'e' ) continue;
|
|
if ( x[ 8] != '.' ) continue;
|
|
if ( x[ 9] != 'o' ) continue;
|
|
if ( x[10] != 'r' ) continue;
|
|
if ( x[11] != 'g' ) continue;
|
|
if ( x[12] != '/' ) continue;
|
|
isArchiveOrg = true;
|
|
break;
|
|
}
|
|
|
|
if ( isArchiveOrg && ! s_triedToLoadCookie ) {
|
|
// try to load it up if haven't tried yet
|
|
s_triedToLoadCookie = true;
|
|
SafeBuf tmp;
|
|
//int32_t loaded = tmp.load ( "/home/mwells/.config/internetarchive.yml");
|
|
int32_t loaded = tmp.load ( "auth/internetarchive.yml");
|
|
if(loaded <= 0) {
|
|
if ( ! g_errno ) g_errno = EDOCTOOBIG;
|
|
log("gb: failed to load auth/internetarchive.yml: "
|
|
"%s",mstrerror(g_errno));
|
|
// do not restart gb in a loop, so return 0 to shell
|
|
exit(0);
|
|
//return NULL;
|
|
// FIXME
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
char *s = tmp.getBufStart();
|
|
char *line;
|
|
		char *lineEnd = NULL;
|
|
line = strstr ( s , "logged-in-user: " );
|
|
if ( line ) lineEnd = strstr(line,"\n");
|
|
		if ( line && lineEnd ) {
|
|
s_cookieBuf.safePrintf("logged-in-user=");
|
|
line += 16;
|
|
s_cookieBuf.safeMemcpy(line,lineEnd-line);
|
|
s_cookieBuf.pushChar(';');
|
|
s_cookieBuf.pushChar(' ');
|
|
s_cookieBuf.nullTerm();
|
|
}
|
|
line = strstr ( s , "logged-in-sig: " );
|
|
		lineEnd = NULL;
		if ( line ) lineEnd = strstr(line,"\n");
|
|
		if ( line && lineEnd ) {
|
|
s_cookieBuf.safePrintf("logged-in-sig=");
|
|
line += 15;
|
|
s_cookieBuf.safeMemcpy(line,lineEnd-line);
|
|
//s_cookieBuf.pushChar(';');
|
|
//s_cookieBuf.pushChar(' ');
|
|
s_cookieBuf.nullTerm();
|
|
}
|
|
}
|
|
|
|
// if we loaded something use it
|
|
if ( isArchiveOrg && s_cookieBuf.length() ) {
|
|
//cookie = s_cookieBuf.getBufStart();
|
|
log("http: using archive cookie %s",s_cookieBuf.getBufStart());
|
|
// and set user-agent too
|
|
// userAgent = "python-requests/2.3.0 "
|
|
// "CPython/2.7.3 Linux/3.5.0-32-generic";
|
|
}
|
|
|
|
char cmd[MAX_URL_LEN+256];
|
|
snprintf( cmd,
|
|
MAX_URL_LEN+256,
|
|
"set -o pipefail|"
|
|
"wget --limit-rate=10M -O- --header=\"Cookie: %s\" \"%s\"|" //
|
|
"zcat|"
|
|
"mbuffer -t -m 10M -o-", //this is useful but we need a new version of mbuffer -W 30
|
|
s_cookieBuf.getBufStart() ,
|
|
m_firstUrl.getUrl());
|
|
|
|
log("build: wget: %s",cmd );
|
|
|
|
FILE* fh = gbpopen(cmd);
|
|
|
|
int fd = fileno(fh);
|
|
int flags = fcntl(fd, F_GETFL, 0);
|
|
if(fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
|
|
log("build: could not set wget stream to nonblocking %s",
|
|
m_firstUrl.getUrl());
|
|
//error
|
|
}
|
|
|
|
if(!g_loop.registerReadCallback ( fd,
|
|
this ,
|
|
doneReadingArchiveFileWrapper,
|
|
m_niceness )) {
|
|
log("build: failed to register warc read callback." );
|
|
return NULL;
|
|
}
|
|
m_registeredWgetReadCallback = true;
|
|
|
|
|
|
log("build: called popen");
|
|
|
|
m_calledWgetThread = true;
|
|
m_hasMoreToRead = true;
|
|
|
|
return fh;
|
|
|
|
// return getUtf8ContentInFile ( fileSizeArg );
|
|
|
|
|
|
// . callThread returns true on success, in which case we block
|
|
// if ( g_threads.call ( FILTER_THREAD ,
|
|
// MAX_NICENESS ,
|
|
// (void *)this , // this
|
|
// systemDoneWrapper ,
|
|
// systemStartWrapper_r ) )
|
|
// // would block, wait for thread
|
|
// return (BigFile *)-1;
|
|
// // failed?
|
|
// log("build: failed to launch wget thread");
|
|
// If we run it in this thread then if we are fetching
|
|
// a local url it will block forever.
|
|
// systemStartWrapper_r(this,NULL);
|
|
// return getUtf8ContentInFile ( fileSizeArg );
|
|
//g_errno = ETHREADSDISABLED;
|
|
|
|
//return NULL;
|
|
}
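
// The popen() pipeline built above, expanded for a typical archive.org warc
// url (hypothetical url, cookie values elided), looks roughly like:
//
//   set -o pipefail; wget --limit-rate=10M -O- \
//     --header="Cookie: logged-in-user=...; logged-in-sig=..." \
//     "https://archive.org/download/.../foo.warc.gz" | zcat | \
//     mbuffer -t -m 10M -o-
//
// The read end of the pipe is put into O_NONBLOCK mode and registered with
// g_loop, so the decompressed warc stream is consumed incrementally by
// doneReadingArchiveFileWrapper() rather than being staged on disk first.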
|
|
|
|
// . get the final utf8 content of the document
|
|
// . all html entities are replaced with utf8 chars
|
|
// . all iframes are expanded
|
|
// . if we are using diffbot then getting the utf8 content should return
|
|
// the json which is the output from the diffbot api. UNLESS we are getting
|
|
// the webpage itself for harvesting outlinks to spider later.
|
|
char **XmlDoc::getUtf8Content ( ) {
|
|
|
|
// if we already computed it, return that
|
|
if ( m_utf8ContentValid ) return &ptr_utf8Content;
|
|
|
|
if ( m_setFromTitleRec ) {
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
setStatus("getting utf8 content");
|
|
|
|
// recycle?
|
|
if ( cr->m_recycleContent || m_recycleContent ||
|
|
// if trying to delete from index, load from old titlerec
|
|
m_deleteFromIndex ) {
|
|
// get the old xml doc from the old title rec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
|
|
		// shortcut
|
|
XmlDoc *od = *pod;
|
|
// this is non-NULL if it existed
|
|
if ( od ) {
|
|
ptr_utf8Content = od-> ptr_utf8Content;
|
|
size_utf8Content = od->size_utf8Content;
|
|
m_utf8ContentValid = true;
|
|
m_contentType = od->m_contentType;
|
|
m_contentTypeValid = true;
|
|
// sanity check
|
|
if ( ptr_utf8Content &&
|
|
ptr_utf8Content[size_utf8Content-1] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
return &ptr_utf8Content;
|
|
}
|
|
// if could not find title rec and we are docid-based then
|
|
// we can't go any further!!
|
|
if ( m_setFromDocId ||
|
|
// it should be there if trying to delete as well!
|
|
m_deleteFromIndex ) {
|
|
log("xmldoc: null utf8 content for docid-based "
|
|
"titlerec (d=%"INT64") lookup which was not found",
|
|
m_docId);
|
|
ptr_utf8Content = NULL;
|
|
size_utf8Content = 0;
|
|
m_utf8ContentValid = true;
|
|
m_contentType = CT_HTML;
|
|
m_contentTypeValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
char **ep = getExpandedUtf8Content();
|
|
if ( ! ep || ep == (void *)-1 ) return ep;
|
|
|
|
// NULL out if no content
|
|
if ( ! *ep ) {
|
|
ptr_utf8Content = NULL;
|
|
size_utf8Content = 0;
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
|
|
|
|
// if we have a json reply, leave it alone... expanding a "
|
|
// into a double quote will mess up the JSON!
|
|
if ( *ct == CT_JSON ) {
|
|
ptr_utf8Content = (char *)m_expandedUtf8Content;
|
|
size_utf8Content = m_expandedUtf8ContentSize;
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
|
|
|
|
// why would the spider proxy, who use msg13.cpp to call
|
|
// XmlDoc::getExpandedUtf8Content() want to call this??? it seems
|
|
// to destroy expandedutf8content with a call to htmldecode
|
|
if ( m_isSpiderProxy ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// not if rss file extension
|
|
//bool isRSSExt = false;
|
|
//char *ext = m_firstUrl.getExtension();
|
|
//if ( ext && strcasecmp(ext,"rss") == 0 ) isRSSExt = true;
|
|
//if ( ext && strcasecmp(ext,"xml") == 0 ) isRSSExt = true;
|
|
//if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
|
|
|
|
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( m_contentTypeValid && m_contentType == CT_XML ) isRSSExt = true;
|
|
|
|
// convert < to <gb and > to gb/> ???? and & to utf32 char
|
|
// for a double wide ampersand?
|
|
//bool doSpecial = true;
|
|
// convert to what it should be if we are an .rss file extension
|
|
//if ( isRSSExt ) doSpecial = false;
|
|
|
|
	// sanity check
|
|
if ( m_xmlValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_wordsValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
//
|
|
// convert illegal utf8 characters into spaces
|
|
//
|
|
// fixes santaclarachorale.vbotickets.com/tickets/g.f._handels_israel_in_egypt/1062
|
|
// which has a 228,0x80,& sequence (3 chars, last is ascii)
|
|
uint8_t *x = (uint8_t *)m_expandedUtf8Content;
|
|
char size;
|
|
for ( ; *x ; x += size ) {
|
|
QUICKPOLL(m_niceness);
|
|
size = getUtf8CharSize(x);
|
|
// ok, make it a space i guess if it is a bad utf8 char
|
|
if ( ! isSaneUtf8Char(x) ) {
|
|
*x = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
// skip if only one byte
|
|
if ( size == 1 ) continue;
|
|
// now each byte in the sequence must have 0x80 set...
|
|
if ( ! (x[1] & 0x80) ) {
|
|
x[0] = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
if ( size == 2 ) continue;
|
|
if ( ! (x[2] & 0x80) ) {
|
|
x[0] = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
if ( size == 3 ) continue;
|
|
if ( ! (x[3] & 0x80) ) {
|
|
x[0] = ' ';
|
|
size = 1;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// sanity
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if we are an xml doc, then before we call htmlDecode translate
|
|
// all tags like <title> or <link> to <gbtitle> or <gblink> so we
|
|
	// know they are xml tags. because stuff like &lt;br&gt; will
|
|
// become <br> and will be within its xml tag like <gbdescription>
|
|
// or <gbtitle>.
|
|
// MDW: 9/28/2014. no longer do this since i added hashXmlFields().
|
|
/*
|
|
if ( m_contentType == CT_XML ) {
|
|
// count the xml tags
|
|
char *p = m_expandedUtf8Content;
|
|
char *pend = p + m_expandedUtf8ContentSize - 1;
|
|
int32_t need = m_expandedUtf8ContentSize;
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( *p == '<' ) need += 5; // for adding "gbxml"
|
|
}
|
|
if ( ! m_xbuf.reserve(need) ) return NULL;
|
|
// reset ptr
|
|
p = m_expandedUtf8Content;
|
|
		// point to dst
|
|
char *dst = m_xbuf.getBufStart();
|
|
// do the copy
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// copy it over
|
|
*dst++ = *p;
|
|
if ( *p != '<' ) continue;
|
|
// if <?xml> copy over as is, do not insert 'gb'
|
|
if ( p[1] == '?' ) continue;
|
|
// same for comments <!--...-->
|
|
if ( p[1] == '!' ) continue;
|
|
// point to tagname
|
|
char *tagName = p+1;
|
|
if ( p[1] == '/' ) tagName++;
|
|
// also get the full node now
|
|
NodeType *nt; getTagId ( tagName , &nt );
|
|
// if it is not an html tag, do not fuss with it!
|
|
if ( ! nt ) continue;
|
|
// if its in the list but is xml, let it go too
|
|
if ( nt->m_isXmlTag ) continue;
|
|
// . otherwise, its an html tag being used as an xml
|
|
// tag and we need to encode (append gbxml to it)
|
|
// . insert / first if there
|
|
if ( p[1] == '/' ) {p++;*dst++ = *p;}
|
|
// then "gb"
|
|
*dst++ = 'g';
|
|
*dst++ = 'b';
|
|
*dst++ = 'x';
|
|
*dst++ = 'm';
|
|
*dst++ = 'l';
|
|
}
|
|
// update
|
|
m_xbuf.m_length = dst - m_xbuf.getBufStart();
|
|
// final \0
|
|
*dst = '\0';
|
|
// re-assign these
|
|
m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf;
|
|
m_expandedUtf8ContentSize = m_xbuf.m_length + 1;
|
|
// free esbuf if we were referencing that to save mem
|
|
m_esbuf.purge();
|
|
}
|
|
*/
|
|
|
|
	// richmondspca.org has &quot; in some tags and we do not like
|
|
	// expanding that to a literal " because it messes up XmlNode::getTagLen()
|
|
// and creates big problems. same for www.first-avenue.com. so
|
|
// by setting doSpecial to try we change < > and " to
|
|
// [ ] and ' which have no meaning in html per se.
|
|
bool doSpecial = true;
|
|
if ( m_contentType == CT_XML ) doSpecial = false;
|
|
|
|
// . now decode those html entites into utf8 so that we never have to
|
|
// check for html entities anywhere else in the code. a big win!!
|
|
// . doSpecial = true, so that <, >, & and " are
|
|
// encoded into high value
|
|
// utf8 chars so that Xml::set(), etc. still work properly and don't
|
|
// add any more html tags than it should
|
|
// . this will decode in place
|
|
// . MDW: 9/28/2014. no longer do for xml docs since i added
|
|
// hashXmlFields()
|
|
int32_t n = m_expandedUtf8ContentSize - 1;
|
|
if ( m_contentType != CT_XML )
|
|
n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
|
|
m_expandedUtf8Content,//ptr_utf8Content,
|
|
m_expandedUtf8ContentSize-1,//size_utf8Con
|
|
doSpecial,
|
|
m_niceness);
|
|
|
|
// can't exceed this! n does not include the final \0 even though
|
|
	// we do write it out.
|
|
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
|
|
// sanity
|
|
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now rss has crap in it like "&nbsp;" so we have to do another
|
|
// decoding pass
|
|
// . MDW: 9/28/2014. no longer do for xml docs since i added
|
|
// hashXmlFields()
|
|
// if ( m_contentType == CT_XML ) // isRSSExt )
|
|
// n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
|
|
// m_expandedUtf8Content,//ptr_utf8Content,
|
|
// n,
|
|
// false,//doSpecial,
|
|
// m_niceness);
|
|
// sanity
|
|
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
|
|
// sanity
|
|
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
|
|
// finally transform utf8 apostrophe's into regular apostrophes
|
|
// to make parsing easier
|
|
uint8_t *p = (uint8_t *)m_expandedUtf8Content;
|
|
uint8_t *dst = (uint8_t *)m_expandedUtf8Content;
|
|
uint8_t *pend = p + n;
|
|
for ( ; *p ; p += size ) {
|
|
QUICKPOLL(m_niceness);
|
|
size = getUtf8CharSize(p);
|
|
// quick copy
|
|
if ( size == 1 && p[0] != '<' ) { *dst++ = *p; continue; }
|
|
// make "1<super>st</super>" into "1st" so Dates.cpp can
|
|
// have an easier time
|
|
if ( p[0] == '<' &&
|
|
to_lower_a(p[1]) == 's' &&
|
|
to_lower_a(p[2]) == 'u' &&
|
|
to_lower_a(p[3]) == 'p' ) {
|
|
// assume no go!
|
|
*dst++ = '<';
|
|
// use this
|
|
char *s = (char *)p;
|
|
// did number preceed?
|
|
char *pn = s - 1;
|
|
for (;pn>=m_expandedUtf8Content&&is_wspace_a(*pn);pn--)
|
|
QUICKPOLL(m_niceness);
|
|
// must be like "1st" or "32nd"
|
|
if ( ! is_digit(*pn) ) continue;
|
|
// skip the "<sup"
|
|
s += 4;
|
|
// skip until >
|
|
for (; *s && *s != '>' ; s++ )
|
|
QUICKPOLL(m_niceness);
|
|
// crazy?
|
|
if ( ! *s ) continue;
|
|
// skip the '>'
|
|
s++;
|
|
// skip spaces after the "<sup>" tag
|
|
for (; *s && is_wspace_a(*s) ; s++ )
|
|
QUICKPOLL(m_niceness);
|
|
// crazy?
|
|
if ( ! *s ) continue;
|
|
// check for "st" etc
|
|
bool gotIt = false;
|
|
char *suffix = s;
|
|
if ( (to_lower_a(s[0])=='s'&&to_lower_a(s[1]) == 't')||
|
|
(to_lower_a(s[0])=='n'&&to_lower_a(s[1]) == 'd')||
|
|
(to_lower_a(s[0])=='r'&&to_lower_a(s[1]) == 'd')||
|
|
(to_lower_a(s[0])=='t'&&to_lower_a(s[1]) == 'h'))
|
|
gotIt = true;
|
|
if ( ! gotIt ) continue;
|
|
// skip that
|
|
s += 2;
|
|
// skip more spaces
|
|
for (; *s && is_wspace_a(*s) ; s++ )
|
|
QUICKPOLL(m_niceness);
|
|
// crazy?
|
|
if ( ! *s ) continue;
|
|
			// find the </sup> end tag
|
|
if ( s[0] != '<' ) continue;
|
|
if ( s[1] != '/' ) continue;
|
|
if ( to_lower_a(s[2]) != 's' ) continue;
|
|
if ( to_lower_a(s[3]) != 'u' ) continue;
|
|
if ( to_lower_a(s[4]) != 'p' ) continue;
|
|
if ( s[5] != '>' ) continue;
|
|
// skip it, point to >
|
|
s += 5;
|
|
// assign p to that
|
|
p = (unsigned char *)s;
|
|
			// back up over the no-go
|
|
dst--;
|
|
// rewrite it
|
|
*dst++ = to_lower_a(suffix[0]);
|
|
*dst++ = to_lower_a(suffix[1]);
|
|
// do next round
|
|
continue;
|
|
}
|
|
|
|
|
|
// check for crazy apostrophes
|
|
if ( p[0]==0xe2 &&
|
|
p[1]==0x80 &&
|
|
(p[2]==0x99 ||
|
|
p[2]==0x98 ||
|
|
p[2]==0x9b ) ) {
|
|
*dst++ = '\'';
|
|
continue;
|
|
}
|
|
// utf8 control character?
|
|
if ( p[0] == 0xc2 &&
|
|
p[1] >= 0x80 &&
|
|
p[1] <= 0x9f ) {
|
|
*dst++ = ' ';
|
|
continue;
|
|
}
|
|
// double quotes in utf8
|
|
// DO NOT do this if type JSON!! json uses quotes as
|
|
// control characters
|
|
if ( p[0] == 0xe2 &&
|
|
p[1] == 0x80 &&
|
|
m_contentType != CT_JSON ) {
|
|
if (p[2] == 0x9c ) {
|
|
*dst++ = '\"';
|
|
continue;
|
|
}
|
|
if (p[2] == 0x9d ) {
|
|
*dst++ = '\"';
|
|
continue;
|
|
}
|
|
}
|
|
// and crazy hyphens (8 - 10pm)
|
|
if ( p[0]==0xc2 &&
|
|
p[1]==0xad ) {
|
|
*dst++ = '-';
|
|
continue;
|
|
}
|
|
if ( p[0]==0xe2 &&
|
|
p[1]==0x80 &&
|
|
p[2]==0x93 ) {
|
|
*dst++ = '-';
|
|
continue;
|
|
}
|
|
if ( p[0]==0xe2 &&
|
|
p[1]==0x80 &&
|
|
p[2]==0x94 ) {
|
|
*dst++ = '-';
|
|
continue;
|
|
}
|
|
// . convert all utf8 white space to ascii white space
|
|
// . should benefit the string matching algo in
|
|
// XmlDoc::getEventSummary() which needs to skip spaces
|
|
if ( ! g_map_is_ascii[(unsigned char)*p] &&
|
|
is_wspace_utf8(p) ) {
|
|
*dst++ = ' ';
|
|
continue;
|
|
}
|
|
// otherwise, just copy it
|
|
gbmemcpy(dst,p,size);
|
|
dst += size;
|
|
}
|
|
// null term
|
|
*dst++ = '\0';
|
|
|
|
// now set it up
|
|
ptr_utf8Content = (char *)m_expandedUtf8Content;
|
|
//size_utf8Content = n+1;//m_expandedUtf8ContentSize;
|
|
size_utf8Content = (char *)dst - m_expandedUtf8Content;
|
|
|
|
// sanity -- skipped over the \0???
|
|
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity check
|
|
if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
m_utf8ContentValid = true;
|
|
return &ptr_utf8Content;
|
|
}
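
// Summary of the byte-level normalizations applied by the copy loop above
// (inputs are utf8 sequences, outputs are single ascii bytes):
//
//   E2 80 98 / 99 / 9B   curly apostrophes        -> '
//   E2 80 9C / 9D        curly double quotes      -> "   (skipped for JSON)
//   C2 80 .. C2 9F       C1 control characters    -> ' '
//   C2 AD                soft hyphen              -> -
//   E2 80 93 / 94        en/em dash               -> -
//   other utf8 whitespace                         -> ' '
//   digit + <sup>st|nd|rd|th</sup>                -> plain "1st", "2nd", ...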
|
|
|
|
// *pend should be \0
|
|
int32_t getContentHash32Fast ( unsigned char *p ,
|
|
int32_t plen ,
|
|
int32_t niceness ) {
|
|
// sanity
|
|
if ( ! p ) return 0;
|
|
if ( plen <= 0 ) return 0;
|
|
if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
unsigned char *pend = p + plen;
|
|
|
|
static bool s_init = false;
|
|
static char s_qtab0[256];
|
|
static char s_qtab1[256];
|
|
static char s_qtab2[256];
|
|
static char *s_skips[] = {
|
|
"jan",
|
|
"feb",
|
|
"mar",
|
|
"apr",
|
|
"may",
|
|
"jun",
|
|
"jul",
|
|
"aug",
|
|
"sep",
|
|
"oct",
|
|
"nov",
|
|
"dec",
|
|
"sun",
|
|
"mon",
|
|
"tue",
|
|
"wed",
|
|
"thu",
|
|
"fri",
|
|
"sat" };
|
|
if ( ! s_init ) {
|
|
// only call this crap once
|
|
s_init = true;
|
|
// clear up
|
|
memset(s_qtab0,0,256);
|
|
memset(s_qtab1,0,256);
|
|
memset(s_qtab2,0,256);
|
|
for ( int32_t i = 0 ; i < 19 ; i++ ) {
|
|
unsigned char *s = (unsigned char *)s_skips[i];
|
|
s_qtab0[(unsigned char)to_lower_a(s[0])] = 1;
|
|
s_qtab0[(unsigned char)to_upper_a(s[0])] = 1;
|
|
// do the quick hash
|
|
unsigned char qh = to_lower_a(s[0]);
|
|
qh ^= to_lower_a(s[1]);
|
|
qh <<= 1;
|
|
qh ^= to_lower_a(s[2]);
|
|
s_qtab1[qh] = 1;
|
|
// try another hash, the swift hash
|
|
unsigned char sh = to_lower_a(s[0]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(s[1]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(s[2]);
|
|
s_qtab2[sh] = 1;
|
|
}
|
|
}
|
|
|
|
bool lastWasDigit = false;
|
|
bool lastWasPunct = true;
|
|
uint32_t h = 0LL;
|
|
//char size = 0;
|
|
unsigned char pos = 0;
|
|
for ( ; p < pend ; p++ ) { // += size ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// get size
|
|
// this might not be utf8!!!
|
|
//size = getUtf8CharSize(p);
|
|
// skip if not alnum
|
|
// this might not be utf8!!!
|
|
//if ( ! is_alnum_utf8 ( (char *)p ) ) {
|
|
if ( ! is_alnum_a ( *p ) ) {
|
|
lastWasDigit = false;
|
|
lastWasPunct = true;
|
|
continue;
|
|
}
|
|
// if its a digit, call it 1
|
|
if ( is_digit(*p) ) {
|
|
// skip consecutive digits
|
|
if ( lastWasDigit ) continue;
|
|
// xor in a '1'
|
|
h ^= g_hashtab[pos][(unsigned char)'1'];
|
|
pos++;
|
|
lastWasDigit = true;
|
|
continue;
|
|
}
|
|
// reset
|
|
lastWasDigit = false;
|
|
|
|
// exclude days of the month or week so clocks do
|
|
// not affect this hash
|
|
if ( s_qtab0[p[0]] && lastWasPunct && p[1] && p[2] ) {
|
|
// quick hash
|
|
unsigned char qh = to_lower_a(p[0]);
|
|
qh ^= to_lower_a(p[1]);
|
|
qh <<= 1;
|
|
qh ^= to_lower_a(p[2]);
|
|
// look that up
|
|
if ( ! s_qtab1[qh] ) goto skip;
|
|
// try another hash, the swift hash
|
|
unsigned char sh = to_lower_a(p[0]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(p[1]);
|
|
sh <<= 1;
|
|
sh ^= to_lower_a(p[2]);
|
|
if ( ! s_qtab2[sh] ) goto skip;
|
|
// ok, probably a match..
|
|
unsigned char *s = p + 3;
|
|
// skip to end of word
|
|
//char size2;
|
|
//for ( ; s < pend ; s += size2 ) {
|
|
for ( ; s < pend ; s++ ) {
|
|
//size2 = getUtf8CharSize(s);
|
|
//if ( ! is_alnum_utf8 ((char *)s) )
|
|
if ( ! is_alnum_a ( *s ) )
|
|
break;
|
|
}
|
|
			// it already points to the next char, so clear this
|
|
//size = 0;
|
|
// advance p now
|
|
p = s;
|
|
// hash as one type of thing...
|
|
h ^= g_hashtab[pos][(unsigned char)'X'];
|
|
pos++;
|
|
continue;
|
|
}
|
|
|
|
skip:
|
|
// reset this
|
|
lastWasPunct = false;
|
|
// xor this in right
|
|
h ^= g_hashtab[pos][p[0]];
|
|
pos++;
|
|
// assume ascii or latin1
|
|
continue;
|
|
/*
|
|
// one more?
|
|
if ( size == 1 ) continue;
|
|
// do that
|
|
h ^= g_hashtab[pos][p[1]];
|
|
pos++;
|
|
// one more?
|
|
if ( size == 2 ) continue;
|
|
// do that
|
|
h ^= g_hashtab[pos][p[2]];
|
|
pos++;
|
|
// one more?
|
|
if ( size == 3 ) continue;
|
|
// do that
|
|
h ^= g_hashtab[pos][p[3]];
|
|
pos++;
|
|
// that should do it!
|
|
continue;
|
|
*/
|
|
}
|
|
return h;
|
|
}
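
// The rules above make the hash insensitive to dates and counters: runs of
// digits collapse to a single '1' and month/weekday tokens collapse to a
// single 'X'. Illustrative example (not a test):
//
//   "Updated Mon Jan 12 2015"   and   "Updated Tue Feb 3 2016"
//
// both reduce to the same token stream ("Updated" + X + X + 1 + 1) and so
// produce the same hash, while a change to any other word changes it.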
|
|
|
|
int32_t *XmlDoc::getContentHash32 ( ) {
|
|
// return it if we got it
|
|
if ( m_contentHash32Valid ) return &m_contentHash32;
|
|
setStatus ( "getting contenthash32" );
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (int32_t *)ct;
|
|
|
|
// we do not hash the url/resolved_url/html fields in diffbot json
|
|
// because the url field is a mirror of the url and the html field
|
|
// is redundant and would slow us down
|
|
if ( *ct == CT_JSON )
|
|
return getContentHashJson32();
|
|
|
|
// if we are a diffbot json object, fake this for now, it will
|
|
// be set for real in hashJSON()
|
|
	// no, because we call this before hashJSON() to set
|
|
// EDOCUNCHANGED above... so just hash the json normally for now
|
|
//if ( m_isDiffbotJSONObject ) {
|
|
// m_contentHash32 = 0;
|
|
// return &m_contentHash32;
|
|
//}
|
|
|
|
// . get the content. get the pure untouched content!!!
|
|
// . gotta be pure since that is what Msg13.cpp computes right
|
|
// after it downloads the doc...
|
|
// . if iframes are present, msg13 gives up
|
|
char **pure = getContent();
|
|
if ( ! pure || pure == (char **)-1 ) return (int32_t *)pure;
|
|
// size
|
|
//int32_t n = size_utf8Content - 1;
|
|
// hash up to first 10,000 chars
|
|
//if ( n > 10000 ) n = 10000;
|
|
// do it
|
|
//m_contentHash32 = hash32 ( ptr_utf8Content , n );
|
|
unsigned char *p = (unsigned char *)(*pure);
|
|
int32_t plen = m_contentLen;//size_utf8Content - 1;
|
|
|
|
// no content means no hash32
|
|
if ( plen <= 0 ) {//ptr_utf8Content ) {
|
|
m_contentHash32 = 0;
|
|
m_contentHash32Valid = true;
|
|
return &m_contentHash32;
|
|
}
|
|
|
|
// we set m_contentHash32 in ::hashJSON() below because it is special
|
|
// for diffbot since it ignores certain json fields like url: and the
|
|
// fields are independent, and numbers matter, like prices
|
|
//if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; }
|
|
|
|
// *pend should be \0
|
|
m_contentHash32 = getContentHash32Fast ( p , plen , m_niceness );
|
|
// validate
|
|
m_contentHash32Valid = true;
|
|
return &m_contentHash32;
|
|
}
|
|
|
|
// we do not hash the url/resolved_url/html fields in diffbot json
|
|
// because the url field is a mirror of the url and the html field
|
|
// is redundant and would slow us down
|
|
int32_t *XmlDoc::getContentHashJson32 ( ) {
|
|
|
|
if ( m_contentHash32Valid ) return &m_contentHash32;
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (int32_t *)jp;
|
|
|
|
JsonItem *ji = jp->getFirstItem();
|
|
int32_t totalHash32 = 0;
|
|
|
|
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
|
|
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not number or string
|
|
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
|
continue;
|
|
|
|
char *topName = NULL;
|
|
|
|
// what name level are we?
|
|
int32_t numNames = 1;
|
|
JsonItem *pi = ji->m_parent;
|
|
for ( ; pi ; pi = pi->m_parent ) {
|
|
// empty name?
|
|
if ( ! pi->m_name ) continue;
|
|
if ( ! pi->m_name[0] ) continue;
|
|
topName = pi->m_name;
|
|
numNames++;
|
|
}
|
|
|
|
// if we are the diffbot reply "html" field do not hash this
|
|
// because it is redundant and it hashes html tags etc.!
|
|
// plus it slows us down a lot and bloats the index.
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"html") == 0 )
|
|
continue;
|
|
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"url") == 0 )
|
|
continue;
|
|
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"pageUrl") == 0 )
|
|
continue;
|
|
|
|
// mike will track down how the hash works in article|3|123456
|
|
//if ( ji->m_name && numNames==1 &&
|
|
// strcmp(ji->m_name,"diffbotUri") == 0 )
|
|
// continue;
|
|
|
|
if ( ji->m_name && numNames==1 &&
|
|
strcmp(ji->m_name,"resolved_url") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"stats") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"queryString") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"nextPages") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"textAnalysis") == 0 )
|
|
continue;
|
|
|
|
if ( topName && strcmp(topName,"links") == 0 )
|
|
continue;
|
|
|
|
|
|
// hash the fully compound name
|
|
int32_t nameHash32 = 0;
|
|
JsonItem *p = ji;
|
|
char *lastName = NULL;
|
|
for ( ; p ; p = p->m_parent ) {
|
|
// empty name?
|
|
if ( ! p->m_name ) continue;
|
|
if ( ! p->m_name[0] ) continue;
|
|
// dup? can happen with arrays. parent of string
|
|
// in object, has same name as his parent, the
|
|
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
|
|
if ( p->m_name == lastName ) continue;
|
|
// update
|
|
lastName = p->m_name;
|
|
// hash it up
|
|
nameHash32 = hash32(p->m_name,p->m_nameLen,nameHash32);
|
|
}
|
|
|
|
//
|
|
// now Json.cpp decodes and stores the value into
|
|
// a buffer, so ji->getValue() should be decoded completely
|
|
//
|
|
|
|
// . get the value of the json field
|
|
// . if it's a number or bool it converts into a string
|
|
int32_t vlen;
|
|
char *val = ji->getValueAsString( &vlen );
|
|
|
|
//
|
|
// for deduping search results we set m_contentHash32 here for
|
|
// diffbot json objects.
|
|
//
|
|
// we use this hash for setting EDOCUNCHANGED when reindexing
|
|
// a diffbot reply. we also use to see if the diffbot reply
|
|
// is a dup with another page in the index. thirdly, we use
|
|
// to dedup search results, which could be redundant because
|
|
// of our spider-time deduping.
|
|
//
|
|
// make the content hash so we can set m_contentHash32
|
|
// for deduping. do an exact hash for now...
|
|
int32_t vh32 = hash32 ( val , vlen , m_niceness );
|
|
// combine
|
|
int32_t combined32 = hash32h ( nameHash32 , vh32 );
|
|
// accumulate field/val pairs order independently
|
|
totalHash32 ^= combined32;
|
|
// debug note
|
|
//logf(LOG_DEBUG,"ch32: field=%s nh32=%"UINT32" vallen=%"INT32"",
|
|
// ji->m_name,
|
|
// nameHash32,
|
|
// vlen);
|
|
}
|
|
|
|
m_contentHash32 = totalHash32;
|
|
m_contentHash32Valid = true;
|
|
return &m_contentHash32;
|
|
}
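
// Field/value hashes are combined with XOR, so the result is independent of
// field order, and the url/pageUrl/resolved_url/html fields plus the
// stats/queryString/nextPages/textAnalysis/links subtrees never contribute.
// Illustrative only (hypothetical diffbot replies):
//
//   { "title":"Foo", "text":"Bar", "url":"http://a.com/x" }
//   { "url":"http://b.com/y", "text":"Bar", "title":"Foo" }
//
// hash to the same m_contentHash32, since only title and text are hashed
// and XOR ignores their order.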
|
|
|
|
// do not consider tags except frame and iframe... make all months
|
|
// and days of weeks and digits basically the same
|
|
int64_t *XmlDoc::getLooseContentHash64 ( ) {
|
|
|
|
if ( m_looseContentHash64Valid )
|
|
return &m_looseContentHash64;
|
|
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int64_t *)xml;
|
|
|
|
int64_t h64 = 0LL;
|
|
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes ();
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// skip if not the right kinda tag
|
|
if ( nodes[i].isTag() &&
|
|
nodes[i].getNodeId() != TAG_FRAME &&
|
|
nodes[i].getNodeId() != TAG_IFRAME &&
|
|
nodes[i].getNodeId() != TAG_IMG )
|
|
continue;
|
|
|
|
// hash that node up
|
|
int64_t ch64;
|
|
|
|
// this is really a 32-bit hash
|
|
ch64=getContentHash32Fast((unsigned char *)nodes[i].getNode() ,
|
|
nodes[i].getNodeLen() ,
|
|
m_niceness );
|
|
|
|
// incorporate hash from that node
|
|
h64 = hash64h ( ch64 , h64 );
|
|
}
|
|
|
|
m_looseContentHash64Valid = true;
|
|
m_looseContentHash64 = h64;
|
|
return &m_looseContentHash64;
|
|
}
|
|
|
|
int32_t XmlDoc::getHostHash32a ( ) {
|
|
if ( m_hostHash32aValid ) return m_hostHash32a;
|
|
m_hostHash32aValid = true;
|
|
Url *f = getFirstUrl();
|
|
m_hostHash32a = f->getHostHash32();
|
|
return m_hostHash32a;
|
|
}
|
|
|
|
int32_t XmlDoc::getHostHash32b ( ) {
|
|
if ( m_hostHash32bValid ) return m_hostHash32b;
|
|
m_hostHash32bValid = true;
|
|
Url *c = getCurrentUrl();
|
|
m_hostHash32b = c->getHostHash32();
|
|
return m_hostHash32b;
|
|
}
|
|
|
|
int32_t XmlDoc::getDomHash32( ) {
|
|
if ( m_domHash32Valid ) return m_domHash32;
|
|
m_domHash32Valid = true;
|
|
Url *f = getFirstUrl();
|
|
m_domHash32 = hash32 ( f->getDomain(), f->getDomainLen() );
|
|
return m_domHash32;
|
|
}
|
|
|
|
// . this will be the actual pnm data of the image thumbnail
|
|
// . you can inline it in an image tag like
|
|
// <img src="data:image/png;base64,iVBORw0...."/>
|
|
// background-image:url(data:image/png;base64,iVBORw0...);
|
|
// . FORMAT of ptr_imageData:
|
|
// <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg>
|
|
char **XmlDoc::getThumbnailData ( ) {
|
|
if ( m_imageDataValid ) return &ptr_imageData;
|
|
Images *images = getImages();
|
|
if ( ! images || images == (Images *)-1 ) return (char **)images;
|
|
ptr_imageData = NULL;
|
|
size_imageData = 0;
|
|
m_imageDataValid = true;
|
|
if ( ! images || ! images->m_imageBufValid ) return &ptr_imageData;
|
|
if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData;
|
|
// this buffer is a ThumbnailArray
|
|
ptr_imageData = images->m_imageBuf.getBufStart();
|
|
size_imageData = images->m_imageBuf.length();
|
|
return &ptr_imageData;
|
|
}
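
// Minimal sketch (illustrative only, not compiled) of walking one record in
// ptr_imageData as laid out in the FORMAT comment above; in practice the
// buffer is passed around as a ThumbnailArray.
/*
	char    *p           = ptr_imageData;
	char    *origImgUrl  = p;                  // \0-terminated source url
	p += gbstrlen ( origImgUrl ) + 1;
	int32_t  thumbWidth  = *(int32_t *)p; p += 4;
	int32_t  thumbHeight = *(int32_t *)p; p += 4;
	char    *jpgBytes    = p;                  // thumbnail image data
*/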
|
|
|
|
Images *XmlDoc::getImages ( ) {
|
|
if ( m_imagesValid ) return &m_images;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
if ( ! cr->m_makeImageThumbnails ) {
|
|
m_images.reset();
|
|
m_imagesValid = true;
|
|
return &m_images;
|
|
}
|
|
|
|
if ( cr->m_isCustomCrawl ) {
|
|
m_images.reset();
|
|
m_imagesValid = true;
|
|
return &m_images;
|
|
}
|
|
|
|
setStatus ( "getting thumbnail" );
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (Images *)words;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Images *)xml;
|
|
Sections *sections = getSections();
|
|
if ( ! sections || sections==(Sections *)-1) return (Images *)sections;
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (Images *)site;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Images *)d;
|
|
int8_t *hc = getHopCount();
|
|
if ( ! hc || hc == (void *)-1 ) return (Images *)hc;
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (Images *)cu;
|
|
|
|
// . this does not block or anything
|
|
// . if we are a diffbot json reply it should just use the primary
|
|
// image, if any, as the only candidate
|
|
m_images.setCandidates ( cu , words , xml , sections , this );
|
|
|
|
setStatus ("getting thumbnail");
|
|
|
|
// assume valid
|
|
m_imagesValid = true;
|
|
|
|
// now get the thumbnail
|
|
if ( ! m_images.getThumbnail ( site ,
|
|
gbstrlen(site) ,
|
|
*d ,
|
|
this ,
|
|
cr->m_collnum ,
|
|
//NULL , // statusPtr ptr
|
|
*hc ,
|
|
m_masterState,
|
|
m_masterLoop ) )
|
|
return (Images *)-1;
|
|
|
|
return &m_images;
|
|
}
|
|
|
|
|
|
// . get different attributes of the Links as vectors
|
|
// . these are 1-1 with the Links::m_linkPtrs[] array
|
|
TagRec ***XmlDoc::getOutlinkTagRecVector () {
|
|
|
|
// if page has a <meta name=usefakeips content=1> tag
|
|
// then use the hash of the links host as the firstip.
|
|
// this will speed things up when adding a gbdmoz.urls.txt.*
|
|
// file to index every url in dmoz.
|
|
char *useFakeIps = hasFakeIpsMetaTag();
|
|
if ( ! useFakeIps || useFakeIps == (void *)-1 )
|
|
return (TagRec ***)useFakeIps;
|
|
|
|
// no error and valid, return quick
|
|
if ( m_outlinkTagRecVectorValid && *useFakeIps )
|
|
return &m_outlinkTagRecVector;
|
|
|
|
// error?
|
|
if ( m_outlinkTagRecVectorValid && m_msge0.m_errno ) {
|
|
g_errno = m_msge0.m_errno;
|
|
return NULL;
|
|
}
|
|
|
|
// if not using fake ips, give them the real tag rec vector
|
|
if ( m_outlinkTagRecVectorValid )
|
|
return &m_msge0.m_tagRecPtrs;
|
|
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (void *) -1 ) return (TagRec ***)links;
|
|
|
|
if ( *useFakeIps ) {
|
|
// set to those
|
|
m_fakeTagRec.reset();
|
|
// just make a bunch ptr to empty tag rec
|
|
int32_t need = links->m_numLinks * sizeof(TagRec *);
|
|
if ( ! m_fakeTagRecPtrBuf.reserve ( need ) ) return NULL;
|
|
// make them all point to the fake empty tag rec
|
|
TagRec **grv = (TagRec **)m_fakeTagRecPtrBuf.getBufStart();
|
|
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ )
|
|
grv[i] = &m_fakeTagRec;
|
|
// set it
|
|
m_outlinkTagRecVector = grv;
|
|
m_outlinkTagRecVectorValid = true;
|
|
return &m_outlinkTagRecVector;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
|
|
// update status msg
|
|
setStatus ( "getting outlink tag rec vector" );
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (TagRec ***)gr;
|
|
// assume valid
|
|
m_outlinkTagRecVectorValid = true;
|
|
// go get it
|
|
if ( ! m_msge0.getTagRecs ( links->m_linkPtrs ,
|
|
links->m_linkFlags ,
|
|
links->m_numLinks ,
|
|
false , // skip old?
|
|
// make it point to this basetagrec if
|
|
// the LF_SAMEHOST flag is set for the link
|
|
gr ,
|
|
cr->m_collnum ,
|
|
m_niceness ,
|
|
m_masterState ,
|
|
m_masterLoop )) {
|
|
// sanity check
|
|
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
|
|
// we blocked
|
|
return (TagRec ***)-1;
|
|
}
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// or this?
|
|
if ( m_msge0.m_errno ) {
|
|
g_errno = m_msge0.m_errno;
|
|
return NULL;
|
|
}
|
|
// set it
|
|
//m_outlinkTagRecVector = m_msge0.m_tagRecPtrs;
|
|
// ptr to a list of ptrs to tag recs
|
|
return &m_msge0.m_tagRecPtrs;
|
|
}
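// . summary: the returned vector is 1-1 with links->m_linkPtrs[]; when the
//   page has the usefakeips meta tag every entry just points at the shared
//   empty m_fakeTagRec, otherwise the entries come from Msge0's tagdb lookups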
|
|
|
|
char *XmlDoc::hasNoIndexMetaTag() {
|
|
if ( m_hasNoIndexMetaTagValid )
|
|
return &m_hasNoIndexMetaTag;
|
|
// assume none
|
|
m_hasNoIndexMetaTag = false;
|
|
// store value/content of meta tag in here
|
|
char mbuf[16];
|
|
mbuf[0] = '\0';
|
|
char *tag = "noindex";
|
|
int32_t tlen = gbstrlen(tag);
|
|
// check the xml for a meta tag
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
if ( mbuf[0] == '1' ) m_hasNoIndexMetaTag = true;
|
|
m_hasNoIndexMetaTagValid = true;
|
|
return &m_hasNoIndexMetaTag;
|
|
}
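// . presumably this matches a tag like <meta name=noindex content=1> ;
//   only the first character of the meta content is checked for '1'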
|
|
|
|
|
|
char *XmlDoc::hasFakeIpsMetaTag ( ) {
|
|
if ( m_hasUseFakeIpsMetaTagValid ) return &m_hasUseFakeIpsMetaTag;
|
|
|
|
char mbuf[16];
|
|
mbuf[0] = '\0';
|
|
char *tag = "usefakeips";
|
|
int32_t tlen = gbstrlen(tag);
|
|
|
|
// check the xml for a meta tag
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
|
|
m_hasUseFakeIpsMetaTag = false;
|
|
if ( mbuf[0] == '1' ) m_hasUseFakeIpsMetaTag = true;
|
|
m_hasUseFakeIpsMetaTagValid = true;
|
|
return &m_hasUseFakeIpsMetaTag;
|
|
}
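// . e.g. a page carrying <meta name=usefakeips content=1> (mentioned above
//   getOutlinkTagRecVector()) makes this return true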
|
|
|
|
|
|
int32_t **XmlDoc::getOutlinkFirstIpVector () {
|
|
|
|
Links *links = getLinks();
|
|
if ( ! links ) return NULL;
|
|
|
|
// if page has a <meta name=usefakeips content=1> tag
|
|
// then use the hash of the links host as the firstip.
|
|
// this will speed things up when adding a gbdmoz.urls.txt.*
|
|
// file to index every url in dmoz.
|
|
char *useFakeIps = hasFakeIpsMetaTag();
|
|
if ( ! useFakeIps || useFakeIps == (void *)-1 )
|
|
return (int32_t **)useFakeIps;
|
|
|
|
if ( *useFakeIps && m_outlinkIpVectorValid )
|
|
return &m_outlinkIpVector;
|
|
|
|
if ( *useFakeIps ) {
|
|
int32_t need = links->m_numLinks * 4;
|
|
if ( ! m_fakeIpBuf.reserve ( need ) ) return NULL;
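// . derive a deterministic "fake" firstip from each outlink's host:
//   the low 32 bits of the 64-bit host hash, so the same host always
//   maps to the same firstip (e.g. a hypothetical host hash of
//   0x1122334455667788 would yield ip 0x55667788)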
|
|
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) {
|
|
uint64_t h64 = links->getHostHash64(i);
|
|
int32_t ip = h64 & 0xffffffff;
|
|
m_fakeIpBuf.pushLong(ip);
|
|
}
|
|
int32_t *ipBuf = (int32_t *)m_fakeIpBuf.getBufStart();
|
|
m_outlinkIpVector = ipBuf;
|
|
m_outlinkIpVectorValid = true;
|
|
return &m_outlinkIpVector;
|
|
}
|
|
|
|
// return msge1's buf otherwise
|
|
if ( m_outlinkIpVectorValid )
|
|
return &m_msge1.m_ipBuf;
|
|
|
|
// should we have some kinda error for msge1?
|
|
//if ( m_outlinkIpVectorValid && m_msge1.m_errno ) {
|
|
// g_errno = m_msge1.m_errno;
|
|
// return NULL;
|
|
//}
|
|
|
|
// . we now scrounge them from TagRec's "firstip" tag if there!
|
|
// . that way even if a domain changes its ip we still use the
|
|
// original ip, because the only reason we need this ip is for
|
|
// deciding which group of hosts will store this SpiderRequest and
|
|
// we use that for throttling, so we have to be consistent!!!
|
|
// . we never add -1 or 0 ips to tagdb though.... (NXDOMAIN,error...)
|
|
// . uses m_msgeForTagRecs for this one
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (int32_t **)grv;
|
|
// note it
|
|
setStatus ( "getting outlink first ip vector" );
|
|
// assume valid
|
|
m_outlinkIpVectorValid = true;
|
|
// sanity check
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
// use this
|
|
int32_t nowGlobal = getSpideredTime();//m_spideredTime;
|
|
// add tags to tagdb?
|
|
bool addTags = true;
|
|
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
|
|
if ( getIsPageParser() ) addTags = false;
|
|
// get this
|
|
char *testDir = getTestDir();
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . go get it
|
|
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
|
|
// see if the ip is in there for the given url hostname
|
|
// . this will now update Tagdb with the "firstip" tags if it should!!
|
|
// . this just dns looks up the DOMAINS of each outlink because these
|
|
// are *first* ips and ONLY used by Spider.cpp for throttling!!!
|
|
if ( ! m_msge1.getFirstIps ( *grv ,
|
|
links->m_linkPtrs ,
|
|
links->m_linkFlags ,
|
|
links->m_numLinks ,
|
|
false , // skip old?
|
|
cr->m_coll ,
|
|
m_niceness ,
|
|
m_masterState ,
|
|
m_masterLoop ,
|
|
nowGlobal ,
|
|
addTags ,
|
|
testDir )) {
|
|
// sanity check
|
|
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
|
|
// we blocked
|
|
return (int32_t **)-1;
|
|
}
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// . ptr to a list of ptrs to tag recs
|
|
// . ip will be -1 on error
|
|
return &m_msge1.m_ipBuf;
|
|
}
|
|
|
|
/*
|
|
// really this could just check titledb in memory tree and tfndb and should
|
|
// be really fast!!
|
|
char **XmlDoc::getOutlinkIsIndexedVector () {
|
|
if ( m_outlinkIsIndexedVectorValid ) return &m_msge2.m_isIndexedBuf;
|
|
setStatus ( "getting outlink is indexed vector" );
|
|
Links *links = getLinks();
|
|
if ( ! links ) return NULL;
|
|
// assume valid
|
|
m_outlinkIsIndexedVectorValid = true;
|
|
// go get it
|
|
bool status = m_msge2.getIsIndexed ( links->m_linkPtrs ,
|
|
links->m_linkFlags ,
|
|
links->m_numLinks ,
|
|
false , // skip old?
|
|
m_coll ,
|
|
m_niceness ,
|
|
m_masterState ,
|
|
m_masterLoop );
|
|
// set it
|
|
//m_outlinkIsIndexedVector = m_msge2.m_isIndexedBuf;
|
|
// we blocked
|
|
if ( ! status ) return (char **)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
// ptr to a list of ptrs to tag recs
|
|
return &m_msge2.m_isIndexedBuf;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
char *XmlDoc::getIsVisible ( ) {
|
|
if ( m_isVisibleValid ) return &m_isVisible;
|
|
setStatus ( "getting is visible" );
|
|
// to get a live reading, invalidate tag rec from title rec
|
|
m_oldTagRecValid = false;
|
|
// . loop over all regular expression in the url filters table
|
|
// . stop at first regular expression it matches
|
|
int32_t *rn = getRegExpNum2 ( -1 );
|
|
// need to wait for a callback at this point (or we had critical error)
|
|
if ( ! rn || rn == (int32_t *)-1 ) return (char *)rn;
|
|
// assume yes
|
|
m_isVisible = true;
|
|
// and valid
|
|
m_isVisibleValid = true;
|
|
// no match
|
|
if ( *rn == -1 ) return &m_isVisible;
|
|
// get spider priority
|
|
int32_t pr = m_cr->m_spiderPriorities[*rn];
|
|
// test it
|
|
if ( pr == -2 ) m_isVisible = false;
|
|
if ( pr == -3 ) m_isVisible = false;
|
|
return &m_isVisible;
|
|
}
|
|
*/
|
|
|
|
int32_t *XmlDoc::getUrlFilterNum ( ) {
|
|
// return it if already set
|
|
if ( m_urlFilterNumValid ) return &m_urlFilterNum;
|
|
// note that
|
|
setStatus ( "getting url filter row num");
|
|
|
|
// . make the partial new spider rec
|
|
// . we need this for matching filters like lang==zh_cn
|
|
// . crap, but then it matches "hasReply" when it should not
|
|
// . PROBLEM! this is the new reply not the OLD reply, so it may
|
|
// end up matching a DIFFERENT url filter num then what it did
|
|
// before we started spidering it...
|
|
//SpiderReply *newsr = getNewSpiderReply ( );
|
|
// note it
|
|
//if ( ! newsr )
|
|
// log("doc: getNewSpiderReply: %s",mstrerror(g_errno));
|
|
//if ( ! newsr || newsr == (void *)-1 ) return (int32_t *)newsr;
|
|
|
|
// need language i guess
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (int32_t *)langId;
|
|
|
|
|
|
// make a fake one for now
|
|
// SpiderReply fakeReply;
|
|
// // fix errors
|
|
// fakeReply.reset();
|
|
// fakeReply.m_isIndexedINValid = true;
|
|
// // just language for now, so we can FILTER by language
|
|
// if ( m_langIdValid ) fakeReply.m_langId = m_langId;
|
|
|
|
int32_t langIdArg = -1;
|
|
if ( m_langIdValid ) langIdArg = m_langId;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t spideredTime = getSpideredTime();
|
|
// get the spider request
|
|
SpiderRequest *oldsr = &m_sreq;
|
|
// null it out if invalid...
|
|
if ( ! m_sreqValid ) oldsr = NULL;
|
|
// do not set the spideredTime in the spiderReply to 0
|
|
// so we do not trigger the lastSpiderTime
|
|
//int32_t saved = newsr->m_spideredTime;
|
|
//newsr->m_spideredTime = 0;
|
|
//
|
|
// PROBLEM: we end up matching "isIndexed" in the url filters
|
|
// even if this is a NEW document because we pass it in the spider
|
|
// reply that we generate now even though another spider reply
|
|
// may not exist.
|
|
//
|
|
// SOLUTION: just do not supply a spider reply, we only seem to
|
|
// use the urlfilternum to get a diffbot api url OR to see if the
|
|
// document is banned/filtered so we should delete it. otherwise
|
|
// we were supplying "newsr" above...
|
|
|
|
// . look it up
|
|
// . use the old spidered date for "nowGlobal" so we can be consistent
|
|
// for injecting into the "qatest123" coll
|
|
int32_t ufn = ::getUrlFilterNum ( oldsr,
|
|
NULL,//&fakeReply,
|
|
spideredTime,false,
|
|
m_niceness,cr,
|
|
false, // isOutlink?
|
|
NULL,
|
|
langIdArg);
|
|
|
|
// put it back
|
|
//newsr->m_spideredTime = saved;
|
|
|
|
// bad news?
|
|
if ( ufn < 0 ) {
|
|
log("build: failed to get url filter for xmldoc %s",
|
|
m_firstUrl.m_url);
|
|
//g_errno = EBADENGINEER;
|
|
//return NULL;
|
|
}
|
|
|
|
|
|
// store it
|
|
m_urlFilterNum = ufn;
|
|
m_urlFilterNumValid = true;
|
|
|
|
// set this too in case the url filters table changes while
|
|
// we are spidering this and a row is inserted or deleted or something
|
|
//SafeBuf *yy = &cr->m_spiderDiffbotApiUrl[ufn];
|
|
// copy to ours
|
|
//m_diffbotApiUrl.safeMemcpy ( yy );
|
|
// ensure null term
|
|
//m_diffbotApiUrl.nullTerm();
|
|
//m_diffbotApiUrlValid = true;
|
|
|
|
|
|
return &m_urlFilterNum;
|
|
}
|
|
|
|
// . both "u" and "site" must not start with http:// or https:// or protocol
|
|
bool isSiteRootFunc ( char *u , char *site ) {
|
|
// get length of each
|
|
int32_t slen = gbstrlen(site);//m_siteLen;
|
|
int32_t ulen = gbstrlen(u);
|
|
// "site" may or may not end in /, so remove that
|
|
if ( site[slen-1] == '/' ) slen--;
|
|
// same for url
|
|
if ( u[ulen-1] == '/' ) ulen--;
|
|
// skip http:// or https://
|
|
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
|
|
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
|
|
if ( strncmp(site,"http://" ,7)==0 ) { site += 7; slen -= 7; }
|
|
if ( strncmp(site,"https://",8)==0 ) { site += 8; slen -= 8; }
|
|
// subtract default.asp etc. from "u"
|
|
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.asp",11)==0 )
|
|
// ulen -= 11;
|
|
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.html",12)==0 )
|
|
// ulen -= 12;
|
|
//if ( ulen > 15 && strncasecmp(u+ulen-11,"index.html",10)==0 )
|
|
// ulen -= 10;
|
|
// now they must match exactly
|
|
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
|
|
// all done
|
|
return false;
|
|
}
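// . e.g. (hypothetical inputs) isSiteRootFunc("http://example.com/",
//   "example.com") returns true, while
//   isSiteRootFunc("http://example.com/blog/","example.com") returns false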
|
|
|
|
bool isSiteRootFunc3 ( char *u , int32_t siteRootHash32 ) {
|
|
// get length of each
|
|
int32_t ulen = gbstrlen(u);
|
|
// remove trailing /
|
|
if ( u[ulen-1] == '/' ) ulen--;
|
|
// skip http:// or https://
|
|
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
|
|
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
|
|
// now they must match exactly
|
|
int32_t sh32 = hash32(u,ulen);
|
|
return ( sh32 == siteRootHash32 );
|
|
}
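// . same idea as isSiteRootFunc() but compares against a precomputed hash;
//   presumably siteRootHash32 is hash32() of the normalized site string,
//   e.g. hash32("example.com",11) for a hypothetical site "example.com"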
|
|
|
|
char *XmlDoc::getIsSiteRoot ( ) {
|
|
if ( m_isSiteRootValid ) return &m_isSiteRoot2;
|
|
// get our site
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (char *)site;
|
|
// get our url without the http:// or https://
|
|
char *u = getFirstUrl()->getHost();
|
|
if ( ! u ) {
|
|
g_errno = EBADURL;
|
|
return NULL;
|
|
}
|
|
// assume valid now
|
|
m_isSiteRootValid = true;
|
|
// get it
|
|
bool isRoot = isSiteRootFunc ( u , site );
|
|
// seems like https://twitter.com/ is not getting set to root
|
|
if ( m_firstUrl.getPathDepth(true) == 0 && ! m_firstUrl.isCgi() )
|
|
isRoot = true;
|
|
m_isSiteRoot2 = m_isSiteRoot = isRoot;
|
|
return &m_isSiteRoot2;
|
|
}
|
|
|
|
/*
|
|
bool XmlDoc::getIsOutlinkSiteRoot ( char *u , TagRec *gr ) {
|
|
// get our site
|
|
Tag *tag = gr->getTag("site");
|
|
// make "host" point to u's hostname
|
|
int32_t hostLen; char *host = getHostFast ( u , &hostLen );
|
|
// use hostname?
|
|
char *site;
|
|
int32_t slen;
|
|
if ( tag ) {
|
|
site = tag->getTagData();
|
|
slen = tag->getTagDataSize() - 1;
|
|
}
|
|
// otherwise, use hostname as site
|
|
else {
|
|
// must be end, or could be '/'
|
|
if ( ! host[hostLen] || ! host[hostLen+1] ) return true;
|
|
// i guess we were more than just a hostname, so not site root
|
|
return false;
|
|
}
|
|
// get length of each
|
|
int32_t ulen = gbstrlen(u);
|
|
// "site" may or may not end in /, so remove that
|
|
if ( site[slen-1] == '/' ) slen--;
|
|
// same for url
|
|
if ( u[ulen-1] == '/' ) ulen--;
|
|
// now they must match exactly
|
|
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
|
|
// all done
|
|
return false;
|
|
}
|
|
*/
|
|
|
|
|
|
int8_t *XmlDoc::getHopCount ( ) {
|
|
// return now if valid
|
|
if ( m_hopCountValid ) return &m_hopCount;
|
|
|
|
setStatus ( "getting hop count" );
|
|
|
|
CollectionRec *cr = this->getCollRec();
|
|
if(cr && cr->m_isCustomCrawl ) {
|
|
// for diffbot collections, compute hopcount without casting
|
|
// site/rss to 0 hopcount -- copied from below
|
|
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if (!info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
|
|
int32_t origHopCount = -1;
|
|
if ( m_sreqValid ) {
|
|
origHopCount = m_sreq.m_hopCount;
|
|
}
|
|
int32_t hc = -1;
|
|
// if(m_minInlinkerHopCount+1 < hc && m_minInlinkerHopCount>=0)
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
if ( origHopCount < hc && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
if ( hc == -1 && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
if ( hc == -1 )
|
|
hc = 1;
|
|
if ( hc > 0x7f ) hc = 0x7f;
|
|
m_hopCountValid = true;
|
|
m_hopCount = hc;
|
|
|
|
//printf("Custom hopcount: %d for url: %s",
|
|
//m_hopCount, this->ptr_firstUrl);
|
|
return &m_hopCount;
|
|
}
|
|
|
|
// the unredirected url
|
|
Url *f = getFirstUrl();
|
|
// get url as string, skip "http://" or "https://"
|
|
//char *u = f->getHost();
|
|
// if we match site, we are a site root, so hop count is 0
|
|
//char *isr = getIsSiteRoot();
|
|
//if ( ! isr || isr == (char *)-1 ) return (int8_t *)isr;
|
|
//if ( *isr ) {
|
|
// m_hopCount = 0;
|
|
// m_hopCountValid = true;
|
|
// return &m_hopCount;
|
|
//}
|
|
// ping servers have 0 hop counts
|
|
if ( f->isPingServer() ) {
|
|
// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
|
|
m_hopCount = 0;
|
|
m_hopCountValid = true;
|
|
return &m_hopCount;
|
|
}
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
|
|
// check for site root
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int8_t *)gr;
|
|
// and site roots
|
|
char *isSiteRoot = getIsSiteRoot();
|
|
if (!isSiteRoot ||isSiteRoot==(char *)-1) return (int8_t *)isSiteRoot;
|
|
if ( *isSiteRoot ) {
|
|
// log("xmldoc: hc1 is 0 (siteroot) %s",m_firstUrl.m_url);
|
|
m_hopCount = 0;
|
|
m_hopCountValid = true;
|
|
return &m_hopCount;
|
|
}
|
|
// make sure m_minInlinkerHopCount is valid
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
|
|
// . fix bad original hop counts
|
|
// . assign this hop count from the spider rec
|
|
int32_t origHopCount = -1;
|
|
if ( m_sreqValid ) origHopCount = m_sreq.m_hopCount;
|
|
// derive our hop count from our parent hop count
|
|
int32_t hc = -1;
|
|
// . BUT use inlinker if better
|
|
// . if m_linkInfo1Valid is true, then m_minInlinkerHopCount is valid
|
|
// if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 )
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
// or if parent is unknown, but we have a known inlinker with a
|
|
// valid hop count, use the inlinker hop count then
|
|
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
|
|
// hc = m_minInlinkerHopCount + 1;
|
|
// if ( origHopCount == 0 )
|
|
// log("xmldoc: hc3 is 0 (spiderreq) %s",m_firstUrl.m_url);
|
|
// or use our hop count from the spider rec if better
|
|
if ( origHopCount < hc && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
// or if neither parent or inlinker was valid hop count
|
|
if ( hc == -1 && origHopCount >= 0 )
|
|
hc = origHopCount;
|
|
// if we have no hop count at this point, i guess just pick 1!
|
|
if ( hc == -1 )
|
|
hc = 1;
|
|
// truncate; hop count is only one byte (TitleRec.h::m_hopCount)
|
|
if ( hc > 0x7f ) hc = 0x7f;
|
|
|
|
// rss urls also get a special, low hop count
|
|
if ( *isRSS && hc > 1 ) {
|
|
// force it to one, not zero, otherwise it gets pounded
|
|
// too hard on the aggregator sites. spider priority
|
|
// is too high
|
|
m_hopCount = 1;
|
|
m_hopCountValid = true;
|
|
return &m_hopCount;
|
|
}
|
|
|
|
// unknown hop counts (-1) are propagated, except for root urls
|
|
m_hopCountValid = true;
|
|
m_hopCount = hc;
|
|
return &m_hopCount;
|
|
}
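// . to summarize getHopCount(): ping servers and site roots get 0, otherwise
//   the hop count comes from the SpiderRequest when available, defaults to 1,
//   is truncated to 0x7f, and rss urls are pulled down to 1 so aggregator
//   sites are not hit with hop-count-0 spider priority (custom/diffbot
//   crawls skip the site-root/rss special cases entirely)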
|
|
|
|
/*
|
|
int8_t *XmlDoc::getOutlinkHopCountVector ( ) {
|
|
if ( m_outlinkHopCountVectorValid ) return m_outlinkHopCountVector;
|
|
// need these of course
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (int8_t *)links;
|
|
// and these for seeing if outlink is a site root
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (int8_t *)grv;
|
|
// hop count of parent
|
|
int8_t *ph = getHopCount();
|
|
if ( ! ph || ph == (void *)-1 ) return (int8_t *)ph;
|
|
// shortcut
|
|
int32_t n = links->getNumLinks();
|
|
// sanity check
|
|
if ( m_outlinkHopCountVector ) { char *xx=NULL;*xx=0; }
|
|
// make some space
|
|
m_outlinkHopCountVector = (int8_t *)mmalloc ( n * 4 ,"xdhc");
|
|
// return NULL on error with g_errno set
|
|
if ( ! m_outlinkHopCountVector ) return NULL;
|
|
// save size
|
|
m_outlinkHopCountVectorSize = n * 4;
|
|
// stock it
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get it
|
|
char *u = links->getLinkPtr(i);
|
|
// and this
|
|
TagRec *gr = (*grv)[i];
|
|
// flags
|
|
linkflags_t flags = links->m_linkFlags[i];
|
|
// hop count. default to 1.
|
|
int32_t hc = 1;
|
|
if ( getIsOutlinkSiteRoot ( u , gr ) ) hc = 0;
|
|
else if ( isPingServer ( u ) ) hc = 0;
|
|
else if ( flags & LF_RSS ) hc = 0;
|
|
else hc = *ph + 1;
|
|
// assign it
|
|
m_outlinkHopCountVector[i] = hc;
|
|
}
|
|
m_outlinkHopCountVectorValid = true;
|
|
return m_outlinkHopCountVector;
|
|
}
|
|
*/
|
|
|
|
// set to false for injecting and validate it... if &spiderlinks=0
|
|
// should we spider links?
|
|
char *XmlDoc::getSpiderLinks ( ) {
|
|
// set it to false on issues
|
|
//if ( m_indexCode ) {
|
|
// m_spiderLinks = false;
|
|
// m_spiderLinks2 = false;
|
|
// m_spiderLinksValid = true ; }
|
|
|
|
// this slows importing down because we end up doing ip lookups
|
|
// for every outlink if "firstip" not in tagdb.
|
|
// shoot. set2() already sets m_spiderLinksValid to true so we
|
|
// have to override if importing.
|
|
if ( m_isImporting && m_isImportingValid ) {
|
|
m_spiderLinks = false;
|
|
m_spiderLinks2 = false;
|
|
m_spiderLinksValid = true;
|
|
return &m_spiderLinks2;
|
|
}
|
|
|
|
// return the valid value
|
|
if ( m_spiderLinksValid ) return &m_spiderLinks2;
|
|
|
|
setStatus ( "getting spider links flag");
|
|
|
|
// do not add links now if doing the parser test
|
|
if ( g_conf.m_testParserEnabled ||
|
|
m_isDiffbotJSONObject ) {
|
|
m_spiderLinks = false;
|
|
m_spiderLinks2 = false;
|
|
m_spiderLinksValid = true;
|
|
return &m_spiderLinks2;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return (char *)cr;
|
|
|
|
int32_t *ufn = getUrlFilterNum();
|
|
if ( ! ufn || ufn == (void *)-1 ) return (char *)ufn;
|
|
|
|
// if url filters forbids it
|
|
if ( ! cr->m_harvestLinks[*ufn] ) {
|
|
m_spiderLinksValid = true;
|
|
m_spiderLinks2 = false;
|
|
m_spiderLinks = false;
|
|
return &m_spiderLinks2;
|
|
}
|
|
|
|
// hack for bulk job detection. never spider links
|
|
//if ( cr->m_isCustomCrawl == 2 ) {
|
|
// m_spiderLinks = false;
|
|
// m_spiderLinks2 = false;
|
|
// m_spiderLinksValid = true;
|
|
// return &m_spiderLinks2;
|
|
//}
|
|
|
|
// check the xml for a meta robots tag
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
|
|
// assume true
|
|
m_spiderLinks = true;
|
|
|
|
// or if meta tag says not to
|
|
char buf1 [256];
|
|
char buf2 [256];
|
|
buf1[0] = '\0';
|
|
buf2[0] = '\0';
|
|
xml->getMetaContent ( buf1, 255 , "robots" , 6 );
|
|
xml->getMetaContent ( buf2, 255 , "gigabot", 7 );
|
|
|
|
if ( strstr ( buf1 , "nofollow" ) ||
|
|
strstr ( buf2 , "nofollow" ) ||
|
|
strstr ( buf1 , "none" ) ||
|
|
strstr ( buf2 , "none" ) )
|
|
m_spiderLinks = false;
|
|
|
|
// spider links if doing custom crawl or not using robots.txt
|
|
if ( ! m_useRobotsTxt || cr->m_isCustomCrawl )
|
|
m_spiderLinks = true;
|
|
|
|
// spider request forbade it? diffbot.cpp crawlbot api when
|
|
// specifying urldata (list of urls to add to spiderdb) usually
|
|
// they do not want the links crawled i'd imagine.
|
|
if ( m_sreqValid && m_sreq.m_avoidSpiderLinks )
|
|
m_spiderLinks = false;
|
|
|
|
|
|
// also check in url filters now too
|
|
|
|
|
|
// set shadow member
|
|
m_spiderLinks2 = m_spiderLinks;
|
|
// validate
|
|
m_spiderLinksValid = true;
|
|
return &m_spiderLinks2;
|
|
}
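// . to summarize getSpiderLinks(): links are NOT harvested when importing,
//   when the parser test is enabled, for diffbot json objects, when the
//   matching url filter row disables harvesting, when a robots/gigabot meta
//   tag says nofollow/none (unless robots.txt is being ignored or this is a
//   custom crawl), or when the SpiderRequest set m_avoidSpiderLinks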
|
|
|
|
//
|
|
// . DELETE ALL SPAM FROM THE INDEX!!!
|
|
//
|
|
// . for a page to be spam these must ALL be true, with the current ip:
|
|
// . site is not in google
|
|
// . site has no "stars" in google's dir
|
|
// . site has no authorityinlink tag
|
|
// . site has less than 10 fresh inlinks
|
|
// . site has less than 500 total inlinks
|
|
// . ip is not from ultra dns
|
|
// . TODO: site is not linked to by wikipedia.com
|
|
// . TODO: site is not linked to by about.com
|
|
// . TODO: site is not linked to by a .gov site
|
|
// . the page IP address changed significantly since the last
|
|
// time we indexed it when it was not spam (if applicable)
|
|
//
|
|
// . if the page was indexed at one time and then we decided it was spam,
|
|
// and its ip changed significantly since last time, we just
|
|
// reschedule the spider rec for 15 days later and do not touch anything
|
|
// else. that way we keep the index somewhat stable.
|
|
//
|
|
|
|
/*
|
|
char *XmlDoc::getIsSpam() {
|
|
// return it if valid
|
|
if ( m_isSpamValid ) return &m_isSpam;
|
|
|
|
setStatus ("getting is spam");
|
|
|
|
// assume it is not spam
|
|
m_isSpam = false;
|
|
|
|
// debug
|
|
//logf(LOG_DEBUG,"doc: NOT SPAM!!");
|
|
//m_isSpamValid = true; return &m_isSpam;
|
|
|
|
// we disable this check for the contact doc
|
|
if ( m_spamCheckDisabled ) { m_isSpamValid = true; return &m_isSpam; }
|
|
|
|
// . i put this here for debugging purposes
|
|
// . some big sites have no easy to find contact info
|
|
// . get our domain
|
|
Url *fu = getFirstUrl();
|
|
char *dom = fu->getDomain ();
|
|
int32_t dlen = fu->getDomainLen();
|
|
if ( dlen == 12 && !strncmp(dom,"facebook.com",dlen) ) {
|
|
m_isSpamValid = true; return &m_isSpam; }
|
|
if ( dlen == 9 && !strncmp(dom,"yahoo.com",dlen) ) {
|
|
m_isSpamValid = true; return &m_isSpam; }
|
|
|
|
// get our site's tag rec
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
|
|
|
|
// are we already in the index?
|
|
//char *isIndexed = getIsIndexed();
|
|
//if (!isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
|
|
|
|
// this will update m_oldTagRec with the latest info if its stale
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
|
|
|
|
//int32_t *ip = getIp();
|
|
//if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
|
|
|
|
//XmlDoc **od = getOldXmlDoc ( );
|
|
//if ( ! od || od == (void *)-1 ) return (char *)od;
|
|
|
|
//int32_t oldIp = 0 ;
|
|
//if ( *od ) {
|
|
// int32_t *ip2 = (*od)->getIp();
|
|
// if ( ! ip2 || ip2 == (int32_t *)-1 ) return (char *)ip2;
|
|
// oldIp = *ip2;
|
|
//}
|
|
|
|
// i am guessing that most sites that use ultra dns will have a lot
|
|
// of site inlinks! so comment this out for now
|
|
//char *ultra = getIpIsUltraDns();
|
|
//if ( ultra || ultra==(char *)-1 ) return (char *)ultra;
|
|
// spammers do not use ultradns
|
|
//if ( *ultra ) return false;
|
|
|
|
Url *f = getFirstUrl();
|
|
char *u = f->getUrl();
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// this will be valid
|
|
m_isSpamValid = true;
|
|
|
|
// use this routine
|
|
m_isSpam = isSpam ( u,
|
|
gr,
|
|
now,
|
|
// *isIndexed,
|
|
//oldIp ,
|
|
// *ip ,
|
|
*hci );
|
|
|
|
// we are doomed! delete in its entirety
|
|
if ( m_isSpam ) m_indexCode = EDOCSPAM;
|
|
|
|
return &m_isSpam;
|
|
}
|
|
|
|
// . "u" must be NORMALIZED. i.e. start with http:// or https:// etc.
|
|
// . we call this on outlinks as well
|
|
// . we no longer look at the old and newip to determine ownership change,
|
|
// because that is not reliable enough
|
|
// . we now maybe rely on a major change to the site root page...
|
|
bool XmlDoc::isSpam ( char *u ,
|
|
TagRec *gr ,
|
|
int32_t now ,
|
|
char isIndexed ,
|
|
int32_t oldIp ,
|
|
int32_t newIp ,
|
|
bool hasContactInfo ) {
|
|
|
|
// we need to mine that same database that firefox does...
|
|
Tag *tag = gr->getTag ( "malware" );
|
|
if ( tag && tag->getTagData()[0] != '0' ) return true;
|
|
|
|
// if they have contact info, that is a really good sign
|
|
if ( hasContactInfo ) return false;
|
|
|
|
// .edu and .gov sites are always fine
|
|
int32_t tlen; char *tld = getTLDFast(u,&tlen);
|
|
if ( tlen == 3 && ! strncmp(tld,"edu",3) ) return false;
|
|
if ( tlen == 3 && ! strncmp(tld,"gov",3) ) return false;
|
|
|
|
// the current top ip address
|
|
//int32_t top = newIp & 0x00ffffff;
|
|
|
|
// TODO: in the case of multiple ips on one domain, ensure we select
|
|
// the same IP every time we do a lookup in MsgC.
|
|
|
|
// ok if in google
|
|
if ( gr->getTag ( "ingoogle" ) ) return false;
|
|
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
|
|
|
|
// can also be in google's dmoz dir. must have a decent page rank.
|
|
if ( gr->getTag ( "pagerank" ) ) return false;
|
|
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
|
|
|
|
// . if was linked to by a high quality root as a new external outlink
|
|
// . TODO: include about.com and wikipedia.com i guess (TODO)
|
|
if ( gr->getTag ( "authorityinlink" ) ) return false;
|
|
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
|
|
|
|
tag = gr->getTag("sitenuminlinks");
|
|
// i guess if it has no entry for this, assume the best
|
|
if ( ! tag ) return false;
|
|
// or just a massive amount of any-age inlinks
|
|
if ( atol(tag->getTagData()) >= 500 ) return false;
|
|
|
|
tag = gr->getTag("sitenuminlinksfresh");
|
|
// i guess if it has no entry for this, assume the best
|
|
if ( ! tag ) return false;
|
|
// if site has enough good FRESH inlinks from the last 3 mos, no spam
|
|
if( atol(tag->getTagData()) >= 10 ) return false;
|
|
|
|
// if we are old and the top 3 bytes of the ip is the same as the last
|
|
// time we were indexed and thereby not identified as spam...
|
|
// then assume we are still not spam! because it was unlikely that
|
|
// the domain ownership changed...
|
|
//if ( isIndexed (oldIp & 0x00ffffff) == top ) return false;
|
|
|
|
// if they have contact info, that is a really good sign
|
|
//if ( hasContactInfo && (oldIp & 0x00ffffff) == top ) return false;
|
|
|
|
// if first time... accept them if they got contact info
|
|
//if ( ! oldIp && hasContactInfo ) return false;
|
|
|
|
// . if it has had the same ip for the last 365 days, let it in
|
|
// . getTagRec() updates this tag immediately if the ip changes
|
|
// . so we can't really use this tag for outlinks, because they might
|
|
// never get thrown into spiderdb to where we can add this tag to
|
|
// their tag rec... UNLESS msgc/msge were to update their tag rec...
|
|
// . i've seen quite a few old spam sites/pages. they just kinda stay
|
|
// there. so let's not do this...
|
|
//tag = gr->get("iptimestamp");
|
|
//int32_t now;
|
|
//if ( tag ) now = getTimeGlobal();
|
|
//if(tag&&now-atol(tag->getTagData())>365*24*3600&&
|
|
// ((tag->m_ip&0x00ffffff)==top))
|
|
// return false;
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// should we index the doc? if already indexed, and is filtered, we delete it
|
|
char *XmlDoc::getIsFiltered ( ) {
|
|
if ( m_isFilteredValid ) return &m_isFiltered;
|
|
if ( m_isDiffbotJSONObject ) {
|
|
m_isFiltered = false;
|
|
m_isFilteredValid = true;
|
|
return &m_isFiltered;
|
|
}
|
|
int32_t *priority = getSpiderPriority();
|
|
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
|
|
m_isFiltered = false;
|
|
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
|
|
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
|
|
if ( *priority == -3 ) m_isFiltered = true;
|
|
m_isFilteredValid = true;
|
|
return &m_isFiltered;
|
|
}
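// . i.e. a non-diffbot-json doc is considered "filtered" exactly when its
//   spider priority resolves to -3 (banned or filtered by the url filters)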
|
|
|
|
int32_t *XmlDoc::getSpiderPriority ( ) {
|
|
if ( m_priorityValid ) return &m_priority;
|
|
setStatus ("getting spider priority");
|
|
// need tagrec to see if banned
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
|
// this is an automatic ban!
|
|
if ( gr->getLong("manualban",0) ) {
|
|
m_priority = -3;//SPIDER_PRIORITY_BANNED;
|
|
m_priorityValid = true;
|
|
return &m_priority;
|
|
}
|
|
int32_t *ufn = getUrlFilterNum();
|
|
if ( ! ufn || ufn == (void *)-1 ) return (int32_t *)ufn;
|
|
// sanity check
|
|
if ( *ufn < 0 ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
m_priority = cr->m_spiderPriorities[*ufn];
|
|
|
|
// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
|
|
if ( cr->m_forceDelete[*ufn] ) m_priority = -3;
|
|
|
|
m_priorityValid = true;
|
|
return &m_priority;
|
|
}
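// . so the priority is just the matching url filter row's priority, except
//   that a "manualban" tag in tagdb or a forceDelete url filter maps it
//   to -3 (the old SPIDER_PRIORITY_BANNED/FILTERED value)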
|
|
|
|
bool XmlDoc::logIt ( SafeBuf *bb ) {
|
|
|
|
// set errCode
|
|
int32_t errCode = m_indexCode;
|
|
if ( ! errCode && g_errno ) errCode = g_errno;
|
|
|
|
// were we new?
|
|
//char isIndexed = -1;
|
|
//if ( m_isIndexedValid ) isIndexed = m_isIndexed;
|
|
bool isNew = true;
|
|
if ( m_sreqValid && m_sreq.m_hadReply ) isNew = false;
|
|
|
|
// keep track of stats
|
|
g_stats.addSpiderPoint ( errCode, isNew ); // !isIndexed );
|
|
|
|
// do not log if we should not, saves some time
|
|
//if ( ! g_conf.m_logSpideredUrls && ! m_forceDelete ) return true;
|
|
if ( ! g_conf.m_logSpideredUrls ) return true;
|
|
|
|
// patch the ip
|
|
int32_t ip = m_ip;
|
|
// invalid?
|
|
if ( ! m_ipValid ) ip = 0;
|
|
|
|
char *coll = "nuked";
|
|
CollectionRec *cr = getCollRec();
|
|
if ( cr ) coll = cr->m_coll;
|
|
|
|
SafeBuf tmpsb;
|
|
|
|
// print into this now
|
|
SafeBuf *sb = &tmpsb;
|
|
// log into provided safebuf if not null
|
|
if ( bb ) sb = bb;
|
|
|
|
//
|
|
// coll
|
|
//
|
|
sb->safePrintf("coll=%s ",coll);
|
|
sb->safePrintf("collnum=%"INT32" ",(int32_t)m_collnum);
|
|
|
|
//
|
|
// print ip
|
|
//
|
|
if ( m_ipValid )
|
|
sb->safePrintf("ip=%s ",iptoa(m_ip) );
|
|
|
|
if ( m_firstIpValid )
|
|
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
|
|
|
|
// . first ip from spider req if it is fake
|
|
// . we end up spidering the same url twice because it will have
|
|
// different "firstips" in the SpiderRequest key. maybe just
|
|
// use domain hash instead of firstip, and then let msg13
|
|
// make queues in the case of hammering an ip, which i think
|
|
// it already does...
|
|
if ( m_sreqValid && m_sreq.m_firstIp != m_firstIp )
|
|
sb->safePrintf("fakesreqfirstip=%s ",iptoa(m_sreq.m_firstIp) );
|
|
|
|
//
|
|
// print when this spider request was added
|
|
//
|
|
//if ( m_sreqValid && m_sreq.m_addedTime ) {
|
|
// struct tm *timeStruct = gmtime ( &m_sreq.m_addedTime );
|
|
// char tmp[64];
|
|
// strftime(tmp,64,"requestadded=%b-%d-%Y(%H:%M:%S)", timeStruct);
|
|
// sb->safePrintf("%s(%"UINT32") ",tmp,m_sreq.m_addedTime);
|
|
//}
|
|
|
|
//
|
|
// print spidered time
|
|
//
|
|
//if ( m_spideredTimeValid ) {
|
|
time_t spideredTime = (time_t)getSpideredTime();
|
|
struct tm *timeStruct = gmtime ( &spideredTime );
|
|
char tmp[64];
|
|
strftime(tmp,64,"spidered=%b-%d-%Y(%H:%M:%S)", timeStruct );
|
|
sb->safePrintf("%s(%"UINT32") ",tmp,(uint32_t)spideredTime);
|
|
|
|
// when it was scheduled to be spidered
|
|
if ( m_sreqValid && m_sreq.m_addedTime ) {
|
|
time_t ts = m_sreq.m_addedTime;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
char tmp[64];
|
|
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("scheduledtime=%s(%"UINT32") ",
|
|
tmp,(uint32_t)m_sreq.m_addedTime);
|
|
}
|
|
|
|
// discovery date, first time spiderrequest was added to spiderdb
|
|
if ( m_sreqValid && m_sreq.m_discoveryTime ) {
|
|
time_t ts = m_sreq.m_discoveryTime;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
char tmp[64];
|
|
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("discoverydate=%s(%"UINT32") ",
|
|
tmp,(uint32_t)m_sreq.m_discoveryTime);
|
|
}
|
|
|
|
// print first indexed time
|
|
if ( m_firstIndexedDateValid ) {
|
|
time_t ts = m_firstIndexedDate;
|
|
timeStruct = gmtime ( &ts );//m_firstIndexedDate );
|
|
strftime(tmp,64,"firstindexed=%b-%d-%Y(%H:%M:%S)", timeStruct);
|
|
sb->safePrintf("%s(%"UINT32") ",tmp,
|
|
(uint32_t)m_firstIndexedDate);
|
|
}
|
|
|
|
|
|
//if ( ! m_isIndexedValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// just use the oldurlfilternum for grepping i guess
|
|
//if ( m_oldDocValid && m_oldDoc )
|
|
|
|
// when injecting a request we have no idea if it had a reply or not
|
|
if ( m_sreqValid && m_sreq.m_isInjecting )
|
|
sb->safePrintf("firsttime=? ");
|
|
else if ( m_sreqValid && m_sreq.m_hadReply )
|
|
sb->safePrintf("firsttime=0 ");
|
|
else if ( m_sreqValid )
|
|
sb->safePrintf("firsttime=1 ");
|
|
else
|
|
sb->safePrintf("firsttime=? ");
|
|
|
|
//
|
|
// print # of link texts
|
|
//
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 ) {
|
|
LinkInfo *info = ptr_linkInfo1;
|
|
int32_t nt = info->getNumLinkTexts();
|
|
sb->safePrintf("goodinlinks=%"INT32" ",nt );
|
|
// new stuff. includes ourselves i think.
|
|
//sb->safePrintf("ipinlinks=%"INT32" ",info->m_numUniqueIps);
|
|
//sb->safePrintf("cblockinlinks=%"INT32" ",
|
|
//info->m_numUniqueCBlocks);
|
|
}
|
|
|
|
//
|
|
// print # of link texts from 2nd coll
|
|
//
|
|
// this is no longer used for what it was originally used for.
|
|
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
|
|
// LinkInfo *info = ptr_linkInfo2;
|
|
// int32_t nt = 0;
|
|
// if ( info ) nt = info->getNumLinkTexts();
|
|
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
|
|
// }
|
|
|
|
if ( m_docIdValid )
|
|
sb->safePrintf("docid=%"UINT64" ",m_docId);
|
|
|
|
char *u = getFirstUrl()->getUrl();
|
|
int64_t pd = g_titledb.getProbableDocId(u);
|
|
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
|
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
|
sb->safePrintf("probdocid=%"UINT64" ",pd);
|
|
sb->safePrintf("probdocidmin=%"UINT64" ",d1);
|
|
sb->safePrintf("probdocidmax=%"UINT64" ",d2);
|
|
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
|
|
|
|
|
|
if ( m_siteNumInlinksValid ) {
|
|
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
|
|
// sb->safePrintf("siteipinlinks=%"INT32" ",
|
|
// m_siteNumInlinksUniqueIp);
|
|
// sb->safePrintf("sitecblockinlinks=%"INT32" ",
|
|
// m_siteNumInlinksUniqueCBlock);
|
|
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
|
|
sb->safePrintf("siterank=%"INT32" ", sr );
|
|
}
|
|
|
|
if ( m_sreqValid )
|
|
sb->safePrintf("pageinlinks=%04"INT32" ",
|
|
m_sreq.m_pageNumInlinks);
|
|
|
|
// shortcut
|
|
int64_t uh48 = hash64b ( m_firstUrl.m_url );
|
|
// mask it
|
|
uh48 &= 0x0000ffffffffffffLL;
|
|
sb->safePrintf ("uh48=%"UINT64" ",uh48 );
|
|
|
|
|
|
if ( m_charsetValid )
|
|
sb->safePrintf("charset=%s ",get_charset_str(m_charset));
|
|
|
|
if ( m_contentTypeValid )
|
|
sb->safePrintf("ctype=%s ",
|
|
g_contentTypeStrings [m_contentType]);
|
|
|
|
if ( m_sreqValid )
|
|
sb->safePrintf("parentlang=%02"INT32"(%s) ",
|
|
(int32_t)m_sreq.m_parentLangId,
|
|
getLanguageAbbr(m_sreq.m_parentLangId));
|
|
|
|
if ( m_langIdValid )
|
|
sb->safePrintf("lang=%02"INT32"(%s) ",(int32_t)m_langId,
|
|
getLanguageAbbr(m_langId));
|
|
|
|
if ( m_countryIdValid )
|
|
sb->safePrintf("country=%02"INT32"(%s) ",(int32_t)m_countryId,
|
|
g_countryCode.getAbbr(m_countryId));
|
|
|
|
if ( m_hopCountValid )
|
|
sb->safePrintf("hopcount=%02"INT32" ",(int32_t)m_hopCount);
|
|
|
|
|
|
if ( m_contentValid )
|
|
sb->safePrintf("contentlen=%06"INT32" ",m_contentLen);
|
|
|
|
if ( m_contentValid && cr && cr->m_isCustomCrawl )
|
|
sb->safePrintf("zeroedout=%i ",(int)m_zeroedOut);
|
|
|
|
if ( m_isContentTruncatedValid )
|
|
sb->safePrintf("contenttruncated=%"INT32" ",
|
|
(int32_t)m_isContentTruncated);
|
|
|
|
if ( m_robotsTxtLenValid )
|
|
sb->safePrintf("robotstxtlen=%04"INT32" ",m_robotsTxtLen );
|
|
|
|
if ( m_isAllowedValid )
|
|
sb->safePrintf("robotsallowed=%i ", (int)m_isAllowed);
|
|
else
|
|
sb->safePrintf("robotsallowed=? " );
|
|
|
|
if ( m_contentHash32Valid )
|
|
sb->safePrintf("ch32=%010"UINT32" ",m_contentHash32);
|
|
|
|
if ( m_domHash32Valid )
|
|
sb->safePrintf("dh32=%010"UINT32" ",m_domHash32);
|
|
|
|
if ( m_siteHash32Valid )
|
|
sb->safePrintf("sh32=%010"UINT32" ",m_siteHash32);
|
|
|
|
if ( m_isPermalinkValid )
|
|
sb->safePrintf("ispermalink=%"INT32" ",(int32_t)m_isPermalink);
|
|
|
|
if ( m_isRSSValid )
|
|
sb->safePrintf("isrss=%"INT32" ",(int32_t)m_isRSS);
|
|
|
|
if ( m_linksValid )
|
|
sb->safePrintf("hasrssoutlink=%"INT32" ",
|
|
(int32_t)m_links.hasRSSOutlink() );
|
|
|
|
if ( m_numOutlinksAddedValid ) {
|
|
sb->safePrintf("outlinksadded=%04"INT32" ",
|
|
(int32_t)m_numOutlinksAdded);
|
|
sb->safePrintf("outlinksaddedfromsamedomain=%04"INT32" ",
|
|
(int32_t)m_numOutlinksAddedFromSameDomain);
|
|
}
|
|
|
|
if ( m_metaListValid )
|
|
sb->safePrintf("addlistsize=%05"INT32" ",
|
|
(int32_t)m_metaListSize);
|
|
else
|
|
sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)0);
|
|
|
|
if ( m_addedSpiderRequestSizeValid )
|
|
sb->safePrintf("addspiderreqsize=%05"INT32" ",
|
|
m_addedSpiderRequestSize);
|
|
else
|
|
sb->safePrintf("addspiderreqsize=%05"INT32" ",0);
|
|
|
|
|
|
if ( m_addedSpiderReplySizeValid )
|
|
sb->safePrintf("addspiderrepsize=%05"INT32" ",
|
|
m_addedSpiderReplySize);
|
|
else
|
|
sb->safePrintf("addspiderrepsize=%05"INT32" ",0);
|
|
|
|
|
|
if ( m_addedStatusDocSizeValid ) {
|
|
sb->safePrintf("addstatusdocsize=%05"INT32" ",
|
|
m_addedStatusDocSize);
|
|
sb->safePrintf("addstatusdocid=%"UINT64" ",
|
|
m_addedStatusDocId);
|
|
}
|
|
else {
|
|
sb->safePrintf("addstatusdocsize=%05"INT32" ",0);
|
|
sb->safePrintf("addstatusdocid=0 ");
|
|
}
|
|
|
|
|
|
if ( m_useSecondaryRdbs ) {
|
|
sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
|
|
sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
|
|
sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
|
|
sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
|
|
sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
|
|
if ( cr )
|
|
sb->safePrintf("indexspiderreplies=%i ",(int)
|
|
cr->m_indexSpiderReplies);
|
|
}
|
|
|
|
if ( size_imageData && m_imageDataValid ) {
|
|
// url is in data now
|
|
ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
|
|
int32_t nt = ta->getNumThumbnails();
|
|
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
|
|
sb->safePrintf("thumbnail=%s,%"INT32"bytes,%"INT32"x%"INT32",(%"INT32") ",
|
|
ti->getUrl(),
|
|
ti->m_dataSize,
|
|
ti->m_dx,
|
|
ti->m_dy,
|
|
nt);
|
|
}
|
|
else
|
|
sb->safePrintf("thumbnail=none ");
|
|
|
|
|
|
/*
|
|
if ( m_hasAddressValid && m_addressesValid )
|
|
sb->safePrintf("numaddr=%"INT32" ",(int32_t)m_addresses.m_numValid);
|
|
|
|
//if ( m_skipIndexingValid )
|
|
// sb->safePrintf("skipindexing=%"INT32" ",(int32_t)m_skipIndexing);
|
|
|
|
if ( m_hasTODValid )
|
|
sb->safePrintf("hastod=%"INT32" ",(int32_t)m_hasTOD);
|
|
*/
|
|
|
|
// get the content type
|
|
uint8_t ct = CT_UNKNOWN;
|
|
if ( m_contentTypeValid ) ct = m_contentType;
|
|
|
|
bool isRoot = false;
|
|
if ( m_isSiteRootValid ) isRoot = m_isSiteRoot;
|
|
|
|
// make sure m_minInlinkerHopCount is valid
|
|
LinkInfo *info1 = NULL;
|
|
if ( m_linkInfo1Valid ) info1 = ptr_linkInfo1;
|
|
|
|
//bool isContacty = getIsContacty(&m_firstUrl,
|
|
// info1,
|
|
// m_hopCount ,
|
|
// ct , // contentType
|
|
// isRoot ,
|
|
// m_niceness );
|
|
/*
|
|
// just use this now
|
|
if ( m_hasContactInfoValid )
|
|
sb->safePrintf("iscontacty=%"INT32" ",(int32_t)m_hasContactInfo);
|
|
|
|
if ( m_hasSiteVenueValid )
|
|
sb->safePrintf("hassitevenue=%"INT32" ",(int32_t)m_hasSiteVenue);
|
|
*/
|
|
|
|
// hack this kinda
|
|
// . in PageInject.cpp we do not have a valid priority without
|
|
// blocking because we did a direct injection!
|
|
// so ignore this!!
|
|
// . a diffbot json object, an xmldoc we set from a json object
|
|
// in a diffbot reply, is a childDoc (m_isChildDoc) is true
|
|
// and does not have a spider priority. only the parent doc
|
|
// that we used to get the diffbot reply (array of json objects)
|
|
// will have the spider priority
|
|
if ( ! getIsInjecting() && ! m_isDiffbotJSONObject ) {
|
|
//int32_t *priority = getSpiderPriority();
|
|
//if ( ! priority ||priority==(void *)-1){char *xx=NULL;*xx=0;}
|
|
if ( m_priorityValid )
|
|
sb->safePrintf("priority=%"INT32" ",
|
|
(int32_t)m_priority);
|
|
}
|
|
|
|
// should be valid since we call getSpiderPriority()
|
|
if ( m_urlFilterNumValid )
|
|
sb->safePrintf("urlfilternum=%"INT32" ",(int32_t)m_urlFilterNum);
|
|
|
|
|
|
if ( m_diffbotApiUrlValid &&
|
|
m_diffbotApiUrl.getBufStart() &&
|
|
m_diffbotApiUrl.getBufStart()[0] )
|
|
sb->safePrintf("diffbotjsonobjects=%"INT32" ",
|
|
(int32_t)m_diffbotJSONCount);
|
|
|
|
if ( m_diffbotReplyValid )
|
|
sb->safePrintf("diffboterror=%"INT32" ",m_diffbotReplyError);
|
|
|
|
if ( m_siteValid )
|
|
sb->safePrintf("site=%s ",ptr_site);
|
|
|
|
if ( m_isSiteRootValid )
|
|
sb->safePrintf("siteroot=%"INT32" ",m_isSiteRoot );
|
|
else
|
|
sb->safePrintf("siteroot=? ");
|
|
|
|
// like how we index it, do not include the filename. so we can
|
|
// have a bunch of pathdepth 0 urls with filenames like xyz.com/abc.htm
|
|
if ( m_firstUrlValid ) {
|
|
int32_t pd = -1;
|
|
// fix core
|
|
if ( m_firstUrl.m_url &&
|
|
m_firstUrl.m_ulen > 0 &&
|
|
m_firstUrl.m_path )
|
|
pd = m_firstUrl.getPathDepth(false);
|
|
sb->safePrintf("pathdepth=%"INT32" ",pd);
|
|
}
|
|
else {
|
|
sb->safePrintf("pathdepth=? ");
|
|
}
|
|
|
|
//
|
|
// . sometimes we print these sometimes we do not
|
|
// . put this at the end so we can awk out the above fields reliably
|
|
//
|
|
|
|
// print when it was last spidered
|
|
if ( m_oldDocValid && m_oldDoc ) {
|
|
time_t spideredTime = m_oldDoc->getSpideredTime();
|
|
struct tm *timeStruct = gmtime ( &spideredTime );
|
|
char tmp[64];
|
|
strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
|
|
sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime);
|
|
}
|
|
|
|
// print new pubdate
|
|
if ( m_pubDateValid && m_pubDate!=(uint32_t)-1 && m_pubDate!=0 ) {
|
|
char tmp[64];
|
|
time_t ts = (time_t)m_pubDate;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("pubdate=%s ", tmp );
|
|
}
|
|
|
|
if ( m_linkInfo1Valid && ptr_linkInfo1 && ptr_linkInfo1->hasRSSItem())
|
|
sb->safePrintf("hasrssitem=1 ");
|
|
|
|
// was the content itself injected?
|
|
if ( m_wasContentInjected )
|
|
sb->safePrintf("contentinjected=1 ");
|
|
else
|
|
sb->safePrintf("contentinjected=0 ");
|
|
|
|
// might have just injected the url and downloaded the content?
|
|
if ( (m_sreqValid && m_sreq.m_isInjecting) ||
|
|
(m_isInjecting && m_isInjectingValid) )
|
|
sb->safePrintf("urlinjected=1 ");
|
|
else
|
|
sb->safePrintf("urlinjected=0 ");
|
|
|
|
if ( m_sreqValid && m_sreq.m_isAddUrl )
|
|
sb->safePrintf("isaddurl=1 ");
|
|
else
|
|
sb->safePrintf("isaddurl=0 ");
|
|
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex )
|
|
sb->safePrintf("pagereindex=1 ");
|
|
|
|
if ( m_spiderLinksValid && m_spiderLinks )
|
|
sb->safePrintf("spiderlinks=1 ");
|
|
if ( m_spiderLinksValid && ! m_spiderLinks )
|
|
sb->safePrintf("spiderlinks=0 ");
|
|
|
|
|
|
if ( m_crawlDelayValid && m_crawlDelay != -1 )
|
|
sb->safePrintf("crawldelayms=%"INT32" ",(int32_t)m_crawlDelay);
|
|
|
|
if ( m_recycleContent )
|
|
sb->safePrintf("recycleContent=1 ");
|
|
|
|
if ( m_exactContentHash64Valid )
|
|
sb->safePrintf("exactcontenthash=%"UINT64" ",
|
|
m_exactContentHash64 );
|
|
|
|
// . print percent changed
|
|
// . only print if non-zero!
|
|
if ( m_percentChangedValid && m_oldDocValid && m_oldDoc &&
|
|
m_percentChanged )
|
|
sb->safePrintf("changed=%.00f%% ",m_percentChanged);
|
|
|
|
// only print if different now! good for grepping changes
|
|
if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_docId != m_docId )
|
|
sb->safePrintf("olddocid=%"UINT64" ",m_oldDoc->m_docId);
|
|
|
|
// only print if different now! good for grepping changes
|
|
if ( m_sreqValid && m_sreq.m_ufn >= 0 &&
|
|
m_sreq.m_ufn != m_urlFilterNum )
|
|
sb->safePrintf("oldurlfilternum=%"INT32" ",
|
|
(int32_t)m_sreq.m_ufn);
|
|
|
|
if ( m_sreqValid && m_sreq.m_priority >= 0 &&
|
|
m_sreq.m_priority != m_priority )
|
|
sb->safePrintf("oldpriority=%"INT32" ",
|
|
(int32_t)m_sreq.m_priority);
|
|
|
|
if ( m_oldDoc && m_oldDoc->m_langIdValid &&
|
|
m_oldDoc->m_langId != m_langId )
|
|
sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_oldDoc->m_langId,
|
|
getLanguageAbbr(m_oldDoc->m_langId));
|
|
|
|
if ( m_useSecondaryRdbs &&
|
|
m_useTitledb &&
|
|
m_logLangId != m_langId )
|
|
sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_logLangId,
|
|
getLanguageAbbr(m_logLangId));
|
|
|
|
if ( m_useSecondaryRdbs &&
|
|
m_useTitledb &&
|
|
m_logSiteNumInlinks != m_siteNumInlinks )
|
|
sb->safePrintf("oldsiteinlinks=%04"INT32" ",m_logSiteNumInlinks);
|
|
|
|
if ( m_useSecondaryRdbs &&
|
|
m_useTitledb &&
|
|
m_oldDocValid &&
|
|
m_oldDoc &&
|
|
strcmp(ptr_site,m_oldDoc->ptr_site) )
|
|
sb->safePrintf("oldsite=%s ",m_oldDoc->ptr_site);
|
|
|
|
// . print old pubdate
|
|
// . -1 means unsupported, 0 means could not find one
|
|
// . only print if different now! good for grepping changes
|
|
if ( m_oldDocValid && m_oldDoc &&
|
|
m_oldDoc->m_pubDate!= (uint32_t)-1 &&
|
|
m_oldDoc->m_pubDate !=0 &&
|
|
m_oldDoc->m_pubDate != m_pubDate ) {
|
|
char tmp[64];
|
|
time_t ts = m_oldDoc->m_pubDate;
|
|
struct tm *timeStruct = gmtime ( &ts );
|
|
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
|
|
sb->safePrintf("oldpubdate=%s ",tmp );
|
|
}
|
|
|
|
if ( m_isAdultValid )
|
|
sb->safePrintf("isadult=%"INT32" ",(int32_t)m_isAdult);
|
|
|
|
// only print if different now! good for grepping changes
|
|
if ( m_oldDocValid && m_oldDoc &&
|
|
m_oldDoc->m_siteNumInlinks >= 0 &&
|
|
m_oldDoc->m_siteNumInlinks != m_siteNumInlinks ) {
|
|
int32_t sni = -1;
|
|
if ( m_oldDoc ) sni = m_oldDoc->m_siteNumInlinks;
|
|
sb->safePrintf("oldsiteinlinks=%04"INT32" ",sni);
|
|
}
|
|
|
|
|
|
// Spider.cpp sets m_sreq.m_errCount before adding it to doledb
|
|
if ( m_sreqValid ) // && m_sreq.m_errCount )
|
|
sb->safePrintf("errcnt=%"INT32" ",(int32_t)m_sreq.m_errCount );
|
|
else
|
|
sb->safePrintf("errcnt=? ");
|
|
|
|
if ( ptr_redirUrl ) { // m_redirUrlValid && m_redirUrlPtr ) {
|
|
sb->safePrintf("redir=%s ",ptr_redirUrl);//m_redirUrl.getUrl());
|
|
if ( m_numRedirects > 2 )
|
|
sb->safePrintf("numredirs=%"INT32" ",m_numRedirects);
|
|
}
|
|
|
|
if ( m_canonicalRedirUrlValid && m_canonicalRedirUrlPtr )
|
|
sb->safePrintf("canonredir=%s ",
|
|
m_canonicalRedirUrlPtr->getUrl());
|
|
|
|
if ( m_httpStatusValid && m_httpStatus != 200 )
|
|
sb->safePrintf("httpstatus=%"INT32" ",(int32_t)m_httpStatus);
|
|
|
|
if ( m_updatedMetaData )
|
|
sb->safePrintf("updatedmetadata=1 ");
|
|
|
|
if ( m_isDupValid && m_isDup )
|
|
sb->safePrintf("dupofdocid=%"INT64" ",m_docIdWeAreADupOf);
|
|
|
|
if ( m_firstUrlValid )
|
|
sb->safePrintf("url=%s ",m_firstUrl.m_url);
|
|
else
|
|
sb->safePrintf("urldocid=%"INT64" ",m_docId);
|
|
|
|
//
|
|
// print error/status
|
|
//
|
|
sb->safePrintf(": %s",mstrerror(m_indexCode));
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// if safebuf provided, do not log to log
|
|
if ( bb ) return true;
|
|
|
|
// log it out
|
|
logf ( LOG_INFO ,
|
|
"build: %s",
|
|
//getFirstUrl()->getUrl(),
|
|
sb->getBufStart() );
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . make sure that the title rec we generated creates the exact same
|
|
// meta list as what we got
|
|
bool XmlDoc::doConsistencyTest ( bool forceTest ) {
|
|
|
|
// skip for now it was coring on a json doc test
|
|
return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr )
|
|
return true;
|
|
|
|
if ( ! m_doConsistencyTesting && strcmp(cr->m_coll,"qatest123") != 0 )
|
|
return true;
|
|
|
|
// if we had an old doc then our meta list will have removed
|
|
// stuff already in the database from indexing the old doc.
|
|
// so it will fail the parsing consistency check... because of
|
|
// the 'incremental indexing' algo above
|
|
// disable for now... just a second for testing cheatcc.com
|
|
if ( m_oldDoc && m_oldDocValid && g_conf.m_doIncrementalUpdating )
|
|
return true;
|
|
|
|
// if not test coll skip this
|
|
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
|
|
|
// title rec is null if we are reindexing an old doc
|
|
// and "unchanged" was true.
|
|
if ( m_unchangedValid && m_unchanged ) {
|
|
if ( ! m_titleRecBufValid ) return true;
|
|
if ( m_titleRecBuf.length()==0 ) return true;
|
|
}
|
|
|
|
// leave this uncommented so we can see if we are doing it
|
|
setStatus ( "doing consistency check" );
|
|
|
|
// log debug
|
|
log("spider: doing consistency check for %s",ptr_firstUrl);
|
|
|
|
// . set another doc from that title rec
|
|
// . do not keep on stack since so huge!
|
|
XmlDoc *doc ;
|
|
try { doc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return false;
|
|
}
|
|
mnew ( doc , sizeof(XmlDoc),"xmldcs");
|
|
|
|
|
|
if ( ! doc->set2 ( m_titleRecBuf.getBufStart() ,
|
|
-1 , cr->m_coll , NULL , m_niceness ,
|
|
// no we provide the same SpiderRequest so that
|
|
// it can add the same SpiderReply to the metaList
|
|
&m_sreq ) ) {
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
return false;
|
|
}
|
|
|
|
// . some hacks
|
|
// . do not look up title rec in titledb, assume it is new
|
|
doc->m_isIndexed = false;
|
|
doc->m_isIndexedValid = true;
|
|
|
|
// so we don't core in getRevisedSpiderRequest()
|
|
doc->m_firstIp = m_firstIp;
|
|
doc->m_firstIpValid = true;
|
|
|
|
// inherit this doc's tag rec since it has not called updateTagdb() yet
|
|
//doc->ptr_tagRecData = ptr_tagRecData;
|
|
//doc->size_tagRecData = size_tagRecData;
|
|
|
|
// getNewSpiderReply() calls getDownloadEndTime() which is not valid
|
|
// and causes the page to be re-downloaded, so stop that..!
|
|
doc->m_downloadEndTime = m_downloadEndTime;
|
|
doc->m_downloadEndTimeValid = true;
|
|
|
|
// inherit doledb key as well to avoid a core there
|
|
doc->m_doledbKey = m_doledbKey;
|
|
|
|
// skip the robots.txt lookup! that was causing this too block!
|
|
//doc->m_isAllowed = true;
|
|
//doc->m_isAllowedValid = true;
|
|
|
|
// do not get outlink info for this, that stuff is for adding outlinks
|
|
// to spiderdb, and tagdb may have changed. so we can't really compare
|
|
// spider recs! if this is false then the call to doc->getMetaList()
|
|
// blocks to lookup the tagdb and titledb recs for each outlink!
|
|
// therefore, set it to true!
|
|
//doc->m_isInjecting = true;
|
|
// mdw: shouldn't this have the same effect?
|
|
//doc->m_spiderLinks2 = false;
|
|
//doc->m_spiderLinksValid = true;
|
|
|
|
// flag it
|
|
doc->m_doingConsistencyCheck = true;
|
|
|
|
// get get its metalist. rv = return value
|
|
char *rv = doc->getMetaList ( );
|
|
|
|
// sanity check - compare urls
|
|
if ( doc->m_firstUrl.m_ulen != m_firstUrl.m_ulen){char *xx=NULL;*xx=0;}
|
|
|
|
// error setting it?
|
|
if ( ! rv ) {
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// free it
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
// error
|
|
return false;
|
|
}
|
|
// blocked? that is not allowed
|
|
if ( rv == (void *)-1 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// compare with the old list
|
|
char *list1 = m_metaList;
|
|
int32_t listSize1 = m_metaListSize;
|
|
|
|
char *list2 = doc->m_metaList;
|
|
int32_t listSize2 = doc->m_metaListSize;
|
|
|
|
|
|
// show it for now
|
|
//log("build: printing meta list 1");
|
|
//printMetaList(list1,list1+listSize1,NULL);
|
|
//log("build: printing meta list 2");
|
|
//printMetaList(list2,list2+listSize2,NULL);
|
|
|
|
|
|
// do a compare
|
|
HashTableX ht1;
|
|
HashTableX ht2;
|
|
|
|
ht1.set ( sizeof(key224_t),sizeof(char *),
|
|
262144,NULL,0,false,m_niceness,"xmlht1");
|
|
ht2.set ( sizeof(key224_t),sizeof(char *),
|
|
262144,NULL,0,false,m_niceness,"xmlht2");
|
|
|
|
// format of a metalist... see XmlDoc::addTable() where it adds keys
|
|
// from a table into the metalist
|
|
// <nosplitflag|rdbId><key><dataSize><data>
|
|
// where nosplitflag is 0x80
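// . e.g. a posdb record in the list is just the rdbId byte plus its
//   fixed-size 18-byte key (no dataSize/data), while a variable-size rdb
//   record is followed by a 4-byte dataSize and then the data itself; the
//   parse loop in printMetaList() below walks the list exactly this way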
|
|
char *p1 = list1;
|
|
char *p2 = list2;
|
|
char *pend1 = list1 + listSize1;
|
|
char *pend2 = list2 + listSize2;
|
|
|
|
// see if each key in list1 is in list2
|
|
if ( ! hashMetaList ( &ht1 , p1 , pend1 , false ) ) {
|
|
char *xx=NULL;*xx=0;
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
return log("doc: failed consistency test for %s",ptr_firstUrl);
|
|
}
|
|
if ( ! hashMetaList ( &ht2 , p2 , pend2 , false ) ) {
|
|
char *xx=NULL;*xx=0;
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
return log("doc: failed consistency test for %s",ptr_firstUrl);
|
|
}
|
|
|
|
// . now make sure each list matches the other
|
|
// . first scan the guys in "p1" and make sure in "ht2"
|
|
hashMetaList ( &ht2 , p1 , pend1 , true );
|
|
// . second scan the guys in "p2" and make sure in "ht1"
|
|
hashMetaList ( &ht1 , p2 , pend2 , true );
|
|
|
|
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
|
|
delete ( doc );
|
|
|
|
log ("spider: passed consistency test for %s",ptr_firstUrl );
|
|
|
|
// no serious error, although there might be an inconsistency
|
|
return true;
|
|
}
|
|
|
|
int32_t XmlDoc::printMetaList ( ) {
|
|
|
|
SafeBuf sb;
|
|
printMetaList ( m_metaList ,
|
|
m_metaList + m_metaListSize ,
|
|
&sb );
|
|
fprintf(stderr,"%s\n",sb.getBufStart());
|
|
return 0;
|
|
}
|
|
|
|
|
|
#define TABLE_ROWS 25
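// rows per html table chunk in printMetaList(); a fresh <table> is started
// every TABLE_ROWS rows so one huge table does not bog down the browser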
|
|
|
|
// print this also for page parser output!
|
|
void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
|
|
|
|
verifyMetaList ( p , pend , false );
|
|
|
|
SafeBuf tmp;
|
|
if ( ! sb ) sb = &tmp;
|
|
|
|
char *hdr =
|
|
"<table border=1>\n"
|
|
"<tr>"
|
|
"<td><b>rdb</b></td>"
|
|
"<td><b>del?</b></td>"
|
|
"<td><b>shardByTermId?</b></td>"
|
|
// illustrates key size
|
|
"<td><b>key</b></td>"
|
|
// break it down. based on rdb, of course.
|
|
"<td><b>desc</b></td>"
|
|
"</tr>\n" ;
|
|
|
|
sb->safePrintf("%s",hdr);
|
|
|
|
int32_t recSize = 0;
|
|
int32_t rcount = 0;
|
|
for ( ; p < pend ; p += recSize ) {
|
|
// get rdbid
|
|
uint8_t rdbId = *p & 0x7f;
|
|
// skip
|
|
p++;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// point to it
|
|
char *rec = p;
|
|
// init this
|
|
int32_t recSize = ks;
|
|
// convert into a key128_t, the biggest possible key
|
|
//key224_t k ;
|
|
char k[MAX_KEY_BYTES];
|
|
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
|
|
//k.setMin();
|
|
gbmemcpy ( &k , p , ks );
|
|
// is it a negative key?
|
|
char neg = false;
|
|
if ( ! ( p[0] & 0x01 ) ) neg = true;
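// (convention used throughout: low bit of the key set = positive/add key,
// low bit clear = negative/delete "annihilator" key)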
|
|
// this is now a bit in the posdb key so we can rebalance
|
|
char shardByTermId = false;
|
|
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
|
|
shardByTermId = true;
|
|
// skip it
|
|
p += ks;
|
|
// get datasize
|
|
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// . always zero if key is negative
|
|
// . this is not the case unfortunately...
|
|
if ( neg ) dataSize = 0;
|
|
// if -1, read it in
|
|
if ( dataSize == -1 ) {
|
|
dataSize = *(int32_t *)p;
|
|
// inc this
|
|
recSize += 4;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
}
|
|
// point to it
|
|
char *data = p;
|
|
// skip the data
|
|
p += dataSize;
|
|
// inc it
|
|
recSize += dataSize;
|
|
// NULL it for negative keys
|
|
if ( dataSize == 0 ) data = NULL;
|
|
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS) == 0 )
|
|
sb->safePrintf("<!--ignore--></table>%s",hdr);
|
|
|
|
|
|
//if ( rdbId != RDB_LINKDB ) continue;
|
|
|
|
// print dbname
|
|
sb->safePrintf("<tr>");
|
|
char *dn = getDbnameFromId ( rdbId );
|
|
sb->safePrintf("<td>%s</td>",dn);
|
|
|
|
if ( neg ) sb->safePrintf("<td>D</td>");
|
|
else sb->safePrintf("<td> </td>");
|
|
|
|
if ( shardByTermId ) sb->safePrintf("<td>shardByTermId</td>");
|
|
else sb->safePrintf("<td> </td>");
|
|
|
|
sb->safePrintf("<td><nobr>%s</nobr></td>", KEYSTR(k,ks));
|
|
|
|
|
|
|
|
if ( rdbId == RDB_POSDB ) {
|
|
// get termid et al
|
|
key144_t *k2 = (key144_t *)k;
|
|
int64_t tid = g_posdb.getTermId(k2);
|
|
//uint8_t score8 = g_posdb.getScore ( *k2 );
|
|
//uint32_t score32 = score8to32 ( score8 );
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
"termId=%020"UINT64" "
|
|
//"score8=%03"UINT32" "
|
|
//"score32=%010"UINT32""
|
|
"</td>"
|
|
,(uint64_t)tid
|
|
//(int32_t)score8,
|
|
//(int32_t)score32
|
|
);
|
|
}
|
|
else if ( rdbId == RDB_DATEDB ) {
|
|
// get termid et al
|
|
key128_t *k2 = (key128_t *)k;
|
|
int64_t tid = g_datedb.getTermId(k2);
|
|
// use indexdb's function for this. should be the same
|
|
uint8_t score8 = g_indexdb.getScore ( (char *)k );
|
|
int32_t date = g_datedb.getDate ( k2 );
|
|
uint32_t score32 = score8to32 ( score8 );
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
"termId=%020"UINT64" "
|
|
"date=%010"UINT32" "
|
|
"score8=%03"UINT32" "
|
|
"score32=%010"UINT32""
|
|
"</td>",
|
|
tid,
|
|
date,
|
|
(int32_t)score8,
|
|
(int32_t)score32);
|
|
}
|
|
// key parsing logic from Sections.cpp::gotSectiondbList()
|
|
else if ( rdbId == RDB_SECTIONDB ) {
|
|
key128_t *k2 = (key128_t *)k;
|
|
int32_t secType = g_indexdb.getScore ( (char *)k2);
|
|
int32_t tagHash = g_datedb.getDate ( k2 );
|
|
int64_t tid = g_datedb.getTermId(k2);
|
|
int64_t siteHash = tid; // not quite 64 bits
|
|
SectionVote *sv = (SectionVote *)data;
|
|
char *dd = "tagHash32";
|
|
if ( secType == SV_TAGCONTENTHASH )
|
|
dd ="tagcontentHash32";
|
|
if ( secType == SV_TAGPAIRHASH )
|
|
dd = "tagPairHash32";
|
|
// sanity check
|
|
int32_t ds = sizeof(SectionVote);
|
|
if (!neg&&dataSize!=ds){char*xx=NULL;*xx=0;}
|
|
if ( neg&&dataSize!=0 ){char*xx=NULL;*xx=0;}
|
|
float score = 0.0;
|
|
float numSampled = 0.0;
|
|
if ( data ) {
|
|
score = sv->m_score;
|
|
numSampled = sv->m_numSampled;
|
|
}
|
|
sb->safePrintf("<td>"
|
|
"<nobr>"
|
|
"siteHash48=0x%016"XINT64" "
|
|
"%s=0x%08"XINT32" "
|
|
"secType=%s "
|
|
"score=%.02f "
|
|
"numSampled=%.02f"
|
|
"</nobr>"
|
|
"</td>",
|
|
siteHash,
|
|
dd,tagHash,
|
|
getSectionTypeAsStr(secType),
|
|
score,
|
|
numSampled);
|
|
}
|
|
else if ( rdbId == RDB_LINKDB ) {
|
|
key224_t *k2 = (key224_t *)k;
|
|
int64_t linkHash=g_linkdb.getLinkeeUrlHash64_uk(k2);
|
|
int32_t linkeeSiteHash = g_linkdb.getLinkeeSiteHash32_uk(k2);
|
|
int32_t linkerSiteHash = g_linkdb.getLinkerSiteHash32_uk(k2);
|
|
char linkSpam = g_linkdb.isLinkSpam_uk (k2);
|
|
int32_t siteRank = g_linkdb.getLinkerSiteRank_uk (k2);
|
|
//int32_t hopCount = g_linkdb.getLinkerHopCount_uk (k2);
|
|
//int32_t ip24 = g_linkdb.getLinkerIp24_uk (k2);
|
|
int32_t ip32 = g_linkdb.getLinkerIp_uk (k2);
|
|
int64_t docId = g_linkdb.getLinkerDocId_uk (k2);
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
"<nobr>"
|
|
"linkeeSiteHash32=0x%08"XINT32" "
|
|
"linkeeUrlHash=0x%016"XINT64" "
|
|
"linkSpam=%"INT32" "
|
|
"siteRank=%"INT32" "
|
|
//"hopCount=%03"INT32" "
|
|
"sitehash32=0x%"XINT32" "
|
|
"IP32=%s "
|
|
"docId=%"UINT64""
|
|
"</nobr>"
|
|
"</td>",
|
|
linkeeSiteHash,
|
|
linkHash,
|
|
(int32_t)linkSpam,
|
|
siteRank,
|
|
//hopCount,
|
|
linkerSiteHash,
|
|
iptoa(ip32),
|
|
docId);
|
|
|
|
}
|
|
else if ( rdbId == RDB_CLUSTERDB ) {
|
|
key128_t *k2 = (key128_t *)k;
|
|
char *r = (char *)k2;
|
|
int32_t siteHash26 = g_clusterdb.getSiteHash26 ( r );
|
|
char lang = g_clusterdb.getLanguage ( r );
|
|
int64_t docId = g_clusterdb.getDocId ( r );
|
|
char ff = g_clusterdb.getFamilyFilter ( r );
|
|
// sanity check
|
|
if(dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td>"
|
|
// 26 bit site hash
|
|
"siteHash26=0x%08"XINT32" "
|
|
"family=%"INT32" "
|
|
"lang=%03"INT32" "
|
|
"docId=%"UINT64""
|
|
"</td>",
|
|
siteHash26 ,
|
|
(int32_t)ff,
|
|
(int32_t)lang,
|
|
docId );
|
|
}
|
|
// key parsing logic taken from Address::makePlacedbKey
|
|
else if ( rdbId == RDB_PLACEDB ) {
|
|
key128_t *k2 = (key128_t *)k;
|
|
int64_t bigHash = g_placedb.getBigHash ( k2 );
|
|
int64_t docId = g_placedb.getDocId ( k2 );
|
|
int32_t snh = g_placedb.getStreetNumHash ( k2 );
|
|
//int32_t smallHash = g_placedb.getSmallHash ( k2 );
|
|
// sanity check
|
|
if(!neg &&dataSize<=0){char*xx=NULL;*xx=0;}
|
|
if( neg &&dataSize!=0){char*xx=NULL;*xx=0;}
|
|
sb->safePrintf("<td><nobr>"
|
|
"bigHash64=0x%016"XINT64" "
|
|
"docId=%"UINT64" "
|
|
"streetNumHash25=0x%08"XINT32" "
|
|
"dataSize=%010"INT32" "
|
|
"address=%s"
|
|
"</nobr>"
|
|
"</td>",
|
|
bigHash,
|
|
docId,
|
|
snh,
|
|
dataSize ,
|
|
data );
|
|
}
|
|
// key parsing logic taken from Address::makePlacedbKey
|
|
else if ( rdbId == RDB_SPIDERDB ) {
|
|
sb->safePrintf("<td><nobr>");
|
|
key128_t *k2 = (key128_t *)k;
|
|
if ( g_spiderdb.isSpiderRequest(k2) ) {
|
|
SpiderRequest *sreq = (SpiderRequest *)rec;
|
|
sreq->print ( sb );
|
|
}
|
|
else {
|
|
SpiderReply *srep = (SpiderReply *)rec;
|
|
srep->print ( sb );
|
|
}
|
|
sb->safePrintf("</nobr></td>");
|
|
}
|
|
else if ( rdbId == RDB_DOLEDB ) {
|
|
key_t *k2 = (key_t *)k;
|
|
sb->safePrintf("<td><nobr>");
|
|
sb->safePrintf("priority=%"INT32" "
|
|
"spidertime=%"UINT32" "
|
|
//"uh48=%"XINT64" "
|
|
"isdel=%"INT32"",
|
|
g_doledb.getPriority(k2),
|
|
(uint32_t)g_doledb.getSpiderTime(k2),
|
|
//g_doledb.getUrlHash48(k2),
|
|
g_doledb.getIsDel(k2));
|
|
sb->safePrintf("</nobr></td>");
|
|
}
|
|
else if ( rdbId == RDB_TITLEDB ) {
|
|
//XmlDoc tr;
|
|
//SafeBuf tmp;
|
|
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
|
|
// print each offset and size for the variable crap
|
|
sb->safePrintf("<td><nobr>titlerec datasize=%"INT32" "
|
|
//"sizeofxmldoc=%"INT32" "
|
|
//"hdrSize=%"INT32" "
|
|
//"version=%"INT32" "
|
|
//"%s"
|
|
"</nobr></td>",
|
|
dataSize
|
|
//(int32_t)sizeof(XmlDoc),
|
|
//(int32_t)tr.m_headerSize,
|
|
//(int32_t)tr.m_version,
|
|
//tmp.getBufStart());
|
|
);
|
|
}
|
|
//else if ( rdbId == RDB_REVDB ) {
|
|
// sb->safePrintf("<td><nobr>revdb datasize=%"INT32" ",
|
|
// dataSize);
|
|
//}
|
|
else if ( rdbId == RDB_TAGDB ) {
|
|
Tag *tag = (Tag *)rec;
|
|
sb->safePrintf("<td><nobr>");
|
|
if ( rec[0] & 0x01 ) tag->printToBuf(sb);
|
|
else sb->safePrintf("negativeTagKey");
|
|
sb->safePrintf("</nobr></td>");
|
|
}
|
|
else {
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// close it up
|
|
sb->safePrintf("</tr>\n");
|
|
|
|
/*
|
|
// hash the data into a int32_t for hash table
|
|
char *ns = "no";
|
|
if ( noSplit ) ns = "yes";
|
|
char *del = "";
|
|
if ( neg ) del = " (delete)";
|
|
|
|
if ( ks==12 ) {
|
|
key_t *k2 = (key_t *)k;
|
|
int64_t tid = g_indexdb.getTermId(k2);
|
|
uint8_t score8 = g_indexdb.getScore ( *k2 );
|
|
uint32_t score32 = score8to32 ( score8 );
|
|
log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
|
|
"tid=%"UINT64" score8=%"UINT32" score32=%"UINT32" nosplit=%s%s",
|
|
count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,tid ,(int32_t)score8,(int32_t)score32,
|
|
ns,del);
|
|
}
|
|
else {
|
|
log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
|
|
"nosplit=%s%s",
|
|
count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,ns,del);
|
|
}
|
|
*/
|
|
|
|
}
|
|
sb->safePrintf("</table>\n");
|
|
|
|
if ( sb == &tmp )
|
|
sb->print();
|
|
}
|
|
|
|
|
|
bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// do not do this if not test collection for now
|
|
if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
|
|
|
log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");
|
|
|
|
// store each record in the list into the send buffers
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// first is rdbId
|
|
//char rdbId = -1; // m_rdbId;
|
|
//if ( rdbId < 0 ) rdbId = *p++;
|
|
uint8_t rdbId = *p++;
|
|
// mask off rdbId
|
|
rdbId &= 0x7f;
|
|
// get the key of the current record
|
|
//char *key = p;
|
|
// negative key?
|
|
bool del ;
|
|
if ( *p & 0x01 ) del = false;
|
|
else del = true;
|
|
// must always be negative if deleting
|
|
// spiderdb is exempt because we add a spiderreply that is
|
|
// positive and a spiderdoc
|
|
// no, this is no longer the case because we add spider
|
|
// replies to the index when deleting or rejecting a doc.
|
|
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// get the key size. a table lookup in Rdb.cpp.
|
|
int32_t ks ;
|
|
if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 ) {
|
|
ks = 18;
|
|
// no compress bits set!
|
|
if ( p[0] & 0x06 ) { char*xx=NULL;*xx=0; }
|
|
// alignment bit set or cleared
|
|
if ( ! ( p[1] & 0x02 ) ) { char *xx=NULL;*xx=0; }
|
|
if ( ( p[7] & 0x02 ) ) { char *xx=NULL;*xx=0; }
|
|
int64_t docId = g_posdb.getDocId(p);
|
|
if ( docId != m_docId && !cr->m_indexSpiderReplies) {
|
|
log("xmldoc: %"INT64" != %"INT64""
|
|
, docId
|
|
, m_docId );
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// else
|
|
// log("xmldoc: %"INT64" == %"INT64""
|
|
// , docId
|
|
// , m_docId );
|
|
|
|
// uint64_t termId = g_posdb.getTermId(p);
|
|
// if ( termId == 59194288760543LL ) {
|
|
// log("xmldoc: debug");
|
|
// //char *xx=NULL;*xx=0;
|
|
// }
|
|
}
|
|
else if ( rdbId == RDB_DATEDB ) ks = 16;
|
|
else ks = getKeySizeFromRdbId ( rdbId );
|
|
// sanity
|
|
if ( ks < 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
|
|
// another check
|
|
Rdb *rdb = getRdbFromId(rdbId);
|
|
if ( ! rdb ) { char *xx=NULL;*xx=0; }
|
|
if ( rdb->m_ks < 12 || rdb->m_ks > MAX_KEY_BYTES ) {
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
// special linkdb check
|
|
//if ( rdbId == RDB_LINKDB ) {
|
|
// // parse it up
|
|
// key192_t *k = (key192_t *)p;
|
|
// unsigned char hc = g_linkdb.getLinkerHopCount_uk(k);
|
|
// if ( hc != 0 ){ char *xx=NULL;*xx=0; }
|
|
//}
|
|
|
|
char *rec = p;
|
|
|
|
// set this
|
|
//bool split = true;
|
|
//if(rdbId == RDB_POSDB && g_posdb.isShardedByTermId(p) )
|
|
// split =false;
|
|
// skip key
|
|
p += ks;
|
|
// . if key belongs to same group as firstKey then continue
|
|
// . titledb now uses last bits of docId to determine groupId
|
|
// . but uses the top 32 bits of key still
|
|
// . spiderdb uses last 64 bits to determine groupId
|
|
// . tfndb now is like titledb(top 32 bits are top 32 of docId)
|
|
//uint32_t gid = getGroupId ( rdbId , key , split );
|
|
// get the record, is -1 if variable. a table lookup.
|
|
int32_t dataSize;
|
|
if ( rdbId == RDB_POSDB || rdbId==RDB2_POSDB2)dataSize=0;
|
|
else if ( rdbId == RDB_DATEDB ) dataSize = 0;
|
|
//else if ( rdbId == RDB_REVDB ) dataSize = -1;
|
|
else if ( rdbId == RDB2_POSDB2 ) dataSize = 0;
|
|
else if ( rdbId == RDB2_DATEDB2 ) dataSize = 0;
|
|
//else if ( rdbId == RDB2_REVDB2 ) dataSize = -1;
|
|
else dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// . for delete never stores the data
|
|
// . you can have positive keys without any dataSize member
|
|
// when they normally should have one, like titledb
|
|
if ( forDelete ) dataSize = 0;
|
|
// . negative keys have no data
|
|
// . this is not the case unfortunately
|
|
if ( del ) dataSize = 0;
|
|
|
|
// ensure spiderdb request recs have data/url in them
|
|
if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
|
|
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)rec ) &&
|
|
! forDelete &&
|
|
! del &&
|
|
dataSize == 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// if variable read that in
|
|
if ( dataSize == -1 ) {
|
|
// -1 means to read it in
|
|
dataSize = *(int32_t *)p;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip dataSize
|
|
p += 4;
|
|
}
|
|
// skip over the data, if any
|
|
p += dataSize;
|
|
// breach us?
|
|
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// must be exactly equal to end
|
|
if ( p != pend ) return false;
|
|
return true;
|
|
|
|
/*
|
|
int32_t recSize = 0;
|
|
int32_t count = 0;
|
|
for ( ; p < pend ; p += recSize , count++ ) {
|
|
// get rdbid
|
|
char rdbId = *p & 0x7f;
|
|
// get nosplit flag
|
|
char noSplit = *p & 0x80;
|
|
// skip
|
|
p++;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// sanity
|
|
if ( ks > 16 ) { char *xx=NULL;*xx=0;}
|
|
// negative key?
|
|
bool del;
|
|
if ( *p & 0x01 ) del = false;
|
|
else del = true;
|
|
// convert into a key128_t, the biggest possible key
|
|
char k[16];
|
|
gbmemcpy ( &k , p , ks );
|
|
// skip it
|
|
p += ks;
|
|
// flip this
|
|
char split = ! noSplit;
|
|
// test it
|
|
g_hostdb.getGroupId(rdbId,k,split);
|
|
// if negative, no data size allowed
|
|
if ( ( k[0] & 0x01 ) == 0x00 ) continue;
|
|
// get datasize
|
|
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// no negative key has data
|
|
if ( del ) dataSize = 0;
|
|
// if -1, read it in
|
|
if ( dataSize == -1 ) {
|
|
dataSize = *(int32_t *)p;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
}
|
|
// skip the data
|
|
p += dataSize;
|
|
}
|
|
*/
|
|
}
|
|
|
|
bool XmlDoc::hashMetaList ( HashTableX *ht ,
|
|
char *p ,
|
|
char *pend ,
|
|
bool checkList ) {
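// . if checkList is false: just hash each rec's key -> rec ptr into "ht"
// . if checkList is true: verify every rec in [p,pend) is present in "ht"
//   with byte-identical data, logging the missing/mismatched rec otherwise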
|
|
int32_t recSize = 0;
|
|
int32_t count = 0;
|
|
for ( ; p < pend ; p += recSize , count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get rdbid
|
|
char rdbId = *p & 0x7f;
|
|
// skip rdb id
|
|
p++;
|
|
// save that
|
|
char *rec = p;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// sanity check
|
|
if ( ks > 28 ) { char *xx=NULL;*xx=0; }
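// (28 bytes = key224_t, the largest key type we use here, e.g. linkdb keys)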
|
|
// is it a delete key?
|
|
char del ;
|
|
if ( ( p[0] & 0x01 ) == 0x00 ) del = true;
|
|
else del = false;
|
|
// convert into a key128_t, the biggest possible key
|
|
char k[MAX_KEY_BYTES];//key128_t k ;
|
|
// zero out
|
|
KEYMIN(k,MAX_KEY_BYTES);
|
|
//k.setMin();
|
|
gbmemcpy ( k , p , ks );
|
|
// skip it
|
|
p += ks;
|
|
// if negative, no data size allowed -- no
|
|
if ( del ) continue;
|
|
// get datasize
|
|
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
|
|
// if -1, read it in
|
|
if ( dataSize == -1 ) {
|
|
dataSize = *(int32_t *)p;
|
|
// sanity check
|
|
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
}
|
|
// hash the data into a int32_t for hash table
|
|
//int32_t h32 = 0;
|
|
//h32 = hash32 ( p , dataSize );
|
|
// do not allow 0
|
|
//if ( h32 == 0 ) h32 = 1;
|
|
// skip the data
|
|
p += dataSize;
|
|
// ignore spiderdb recs for parsing consistency check
|
|
if ( rdbId == RDB_SPIDERDB ) continue;
|
|
if ( rdbId == RDB2_SPIDERDB2 ) continue;
|
|
// ignore tagdb as well!
|
|
if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;
|
|
// skip revdb for now too
|
|
//if ( rdbId == RDB_REVDB ) continue;
|
|
// set our rec size, includes key/dataSize/data
|
|
int32_t recSize = p - rec;
|
|
// debug point
|
|
//if ( *(uint64_t *)k == 4828936067112479745LL )
|
|
// log("hey");
|
|
// if just adding, do it
|
|
if ( ! checkList ) {
|
|
// we now store ptr to the rec, not hash!
|
|
if ( ! ht->addKey ( k , &rec ) ) return false;
|
|
continue;
|
|
}
|
|
// check to see if this rec is in the provided hash table
|
|
int32_t slot = ht->getSlot ( k );
|
|
// bitch if not found
|
|
if ( slot < 0 && ks==12 ) {
|
|
key144_t *k2 = (key144_t *)k;
|
|
int64_t tid = g_posdb.getTermId(k2);
|
|
char shardByTermId = g_posdb.isShardedByTermId(k2);
|
|
//uint8_t score8 = g_indexdb.getScore ( *k2 );
|
|
//uint32_t score32 = score8to32 ( score8 );
|
|
log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
|
|
"tid=%"UINT64" "
|
|
"key=%s "
|
|
//"score8=%"UINT32" score32=%"UINT32" "
|
|
"shardByTermId=%"INT32"",
|
|
count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,tid ,
|
|
//(int32_t)score8,(int32_t)score32,
|
|
KEYSTR(k2,ks),
|
|
(int32_t)shardByTermId);
|
|
// look it up
|
|
|
|
|
|
// shortcut
|
|
HashTableX *wt = m_wts;
|
|
|
|
// point to keys, termids?
|
|
//TermInfo **tp = (TermInfo **)wt->m_keys;
|
|
|
|
// now print the table we stored all we hashed into
|
|
for ( int32_t i = 0 ; i < wt->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( wt->m_flags[i] == 0 ) continue;
|
|
// get the TermInfo
|
|
TermDebugInfo *ti;
|
|
ti = (TermDebugInfo *)wt->getValueFromSlot(i);
|
|
// skip if not us
|
|
if((ti->m_termId & TERMID_MASK)!=tid)continue;
|
|
// got us
|
|
char *start = m_wbuf.getBufStart();
|
|
char *term = start + ti->m_termOff;
|
|
char *prefix = "";
|
|
if ( ti->m_prefixOff >= 0 ) {
|
|
prefix = start + ti->m_prefixOff;
|
|
//prefix[ti->m_prefixLen] = '\0';
|
|
}
|
|
// NULL term it
|
|
term[ti->m_termLen] = '\0';
|
|
// print it
|
|
log("parser: term=%s prefix=%s",//score32=%"INT32"",
|
|
term,prefix);//,(int32_t)ti->m_score32);
|
|
}
|
|
|
|
char *xx=NULL; *xx=0;
|
|
// count it for PageStats.cpp
|
|
g_stats.m_parsingInconsistencies++;
|
|
continue;
|
|
}
|
|
if ( slot < 0 && ks != 12 ) {
|
|
// if it is sectiondb and the orig doc did not
|
|
// add sectiondb recs because m_totalSiteVoters >=
|
|
// MAX_SITE_VOTERS, then that is ok!
|
|
if ( (rdbId == RDB_SECTIONDB ||
|
|
rdbId == RDB2_SECTIONDB2 ) &&
|
|
m_sectionsValid &&
|
|
m_sections.m_totalSiteVoters >= MAX_SITE_VOTERS )
|
|
continue;
|
|
log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
|
|
"ks=%s "
|
|
,count,getDbnameFromId(rdbId),(int32_t)ks,
|
|
(int32_t)dataSize,KEYSTR(k,ks));
|
|
char *xx=NULL; *xx=0;
|
|
// count it for PageStats.cpp
|
|
g_stats.m_parsingInconsistencies++;
|
|
continue;
|
|
}
|
|
// if in there, check the hashes
|
|
//int32_t h2 = *(int32_t *)ht->getValueFromSlot ( slot );
|
|
char *rec2 = *(char **)ht->getValueFromSlot ( slot );
|
|
// get his dataSize
|
|
int32_t dataSize2 = getDataSizeFromRdbId(rdbId);
|
|
// his keysize
|
|
int32_t ks2 = getKeySizeFromRdbId(rdbId);
|
|
// get his recsize
|
|
int32_t recSize2 = ks2 ;
|
|
// if -1 that is variable
|
|
if ( dataSize2 == -1 ) {
|
|
dataSize2 = *(int32_t *)(rec2+ks2);
|
|
recSize2 += 4;
|
|
}
|
|
// add it up
|
|
recSize2 += dataSize2;
|
|
// keep on chugging if they match
|
|
if ( recSize2==recSize && !memcmp(rec,rec2,recSize) ) continue;
|
|
// otherwise, bitch
|
|
char shardByTermId = false;
|
|
if ( rdbId == RDB_POSDB )
|
|
shardByTermId = g_posdb.isShardedByTermId(rec2);
|
|
log("build: data not equal for key=%s "
|
|
"rdb=%s splitbytermid=%"INT32" dataSize=%"INT32"",
|
|
KEYSTR(k,ks2),
|
|
getDbnameFromId(rdbId),(int32_t)shardByTermId,dataSize);
|
|
|
|
// print into here
|
|
SafeBuf sb1;
|
|
SafeBuf sb2;
|
|
|
|
// print it out
|
|
if ( rdbId == RDB_SPIDERDB ) {
|
|
// get rec
|
|
if ( g_spiderdb.isSpiderRequest((key128_t *)rec) ) {
|
|
SpiderRequest *sreq1 = (SpiderRequest *)rec;
|
|
SpiderRequest *sreq2 = (SpiderRequest *)rec2;
|
|
sreq1->print(&sb1);
|
|
sreq2->print(&sb2);
|
|
}
|
|
else {
|
|
SpiderReply *srep1 = (SpiderReply *)rec;
|
|
SpiderReply *srep2 = (SpiderReply *)rec2;
|
|
srep1->print(&sb1);
|
|
srep2->print(&sb2);
|
|
}
|
|
log("build: rec1=%s",sb1.getBufStart());
|
|
log("build: rec2=%s",sb2.getBufStart());
|
|
|
|
}
|
|
char *xx=NULL; *xx=0;
|
|
// count it for PageStats.cpp
|
|
g_stats.m_parsingInconsistencies++;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
bool checkRegex ( SafeBuf *regex ,
|
|
char *target ,
|
|
bool *boolVal ,
|
|
bool *boolValValid ,
|
|
int32_t *compileError ,
|
|
CollectionRec *cr ) {
|
|
|
|
if ( compileError ) *compileError = false;
|
|
|
|
if ( *boolValValid )
|
|
return *boolVal;
|
|
|
|
// if not using diffbot or there is no regex, it is ok
|
|
if ( regex->length() <= 0 ) {
|
|
*boolVal = true;
|
|
*boolValValid = true;
|
|
return boolVal;
|
|
}
|
|
|
|
// whip out the regex shit i guess...
|
|
regex_t buf;
|
|
// this will store the compiled regular expression into "buf"
|
|
int32_t ret = regcomp ( &buf ,
|
|
// the actual regular expression to compile
|
|
regex->getBufStart() ,
|
|
// some flags
|
|
REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB);
|
|
|
|
if ( ret ) {
|
|
//g_errno = ret;
|
|
if ( compileError ) *compileError = errno;
|
|
log("xmldoc: diffbot regcomp failed: %s. This should have "
|
|
"been tested before crawl was started. Ignoring.",
|
|
mstrerror(errno));
|
|
return true;
|
|
}
|
|
|
|
// now see if it is a match
|
|
if ( regexec(&buf,target,0,NULL,0) ) *boolVal = true;
|
|
else *boolVal = false;
|
|
|
|
*boolValValid = true;
|
|
return boolVal;
|
|
}
|
|
*/
|
|
|
|
// . should we send this url off to diffbot or processing?
|
|
// . if the url's downloaded content does not match the provided regex
|
|
// in m_diffbotPageProcessPattern, then we do not send the url to diffbot
|
|
// for processing
|
|
// . make sure this regex is pre-tested before starting the crawl
|
|
// so we know it compiles
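// note: the compiled url crawl regex (cr->m_ucr), when present, is what
// gets checked below; the plain url crawl pattern string is only tested
// with doesStringContainPattern() when no regex was supplied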
|
|
bool XmlDoc::doesUrlMatchDiffbotCrawlPattern() {
|
|
|
|
if ( m_matchesCrawlPatternValid )
|
|
return m_matchesCrawlPattern;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// get the compiled regular expressions
|
|
regex_t *ucr = &cr->m_ucr;
|
|
if ( ! cr->m_hasucr ) ucr = NULL;
|
|
|
|
if ( ! m_firstUrlValid ) return false;
|
|
|
|
|
|
m_matchesCrawlPatternValid = true;
|
|
m_matchesCrawlPattern = false;
|
|
|
|
Url *furl = getFirstUrl();
|
|
char *url = furl->getUrl();
|
|
|
|
// if we had a url crawl regex then regexec will return non-zero
|
|
// if our url does NOT match i guess
|
|
if ( ucr && regexec(ucr,url,0,NULL,0) )
|
|
return false;
|
|
|
|
// shortcut
|
|
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
|
|
if ( ucp && ! ucp[0] ) ucp = NULL;
|
|
|
|
// do not require a match on ucp if ucr is given
|
|
if ( ucp && ! ucr && ! doesStringContainPattern(url,ucp) )
|
|
return false;
|
|
|
|
m_matchesCrawlPattern = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
bool XmlDoc::doesUrlMatchDiffbotProcessPattern() {
|
|
return checkRegex ( &cr->m_diffbotUrlProcessPattern ,
|
|
m_firstUrl.m_url ,
|
|
&m_diffbotUrlProcessPatternMatch,
|
|
&m_diffbotUrlProcessPatternMatchValid,
|
|
NULL,
|
|
cr);
|
|
}
|
|
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
return checkRegex ( &cr->m_diffbotPageProcessPattern ,
|
|
ptr_utf8Content,
|
|
&m_diffbotPageProcessPatternMatch,
|
|
&m_diffbotPageProcessPatternMatchValid,
|
|
NULL,
|
|
cr);
|
|
}
|
|
*/
|
|
|
|
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
char *p = cr->m_diffbotPageProcessPattern.getBufStart();
|
|
// empty? no pattern matches everything.
|
|
if ( ! p ) return true;
|
|
if ( ! m_content ) return false;
|
|
// how many did we have?
|
|
return doesStringContainPattern ( m_content , p );
|
|
}
|
|
|
|
int32_t *XmlDoc::reindexJSONObjects ( int32_t *newTitleHashes,
|
|
int32_t numNewHashes ) {
|
|
return redoJSONObjects (newTitleHashes,numNewHashes,false );
|
|
}
|
|
|
|
int32_t *XmlDoc::nukeJSONObjects ( int32_t *newTitleHashes ,
|
|
int32_t numNewHashes ) {
|
|
return redoJSONObjects (newTitleHashes,numNewHashes,true );
|
|
}
|
|
|
|
// . returns ptr to status
|
|
// . diffbot uses this to remove the indexed json pages associated with
|
|
// a url. each json object is basically its own url. a json object
|
|
// url is the parent page's url with a -diffbotxyz%"UINT32" appended to it
|
|
// where %"INT32" is the object # starting at 0 and incrementing from there.
|
|
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
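// . e.g. a parent url of http://example.com/page (hypothetical) with a
//   json item whose title hash is 12345 would have been indexed under
//   the fake url http://example.com/page-diffbotxyz12345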
|
|
int32_t *XmlDoc::redoJSONObjects ( int32_t *newTitleHashes ,
|
|
int32_t numNewHashes ,
|
|
bool deleteFromIndex ) {
|
|
// use this
|
|
static int32_t s_return = 1;
|
|
// if none, we are done
|
|
if ( m_diffbotJSONCount <= 0 ) return &s_return;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// i was trying to re-index some diffbot json docs in the global
|
|
// index but it wasn't set as custom crawl
|
|
//if ( ! cr->m_isCustomCrawl ) return &s_return;
|
|
|
|
// already did it?
|
|
if ( m_joc >= m_diffbotJSONCount ) return &s_return;
|
|
|
|
// new guy here
|
|
if ( ! m_dx ) {
|
|
try { m_dx = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("xmldoc: failed to alloc m_dx");
|
|
return NULL;
|
|
}
|
|
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
|
|
}
|
|
|
|
//
|
|
// index the hashes of the latest diffbot json items for this parent
|
|
//
|
|
HashTableX dedup;
|
|
if ( ! dedup.set(4,0,numNewHashes*4,NULL,0,false,m_niceness,"njodt") )
|
|
return NULL;
|
|
for ( int32_t i = 0 ; i < numNewHashes ; i++ )
|
|
dedup.addKey ( &newTitleHashes[i] );
|
|
|
|
// get this old doc's current title hashes
|
|
int32_t numOldHashes;
|
|
int32_t *oldTitleHashes = getDiffbotTitleHashes ( &numOldHashes );
|
|
// sanity. should return right away without having to block
|
|
if ( oldTitleHashes == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
//int32_t count = m_diffbotJSONCount;
|
|
// sanity again
|
|
if ( numOldHashes != m_diffbotJSONCount ) {
|
|
log("build: can't remove json objects. "
|
|
"jsoncount mismatch %"INT32" != %"INT32
|
|
,numOldHashes
|
|
,m_diffbotJSONCount
|
|
);
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
//count = 0;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// scan down each
|
|
for ( ; m_joc < m_diffbotJSONCount ; ) {
|
|
// only NUKE the json items for which title hashes we lost
|
|
int32_t th32 = oldTitleHashes[m_joc];
|
|
// . if still in the new diffbot reply, do not DELETE!!!
|
|
// . if there was no title, it uses hash of entire object
|
|
if ( deleteFromIndex && dedup.isInTable(&th32) ) {
|
|
m_joc++;
|
|
continue;
|
|
}
|
|
// if m_dx has no url set, call set4 i guess
|
|
if ( ! m_dx->m_firstUrlValid ) {
|
|
// make the fake url for this json object for indexing
|
|
SafeBuf fakeUrl;
|
|
fakeUrl.set ( m_firstUrl.getUrl() );
|
|
// get his title hash32
|
|
//int32_t jsonTitleHash32 = titleHashes[m_joc];
|
|
// append -diffbotxyz%"UINT32" for fake url
|
|
fakeUrl.safePrintf("-diffbotxyz%"UINT32"",
|
|
(uint32_t)th32);
|
|
// set url of new xmldoc
|
|
if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
m_niceness ) )
|
|
// g_errno should be set!
|
|
return NULL;
|
|
// we are indexing json objects, don't use all these
|
|
m_dx->m_useClusterdb = false;
|
|
m_dx->m_useSpiderdb = false;
|
|
m_dx->m_useTagdb = false;
|
|
m_dx->m_usePlacedb = false;
|
|
m_dx->m_useLinkdb = false;
|
|
m_dx->m_isChildDoc = true;
|
|
m_dx->m_parentDocPtr = this;
|
|
// are we doing a query reindex or a nuke?
|
|
m_dx->m_deleteFromIndex = deleteFromIndex;//true;
|
|
// do not try to download this url
|
|
if ( ! deleteFromIndex )
|
|
m_dx->m_recycleContent = true;
|
|
// we need this because only m_dx->m_oldDoc will
|
|
// load from titledb and have it set
|
|
m_dx->m_isDiffbotJSONObject = true;
|
|
// for debug
|
|
char *str = "reindexing";
|
|
if ( deleteFromIndex ) str = "nuking";
|
|
log("xmldoc: %s %s",str,fakeUrl.getBufStart());
|
|
}
|
|
|
|
// when the indexdoc completes, or if it blocks, call us!
|
|
// we should just pass through here
|
|
m_dx->setCallback ( m_masterState , m_masterLoop );
|
|
|
|
// . this should ultimately load from titledb and not
|
|
// try to download the page since m_deleteFromIndex is
|
|
// set to true
|
|
// . if m_dx got its msg4 reply it ends up here, in which
|
|
// case do NOT re-call indexDoc() so check for
|
|
// m_listAdded.
|
|
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
|
|
return (int32_t *)-1;
|
|
// critical error on our part trying to index it?
|
|
// does not include timeouts or 404s, etc. mostly just
|
|
// OOM errors.
|
|
if ( g_errno ) return NULL;
|
|
// count as deleted
|
|
cr->m_localCrawlInfo.m_objectsDeleted++;
|
|
cr->m_globalCrawlInfo.m_objectsDeleted++;
|
|
cr->m_needsSave = true;
|
|
// but gotta set this crap back
|
|
//log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
|
|
// clear for next guy if there is one. clears
|
|
// m_dx->m_contentValid so the set4() can be called again above
|
|
m_dx->reset();
|
|
// try to do more json objects indexed from this parent doc
|
|
m_joc++;
|
|
}
|
|
|
|
// nuke it
|
|
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
|
|
delete ( m_dx );
|
|
m_dx = NULL;
|
|
|
|
return &s_return;
|
|
}
|
|
|
|
|
|
void getMetaListWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in get meta list wrapper" );
|
|
// get it
|
|
char *ml = THIS->getMetaList ( );
|
|
// sanity check
|
|
if ( ! ml && ! g_errno ) {
|
|
log("doc: getMetaList() returned NULL without g_errno");
|
|
sleep(5);
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// return if it blocked
|
|
if ( ml == (void *)-1 ) return;
|
|
// sanity check
|
|
if ( THIS->m_callback1 == getMetaListWrapper ) { char *xx=NULL;*xx=0;}
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
XmlDoc *g_od = NULL;
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . make a meta list to call Msg4::addMetaList() with
|
|
// . called by Msg14.cpp
|
|
// . a meta list is just a buffer of Rdb records of the following format:
|
|
// rdbid | rdbRecord
|
|
// . meta list does not include title rec since Msg14 adds that using Msg1
|
|
// . returns false and sets g_errno on error
|
|
// . sets m_metaList ptr and m_metaListSize
|
|
// . if "deleteIt" is true, we are a delete op on "old"
|
|
// . returns (char *)-1 if it blocks and will call your callback when done
|
|
// . generally only Repair.cpp changes these use* args to false
|
|
char *XmlDoc::getMetaList ( bool forDelete ) {
|
|
|
|
if ( m_metaListValid ) return m_metaList;
|
|
|
|
setStatus ( "getting meta list" );
|
|
|
|
// force it true?
|
|
// "forDelete" means we want the metalist to consist of "negative"
|
|
// keys that will annihilate with the positive keys in the index,
|
|
// posdb and the other rdbs, in order to delete them. "deleteFromIndex"
|
|
// means to just call getMetaList(true) on the m_oldDoc (old XmlDoc)
|
|
// which is built from the titlerec in Titledb. so don't confuse
|
|
// these two things. otherwise when i add this we were not adding
|
|
// the spiderreply of "Doc Force Deleted" from doing a query reindex
|
|
// and it kept repeating every time we started gb up.
|
|
//if ( m_deleteFromIndex ) forDelete = true;
|
|
|
|
// assume valid
|
|
m_metaList = "";
|
|
m_metaListSize = 0;
|
|
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block, this callback will be called
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getMetaListWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// returning from a handler that had an error?
|
|
if ( g_errno ) return NULL;
|
|
|
|
// if we are a spider status doc/titlerec and we are doing a rebuild
|
|
// operation, then keep it simple
|
|
if ( m_setFromTitleRec &&
|
|
m_useSecondaryRdbs &&
|
|
m_contentTypeValid &&
|
|
m_contentType == CT_STATUS ) {
|
|
// if not rebuilding posdb then done, list is empty since
|
|
// spider status docs do not contribute to linkdb, clusterdb,..
|
|
if ( ! m_usePosdb && ! m_useTitledb ) {
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
/////////////
|
|
//
|
|
// if user disabled spider status docs then delete the titlerec
|
|
// AND the posdb index list from our dbs for this ss doc
|
|
//
|
|
/////////////
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
if ( ! cr->m_indexSpiderReplies ) {
|
|
int64_t uh48 = m_firstUrl.getUrlHash48();
|
|
// delete title rec. true = delete?
|
|
key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
|
|
// shortcut
|
|
SafeBuf *ssb = &m_spiderStatusDocMetaList;
|
|
// add to list. and we do not add the spider status
|
|
// doc to posdb since we deleted its titlerec.
|
|
ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
|
|
ssb->safeMemcpy ( &tkey , sizeof(key_t) );
|
|
m_metaList = ssb->getBufStart();
|
|
m_metaListSize = ssb->getLength ();
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
// set safebuf to the json of the spider status doc
|
|
SafeBuf jd;
|
|
if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
|
|
return NULL;
|
|
// set m_spiderStatusDocMetaList from the json
|
|
if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
|
|
return NULL;
|
|
// TODO: support titledb rebuild as well
|
|
m_metaList = m_spiderStatusDocMetaList.getBufStart();
|
|
m_metaListSize = m_spiderStatusDocMetaList.getLength();
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
|
|
// any other indexing issue? hey! g_errno might not be set here
|
|
//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }
|
|
|
|
// a hacky thing
|
|
//XmlDoc *od = (XmlDoc *)1;
|
|
|
|
//bool diffbotEmptyReply = false;
|
|
|
|
/*
|
|
// fake this for diffbot?
|
|
if ( m_useDiffbot &&
|
|
! m_isDiffbotJSONObject &&
|
|
! doesUrlMatchDiffbotCrawlPattern() ) {
|
|
// flag it so we only add the SpiderReply to spiderdb and bail
|
|
//diffbotEmptyReply = true;
|
|
// we should not delete the json objects for this url
|
|
// from the index just because the user decided to remove
|
|
// it from her crawl
|
|
m_isIndexedValid = true;
|
|
m_isIndexed = false;
|
|
m_oldDocValid = true;
|
|
m_oldDoc = NULL;
|
|
}
|
|
*/
|
|
|
|
// if "rejecting" from index fake all this stuff
|
|
if ( m_deleteFromIndex ) {
|
|
// if we are using diffbot api and diffbot found no json objects
|
|
// or we never even processed the url, we really just want to
|
|
// add the SpiderReply for this url to spiderdb and nothing more.
|
|
// NO! we still want to store the page content in titledb
|
|
// so we can see if it has changed i guess
|
|
//diffbotEmptyReply ) {
|
|
// set these things to bogus values since we don't need them
|
|
m_contentHash32Valid = true;
|
|
m_contentHash32 = 0;
|
|
m_httpStatusValid = true;
|
|
m_httpStatus = 200;
|
|
m_siteValid = true;
|
|
ptr_site = "www.poopoo.com";
|
|
size_site = gbstrlen(ptr_site)+1;
|
|
m_isSiteRootValid = true;
|
|
m_isSiteRoot2 = 1;
|
|
//m_tagHash32Valid = true;
|
|
//m_tagHash32 = 0;
|
|
m_tagPairHash32Valid = true;
|
|
m_tagPairHash32 = 0;
|
|
m_siteHash64Valid = true;
|
|
m_siteHash64 = 0LL;
|
|
m_spiderLinksValid = true;
|
|
m_spiderLinks2 = 1;
|
|
m_langIdValid = true;
|
|
m_langId = 1;
|
|
m_siteNumInlinksValid = true;
|
|
m_siteNumInlinks = 0;
|
|
m_isIndexed = true;
|
|
m_isIndexedValid = true;
|
|
m_ipValid = true;
|
|
m_ip = 123456;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
//
|
|
// BEGIN MULTI DOC QUERY REINDEX HACK
|
|
//
|
|
// this fixes it so we can do a query reindex on fake child urls
|
|
// of their original parent multidoc url. the child urls are
|
|
// subsections of the original parent url that were indexed as
|
|
// separate documents with their own docid. if we try to do a
|
|
// query reindex on such things, detect it, and add the request
|
|
// for the original parent multidoc url.
|
|
//
|
|
if ( m_sreqValid && m_sreq.m_isPageReindex &&
|
|
// if it is a force delete, then allow the user to delete
|
|
// such diffbot reply json children documents, however.
|
|
! m_sreq.m_forceDelete ) {
|
|
// see if its diffbot json object
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
|
|
XmlDoc *od = *pod;
|
|
// if no old doc then we might have just been a diffbot
|
|
// json url that was directly injected into GLOBAL-INDEX
|
|
// like xyz.com/-diffbotxyz12345 (my format) or
|
|
if ( ! od ) goto skip9;
|
|
// if we are indexing a subdoc piece of a multidoc url
|
|
// then parentUrl should return non-NULL
|
|
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
|
|
if ( ! parentUrl && od->m_contentType != CT_STATUS )
|
|
goto skip9;
|
|
// in that case we need to reindex the parent url not the
|
|
// subdoc url, so make the spider reply gen quick
|
|
//SpiderReply *newsr = od->getFakeSpiderReply();
|
|
//if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
|
|
// use our ip though
|
|
//newsr->m_firstIp = od->m_firstIp;
|
|
// however we have to use our docid-based spider request
|
|
SpiderReply srep;
|
|
srep.reset();
|
|
// it MUST match up with original spider request so the
|
|
// lock key in Spider.cpp can unlock it. that lock key
|
|
// uses the "uh48" (48bit hash of the url) and "srep.m_firstIp"
|
|
// in this case the SpiderRequest, sreq, is docid-based because
|
|
// it was added through PageReindex.cpp (query reindex) so
|
|
// it will be the 48 bit hash64b() of the docid
|
|
// (see PageReindex.cpp)'s call to SpiderRequest::setKey()
|
|
srep.m_firstIp = m_sreq.m_firstIp;
|
|
// assume no error
|
|
srep.m_errCount = 0;
|
|
// do not inherit this one, it MIGHT HAVE CHANGED!
|
|
srep.m_siteHash32 = m_sreq.m_siteHash32;
|
|
srep.m_domHash32 = m_sreq.m_domHash32;
|
|
srep.m_spideredTime = getTimeGlobal();
|
|
int64_t uh48 = m_sreq.getUrlHash48();
|
|
int64_t parentDocId = 0LL;
|
|
srep.m_contentHash32 = 0;
|
|
// were we already in titledb before we started spidering?
|
|
// yes otherwise we would have called "goto skip9" above
|
|
srep.m_wasIndexed = 1;
|
|
srep.m_wasIndexedValid = 1;
|
|
srep.m_isIndexed = 1;
|
|
srep.m_isIndexedINValid = false;
|
|
srep.m_errCode = EREINDEXREDIR; // indexCode
|
|
srep.m_downloadEndTime = 0;
|
|
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
|
|
// lock of request needs to match that of reply so the
|
|
// reply, when received by Rdb.cpp which calls addSpiderReply()
|
|
// can unlock this url so it can be spidered again.
|
|
int64_t lock1 = makeLockTableKey(&m_sreq);
|
|
int64_t lock2 = makeLockTableKey(&srep);
|
|
if ( lock1 != lock2 ) { char *xx=NULL;*xx=0; }
|
|
// make a fake spider reply so this docid-based spider
|
|
// request is not used again
|
|
//SpiderReply srep;
|
|
// store the rdbid
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
if ( ! m_zbuf.pushChar(rd) )
|
|
return NULL;
|
|
// store that reply to indicate this spider request has
|
|
// been fulfilled!
|
|
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
|
|
return NULL;
|
|
|
|
// but also store a new spider request for the parent url
|
|
SpiderRequest ksr;
|
|
int64_t pd;
|
|
|
|
// skip if doc is a spider status "document". their docids
|
|
// often get added during a query reindex but we should ignore
|
|
// them completely.
|
|
if ( od->m_contentType == CT_STATUS )
|
|
goto returnList;
|
|
|
|
//goto returnList;
|
|
|
|
// complain
|
|
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
|
|
log("build: doing query reindex but diffbot api "
|
|
"url is not set in spider controls");
|
|
// just copy original request
|
|
gbmemcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
|
|
// do not spider links, it's a page reindex of a multidoc url
|
|
ksr.m_avoidSpiderLinks = 1;
|
|
// avoid EDOCUNCHANGED
|
|
ksr.m_ignoreDocUnchangedError = 1;
|
|
// no longer docid based we set it to parentUrl
|
|
ksr.m_urlIsDocId = 0;
|
|
// but consider it a manual add. this should already be set.
|
|
ksr.m_isPageReindex = 1;
|
|
// but it is not docid based, so overwrite the docid
|
|
// in ksr.m_url with the parent multidoc url. it \0 terms it.
|
|
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
|
|
// this must be valid
|
|
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
// set the key, ksr.m_key. isDel = false
|
|
// fake docid
|
|
pd = g_titledb.getProbableDocId(parentUrl);
|
|
ksr.setKey ( m_sreq.m_firstIp, pd , false );
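// (a probable docid computed from the parent url is enough to form the
// spiderdb request key here; we don't know the parent's real docid yet)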
|
|
// store this
|
|
if ( ! m_zbuf.pushChar(rd) )
|
|
return NULL;
|
|
// then the request
|
|
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
|
|
return NULL;
|
|
returnList:
|
|
// prevent cores in indexDoc()
|
|
m_indexCode = EREINDEXREDIR;
|
|
m_indexCodeValid = true;
|
|
// for now we set this crap
|
|
m_metaList = m_zbuf.getBufStart();
|
|
m_metaListSize = m_zbuf.length();
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
//
|
|
// END DIFFBOT OBJECT QUERY REINDEX HACK
|
|
//
|
|
|
|
|
|
skip9:
|
|
// get our checksum
|
|
int32_t *plainch32 = getContentHash32();
|
|
if ( ! plainch32 || plainch32 == (void *)-1 ) return (char *)plainch32;
|
|
|
|
// get this too
|
|
int16_t *hs = getHttpStatus ();
|
|
if ( ! hs || hs == (void *)-1 ) return (char *)hs;
|
|
|
|
// make sure site is valid
|
|
char *site = getSite();
|
|
if ( ! site || site == (void *)-1 ) return (char *)site;
|
|
|
|
// this seems to be an issue as well for "unchanged" block below
|
|
char *isr = getIsSiteRoot();
|
|
if ( ! isr || isr == (void *)-1 ) return (char *)isr;
|
|
|
|
// get hash of all tags from tagdb that affect what we index
|
|
//int32_t *tagHash = getTagHash32 ( );
|
|
//if ( ! tagHash || tagHash == (void *)-1 ) return (char *)tagHash;
|
|
|
|
int64_t *sh64 = getSiteHash64();
|
|
if ( ! sh64 || sh64 == (void *)-1 ) return (char *)sh64;
|
|
|
|
// make sure docid valid
|
|
int64_t *mydocid = getDocId();
|
|
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
|
|
|
|
// . get the old version of our XmlDoc from the previous spider time
|
|
// . set using the old title rec in titledb
|
|
// . should really not do any more than set m_titleRec...
|
|
// . should not even uncompress it!
|
|
// . getNewSpiderReply() will use this to set the reply if
|
|
// m_indexCode == EDOCUNCHANGED...
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
|
|
// point to the old xml doc if no error, etc.
|
|
XmlDoc *od = *pod;
|
|
|
|
// check if we are already indexed
|
|
char *isIndexed = getIsIndexed ();
|
|
if ( ! isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
|
|
// do not delete anything in these cases, but do remove us from
|
|
// spiderdb, and from tfndb (except for EDOCNOTNEW)
|
|
//if ( m_indexCode == EDOCNOTNEW || m_indexCode == EDOCNOTOLD )
|
|
// od = NULL;
|
|
|
|
// why call this way down here? it ends up downloading the doc!
|
|
int32_t *indexCode = getIndexCode();
|
|
if ( ! indexCode || indexCode ==(void *)-1) return (char *)indexCode;
|
|
// sanity check
|
|
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this means to abandon the injection
|
|
if ( *indexCode == EABANDONED ||
|
|
*indexCode == EHITCRAWLLIMIT ||
|
|
*indexCode == EHITPROCESSLIMIT ) {
|
|
m_metaList = (char *)0x123456;
|
|
m_metaListSize = 0;
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
|
|
// if diffbot reply is empty, don't bother adding anything except
|
|
// for the spider reply... reply might be "-1" too!
|
|
//if ( m_useDiffbot &&
|
|
// ! m_isDiffbotJSONObject &&
|
|
// m_diffbotReplyValid &&
|
|
// m_diffbotReply.length() <= 3 )
|
|
// diffbotEmptyReply = true;
|
|
|
|
// . some index code warrant retries, like EDNSTIMEDOUT, ETCPTIMEDOUT,
|
|
// etc. these are deemed temporary errors. other errors basically
|
|
// indicate a document that will never be indexable and should,
|
|
// if currently indexed, be deleted.
|
|
// . just add the spider reply and we're done
|
|
if ( *indexCode == EDNSTIMEDOUT
|
|
|| *indexCode == ETCPTIMEDOUT
|
|
|| *indexCode == EUDPTIMEDOUT
|
|
|| *indexCode == EDNSDEAD
|
|
|| *indexCode == ENETUNREACH
|
|
|| *indexCode == EHOSTUNREACH
|
|
// . rejected from a diffbot regex url crawl filter?
|
|
// . or no json objects returned from diffbot?
|
|
// . or rejected from the processing regex filter?
|
|
// . then just add the SpiderReply to avoid respidering
|
|
// . NO! still need to add outlinks
|
|
//|| diffbotEmptyReply
|
|
// . treat this as a temporary error i guess
|
|
// . getNewSpiderReply() below will clear the error in it and
|
|
// copy stuff over from m_sreq and m_oldDoc for this case
|
|
|| *indexCode == EDOCUNCHANGED
|
|
) {
|
|
// sanity - in repair mode?
|
|
if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
|
|
// . this seems to be an issue for blocking
|
|
// . if we do not have a valid ip, we can't compute this,
|
|
// in which case it will not be valid in the spider reply
|
|
// . why do we need this for timeouts etc? if the doc is
|
|
// unchanged
|
|
// we should probably update its siteinlinks in tagdb
|
|
// periodically and reindex the whole thing...
|
|
// . i think we were getting the sitenuminlinks for
|
|
// getNewSpiderReply()
|
|
if ( m_ipValid &&
|
|
m_ip != 0 &&
|
|
m_ip != -1 ) {
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
}
|
|
// all done!
|
|
bool addReply = true;
|
|
// Scraper.cpp uses this
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
|
|
// page parser calls set4 and sometimes gets a dns time out!
|
|
if ( m_sreqValid && m_sreq.m_isPageParser ) addReply = false;
|
|
// return nothing if done
|
|
if ( ! addReply ) {
|
|
m_metaListSize = 0;
|
|
m_metaList = (char *)0x1;
|
|
return m_metaList;
|
|
}
|
|
// save this
|
|
int32_t savedCode = *indexCode;
|
|
// before getting our spider reply, assign crap from the old
|
|
// doc to us since we are unchanged! this will allow us to
|
|
// call getNewSpiderReply() without doing any processing, like
|
|
// setting the Xml or Words classes, etc.
|
|
copyFromOldDoc ( od );
|
|
// need this though! i don't want to print out "Success"
|
|
// in the log in the logIt() function
|
|
m_indexCode = savedCode;
|
|
m_indexCodeValid = true;
|
|
// but set our m_contentHash32 from the spider request
|
|
// which got it from the spiderreply in the case of
|
|
// EDOCUNCHANGED. this way ch32=xxx will log correctly.
|
|
// I think this is only when EDOCUNCHANGED is set in the
|
|
// Msg13.cpp code, when we have a spider compression proxy.
|
|
if ( *indexCode == EDOCUNCHANGED &&
|
|
m_sreqValid &&
|
|
! m_contentHash32Valid ) {
|
|
m_contentHash32 = m_sreq.m_contentHash32;
|
|
m_contentHash32Valid = true;
|
|
}
|
|
// we need these for getNewSpiderReply()
|
|
m_wasInIndex = false;
|
|
if ( od ) m_wasInIndex = true;
|
|
m_isInIndex = m_wasInIndex;
|
|
m_wasInIndexValid = true;
|
|
m_isInIndexValid = true;
|
|
|
|
// unset our ptr_linkInfo1 so we do not free it and core
|
|
// since we might have set it in copyFromOldDoc() above
|
|
ptr_linkInfo1 = NULL;
|
|
size_linkInfo1 = 0;
|
|
m_linkInfo1Valid = false;
|
|
|
|
bool indexNewTimeStamp = false;
|
|
if ( getUseTimeAxis() &&
|
|
od &&
|
|
m_hasMetadata &&
|
|
*indexCode == EDOCUNCHANGED
|
|
//m_spideredTimeValid &&
|
|
//od->m_spideredTime != m_spideredTime
|
|
)
|
|
indexNewTimeStamp = true;
|
|
|
|
|
|
|
|
// . if not using spiderdb we are done at this point
|
|
// . this happens for diffbot json replies (m_dx)
|
|
if ( ! m_useSpiderdb && ! indexNewTimeStamp ) {
|
|
m_metaList = NULL;
|
|
m_metaListSize = 0;
|
|
return (char *)0x01;
|
|
}
|
|
|
|
// get our spider reply
|
|
SpiderReply *newsr = getNewSpiderReply();
|
|
// return on error
|
|
if ( ! newsr ) return (char *)newsr;
|
|
// . panic on blocking! this is supposed to be fast!
|
|
// . it might still have to lookup the tagdb rec?????
|
|
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// how much we need
|
|
int32_t needx = sizeof(SpiderReply) + 1;
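// (+1 is for the rdbId byte we store in front of the SpiderReply below)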
|
|
|
|
|
|
// . INDEX SPIDER REPLY (1a)
|
|
// . index ALL spider replies as separate doc. error or not.
|
|
// . then print out error histograms.
|
|
// . we should also hash this stuff when indexing the
|
|
// doc as a whole
|
|
|
|
// i guess it is safe to do this after getting the spiderreply
|
|
SafeBuf *spiderStatusDocMetaList = NULL;
|
|
// if ( cr->m_indexSpiderReplies &&
|
|
// m_useSpiderdb &&
|
|
// // doing it for diffbot throws off smoketests.
|
|
// // yeah, but we need it, so we'll just have to update
|
|
// // the smoketests
|
|
// ! cr->m_isCustomCrawl ) {
|
|
// get the spiderreply ready to be added
|
|
spiderStatusDocMetaList = getSpiderStatusDocMetaList(newsr ,
|
|
forDelete);
|
|
// error?
|
|
if ( ! spiderStatusDocMetaList ) return NULL;
|
|
// blocked?
|
|
if (spiderStatusDocMetaList==(void *)-1)
|
|
return (char *)-1;
|
|
|
|
// . now append the new stuff.
|
|
// . we overwrite the old titlerec with the new one that has
|
|
// some more json in the ptr_metaInfo buffer so we hash
|
|
// its new timestamp. 'gbspiderdate' and any info from
|
|
// the meta info given in the injection request if there.
|
|
// this allows you to tag each document, even multiple
|
|
// versions of the same url with the same content. so if
|
|
// you spider the doc again and it is unchanged since last
|
|
// time we still index some of this meta stuff.
|
|
if ( indexNewTimeStamp )
|
|
appendNewMetaInfo (spiderStatusDocMetaList,forDelete);
|
|
|
|
// need to alloc space for it too
|
|
int32_t len = spiderStatusDocMetaList->length();
|
|
needx += len;
|
|
// this too
|
|
m_addedStatusDocSize = len;
|
|
m_addedStatusDocSizeValid = true;
|
|
//}
|
|
|
|
// doledb key?
|
|
//if ( m_doledbKey.n0 || m_doledbKey.n1 )
|
|
// needx += 1 + sizeof(key_t); // + 4;
|
|
// the FAKEDB unlock key for msg12 in spider.cpp
|
|
//needx += 1 + sizeof(key_t); // FAKEDB
|
|
// make the buffer
|
|
m_metaList = (char *)mmalloc ( needx , "metalist");
|
|
if ( ! m_metaList ) return NULL;
|
|
// save size for freeing later
|
|
m_metaListAllocSize = needx;
|
|
// ptr and boundary
|
|
m_p = m_metaList;
|
|
m_pend = m_metaList + needx;
|
|
|
|
// save it
|
|
char *saved = m_p;
|
|
|
|
// first store spider reply "document"
|
|
if ( spiderStatusDocMetaList ) {
|
|
gbmemcpy ( m_p,
|
|
spiderStatusDocMetaList->getBufStart(),
|
|
spiderStatusDocMetaList->length() );
|
|
m_p += spiderStatusDocMetaList->length();
|
|
}
|
|
|
|
/*
|
|
|
|
Not any more, now we remove from doledb as soon
|
|
as we get all the lock grants in our group (shard)
|
|
using Msg4 in Spider.cpp. That way we can add a
|
|
"0" entry into the waiting tree (or a time X ms into
|
|
the future from now) to try to enforce a sameIpWait
|
|
constraint and also allow up to maxSpidersPerIP.
|
|
|
|
// remove from doledb if we had a valid key
|
|
// (BEFORE adding SpiderReply)
|
|
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
|
|
// note it
|
|
setStatus ( "removing key from doledb" );
|
|
// . now remove the original spider rec from "doledb"
|
|
// . rdbid first
|
|
*m_p = RDB_DOLEDB;
|
|
m_p++;
|
|
// then the key
|
|
*(key_t *)m_p = m_doledbKey;
|
|
// nukey, clear del bit to delete it
|
|
*m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key_t);
|
|
// then zero for data size
|
|
// *(int32_t *)m_p = 0;
|
|
//m_p += 4;
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
}
|
|
*/
|
|
|
|
// sanity check
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// . make a fake titledb key
|
|
// . remove the spider lock (Msg12 in Spider.cpp)
|
|
// . now SPider.cpp uses SpiderReply reception to remove lock
|
|
// - mdw 9/28/13
|
|
//*m_p++ = RDB_FAKEDB;
|
|
//*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
|
|
//key_t fakeKey;
|
|
//fakeKey.n1 = 0;
|
|
//fakeKey.n0 = m_docId;
|
|
//gbmemcpy ( m_p , &fakeKey , sizeof(key_t) );
|
|
//m_p += sizeof(key_t);
|
|
// now add the new rescheduled time
|
|
setStatus ( "adding SpiderReply to spiderdb" );
|
|
// rdbid first
|
|
char rd = RDB_SPIDERDB;
|
|
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
|
|
*m_p++ = rd;
|
|
// get this
|
|
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
// store the spider rec
|
|
int32_t newsrSize = newsr->getRecSize();
|
|
gbmemcpy ( m_p , newsr , newsrSize );
|
|
m_p += newsrSize;
|
|
m_addedSpiderReplySize = newsrSize;
|
|
m_addedSpiderReplySizeValid = true;
|
|
// sanity check
|
|
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
// verify it
|
|
m_metaListValid = true;
|
|
// set size
|
|
m_metaListSize = m_p - m_metaList;
|
|
// all done
|
|
return m_metaList;
|
|
|
|
}
|
|
|
|
// if using diffbot do not index the content of the web page we
|
|
// got the json objects from, although, do keep it cached in titledb
|
|
// because that can be useful
|
|
// Not any more, now index the pages as well! then restrict search
|
|
// to type:json to just search json objects.
|
|
//if ( m_useDiffbot && ! m_isDiffbotJSONObject ) {
|
|
// m_usePosdb = false;
|
|
// m_useClusterdb = false;
|
|
//}
|
|
|
|
// get the old meta list if we had an old doc
|
|
char *oldList = NULL;
|
|
int32_t oldListSize = 0;
|
|
if ( od ) {
|
|
od->m_useSpiderdb = false;
|
|
od->m_useTagdb = false;
|
|
// do not use diffbot for old doc since we call
|
|
// od->nukeJSONObjects below()
|
|
od->m_diffbotApiUrlValid = true;
|
|
// api url should be empty by default
|
|
//od->m_diffbotApiNum = DBA_NONE;
|
|
//log("break it here. shit this is not getting the list!!!");
|
|
// if we are doing diffbot stuff, we are still indexing this
|
|
// page, so we need to get the old doc meta list
|
|
oldList = od->getMetaList ( true );
|
|
oldListSize = od->m_metaListSize;
|
|
if ( ! oldList || oldList ==(void *)-1) return (char *)oldList;
|
|
}
|
|
|
|
// . set whether we should add recs to titledb, posdb, linkdb, etc.
|
|
// . if this doc is set by titlerec we won't change these
|
|
// . we only turn off m_usePosdb, etc. if there is a
|
|
// <meta name=noindex content=1>
|
|
// . we will still add to spiderdb, but not posdb, linkdb, titledb
|
|
// and clusterdb.
|
|
// . so we'll add the spiderreply for this doc and the spiderrequests
|
|
// for all outlinks and "firstIp" tagrecs to tagdb for those outlinks
|
|
// . we use this for adding the url seed file gbdmoz.urls.txt
|
|
// which contains a list of all the dmoz urls we want to spider.
|
|
// gbdmoz.urls.txt is generated by dmozparse.cpp. we spider all
|
|
// these dmoz urls so we can search the CONTENT of the pages in dmoz,
|
|
// something dmoz won't let you do.
|
|
char *mt = hasNoIndexMetaTag();
|
|
if ( ! mt || mt == (void *)-1 ) return (char *)mt;
|
|
if ( *mt ) {
|
|
m_usePosdb = false;
|
|
m_useLinkdb = false;
|
|
m_useTitledb = false;
|
|
m_useClusterdb = false;
|
|
// do not add the "firstIp" tagrecs of the outlinks any more
|
|
// because it might hurt us?
|
|
m_useTagdb = false;
|
|
}
|
|
|
|
if ( cr->m_isCustomCrawl )
|
|
m_useLinkdb = false;
|
|
|
|
// . should we recycle the diffbot reply for this url?
|
|
// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
|
|
// our existing diffbot reply, i.e. recycle it, even though we
|
|
// respidered this page.
|
|
bool *recycle = getRecycleDiffbotReply();
|
|
if ( ! recycle || recycle == (void *)-1) return (char *)recycle;
|
|
// in that case inherit this from the old doc...
|
|
if ( od && *recycle && od->m_diffbotJSONCount &&
|
|
// somehow i am seeing that this is empty!
|
|
// this is how many title hashes of diffbot replies we've
|
|
// stored in the old doc's titlerec. if these are not equal
|
|
// and we call reindexJSONObjects() below then it cores
|
|
// in redoJSONObjects().
|
|
od->size_linkInfo2/4 == od->m_diffbotJSONCount &&
|
|
// only call this once otherwise we double stock
|
|
// m_diffbotTitleHashBuf
|
|
m_diffbotJSONCount == 0 ) {//cr->m_isCustomCrawl){
|
|
m_diffbotJSONCount = od->m_diffbotJSONCount;
|
|
m_sentToDiffbot = od->m_sentToDiffbot;
|
|
m_gotDiffbotSuccessfulReply = od->m_gotDiffbotSuccessfulReply;
|
|
// copy title hashes info. it goes hand in hand with the
|
|
// NUMBER of diffbot items we have.
|
|
int nh = 0;
|
|
int32_t *ohbuf = od->getDiffbotTitleHashes ( &nh );
|
|
if ( ! m_diffbotTitleHashBuf.safeMemcpy ( ohbuf , nh*4 ) )
|
|
return NULL;
|
|
ptr_linkInfo2 =(LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
|
|
size_linkInfo2=m_diffbotTitleHashBuf.length();
|
|
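// (ptr_linkInfo2/size_linkInfo2 are evidently being reused here as the
//  serialized home of the diffbot title hash buffer so it rides along
//  inside the titlerec; that is why size_linkInfo2/4 is compared to
//  m_diffbotJSONCount above)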
|
|
}
|
|
// but we might have to call reindexJSONObjects() multiple times if
|
|
// it would block
|
|
if ( od && *recycle &&
|
|
// only reindex if it is a query reindex i guess otherwise
|
|
// just leave it alone
|
|
m_sreqValid && m_sreq.m_isPageReindex &&
|
|
od->m_diffbotJSONCount &&
|
|
size_linkInfo2 ) {
|
|
// similar to od->nukeJSONObjects
|
|
int32_t *ohbuf =(int32_t *)m_diffbotTitleHashBuf.getBufStart();
|
|
int32_t nh =m_diffbotTitleHashBuf.length() / 4;
|
|
int32_t *status = reindexJSONObjects( ohbuf , nh );
|
|
if ( ! status || status == (void *)-1) return (char *)status;
|
|
}
|
|
|
|
|
|
// just delete the json items whose "title hashes" are present
|
|
// in the "old doc" but NOT i the "new doc".
|
|
// we use the title hash to construct a unique url for each json item.
|
|
// if the title hash is present in both the old and new docs then
|
|
// do not delete it here, but we will reindex it later in
|
|
// getMetaList() below when we call indexDoc() on each one after
|
|
// setting m_dx to each one.
|
|
bool nukeJson = true;
|
|
if ( ! od ) nukeJson = false;
|
|
if ( od && od->m_diffbotJSONCount <= 0 ) nukeJson = false;
|
|
// if recycling json objects, leave them there!
|
|
if ( *recycle ) nukeJson = false;
|
|
// you have to be a diffbot crawl to do this
|
|
// no, not if you have the diffbot api url set... so take this out
|
|
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
|
|
// do not remove old diffbot json objects if pageparser.cpp test
|
|
// because that must not change the index, etc.
|
|
if ( getIsPageParser() ) nukeJson = false;
|
|
|
|
if ( nukeJson ) {
|
|
// it should only nuke/delete the json items that we LOST,
|
|
// so if we still have the title hash in our latest
|
|
// diffbot reply, then do not nuke that json item, which
|
|
// will have a url ending in -diffbotxyz%"UINT32"
|
|
// (where %"UINT32" is the json item title hash).
|
|
// This will download the diffbot reply if not already there.
|
|
int32_t numHashes;
|
|
int32_t *th = getDiffbotTitleHashes(&numHashes);
|
|
if ( ! th && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
if ( ! th || th == (void *)-1 ) return (char *)th;
|
|
// this returns false if it blocks
|
|
int32_t *status = od->nukeJSONObjects( th , numHashes );
|
|
if ( ! status || status == (void *)-1) return (char *)status;
|
|
}
|
|
|
|
// . need this if useTitledb is true
|
|
// . otherwise XmlDoc::getTitleRecBuf() cores because its invalid
|
|
// . this cores if rebuilding just posdb because hashAll() needs
|
|
// the inlink texts for hashing
|
|
//if ( m_useTitledb ) {
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 )
|
|
return (char *)info1;
|
|
//}
|
|
|
|
// global debug
|
|
g_od = od;
|
|
|
|
/*
|
|
// is the document content unchanged?
|
|
bool unchanged = false;
|
|
if ( od && od->m_contentHash32 == *ch32 ) unchanged = true;
|
|
// http status of 304 means "not modified since"
|
|
if ( od && *hs == 304 ) unchanged = true;
|
|
|
|
// compare to last time
|
|
if ( od && *tagHash != od->m_tagHash32 ) unchanged = false;
|
|
|
|
// do not do this if from pageparser.cpp
|
|
//if ( m_sreqValid && m_sreq.m_isPageParser ) unchanged = false;
|
|
if ( getIsPageParser() ) unchanged = false;
|
|
|
|
// force reindex if it was from query reindex (docid based spider req)
|
|
if ( m_sreqValid && m_sreq.m_urlIsDocId ) unchanged = false;
|
|
|
|
// if we were turked... how to tell????
|
|
if ( m_sreqValid && m_sreq.m_isInjecting ) unchanged = false;
|
|
|
|
// just turn it all off for now because our parsing logic might
|
|
// have changed
|
|
unchanged = false;
|
|
|
|
// set this i guess for doConsistencyTest()
|
|
m_unchanged = unchanged;
|
|
m_unchangedValid = true;
|
|
|
|
// . if doc content was unchanged just add the SpiderReply to the
|
|
// meta list so that spiderdb knows we attempted it at this time.
|
|
// . copy over member vars of the old titleRec/XmlDoc into us so
|
|
// we can save time and cpu
|
|
if ( unchanged ) {
|
|
// this seems to be an issue for blocking
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
// all done!
|
|
bool addReply = true;
|
|
// Scraper.cpp uses this
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
|
|
// return nothing if done
|
|
if ( ! addReply ) {
|
|
m_metaListSize = 0;
|
|
m_metaList = (char *)0x1;
|
|
return m_metaList;
|
|
}
|
|
// before getting our spider reply, assign crap from the old
|
|
// doc to us since we are unchanged! this will allow us to
|
|
// call getNewSpiderReply() without doing any processing, like
|
|
// setting the Xml or Words classes, etc.
|
|
copyFromOldDoc ( od );
|
|
// and don't forget to validate this
|
|
int32_t *ic = getIndexCode();
|
|
// should never block since we copied from old doc
|
|
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// get our spider reply
|
|
SpiderReply *newsr = getNewSpiderReply();
|
|
// return on error
|
|
if ( ! newsr ) return (char *)newsr;
|
|
// . panic on blocking! this is supposed to be fast!
|
|
// . it might still have to lookup the tagdb rec?????
|
|
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// unset our ptr_linkInfo1 so we do not free it and core
|
|
// since we might have set it in copyFromOldDoc() above
|
|
ptr_linkInfo1 = NULL;
|
|
size_linkInfo1 = 0;
|
|
// how much we need
|
|
int32_t needx = sizeof(SpiderReply) + 1;
|
|
// doledb key?
|
|
if ( m_doledbKey.n0 || m_doledbKey.n1 )
|
|
needx += 1 + sizeof(key_t); // + 4;
|
|
// the titledb unlock key for msg12 in spider.cpp
|
|
needx += 1 + sizeof(key_t);
|
|
// make the buffer
|
|
m_metaList = (char *)mmalloc ( needx , "metalist");
|
|
if ( ! m_metaList ) return NULL;
|
|
// save size for freeing later
|
|
m_metaListAllocSize = needx;
|
|
// ptr and boundary
|
|
m_p = m_metaList;
|
|
m_pend = m_metaList + needx;
|
|
// save it
|
|
char *saved = m_p;
|
|
// remove from doledb if we had a valid key (BEFORE adding SpiderReply)
|
|
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
|
|
// note it
|
|
setStatus ( "removing key from doledb" );
|
|
// . now remove the original spider rec from "doledb"
|
|
// . rdbid first
|
|
*m_p = RDB_DOLEDB;
|
|
m_p++;
|
|
// then the key
|
|
*(key_t *)m_p = m_doledbKey;
|
|
// nukey, clear del bit to delete it
|
|
*m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key_t);
|
|
// then zero for data size
|
|
// *(int32_t *)m_p = 0;
|
|
//m_p += 4;
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
}
|
|
// sanity check
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// . make a fake titledb key
|
|
// . remove the spider lock (Msg12 in Spider.cpp)
|
|
*m_p++ = RDB_FAKEDB;
|
|
*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
|
|
m_p += sizeof(key_t);
|
|
// now add the new rescheduled time
|
|
// note it
|
|
setStatus ( "adding SpiderReply to spiderdb" );
|
|
// rdbid first
|
|
*m_p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
|
|
m_p++;
|
|
// get this
|
|
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
// store the spider rec
|
|
int32_t newsrSize = newsr->getRecSize();
|
|
gbmemcpy ( m_p , newsr , newsrSize );
|
|
m_p += newsrSize;
|
|
// sanity check
|
|
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
// verify it
|
|
m_metaListValid = true;
|
|
// set size
|
|
m_metaListSize = m_p - m_metaList;
|
|
// all done
|
|
return m_metaList;
|
|
}
|
|
*/
|
|
|
|
// so getSiteRank() works
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
// so addTable144 works
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
|
|
|
|
|
|
//
|
|
// . before making the title rec we need to set all the ptrs!
|
|
// . so at least now set all the data members we will need to
|
|
// serialize into the title rec because we can't be blocking further
|
|
// down below after we set all the hashtables and XmlDoc::ptr_ stuff
|
|
if ( ! m_setFromTitleRec || m_useSecondaryRdbs ) {
|
|
// all member vars should already be valid if set from titlerec
|
|
char *ptg = prepareToMakeTitleRec ();
|
|
// return NULL with g_errno set on error
|
|
if ( ! ptg || ptg == (void *)-1 ) return (char *)ptg;
|
|
}
|
|
|
|
// sanity check - if the valid title rec is null, m_indexCode is set!
|
|
//if ( ! *tr && ! m_indexCode ) { char *xx=NULL;*xx=0; }
|
|
// . bail. return an empty meta list, m_metaListSize should be 0!
|
|
// . NO! we need to add a SpiderReply to spiderdb...
|
|
//if ( ! *tr )
|
|
// log("HEY");
|
|
/*
|
|
if ( ! *tr ) {
|
|
m_metaList = "";
|
|
m_metaListSize = 0;
|
|
m_metaListValid = true;
|
|
return m_metaList;
|
|
}
|
|
*/
|
|
|
|
// get this for hashing stuff
|
|
//Spam *sp = getSpam();
|
|
//if ( ! sp || sp == (void *)-1 ) return (char *)sp;
|
|
|
|
// our next slated spider priority
|
|
char *spiderLinks3 = getSpiderLinks();
|
|
if ( ! spiderLinks3 || spiderLinks3 == (char *)-1 )
|
|
return (char *)spiderLinks3;
|
|
|
|
bool spideringLinks = *spiderLinks3;
|
|
|
|
// shortcut
|
|
XmlDoc *nd = this;
|
|
|
|
///////////////////////////////////
|
|
///////////////////////////////////
|
|
//
|
|
//
|
|
// if we had an error, do not add us regardless to the index
|
|
// although we might add SOME things depending on the error.
|
|
// Like add the redirecting url if we had a ESIMPLIFIEDREDIR error.
|
|
// So what we add to the Rdbs depends on the indexCode.
|
|
//
|
|
|
|
if ( m_indexCode ) nd = NULL;
|
|
|
|
// OR if deleting from index, we just want to get the metalist
|
|
// directly from "od"
|
|
if ( m_deleteFromIndex ) nd = NULL;
|
|
|
|
|
|
//
|
|
//
|
|
///////////////////////////////////
|
|
///////////////////////////////////
|
|
|
|
if ( ! nd )
|
|
spideringLinks = false;
|
|
|
|
// set these for getNewSpiderReply() so it can set
|
|
// SpiderReply::m_wasIndexed and m_isIndexed...
|
|
m_wasInIndex = false;
|
|
m_isInIndex = false;
|
|
if ( od ) m_wasInIndex = true;
|
|
if ( nd ) m_isInIndex = true;
|
|
m_wasInIndexValid = true;
|
|
m_isInIndexValid = true;
|
|
|
|
|
|
// if we are adding a simplified redirect as a link to spiderdb
|
|
if ( m_indexCode == EDOCSIMPLIFIEDREDIR )
|
|
spideringLinks = true;
|
|
|
|
// likewise if the error was ENONCANONICAL, treat it the same way
|
|
if ( m_indexCode == EDOCNONCANONICAL )
|
|
spideringLinks = true;
|
|
|
|
//
|
|
// . prepare the outlink info if we are adding links to spiderdb!
|
|
// . do this before we start hashing so we do not block and re-hash!!
|
|
//
|
|
if ( spideringLinks && ! m_doingConsistencyCheck && m_useSpiderdb){
|
|
setStatus ( "getting outlink info" );
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
|
|
//char **iiv = getOutlinkIsIndexedVector();
|
|
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
|
|
int32_t **ipv = getOutlinkFirstIpVector();
|
|
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
|
|
//int8_t *hcv = getOutlinkHopCountVector();
|
|
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
|
|
//char *ipi = getIsIndexed(); // is the parent indexed?
|
|
//if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
|
|
}
|
|
|
|
// get the tag buf to add to tagdb
|
|
SafeBuf *ntb = NULL;
|
|
if ( m_useTagdb && ! m_deleteFromIndex ) {
|
|
ntb = getNewTagBuf();
|
|
if ( ! ntb || ntb == (void *)-1 ) return (char *)ntb;
|
|
}
|
|
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (void *)-1 ) return (char *)ww;
|
|
|
|
int64_t *pch64 = getExactContentHash64();
|
|
//int64_t *pch64 = getLooseContentHash64();
|
|
if ( ! pch64 || pch64 == (void *)-1 ) return (char *)pch64;
|
|
|
|
// get the voting table which we will add to sectiondb
|
|
SectionVotingTable *nsvt = NULL;
|
|
SectionVotingTable *osvt = NULL;
|
|
// seems like
|
|
// sectiondb takes up about 15% of the disk space like this. no!
|
|
// cuz then there is revdb, so we are 30%. so that's a no go.
|
|
bool addSectionVotes = false;
|
|
if ( nd ) addSectionVotes = true;
|
|
if ( ! m_useSectiondb ) addSectionVotes = false;
|
|
// to save disk space no longer add the roots! not only saves sectiondb
|
|
// but also saves space in revdb
|
|
//if ( nd && *isRoot ) addSectionVotes = true;
|
|
if ( addSectionVotes ) {
|
|
nsvt = getNewSectionVotingTable();
|
|
if ( ! nsvt || nsvt == (void *)-1 ) return (char *)nsvt;
|
|
// get the old table too!
|
|
osvt = getNewSectionVotingTable();
|
|
if ( ! osvt || osvt == (void *)-1 ) return (char *)osvt;
|
|
}
|
|
|
|
// get the addresses for hashing tag hashes that indicate place names
|
|
Addresses *na = NULL;
|
|
//Addresses *oa = NULL;
|
|
if ( nd ) na = getAddresses();
|
|
//if ( od ) oa = od->getAddresses();
|
|
|
|
// get dates ready for hashing
|
|
Dates *ndp = NULL;
|
|
//Dates *odp = NULL;
|
|
if ( nd ) {
|
|
ndp = nd->getDates();
|
|
if ( ! ndp || ndp==(void *)-1) return (char *)ndp;
|
|
}
|
|
//if ( od ) {
|
|
// odp = od->getDates();
|
|
// if ( ! odp || odp==(void *)-1) return (char *)odp;
|
|
//}
|
|
|
|
// need firstip if adding a rebuilt spider request
|
|
if ( m_useSecondaryRdbs && ! m_isDiffbotJSONObject && m_useSpiderdb ) {
|
|
int32_t *fip = getFirstIp();
|
|
if ( ! fip || fip == (void *)-1 ) return (char *)fip;
|
|
}
|
|
|
|
|
|
// shit, we need a spider reply so that it will not re-add the
|
|
// spider request to waiting tree, we ignore docid-based
|
|
// recs that have spiderreplies in Spider.cpp
|
|
SpiderReply *newsr = NULL;
|
|
if ( m_useSpiderdb ) { // && ! m_deleteFromIndex ) {
|
|
newsr = getNewSpiderReply();
|
|
if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
|
|
}
|
|
|
|
// the site hash for hashing
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (char *)sh32;
|
|
|
|
// set ptr_clockCandidatesData
|
|
// if ( nd ) {
|
|
// HashTableX *cct = nd->getClockCandidatesTable();
|
|
// if ( ! cct || cct==(void *)-1) return (char *)cct;
|
|
// }
|
|
|
|
if ( m_useLinkdb && ! m_deleteFromIndex ) {
|
|
int32_t *linkSiteHashes = getLinkSiteHashes();
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
|
|
return (char *)linkSiteHashes;
|
|
}
|
|
|
|
//SafeBuf *au = getDiffbotApiUrl();
|
|
//if ( ! au || au == (void *)-1 ) return (char *)au;
|
|
|
|
|
|
// test json parser
|
|
//
|
|
/*
|
|
char *json = "{\"icon\":\"http://www.pixar.com/sites/default/files/pixar_2012_favicon_0.ico\",\"text\":\"\",\"title\":\"Pixar\",\"type\":\"article\",\"media\":[{\"primary\":\"true\",\"link\":\"http://www.pixar.com/sites/default/files/home_billboard_v7.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/roz1_0.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/home_bu-thumb_v1.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/mu_home_thumb.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/brenda_homepage.jpg\",\"type\":\"image\"}],\"url\":\"http://www.pixar.com/\"}";
|
|
char *dd = getNextJSONObject ( json );
|
|
if ( *dd ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
///////////
|
|
//
|
|
// BEGIN the diffbot json object index hack
|
|
//
|
|
// if we are using diffbot, then each json object in the diffbot reply
|
|
// should be indexed as its own document.
|
|
//
|
|
///////////
|
|
|
|
|
|
// . get the reply of json objects from diffbot
|
|
// . this will be empty if we are a json object!
|
|
// . will also be empty if not meant to be sent to diffbot
|
|
// . the TOKENIZED reply consists of \0 separated json objects that
|
|
// we create from the original diffbot reply
|
|
SafeBuf *tdbr = getTokenizedDiffbotReply();
|
|
if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr;
|
|
|
|
|
|
// i guess it is safe to do this after getting the spiderreply
|
|
SafeBuf *spiderStatusDocMetaList = NULL;
|
|
//if ( indexReply ) {
|
|
|
|
// get the spiderreply ready to be added to the rdbs w/ msg4
|
|
// but if doing a rebuild operation then do not get it, we'll rebuild
|
|
// it since it will have its own titlerec
|
|
if ( ! m_useSecondaryRdbs ) {
|
|
spiderStatusDocMetaList =
|
|
getSpiderStatusDocMetaList (newsr,forDelete);
|
|
if ( ! spiderStatusDocMetaList ) {
|
|
log("build: ss doc metalist null. bad!");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if ( spiderStatusDocMetaList == (void *)-1)
|
|
return (char *)spiderStatusDocMetaList;
|
|
//}
|
|
|
|
|
|
|
|
|
|
int32_t tdbrLen = tdbr->length();
|
|
|
|
// do not index json items as separate docs if we are page parser
|
|
if ( getIsPageParser() ) tdbrLen = 0;
|
|
|
|
// same goes if appending -diffbotxyz%UINT32 would be too long
|
|
if ( m_firstUrl.getUrlLen() + 11 + 10 > MAX_URL_LEN )
|
|
tdbrLen = 0;
|
|
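// (in the length check above, the 11 is strlen("-diffbotxyz") and the 10
//  is the maximum number of decimal digits in a uint32 title hash)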
|
|
// once we have tokenized diffbot reply we can get a unique
|
|
// hash of the title of each json item. that way, if a page changes
|
|
// and it gains or loses a diffbot item, the old items will still
|
|
// have the same url and we can set their m_indexCode to EDOCUNCHANGED
|
|
// if the individual json item itself has not changed when we
|
|
// call m_dx->indexDoc() below.
|
|
int32_t numHashes = 0;
|
|
int32_t *titleHashBuf = NULL;
|
|
|
|
//
|
|
// if we got a json object or two from diffbot, index them
|
|
// as their own child xmldocs.
|
|
// watch out for reply from diffbot of "-1" indicating error!
|
|
//
|
|
if ( tdbrLen > 3 ) {
|
|
|
|
// get title hashes of the json items
|
|
titleHashBuf = getDiffbotTitleHashes(&numHashes);
|
|
if (!titleHashBuf || titleHashBuf == (void *)-1){
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
// make sure diffbot reply is valid for sure
|
|
if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
|
|
// set status for this
|
|
setStatus ( "indexing diffbot json doc");
|
|
// new guy here
|
|
if ( ! m_dx ) {
|
|
try { m_dx = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("xmldoc: failed to alloc m_dx");
|
|
return NULL;
|
|
}
|
|
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
|
|
// we now parse the array of products out of the
|
|
// diffbot reply. each product is an item/object.
|
|
m_diffbotObj = tdbr->getBufStart();
|
|
m_diffbotJSONCount = 0;
|
|
}
|
|
// loop back up here to process next json object from below
|
|
jsonloop:
|
|
// if m_dx has no url set, call set4 i guess
|
|
if ( ! m_dx->m_contentValid ) {
|
|
|
|
// sanity. ensure the json item we are trying to
|
|
// index has a title hash in this buf
|
|
if(m_diffbotJSONCount>=numHashes){char *xx=NULL;*xx=0;}
|
|
|
|
// get the title of the json we are indexing
|
|
int32_t jth = titleHashBuf [ m_diffbotJSONCount ];
|
|
|
|
// make the fake url for this json object for indexing
|
|
SafeBuf fakeUrl;
|
|
fakeUrl.set ( m_firstUrl.getUrl() );
|
|
// append -diffbotxyz<titlehash> for the fake url
|
|
fakeUrl.safePrintf("-diffbotxyz%"UINT32"",
|
|
//(int32_t)m_diffbotJSONCount);
|
|
(uint32_t)jth);
|
|
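// so a page like http://example.com/page (hypothetical) yields a child
// url such as http://example.com/page-diffbotxyz3735928559 -- one
// synthetic url per json item, stable across respiders as long as the
// item's title hash stays the same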
if ( fakeUrl.length() > MAX_URL_LEN ) {
|
|
log("build: diffbot enhanced url too long for "
|
|
"%s",fakeUrl.getBufStart());
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
m_diffbotJSONCount++;
|
|
// this can go on the stack since set4() copies it
|
|
SpiderRequest sreq;
|
|
sreq.reset();
|
|
// string ptr
|
|
char *url = fakeUrl.getBufStart();
|
|
// use this as the url
|
|
strcpy( sreq.m_url, url );
|
|
// parentdocid of 0
|
|
int32_t firstIp = hash32n ( url );
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
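// (no dns lookup is done for these synthetic urls, so a hash of the
//  url stands in for the first ip; 0 and -1 are treated as invalid ip
//  values elsewhere, hence the remap to 1 above)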
sreq.setKey( firstIp,0LL, false );
|
|
sreq.m_isInjecting = 1;
|
|
sreq.m_isPageInject = 1;
|
|
sreq.m_hopCount = m_hopCount;
|
|
sreq.m_hopCountValid = m_hopCountValid;
|
|
sreq.m_fakeFirstIp = 1;
|
|
sreq.m_firstIp = firstIp;
|
|
// so we can match url filters' "insitelist" directive
|
|
// in Spider.cpp::getUrlFilterNum()
|
|
sreq.m_domHash32 = m_domHash32;
|
|
sreq.m_siteHash32 = m_siteHash32;
|
|
sreq.m_hostHash32 = m_siteHash32;
|
|
// set this
|
|
if (!m_dx->set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// give it a niceness of 1, we have
|
|
// to be careful since we are a
|
|
// niceness of 0!!!!
|
|
m_niceness, // 1 ,
|
|
// inject this content
|
|
m_diffbotObj,
|
|
false, // deleteFromIndex ,
|
|
0, // forcedIp ,
|
|
CT_JSON, // contentType ,
|
|
0, // lastSpidered ,
|
|
false )) // hasMime
|
|
// g_errno should be set!
|
|
return NULL;
|
|
// we are indexing json objects, don't use all these
|
|
m_dx->m_useClusterdb = false;
|
|
m_dx->m_useSpiderdb = false;
|
|
m_dx->m_useTagdb = false;
|
|
m_dx->m_usePlacedb = false;
|
|
m_dx->m_useLinkdb = false;
|
|
m_dx->m_isChildDoc = true;
|
|
m_dx->m_parentDocPtr = this;
|
|
// we like to sort json objects using
|
|
// 'gbsortby:spiderdate' query to get the most
|
|
// recent json objects, so this must be valid
|
|
if ( m_spideredTimeValid ) {
|
|
m_dx->m_spideredTimeValid = true;
|
|
m_dx->m_spideredTime = m_spideredTime;
|
|
}
|
|
|
|
m_dx->m_isDiffbotJSONObject = true;
|
|
}
|
|
|
|
// when the indexdoc completes, or if it blocks, call us!
|
|
// we should just pass through here
|
|
//xd->setCallback ( this , getMetaListWrapper );
|
|
m_dx->setCallback ( m_masterState , m_masterLoop );
|
|
|
|
///////////////
|
|
// . inject the content of the json using this fake url
|
|
// . return -1 if this blocks
|
|
// . if m_dx got its msg4 reply it ends up here, in which
|
|
// case do NOT re-call indexDoc() so check for
|
|
// m_listAdded.
|
|
///////////////
|
|
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
|
|
return (char *)-1;
|
|
|
|
// critical error on our part trying to index it?
|
|
// does not include timeouts or 404s, etc. mostly just
|
|
// OOM errors.
|
|
if ( g_errno ) return NULL;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
// count as added
|
|
cr->m_localCrawlInfo.m_objectsAdded++;
|
|
cr->m_globalCrawlInfo.m_objectsAdded++;
|
|
cr->m_needsSave = true;
|
|
// we successfully indexed the json object, skip to the next one
|
|
m_diffbotObj += gbstrlen(m_diffbotObj) + 1;
|
|
// but gotta set this crap back
|
|
log(LOG_INFO,"diffbot: resetting %s",m_dx->m_firstUrl.m_url);
|
|
// clear for next guy if there is one. clears
|
|
// m_dx->m_contentValid so the set4() can be called again above
|
|
m_dx->reset();
|
|
// have we exhausted the buffer of json objects? if not, do more
|
|
if ( m_diffbotObj < tdbr->getBuf() ) goto jsonloop;
|
|
}
|
|
|
|
/////
|
|
//
|
|
// END the diffbot json object index hack
|
|
//
|
|
/////
|
|
|
|
|
|
//
|
|
// CAUTION
|
|
//
|
|
// CAUTION
|
|
//
|
|
// We should never "block" after this point, lest the hashtables
|
|
// we create get messed up.
|
|
//
|
|
|
|
//
|
|
//
|
|
// START HASHING
|
|
//
|
|
//
|
|
|
|
|
|
// store what we hash into this table
|
|
if ( (m_pbuf || m_storeTermListInfo) && ! m_wts ) {
|
|
// init it. the value is a TermDebugInfo class. allowDups=true!
|
|
m_wtsTable.set (12,sizeof(TermDebugInfo),
|
|
0,NULL,0,true,m_niceness,
|
|
"wts-tab");
|
|
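// (args appear to be: key size, data size, initial slots, external buf,
//  buf size, allowDups, niceness, label -- inferred from the other
//  HashTableX::set() calls below)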
// point to it, make it active
|
|
m_wts = &m_wtsTable;
|
|
}
|
|
|
|
// how much to alloc? compute an upper bound
|
|
int32_t need = 0;
|
|
// should we index this doc?
|
|
bool index1 = true;
|
|
|
|
setStatus ( "hashing posdb and datedb terms" );
|
|
// . hash our documents terms into "tt1"
|
|
// . hash the old document's terms into "tt2"
|
|
// . by old, we mean the older versioned doc of this url spidered before
|
|
HashTableX tt1;
|
|
HashTableX tt2;
|
|
// how many words we got?
|
|
int32_t nw = m_words.getNumWords();
|
|
// . prepare it, 5000 initial terms
|
|
// . make it proportional to nw to avoid having to re-alloc the table!!!
|
|
// . i guess we can have link and neighborhood text too! we don't
|
|
// count it here though... but add 5k for it...
|
|
int32_t need4 = nw * 4 + 5000;
|
|
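// (e.g. a 10,000 word page pre-sizes the table for 45,000 slots, which
//  with the 18-byte posdb keys used below is intended to keep the table
//  from ever hitting the "reallocated big table" warning after hashAll())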
if ( nd && index1 && m_usePosdb ) {
|
|
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,
|
|
"posdb-indx"))
|
|
return NULL;
|
|
int32_t did = tt1.m_numSlots;
|
|
//bool index2 = true;
|
|
// . hash the document terms into "tt1"
|
|
// . this is a biggie!!!
|
|
// . only hash ourselves if m_indexCode is false
|
|
// . m_indexCode is non-zero if we should delete the doc from
|
|
// index
|
|
// . i think this only adds to posdb
|
|
//log("xmldoc: CALLING HASHALL");
|
|
// shit, this blocks which is bad!!!
|
|
char *nod = hashAll ( &tt1 ) ;
|
|
// you can't block here because if we are re-called we lose tt1
|
|
if ( nod == (char *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// error?
|
|
if ( ! nod ) return NULL;
|
|
int32_t done = tt1.m_numSlots;
|
|
if ( done != did )
|
|
log("xmldoc: reallocated big table! bad. old=%"INT32" "
|
|
"new=%"INT32" nw=%"INT32"",did,done,nw);
|
|
}
|
|
|
|
// if indexing the spider reply as well under a different docid
|
|
// there is no reason we can't toss it into our meta list here
|
|
if ( spiderStatusDocMetaList )
|
|
need += spiderStatusDocMetaList->length();
|
|
|
|
// now we use revdb
|
|
// before hashing the old doc into it
|
|
//if ( od && index2 ) {
|
|
// // if this hash table init fails, return NULL
|
|
// if (!tt2.set(12,4,5000,NULL,0,false,m_niceness)) return NULL;
|
|
// char *rod = od->hash ( &tt2 ) ;
|
|
// if ( ! rod || rod == (char *)-1 ) return rod;
|
|
//}
|
|
// space for indexdb AND DATEDB! +2 for rdbids
|
|
int32_t needIndexdb = 0;
|
|
needIndexdb +=tt1.m_numSlotsUsed*(sizeof(key144_t)+2+sizeof(key128_t));
|
|
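// (that is roughly 18 + 2 + 16 = 36 bytes reserved per hashed term: an
//  18-byte posdb key, a 16-byte datedb key allowance and two rdbid bytes
//  -- an allocation upper bound, not the exact list size)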
//needIndexdb+=tt2.m_numSlotsUsed * (sizeof(key_t)+2+sizeof(key128_t));
|
|
need += needIndexdb;
|
|
// sanity check
|
|
//if ( ! od && m_skipIndexing && needIndexdb ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . sanity check - must have one or the other!
|
|
// . well, not in the case of EDOCNOTNEW or EDOCNOTOLD, in which
|
|
// case we just remove ourselves from spiderdb, and in the case
|
|
// of EDOCNOTOLD, from tfndb as well
|
|
//if ( ! od && ! nd ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// what pub dates do the old and new doc have? -1 means none.
|
|
int32_t date1 = -1; if ( nd ) date1 = nd->m_pubDate;
|
|
//int32_t date2 = -1; if ( od ) date2 = od->m_pubDate;
|
|
|
|
// now we also add the title rec. true = ownsCbuf? ret NULL on error
|
|
// with g_errno set.
|
|
//if ( nd && ! nd->compress( true , m_niceness ) ) return NULL;
|
|
|
|
|
|
/*
|
|
now we have the bit in the posdb key, so this should not be needed...
|
|
use Posdb::isShardedByTermId() to see if it is such a special case key
|
|
like Hostdb::getShardNum() now does...
|
|
|
|
setStatus ( "hashing nosplit keys" );
|
|
// hash no split terms into ns1 and ns2
|
|
HashTableX ns1;
|
|
// prepare it, 500 initial terms
|
|
if ( ! ns1.set ( 18 , 4 , 500,NULL,0,false,m_niceness,"nosplt-indx" ))
|
|
return NULL;
|
|
// . hash for no splits
|
|
// . like above, but these are "no split" termids
|
|
if ( nd && m_usePosdb && ! hashNoSplit ( &ns1 ) ) return NULL;
|
|
//if(index2 && od && ! od->hashNoSplit ( &ns2 ) ) return NULL;
|
|
// needs for hashing no split terms
|
|
int32_t needNoSplit1 = 0;
|
|
// add em up. +1 for rdbId. add to both indexdb AND datedb i guess...
|
|
needNoSplit1 += ns1.m_numSlotsUsed * (18+1); // +16+1);
|
|
//needNoSplit += ns2.m_numSlotsUsed * (12+1+16+1);
|
|
// add it in
|
|
need += needNoSplit1;
|
|
// sanity check
|
|
//if ( ! od && m_skipIndexing && needNoSplit ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
|
|
setStatus ( "hashing sectiondb keys" );
|
|
// add in special sections keys. "ns" = "new sections", etc.
|
|
// add in the special nosplit datedb terms from the Sections class
|
|
// these hash into the term table so we can do incremental updating
|
|
HashTableX st1; // <key128_t,char> dt1;
|
|
//HashTableX st2; // <key128_t,char> dt2;
|
|
// set key/data size
|
|
int32_t svs = sizeof(SectionVote);
|
|
st1.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness,"sectdb-indx");
|
|
// tell hashtable to use the sectionhash for determining the slot,
|
|
// not the lower 4 bytes because that is the docid which is the
|
|
// same for every key
|
|
st1.m_maskKeyOffset = 6;
|
|
//st2.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness);
|
|
// do not bother if deleting
|
|
if ( m_indexCode ) nsvt = NULL;
|
|
|
|
// . now we hash the root just to get some section votes i guess
|
|
//if ( nts && ! *isr ) nsvt = NULL;
|
|
// if the old voting table has more than 100,000 voters forget it!!! do
|
|
// not bloat sectiondb that big...
|
|
if ( osvt && osvt->m_totalSiteVoters >= MAX_SITE_VOTERS ) nsvt = NULL;
|
|
// hash terms into a table that uses full datedb keys
|
|
if ( nsvt && ! nsvt->hash (m_docId,&st1,*sh64,m_niceness)) return NULL;
|
|
// needs for hashing no split terms
|
|
int32_t needSectiondb = 0;
|
|
// add em up. plus one for rdbId
|
|
needSectiondb += st1.m_numSlotsUsed * (16+svs+1);
|
|
//needSectiondb += st2.m_numSlotsUsed * (16+svs+1);
|
|
// add it in
|
|
need += needSectiondb;
|
|
|
|
|
|
// Sections::respiderLineWaiters() adds one docid-based spider rec
|
|
// for every url waiting in line. Sections::m_numLineWaiters. assume
|
|
// 64 bytes per line waiter spider rec i guess
|
|
//int32_t needLineWaiters = 0;
|
|
// +1 for rdbId
|
|
//if ( ns ) needLineWaiters = ns->m_numLineWaiters * 64;
|
|
// forgot to add this?
|
|
//need += needLineWaiters;
|
|
// . for adding Sections.cpp keys
|
|
// . Sections::hash() does not bother with invalid sections
|
|
// . waitInLine might be true in Sections::hash() too, so always add 12
|
|
//if ( ns ) need += (ns->m_numSections - ns->m_numInvalids)*12 + 12;
|
|
//if ( os ) need += (os->m_numSections - os->m_numInvalids)*12 + 12;
|
|
|
|
|
|
// for adding Addresses::m_keys[] (Addresses::hash())
|
|
//if ( na ) need += (na->m_numKeys * 16);
|
|
//if ( oa ) need += (oa->m_numKeys * 16);
|
|
|
|
// don't forget Dates!
|
|
//if ( ndp ) need += ndp->m_numPubDates * sizeof(key_t);
|
|
//if ( odp ) need += odp->m_numPubDates * sizeof(key_t);
|
|
|
|
// clusterdb keys. plus one for rdbId
|
|
int32_t needClusterdb = 0;
|
|
//if ( nd && ! nd->m_skipIndexing ) needClusterdb += 13;
|
|
//if ( od && ! od->m_skipIndexing ) needClusterdb += 13;
|
|
if ( nd ) needClusterdb += 13;
|
|
//if ( od ) needClusterdb += 13;
|
|
need += needClusterdb;
|
|
|
|
// . LINKDB
|
|
// . linkdb records. assume one per outlink
|
|
// . we may index 2 16-byte keys for each outlink
|
|
Links *nl2 = NULL;
|
|
//if ( spideringLinks ) nl2 = &m_links;
|
|
// if injecting, spideringLinks is false, but then we don't
|
|
// add the links to linkdb, which causes the qainlinks() test to fail
|
|
nl2 = &m_links;
|
|
// do not bother if deleting. but we do add simplified redirects
|
|
// to spiderdb as SpiderRequests now.
|
|
int32_t code = m_indexCode;
|
|
if ( code == EDOCSIMPLIFIEDREDIR ) code = 0;
|
|
if ( code == EDOCNONCANONICAL ) code = 0;
|
|
if ( code ) nl2 = NULL;
|
|
//Links *ol = NULL; if ( od ) ol = od->getLinks();
|
|
// . set key/data size
|
|
// . use a 16 byte key, not the usual 12
|
|
// . use 0 for the data, since these are pure keys, which have no
|
|
// scores to accumulate
|
|
HashTableX kt1;
|
|
//HashTableX kt2;
|
|
int32_t nis = 0;
|
|
if ( nl2 && m_useLinkdb ) nis = nl2->getNumLinks() * 4;
|
|
// pre-grow table based on # outlinks
|
|
kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" );
|
|
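// (4 slots per outlink keeps the table at most ~50% full even if both
//  possible linkdb keys per link get added, so it should never need to
//  grow mid-hash -- a guess at a safe pre-size)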
// use magic to make fast
|
|
kt1.m_useKeyMagic = true;
|
|
// linkdb keys will have the same lower 4 bytes, so make hashing fast.
|
|
// they are 28 byte keys. bytes 20-23 are the hash of the linkEE
|
|
// so that will be the most random.
|
|
kt1.m_maskKeyOffset = 20;
|
|
// faster
|
|
//kt2.set ( sizeof(key128_t) , 0,0,NULL,0,false,m_niceness );
|
|
// do not add these
|
|
//bool add1 = true;
|
|
// do not add negative key if no old doc
|
|
//if ( ! od ) add2 = false;
|
|
// . we already have a Links::hash into the Termtable for links: terms,
|
|
// but this will have to be for adding to Linkdb. basically take a
|
|
// lot of it from Linkdb::fillLinkdbList()
|
|
// . these return false with g_errno set on error
|
|
if ( m_useLinkdb && nl2 && ! hashLinksForLinkdb(&kt1) ) return NULL;
|
|
//if ( add2 && ol && ! !od->m_skipIndexing &&
|
|
// ol->hash(&kt2,od,m_niceness) )
|
|
// return NULL;
|
|
// add up what we need. +1 for rdbId
|
|
int32_t needLinkdb = 0;
|
|
needLinkdb += kt1.m_numSlotsUsed * (sizeof(key224_t)+1);
|
|
//needLinkdb += kt2.m_numSlotsUsed * (sizeof(key128_t)+1);
|
|
need += needLinkdb;
|
|
// sanity check
|
|
//if ( ! od && m_skipIndexing && needLinkdb ) { char *xx=NULL;*xx=0; }
|
|
|
|
// PLACEDB
|
|
HashTableX pt1;
|
|
//HashTableX pt2;
|
|
// . set key/data size
|
|
// . limit every address to 512 bytes
|
|
pt1.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness,"placedb-indx");
|
|
//pt2.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness);
|
|
//
|
|
// if this is true, then we just store the placedb recs
|
|
// directly into the title rec. That way we do not have
|
|
// to store the content of the web page, and we save space.
|
|
//
|
|
// otherwise, we have to parse out the sections and it is much slower
|
|
//else if (oa && !oa->hashForPlacedb(m_docId,*sh32,*od->getIp(),&pt2) )
|
|
// return NULL;
|
|
// hash terms into a table that uses full datedb keys
|
|
if ( na && !na->hashForPlacedb(m_docId,*sh32,*nd->getIp(),&pt1))
|
|
return NULL;
|
|
|
|
|
|
setStatus("hashing place info");
|
|
int32_t needPlacedb = 0;
|
|
// . +1 for rdbId
|
|
// . up to 512 bytes per address
|
|
needPlacedb += pt1.m_numSlotsUsed * (sizeof(key128_t)+1+512);
|
|
//needPlacedb += pt2.m_numSlotsUsed * (sizeof(key128_t)+1+512);
|
|
need += needPlacedb;
|
|
// sanity check -- coring here because we respider the page and
|
|
// the address is gone so it tries to delete it!
|
|
//if ( ! od && m_skipIndexing && needPlacedb ) { char *xx=NULL;*xx=0; }
|
|
|
|
// we add a negative key to doledb usually (include datasize now)
|
|
int32_t needDoledb = sizeof(key_t) + 1 ; // + 4;
|
|
if ( forDelete ) needDoledb = 0;
|
|
need += needDoledb;
|
|
|
|
// for adding the SpiderReply to spiderdb (+1 for rdbId)
|
|
int32_t needSpiderdb1 = sizeof(SpiderReply) + 1;
|
|
if ( forDelete ) needSpiderdb1 = 0;
|
|
need += needSpiderdb1;
|
|
|
|
// if injecting we add a spiderrequest to be able to update it
|
|
// but don't do this if it is pagereindex. why is pagereindex
|
|
// setting the injecting flag anyway?
|
|
int32_t needSpiderdb3 = 0;
|
|
if ( m_sreqValid &&
|
|
m_sreq.m_isInjecting &&
|
|
m_sreq.m_fakeFirstIp &&
|
|
! m_sreq.m_forceDelete &&
|
|
// do not rebuild spiderdb if only rebuilding posdb
|
|
// this is explicitly for injecting so we need to add
|
|
// the spider request to spiderdb...
|
|
//m_useSpiderdb &&
|
|
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
|
|
! m_isDiffbotJSONObject ) {
|
|
needSpiderdb3 = m_sreq.getRecSize() + 1;
|
|
// NO! because when injecting a warc and the subdocs
|
|
// it contains, gb then tries to spider all of them !!! sux...
|
|
needSpiderdb3 = 0;
|
|
}
|
|
// or if we are rebuilding spiderdb
|
|
else if (m_useSecondaryRdbs && !m_isDiffbotJSONObject && m_useSpiderdb)
|
|
needSpiderdb3 = sizeof(SpiderRequest) + m_firstUrl.m_ulen+1;
|
|
|
|
need += needSpiderdb3;
|
|
|
|
//int32_t needSpiderdb3 = 0;
|
|
//if ( m_sreqValid ) needSpiderdb3 = m_sreq.getRecSize() + 1;
|
|
//need += needSpiderdb3;
|
|
|
|
// . for adding our outlinks to spiderdb
|
|
// . see SpiderRequest::getRecSize() for description
|
|
// . SpiderRequest::getNeededSize() will include the null terminator
|
|
int32_t hsize = SpiderRequest::getNeededSize ( 0 );
|
|
int32_t needSpiderdb2 = hsize * m_links.getNumLinks();
|
|
// and the url buffer of outlinks. includes \0 terminators i think
|
|
needSpiderdb2 += m_links.getLinkBufLen();
|
|
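// (i.e. one fixed-size SpiderRequest header per outlink plus the raw
//  outlink url bytes; like the other "need" terms this is only an
//  allocation upper bound, unused slack in m_metaList is harmless)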
// don't need this if doing consistency check
|
|
if ( m_doingConsistencyCheck ) needSpiderdb2 = 0;
|
|
// nor for generating the delete meta list for incremental indexing
|
|
if ( forDelete ) needSpiderdb2 = 0;
|
|
// accumulate it
|
|
need += needSpiderdb2;
|
|
|
|
// the new tags for tagdb
|
|
int32_t needTagdb = 0;
|
|
if ( ntb ) needTagdb = ntb->length() ;
|
|
// add 1 byte for up to 128 rdbids
|
|
//needTagdb += needTagdb/sizeof(Tag) + 1;
|
|
// add that in
|
|
need += needTagdb;
|
|
|
|
// . add in title rec size
|
|
// . should be valid because we called getTitleRecBuf() above
|
|
// . this should include the key
|
|
// . add in possible negative key for deleting old title rec
|
|
//int32_t needTitledb = sizeof(key96_t);
|
|
// +1 for rdbId
|
|
//if ( nd && m_useTitledb ) needTitledb = m_titleRecSize + 1;
|
|
//need += needTitledb;
|
|
|
|
|
|
//
|
|
// . CHECKSUM PARSING CONSISTENCY TEST
|
|
//
|
|
// . set m_metaListChecksum member (will be stored in titleRec header)
|
|
// . gotta set m_metaListCheckSum8 before making titleRec below
|
|
// . also, if set from titleRec, verify metalist is the same!
|
|
//
|
|
if ( ! m_computedMetaListCheckSum ) {
|
|
// do not call twice!
|
|
m_computedMetaListCheckSum = true;
|
|
// all keys in tt1, ns1, kt1 and pt1
|
|
int32_t ck32 = 0;
|
|
ck32 ^= tt1.getKeyChecksum32();
|
|
|
|
// show tt1
|
|
//
|
|
// UNCOMMENT this to debug parsing inconsistencies!!!
|
|
//
|
|
// SafeBuf sb;
|
|
// tt1.print(&sb);
|
|
// if(sb.getBufStart()) fprintf(stderr,"%s", sb.getBufStart());
|
|
|
|
//ck32 ^= ns1.getKeyChecksum32();
|
|
//ck32 ^= kt1.getKeyChecksum32();
|
|
//ck32 ^= pt1.getKeyChecksum32();
|
|
// set this before calling getTitleRecBuf() below
|
|
uint8_t currentMetaListCheckSum8 = (uint8_t)ck32;
|
|
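// (only the low byte of the xor'd key checksum is kept, so a real
//  parsing change can occasionally go undetected, but one byte is cheap
//  to carry in the titleRec header)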
// see if matches what was in old titlerec
|
|
if ( m_metaListCheckSum8Valid &&
|
|
// if we were set from a titleRec, see if we got
|
|
// a different hash of terms to index this time around...
|
|
m_setFromTitleRec &&
|
|
// fix for import log spam
|
|
! m_isImporting &&
|
|
m_version >= 120 &&
|
|
m_metaListCheckSum8 != currentMetaListCheckSum8 ) {
|
|
log("xmldoc: checksum parsing inconsistency for %s "
|
|
"(old)%i != %i(new). Uncomment tt1.print() "
|
|
"above to debug.",
|
|
m_firstUrl.getUrl(),
|
|
(int)m_metaListCheckSum8,
|
|
(int)currentMetaListCheckSum8);
|
|
// if doing qa test drop core
|
|
CollectionRec *cr = getCollRec();
|
|
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 ) {
|
|
log("xmldoc: sleep 1000");
|
|
sleep(1000);
|
|
exit(0);}//char *xx=NULL;*xx=0; }
|
|
}
|
|
// assign the new one, getTitleRecBuf() call below needs this
|
|
m_metaListCheckSum8 = currentMetaListCheckSum8;
|
|
m_metaListCheckSum8Valid = true;
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// now that we've set all the ptr_* members vars, we can make
|
|
// the title rec
|
|
//
|
|
|
|
// . MAKE the title rec from scratch, that is all we need at this point
|
|
// . sets m_indexCode to EDOCNOTNEW or EDOCNOTOLD sometimes
|
|
// . if repairing and not rebuilding titledb, we do not need the
|
|
// titlerec
|
|
if ( m_useTitledb ) {
|
|
// this buf includes key/datasize/compressdata
|
|
SafeBuf *tr = getTitleRecBuf ();
|
|
// panic if this blocks! it should not at this point because
|
|
// we'd have to re-hash the crap above
|
|
if ( tr == (void *) -1 ) { char *xx=NULL;*xx=0; }
|
|
// return NULL with g_errno set on error
|
|
if ( ! tr ) return (char *)tr;
|
|
// sanity check - if the valid title rec is null,
|
|
// m_indexCode is set!
|
|
if ( tr->length()==0 && ! m_indexCode ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// . add in title rec size
|
|
// . should be valid because we called getTitleRecBuf() above
|
|
// . this should include the key
|
|
// . add in possible negative key for deleting old title rec
|
|
int32_t needTitledb = sizeof(key96_t) + 1;
|
|
// +1 for rdbId
|
|
if ( nd && m_useTitledb && ! forDelete )
|
|
needTitledb += m_titleRecBuf.length();
|
|
// set new and old keys for titledb
|
|
//key_t ok;
|
|
key_t nk;
|
|
//ok.setMin();
|
|
nk.setMin();
|
|
//if ( od ) ok = *od->getTitleRecKey();
|
|
if ( nd && m_useTitledb ) nk = *nd->getTitleRecKey();
|
|
//if ( od && m_useTitledb && ok != nk ) needTitledb += sizeof(key_t)+1;
|
|
if ( m_useTitledb ) {
|
|
// then add it in
|
|
need += needTitledb;
|
|
// the titledb unlock key for msg12 in spider.cpp
|
|
need += sizeof(key_t);
|
|
}
|
|
|
|
//
|
|
// now space for the revdb record, which is the meta list itself!
|
|
//
|
|
//need = need + 12 + 4 + need;
|
|
|
|
// . alloc mem for metalist
|
|
// . sanity
|
|
if ( m_metaListSize > 0 ) { char *xx=NULL;*xx=0; }
|
|
// make the buffer
|
|
m_metaList = (char *)mmalloc ( need , "metalist");
|
|
if ( ! m_metaList ) return NULL;
|
|
// save size for freeing later
|
|
m_metaListAllocSize = need;
|
|
// ptr and boundary
|
|
m_p = m_metaList;
|
|
m_pend = m_metaList + need;
|
|
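// from here on each record appended to m_metaList appears to follow the
// layout <1 byte rdbId><key>[<dataSize + data> for variable-size rdbs],
// which is what verifyMetaList() checks below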
|
|
//
|
|
// TITLEDB
|
|
//
|
|
setStatus ("adding titledb recs");
|
|
// checkpoint
|
|
char *saved = m_p;
|
|
// . delete old title rec key if different
|
|
// . Repair.cpp might set useTitledb to false!
|
|
//if ( od && m_useTitledb && ok != nk ) {
|
|
// // rdbId
|
|
// *m_p++ = RDB_TITLEDB;
|
|
// // key
|
|
// *(key_t *)m_p = *od->getTitleRecKey();
|
|
// // make it negative
|
|
// *m_p &= 0xfe;
|
|
// // skip over it
|
|
// m_p += sizeof(key_t);
|
|
// // then data size, 0
|
|
// //*(int32_t *)m_p = 0;
|
|
// //m_p+= 4;
|
|
//}
|
|
// . store title rec
|
|
// . Repair.cpp might set useTitledb to false!
|
|
if ( nd && m_useTitledb ) {
|
|
// rdbId
|
|
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_TITLEDB2;
|
|
else *m_p++ = RDB_TITLEDB;
|
|
// sanity
|
|
if ( ! nd->m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
|
|
// key, dataSize, data is the whole rec
|
|
int32_t tsize = nd->m_titleRecBuf.length();
|
|
// if getting an "oldList" to do incremental posdb updates
|
|
// then do not include the data portion of the title rec
|
|
if ( forDelete ) tsize = sizeof(key_t);
|
|
gbmemcpy ( m_p , nd->m_titleRecBuf.getBufStart() , tsize );
|
|
// make it a negative key
|
|
//if ( forDelete ) *m_p = *m_p & 0xfe;
|
|
m_p += tsize;//nd->m_titleRecSize;
|
|
// store a zero datasize, key is still positive until the dt8
|
|
// table deletes it
|
|
//if ( forDelete ) { *(int32_t *)m_p = 0; m_p += 4; }
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needTitledb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// ADD BASIC INDEXDB/DATEDB TERMS
|
|
//
|
|
setStatus ( "adding posdb and datedb terms");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// store indexdb terms into m_metaList[]
|
|
if ( m_usePosdb && ! addTable144 ( &tt1 , m_docId )) return NULL;
|
|
//if(!addTable96 ( &tt2, &tt1, date2, date1, true ,false)) return NULL;
|
|
//if ( od ) tt2.clear();
|
|
// sanity check
|
|
if ( m_p - saved > needIndexdb ) { char*xx=NULL;*xx=0; }
|
|
// free all mem
|
|
tt1.reset();
|
|
//tt2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// ADD NOSPLIT INDEXDB/DATEDB TERMS
|
|
//
|
|
/*
|
|
we added these now in hashAll() to tt1, no longer ns1 since we
|
|
have the sharded by termid bit in the actual posdb key now so
|
|
Rebalance.cpp works
|
|
|
|
setStatus ( "adding posdb shardByTermId terms");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// no longer anything special now since the
|
|
// Posdb::isShardedByTermId() bit
|
|
// is in the key now so Rebalance.cpp can work
|
|
if ( m_usePosdb && ! addTable144 ( &ns1 )) return NULL;
|
|
//if(! addTable96 ( &ns2, &ns1, -1, -1, true ,true)) return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needNoSplit1 ) { char*xx=NULL;*xx=0; }
|
|
// free all mem
|
|
ns1.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
*/
|
|
|
|
|
|
/*
|
|
setStatus ( "adding datedb nosplit terms");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// this is now for datedb
|
|
if ( m_useDatedb && ! addTableDate(&ns2,m_docId,RDB_DATEDB,true))
|
|
return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needNoSplit2 ) { char*xx=NULL;*xx=0; }
|
|
// free all mem
|
|
ns2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
*/
|
|
|
|
//
|
|
// ADD SECTIONS SPECIAL TERMS
|
|
//
|
|
setStatus ( "adding sectiondb keys");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// add that table to the metalist
|
|
if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete))
|
|
return NULL;
|
|
//if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; }
|
|
// free mem
|
|
st1.reset();
|
|
//st2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
//
|
|
// ADD CLUSTERDB KEYS
|
|
//
|
|
setStatus ( "adding clusterdb keys" );
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . do we have adult content?
|
|
// . should already be valid!
|
|
if ( nd && ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
|
|
// . get new clusterdb key
|
|
// . we use the host hash for the site hash! hey, this is only 26 bits!
|
|
key_t newk ; newk.setMin();
|
|
if ( nd )
|
|
newk = g_clusterdb.makeClusterRecKey ( *nd->getDocId() ,
|
|
*nd->getIsAdult() ,
|
|
*nd->getLangId(),
|
|
nd->getHostHash32a(),
|
|
false ); // del?
|
|
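// (the cluster rec packs the docId, adult flag, langId and host hash
//  into one key, presumably so results can be site-clustered and
//  family/language filtered without loading the full title rec)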
//key_t oldk; oldk.setMin();
|
|
//if ( od ) // && add2 )
|
|
// oldk = g_clusterdb.makeClusterRecKey ( *od->getDocId(),
|
|
// *od->getIsAdult() ,
|
|
// *od->getLangId() ,
|
|
// od->getHostHash32a(),
|
|
// true ); // del?
|
|
// . store old only if new tr is good and keys are different from old
|
|
// . now we store even if skipIndexing is true because i'd like to
|
|
// see how many titlerecs we have and count them towards the
|
|
// docsIndexed count...
|
|
if ( nd && m_useClusterdb ) {
|
|
// store rdbid
|
|
*m_p = RDB_CLUSTERDB;
|
|
// use secondary if we should
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
|
|
// skip
|
|
m_p++;
|
|
// and key
|
|
*(key_t *)m_p = newk;
|
|
// skip it
|
|
m_p += sizeof(key_t);
|
|
}
|
|
// store new if different
|
|
//if ( od && ( ! nd || newk != oldk ) ) { // && !od->m_skipIndexing ) {
|
|
// // store rdbid
|
|
// *m_p = RDB_CLUSTERDB;
|
|
// // use secondary if we should
|
|
// if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
|
|
// // skip
|
|
// m_p++;
|
|
// // turn on last bit (undo del)
|
|
// //newk.n0 |= 0x01;
|
|
// // and key
|
|
// *(key_t *)m_p = oldk;
|
|
// // skip it
|
|
// m_p += sizeof(key_t);
|
|
//}
|
|
// sanity check
|
|
if ( m_p - saved > needClusterdb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
|
|
//
|
|
// ADD LINKDB KEYS
|
|
//
|
|
setStatus ( "adding linkdb keys" );
|
|
// checkpoint
|
|
saved = m_p;
|
|
// add that table to the metalist (LINKDB)
|
|
if ( m_useLinkdb && !addTable224(&kt1))
|
|
return NULL;
|
|
//if(add2&&!addTable128(&kt2,&kt1,RDB_LINKDB, false))return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needLinkdb ) { char *xx=NULL;*xx=0; }
|
|
// all done
|
|
kt1.reset();
|
|
//kt2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// . ADD ADDRESSES TO NAMEDB/PLACEDB
|
|
// . key is basically a hash of the address (excluding place name
|
|
// and street indicators)
|
|
//
|
|
setStatus ( "adding to placedb" );
|
|
// checkpoint
|
|
saved = m_p;
|
|
// add that table to the metalist
|
|
if ( m_usePlacedb && ! addTable128 ( &pt1, RDB_PLACEDB,forDelete))
|
|
return NULL;
|
|
//if(! addTable128 ( &pt2, &pt1, RDB_PLACEDB, true , true))return NULL;
|
|
// sanity check
|
|
if ( m_p - saved > needPlacedb ) { char *xx=NULL;*xx=0; }
|
|
// free mem
|
|
pt1.reset();
|
|
//pt2.reset();
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
/*
|
|
//
|
|
// ADD REVDB RECORD
|
|
//
|
|
|
|
//
|
|
// . add the metalist to itself
|
|
// . this way, when we delete this doc from the index, we just
|
|
// lookup the original metalist in revdb, set all the
|
|
// delbits, and re-add that. this avoid having to ensure
|
|
// parsing consistency, which is a royal pain in the ass
|
|
// . now we also update getMetaList() to check revdb to get
|
|
// the meta list if the doc is already indexed...
|
|
//
|
|
// define current meta list
|
|
char *x = m_metaList;
|
|
char *xend = m_p;
|
|
// skip adding to revdb?
|
|
if ( ! m_useRevdb ) xend = x;
|
|
int32_t *dataSizePtr;
|
|
char *savedp;
|
|
// if nothing in current list do not add revdb rec
|
|
bool hadStuff = ( x < xend);
|
|
if ( hadStuff ) {
|
|
// put in the rdbId
|
|
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_REVDB2;
|
|
else *m_p++ = RDB_REVDB;
|
|
// the key
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
*(key_t *)m_p = g_revdb.makeKey ( m_docId , false );
|
|
m_p += sizeof(key_t);
|
|
// data size
|
|
dataSizePtr = (int32_t *)m_p;
|
|
// skip for now
|
|
m_p += 4;
|
|
// save it
|
|
savedp = m_p;
|
|
}
|
|
// scan the current metalist and add keys to the revdb record
|
|
for ( ; x < xend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save this
|
|
char byte = *x;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
//
|
|
// convert if adding to secondary rdbids!!!!!!!!
|
|
//
|
|
if ( m_useSecondaryRdbs ) {
|
|
if ( rdbId == RDB2_POSDB2 )
|
|
rdbId = RDB_POSDB;
|
|
else if ( rdbId == RDB2_DATEDB2 )
|
|
rdbId = RDB_DATEDB;
|
|
else if ( rdbId == RDB2_SECTIONDB2 )
|
|
rdbId = RDB_SECTIONDB;
|
|
else if ( rdbId == RDB2_PLACEDB2 )
|
|
rdbId = RDB_PLACEDB;
|
|
else if ( rdbId == RDB2_TITLEDB2 )
|
|
rdbId = RDB_TITLEDB;
|
|
else if ( rdbId == RDB2_LINKDB2 )
|
|
rdbId = RDB_LINKDB;
|
|
else if ( rdbId == RDB2_CLUSTERDB2 )
|
|
rdbId = RDB_CLUSTERDB;
|
|
else if ( rdbId == RDB2_SPIDERDB2 )
|
|
rdbId = RDB_SPIDERDB;
|
|
else if ( rdbId == RDB2_TAGDB2 )
|
|
rdbId = RDB_TAGDB;
|
|
// must be covered!!
|
|
else { char *xx=NULL;*xx=0; }
|
|
// rewrite byte now b/c we store it below
|
|
byte = (byte & 0x80) | rdbId;
|
|
}
|
|
// skip that
|
|
x++;
|
|
// copy that over
|
|
*m_p++ = byte;
|
|
// sanity check -- no negative keys allowed in here
|
|
if ( (x[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// copy that over
|
|
gbmemcpy ( m_p , x , ks );
|
|
// skip that
|
|
m_p += ks;
|
|
x += ks;
|
|
// datasize?
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
if ( ds == -1 ) {
|
|
ds = *(int32_t *)x;
|
|
x += 4;
|
|
}
|
|
// skip data
|
|
x += ds;
|
|
}
|
|
// record size of what we wrote
|
|
if ( hadStuff )
|
|
*dataSizePtr = ( m_p - savedp );
|
|
// sanity check
|
|
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
*/
|
|
|
|
//////
|
|
//
|
|
// add SPIDERREPLY BEFORE and SPIDERREQUEST!!!
|
|
//
|
|
// add spider reply first so we do not immediately respider
|
|
// this same url if we were injecting it because no SpiderRequest
|
|
// may have existed, and SpiderColl::addSpiderRequest() will
|
|
// spawn a spider of this url again unless there is already a REPLY
|
|
// in spiderdb!!! crazy...
|
|
bool addReply = true;
|
|
// Scraper.cpp uses this
|
|
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
|
|
// save it
|
|
saved = m_p;
|
|
// now add the new rescheduled time
|
|
if ( addReply && m_useSpiderdb && ! forDelete ) {
|
|
// note it
|
|
setStatus ( "adding SpiderReply to spiderdb" );
|
|
// rdbid first
|
|
*m_p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
|
|
m_p++;
|
|
// get this
|
|
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
|
|
// store the spider rec
|
|
int32_t newsrSize = newsr->getRecSize();
|
|
gbmemcpy ( m_p , newsr , newsrSize );
|
|
m_p += newsrSize;
|
|
|
|
m_addedSpiderReplySize = newsrSize;
|
|
m_addedSpiderReplySizeValid = true;
|
|
|
|
// sanity check - must not be a request, this is a reply
|
|
if ( g_spiderdb.isSpiderRequest( &newsr->m_key ) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( m_p - saved != needSpiderdb1 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
}
|
|
|
|
|
|
// if we are injecting we must add the spider request
|
|
// we are injecting from so the url can be scheduled to be
|
|
// spidered again.
|
|
// NO! because when injecting a warc and the subdocs
|
|
// it contains, gb then tries to spider all of them !!! sux...
|
|
if ( needSpiderdb3 ) {
|
|
// note it
|
|
setStatus("adding spider request");
|
|
// checkpoint
|
|
saved = m_p;
|
|
// store it here
|
|
SpiderRequest revisedReq;
|
|
|
|
// if doing a repair/rebuild of spiderdb...
|
|
if ( m_useSecondaryRdbs )
|
|
getRebuiltSpiderRequest ( &revisedReq );
|
|
|
|
// this fills it in for doing injections
|
|
if ( ! m_useSecondaryRdbs ) {
|
|
getRevisedSpiderRequest ( &revisedReq );
|
|
// sanity log
|
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
|
// sanity log
|
|
if ( m_firstIp == 0 || m_firstIp == -1 ) {
|
|
char *url = "unknown";
|
|
if ( m_sreqValid ) url = m_sreq.m_url;
|
|
log("build: error3 getting real firstip of "
|
|
"%"INT32" for %s. not adding new request.",
|
|
(int32_t)m_firstIp,url);
|
|
goto skipNewAdd2;
|
|
}
|
|
}
|
|
|
|
// copy it
|
|
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_SPIDERDB2;
|
|
else *m_p++ = RDB_SPIDERDB;
|
|
// store it back
|
|
gbmemcpy ( m_p , &revisedReq , revisedReq.getRecSize() );
|
|
// skip over it
|
|
m_p += revisedReq.getRecSize();
|
|
// sanity check
|
|
if ( m_p - saved > needSpiderdb3 ) { char *xx=NULL;*xx=0; }
|
|
|
|
m_addedSpiderRequestSize = revisedReq.getRecSize();
|
|
m_addedSpiderRequestSizeValid = true;
|
|
|
|
}
|
|
|
|
skipNewAdd2:
|
|
|
|
//
|
|
// ADD SPIDERDB RECORDS of outlinks
|
|
//
|
|
// - do this AFTER computing revdb since we do not want spiderdb recs
|
|
// to be in revdb.
|
|
//
|
|
setStatus ( "adding spiderdb keys" );
|
|
// sanity check. cannot spider until in sync
|
|
if ( ! isClockInSync() ) { char *xx=NULL;*xx=0; }
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . should be fixed from Links::setRdbList
|
|
// . we should contain the msge that msg16 uses!
|
|
// . we were checking m_msg16.m_recycleContent, but i have not done
|
|
// that in years!!! MDW
|
|
// . we were also checking if the # of banned outlinks >= 2, then
|
|
// we would not do this...
|
|
// . should also add with a time of now plus 5 seconds to that if
|
|
// we spider an outlink, linkdb should be updated with this doc
|
|
// pointing to it so it can get link text then!!
|
|
if ( spideringLinks && nl2 && ! m_doingConsistencyCheck &&
|
|
m_useSpiderdb && ! forDelete ){
|
|
// returns NULL and sets g_errno on error
|
|
char *ret = addOutlinkSpiderRecsToMetaList ();
|
|
// sanity check
|
|
if ( ! ret && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// return NULL on error
|
|
if ( ! ret ) return NULL;
|
|
// this MUST not block down here, to avoid re-hashing above
|
|
if ( ret == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needSpiderdb2 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
//
|
|
// ADD TAG RECORDS TO TAGDB
|
|
//
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . only do this if NOT setting from a title rec
|
|
// . it might add a bunch of forced spider recs to spiderdb
|
|
// . store into tagdb even if indexCode is set!
|
|
if ( ntb && m_useTagdb && ! forDelete ) {
|
|
// ntb is a safebuf of Tags, which are already Rdb records
|
|
// so just gbmemcpy them directly over
|
|
char *src = ntb->getBufStart();
|
|
int32_t srcSize = ntb->length();
|
|
gbmemcpy ( m_p , src , srcSize );
|
|
m_p += srcSize;
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needTagdb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
|
|
|
|
|
|
//
|
|
// ADD INDEXED SPIDER REPLY with different docid so we can
|
|
// search index of spider replies! (NEW!)
|
|
//
|
|
// . index spider reply with separate docid so they are all searchable.
|
|
// . see getSpiderStatusDocMetaList() function to see what we index
|
|
// and the titlerec we create for it
|
|
if ( spiderStatusDocMetaList ) {
|
|
gbmemcpy ( m_p ,
|
|
spiderStatusDocMetaList->getBufStart() ,
|
|
spiderStatusDocMetaList->length() );
|
|
m_p += spiderStatusDocMetaList->length();
|
|
m_addedStatusDocSize = spiderStatusDocMetaList->length();
|
|
m_addedStatusDocSizeValid = true;
|
|
}
|
|
|
|
/*
|
|
//
|
|
// ADD FORCED RESPIDER DOCID-BASED SPIDER RECS for Sections
|
|
//
|
|
// used by Sections.cpp to respider docs because we just identified an
|
|
// article section and they need to be re-indexed to take advantage
|
|
// of that
|
|
//
|
|
// checkpoint
|
|
saved = m_p;
|
|
// . only do this if NOT setting from a title rec
|
|
// . it might add a bunch of forced spider recs to spiderdb
|
|
if ( ! m_setFromTitleRec && nd ) { // && ! m_isInjecting ) {
|
|
Sections *ss = &m_sections;
|
|
m_p = ss->respiderLineWaiters ( m_p , m_pend );
|
|
if ( ! m_p ) return NULL;
|
|
}
|
|
// sanity check
|
|
if ( m_p - saved > needLineWaiters ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
*/
|
|
|
|
|
|
//
|
|
// NOW UPDATE OURSELVES (OUR URL) IN SPIDERDB
|
|
//
|
|
|
|
// but not if injecting!
|
|
//if ( ! m_sreqValid ) {
|
|
// // set the list size, different from the alloc size
|
|
// m_metaListSize = m_p - m_metaList;
|
|
// // all done
|
|
// return m_metaList;
|
|
//}
|
|
|
|
// note it
|
|
//setStatus ( "deleting old spider rec key" );
|
|
// rdbid first
|
|
// *p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
//if ( m_useSecondaryRdbs ) *p = RDB2_SPIDERDB2;
|
|
//p++;
|
|
// must be legit
|
|
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
// then the key
|
|
// *(key_t *)p = m_sreq.m_key;
|
|
// nukey, clear del bit to delete it
|
|
// *p &= 0xfe;
|
|
// skip key
|
|
//p += sizeof(key_t);
|
|
|
|
// shortcut
|
|
saved = m_p;
|
|
/*
|
|
|
|
See comment under DOLEDB above! this approach is no longer used.
|
|
|
|
// . remove from doledb if we had a valid key
|
|
// . DO THIS BEFORE adding the SpiderReply since
|
|
// Spider.cpp::addSpiderReply() will
|
|
// decrement the count for firstIp in m_doleIpTable
|
|
if ( (m_doledbKey.n0 || m_doledbKey.n1) &&
|
|
! m_useSecondaryRdbs &&
|
|
// do not add if we are generating the meta list for incremental
|
|
// indexing purposes from an old doc
|
|
! forDelete ) {
|
|
// note it
|
|
setStatus ( "removing key from doledb" );
|
|
// . now remove the original spider rec from "doledb"
|
|
// . rdbid first
|
|
*m_p = RDB_DOLEDB;
|
|
m_p++;
|
|
// then the key
|
|
*(key_t *)m_p = m_doledbKey;
|
|
// nukey, clear del bit to delete it
|
|
*m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key_t);
|
|
// datasize is 0
|
|
// *(int32_t *)m_p = 0;
|
|
//m_p += 4;
|
|
// sanity check
|
|
if ( m_p - saved != needDoledb ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p , forDelete );
|
|
}
|
|
*/
|
|
|
|
// note it
|
|
//setStatus ( "removing spider lock");
|
|
// . make a fake titledb key
|
|
// . remove the spider lock (Msg12 in Spider.cpp)
|
|
// . no need to do this if called from Repair.cpp
|
|
// . the uh48 is zero, that means fake!
|
|
// . i added "&& m_useSpiderdb" here because it was messing up
|
|
// the cacheTermLists() function which ONLY wants posdb keys and
|
|
// any other keys in the metalist mess it up. MDW 1/26/13
|
|
// . now Spider.cpp uses SpiderReply reception to remove the lock
|
|
// - mdw 9/28/13
|
|
//if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) {
|
|
// *m_p++ = RDB_FAKEDB;
|
|
// ((key_t *)m_p)->n1 = 0;
|
|
// ((key_t *)m_p)->n0 = m_docId;
|
|
// //= g_titledb.makeKey ( m_docId , 0LL , true );
|
|
// m_p += sizeof(key_t);
|
|
//}
|
|
|
|
|
|
// MDW: new spider algo does not need this
|
|
/*
|
|
// save it
|
|
saved = m_p;
|
|
// re-add the same request since it was removed from Spider.cpp's
|
|
// m_urlBuf and the associated orderTree,ipTree, etc. and now
|
|
// since we are un-doling (undoling) it we need to re-add and this
|
|
// is the easiest way. it really was never removed from spiderdb
|
|
// but it will no longer be in the spider's cache since we delete
|
|
// it from there when we add it to doledb. so this is just a quick
|
|
// way of getting it back into the cache.
|
|
// now, we add this first since now Rdb.cpp calls evaluateAllRequests()
|
|
// AFTER the REPLY now
|
|
if ( m_sreqValid &&
|
|
// page parser has an invalid firstIp which causes printMetaList()
|
|
// to core when trying to print this out, so don't add it when
|
|
// doing page parser
|
|
! m_sreq.m_isPageParser ) {
|
|
// note it
|
|
setStatus ( "adding SpiderRequest back to spiderdb" );
|
|
// rdbid first
|
|
*m_p = RDB_SPIDERDB;
|
|
// use secondary?
|
|
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
|
|
m_p++;
|
|
// store the spider rec
|
|
int32_t size = m_sreq.getRecSize();
|
|
gbmemcpy ( m_p , &m_sreq , size );
|
|
// set this one bit
|
|
SpiderRequest *rr = (SpiderRequest *)m_p;
|
|
rr->m_readd = 1;
|
|
// and hafta reset this junk otherwise it cores
|
|
// (see Spider.h::SpiderRequest::reset())
|
|
rr->m_ufn = -1;
|
|
rr->m_priority = -1;
|
|
rr->m_doled = 0;
|
|
// skip over the whole rec
|
|
m_p += size;
|
|
// sanity check - this must be a request, not a reply
|
|
if ( ! g_spiderdb.isSpiderRequest( &m_sreq.m_key ) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( m_p - saved != needSpiderdb3 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_p );
|
|
}
|
|
*/
|
|
|
|
// sanity check
|
|
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
/////////////////
|
|
//
|
|
// INCREMENTAL INDEXING / INCREMENTAL UPDATING
|
|
//
|
|
// now prune/manicure the metalist to remove records that
|
|
// were already added, and insert deletes for records that
|
|
// changed since the last time. this is how we do deletes
|
|
// now that we have revdb. this allows us to avoid
|
|
// parsing inconsistency errors.
|
|
//
|
|
/////////////////
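//
// roughly how the pruning below works:
// . a metalist record is [1 rdbId byte (low 7 bits are the rdbId)]
//   [key of ks bytes][4-byte dataSize, only for rdbs with variable-size
//   data and only if the key is positive][data]
// . the old list from the last time we indexed this doc is dataless,
//   just rdbId bytes and keys
// . pass 1: hash every old key into dt8
// . pass 2: scan the new metalist and skip any dataless key already in
//   dt8, removing its slot
// . pass 3: append whatever is left in dt8 as negative (delete) keys so
//   records that vanished since the last version get removed
//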
|
|
|
|
// disable for parsing consistency testing of already indexed docs
|
|
//oldList = NULL;
|
|
|
|
if ( oldList ) { // && oldList->m_listSize > 16 ) {
|
|
// point to start of the old meta list, the first and only
|
|
// record in the oldList
|
|
char *om = oldList;// + 12 + 4;
|
|
// the size
|
|
int32_t osize = oldListSize;//*(int32_t *)(oldList + 12);
|
|
// the end
|
|
char *omend = om + osize;
|
|
int32_t needx = 0;
|
|
// init these. data is just the rdbid, a single byte.
|
|
//HashTableX dt12;
|
|
//HashTableX dt16;
|
|
//char dbuf12[30000];
|
|
//char dbuf16[40000];
|
|
//dt12.set ( 12,1,2048,dbuf12,30000,false,m_niceness);
|
|
//dt16.set ( 16,1,2048,dbuf16,40000,false,m_niceness);
|
|
HashTableX dt8;
|
|
char dbuf8[34900];
|
|
// value is the ptr to the rdbId/key in the oldList
|
|
dt8.set ( 8,sizeof(char *),2048,dbuf8,34900,
|
|
false,m_niceness,"dt8-tab");
|
|
// just for linkdb:
|
|
//HashTableX dt9;
|
|
//char dbuf9[30000];
|
|
//dt9.set ( 8,4,2048,dbuf9,30000,false,m_niceness,"dt9-tab");
|
|
// scan recs in that and hash them
|
|
for ( char *p = om ; p < omend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save this
|
|
char byte = *p;
|
|
// save this
|
|
char *rec = p;
|
|
// get the rdbid for this rec
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// get the key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// get that
|
|
char *k = p;
|
|
// unlike a real meta list, this meta list has
|
|
// no data field, just rdbIds and keys only! because
|
|
// we only use it for deleting, which only requires
|
|
// a key and not the data
|
|
p += ks;
|
|
// tally this up in case we have to add the delete
|
|
// version of this key back (add 1 for rdbId)
|
|
needx += ks + 1;
|
|
// always re-add titledb record!
|
|
// if our current/new list is basically empty
|
|
// except for a SpiderReply because it got deleted
|
|
// from the index, we need to store the titledb key
|
|
// in dt8 so we can add it as a negative! so i
|
|
// don't really know what this was trying to fix
|
|
// because it broke that!
|
|
//if ( rdbId == RDB_TITLEDB ) continue;
|
|
// for linkdb, sometimes we also add a "lost" link
|
|
// key in addition to deleting the old key! see below
|
|
if ( rdbId == RDB_LINKDB ) needx += ks + 1;
|
|
// do not add it if datasize > 0
|
|
uint64_t hk;
|
|
// do not include discovery or lost dates in the
|
|
// linkdb key...
|
|
if ( rdbId == RDB_LINKDB )
|
|
hk = hash64 (k+12,ks-12);
|
|
else
|
|
hk = hash64 (k,ks);
|
|
// sanity check
|
|
if ( rdbId == RDB_LINKDB &&
|
|
g_linkdb.getLinkerDocId_uk((key224_t *)k)!=
|
|
m_docId ) {
|
|
char *xx=NULL;*xx=0; }
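// note: hashing only k+12..end for linkdb means two keys that differ
// only in their discovery/lost date fields (which appear to occupy the
// low-order bytes of the key224_t) hash the same, so an old key and its
// updated counterpart can be matched up and have their dates carried
// forward below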
|
|
//if ( getDataSize(rdbId) != 0 ) continue;
|
|
// hash this key
|
|
//bool status;
|
|
// sectiondb keys all have the same last few bits...
|
|
// so this clogs up the hash table.
|
|
// so mix up the key bits for hashing
|
|
//uint64_t hk = hash64 ( k,ks);
|
|
//if (ks == 12 ) status = dt12.addKey ( k, &byte);
|
|
//else if (ks == 16 ) status = dt16.addKey ( k, &byte);
|
|
//else { char *xx=NULL; *xx=0; }
|
|
if ( ! dt8.addKey(&hk,&rec) ) return NULL;
|
|
// return NULL with g_errno set on error
|
|
//if ( ! status ) return NULL;
|
|
}
|
|
// also need all the new keys just to be sure, in case none
|
|
// are already in the rdbs
|
|
needx += (m_p - m_metaList);
|
|
// now alloc for our new manicured metalist
|
|
char *nm = (char *)mmalloc( needx, "newmeta" );
|
|
if ( ! nm ) return NULL;
|
|
char *nptr = nm;
|
|
char *nmax = nm + needx;
|
|
// scan each rec in the current meta list and see if it's in
|
|
// the dt8 hash table; if it already is, then
|
|
// do NOT add it to the new metalist, nm, because there is
|
|
// no need to.
|
|
char *p = m_metaList;
|
|
char *pend = p + (m_p - m_metaList);
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save it with the flag
|
|
char byte = *p;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// get key
|
|
char *key = p;
|
|
// skip that
|
|
p += ks;
|
|
// get data size
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
// assume we do not store the datasize
|
|
bool neg = false;
|
|
// . if key is negative, no data is present
|
|
// . the doledb key is negative for us here
|
|
if ( (key[0] & 0x01) == 0x00 ) { neg = true; ds = 0; }
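// (a cleared low bit in the first key byte is the delete marker, i.e. a
// "negative" key; negative keys never carry data, which is the same
// convention used further down when leftover old keys are re-added as
// deletes)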
|
|
// if datasize variable, read it in
|
|
if ( ds == -1 ) {
|
|
// get data size
|
|
ds = *(int32_t *)p;
|
|
// skip data size int32_t
|
|
p += 4;
|
|
}
|
|
// point to data
|
|
char *data = p;
|
|
// skip data if not zero
|
|
p += ds;
|
|
|
|
// mix it up for hashtable speed
|
|
uint64_t hk ;//= hash64 ( key,ks);
|
|
|
|
// skip if for linkdb, we do that below
|
|
if ( rdbId == RDB_LINKDB )
|
|
hk = hash64(key+12,ks-12);
|
|
else
|
|
hk = hash64(key,ks);
|
|
|
|
// was this key already in the "old" list?
|
|
int32_t slot = dt8.getSlot(&hk);
|
|
|
|
// did we get a linkdb key that existed last time
|
|
// we indexed this doc? if so, inherit its discovery
|
|
// date.
|
|
if ( slot >= 0 && rdbId == RDB_LINKDB ) {
|
|
/*
|
|
// get old key from last time
|
|
char *oldk=*(char**)dt8.getValueFromSlot(slot);
|
|
// skip rdbid
|
|
oldk++;
|
|
// sanity
|
|
if(g_linkdb.getLinkerDocId_uk((key224_t *)oldk)
|
|
!=m_docId){
|
|
char *xx=NULL;*xx=0; }
|
|
// copy rdbid into new meta list
|
|
*nptr++ = byte;
|
|
// point to where key will be stored in new lst
|
|
char *nk = nptr;
|
|
// store the new key in the new meta list
|
|
gbmemcpy ( nptr , key , ks );
|
|
// advance ptr
|
|
nptr += ks;
|
|
// get discovery time of old key from last time
|
|
int32_t dd = g_linkdb.getDiscoveryDate_uk(oldk);
|
|
// sanity
|
|
if ( dd < 0 ) { char *xx=NULL;*xx=0; }
|
|
// but mod the new key's discovery time
|
|
g_linkdb.setDiscoveryDate_uk ( nk, dd );
|
|
*/
|
|
// . no need to deal with this any further
|
|
// . yeah, because there could be dups!
|
|
// so don't delete it just yet
|
|
// . but make the data ptr NULL so we
|
|
// know to disregard it below...???
|
|
dt8.removeSlot(slot);
|
|
// all done for this key
|
|
continue;
|
|
}
|
|
|
|
// see if already in an rdb, IFF dataless, otherwise
|
|
// the keys might be the same but with different data!
|
|
if ( slot >= 0 ) { // dt8.isInTable(&hk) ) {
|
|
// remove from hashtable so we do not add it
|
|
// as a delete key below
|
|
// dt8.removeKey(&hk);
|
|
dt8.removeSlot(slot);
|
|
// but do add like a titledb rec that has the
|
|
// same key, because its data is probably
|
|
// different...
|
|
// HACK: enable for now since we lost
|
|
// the url:www.geico.com term somehow!!!
|
|
// geico got deleted but not the title rec!!
|
|
// MAKE SURE TITLEREC gets deleted then!!!
|
|
if ( ds==0 && g_conf.m_doIncrementalUpdating )
|
|
continue;
|
|
}
|
|
// ok, it is not already in an rdb, so add it
|
|
*nptr++ = byte;
|
|
// store key
|
|
gbmemcpy ( nptr, key , ks );
|
|
// skip over it
|
|
nptr += ks;
|
|
// store data size. BUT not if negative key!
|
|
if ( getDataSizeFromRdbId(rdbId) == -1 && ! neg ) {
|
|
*(int32_t *)nptr = ds;
|
|
nptr += 4;
|
|
}
|
|
// store data
|
|
if ( ds ) {
|
|
gbmemcpy ( nptr , data , ds );
|
|
nptr += ds;
|
|
}
|
|
}
|
|
// now scan dt8 and add their keys as del keys
|
|
for ( int32_t i = 0 ; i < dt8.m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! dt8.m_flags[i] ) continue;
|
|
// store rdbid first
|
|
char *rec = *(char **)dt8.getValueFromSlot(i);
|
|
// get rdbId with hi bit possibly set
|
|
char rdbId = rec[0] & 0x7f;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// sanity test - no negative keys
|
|
if ( (rec[1] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0;}
|
|
// copy the rdbId byte and key
|
|
gbmemcpy ( nptr , rec , 1 + ks );
|
|
// skip over rdbid
|
|
nptr++;
|
|
// make it a negative key by clearing lsb
|
|
*nptr = *nptr & 0xfe;
|
|
// skip it
|
|
nptr += ks;
|
|
// if it is from linkdb, and unmatched in the new list, then it is a
|
|
// lost link, so set the lost date of it. we keep
|
|
// these so we can graph lost links
|
|
if ( rdbId == RDB_LINKDB ) {
|
|
// the real linkdb rec is at rec+1
|
|
int32_t lost = g_linkdb.getLostDate_uk( rec+1 );
|
|
// how can it be non-zero? it should have
|
|
// been freshly made from the old titlerec...
|
|
if ( lost ) { char *xx=NULL;*xx=0; }
|
|
// if zero, set it to now!
|
|
//g_linkdb.setLostDate_uk(realRec,now);
|
|
// copy the rdbId byte and key
|
|
gbmemcpy ( nptr , rec , 1 + ks );
|
|
// set it in there now
|
|
g_linkdb.setLostDate_uk(nptr+1,now);
|
|
// carry it through on revdb, do not delete
|
|
// it! we want a linkdb history for seomasters
|
|
nptr += 1 + ks;
|
|
// and go on to delete the old linkdb key that
|
|
// did not have a lost date
|
|
//continue;
|
|
}
|
|
|
|
}
|
|
// sanity. check for metalist breach
|
|
if ( nptr > nmax ) { char *xx=NULL;*xx=0; }
|
|
// free the old meta list
|
|
mfree ( m_metaList , m_metaListAllocSize , "fm" );
|
|
// now switch over to the new one
|
|
m_metaList = nm;
|
|
m_metaListAllocSize = needx;
|
|
m_p = nptr;
|
|
}
|
|
|
|
|
|
// if we only removed it from index, set this flag
|
|
if ( oldList && ! nd ) m_didDelete = true;
|
|
|
|
//
|
|
// repeat this logic special for linkdb since we keep lost links
|
|
// and may update the discovery date or lost date in the keys
|
|
//
|
|
// 1. hash keys of old linkdb keys into dt9 here
|
|
// 2. do not hash the discovery/lost dates when making key hash for dt9
|
|
// 3. scan keys in meta list and add directly into new meta list
|
|
// if not in dt9
|
|
// 4. if in dt9 then add dt9 key instead
|
|
// 5. remove dt9 keys as we add them
|
|
// 6. then add remaining dt9 keys into meta list but with lost date
|
|
// set to now UNLESS it's already set
|
|
//
|
|
|
|
|
|
|
|
|
|
//
|
|
// validate us!
|
|
//
|
|
m_metaListValid = true;
|
|
|
|
// set the list size, different from the alloc size
|
|
m_metaListSize = m_p - m_metaList;//end - m_p;
|
|
// sanity check
|
|
verifyMetaList( m_metaList , m_metaList + m_metaListSize , forDelete );
|
|
|
|
// all done
|
|
return m_metaList;
|
|
}
|
|
|
|
// . copy from old title rec to us to speed things up!
|
|
// . returns NULL and set g_errno on error
|
|
// . returns -1 if blocked
|
|
// . returns 1 otherwise
|
|
// . when the doc content is unchanged, just inherit crap from the old title
|
|
// rec so we can make the spider reply in getNewSpiderReply()
|
|
void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
|
|
// skip if none
|
|
if ( ! od ) return;
|
|
// skip if already did it
|
|
if ( m_copied1 ) return;
|
|
// do not repeat
|
|
m_copied1 = true;
|
|
// set these
|
|
m_percentChanged = 0;
|
|
m_percentChangedValid = true;
|
|
|
|
// copy over bit members
|
|
m_contentHash32 = od->m_contentHash32;
|
|
//m_tagHash32 = od->m_tagHash32;
|
|
m_tagPairHash32 = od->m_tagPairHash32;
|
|
//m_sitePop = od->m_sitePop;
|
|
m_httpStatus = od->m_httpStatus;
|
|
m_hasAddress = od->m_hasAddress;
|
|
m_hasTOD = od->m_hasTOD;
|
|
//m_hasSiteVenue = od->m_hasSiteVenue;
|
|
m_isRSS = od->m_isRSS;
|
|
m_isPermalink = od->m_isPermalink;
|
|
m_hasContactInfo= od->m_hasContactInfo;
|
|
m_hopCount = od->m_hopCount;
|
|
m_crawlDelay = od->m_crawlDelay;
|
|
|
|
// do not forget the shadow members of the bit members
|
|
m_hasAddress2 = m_hasAddress;
|
|
m_hasTOD2 = m_hasTOD;
|
|
//m_hasSiteVenue2 = m_hasSiteVenue;
|
|
m_isRSS2 = m_isRSS;
|
|
m_isPermalink2 = m_isPermalink;
|
|
|
|
// validate them
|
|
m_contentHash32Valid = true;
|
|
//m_tagHash32Valid = true;
|
|
m_tagPairHash32Valid = true;
|
|
//m_sitePopValid = true;
|
|
m_httpStatusValid = true;
|
|
m_hasAddressValid = true;
|
|
m_hasTODValid = true;
|
|
//m_hasSiteVenueValid = true;
|
|
m_isRSSValid = true;
|
|
m_isPermalinkValid = true;
|
|
m_hasContactInfoValid= true;
|
|
m_hopCountValid = true;
|
|
m_crawlDelayValid = true;
|
|
|
|
m_pubDate = od->m_pubDate;
|
|
m_langId = od->m_langId;
|
|
|
|
m_pubDateValid = true;
|
|
m_langIdValid = true;
|
|
|
|
// so getSiteNumInlinks() doesn't crash when called by getNewSpiderReply()
|
|
// because dns timed out. it timed out with EDNSTIMEDOUT before.
|
|
// so overwrite it here...
|
|
if ( m_ip == -1 || m_ip == 0 || ! m_ipValid ) {
|
|
m_ip = od->m_ip;
|
|
m_ipValid = true;
|
|
m_siteNumInlinks = od->m_siteNumInlinks;
|
|
// m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
|
|
// m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlo
|
|
// m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
|
|
|
|
m_siteNumInlinksValid =
|
|
od->m_siteNumInlinksValid;
|
|
// m_siteNumInlinksUniqueIpValid =
|
|
// od->m_siteNumInlinksUniqueIpValid;
|
|
// m_siteNumInlinksUniqueCBlockValid =
|
|
// od->m_siteNumInlinksUniqueCBlockValid;
|
|
// m_siteNumInlinksTotal =
|
|
// od->m_siteNumInlinksTotalValid;
|
|
}
|
|
|
|
m_indexCode = 0;//od->m_indexCode;
|
|
m_indexCodeValid = true;
|
|
|
|
// we need the link info too!
|
|
ptr_linkInfo1 = od->ptr_linkInfo1;
|
|
size_linkInfo1 = od->size_linkInfo1;
|
|
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
|
|
else m_linkInfo1Valid = false;
|
|
|
|
// turn off for debug
|
|
ptr_sectiondbData = NULL;
|
|
size_sectiondbData = 0;
|
|
}
|
|
|
|
// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
|
|
SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
|
|
|
|
if ( ! m_tagRecValid ) {
|
|
m_tagRec.reset();
|
|
m_tagRecValid = true;
|
|
}
|
|
|
|
if ( ! m_siteHash32Valid ) {
|
|
m_siteHash32 = 1;
|
|
m_siteHash32Valid = true;
|
|
}
|
|
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTime = 0;
|
|
m_downloadEndTimeValid = true;
|
|
}
|
|
|
|
if ( ! m_ipValid ) {
|
|
m_ipValid = true;
|
|
m_ip = atoip("1.2.3.4");
|
|
}
|
|
|
|
if ( ! m_spideredTimeValid ) {
|
|
m_spideredTimeValid = true;
|
|
m_spideredTime = getTimeGlobal();//0; use now!
|
|
}
|
|
|
|
// don't let it get the diffbot reply either! it should be empty.
|
|
if ( ! m_diffbotReplyValid ) {
|
|
m_diffbotReplyValid = true;
|
|
}
|
|
|
|
// if doing diffbot query reindex
|
|
// TODO: does this shard the request somewhere else???
|
|
if ( ! m_firstIpValid ) {
|
|
m_firstIp = m_ip;//atoip("1.2.3.4");
|
|
m_firstIpValid = true;
|
|
}
|
|
|
|
// this was causing nsr to block and core below on an EBADENGINEER
|
|
// error loading the old title rec
|
|
if ( ! m_isPermalinkValid ) {
|
|
m_isPermalink = false;
|
|
m_isPermalinkValid = true;
|
|
}
|
|
|
|
//if ( ! m_sreqValid ) {
|
|
// m_sreqValid = true;
|
|
// m_sreq.m_parentDocId = 0LL;
|
|
// }
|
|
|
|
|
|
// if error is EFAKEFIRSTIP, do not core
|
|
//if ( ! m_isIndexedValid ) {
|
|
// m_isIndexed = false;
|
|
// m_isIndexedValid = true;
|
|
//}
|
|
|
|
// if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT
|
|
// or ECORRUPTDATA (corrupt gzip reply)
|
|
// then this should not block. we need a spiderReply to release the
|
|
// url spider lock in SpiderLoop::m_lockTable.
|
|
// if m_isChildDoc is true, like for diffbot url, this should be
|
|
// a bogus one.
|
|
SpiderReply *nsr = getNewSpiderReply ();
|
|
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
|
|
if ( ! nsr ) {
|
|
log("doc: crap, could not even add spider reply "
|
|
"to indicate internal error: %s",mstrerror(g_errno));
|
|
if ( ! g_errno ) g_errno = EBADENGINEER;
|
|
//return true;
|
|
return NULL;
|
|
}
|
|
|
|
return nsr;
|
|
|
|
//if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; }
|
|
|
|
//CollectionRec *cr = getCollRec();
|
|
//if ( ! cr ) return true;
|
|
}
|
|
|
|
// getSpiderReply()
|
|
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
|
|
|
if ( m_srepValid ) return &m_srep;
|
|
|
|
setStatus ( "getting spider reply" );
|
|
|
|
// diffbot guys, robots.txt, frames, should not be here
|
|
if ( m_isChildDoc ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . get the mime first
|
|
// . if we are setting XmlDoc from a titleRec, this causes
|
|
// doConsistencyCheck() to block and core
|
|
//HttpMime *mime = getMime();
|
|
//if ( ! mime || mime == (HttpMime *)-1 ) return (SpiderReply *)mime;
|
|
|
|
// if we had a critical error, do not do this
|
|
int32_t *indexCode = getIndexCode();
|
|
if (! indexCode || indexCode == (void *)-1)
|
|
return (SpiderReply *)indexCode;
|
|
|
|
|
|
// if it has been abandoned early, i.e. cut-off, then we should
|
|
// add a "fake" spider reply to release the lock in
|
|
// SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply()
|
|
// to see what parts of this are relevant.
|
|
/*
|
|
if ( *indexCode == EABANDONED ||
|
|
// . any internal "error" needs to be here really
|
|
// . was there an error unzipping the title rec?
|
|
*indexCode == ECORRUPTDATA ||
|
|
*indexCode == EHITCRAWLLIMIT ||
|
|
*indexCode == EHITPROCESSLIMIT ) {
|
|
// clear everything
|
|
m_srep.reset();
|
|
// get from spider request, if there
|
|
int32_t firstIp = 0;
|
|
if ( m_sreqValid ) firstIp = m_sreq.m_firstIp;
|
|
// otherwise, wtf?
|
|
if ( ! firstIp )
|
|
log("build: no first ip to make fake spiderReply. "
|
|
"injected?");
|
|
// we at least need this
|
|
m_srep.m_firstIp = firstIp;
|
|
Url *fu = getFirstUrl();
|
|
// this is the lock key
|
|
int64_t uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
|
|
m_srep.setKey ( firstIp, 0 , uh48 , false );
|
|
// tell it we are fake and not to really add us to
|
|
// spiderdb, but just to release the lock
|
|
m_srep.m_errCode = *indexCode;
|
|
m_srepValid = true;
|
|
return &m_srep;
|
|
}
|
|
*/
|
|
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;
|
|
|
|
// can't call getIsPermalink() here without entering a dependency loop
|
|
//char *pp = getIsUrlPermalinkFormat();
|
|
//if ( !pp || pp == (char *)-1 ) return (SpiderReply *)pp;
|
|
|
|
// the site hash
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SpiderReply *)sh32;
|
|
|
|
int64_t *de = getDownloadEndTime();
|
|
if ( ! de || de == (void *)-1 ) return (SpiderReply *)de;
|
|
|
|
// need to set m_sentToDiffbot!!
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( ! dbr || dbr == (void *)-1 ) return (SpiderReply *)dbr;
|
|
|
|
// was the doc indexed when we started trying to spider this url?
|
|
//char *wasIndexed = getIsIndexed();
|
|
//if ( ! wasIndexed || wasIndexed == (void *)-1 )
|
|
// return (SpiderReply *)wasIndexed;
|
|
|
|
//Tag *vt = m_oldTagRec.getTag("venueaddress");
|
|
//bool siteHasVenue = (bool)vt;
|
|
|
|
|
|
// shortcut
|
|
Url *fu = NULL;
|
|
// watch out for titlerec lookup errors for docid based spider reqs
|
|
if ( m_firstUrlValid ) fu = getFirstUrl();
|
|
|
|
// reset
|
|
m_srep.reset();
|
|
|
|
int32_t firstIp = -1;
|
|
// inherit firstIp
|
|
Tag *tag = m_tagRec.getTag("firstip");
|
|
// tag must be there?
|
|
if ( tag ) firstIp = atoip(tag->getTagData());
|
|
|
|
// this is usually the authority
|
|
if ( m_firstIpValid )
|
|
firstIp = m_firstIp;
|
|
|
|
// otherwise, inherit from oldsr to be safe
|
|
// BUT NOT if it was a fakeip and we were injecting because
|
|
// the SpiderRequest was manufactured and not actually taken
|
|
// from spiderdb! see XmlDoc::injectDoc() because that is where
|
|
// it came from!! if it has m_sreq.m_isAddUrl and
|
|
// m_sreq.m_fakeFirstIp then we actually do add the reply with that
|
|
// fake ip so that they will exist in the same shard.
|
|
// BUT if it is docid-based from PageReindex.cpp (a query reindex)
|
|
// we set the injection bit and the pagereindex bit, we should let
|
|
// these guys keep the firstip because the docid-based spider request
|
|
// is in spiderdb. it needs to match up.
|
|
if ( m_sreqValid && (!m_sreq.m_isInjecting||m_sreq.m_isPageReindex) )
|
|
firstIp = m_sreq.m_firstIp;
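// net effect: the original SpiderRequest's firstIp wins when it is real
// (not a fake injection ip), otherwise a valid m_firstIp, otherwise the
// "firstip" tag we inherited from tagdb above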
|
|
|
|
// sanity
|
|
if ( firstIp == 0 || firstIp == -1 ) {
|
|
if ( m_firstUrlValid )
|
|
log("xmldoc: BAD FIRST IP for %s",m_firstUrl.getUrl());
|
|
else
|
|
log("xmldoc: BAD FIRST IP for %"INT64"",m_docId);
|
|
firstIp = 12345;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
// store it
|
|
m_srep.m_firstIp = firstIp;
|
|
// assume no error
|
|
// MDW: not right...
|
|
m_srep.m_errCount = 0;
|
|
// otherwise, inherit from oldsr to be safe
|
|
//if ( m_sreqValid )
|
|
// m_srep.m_firstIp = m_sreq.m_firstIp;
|
|
|
|
// do not inherit this one, it MIGHT HAVE CHANGED!
|
|
m_srep.m_siteHash32 = m_siteHash32;
|
|
|
|
// need this for updating crawl delay table, m_cdTable in Spider.cpp
|
|
if ( fu ) m_srep.m_domHash32 = getDomHash32();
|
|
else m_srep.m_domHash32 = 0;
|
|
|
|
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . set other fields besides key
|
|
// . crap! if we are the "qatest123" collection then m_spideredTime
|
|
// was read from disk usually and is way in the past! watch out!!
|
|
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// crap, for the test coll this is often a very old time and it
|
|
// causes the spider request to be repeatedly executed, so let's
|
|
// fix that
|
|
if ( ! strcmp(cr->m_coll,"qatest123") )
|
|
m_srep.m_spideredTime = getTimeGlobal();
|
|
|
|
|
|
// TODO: expire these when "ownershipchanged" tag is newer!!
|
|
if ( gr->getTag ( "ingoogle" ) ) {
|
|
m_srep.m_inGoogle = 1;
|
|
m_srep.m_inGoogleValid = 1;
|
|
}
|
|
if ( gr->getTag ( "authorityinlink" ) )
|
|
m_srep.m_hasAuthorityInlink = 1;
|
|
// automatically valid either way
|
|
m_srep.m_hasAuthorityInlinkValid = 1;
|
|
// but for this tag, it must exist even if it has no contact info
|
|
//tag = gr->getTag ( "hascontactinfo" );
|
|
//if ( tag ) {
|
|
|
|
int64_t uh48 = 0LL;
|
|
// we might be a docid based spider request so fu could be invalid
|
|
// if the titlerec lookup failed
|
|
if ( fu ) uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
|
|
int64_t parentDocId = 0LL;
|
|
if ( m_sreqValid )
|
|
parentDocId = m_sreq.getParentDocId();
|
|
//else { char *xx=NULL;*xx=0; }
|
|
|
|
// for docid based urls from PageReindex.cpp we have to make
|
|
// sure to set the urlhash48 correctly from that.
|
|
if ( m_sreqValid ) uh48 = m_sreq.getUrlHash48();
|
|
|
|
// note it
|
|
if ( g_conf.m_logDebugSpider )
|
|
log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
|
|
|
|
// set the key, m_srep.m_key
|
|
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
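// this key must map to the same lock key (makeLockTableKey()) as the
// original SpiderRequest did, otherwise the reply could not release the
// url's lock; the sanity check near the bottom of this function
// verifies that whenever we have a valid, non-injected m_sreq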
|
|
|
|
// . did we download a page? even if indexcode is set we might have
|
|
// . if this is non-zero that means its valid
|
|
if ( m_contentHash32Valid )
|
|
m_srep.m_contentHash32 = m_contentHash32;
|
|
|
|
// injecting the content (url implied)
|
|
if ( m_contentInjected ) // m_sreqValid && m_sreq.m_isInjecting )
|
|
m_srep.m_fromInjectionRequest = 1;
|
|
|
|
// can be injecting a url too, content not necessarily implied
|
|
if ( m_sreqValid && m_sreq.m_isInjecting )
|
|
m_srep.m_fromInjectionRequest = 1;
|
|
|
|
if ( m_sentToDiffbotThisTime )
|
|
m_srep.m_sentToDiffbotThisTime = true;
|
|
else
|
|
m_srep.m_sentToDiffbotThisTime = false;
|
|
|
|
if ( m_diffbotReplyError )
|
|
m_srep.m_hadDiffbotError = true;
|
|
else
|
|
m_srep.m_hadDiffbotError = false;
|
|
|
|
// if we only had an error code in the diffbot reply, record that
|
|
if ( ! m_indexCode && m_diffbotReplyError )
|
|
m_srep.m_errCode = m_diffbotReplyError;
|
|
|
|
// sanity. if being called directly from indexDoc() because of
|
|
// an error like out of memory, then we do not know if it is
|
|
// indexed or not or was indexed...
|
|
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// were we already in titledb before we started spidering?
|
|
m_srep.m_wasIndexed = m_wasInIndex;
|
|
|
|
// note whether m_wasIndexed is valid because if it isn't then
|
|
// we shouldn't be counting this reply towards the page counts.
|
|
// if we never made it this far i guess we should not forcibly call
|
|
// getIsIndexed() at this point so our performance is fast in case
|
|
// this is an EFAKEFIRSTIP error or something similar where we
|
|
// basically just add this reply and we're done.
|
|
// NOTE: this also pertains to SpiderReply::m_isIndexed.
|
|
m_srep.m_wasIndexedValid = m_wasInIndexValid;
|
|
|
|
// assume no change
|
|
m_srep.m_isIndexed = m_isInIndex;
|
|
|
|
// we need to know if the m_isIndexed bit is valid or not
|
|
// because sometimes like if we are being called directly from
|
|
// indexDoc() because of an error situation, we do not know!
|
|
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
|
|
else m_srep.m_isIndexedINValid = true;
|
|
|
|
// likewise, we need to know if we deleted it so we can decrement the
|
|
// quota count for this subdomain/host in SpiderColl::m_quotaTable
|
|
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
|
|
|
|
// treat error replies special i guess, since langId, etc. will be
|
|
// invalid
|
|
if ( m_indexCode ) {
|
|
// validate
|
|
m_srepValid = true;
|
|
// set these items if valid already, but don't bother
|
|
// trying to compute them, since we are not indexing.
|
|
if ( m_siteNumInlinksValid ) {
|
|
m_srep.m_siteNumInlinks = m_siteNumInlinks;
|
|
m_srep.m_siteNumInlinksValid = true;
|
|
}
|
|
//if ( m_percentChangedValid )
|
|
// m_srep.m_percentChangedPerDay = m_percentChanged;
|
|
if ( m_crawlDelayValid && m_crawlDelay >= 0 )
|
|
// we already multiply x1000 in isAllowed2()
|
|
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
|
|
else
|
|
m_srep.m_crawlDelayMS = -1;
|
|
if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
|
|
if ( m_langIdValid ) m_srep.m_langId = m_langId;
|
|
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
|
|
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
|
|
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
|
|
// stuff that is automatically valid
|
|
m_srep.m_isPingServer = 0;
|
|
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
|
|
// this was replaced by m_contentHash32
|
|
//m_srep.m_newRequests = 0;
|
|
m_srep.m_errCode = m_indexCode;
|
|
if ( m_downloadEndTimeValid )
|
|
m_srep.m_downloadEndTime = m_downloadEndTime;
|
|
else
|
|
m_srep.m_downloadEndTime = 0;
|
|
// is the original spider request valid?
|
|
if ( m_sreqValid ) {
|
|
// preserve the content hash in case m_indexCode is
|
|
// EDOCUNCHANGED. so we can continue to get that
|
|
// in the future. also, if we had the doc indexed,
|
|
// just carry the contentHash32 forward for the other
|
|
// errors like EDNSTIMEDOUT or whatever.
|
|
m_srep.m_contentHash32 = m_sreq.m_contentHash32;
|
|
// shortcuts
|
|
SpiderReply *n = &m_srep;
|
|
SpiderRequest *o = &m_sreq;
|
|
// more stuff
|
|
n->m_inGoogle = o->m_inGoogle;
|
|
n->m_hasContactInfo = o->m_hasContactInfo;
|
|
n->m_isContacty = o->m_isContacty;
|
|
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
|
|
n->m_isPingServer = o->m_isPingServer;
|
|
// the validator flags
|
|
n->m_inGoogleValid = o->m_inGoogleValid;
|
|
n->m_hasContactInfoValid = o->m_hasContactInfoValid;
|
|
n->m_isContactyValid = o->m_isContactyValid;
|
|
n->m_hasAuthorityInlinkValid =
|
|
o->m_hasAuthorityInlinkValid;
|
|
// get error count from original spider request
|
|
int32_t newc = m_sreq.m_errCount;
|
|
// inc for us, since we had an error
|
|
newc++;
|
|
// contain to one byte
|
|
if ( newc > 255 ) newc = 255;
|
|
// store in our spiderreply
|
|
m_srep.m_errCount = newc;
|
|
}
|
|
// . and do not really consider this an error
|
|
// . i don't want the url filters treating it as an error reply
|
|
// . m_contentHash32 should have been carried forward from
|
|
// the block of code right above
|
|
if ( m_indexCode == EDOCUNCHANGED ) {
|
|
// we should have had a spider request, because that's
|
|
// where we got the m_contentHash32 we passed to
|
|
// Msg13Request.
|
|
if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
// make it a success
|
|
m_srep.m_errCode = 0;
|
|
// and no error count, it wasn't an error per se
|
|
m_srep.m_errCount = 0;
|
|
// call it 200
|
|
m_srep.m_httpStatus = 200;
|
|
}
|
|
// copy flags and data from old doc...
|
|
if ( m_indexCode == EDOCUNCHANGED &&
|
|
m_oldDocValid &&
|
|
m_oldDoc ) {
|
|
m_srep.m_pubDate = m_oldDoc->m_pubDate;
|
|
m_srep.m_langId = m_oldDoc->m_langId;
|
|
m_srep.m_isRSS = m_oldDoc->m_isRSS;
|
|
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
|
|
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
|
|
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
|
|
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
|
|
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
|
|
// they're all valid
|
|
m_srep.m_hasAddressValid = true;
|
|
m_srep.m_hasTODValid = true;
|
|
//m_srep.m_hasSiteVenueValid = true;
|
|
m_srep.m_siteNumInlinksValid = true;
|
|
}
|
|
// do special things if
|
|
return &m_srep;
|
|
}
|
|
|
|
// this will help us avoid hammering ips & respect same ip wait
|
|
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0; }
|
|
m_srep.m_downloadEndTime = m_downloadEndTime;
|
|
|
|
// . if m_indexCode was 0, we are indexed then...
|
|
// . this logic is now above
|
|
//m_srep.m_isIndexed = 1;
|
|
|
|
// get ptr to old doc/titlerec
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod || pod == (XmlDoc **)-1 ) return (SpiderReply *)pod;
|
|
// this is non-NULL if it existed
|
|
XmlDoc *od = *pod;
|
|
|
|
// status is -1 if not found
|
|
int16_t *hs = getHttpStatus ();
|
|
if ( ! hs || hs == (void *)-1 ) return (SpiderReply *)hs;
|
|
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (SpiderReply *)sni;
|
|
|
|
float *pc = getPercentChanged();
|
|
if ( ! pc || pc == (void *)-1 ) return (SpiderReply *)pc;
|
|
|
|
// these are "non-dup" addresses (nondup)
|
|
bool *hasAddress = getHasAddress();
|
|
if ( ! hasAddress || hasAddress == (void *)-1 )
|
|
return (SpiderReply *)hasAddress;
|
|
// does it have a tod (i.e. 6pm) in there somewhere?
|
|
bool *hasTOD = getHasTOD();
|
|
if ( ! hasTOD || hasTOD == (void *)-1 )
|
|
return (SpiderReply *)hasTOD;
|
|
// does it have a venue address?
|
|
//bool *hasSiteVenue = getHasSiteVenue();
|
|
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
|
|
// return (SpiderReply *)hasSiteVenue;
|
|
// get the content type
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (SpiderReply *)isRoot;
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (SpiderReply *)hci;
|
|
|
|
|
|
|
|
int32_t *pubDate = getPubDate();
|
|
if ( ! pubDate || pubDate == (int32_t *)-1 )
|
|
return (SpiderReply *)pubDate;
|
|
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 )
|
|
return (SpiderReply *)langId;
|
|
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS || isRSS == (char *)-1 )
|
|
return (SpiderReply *)isRSS;
|
|
|
|
char *pl = getIsPermalink();
|
|
if ( ! pl || pl == (char *)-1 )
|
|
return (SpiderReply *)pl;
|
|
|
|
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_hasContactInfo ) {
|
|
m_srep.m_hasContactInfo = 1;
|
|
m_srep.m_hasContactInfoValid = 1;
|
|
}
|
|
|
|
// this is only known if we download the robots.txt...
|
|
if ( od && m_recycleContent ) {
|
|
m_crawlDelay = od->m_crawlDelay;
|
|
m_crawlDelayValid = true;
|
|
}
|
|
|
|
// sanity checks
|
|
//if(! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_isSpamValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// httpStatus is -1 if not found (like for empty http replies)
|
|
m_srep.m_httpStatus = *hs;
|
|
|
|
// zero if none
|
|
//m_srep.m_percentChangedPerDay = 0;
|
|
// . only if had old one
|
|
// . we use this in url filters to set the respider wait time usually
|
|
if ( od ) {
|
|
int32_t spideredTime = getSpideredTime();
|
|
int32_t oldSpideredTime = od->getSpideredTime();
|
|
float numDays = spideredTime - oldSpideredTime;
|
|
m_srep.m_percentChangedPerDay = (m_percentChanged+.5)/numDays;
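// note: spideredTime values look like unix timestamps in seconds, so
// "numDays" above is really a delta in seconds; if so, this rate is off
// by a factor of 86400 unless it gets rescaled elsewhere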
|
|
}
|
|
|
|
// . update crawl delay, but we must store it now in milliseconds
|
|
// because Spider.cpp likes it better that way
|
|
// . -1 implies crawl delay unknown or not found
|
|
if ( m_crawlDelay >= 0 && m_crawlDelayValid )
|
|
// we already multiply x1000 in isAllowed2()
|
|
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
|
|
else
|
|
// -1 means invalid/unknown
|
|
m_srep.m_crawlDelayMS = -1;
|
|
|
|
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
|
|
|
|
// . we use this to store "bad" spider recs to keep from respidering
|
|
// a "bad" url over and over again
|
|
// . it is up to the url filters whether they want to retry this
|
|
// again or not!
|
|
// . TODO: how to represent "ETCPTIMEDOUT"????
|
|
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
|
|
// ENETUNREACH,EBADMIME,ECONNREFUSED,EHOSTUNREACH
|
|
m_srep.m_siteNumInlinks = m_siteNumInlinks;
|
|
m_srep.m_pubDate = *pubDate;
|
|
// this was replaced by m_contentHash32
|
|
//m_srep.m_newRequests = 0;
|
|
m_srep.m_langId = *langId;
|
|
m_srep.m_isRSS = (bool)*isRSS;
|
|
m_srep.m_isPermalink = (bool)*pl;
|
|
m_srep.m_isPingServer = (bool)fu->isPingServer();
|
|
//m_srep.m_isSpam = m_isSpam;
|
|
|
|
m_srep.m_siteNumInlinksValid = true;
|
|
|
|
// . ignore address in dup sections (nondup/non-dup addresses only)
|
|
// . this way if the place always has their address in the header or
|
|
// footer of every web page we will ignore it
|
|
m_srep.m_hasAddress = *hasAddress;
|
|
m_srep.m_isContacty = *hci;//getIsContacty(fu,
|
|
// info1,
|
|
// m_hopCount ,
|
|
// *ct , // contentType
|
|
// *isRoot ,
|
|
// m_niceness );
|
|
m_srep.m_hasTOD = *hasTOD;
|
|
//m_srep.m_hasSiteVenue = *hasSiteVenue;
|
|
|
|
// validate all
|
|
m_srep.m_inGoogleValid = 1;
|
|
m_srep.m_hasContactInfoValid = 1;
|
|
m_srep.m_hasAuthorityInlinkValid = 1;
|
|
m_srep.m_isContactyValid = 1;
|
|
m_srep.m_hasAddressValid = 1;
|
|
m_srep.m_hasTODValid = 1;
|
|
//m_srep.m_hasSiteVenueValid = 1;
|
|
|
|
// a quick validation. reply must unlock the url from the lock table.
|
|
// so the locks must be equal.
|
|
if ( m_sreqValid &&
|
|
// we create a new spiderrequest if injecting with a fake firstip
|
|
// so it will fail this test...
|
|
! m_sreq.m_isInjecting ) {
|
|
int64_t lock1 = makeLockTableKey(&m_sreq);
|
|
int64_t lock2 = makeLockTableKey(&m_srep);
|
|
if ( lock1 != lock2 ) {
|
|
log("build: lock1 != lock2 lock mismatch for %s",
|
|
m_firstUrl.m_url);
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
}
|
|
|
|
// validate
|
|
m_srepValid = true;
|
|
|
|
return &m_srep;
|
|
}
|
|
|
|
// . so Msg20 can see if we are banned now or not...
|
|
// . we must skip certain rules in getUrlFilterNum() when doing it for Msg20
|
|
// because things like "parentIsRSS" can be both true or false since a url
|
|
// can have multiple spider recs associated with it!
|
|
void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq ,
|
|
SpiderReply *srep ) {
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_isUrlPermalinkFormatValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
Url *fu = getFirstUrl();
|
|
|
|
// get this
|
|
//TagRec *gr = (TagRec *)ptr_tagRecData;
|
|
//Tag *tag = NULL;
|
|
//if ( gr ) tag = gr->getTag("sitenuminlinks");
|
|
// reset
|
|
sreq->reset();
|
|
// assume not valid
|
|
sreq->m_siteNumInlinks = -1;
|
|
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// how many site inlinks?
|
|
sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
sreq->m_siteNumInlinksValid = true;
|
|
|
|
// set other fields besides key
|
|
sreq->m_firstIp = m_ip;
|
|
sreq->m_hostHash32 = m_hostHash32a;
|
|
//sreq->m_domHash32 = m_domHash32;
|
|
//sreq->m_siteNumInlinks = m_siteNumInlinks;
|
|
//sreq->m_pageNumInlinks = m_pageNumInlinks;
|
|
sreq->m_hopCount = m_hopCount;
|
|
|
|
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
|
|
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
|
|
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
|
|
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
|
|
|
|
sreq->m_isNewOutlink = 0;
|
|
sreq->m_isAddUrl = 0;//m_isAddUrl;
|
|
sreq->m_isPingServer = fu->isPingServer();
|
|
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
|
|
|
|
// transcribe from old spider rec, stuff should be the same
|
|
sreq->m_addedTime = m_firstIndexedDate;
|
|
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
|
|
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
|
|
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
|
|
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
|
|
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
|
|
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
|
|
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
|
|
|
|
// validate the stuff so getUrlFilterNum() acks it
|
|
sreq->m_hopCountValid = 1;
|
|
|
|
srep->reset();
|
|
|
|
srep->m_spideredTime = getSpideredTime();//m_spideredTime;
|
|
//srep->m_isSpam = isSpam; // real-time update this!!!
|
|
srep->m_isRSS = m_isRSS;
|
|
srep->m_isPermalink = m_isPermalink;
|
|
srep->m_httpStatus = 200;
|
|
//srep->m_retryNum = 0;
|
|
srep->m_langId = m_langId;
|
|
srep->m_percentChangedPerDay = 0;//m_percentChanged;
|
|
|
|
// we need this now for ucp ucr upp upr new url filters that do
|
|
// substring matching on the url
|
|
if ( m_firstUrlValid )
|
|
strcpy(sreq->m_url,m_firstUrl.m_url);
|
|
}
|
|
|
|
// defined in PageCrawlBot.cpp
|
|
int32_t isInSeedBuf ( CollectionRec *cr , char *url, int len ) ;
|
|
|
|
// . add the spiderdb recs to the meta list
|
|
// . used by XmlDoc::setMetaList()
|
|
// . returns NULL and sets g_errno on error
|
|
// . otherwise returns the "new p"
|
|
// . Scraper.cpp, PageAddUrl.cpp and Msg7.cpp should all use the XmlDoc
|
|
// class even if just adding links. they should make a fake html page and
|
|
// "inject" it, with only m_useSpiderdb set to true...
|
|
char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
|
|
|
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do not do this if recycling content
|
|
// UNLESS REBUILDING...
|
|
if ( m_recycleContent && ! m_useSecondaryRdbs ) return (char *)0x01;
|
|
|
|
|
|
// for now skip in repair tool
|
|
if ( m_useSecondaryRdbs && ! g_conf.m_rebuildAddOutlinks )
|
|
return (char *)0x01;
|
|
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (char *)links;
|
|
|
|
char *spiderLinks = getSpiderLinks();
|
|
if ( ! spiderLinks || spiderLinks == (char *)-1 )
|
|
return (char *)spiderLinks;
|
|
|
|
TagRec ***grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
|
|
//char **iiv = getOutlinkIsIndexedVector();
|
|
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
|
|
int32_t **ipv = getOutlinkFirstIpVector();
|
|
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
|
|
//int8_t *hcv = getOutlinkHopCountVector();
|
|
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
|
|
char *ipi = getIsIndexed(); // is the parent indexed?
|
|
if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (Addresses *)-1 ) return (char *)aa;
|
|
// sanity check
|
|
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . ignore address in dup sections
|
|
// . this way if the place always has their address in the header or
|
|
// footer of every web page we will ignore it (SEC_DUP section flag)
|
|
bool parentHasAddress = (bool)(aa->getNumNonDupAddresses()>0);
|
|
|
|
// need this
|
|
int32_t parentDomHash32 = getDomHash32();
|
|
if ( parentDomHash32 != m_domHash32 ) { char *xx=NULL;*xx=0; }
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
|
|
|
|
int32_t *psni = getSiteNumInlinks();
|
|
if ( ! psni || psni == (int32_t *)-1 ) return (char *)psni;
|
|
|
|
int32_t *pfip = getFirstIp();
|
|
if ( ! pfip || pfip == (void *)-1 ) return (char *)pfip;
|
|
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
|
|
|
|
Url *fu = getFirstUrl();
|
|
if ( ! fu || fu == (void *)-1 ) return (char *)fu;
|
|
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (char *)cu;
|
|
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;
|
|
|
|
// validate this to prevent core for simplified redirect links
|
|
int32_t hostHash32a = getHostHash32a();
|
|
|
|
// so linkSites[i] is site for link #i in Links.cpp class
|
|
int32_t *linkSiteHashes = getLinkSiteHashes ( );
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
|
|
return (char *)linkSiteHashes;
|
|
|
|
|
|
XmlDoc *nd = this;
|
|
|
|
// set "od". will be NULL if no old xml doc, i.e. no old title rec
|
|
//XmlDoc **pod = getOldXmlDoc ( );
|
|
//if ( ! pod || pod == (void *)-1 ) return (char *)pod;
|
|
//XmlDoc *od = *pod;
|
|
|
|
// if this page is hacked, then do not spider external outlinks
|
|
//char *comp = getIsCompromised();
|
|
//if ( ! comp || comp == (char *)-1 ) return (char *)comp;
|
|
//if ( *comp )
|
|
// onlyInternal = true;
|
|
|
|
bool isParentRSS = false;
|
|
bool parentIsPermalink = false;
|
|
bool parentIsSiteMap = false;
|
|
// PageAddUrl.cpp does not supply a valid new doc, so this is NULL
|
|
if ( nd ) {
|
|
isParentRSS = *nd->getIsRSS() ;
|
|
parentIsPermalink = *nd->getIsPermalink();
|
|
parentIsSiteMap = *nd->getIsSiteMap();
|
|
}
|
|
|
|
int32_t n = links->m_numLinks;
|
|
// return early if nothing to do. do not return NULL though cuz we
|
|
// do not have g_errno set!
|
|
if ( n <= 0 ) return (char *)0x01;
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int64_t myUh48 = m_firstUrl.getUrlHash48();
|
|
|
|
// . pre-allocate a buffer to hold the spider recs
|
|
// . taken from SpiderRequest::store()
|
|
int32_t size = 0;
|
|
for ( int32_t i = 0 ; i < n ; i++ )
|
|
size += SpiderRequest::getNeededSize ( links->getLinkLen(i) );
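// (getNeededSize() presumably covers the fixed SpiderRequest header
// plus the null-terminated url, so "size" is the most the outlink
// requests can take up in the metalist)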
|
|
|
|
// append spider recs to this list ptr
|
|
char *p = m_p;
|
|
|
|
// hash table to avoid dups
|
|
HashTableX ht;
|
|
char buf2[8192];
|
|
if ( ! ht.set ( 4,0,1000,buf2 , 8192,false,m_niceness,"linkdedup" ) )
|
|
return NULL;
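// (dedup is by a 32-bit hash of the raw outlink text; 0 means an empty
// slot in the table, which is why a computed hash of 0 is remapped to 1
// in the loop below)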
|
|
|
|
// count how many we add
|
|
int32_t numAdded = 0;
|
|
int32_t numAddedFromSameDomain = 0;
|
|
int32_t linksBanned = 0;
|
|
int32_t linksFiltered = 0;
|
|
|
|
bool isParentPingServer = false;
|
|
if ( fu && fu->isPingServer() ) isParentPingServer = true;
|
|
if ( cu && cu->isPingServer() ) isParentPingServer = true;
|
|
|
|
// shortcut
|
|
bool isScraping = (m_sreqValid && m_sreq.m_isScraping);
|
|
//bool useTestSpiderDir = (m_sreqValid && m_sreq.m_useTestSpiderDir);
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// do not do this if not test collection for now
|
|
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
|
|
// turn off for now
|
|
isTestColl = false;
|
|
|
|
//char **wptrs = m_words.getWords();
|
|
//int32_t *wlens = m_words.getWordLens();
|
|
|
|
// need this for setting SpiderRequest::m_spiderTime
|
|
//int32_t nowGlobal = getTimeGlobal();
|
|
|
|
// for setting LF_CONTACTY bit on the outlinks
|
|
char disbuf[1000];
|
|
HashTableX disqualify;
|
|
disqualify.set(4,0,32,disbuf,1000,false,m_niceness,"disqual");
|
|
int32_t consec = 0;
|
|
int32_t linkTypes[2000];
|
|
int32_t lastType = 0;
|
|
|
|
|
|
|
|
// if the file we are indexing now has
|
|
// "<meta name=spiderlinkslinks value=0>" then that means to
|
|
// add the links to spiderdb, but do not spider their links!
|
|
// dmozparse uses this to make a file called gbdmoz.urs.txt.0
|
|
// that is just filled with urls that are in dmoz. and we want
|
|
// to index just those urls.
|
|
//
|
|
// now just make dmozparse output urls as <a href=> tags.
|
|
//
|
|
char mbuf[16];
|
|
mbuf[0] = '\0';
|
|
char *tag = "spiderlinkslinks";
|
|
int32_t tlen = gbstrlen(tag);
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
bool avoid = false;
|
|
if ( mbuf[0] == '0' ) avoid = true;
|
|
|
|
// if this is a simplified redir and we should not be spidering
|
|
// links then turn it off as well! because we now add simplified
|
|
// redirects back into spiderdb using this function.
|
|
if ( m_spiderLinksValid && ! m_spiderLinks )
|
|
avoid = true;
|
|
|
|
// it also has this meta tag now too
|
|
mbuf[0] = '\0';
|
|
tag = "ignorelinksexternalerrors";
|
|
tlen = gbstrlen(tag);
|
|
xml->getMetaContent ( mbuf, 16 , tag , tlen );
|
|
bool ignore = false;
|
|
if ( mbuf[0] == '1' ) ignore = true;
|
|
|
|
// for diffbot crawlbot, if we are a seed url and redirected to a
|
|
// different domain... like bn.com --> barnesandnoble.com
|
|
int32_t redirDomHash32 = 0;
|
|
int32_t redirHostHash32 = 0;
|
|
//int32_t redirSiteHash32 = 0;
|
|
if ( //cr->m_isCustomCrawl == 1 &&
|
|
//isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) &&
|
|
m_hopCount == 0 &&
|
|
m_redirUrlValid &&
|
|
ptr_redirUrl &&
|
|
//m_redirUrlPtr && (this gets reset to NULL as being LAST redir)
|
|
// this is the last non-empty redir here:
|
|
m_redirUrl.getUrlLen() > 0 ) {
|
|
log("build: seed REDIR: %s",m_redirUrl.getUrl());
|
|
redirDomHash32 = m_redirUrl.getDomainHash32();
|
|
redirHostHash32 = m_redirUrl.getHostHash32();
|
|
}
|
|
|
|
|
|
//SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
|
|
|
|
//
|
|
// serialize each link into the metalist now
|
|
//
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// grab our info
|
|
TagRec *gr = (*grv)[i];
|
|
int32_t firstIp = (*ipv)[i];
|
|
//char isIndexed = (*iiv)[i];
|
|
//int32_t hc = hcv[i];
|
|
// ip lookup failed? do not add to spiderdb then
|
|
if ( firstIp == 0 || firstIp == -1 ) continue;
|
|
|
|
// if firstIp is in the SpiderColl::m_overflowFirstIps list
|
|
// then do not add any more links to it. it already has
|
|
// more than 500MB worth.
|
|
// this was moved to Rdb.cpp's addRecord()
|
|
// if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
|
|
// m_linkOverflows++;
|
|
// g_stats.m_totalOverflows++;
|
|
// continue;
|
|
// }
|
|
|
|
// sanity check
|
|
//if ( firstIp == 0x03 ) {char *xx=NULL;*xx=0; }
|
|
// get flags
|
|
linkflags_t flags = links->m_linkFlags[i];
|
|
// . skip if we are an rss page and this link is an <a href> link
|
|
// . we only harvest <link> urls from rss feeds, not href links
|
|
// . or in the case of feedburner, those orig tags
|
|
if ( isParentRSS && (flags & LF_AHREFTAG) ) continue;
|
|
// if we have a <feedburner:origLink> tag, then ignore <link>
|
|
// tags and only get the links from the original links
|
|
if ( links->m_isFeedBurner && !(flags & LF_FBTAG) ) continue;
|
|
// do not add self links, pointless
|
|
if ( flags & LF_SELFLINK ) continue;
|
|
// do not add if no follow
|
|
if ( flags & LF_NOFOLLOW ) continue;
|
|
// point to url
|
|
char *s = links->getLink (i);
|
|
int32_t slen = links->getLinkLen(i);
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get hash
|
|
int32_t uh = hash32 ( s , slen );
|
|
// it does not like keys of 0, that means empty slot
|
|
if ( uh == 0 ) uh = 1;
|
|
// skip if dup
|
|
if ( ht.isInTable ( &uh ) ) continue;
|
|
// add it, returns false and sets g_errno on error
|
|
if ( ! ht.addKey ( &uh ) ) return NULL;
|
|
// we now support HTTPS
|
|
if ( strncmp(s,"http://",7) && strncmp(s,"https://",8) )
|
|
continue;
|
|
// . do not add if "old"
|
|
// . Links::set() calls flagOldOutlinks()
|
|
// . that just means we probably added it the last time
|
|
// we spidered this page
|
|
// . no cuz we might have a different siteNumInlinks now
|
|
// and maybe this next hop count is now allowed where as
|
|
// before it was not!
|
|
//if ( flags & LF_OLDLINK ) continue;
|
|
|
|
// set it. addWWW = true! no.. make it false because of issues
|
|
// like tmblr.co/ZHw5yo1E5TAaW injection where
|
|
// www.tmblr.co has no IP
|
|
Url url; url.set ( s , slen , false ); // true );
|
|
|
|
// if hostname length is <= 2 then SILENTLY reject it
|
|
if ( url.getHostLen() <= 2 ) continue;
|
|
|
|
// are we a new outlink from a ? i.e. a "hot link"? assume so
|
|
bool newOutlink = true;
|
|
// if no old links, can not be a new outlink then
|
|
if ( flags & LF_OLDLINK ) newOutlink = false;
|
|
// . do not consider outlinks of new pages to be newOutlinks.
|
|
// that is somewhat redundant.
|
|
// . you can use "parentisnew" to do what you want in the url
|
|
// filters table
|
|
//if ( ! isIndexed ) newOutlink = false;
|
|
|
|
// get # of inlinks to this site... if recorded...
|
|
int32_t ksni = -1;
|
|
Tag *st = NULL;
|
|
if ( gr ) st = gr->getTag ("sitenuminlinks");
|
|
if ( st ) ksni = atol(st->getTagData());
|
|
|
|
int32_t hostHash32 = url.getHostHash32();
|
|
// . consult our sitelinks.txt file
|
|
// . returns -1 if not found
|
|
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
|
|
|
// try with www if not there
|
|
if ( min < 0 && ! url.hasSubdomain() ) {
|
|
int32_t wwwHash32 = url.getHash32WithWWW();
|
|
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
|
}
|
|
|
|
if ( min >= 0 && ksni < min )
|
|
ksni = min;
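// so ksni is the outlink's "sitenuminlinks" tag value, but never less
// than the hard-coded minimum from sitelinks.txt when that file has an
// entry for this host (or its www form)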
|
|
|
|
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
//int32_t ksni = m_siteNumInlinks;
|
|
|
|
// . get possible pub date from url (.../2008/09/23/page.htm)
|
|
// . this returns 0 if none found
|
|
//int32_t urlPubDate = parseDateFromUrl(s);
|
|
|
|
// use zero for the timestamp so SiteGetter does not recompute
|
|
// any tags in the tagRec thereby blocking!
|
|
//SiteGetter sg;
|
|
//sg.getSite ( s , gr , 0, m_coll, m_niceness,false,NULL,NULL);
|
|
// get this
|
|
bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );
|
|
//int32_t siteHash32 = hash32n ( linkSite );
|
|
|
|
// get it quick
|
|
bool ispingserver = url.isPingServer();
|
|
int32_t domHash32 = url.getDomainHash32();
|
|
|
|
// is link rss?
|
|
//bool isrss = false;
|
|
//if (slen>6 && !strncasecmp(s+slen-4,".rss",4)) isrss = true;
|
|
bool isRSSExt = false;
|
|
char *ext = url.getExtension();
|
|
if ( ext && strcasecmp(ext,"rss" ) == 0 ) isRSSExt = true;
|
|
if ( ext && strcasecmp(ext,"xml" ) == 0 ) isRSSExt = true;
|
|
if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
|
|
|
|
|
|
// make the spider request rec for it
|
|
SpiderRequest ksr;
|
|
// to defaults (zero out)
|
|
ksr.reset();
|
|
// set other fields besides key
|
|
ksr.m_firstIp = firstIp;
|
|
ksr.m_hostHash32 = hostHash32;
|
|
ksr.m_domHash32 = domHash32;
|
|
ksr.m_siteHash32 = linkSiteHashes[i];//siteHash32;
|
|
ksr.m_siteNumInlinks = ksni;
|
|
ksr.m_siteNumInlinksValid = true;
|
|
ksr.m_isRSSExt = isRSSExt;
|
|
// continue using "test-spider" subdir to cache web pages
|
|
// if our parent was using that
|
|
//ksr.m_useTestSpiderDir = useTestSpiderDir;
|
|
ksr.m_parentIsSiteMap = parentIsSiteMap;
|
|
|
|
ksr.m_hasMediaExtension = url.hasMediaExtension();
|
|
ksr.m_hasMediaExtensionValid = 1;
|
|
|
|
// now we need this so we can share Msg12 spider locks with
|
|
// query reindex docid-based spider requests. that way
|
|
// we do not spider the same document at the same time.
|
|
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);
|
|
|
|
//ksr.m_pageNumInlinks = 0;
|
|
|
|
// hop count is now 16 bits so do not wrap that around
|
|
int32_t hc = m_hopCount + 1;
|
|
if ( hc > 65535 ) hc = 65535;
|
|
ksr.m_hopCount = hc;
|
|
|
|
// keep hopcount the same for redirs
|
|
if ( m_indexCodeValid &&
|
|
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
|
|
m_indexCode == EDOCNONCANONICAL ) )
|
|
ksr.m_hopCount = m_hopCount;
|
|
|
|
// for diffbot custom crawls we keep the computed hopcount
|
|
if ( ! cr->m_isCustomCrawl ) {
|
|
if ( issiteroot ) ksr.m_hopCount = 0;
|
|
if ( ispingserver ) ksr.m_hopCount = 0;
|
|
//if ( isrss ) ksr.m_hopCount = 0;
|
|
}
|
|
|
|
// log("ksr: url=%s hc=%i (isr=%i ips=%i icv=%i ic=%i mhc=%i)",
|
|
// url.getUrl(),(int)ksr.m_hopCount,
|
|
// (int)issiteroot,(int)ispingserver,(int)m_indexCodeValid,
|
|
// (int)m_indexCode,(int)m_hopCount
|
|
// );
|
|
|
|
// validate it
|
|
ksr.m_hopCountValid = true;
|
|
|
|
ksr.m_addedTime = getSpideredTime();//m_spideredTime;
|
|
//ksr.m_lastAttempt = 0;
|
|
//ksr.m_urlPubDate = urlPubDate;
|
|
//ksr.m_errCode = 0;
|
|
ksr.m_parentHostHash32 = hostHash32a;
|
|
ksr.m_parentDomHash32 = m_domHash32;
|
|
ksr.m_parentSiteHash32 = m_siteHash32;
|
|
|
|
// if a seed/hopcount0 url redirected to a different domain
|
|
// then use that if it is the same. that way we can satisfy
|
|
// the "isonsamedomain" expression in the url filters table.
|
|
if ( redirDomHash32 == domHash32 && redirDomHash32 )
|
|
ksr.m_parentDomHash32 = redirDomHash32;
|
|
if ( redirHostHash32 == hostHash32 && redirHostHash32 )
|
|
ksr.m_parentHostHash32 = redirHostHash32;
|
|
|
|
//ksr.m_parentFirstIp = *pfip;//m_ip;
|
|
ksr.m_pageNumInlinks = 0;
|
|
|
|
ksr.m_parentHasAddress = parentHasAddress;
|
|
// get this
|
|
bool isupf = ::isPermalink(NULL,&url,CT_HTML,NULL,isRSSExt);
|
|
// set some bit flags. the rest are 0 since we call reset()
|
|
if ( newOutlink ) ksr.m_isNewOutlink = 1;
|
|
if ( isupf ) ksr.m_isUrlPermalinkFormat = 1;
|
|
//if ( isIndexed ) ksr.m_isIndexed = 1;
|
|
if ( ispingserver ) ksr.m_isPingServer = 1;
|
|
|
|
// is it like www.xxx.com/* (does not include www.xxx.yyy.com)
|
|
// includes xxx.com/* however
|
|
ksr.m_isWWWSubdomain = url.isSimpleSubdomain();
|
|
|
|
// get link text we use for this outlink
|
|
/*
|
|
char tbuf[200];
|
|
int32_t tlen = links->getLinkText2 ( i ,
|
|
tbuf ,
|
|
200 ,
|
|
NULL ,
|
|
NULL ,
|
|
NULL ,
|
|
m_niceness );
|
|
*/
|
|
|
|
// the updated isContacty algo to fix www.apha.org which
|
|
// has a ton of apha.org/about/* links
|
|
int32_t t = getIsContacty ( &url,
|
|
NULL ,
|
|
ksr.m_hopCount ,
|
|
0 , // content type
|
|
(ksr.m_hopCount==0),
|
|
m_niceness );
|
|
// if same type as last one we might disqualify if 3 in a row
|
|
if ( t && t == lastType ) consec++;
|
|
else consec = 0;
|
|
// disqualify this pattern as a contacty link if is abused
|
|
if ( consec >= 3 )
|
|
if ( ! disqualify.addKey(&t) )
|
|
return NULL;
|
|
// remember. use numAdded as the index for this since we do
|
|
// not add all the outlinks to this list.
|
|
if ( numAdded < 2000 ) linkTypes[numAdded] = t;
|
|
// set this
|
|
lastType = t;
|
|
|
|
// validate
|
|
ksr.m_isContactyValid = 1;
|
|
|
|
// if parent is a root of a popular site, then it is considered
|
|
// an authority linker. (see updateTagdb() function above)
|
|
if ( *isRoot && *psni >= 500 )
|
|
ksr.m_hasAuthorityInlink = 1;
|
|
// this is in request now as well as reply
|
|
//Tag *tag;
|
|
// hascontactinfo tag can have a value of 0 or 1
|
|
//tag = gr->getTag("hascontactinfo");
|
|
//if ( tag ) {
|
|
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
|
|
if ( m_hasContactInfo ) {
|
|
ksr.m_hasContactInfo = 1;
|
|
ksr.m_hasContactInfoValid = true;
|
|
}
|
|
|
|
// if we just set the contact info, use us, more recent
|
|
if ( linkSiteHashes[i]==m_siteHash32 && m_hasContactInfoValid){
|
|
ksr.m_hasContactInfo = m_hasContactInfo;
|
|
ksr.m_hasContactInfoValid = true;
|
|
}
|
|
|
|
if ( gr->getTag("ingoogle" ) ) {
|
|
ksr.m_inGoogle = 1;
|
|
ksr.m_inGoogleValid = true;
|
|
}
|
|
// the mere existence of these tags is good
|
|
if ( gr->getTag("authorityinlink"))ksr.m_hasAuthorityInlink =1;
|
|
ksr.m_hasAuthorityInlinkValid = true;
|
|
|
|
// if our url was a seed and redirected to another domain
|
|
// allow outlinks on that other domain to be on domain too.
|
|
// only used for diffbot crawlbot right now.
|
|
if ( domHash32 == redirDomHash32 && redirDomHash32 )
|
|
ksr.m_sameDom = 1;
|
|
if ( hostHash32 == redirHostHash32 && redirHostHash32 )
|
|
ksr.m_sameHost = 1;
|
|
// if ( linkSiteHashes[i]==redirSiteHash32 && redirSiteHash32)
|
|
// ksr.m_sameSite = 1;
|
|
|
|
// set parent based info
|
|
if ( domHash32 == m_domHash32 ) ksr.m_sameDom = 1;
|
|
if ( hostHash32 == m_hostHash32a ) ksr.m_sameHost = 1;
|
|
if ( linkSiteHashes[i]==m_siteHash32 ) ksr.m_sameSite = 1;
|
|
if ( *ipi ) ksr.m_wasParentIndexed = 1;
|
|
if ( isParentRSS ) ksr.m_parentIsRSS = 1;
|
|
if ( parentIsPermalink ) ksr.m_parentIsPermalink = 1;
|
|
if ( isParentPingServer ) ksr.m_parentIsPingServer= 1;
|
|
if ( parentIsSiteMap ) ksr.m_parentIsSiteMap = 1;
|
|
|
|
// this is used for building dmoz. we just want to index
|
|
// the urls in dmoz, not their outlinks.
|
|
if ( avoid ) ksr.m_avoidSpiderLinks = 1;
|
|
|
|
// this is used for building dmoz. we need to index this
|
|
// url even in the case of ETCPTIMEDOUT, etc.
|
|
if ( ignore ) ksr.m_ignoreExternalErrors = 1;
|
|
|
|
// . if this is the 2nd+ time we were spidered and this outlink
|
|
// wasn't there last time, then set this!
|
|
// . if this is the first time spidering this doc then set it
|
|
// to zero so that m_minPubDate is set to -1 when the outlink
|
|
// defined by "ksr" is spidered.
|
|
if ( m_oldDocValid && m_oldDoc ) {
|
|
int32_t oldSpideredTime = m_oldDoc->getSpideredTime();
|
|
ksr.m_parentPrevSpiderTime = oldSpideredTime;
|
|
}
|
|
else
|
|
ksr.m_parentPrevSpiderTime = 0;
|
|
|
|
//
|
|
// . inherit manual add bit if redirecting to simplified url
|
|
// . so we always spider seed url even if prohibited by
|
|
// the regex, and even if it redirects to a simplified url
|
|
//
|
|
if ( m_indexCodeValid &&
|
|
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
|
|
m_indexCode == EDOCNONCANONICAL ) &&
|
|
m_sreqValid ) {
|
|
if ( m_sreq.m_isInjecting )
|
|
ksr.m_isInjecting = 1;
|
|
if ( m_sreq.m_isAddUrl )
|
|
ksr.m_isAddUrl = 1;
|
|
}
|
|
|
|
// it is useful to know the primary langid of the parent
|
|
// when prioritizing links for spidering in the case of
|
|
// focussing the search engine on a particular set of langs
|
|
ksr.m_parentLangId = *langId;
|
|
|
|
// don't forget this one!
|
|
//ksr.m_spiderTime = nowGlobal;
|
|
|
|
// . is it "spam"? XmlDoc.cpp::isSpam()
|
|
// . we need to make that root quality into site root quality!
|
|
// . let's put spam detection logic into url filters
|
|
//if ( isSpam ( s,gr,m_spideredTime,true ) )
|
|
// // set the bit flag
|
|
// ksr.m_isSpam = 1;
|
|
// copy the url into SpiderRequest::m_url buffer
|
|
strcpy(ksr.m_url,s);
|
|
// this must be valid
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set the key, ksr.m_key. isDel = false
|
|
ksr.setKey ( firstIp, *d , false );
|
|
|
|
// we were hopcount 0, so if we link to ourselves we override
|
|
// our original hopcount of 0 with this guy that has a
|
|
// hopcount of 1. that sux... so don't do it.
|
|
if ( ksr.getUrlHash48() == myUh48 ) continue;
|
|
|
|
// if we've recently added this url to spiderdb in Spider.cpp, skip it
|
|
//if ( sc && sc->isInDupCache ( &ksr , false ) )
|
|
// continue;
|
|
|
|
// . technically speaking we do not have any reply so we
|
|
// should not be calling this! cuz we don't have all the info
|
|
// . see if banned or filtered, etc.
|
|
// . at least try to call it. getUrlFilterNum() should
|
|
// break out and return -1 if it encounters a filter rule
|
|
// that it does not have enough info to answer.
|
|
// so if your first X filters all map to a "FILTERED"
|
|
// priority and this url matches one of them we can
|
|
// confidently toss this guy out.
|
|
// . show this for debugging!
|
|
// int32_t ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
|
|
// false, m_niceness, cr,
|
|
// false,//true , // outlink?
|
|
// NULL ); // quotatable
|
|
// logf(LOG_DEBUG,"build: ufn=%"INT32" for %s",
|
|
// ufn,ksr.m_url);
|
|
|
|
// bad?
|
|
//if ( ufn < 0 ) {
|
|
// log("build: link %s had bad url filter."
|
|
// , ksr.m_url );
|
|
// g_errno = EBADENGINEER;
|
|
// return NULL;
|
|
//}
|
|
|
|
//int32_t priority = -1;
|
|
//if ( ufn >= 0 )
|
|
// priority = cr->m_spiderPriorities[ufn];
|
|
|
|
// debug
|
|
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {
|
|
// print the tag rec out into sb2
|
|
SafeBuf sb2;
|
|
if ( gr ) gr->printToBuf ( &sb2 );
|
|
// get it
|
|
//SafeBuf sb1;
|
|
char *action = "add";
|
|
if ( isScraping ) action = "scrape";
|
|
logf(LOG_DEBUG,
|
|
"spider: attempting to %s link. "
|
|
"%s "
|
|
"tags=%s "
|
|
"onpage=%s"
|
|
,
|
|
action ,
|
|
ksr.m_url,
|
|
//sb1.getBufStart(),
|
|
sb2.getBufStart(),
|
|
m_firstUrl.m_url);
|
|
}
|
|
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
|
|
// . mdw: oct 24, 2013. now i add so the urls show up in
|
|
// the pagecrawlbot.cpp spiderdb dump, so you can examine
|
|
// exactly why a url was crawled or not. plus if you change
|
|
// your mind about banning/filtering then it'd be nice to
|
|
// have these urls readily available.
|
|
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
|
// linksFiltered++; continue; }
|
|
//if ( priority == SPIDER_PRIORITY_BANNED ) {
|
|
// linksBanned++; continue; }
|
|
|
|
|
|
// serialize into the buffer
|
|
int32_t need = ksr.getRecSize();
|
|
// is that what we thought it would be?
|
|
//int32_t thought = links->m_linkLens[i] + 1 + hsize;
|
|
// sanity check
|
|
//if ( need + 12 + 4 > thought ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( p + 1 + need > m_pend ) { char *xx=NULL;*xx=0; }
|
|
// store the rdbId
|
|
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
|
|
else *p++ = RDB_SPIDERDB;
|
|
// print it for debug
|
|
if ( isTestColl ) {
|
|
SafeBuf tmp;
|
|
ksr.print(&tmp);
|
|
log("spider: attempting to add outlink "
|
|
"%s",tmp.getBufStart());
|
|
}
|
|
// store the spider rec
|
|
gbmemcpy ( p , &ksr , need );
|
|
// skip it
|
|
p += need;
|
|
// count it
|
|
numAdded++;
|
|
// check domain
|
|
//if ( domHash32 == m_domHash32 ) numAddedFromSameDomain++;
|
|
if ( ksr.m_sameDom ) numAddedFromSameDomain++;
|
|
}
|
|
|
|
//
|
|
// scan through requests and set m_isContacty
|
|
//
|
|
char *s = m_p;
|
|
int32_t k = 0;
|
|
for ( ; s < p ; k++ ) {
|
|
// advance over rdbid
|
|
s++;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// cast
|
|
SpiderRequest *ksr = (SpiderRequest *)s;
|
|
// set size
|
|
size = ksr->getRecSize();
|
|
// advance over that
|
|
s += size;
|
|
// stop if breach
|
|
if ( k >= 2000 ) break;
|
|
// must be isContacty
|
|
if ( ! linkTypes[k] ) continue;
|
|
// and not disqualified
|
|
if ( disqualify.isInTable(&linkTypes[k] )) continue;
|
|
// ok, we are good to go
|
|
ksr->m_isContacty = 1;
|
|
}
|
|
|
|
// . this is just how many urls we tried to index
|
|
// . move into Spider::addSpiderRequest()
|
|
//cr->m_localCrawlInfo.m_urlsHarvested += numAdded;
|
|
//cr->m_globalCrawlInfo.m_urlsHarvested += numAdded;
|
|
//cr->m_needsSave = true;
|
|
|
|
// save it
|
|
m_numOutlinksAdded = numAdded;
|
|
m_numOutlinksAddedValid = true;
|
|
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
|
|
m_numOutlinksFiltered = linksFiltered;
|
|
m_numOutlinksBanned = linksBanned;
|
|
// update end of list once we have successfully added all spider recs
|
|
m_p = p;
|
|
// return current ptr
|
|
return m_p ;
|
|
}
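// . the list built above is just a flat byte buffer: each entry is a
//   1-byte rdbId (RDB_SPIDERDB or RDB2_SPIDERDB2) followed by the
//   variable-length serialized SpiderRequest, which is exactly how the
//   isContacty fix-up pass walks it.
// . purely illustrative sketch (not called anywhere): count the spider
//   requests in such a [start,end) span, assuming it was filled by the
//   loop above.
/*
static int32_t countSpiderRequests ( char *start , char *end ) {
	int32_t count = 0;
	for ( char *s = start ; s < end ; ) {
		// skip the 1-byte rdbId
		s++;
		// cast to the record and advance over its serialized size
		SpiderRequest *ksr = (SpiderRequest *)s;
		s += ksr->getRecSize();
		count++;
	}
	return count;
}
*/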
|
|
|
|
|
|
/*
|
|
// add keys/recs from the table into the metalist
|
|
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
|
|
int32_t date1 ,
|
|
bool nosplit ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// docid is handy
|
|
int64_t d = *getDocId();
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// use secondary rdbs if repairing
|
|
//bool useRdb2 = ( g_repair.isRepairActive() &&
|
|
// ! g_repair.m_fullRebuild &&
|
|
// ! g_repair.m_removeBadPages );
|
|
char rdbId1 = RDB_INDEXDB;
|
|
char rdbId2 = RDB_DATEDB;
|
|
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
|
|
rdbId1 = RDB2_INDEXDB2;
|
|
rdbId2 = RDB2_DATEDB2;
|
|
}
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// get its key
|
|
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
|
|
// get the score
|
|
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
|
|
// sanity check
|
|
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// store rdbid
|
|
*m_p++ = (rdbId1 | f);
|
|
// store it. not a del key.
|
|
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,false);
|
|
// skip it
|
|
m_p += sizeof(key_t);
|
|
// add to datedb?
|
|
if ( date1 == -1 ) continue;
|
|
// yes
|
|
*m_p++ = (rdbId2 | f);
|
|
// store it. not a del key.
|
|
*(key128_t *)m_p=
|
|
g_datedb.makeKey(*termId1,date1,score1,d,false);
|
|
// advance over that
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
uint8_t rdbId ,
|
|
bool forDelete ) {
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
|
|
// store this rdbId into the list
|
|
char useRdbId = rdbId;
|
|
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) useRdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) useRdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) useRdbId = RDB2_DATEDB2;
|
|
if ( useRdb2 && rdbId == RDB_PLACEDB ) useRdbId = RDB2_PLACEDB2;
|
|
if ( useRdb2 && rdbId == RDB_SECTIONDB ) useRdbId = RDB2_SECTIONDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else if ( rdbId == RDB_SECTIONDB ) {
|
|
int32_t svs = sizeof(SectionVote);
|
|
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
int32_t count = 0;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key128_t *k = (key128_t *)tt1->getKey ( i );
|
|
// no key is allowed to have the del bit clear at this point
|
|
// because we reserve that for making negative keys!
|
|
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
|
|
// store rdbid
|
|
*m_p++ = useRdbId; // (useRdbId | f);
|
|
// store it
|
|
// *(key128_t *)m_p = *k; does this work?
|
|
gbmemcpy ( m_p , k , sizeof(key128_t) );
|
|
// all keys must be positive at this point
|
|
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
|
|
// or if getting for incremental indexing and this is
|
|
// from the "oldList"
|
|
//if ( forDelete ) *m_p = *m_p & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key128_t);
|
|
// count it
|
|
count++;
|
|
// do not add the data if deleting
|
|
if ( forDelete ) continue;
|
|
// skip if not sectiondb or placedb
|
|
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
|
|
// ok test it out (MDW)
|
|
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
|
|
//if ( count > 1 ) continue;
|
|
// get the data value
|
|
char *val = (char *)tt1->getValue ( k );
|
|
// get the size of the data to store. assume Sectiondb vote.
|
|
int32_t ds = sizeof(SectionVote);
|
|
// placedb is a special case. include the \0 terminator
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
// "ds" is how many bytes we store as data
|
|
ds = gbstrlen(val)+1;
|
|
// store dataSize first
|
|
*(int32_t *)m_p = ds;
|
|
// skip it
|
|
m_p += 4;
|
|
}
|
|
// store possible accompanying date of the rdb record
|
|
gbmemcpy (m_p,val, ds );
|
|
// skip it
|
|
m_p += ds;
|
|
}
|
|
//if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count);
|
|
//if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count);
|
|
return true;
|
|
}
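// . recap of the addTable128() entry layout: [1 byte rdbId][16 byte key],
//   then, only for sectiondb/placedb and only when not deleting, the
//   accompanying data (a fixed-size SectionVote for sectiondb, or a
//   4-byte dataSize plus a NUL-terminated string for placedb).
// . hedged sketch of the per-slot space requirement implied by that
//   layout; "val" stands for the slot's data pointer.
/*
static int32_t getEntrySize128 ( uint8_t rdbId , char *val , bool forDelete ) {
	int32_t need = 1 + sizeof(key128_t);
	if ( forDelete ) return need;
	if ( rdbId == RDB_SECTIONDB ) need += sizeof(SectionVote);
	if ( rdbId == RDB_PLACEDB   ) need += 4 + gbstrlen(val) + 1;
	return need;
}
*/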
|
|
|
|
int32_t XmlDoc::getSiteRank ( ) {
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
return ::getSiteRank ( m_siteNumInlinks );
|
|
}
|
|
|
|
// . add keys/recs from the table into the metalist
|
|
// . we store the keys into "m_p" unless "buf" is given
|
|
bool XmlDoc::addTable144 ( HashTableX *tt1 , int64_t docId , SafeBuf *buf ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key144_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// assume we are storing into m_p
|
|
char *p = m_p;
|
|
|
|
// reserve space if we had a safebuf and point into it if there
|
|
if ( buf ) {
|
|
int32_t slotSize = (sizeof(key144_t)+2+sizeof(key128_t));
|
|
int32_t need = tt1->getNumSlotsUsed() * slotSize;
|
|
if ( ! buf->reserve ( need ) ) return false;
|
|
// get cursor into buf, NOT START of buf
|
|
p = buf->getBufStart();
|
|
}
|
|
|
|
int32_t siteRank = getSiteRank ();
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
char rdbId = RDB_POSDB;
|
|
if ( m_useSecondaryRdbs ) rdbId = RDB2_POSDB2;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// get its key
|
|
char *kp = (char *)tt1->getKey ( i );
|
|
// store rdbid
|
|
*p++ = rdbId; // (rdbId | f);
|
|
// store it as is
|
|
gbmemcpy ( p , kp , sizeof(key144_t) );
|
|
// sanity check
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//h64 &= TERMID_MASK;
|
|
//if ( g_posdb.getTermId(kp) == h64 ) {
|
|
// log("hey: docid=%"INT64" float=%f",m_docId,
|
|
// g_posdb.getFloat(kp) );
|
|
//}
|
|
/*
|
|
// get the score
|
|
int32_t score = tt1->getScoreFromSlot ( i ) ;
|
|
// set the M-bits to the score. used to accumulate link texts
|
|
// that are the same so pages like google.com do not have
|
|
// the word 'google' like 1 million times. this should reduce
|
|
// our "score" logarithmacly into the 7-bits or whatever.
|
|
//
|
|
// NO! now we just always increment the distance cursor
|
|
// m_dist so there will never be a collision of any posdb
|
|
// key we add... so we think
|
|
if ( score ) {
|
|
int32_t newScore = score;
|
|
if ( score >= 65 ) newScore = 65 +(score/100);
|
|
//if ( score >= 65+3200) newScore = 65 +(score/100);
|
|
if ( newScore > MAXMULTIPLIER )
|
|
newScore = MAXMULTIPLIER;
|
|
g_posdb.setMultiplierBits(m_p,(unsigned char)newScore);
|
|
}
|
|
*/
|
|
// the docid bits were zero when we added these keys, so fix them now
|
|
g_posdb.setDocIdBits ( p , docId );
|
|
// if this is a numeric field we do not want to set
|
|
// the siterank or langid bits because it will mess up
|
|
// sorting by the float which is basically in the position
|
|
// of the word position bits.
|
|
if ( g_posdb.isAlignmentBitClear ( p ) ) {
|
|
// make sure it is set again. it was just cleared
|
|
// to indicate that this key contains a float
|
|
// like a price or something, and we should not
|
|
// set siterank or langid so that its termlist
|
|
// remains sorted just by that float
|
|
g_posdb.setAlignmentBit ( p , 1 );
|
|
}
|
|
// otherwise, set the siterank and langid
|
|
else {
|
|
// this too
|
|
g_posdb.setSiteRankBits ( p , siteRank );
|
|
// set language here too
|
|
g_posdb.setLangIdBits ( p , m_langId );
|
|
}
|
|
// advance over it
|
|
p += sizeof(key144_t);
|
|
}
|
|
|
|
// all done
|
|
if ( ! buf ) { m_p = p; return true; }
|
|
|
|
// update safebuf otherwise
|
|
char *start = buf->getBufStart();
|
|
// fix SafeBuf::m_length
|
|
buf->setLength ( p - start );
|
|
// sanity
|
|
if ( buf->length() > buf->getCapacity() ) { char *xx=NULL;*xx=0; }
|
|
|
|
return true;
|
|
}
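// . summary of the posdb key fix-ups above: the keys were hashed with a
//   zero docid, so the real docid is stamped in afterwards; keys whose
//   alignment bit was cleared carry a float (e.g. a gbsortby price) in
//   the word-position bits and must NOT get siterank/langid bits,
//   otherwise their termlist would no longer sort purely by that float.
// . illustrative sketch only, mirroring the loop body above.
/*
static void finalizePosdbKey ( char *key , int64_t docId ,
			       int32_t siteRank , uint8_t langId ) {
	g_posdb.setDocIdBits ( key , docId );
	if ( g_posdb.isAlignmentBitClear ( key ) ) {
		// numeric (float) key: just restore the alignment bit
		g_posdb.setAlignmentBit ( key , 1 );
		return;
	}
	g_posdb.setSiteRankBits ( key , siteRank );
	g_posdb.setLangIdBits   ( key , langId   );
}
*/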
|
|
|
|
// add keys/recs from the table into the metalist
|
|
bool XmlDoc::addTable224 ( HashTableX *tt1 ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key224_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 0 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
char rdbId = RDB_LINKDB;
|
|
if ( m_useSecondaryRdbs ) rdbId = RDB2_LINKDB2;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// get its key
|
|
char *kp = (char *)tt1->getKey ( i );
|
|
// store rdbid
|
|
*m_p++ = rdbId; // (rdbId | f);
|
|
// store it as is
|
|
gbmemcpy ( m_p , kp , sizeof(key224_t) );
|
|
// advance over it
|
|
m_p += sizeof(key224_t);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
// . add table into our metalist pointed to by m_p
|
|
// . k.n1 = date (see hashWords() below)
|
|
// . k.n0 = termId (see hashWords() below)
|
|
// . and the value is the score, 32-bits
|
|
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
uint64_t docId ,
|
|
uint8_t rdbId ,
|
|
bool nosplit ) {
|
|
|
|
if ( tt1->m_numSlotsUsed == 0 ) return true;
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity checks
|
|
if ( nosplit ) {
|
|
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key96_t *k = (key96_t *)tt1->getKey ( i );
|
|
// get its value
|
|
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
|
|
// convert to 8 bits
|
|
v = score32to8 ( v );
|
|
// . make the meta list key for datedb
|
|
// . a datedb key (see Datedb.h)
|
|
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
|
|
k->n1 , // date
|
|
v , // score (8 bits)
|
|
docId ,
|
|
false );// del key?
|
|
// store rdbid with optional "nosplit" flag
|
|
*m_p++ = (rdbId | f);
|
|
// store it. it is a del key.
|
|
*(key128_t *)m_p = mk;
|
|
// skip it
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// add keys/recs from the table into the metalist
|
|
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
|
|
HashTableX *tt2 ,
|
|
int32_t date1 ,
|
|
int32_t date2 ,
|
|
bool del ,
|
|
bool nosplit ) {
|
|
|
|
// sanity check
|
|
if ( tt1->m_numSlots ) {
|
|
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
if ( tt2->m_numSlots ) {
|
|
if ( tt2->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
|
|
if ( tt2->m_ds != 4 ) {char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// docid is handy
|
|
int64_t d = *getDocId();
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// use secondary rdbs if repairing
|
|
//bool useRdb2 = ( g_repair.isRepairActive() &&
|
|
// ! g_repair.m_fullRebuild &&
|
|
// ! g_repair.m_removeBadPages );
|
|
char rdbId1 = RDB_INDEXDB;
|
|
char rdbId2 = RDB_DATEDB;
|
|
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
|
|
rdbId1 = RDB2_INDEXDB2;
|
|
rdbId2 = RDB2_DATEDB2;
|
|
}
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
|
|
// get the score
|
|
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
|
|
// sanity check
|
|
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// see if in "tt2"
|
|
int32_t slot = tt2->getSlot ( termId1 );
|
|
// assume 0
|
|
uint8_t score2 = 0;
|
|
// look it up in the positive key table
|
|
if ( slot >= 0 ) {
|
|
score2 = score32to8 ( tt2->getScoreFromSlot(slot) );
|
|
// sanity check
|
|
if ( score2 <= 0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// we annihilate!
|
|
if ( score1 != score2 ) {
|
|
// store rdbid
|
|
*m_p++ = (rdbId1 | f);
|
|
// store it. it is a del key.
|
|
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,del);
|
|
// skip it
|
|
m_p += sizeof(key_t);
|
|
}
|
|
// add to datedb?
|
|
if ( date1 == -1 ) continue;
|
|
// same dates too?
|
|
if ( date1 == date2 && score1 == score2 ) continue;
|
|
// yes
|
|
*m_p++ = (rdbId2 | f);
|
|
// store it. it is a del key.
|
|
*(key128_t *)m_p=g_datedb.makeKey(*termId1,date1,score1,d,del);
|
|
// advance over that
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . add table into our metalist pointed to by m_p
|
|
// . k.n1 = date (see hashWords() below)
|
|
// . k.n0 = termId (see hashWords() below)
|
|
// . and the value is the score, 32-bits
|
|
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
HashTableX *tt2 , // <key128_t,char> *tt2
|
|
uint64_t docId ,
|
|
uint8_t rdbId ,
|
|
bool del ,
|
|
bool nosplit ) {
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity checks
|
|
if ( nosplit ) {
|
|
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key96_t *k = (key96_t *)tt1->getKey ( i );
|
|
// get its value
|
|
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
|
|
// convert to 8 bits
|
|
v = score32to8 ( v );
|
|
// see if in "tt2"
|
|
int32_t slot = tt2->getSlot ( k );
|
|
// get value if there
|
|
if ( slot >= 0 ) {
|
|
// get it
|
|
uint32_t val =*(uint32_t *)tt2->getValueFromSlot(slot);
|
|
// convert to 8 bits
|
|
val = score32to8 ( val );
|
|
// compare, if same, skip it!
|
|
if ( val == v ) continue;
|
|
}
|
|
// . make the meta list key for datedb
|
|
// . a datedb key (see Datedb.h)
|
|
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
|
|
k->n1 , // date
|
|
v , // score (8 bits)
|
|
docId ,
|
|
del );// del key?
|
|
// store rdbid with optional "nosplit" flag
|
|
*m_p++ = (rdbId | f);
|
|
// store it. it is a del key.
|
|
*(key128_t *)m_p = mk;
|
|
// skip it
|
|
m_p += sizeof(key128_t);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
|
|
HashTableX *tt2 , // <key128_t,char> *tt2
|
|
uint8_t rdbId ,
|
|
bool del ,
|
|
bool nosplit ) {
|
|
|
|
uint8_t f = 0;
|
|
if ( nosplit ) f = 0x80;
|
|
|
|
// sanity check
|
|
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity checks
|
|
if ( nosplit ) {
|
|
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
|
|
if ( rdbId == RDB_DATEDB ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
|
|
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
|
|
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
|
|
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
|
|
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
|
|
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
|
|
|
|
// sanity checks
|
|
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else if ( rdbId == RDB_SECTIONDB ) {
|
|
int32_t svs = sizeof(SectionVote);
|
|
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != svs ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( tt2->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
int32_t count = 0;
|
|
|
|
// store terms from "tt1" table
|
|
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( tt1->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get its key
|
|
key128_t *k = (key128_t *)tt1->getKey ( i );
|
|
// no key is allowed to have the del bit clear at this point
|
|
// because we reserve that for making negative keys!
|
|
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
|
|
// see if in "tt2"
|
|
int32_t slot = tt2->getSlot ( k );
|
|
// . skip if already indexed
|
|
// . do not do incremental indexing for sectiondb/placedb since
|
|
// it may have the same key but different data!!!!!!!
|
|
if ( slot >= 0 &&
|
|
rdbId != RDB_SECTIONDB &&
|
|
rdbId != RDB_PLACEDB )
|
|
continue;
|
|
// store rdbid with optional "nosplit" flag
|
|
*m_p++ = (rdbId | f);
|
|
// store it
|
|
// *(key128_t *)m_p = *k; does this work?
|
|
gbmemcpy ( m_p , k , sizeof(key128_t) );
|
|
// all keys must be positive at this point
|
|
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
|
|
// clear the del bit if we are an unmatched key and "del"
|
|
// is true. we need to be a negative key now
|
|
if ( del ) m_p[0] = m_p[0] & 0xfe;
|
|
// skip key
|
|
m_p += sizeof(key128_t);
|
|
// count it
|
|
count++;
|
|
// skip if not sectiondb or placedb
|
|
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
|
|
// ok test it out (MDW)
|
|
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
|
|
//if ( count > 1 ) continue;
|
|
// if we were a negative key, do not add a value, even for
|
|
// sectiondb
|
|
if ( del ) continue;
|
|
// get the data value
|
|
char *val = (char *)tt1->getValue ( k );
|
|
// get the size of the data to store. assume Sectiondb vote.
|
|
int32_t ds = sizeof(SectionVote);
|
|
// placedb is special even. include the \0 terminator
|
|
if ( rdbId == RDB_PLACEDB ) {
|
|
// "ds" is how many bytes we store as data
|
|
ds = gbstrlen(val)+1;
|
|
// store dataSize first
|
|
*(int32_t *)m_p = ds;
|
|
// skip it
|
|
m_p += 4;
|
|
}
|
|
// store possible accompanying date of the rdb record
|
|
gbmemcpy (m_p,val, ds );
|
|
// skip it
|
|
m_p += ds;
|
|
}
|
|
//if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count);
|
|
//if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count);
|
|
return true;
|
|
}
|
|
*/
//
|
|
// . hash terms that are sharded by TERMID not DOCID!!
|
|
//
|
|
// . returns false and sets g_errno on error
|
|
// . these terms are stored in indexdb/datedb, but all terms with the same
|
|
// termId reside in one and only one group. whereas normally the records
|
|
// are split based on docid and every group gets 1/nth of the termlist.
|
|
// . we do this "no splitting" so that only one disk seek is required, and
|
|
// we know the termlist is small, or the termlist is being used for spidering
|
|
// or parsing purposes and is usually not sent across the network.
|
|
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
|
|
|
//if ( m_pbuf )
|
|
// m_pbuf->safePrintf("<h3>Terms which are immune to indexdb "
|
|
// "splitting:</h3>");
|
|
|
|
//if ( m_skipIndexing ) return true;
|
|
|
|
// this should be ready to go and not block!
|
|
int64_t *pch64 = getExactContentHash64();
|
|
//int64_t *pch64 = getLooseContentHash64();
|
|
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// shortcut
|
|
Url *fu = getFirstUrl();
|
|
|
|
if ( ! hashVectors ( tt ) ) return false;
|
|
|
|
// constructor should set to defaults automatically
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
// usually we shard by docid, but these are terms we shard by termid!
|
|
hi.m_shardByTermId = true;
|
|
|
|
|
|
// for exact content deduping
|
|
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
|
|
char cbuf[64];
|
|
int32_t clen = sprintf(cbuf,"%"UINT64"",*pch64);
|
|
hi.m_prefix = "gbcontenthash";
|
|
if ( ! hashString ( cbuf,clen,&hi ) ) return false;
|
|
|
|
////
|
|
//
|
|
// let's stop here for now, until other stuff is actually used again
|
|
//
|
|
////
|
|
|
|
// let's bring back image thumbnail support for the widget project
|
|
//return true;
|
|
|
|
|
|
|
|
char *host = fu->getHost ();
|
|
//int32_t hlen = fu->getHostLen ();
|
|
|
|
/*
|
|
setStatus ( "hashing no-split qdom keys" );
|
|
|
|
char *dom = fu->getDomain ();
|
|
int32_t dlen = fu->getDomainLen();
|
|
|
|
// desc is NULL, prefix will be used as desc
|
|
hi.m_prefix = "qdom";
|
|
if ( ! hashString ( dom,dlen,&hi ) ) return false;
|
|
|
|
|
|
setStatus ( "hashing no-split qhost keys" );
|
|
|
|
// desc is NULL, prefix will be used as desc
|
|
hi.m_prefix = "qhost";
|
|
if ( ! hashString ( host,hlen,&hi ) ) return false;
|
|
*/
|
|
|
|
|
|
// now hash the site
|
|
|
|
|
|
setStatus ( "hashing no-split SiteGetter terms");
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// these are now no-split terms
|
|
//
|
|
char *s = fu->getUrl ();
|
|
int32_t slen = fu->getUrlLen();
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->m_plen <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
|
|
//Dates *dp = getDates ();
|
|
// hash the clocks into indexdb
|
|
//if ( ! dp->hash ( m_docId , tt , this ) ) return false;
|
|
|
|
// . hash special site/hopcount thing for permalinks
|
|
// . used by Images.cpp for doing thumbnails
|
|
// . this returns false and sets g_errno on error
|
|
// . let's try thumbnails for all...
|
|
//if ( ! *getIsPermalink() ) return true;
|
|
|
|
setStatus ( "hashing no-split gbsitetemplate keys" );
|
|
|
|
// must be valid
|
|
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
|
|
char buf[MAX_URL_LEN+20];
|
|
//uint32_t th = m_tagVector.getVectorHash();
|
|
uint32_t tph = *getTagPairHash32();
|
|
// . skip this so we can do site:xyz.com queries
|
|
// . but if this is https:// then you will have to
|
|
// specify that...
|
|
char *site = getSite();
|
|
// sanity check, must NOT start with http://
|
|
if ( ! strncmp ( site , "http://", 7 ) ) { char *xx=NULL;*xx=0;}
|
|
// this must match what we search in Images.cpp::getThumbnail()
|
|
int32_t blen = sprintf(buf,"%"UINT32"%s",tph,site);
|
|
|
|
// use the prefix as the description if description is NULL
|
|
hi.m_prefix = "gbsitetemplate";
|
|
//if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
if ( ! hashSingleTerm ( buf,blen,&hi ) ) return false;
|
|
|
|
|
|
setStatus ( "hashing no-split gbimage keys" );
|
|
|
|
hi.m_prefix = "gbimage";
|
|
// hash gbimage: for permalinks only for Images.cpp
|
|
for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) {
|
|
// get the node number
|
|
//int32_t nn = m_images.m_imageNodes[i];
|
|
// get the url of the image
|
|
//XmlNode *xn = m_xml.getNodePtr(nn);
|
|
int32_t srcLen;
|
|
char *src = m_images.getImageUrl(i,&srcLen);
|
|
// set it to the full url
|
|
Url iu;
|
|
// use "pageUrl" as the baseUrl
|
|
Url *cu = getCurrentUrl();
|
|
// we can addwww to normalize since this is for deduping kinda
|
|
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
|
|
char *u = iu.getUrl ();
|
|
int32_t ulen = iu.getUrlLen();
|
|
// hash each one
|
|
//if ( ! hashString ( u,ulen,&hi ) ) return false;
|
|
// hash a single entity
|
|
if ( ! hashSingleTerm ( u,ulen,&hi) ) return false;
|
|
//log("test: %s",u);
|
|
}
|
|
|
|
return true;
|
|
}
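// . the no-split pattern above boils down to: set hi.m_shardByTermId so
//   the whole termlist lives on a single shard (one disk seek), then hash
//   either an exact term (hashSingleTerm) or a tokenized string
//   (hashString) under a prefix like "gbcontenthash" or "gbimage".
// . hedged sketch of hashing one more such term; "gbexample" is a
//   made-up prefix used purely for illustration.
/*
bool XmlDoc::hashExampleNoSplitTerm ( HashTableX *tt , char *s , int32_t slen ) {
	HashInfo hi;
	hi.m_hashGroup     = HASHGROUP_INTAG;
	hi.m_tt            = tt;
	hi.m_shardByTermId = true;        // shard by TERMID, not docid
	hi.m_prefix        = "gbexample"; // hypothetical prefix
	return hashSingleTerm ( s , slen , &hi );
}
*/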
|
|
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . "sr" is the tagdb Record
|
|
// . "ws" store the terms for PageParser.cpp display
|
|
char *XmlDoc::hashAll ( HashTableX *table ) {
|
|
|
|
setStatus ( "hashing document" );
|
|
|
|
if ( m_allHashed ) return (char *)1;
|
|
|
|
// sanity checks
|
|
if ( table->m_ks != 18 ) { char *xx=NULL;*xx=0; }
|
|
if ( table->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( m_wts && m_wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
// ptr to term = 4 + score = 4 + ptr to sec = 4
|
|
if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){char *xx=NULL;*xx=0;}
|
|
|
|
unsigned char *hc = (unsigned char *)getHopCount();
|
|
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
|
|
|
|
// need this for hashing
|
|
HashTableX *cnt = getCountTable();
|
|
if ( ! cnt ) return (char *)cnt;
|
|
if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// and this
|
|
//Weights *we = getWeights();
|
|
//if ( ! we || we == (void *)-1 ) return (char *)we;
|
|
// and this
|
|
Links *links = getLinks();
|
|
if ( ! links ) return (char *)links;
|
|
if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// and now this
|
|
//Synonyms *syn = getSynonyms();
|
|
//if ( ! syn || syn == (void *)-1 ) return (char *)syn;
|
|
|
|
char *wordSpamVec = getWordSpamVec();
|
|
if (!wordSpamVec) return (char *)wordSpamVec;
|
|
if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;}
|
|
|
|
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
|
|
if ( ! fragVec ) return (char *)fragVec;
|
|
if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// why do we need this?
|
|
if ( m_wts ) {
|
|
uint8_t *lv = getLangVector();
|
|
if ( ! lv ) return (char *)lv;
|
|
if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
TagRec *gr = getTagRec();
|
|
if ( ! gr ) return (char *)gr;
|
|
if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; }
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// just keep it somewhat sane...
|
|
//if ( nw > 30000 ) nw = 30000;
|
|
|
|
// then each singleton has one phrase, and 1 empty for good hashing
|
|
//if ( ! table->setTableSize ( nw * 4 ) )
|
|
// return log("build: Could not allocate %"INT32" bytes for table "
|
|
// "for indexing document.",
|
|
// (nw*4)*(8+sizeof(int32_t)));
|
|
|
|
/*
|
|
const char *help =
|
|
"<table><td bgcolor=lightgreen>\n"
|
|
"Each document has several associated pieces. Each piece "
|
|
"is indexed individually. The pieces are listed below and "
|
|
"are preceeded with a table dictating the parameters with "
|
|
"which the piece was indexed."
|
|
|
|
"<br><br>"
|
|
|
|
"Below that table the actual text of the piece is displayed. "
|
|
"Each alphanumeric word in the text has two subscripts of the "
|
|
"form <i>X/Y</i> where X and Y are percentage weights on the "
|
|
"score of that particular alphanumeric word. X is the weight "
|
|
"on the word itself and Y is the weight on the phrase which "
|
|
"is started by that word. A weight of 100% "
|
|
"indicates a weight which does not affect the score."
|
|
|
|
"<br><br>"
|
|
|
|
"Words that are struck out and in a box with a red background "
|
|
"instead of light blue are considered to be spam, meaning "
|
|
"they are repeated in a pattern. They "
|
|
"contain a number in that box which indicates the probability "
|
|
"they are spam and 100 minus that probability is weighted "
|
|
"with their score to get a new, spam-adjusted score. "
|
|
"<br>\n"
|
|
"</tr>\n"
|
|
"</table>\n"
|
|
"</td></table>\n"
|
|
"<br><br>\n";
|
|
|
|
if ( m_pbuf ) m_pbuf->safePrintf("%s",help);
|
|
*/
|
|
|
|
/*
|
|
int32_t inlinks = *getSiteNumInlinks();
|
|
int32_t boost1 = getBoostFromSiteNumInlinks ( inlinks );
|
|
|
|
// . now we hard code "boost2"
|
|
// . based on # of alnum words
|
|
// . this makes us look at keyword density, not just the
|
|
// plain keyword count
|
|
int32_t naw = m_words.getNumAlnumWords();
|
|
// . keep at 100% for up to 200 words then reduce linearly
|
|
// . only do this for newer title recs to avoid undeletable data
|
|
// . if we have a huge document, it can still contain a very
|
|
// relevant paragraph that is dense in the query terms, so
|
|
// we really only want to punish enough so the post query
|
|
// reranking has some good candidates for doing proximity
|
|
// scoring.
|
|
// . back off by .90 every 1000 words
|
|
float nn = naw;
|
|
float bb = 100.0;
|
|
while ( nn > 1000 ) {
|
|
nn *= .9;
|
|
bb *= .9;
|
|
}
|
|
// never drop below %1
|
|
if ( bb < 1.0 ) bb = 1.0;
|
|
// set it
|
|
int64_t boost2 = (int64_t)bb;
|
|
*/
|
|
|
|
/*
|
|
int32_t siteNumInlinks = *getSiteNumInlinks();
|
|
|
|
if ( m_pbuf )
|
|
m_pbuf->safePrintf(
|
|
|
|
"<table border=1 cellpadding=2>"
|
|
|
|
"<tr><td>siteNumInlinks</td><td><b>%"INT32"%%</b></td></tr>"
|
|
|
|
"<tr><td>siteNumInlinksBoost</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>numAlnumWords</td>"
|
|
"<td>%"INT32"</td></tr> "
|
|
|
|
"<tr><td>scoreWeightFromNumAlnumWords"
|
|
"</td><td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>headerWeight</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>urlPathWeight</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>externalLinkTextWeight</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>internalLinkTextWeight</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>conceptWeight</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"<tr><td>titleWeight</td>"
|
|
"<td>%"INT32"%%</td></tr>"
|
|
|
|
"</table>"
|
|
"<br>"
|
|
,
|
|
(int32_t)siteNumInlinks,
|
|
(int32_t)boost1,
|
|
//(int32_t)len,
|
|
(int32_t)naw,
|
|
(int32_t)boost2,
|
|
(int32_t)boost1,
|
|
(int32_t)boost2,
|
|
//(int32_t)boost1,
|
|
(int32_t)m_headerWeight,
|
|
(int32_t)m_urlPathWeight,
|
|
(int32_t)m_externalLinkTextWeight,
|
|
(int32_t)m_internalLinkTextWeight,
|
|
(int32_t)m_conceptWeight,
|
|
(int32_t)m_titleWeight,
|
|
(int32_t)m_titleWeight,
|
|
(int32_t)boost1,
|
|
(int32_t)boost1,
|
|
);
|
|
*/
|
|
|
|
// do not repeat this if the cachedb storage call blocks
|
|
m_allHashed = true;
|
|
|
|
// reset distance cursor
|
|
m_dist = 0;
|
|
|
|
// hash diffbot's json output here
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return NULL;
|
|
/*
|
|
if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) {
|
|
// hash the content type for type:json query
|
|
if ( ! hashContentType ( table ) ) return NULL;
|
|
// and the url: query support
|
|
if ( ! hashUrl ( table ) ) return NULL;
|
|
// language support
|
|
if ( ! hashLanguage ( table ) ) return NULL;
|
|
// country?
|
|
if ( ! hashCountry ( table ) ) return NULL;
|
|
if ( ! hashTagRec ( table ) ) return NULL;
|
|
// hash for gbsortby:gbspiderdate
|
|
if ( ! hashDateNumbers ( table ) ) return NULL;
|
|
// has gbhasthumbnail:1 or 0
|
|
if ( ! hashImageStuff ( table ) ) return NULL;
|
|
// and the json itself
|
|
return hashJSON ( table );
|
|
}
|
|
*/
|
|
|
|
if ( ! hashContentType ( table ) ) return NULL;
|
|
if ( ! hashUrl ( table ) ) return NULL;
|
|
if ( ! hashLanguage ( table ) ) return NULL;
|
|
if ( ! hashCountry ( table ) ) return NULL;
|
|
if ( ! hashSiteNumInlinks( table ) ) return NULL;
|
|
if ( ! hashTagRec ( table ) ) return NULL;
|
|
if ( ! hashAds ( table ) ) return NULL;
|
|
if ( ! hashSubmitUrls ( table ) ) return NULL;
|
|
if ( ! hashIsAdult ( table ) ) return NULL;
|
|
|
|
// has gbhasthumbnail:1 or 0
|
|
if ( ! hashImageStuff ( table ) ) return NULL;
|
|
|
|
// . hash sectionhash:xxxx terms
|
|
// . diffbot still needs to hash this for voting info
|
|
if ( ! hashSections ( table ) ) return NULL;
|
|
|
|
// now hash the terms sharded by termid and not docid here since they
|
|
// just set a special bit in posdb key so Rebalance.cpp can work.
|
|
// this will hash the content checksum which we need for deduping
|
|
// which we use for diffbot custom crawls as well.
|
|
if ( ! hashNoSplit ( table ) ) return NULL;
|
|
|
|
|
|
// MDW: i think we just inject empty html with a diffbotreply into
|
|
// global index now, so don't need this... 9/28/2014
|
|
|
|
// stop indexing xml docs
|
|
bool indexDoc = true;
|
|
if ( cr->m_isCustomCrawl ) indexDoc = false;
|
|
if ( ! cr->m_indexBody ) indexDoc = false;
|
|
// if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject )
|
|
// indexDoc = true;
|
|
// always index diffbot json objects for GI (custom crawl is false)
|
|
if ( m_isDiffbotJSONObject )
|
|
indexDoc = true;
|
|
|
|
// global index unless this is a json object in which case it is
|
|
// hashed above in the call to hashJSON(). this will decrease disk
|
|
// usage by about half, posdb* files are pretty big.
|
|
if ( ! indexDoc ) return (char *)1;
|
|
|
|
// hash json fields
|
|
if ( *ct == CT_JSON ) {
|
|
// this hashes both with and without the fieldname
|
|
hashJSONFields ( table );
|
|
goto skip;
|
|
}
|
|
|
|
// same for xml now, so we can search for field:value like w/ json
|
|
if ( *ct == CT_XML ) {
|
|
// this hashes both with and without the fieldname
|
|
hashXMLFields ( table );
|
|
goto skip;
|
|
}
|
|
|
|
// hash the body of the doc first so m_dist is 0 to match
|
|
// the rainbow display of sections
|
|
if ( ! hashBody2 (table ) ) return NULL;
|
|
|
|
// hash the title now too so neighborhood singles have more
|
|
// to match. plus, we only hash these title terms iff they
|
|
// are not already in the hash table, so as to avoid hashing
|
|
// repeated title terms because we do not do spam detection
|
|
// on them. thus, we need to hash these first before anything
|
|
// else. give them triple the body score
|
|
if ( ! hashTitle ( table )) return NULL;
|
|
|
|
// . hash the keywords tag, limited to first 2k of them so far
|
|
// . hash above the neighborhoods so the neighborhoods only index
|
|
// what is already in the hash table
|
|
if ( ! hashMetaKeywords(table ) ) return NULL;
|
|
|
|
// then hash the incoming link text, NO ANOMALIES, because
|
|
// we index the single words in the neighborhoods next, and
|
|
// we had songfacts.com coming up for the 'street light facts'
|
|
// query because it had a bunch of anomalous inlink text.
|
|
if ( ! hashIncomingLinkText(table,false,true)) return NULL;
|
|
|
|
// then the meta summary and description tags with half the score of
|
|
// the body, and only hash a term if was not already hashed above
|
|
// somewhere.
|
|
if ( ! hashMetaSummary(table) ) return NULL;
|
|
|
|
skip:
|
|
|
|
// this will only increment the scores of terms already in the table
|
|
// because the neighborhood terms are not technically in the document
|
|
// necessarily and we do not want to ruin our precision
|
|
if ( ! hashNeighborhoods ( table ) ) return NULL;
|
|
|
|
|
|
if ( ! hashLinks ( table ) ) return NULL;
|
|
if ( ! hashDateNumbers ( table ) ) return NULL;
|
|
if ( ! hashMetaTags ( table ) ) return NULL;
|
|
if ( ! hashMetaZip ( table ) ) return NULL;
|
|
if ( ! hashDMOZCategories( table ) ) return NULL;
|
|
if ( ! hashCharset ( table ) ) return NULL;
|
|
if ( ! hashRSSInfo ( table ) ) return NULL;
|
|
if ( ! hashPermalink ( table ) ) return NULL;
|
|
|
|
// hash gblang:de last for parsing consistency
|
|
if ( ! hashLanguageString ( table ) ) return NULL;
|
|
|
|
// we set this now in hashWords3()
|
|
if ( m_doingSEO )
|
|
m_wordPosInfoBufValid = true;
|
|
|
|
// store the m_wordPosInfoBuf into cachedb
|
|
// NO! we are not allowed to block in here it messes shit up!!!
|
|
//if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
|
|
// return (char *)-1;
|
|
|
|
// . hash gbkeyword:gbmininlinks where the score is the inlink count
|
|
// . the inlink count can go from 1 to 255
|
|
// . an ip neighborhood can vote no more than once
|
|
// . this is in LinkInfo::hash
|
|
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
|
|
|
|
if ( ! hashMetaData ( table ) ) return NULL;
|
|
|
|
// return true if we don't need to print parser info
|
|
//if ( ! m_pbuf ) return true;
|
|
// print out the table into g_bufPtr now if we need to
|
|
//table->print ( );
|
|
return (char *)1;
|
|
}
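// . hashAll() expects a table keyed by 18-byte posdb keys with 4-byte
//   scores, and the call order above matters: the body is hashed first
//   (so m_dist starts at 0 to match the section rainbow), then title,
//   meta keywords, inlink text and meta summary, and only then the
//   neighborhoods, which merely boost terms already present.
// . hedged usage sketch; the slot-count estimate is illustrative only.
/*
HashTableX tt;
int32_t numSlots = m_words.getNumWords() * 4 + 5000; // rough guess
if ( tt.set ( 18 , 4 , numSlots , NULL , 0 , false , m_niceness , "posdb-x" ) ) {
	char *status = hashAll ( &tt );
	if ( ! status ) log("build: hashAll failed: %s",mstrerror(g_errno));
}
*/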
|
|
|
|
// . "inlinks" is # of inlinks to the SITE
|
|
// . returns a percentage boost
|
|
int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) {
|
|
// . base on # of site inlinks
|
|
// . just hard code this for now
|
|
int32_t boost1 = 100;
|
|
if ( inlinks >= 10 ) boost1 = 150;
|
|
if ( inlinks >= 50 ) boost1 = 200;
|
|
if ( inlinks >= 100 ) boost1 = 250;
|
|
if ( inlinks >= 200 ) boost1 = 300;
|
|
if ( inlinks >= 400 ) boost1 = 350;
|
|
if ( inlinks >= 800 ) boost1 = 400;
|
|
if ( inlinks >= 1600 ) boost1 = 450;
|
|
if ( inlinks >= 3200 ) boost1 = 500;
|
|
if ( inlinks >= 6400 ) boost1 = 550;
|
|
if ( inlinks >= 12800 ) boost1 = 600;
|
|
if ( inlinks >= 25600 ) boost1 = 650;
|
|
if ( inlinks >= 51200 ) boost1 = 700;
|
|
return boost1;
|
|
}
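// . worked example of the ladder above: 250 site inlinks clears the
//   10/50/100/200 thresholds but not 400, so the boost ends up at 300%;
//   fewer than 10 inlinks keeps the default 100%, and 51200+ caps out
//   at 700%.
// . e.g. getBoostFromSiteNumInlinks(250) returns 300, (5) returns 100,
//   (60000) returns 700.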
|
|
|
|
bool XmlDoc::appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) {
|
|
|
|
// set4() called from the inject sets these two things for meta data
|
|
// which is basically json that augments the doc, tags it with stuff
|
|
if ( ! m_hasMetadata ) return true;
|
|
if ( ! ptr_metadata ) return true;
|
|
|
|
XmlDoc **pod = getOldXmlDoc ( );
|
|
if ( ! pod ) { char *xx=NULL;*xx=0; }
|
|
if ( pod == (XmlDoc **)-1 ) { char *xx=NULL;*xx=0; }
|
|
// this is non-NULL if it existed
|
|
XmlDoc *od = *pod;
|
|
|
|
// wtf?
|
|
if ( ! od ) return true;
|
|
|
|
|
|
// dedup. if already in there, do not re-add it
|
|
if ( strstr ( od->ptr_metadata , ptr_metadata ) )
|
|
return true;
|
|
|
|
SafeBuf md;
|
|
|
|
// copy over and append
|
|
if ( ! md.safeMemcpy ( od->ptr_metadata , od->size_metadata ) )
|
|
return false;
|
|
// remove trailing \0 if there
|
|
md.removeLastChar ( '\0' );
|
|
// separate from the new stuff
|
|
if ( ! md.safePrintf(",\n") )
|
|
return false;
|
|
|
|
if ( ! md.safeMemcpy ( ptr_metadata , size_metadata ) )
|
|
return false;
|
|
|
|
if ( ! md.nullTerm ( ) )
|
|
return false;
|
|
// update his meta data
|
|
od->ptr_metadata = md.getBufStart();
|
|
od->size_metadata = md.length();
|
|
|
|
int32_t nw = od->size_metadata * 4;
|
|
|
|
HashTableX tt1;
|
|
int32_t need4 = nw * 4 + 5000;
|
|
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,"posdb-i2"))
|
|
return false;
|
|
|
|
od->hashMetaData ( &tt1 );
|
|
|
|
// store the posdb keys from tt1 into our safebuf, tmp
|
|
SafeBuf sb;
|
|
if ( m_usePosdb && ! addTable144 ( &tt1 , od->m_docId , &sb ) )
|
|
return false;
|
|
|
|
// this could use time axis so that is taken into account
|
|
int64_t uh48 = getFirstUrlHash48();
|
|
|
|
// and re-formulate (and compress) his new title rec
|
|
SafeBuf trec;
|
|
if ( ! od->setTitleRecBuf ( &trec , od->m_docId , uh48 ) )
|
|
return false;
|
|
|
|
// force the title rec key to be the same
|
|
// if ( od->m_titleRecKeyValid && trec.getLength() >= sizeof(key_t) ) {
|
|
// char *p = trec.getBufStart();
|
|
// *(key_t *)p = od->m_titleRecKey;
|
|
// }
|
|
// else {
|
|
// log("build: old titlerec invalid docid=%"INT64,od->m_docId);
|
|
// }
|
|
|
|
// store the posdb keys in the meta list
|
|
if ( m_usePosdb && ! metaList->safeMemcpy ( &sb ) )
|
|
return false;
|
|
|
|
// store the updated titlerec into the meta list
|
|
if ( m_useTitledb && ! metaList->pushChar(RDB_TITLEDB) )
|
|
return false;
|
|
if ( m_useTitledb && ! metaList->safeMemcpy(&trec) )
|
|
return false;
|
|
|
|
m_updatedMetaData = true;
|
|
|
|
return true;
|
|
}
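// . net effect of appendNewMetaInfo(): the old doc's ptr_metadata gets the
//   new JSON appended (comma-separated, deduped by substring match), the
//   merged metadata is hashed into posdb keys via hashMetaData() +
//   addTable144(), and a freshly serialized title rec follows so titledb
//   reflects the merged metadata.
// . rough sketch of what gets appended to "metaList" (layout only):
/*
//   [RDB_POSDB][18-byte posdb key]   ... one per metadata term ...
//   [RDB_TITLEDB][serialized title rec from setTitleRecBuf()]
*/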
|
|
|
|
// . this is kinda hacky because it uses a short XmlDoc on the stack
|
|
// . no need to hash this stuff for regular documents since all the terms
|
|
// are fielded by gberrorstr, gberrornum or gbisreply.
|
|
// . normally we might use a separate xmldoc class for this but i wanted
|
|
// something more lightweight
|
|
SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
|
|
bool forDelete ) {
|
|
|
|
// set status for this
|
|
setStatus ( "getting spider reply meta list");
|
|
|
|
if ( m_spiderStatusDocMetaListValid )
|
|
return &m_spiderStatusDocMetaList;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
if ( ! cr->m_indexSpiderReplies || forDelete ) {
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// if docid based do not hash a spider reply. docid-based spider
|
|
// requests are added to spiderdb from the query reindex tool.
|
|
// do not do for diffbot subdocuments either, usespiderdb should be
|
|
// false for those.
|
|
// MDW: i disagree, i want to see when these get updated! 9/6/2014
|
|
// ok, let's index for diffbot objects so we can see if they are
|
|
// a dup of another diffbot object, or so we can see when they get
|
|
// revisted, etc.
|
|
//if ( m_setFromDocId || ! m_useSpiderdb ) {
|
|
if ( ! m_useSpiderdb && ! m_isDiffbotJSONObject ) {
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// do not add a status doc if doing a query delete on a status doc
|
|
if ( m_contentTypeValid && m_contentType == CT_STATUS ) {
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// doing it for diffbot throws off smoketests
|
|
// ok, smoketests are updated now, so remove this
|
|
// if ( strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
|
|
// m_spiderStatusDocMetaListValid = true;
|
|
// return &m_spiderStatusDocMetaList;
|
|
// }
|
|
|
|
// we double add regular html urls in a query reindex because the
|
|
// json url adds the parent, so the parent gets added twice sometimes,
|
|
// and for some reason it is adding a spider status doc the 2nd time
|
|
// so cut that out. this is kinda a hack b/c i'm not sure what's
|
|
// going on. but you can set a break point here and see what's up if
|
|
// you want.
|
|
// MDW: likewise, take this out, i want these recorded as well..
|
|
// if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
|
|
// m_spiderStatusDocMetaListValid = true;
|
|
// return &m_spiderStatusDocMetaList;
|
|
// }
|
|
|
|
// . fake this out so we do not core
|
|
// . hashWords3() uses it i guess
|
|
bool forcedLangId = false;
|
|
if ( ! m_langIdValid ) {
|
|
forcedLangId = true;
|
|
m_langIdValid = true;
|
|
m_langId = langUnknown;
|
|
}
|
|
|
|
// prevent more cores
|
|
bool forcedSiteNumInlinks = false;
|
|
if ( ! m_siteNumInlinksValid ) {
|
|
forcedSiteNumInlinks = true;
|
|
m_siteNumInlinks = 0;
|
|
m_siteNumInlinksValid = true;
|
|
}
|
|
|
|
SafeBuf *mbuf = getSpiderStatusDocMetaList2 ( reply );
|
|
|
|
if ( forcedLangId )
|
|
m_langIdValid = false;
|
|
|
|
if ( forcedSiteNumInlinks ) {
|
|
m_siteNumInlinksValid = false;
|
|
}
|
|
|
|
return mbuf;
|
|
}
|
|
|
|
// . the spider status doc
|
|
// . TODO:
|
|
// usedProxy:1
|
|
// proxyIp:1.2.3.4
|
|
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
|
|
|
|
setStatus ( "making spider reply meta list");
|
|
|
|
// . we also need a unique docid for indexing the spider *reply*
|
|
// as a separate document
|
|
// . use the same url, but use a different docid.
|
|
// . use now to mix it up
|
|
//int32_t now = getTimeGlobal();
|
|
//int64_t h = hash64(m_docId, now );
|
|
// to keep qa test consistent this docid should be consistent
|
|
// so base it on spidertime of parent doc.
|
|
// if doc is being force deleted then this is invalid!
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
int64_t h = hash64(m_docId, m_spideredTime );
|
|
// mask it out
|
|
int64_t d = h & DOCID_MASK;
|
|
// try to get an available docid, preferring "d" if available
|
|
int64_t *uqd = getAvailDocIdOnly ( d );
|
|
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
|
|
|
|
m_addedStatusDocId = *uqd;
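	// Sketch (comments only, made-up values) of how the status doc's
	// docid is derived above so it never collides with a real document:
	//
	//   int64_t h = hash64 ( 123456789LL  /* parent m_docId       */ ,
	//                        1409900000LL /* parent m_spideredTime */ );
	//   int64_t d = h & DOCID_MASK;   // keep only the docid bits
	//   // getAvailDocIdOnly(d) then probes near "d" for a docid that
	//   // is not already taken and returns it in *uqd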
|
|
|
|
// unsigned char *hc = (unsigned char *)getHopCount();
|
|
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
|
|
|
|
int32_t tmpVal = -1;
|
|
int32_t *priority = &tmpVal;
|
|
int32_t *ufn = &tmpVal;
|
|
|
|
// prevent a core if sreq is not valid, these will freak out
|
|
// diffbot replies may not have a valid m_sreq
|
|
if ( m_sreqValid ) {
|
|
priority = getSpiderPriority();
|
|
if ( ! priority || priority == (void *)-1 )
|
|
return (SafeBuf *)priority;
|
|
|
|
ufn = getUrlFilterNum();
|
|
if ( ! ufn || ufn == (void *)-1 )
|
|
return (SafeBuf *)ufn;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
Json *jp1 = NULL;
|
|
// i've seen ptr_utf8Content NULL and content type as html for
|
|
// some reason when deleting a diffbot object doc so check for that
|
|
// here and forget it. we don't want getParsedJson() to core.
|
|
if ( m_isDiffbotJSONObject &&
|
|
m_contentType == CT_JSON &&
|
|
m_contentTypeValid ) {
|
|
jp1 = getParsedJson();
|
|
if ( ! jp1 || jp1 == (void *)-1) return (SafeBuf *)jp1;
|
|
}
|
|
|
|
// sanity
|
|
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// why isn't gbhopcount: being indexed consistently?
|
|
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// reset just in case
|
|
m_spiderStatusDocMetaList.reset();
|
|
|
|
// sanity
|
|
if ( *uqd <= 0 || *uqd > MAX_DOCID ) {
|
|
log("xmldoc: avail docid = %"INT64". could not index spider "
|
|
"reply or %s",*uqd,m_firstUrl.m_url);
|
|
//char *xx=NULL;*xx=0; }
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
// the old doc
|
|
XmlDoc *od = NULL;
|
|
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;
|
|
|
|
Url *fu = &m_firstUrl;
|
|
|
|
// . make a little json doc that we'll hash up
|
|
// . only index the fields in this doc, no extra gbdocid: inurl:
|
|
// hash terms
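	// Example of the spider-status JSON this block builds up in "jd"
	// (illustrative only; which gbss* fields appear depends on what is
	//  valid for this spidering):
	//
	//   {
	//     "type":"status",
	//     "gbssUrl":"http://www.example.com/page.html",
	//     "gbssStatusCode":0,
	//     "gbssStatusMsg":"<mstrerror(m_indexCode)>",
	//     "gbssHttpStatus":200,
	//     "gbssDomain":"example.com",
	//     "gbssSubdomain":"www.example.com",
	//     "gbssDocId":123456789,
	//     "gbssSpiderTime":1409900000,
	//     "gbssIp":"1.2.3.4"
	//   }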
|
|
SafeBuf jd;
|
|
jd.safePrintf("{\n");
|
|
|
|
// so type:status query works
|
|
jd.safePrintf("\"type\":\"status\",\n");
|
|
|
|
jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );
|
|
|
|
if ( ptr_redirUrl )
|
|
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
|
|
ptr_redirUrl);
|
|
|
|
if ( m_indexCodeValid ) {
|
|
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
|
|
jd.safePrintf("\"gbssStatusMsg\":\"");
|
|
jd.jsonEncode (mstrerror(m_indexCode));
|
|
jd.safePrintf("\",\n");
|
|
}
|
|
else {
|
|
jd.safePrintf("\"gbssStatusCode\":-1,\n");
|
|
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
|
|
}
|
|
|
|
|
|
if ( m_httpStatusValid )
|
|
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
|
|
(int32_t)m_httpStatus);
|
|
|
|
// do not index gbssIsSeedUrl:0 because there will be too many usually
|
|
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
|
if ( isSeed )
|
|
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
|
|
|
|
if ( od )
|
|
jd.safePrintf("\"gbssWasIndexed\":1,\n");
|
|
else
|
|
jd.safePrintf("\"gbssWasIndexed\":0,\n");
|
|
|
|
int32_t now = getTimeGlobal();
|
|
if ( od )
|
|
jd.safePrintf("\"gbssAgeInIndex\":"
|
|
"%"UINT32",\n",now - od->m_spideredTime);
|
|
|
|
if ( m_isDiffbotJSONObject ) { // && cr->m_isCustomCrawl
|
|
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
|
|
JsonItem *jsonItem = NULL;
|
|
if ( jp1 ) jsonItem = jp1->getItem("diffbotUri");
|
|
if ( jsonItem ) {
|
|
jd.safePrintf("\"gbssDiffbotUri\":\"");
|
|
int32_t vlen;
|
|
char *val = jsonItem->getValueAsString( &vlen );
|
|
if ( val ) jd.safeMemcpy ( val , vlen );
|
|
jd.safePrintf("\",\n");
|
|
}
|
|
else
|
|
jd.safePrintf("\"gbssDiffbotUri\":"
|
|
"\"none\",\n");
|
|
// show the type as gbssDiffbotType:"article" etc.
|
|
JsonItem *dti = NULL;
|
|
if ( jp1 )
|
|
dti = jp1->getItem("type");
|
|
if ( dti ) {
|
|
jd.safePrintf("\"gbssDiffbotType\":\"");
|
|
int32_t vlen;
|
|
char *val = dti->getValueAsString( &vlen );
|
|
if ( val ) jd.jsonEncode ( val , vlen );
|
|
jd.safePrintf("\",\n");
|
|
}
|
|
|
|
}
|
|
else { // if ( cr->m_isCustomCrawl ) {
|
|
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
|
|
}
|
|
|
|
jd.safePrintf("\"gbssDomain\":\"");
|
|
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
|
|
jd.safePrintf("\",\n");
|
|
|
|
jd.safePrintf("\"gbssSubdomain\":\"");
|
|
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
|
|
jd.safePrintf("\",\n");
|
|
|
|
//if ( m_redirUrlPtr && m_redirUrlValid )
|
|
//if ( m_numRedirectsValid )
|
|
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);
|
|
|
|
if ( m_docIdValid )
|
|
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
|
|
|
|
if ( m_parentDocPtr && m_isChildDoc && m_parentDocPtr->m_docIdValid )
|
|
jd.safePrintf("\"gbssParentDocId\":%"INT64",\n",
|
|
m_parentDocPtr->m_docId);
|
|
|
|
if ( m_hopCountValid )
|
|
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
|
|
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);
|
|
|
|
// crawlbot round
|
|
if ( cr->m_isCustomCrawl )
|
|
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
|
|
cr->m_spiderRoundNum);
|
|
|
|
// for -diffbotxyz fake docs addedtime is 0
|
|
if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
|
|
// in Spider.cpp we try to set m_sreq's m_addedTime to the
|
|
// min of all the spider requests, and we try to ensure
|
|
// that in the case of deduping we preserve the one with
|
|
// the oldest time. no, now we actually use
|
|
// m_discoveryTime since we were using m_addedTime in
|
|
// the url filters as it was originally intended.
|
|
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
|
|
m_sreq.m_discoveryTime);
|
|
}
|
|
|
|
if ( m_isDupValid && m_isDup )
|
|
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
|
|
m_docIdWeAreADupOf);
|
|
|
|
// how many spiderings were successful vs. failed
|
|
// these don't work because we only store one reply
|
|
// which overwrites any older reply. that's how the
|
|
// key is. we can change the key to use the timestamp
|
|
// and not parent docid in makeKey() for spider
|
|
// replies later.
|
|
// if ( m_sreqValid ) {
|
|
// jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
|
|
// m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
|
|
// jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
|
|
// m_sreq.m_reservedc1);
|
|
// jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
|
|
// m_sreq.m_reservedc2);
|
|
// }
|
|
|
|
if ( m_spideredTimeValid )
|
|
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",
|
|
m_spideredTime);
|
|
else
|
|
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",0);
|
|
|
|
if ( m_firstIndexedDateValid )
|
|
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
|
|
m_firstIndexedDate);
|
|
|
|
if ( m_contentHash32Valid )
|
|
jd.safePrintf("\"gbssContentHash32\":%"UINT32",\n",
|
|
m_contentHash32);
|
|
|
|
// so we know what hostid spidered the url. this is not the
|
|
// same hostid that will store it necessarily
|
|
jd.safePrintf("\"gbssSpideredByHostId\":%"INT32",\n",
|
|
(int32_t)g_hostdb.getMyHostId());
|
|
|
|
// which shard will store the titlerec and index terms? it
|
|
// is based on docid.
|
|
if ( m_docIdValid ) {
|
|
int32_t shardNum = getShardNumFromDocId ( m_docId );
|
|
jd.safePrintf("\"gbssStoredOnShard\":%"INT32",\n",shardNum);
|
|
}
|
|
|
|
if ( m_downloadStartTimeValid && m_downloadEndTimeValid ) {
|
|
jd.safePrintf("\"gbssDownloadStartTimeMS\":%"INT64",\n",
|
|
m_downloadStartTime);
|
|
jd.safePrintf("\"gbssDownloadEndTimeMS\":%"INT64",\n",
|
|
m_downloadEndTime);
|
|
|
|
int64_t took = m_downloadEndTime - m_downloadStartTime;
|
|
jd.safePrintf("\"gbssDownloadDurationMS\":%"INT64",\n",took);
|
|
|
|
jd.safePrintf("\"gbssDownloadStartTime\":%"UINT32",\n",
|
|
(uint32_t)(m_downloadStartTime/1000));
|
|
|
|
jd.safePrintf("\"gbssDownloadEndTime\":%"UINT32",\n",
|
|
(uint32_t)(m_downloadEndTime/1000));
|
|
}
|
|
|
|
|
|
jd.safePrintf("\"gbssUsedRobotsTxt\":%"INT32",\n",
|
|
m_useRobotsTxt);
|
|
|
|
if ( m_linksValid )
|
|
jd.safePrintf("\"gbssNumOutlinksOnPage\":%"INT32",\n",
|
|
(int32_t)m_links.getNumLinks());
|
|
|
|
//if ( m_numOutlinksAddedValid )
|
|
// crap, this is not right because we only call addOutlinksToMetaList()
|
|
// after we call this function.
|
|
// jd.safePrintf("\"gbssNumOutlinksAdded\":%"INT32",\n",
|
|
// (int32_t)m_numOutlinksAdded);
|
|
|
|
// how many download/indexing errors we've had, including this one
|
|
// if applicable.
|
|
if ( m_srepValid )
|
|
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",
|
|
m_srep.m_errCount);
|
|
else
|
|
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",0);
|
|
|
|
|
|
if ( m_ipValid )
|
|
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
|
|
else
|
|
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");
|
|
|
|
if ( m_ipEndTime ) {
|
|
int64_t took = m_ipEndTime - m_ipStartTime;
|
|
jd.safePrintf("\"gbssIpLookupTimeMS\":%"INT64",\n",took);
|
|
}
|
|
|
|
if ( m_siteNumInlinksValid ) {
|
|
jd.safePrintf("\"gbssSiteNumInlinks\":%"INT32",\n",
|
|
(int32_t)m_siteNumInlinks);
|
|
char siteRank = getSiteRank();
|
|
jd.safePrintf("\"gbssSiteRank\":%"INT32",\n",
|
|
(int32_t)siteRank);
|
|
}
|
|
|
|
jd.safePrintf("\"gbssContentInjected\":%"INT32",\n",
|
|
(int32_t)m_contentInjected);
|
|
|
|
if ( m_percentChangedValid && od )
|
|
jd.safePrintf("\"gbssPercentContentChanged\""
|
|
":%.01f,\n",
|
|
m_percentChanged);
|
|
|
|
if ( ! m_isDiffbotJSONObject )
|
|
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
|
|
*priority);
|
|
|
|
// this could be -1, careful
|
|
if ( *ufn >= 0 && ! m_isDiffbotJSONObject )
|
|
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
|
|
cr->m_regExs[*ufn].getBufStart());
|
|
|
|
// we forced the langid valid above
|
|
if ( m_langIdValid && m_contentLen )
|
|
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
|
|
getLangAbbr(m_langId));
|
|
|
|
if ( m_contentTypeValid && m_contentLen )
|
|
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
|
|
g_contentTypeStrings[m_contentType]);
|
|
|
|
if ( m_contentValid )
|
|
jd.safePrintf("\"gbssContentLen\":%"INT32",\n",
|
|
m_contentLen);
|
|
|
|
if ( m_isContentTruncatedValid )
|
|
jd.safePrintf("\"gbssIsContentTruncated\":%"INT32",\n",
|
|
(int32_t)m_isContentTruncated);
|
|
|
|
|
|
// do not show the -1 any more, just leave it out then
|
|
// to make things look prettier
|
|
if ( m_crawlDelayValid && m_crawlDelay >= 0 &&
|
|
! m_isDiffbotJSONObject )
|
|
// -1 if none?
|
|
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
|
|
(int32_t)m_crawlDelay);
|
|
|
|
// was this url ever sent to diffbot either now or at a previous
|
|
// spider time?
|
|
if ( ! m_isDiffbotJSONObject ) {
|
|
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
|
|
(int)m_sentToDiffbot);
|
|
|
|
// sent to diffbot?
|
|
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
|
|
(int)m_sentToDiffbotThisTime);
|
|
}
|
|
|
|
// page must have been downloaded for this one
|
|
if ( cr->m_isCustomCrawl &&
|
|
m_utf8ContentValid &&
|
|
! m_isDiffbotJSONObject &&
|
|
m_content &&
|
|
m_contentValid &&
|
|
cr->m_diffbotPageProcessPattern.getBufStart() &&
|
|
cr->m_diffbotPageProcessPattern.getBufStart()[0] ) {
|
|
char match = doesPageContentMatchDiffbotProcessPattern();
|
|
jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n",
|
|
(int)match);
|
|
}
|
|
if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){
|
|
|
|
char *url = getFirstUrl()->getUrl();
|
|
|
|
// the crawl regex
|
|
int match = 1;
|
|
regex_t *ucr = &cr->m_ucr;
|
|
if ( ! cr->m_hasucr ) ucr = NULL;
|
|
if ( ucr && regexec(ucr,url,0,NULL,0) ) match = 0;
|
|
if ( ucr )
|
|
jd.safePrintf("\"gbssMatchesUrlCrawlRegEx\":%i,\n",
|
|
match);
|
|
|
|
// now the substring pattern
|
|
match = 1;
|
|
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
|
|
if ( ucp && ! ucp[0] ) ucp = NULL;
|
|
if ( ucp && ! doesStringContainPattern(url,ucp) ) match = 0;
|
|
if ( ucp )
|
|
jd.safePrintf("\"gbssMatchesUrlCrawlPattern\":%i,\n",
|
|
match);
|
|
|
|
// now process regex
|
|
match = 1;
|
|
regex_t *upr = &cr->m_upr;
|
|
if ( ! cr->m_hasupr ) upr = NULL;
|
|
if ( upr && regexec(upr,url,0,NULL,0) ) match = 0;
|
|
		if ( upr )
			// note: this block reports the *process* url regex;
			// the crawl regex result was already printed above, so
			// use the process key here (named to match the
			// Crawl/Process Pattern keys nearby)
			jd.safePrintf("\"gbssMatchesUrlProcessRegEx\":%i,\n",
|
|
match);
|
|
|
|
// now process pattern
|
|
match = 1;
|
|
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
|
|
if ( upp && ! upp[0] ) upp = NULL;
|
|
if ( upp && ! doesStringContainPattern(url,upp) ) match = 0;
|
|
if ( upp )
|
|
jd.safePrintf("\"gbssMatchesUrlProcessPattern\":%i,\n",
|
|
match);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime &&
|
|
! m_isDiffbotJSONObject ) {
|
|
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
|
|
m_diffbotReplyError);
|
|
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
|
|
jd.jsonEncode(mstrerror(m_diffbotReplyError));
|
|
jd.safePrintf("\",\n");
|
|
jd.safePrintf("\"gbssDiffbotReplyLen\":%"INT32",\n",
|
|
m_diffbotReply.length());
|
|
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
|
|
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%"INT64",\n",
|
|
took );
|
|
jd.safePrintf("\"gbssDiffbotReplyRetries\":%"INT32",\n",
|
|
m_diffbotReplyRetries );
|
|
// this is not correct at this point we haven't parsed the json
|
|
// jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%"INT32",\n",
|
|
// m_diffbotJSONCount);
|
|
}
|
|
|
|
	// remove the trailing ",\n" so the object stays valid JSON
|
|
jd.incrementLength(-2);
|
|
// end the json spider status doc
|
|
jd.safePrintf("\n}\n");
|
|
|
|
// BEFORE ANY HASHING
|
|
int32_t savedDist = m_dist;
|
|
|
|
// add the index list for it. it returns false and sets g_errno on err
|
|
// otherwise it sets m_spiderStatusDocMetaList
|
|
if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
|
|
return NULL;
|
|
|
|
// now make the titlerec
|
|
char xdhead[2048];
|
|
// just the head of it. this is the hacky part.
|
|
XmlDoc *xd = (XmlDoc *)xdhead;
|
|
// clear it out
|
|
memset ( xdhead, 0 , 2048);
|
|
|
|
// copy stuff from THIS so the spider reply "document" has the same
|
|
// header info stuff
|
|
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
|
|
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
|
|
gbmemcpy ( xdhead , (char *)this , hsize );
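	// What the copy above does, roughly: only the fixed-size header of
	// XmlDoc -- every member laid out before ptr_firstUrl -- is copied
	// into the stack buffer, so the fake status doc shares m_docId,
	// m_ip, flag bits, etc. with the parent, while all of the parent's
	// variable-length ptr_*/size_* buffers stay zeroed (from the memset)
	// until the few we need are pointed at the status JSON and parent
	// url below.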
|
|
|
|
// override spider time in case we had error to be consistent
|
|
// with the actual SpiderReply record
|
|
//xd->m_spideredTime = reply->m_spideredTime;
|
|
//xd->m_spideredTimeValid = true;
|
|
// sanity
|
|
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
|
|
|
|
// this will cause the maroon box next to the search result to
|
|
// say "STATUS" similar to "PDF" "DOC" etc.
|
|
xd->m_contentType = CT_STATUS;
|
|
|
|
int32_t fullsize = &m_dummyEnd - (char *)this;
|
|
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
// the ptr_* were all zero'd out, put the ones we want to keep back in
|
|
SafeBuf tmp;
|
|
// was "Spider Status: %s" but that is unnecessary
|
|
tmp.safePrintf("<title>%s</title>",
|
|
mstrerror(m_indexCode));
|
|
|
|
// if we are a dup...
|
|
if ( m_indexCode == EDOCDUP )
|
|
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
|
|
|
|
if ( m_redirUrlPtr && m_redirUrlValid )
|
|
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
|
|
*/
|
|
|
|
// put stats like we log out from logIt
|
|
//tmp.safePrintf("<div style=max-width:800px;>\n");
|
|
// store log output into doc
|
|
//logIt(&tmp);
|
|
//tmp.safePrintf("\n</div>");
|
|
|
|
// the content is just the title tag above
|
|
// xd->ptr_utf8Content = tmp.getBufStart();
|
|
// xd->size_utf8Content = tmp.length()+1;
|
|
xd->ptr_utf8Content = jd.getBufStart();
|
|
xd->size_utf8Content = jd.length()+1;
|
|
|
|
// keep the same url as the doc we are the spider reply for
|
|
xd->ptr_firstUrl = ptr_firstUrl;
|
|
xd->size_firstUrl = size_firstUrl;
|
|
|
|
// serps need site, otherwise search results core
|
|
xd->ptr_site = ptr_site;
|
|
xd->size_site = size_site;
|
|
|
|
// if this is null then ip lookup failed i guess so just use
|
|
// the subdomain
|
|
if ( ! ptr_site && m_firstUrlValid ) {
|
|
xd->ptr_site = m_firstUrl.getHost();
|
|
xd->size_site = m_firstUrl.getHostLen();
|
|
}
|
|
|
|
// we can't do this the head is not big enough
|
|
// xd->m_collnum = m_collnum;
|
|
// xd->m_collnumValid = m_collnumValid;
|
|
|
|
// use the same uh48 of our parent
|
|
int64_t uh48 = m_firstUrl.getUrlHash48();
|
|
// then make into a titlerec but store in metalistbuf, not m_titleRec
|
|
SafeBuf titleRecBuf;
|
|
// this should not include ptrs that are NULL when compressing
|
|
// using its m_internalFlags1
|
|
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
|
|
return NULL;
|
|
|
|
// concat titleRec to our posdb key records
|
|
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
|
|
return NULL;
|
|
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
|
|
return NULL;
|
|
|
|
// return the right val
|
|
m_dist = savedDist;
|
|
|
|
// ok, good to go, ready to add to posdb and titledb
|
|
m_spiderStatusDocMetaListValid = true;
|
|
return &m_spiderStatusDocMetaList;
|
|
}
|
|
|
|
|
|
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
|
|
|
|
// the posdb table
|
|
HashTableX tt4;
|
|
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
|
|
return false;
|
|
|
|
|
|
Json jp2;
|
|
if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
|
|
g_errno = EBADJSONPARSER;
|
|
return false;
|
|
}
|
|
|
|
// re-set to 0
|
|
m_dist = 0;
|
|
|
|
// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = &tt4;
|
|
hi.m_desc = "json spider status object";
|
|
hi.m_useCountTable = false;
|
|
hi.m_useSections = false;
|
|
|
|
// fill up tt4. false -> do not hash without field prefixes.
|
|
hashJSONFields2 ( &tt4 , &hi , &jp2 , false );
|
|
|
|
|
|
/*
|
|
char buf[64];
|
|
int32_t bufLen;
|
|
|
|
// hash 'type:status' similar to 'type:json' etc.
|
|
hi.m_prefix = "type";
|
|
if ( ! hashString("status" , &hi ) ) return NULL;
|
|
|
|
// . hash gbstatus:0 for no error, otherwise the error code
|
|
// . this also hashes it as a number so we don't have to
|
|
// . so we can do histograms on this #
|
|
hi.m_prefix = "gbstatus";
|
|
hi.m_desc = "spider error number as string";
|
|
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode );
|
|
if ( ! hashString( buf , &hi ) ) return NULL;
|
|
*/
|
|
|
|
/*
|
|
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
|
|
logf(LOG_DEBUG,"hashing indexcode=%"INT32"",m_indexCode);
|
|
bool ok = false;
|
|
if ( m_indexCode ) ok = true;
|
|
// scan the keys in tt and make sure the termid fo
|
|
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
|
|
int32_t recSize = 0;
|
|
int32_t rcount = 0;
|
|
char *p = m_spiderStatusDocMetaList.getBufStart();
|
|
char *pend =m_spiderStatusDocMetaList.getBuf();
|
|
for ( ; p < pend ; p += recSize ) {
|
|
// get rdbid, RDB_POSDB
|
|
uint8_t rdbId = *p & 0x7f;
|
|
// skip
|
|
p++;
|
|
// get key size
|
|
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
|
// init this
|
|
int32_t recSize = ks;
|
|
// convert into a key128_t, the biggest possible key
|
|
//key224_t k ;
|
|
char k[MAX_KEY_BYTES];
|
|
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
|
|
//k.setMin();
|
|
gbmemcpy ( &k , p , ks );
|
|
// is it a negative key?
|
|
char neg = false;
|
|
if ( ! ( p[0] & 0x01 ) ) neg = true;
|
|
// this is now a bit in the posdb key so we can rebalance
|
|
char shardByTermId = false;
|
|
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
|
|
shardByTermId = true;
|
|
// skip it
|
|
p += ks;
|
|
// . always zero if key is negative
|
|
// . this is not the case unfortunately...
|
|
if ( neg ) {char *xx=NULL;*xx=0; }
|
|
// print dbname
|
|
if ( rdbId != RDB_POSDB ) { char *xx=NULL;*xx=0; }
|
|
// get termid et al
|
|
key144_t *k2 = (key144_t *)k;
|
|
int64_t tid = g_posdb.getTermId(k2);
|
|
log("db: tid=%"INT64"",tid);
|
|
if ( tid == 199947062354729LL ) ok = true;
|
|
//if ( m_indexCode == 0 && tid != 199947062354729LL ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
}
|
|
if ( ! ok ) { char *xx=NULL;*xx=0; }
|
|
goto SKIP;
|
|
// was here....
|
|
*/
|
|
|
|
/*
|
|
// gbstatus:"tcp timed out"
|
|
hi.m_prefix = "gbstatusmsg";
|
|
hi.m_desc = "spider error msg";
|
|
if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;
|
|
|
|
//hi.m_prefix = "gbdocid";
|
|
//hi.m_desc = "docid";
|
|
//bufLen = sprintf ( buf , "%"UINT64"", *uqd ) ;
|
|
//if ( ! hashString( buf , &hi ) ) return NULL;
|
|
|
|
// . then the url. url: site: ip: etc. terms
|
|
// . do NOT hash non-fielded terms so we do not get "status"
|
|
// results poluting the serps => false
|
|
if ( ! hashUrl ( &tt4 , true ) ) return NULL;
|
|
|
|
// false --> do not hash the gbdoc* terms (CT_STATUS)
|
|
hashDateNumbers ( &tt4 , true );
|
|
*/
|
|
|
|
// store keys in safebuf then to make our own meta list
|
|
addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );
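	// Rough layout of what addTable144() appends for each termid in
	// tt4 (a sketch; see addTable144() for the exact format):
	//
	//   [1 byte rdbId = RDB_POSDB][18 byte posdb key]
	//   [1 byte rdbId = RDB_POSDB][18 byte posdb key]
	//   ...
	//
	// the caller then tacks on the RDB_TITLEDB byte plus the compressed
	// titlerec to complete the meta list.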
|
|
|
|
// debug this shit
|
|
//SafeBuf tmpsb;
|
|
//printMetaList ( m_spiderStatusDocMetaList.getBufStart() ,
|
|
// m_spiderStatusDocMetaList.getBuf(),
|
|
// &tmpsb );
|
|
//logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());
|
|
|
|
return true;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta tags" );
|
|
|
|
// assume it's empty
|
|
char buf [ 32*1024 ];
|
|
int32_t bufLen = 32*1024 - 1;
|
|
buf[0] = '\0';
|
|
int32_t n = m_xml.getNumNodes();
|
|
XmlNode *nodes = m_xml.getNodes();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "custom meta tag";
|
|
|
|
// find the first meta summary node
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != 68 ) continue;
|
|
// only get content for <meta name=..> not <meta http-equiv=..>
|
|
int32_t tagLen;
|
|
char *tag = m_xml.getString ( i , "name" , &tagLen );
|
|
char *tptr = tag;
|
|
char tagLower[128];
|
|
int32_t j ;
|
|
int32_t code;
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// make tag name lower case and do not allow bad chars
|
|
if ( tagLen > 126 ) tagLen = 126 ;
|
|
to_lower3_a ( tag , tagLen , tagLower );
|
|
for ( j = 0 ; j < tagLen ; j++ ) {
|
|
// bail if has unacceptable chars
|
|
if ( ! is_alnum_a ( tag[j] ) &&
|
|
tag[j] != '-' &&
|
|
tag[j] != '_' &&
|
|
tag[j] != '.' ) break;
|
|
// convert to lower
|
|
tagLower[j] = to_lower_a ( tag[j] );
|
|
}
|
|
// skip this meta if had unacceptable chars
|
|
if ( j < tagLen ) continue;
|
|
// is it recognized?
|
|
code = getFieldCode ( tag , tagLen );
|
|
// after version 45 or more, do not allow gbrss
|
|
// meta tags, because those are now reserved for us
|
|
if ( code == FIELD_GBRSS ) continue;
|
|
		// allow gbrss: fields for earlier versions though
		// (note: unreachable given the unconditional continue above;
		//  kept for reference)
		if ( code == FIELD_GBRSS ) code = FIELD_GENERIC;
|
|
// . do not allow reserved tag names
|
|
// . title,url,suburl,
|
|
if ( code != FIELD_GENERIC ) continue;
|
|
// this is now reserved
|
|
// do not hash keyword, keywords, description, or summary metas
|
|
// because that is done in hashRange() below based on the
|
|
// tagdb (ruleset) record
|
|
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
|
|
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
|
|
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
|
|
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
|
|
continue;
|
|
// . don't allow reserved names: site, url, suburl, link and ip
|
|
// . actually, the colon is included as part of those
|
|
// field names, so we really lucked out...!
|
|
// . index this converted tag name
|
|
tptr = tagLower;
|
|
|
|
// get the content
|
|
int32_t len;
|
|
char *s = m_xml.getString ( i , "content" , &len );
|
|
if ( ! s || len <= 0 ) continue;
|
|
// . ensure not too big for our buffer (keep room for a \0)
|
|
// . TODO: this is wrong, should be len+1 > bufLen,
|
|
// but can't fix w/o resetting the index (COME BACK HERE
|
|
// and see where we index meta tags besides this place!!!)
|
|
// remove those other places, except... what about keywords
|
|
// and description?
|
|
if ( len+1 >= bufLen ) {
|
|
//len = bufLen - 1;
|
|
// assume no punct to break on!
|
|
len = 0;
|
|
// only cut off at punctuation
|
|
char *p = s;
|
|
			// scan only what will fit in buf; len was zeroed
			// above, so bound the scan by bufLen, not len
			char *pend = s + bufLen - 1;
|
|
char *last = NULL;
|
|
int32_t size ;
|
|
for ( ; p < pend ; p += size ) {
|
|
// skip if utf8 char
|
|
size = getUtf8CharSize(*p);
|
|
// skip if 2+ bytes
|
|
if ( size > 1 ) continue;
|
|
// skip if not punct
|
|
if ( is_alnum_a(*p) ) continue;
|
|
// mark it
|
|
last = p;
|
|
}
|
|
if ( last ) len = last - s;
|
|
// this old way was faster...:
|
|
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
|
|
}
|
|
// convert html entities to their chars
|
|
len = saftenTags ( buf , bufLen , s , len );
|
|
// NULL terminate the buffer
|
|
buf[len] = '\0';
|
|
|
|
// temp null term
|
|
char c = tptr[tagLen];
|
|
tptr[tagLen] = 0;
|
|
// custom
|
|
hi.m_prefix = tptr;
|
|
// desc is NULL, prefix will be used as desc
|
|
bool status = hashString ( buf,len,&hi );
|
|
// put it back
|
|
tptr[tagLen] = c;
|
|
// bail on error, g_errno should be set
|
|
if ( ! status ) return false;
|
|
|
|
// return false with g_errno set on error
|
|
//if ( ! hashNumber ( buf , bufLen , &hi ) )
|
|
// return false;
|
|
}
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashMetaData ( HashTableX *tt ) {
|
|
|
|
if ( ! ptr_metadata || !ptr_metadata[0] ) return true;
|
|
|
|
Json jp;
|
|
|
|
if ( ! jp.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
|
|
log("XmlDoc had error parsing json in metadata %s",
|
|
ptr_metadata);
|
|
return false;
|
|
}
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta data";
|
|
hi.m_useCountTable = false;
|
|
|
|
// always reset to word pos to 0 now when hashing a json field
|
|
// since it shouldn't matter because they are in a field so we
|
|
// have to search like myfield:whatever. this way we can
|
|
// augment ptr_metadata on an EDOCUNCHANGED error and
|
|
// not end up with undeleteable data in posdb. if we have
|
|
// duplicate fields in our doc and our doc is json, we could have
|
|
// some word position conflicts, which kinda sucks, but can be
|
|
	// avoided because this is HASHGROUP_INMETATAG, but should really
|
|
// be HASHGROUP_INMETADATA just to be sure.
|
|
int32_t saved = m_dist;
|
|
m_dist = 0;
|
|
|
|
hashJSONFields2 ( tt , &hi , &jp , false );
|
|
|
|
m_dist = saved;
|
|
|
|
return true;
|
|
}
|
|
|
|
// slightly greater than m_spideredTime, which is the download time.
|
|
// we use this for sorting as well, like for the widget so things
|
|
// don't really get added out of order and not show up in the top spot
|
|
// of the widget list.
|
|
int32_t XmlDoc::getIndexedTime() {
|
|
if ( m_indexedTimeValid ) return m_indexedTime;
|
|
m_indexedTime = getTimeGlobal();
|
|
return m_indexedTime;
|
|
}
|
|
|
|
// . hash dates for sorting by using gbsortby: and gbrevsortby:
|
|
// . do 'gbsortby:gbspiderdate' as your query to see this in action
|
|
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
|
|
|
|
// stop if already set
|
|
if ( ! m_spideredTimeValid ) return true;
|
|
|
|
int32_t indexedTime = getIndexedTime();
|
|
|
|
// first the last spidered date
|
|
HashInfo hi;
|
|
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "last spidered date";
|
|
hi.m_prefix = "gbspiderdate";
|
|
|
|
char buf[64];
|
|
int32_t bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
// and index time is >= spider time, so you want to sort by that for
|
|
// the widget for instance
|
|
hi.m_desc = "last indexed date";
|
|
hi.m_prefix = "gbindexdate";
|
|
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
// do not index the rest if we are a "spider reply" document
|
|
// which is like a fake document for seeing spider statuses
|
|
//if ( isStatusDoc == CT_STATUS ) return true;
|
|
//if ( isStatusDoc ) return true;
|
|
|
|
// now for CT_STATUS spider status "documents" we also index
|
|
// gbspiderdate so index this so we can just do a
|
|
// gbsortby:gbdocspiderdate and only get real DOCUMENTS not the
|
|
// spider status "documents"
|
|
hi.m_desc = "doc last spidered date";
|
|
hi.m_prefix = "gbdocspiderdate";
|
|
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
hi.m_desc = "doc last indexed date";
|
|
hi.m_prefix = "gbdocindexdate";
|
|
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime );
|
|
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta zip" );
|
|
|
|
// . set the score based on quality
|
|
// . scores are multiplied by 256 to preserve fractions for adding
|
|
uint32_t score = *getSiteNumInlinks8() * 256 ;
|
|
if ( score <= 0 ) score = 1;
|
|
// search for meta date
|
|
char buf [ 32 ];
|
|
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
|
|
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
|
|
char *p = buf;
|
|
char *pend = buf + bufLen ;
|
|
if ( bufLen <= 0 ) return true;
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
//hi.m_prefix = "zipcode";
|
|
hi.m_prefix = "gbzipcode";
|
|
|
|
nextZip:
|
|
// . parse out the zip codes, may be multiple ones
|
|
// . skip non-digits
|
|
while ( p < pend && ! is_digit(*p) ) p++;
|
|
// skip if no digits
|
|
if ( p == pend ) return true;
|
|
// need at least 5 consecutive digits
|
|
if ( p + 5 > pend ) return true;
|
|
// if not a zip code, skip it
|
|
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
|
|
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
|
|
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
|
|
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
|
|
	// do we have too many consecutive digits?
|
|
if ( p + 5 != pend && is_digit(p[5]) ) {
|
|
// if so skip this whole string of digits
|
|
p += 5; while ( p < pend && is_digit(*p) ) p++;
|
|
goto nextZip;
|
|
}
|
|
// 90210 --> 90 902 9021 90210
|
|
for ( int32_t i = 0 ; i <= 3 ; i++ )
|
|
// use prefix as description
|
|
if ( ! hashString ( p,5-i,&hi ) ) return false;
|
|
p += 5;
|
|
goto nextZip;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashContentType ( HashTableX *tt ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
uint8_t ctype = *getContentType();
|
|
char *s = NULL;
|
|
|
|
setStatus ( "hashing content type" );
|
|
|
|
|
|
// hash numerically so we can do gbfacetint:type on it
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "type";
|
|
|
|
char tmp[6];
|
|
sprintf(tmp,"%"UINT32"",(uint32_t)ctype);
|
|
if ( ! hashString (tmp,gbstrlen(tmp),&hi ) ) return false;
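	// e.g. gbfacetint:type can then histogram these numeric
	// content-type codes, while the string form hashed below supports
	// queries like type:html or type:pdf (illustrative usage; the codes
	// themselves are defined in HttpMime.h)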
|
|
|
|
|
|
// these ctypes are defined in HttpMime.h
|
|
switch (ctype) {
|
|
case CT_HTML: s = "html"; break;
|
|
case CT_TEXT: s = "text"; break;
|
|
case CT_XML : s = "xml" ; break;
|
|
case CT_PDF : s = "pdf" ; break;
|
|
case CT_DOC : s = "doc" ; break;
|
|
case CT_XLS : s = "xls" ; break;
|
|
case CT_PPT : s = "ppt" ; break;
|
|
case CT_PS : s = "ps" ; break;
|
|
// for diffbot. so we can limit search to json objects
|
|
// in Diffbot.cpp
|
|
case CT_JSON: s = "json" ; break;
|
|
}
|
|
// bail if unrecognized content type
|
|
if ( ! s ) return true;
|
|
|
|
// hack for diffbot. do not hash type:json because diffbot uses
|
|
// that for searching diffbot json objects
|
|
if ( cr->m_isCustomCrawl && ctype==CT_JSON && !m_isDiffbotJSONObject )
|
|
return true;
|
|
|
|
// . now hash it
|
|
// . use a score of 1 for all
|
|
// . TODO: ensure doc counting works ok with this when it does
|
|
// it's interpolation
|
|
return hashString (s,gbstrlen(s),&hi );
|
|
}
|
|
|
|
// . hash the link: terms
|
|
// . ensure that more useful linkers are scored higher
|
|
// . useful for computing offsite link text for qdb-ish algorithm
|
|
// . NOTE: for now i do not hash links to the same domain in order to
|
|
// hopefully save 10%-25% index space
|
|
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
|
|
// different site links with no link text will be ranked behind them
|
|
// . the 8-bit bitmap of the score of a link: term:
|
|
// . 00ubdcss u = link is Unbanned? b = link isBanned?
|
|
// d = link dirty? c = link clean?
|
|
// s = 01 if no link text, 10 if link text
|
|
// . NOTE: this is used in Msg18.cpp for extraction
|
|
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
|
|
// so i moved the bits down
|
|
bool XmlDoc::hashLinks ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing links" );
|
|
|
|
	// shortcuts
|
|
bool isRSSFeed = *getIsRSS();
|
|
Url *cu = getCurrentUrl() ;
|
|
Url *ru = *getRedirUrl() ;
|
|
|
|
char dbuf[8*4*1024];
|
|
HashTableX dedup;
|
|
dedup.set( 8,0,1024,dbuf,8*4*1024,false,m_niceness,"hldt");
|
|
|
|
// see ../url/Url2.cpp for hashAsLink() algorithm
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
		// skip links with zero length
|
|
if ( m_links.m_linkLens[i] == 0 ) continue;
|
|
// . skip if we are rss page and this link is an <a href> link
|
|
// . we only harvest/index <link> urls from rss feeds
|
|
// . or in the case of feedburner, those orig tags
|
|
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) )
|
|
continue;
|
|
// if we have a <feedburner:origLink> tag, then ignore <link>
|
|
// tags and only get the links from the original links
|
|
if ( m_links.m_isFeedBurner &&
|
|
!(m_links.m_linkFlags[i] & LF_FBTAG) )
|
|
continue;
|
|
// normalize the link
|
|
Url link;
|
|
// now we always add "www" to these links so that any link
|
|
// to cnn.com is same as link to www.cnn.com, because either
|
|
// we index cnn.com or www.cnn.com but not both providing
|
|
// their content is identical (deduping). This way whichever
|
|
// one we index, we can take advantage of all link text whether
|
|
// it's to cnn.com or www.cnn.com.
|
|
// Every now and then we add new session ids to our list in
|
|
// Url.cpp, too, so we have to version that.
|
|
// Since this is just for hashing, it shouldn't matter that
|
|
// www.tmblr.co has no IP whereas only tmblr.co does.
|
|
link.set ( m_links.m_linkPtrs[i] ,
|
|
m_links.m_linkLens[i] ,
|
|
true , // addWWW?
|
|
m_links.m_stripIds ,
|
|
false , // stripPound?
|
|
false , // stripCommonFile?
|
|
m_version );// used for new session id stripping
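		// e.g. an outlink written as "http://cnn.com/story.html" is
		// hashed as the www form "http://www.cnn.com/story.html", so
		// a link: lookup finds this page no matter which form the
		// anchor used (illustrative example)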
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . the score depends on some factors:
|
|
// . NOTE: these are no longer valid! (see score bitmap above)
|
|
// . 4 --> if link has different domain AND has link text
|
|
// . 3 --> if link has same domain AND has link text
|
|
// . 2 --> if link has different domain AND no link text
|
|
		// . 1 --> if link has same domain AND no link text
|
|
// . is domain the same as ours?
|
|
// . NOTE: ideally, using the IP domain would be better, but
|
|
// we do not know the ip of the linker right now... so scores
|
|
// may be topped with a bunch of same-ip domain links so that
|
|
// we may not get as much link text as we'd like, since we
|
|
// only sample from one link text per ip domain
|
|
// . now we also just use the mid domain! (excludes TLD)
|
|
bool internal = false;
|
|
int32_t mdlen = cu->getMidDomainLen();
|
|
if ( mdlen == link.getMidDomainLen() &&
|
|
strncmp(cu->getMidDomain(),link.getMidDomain(),mdlen)==0)
|
|
//continue; // sameMidDomain = true;
|
|
internal = true;
|
|
// also check the redir url
|
|
if ( ru ) {
|
|
mdlen = ru->getMidDomainLen();
|
|
if ( mdlen == link.getMidDomainLen() &&
|
|
strncmp(ru->getMidDomain(),
|
|
link.getMidDomain(),mdlen)==0)
|
|
//continue; // sameMidDomain = true;
|
|
internal = true;
|
|
}
|
|
// now make the score
|
|
//unsigned char score ;
|
|
// . TODO: consider not hashing link w/o text!
|
|
// . otherwise, give it a higher score if it's got link TEXT
|
|
//bool gotLinkText = m_links.hasLinkText ( i, m_version );
|
|
// otherwise, beginning with version 21, allow internal links,
|
|
// but with lower scores
|
|
// score
|
|
// internal, no link text: 2
|
|
// internal, w/ link text: 4
|
|
// external, no link text: 6
|
|
// external, w/ link text: 8
|
|
//if ( internal ) {
|
|
// if ( ! gotLinkText ) score = 0x02;
|
|
// else score = 0x04;
|
|
//}
|
|
//else {
|
|
// if ( ! gotLinkText ) score = 0x06;
|
|
// else score = 0x08;
|
|
//}
|
|
|
|
|
|
// dedup this crap
|
|
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "link";
|
|
|
|
// hash link:<url>
|
|
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi ))
|
|
return false;
|
|
|
|
|
|
h = hash64 ( link.getHost() , link.getHostLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
|
|
// fix parm
|
|
hi.m_prefix = "sitelink";
|
|
|
|
// hash sitelink:<urlHost>
|
|
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi))
|
|
return false;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
}
|
|
|
|
// skip this for now
|
|
return true;
|
|
|
|
/*
|
|
setStatus ("hashing gbhasbannedoutlink" );
|
|
|
|
// only lets a domain vote once
|
|
int32_t numBannedOutlinks = *getNumBannedOutlinks();
|
|
//if ( numBannedOutlinks <= 0 ) return true;
|
|
// a score of 235 seems to give a negative return for score8to32()
|
|
uint32_t score = score8to32 ( numBannedOutlinks );
|
|
// make score at least 1!
|
|
if ( score <= 0 ) score = 1;
|
|
// a hack fix
|
|
if ( score > 0x7fffffff ) score = 0x7fffffff;
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "gbhasbannedoutlink";
|
|
|
|
// hash this special thing to help us de-spam the index
|
|
if ( numBannedOutlinks > 0 ) return hashString ("1",1,&hi );
|
|
else return hashString ("0",1,&hi );
|
|
*/
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . hash for linkdb
|
|
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
|
|
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key224_t) ) { char *xx=NULL;*xx=0; }
|
|
if ( dt->m_ds != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this will be different with our new site definitions
|
|
uint32_t linkerSiteHash32 = *getSiteHash32();
|
|
|
|
char siteRank = getSiteRank();
|
|
|
|
if ( ! m_linksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// we need to store this in the title rec for re-building
|
|
// the meta list from the title rec...
|
|
// is this just site info?
|
|
//TagRec ***pgrv = getOutlinkTagRecVector();
|
|
//if ( ! pgrv || pgrv == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
//TagRec **grv = *pgrv;
|
|
|
|
int32_t *linkSiteHashes = getLinkSiteHashes();
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ){
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
// convert siteNumInlinks into a score
|
|
//int32_t numSiteInlinks = *xd->getSiteNumInlinks();
|
|
|
|
unsigned char hopCount = *getHopCount();
|
|
|
|
// use spidered time! might not be current time! like if rebuilding
|
|
// or injecting from a past spider time
|
|
int32_t discoveryDate = getSpideredTime();//TimeGlobal();
|
|
int32_t lostDate = 0;
|
|
|
|
// add in new links
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
// give up control
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if empty
|
|
if ( m_links.m_linkLens[i] == 0 ) continue;
|
|
// . skip if spam, ALWAYS allow internal outlinks though!!
|
|
// . CAUTION: now we must version islinkspam()
|
|
bool spam = m_links.isLinkSpam(i) ;
|
|
// or if it has no link text, skip it
|
|
//if ( ! links->hasLinkText(i,TITLEREC_CURRENT_VERSION) )
|
|
//continue;
|
|
// get site of outlink from tagrec if in there
|
|
int32_t linkeeSiteHash32 = linkSiteHashes[i];
|
|
/*
|
|
TagRec *gr = grv[i];
|
|
char *site = NULL;
|
|
int32_t siteLen = 0;
|
|
if ( gr ) {
|
|
int32_t dataSize = 0;
|
|
site = gr->getString("site",NULL,&dataSize);
|
|
if ( dataSize ) siteLen = dataSize - 1;
|
|
}
|
|
// otherwise, make it the host or make it cut off at
|
|
// a "/user/" or "/~xxxx" or whatever path component
|
|
if ( ! site ) {
|
|
// GUESS link site... TODO: augment for /~xxx
|
|
char *s = m_links.getLink(i);
|
|
//int32_t slen = m_links.getLinkLen(i);
|
|
//siteLen = slen;
|
|
site = ::getHost ( s , &siteLen );
|
|
}
|
|
uint32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
|
|
*/
|
|
|
|
//
|
|
// when setting the links class it should set the site hash
|
|
//
|
|
|
|
|
|
// set this key, it is the entire record
|
|
key224_t k;
|
|
k = g_linkdb.makeKey_uk ( linkeeSiteHash32 ,
|
|
m_links.getLinkHash64(i) ,
|
|
spam , // link spam?
|
|
siteRank , // was quality
|
|
hopCount,
|
|
*getIp() ,
|
|
*getDocId() ,
|
|
discoveryDate ,
|
|
lostDate ,
|
|
false , // new add?
|
|
linkerSiteHash32 ,
|
|
false );// delete?
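		// sketch of intent: the key leads with the linkee's site and
		// url hashes, so every inlink to a given page (or site) lands
		// adjacent in linkdb and can be read with one range scan; see
		// Linkdb::makeKey_uk() for the exact bit layout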
|
|
/*
|
|
// debug
|
|
if ( m_links.getLinkHash64(i) != 0x3df1c439a364e18dLL )
|
|
continue;
|
|
//char c = site[siteLen];
|
|
//site[siteLen]=0;
|
|
//char tmp[1024];
|
|
//sprintf(tmp,"xmldoc: hashinglink site=%s sitelen=%"INT32" ",
|
|
// site,siteLen);
|
|
//site[siteLen] = c;
|
|
log(//"%s "
|
|
"url=%s "
|
|
"linkeesitehash32=0x%08"XINT32" "
|
|
"linkersitehash32=0x%08"XINT32" "
|
|
"urlhash64=0x%16llx "
|
|
"docid=%"INT64" k=%s",
|
|
//tmp,
|
|
m_links.getLink(i),
|
|
(int32_t)linkeeSiteHash32,
|
|
linkerSiteHash32,
|
|
m_links.getLinkHash64(i),
|
|
*getDocId(),
|
|
KEYSTR(&k,sizeof(key224_t))
|
|
);
|
|
*/
|
|
// store in hash table
|
|
if ( ! dt->addKey ( &k , NULL ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::getUseTimeAxis ( ) {
|
|
if ( m_useTimeAxisValid )
|
|
return m_useTimeAxis;
|
|
if ( m_setFromTitleRec )
|
|
// return from titlerec header
|
|
return m_useTimeAxis;
|
|
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
|
if ( ! cr ) return false;
|
|
m_useTimeAxis = cr->m_useTimeAxis;
|
|
m_useTimeAxisValid = true;
|
|
// sanity check
|
|
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
|
|
// log("build: custom crawls can't use time axis");
|
|
// char *xx=NULL;*xx=0;
|
|
// m_useTimeAxis = false;
|
|
// }
|
|
return m_useTimeAxis;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
|
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {
|
|
|
|
setStatus ( "hashing url colon" );
|
|
|
|
// get the first url
|
|
Url *fu = getFirstUrl();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
|
|
// we do not need diversity bits for this
|
|
hi.m_useCountTable = false;
|
|
//
|
|
// HASH url: term
|
|
//
|
|
// append a "www." for doing url: searches
|
|
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
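	// e.g. a doc crawled as "http://example.com/a.html" gets its url:
	// term hashed on the www form "http://www.example.com/a.html", so
	// url: lookups behave the same whichever form was crawled
	// (illustrative example)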
|
|
hi.m_prefix = "url";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "url2";
|
|
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
|
return false;
|
|
|
|
if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
|
|
hi.m_prefix = "gbtimeurl";
|
|
SafeBuf *tau = getTimeAxisUrl();
|
|
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
|
|
}
|
|
|
|
// use hash of url as score so we can get a # of docs per site est.
|
|
//uint16_t score = hash16 ( fu->getUrl() , fu->getUrlLen() );
|
|
|
|
setStatus ( "hashing inurl colon" );
|
|
|
|
//
|
|
// HASH inurl: terms
|
|
//
|
|
char *s = fu->getUrl ();
|
|
int32_t slen = fu->getUrlLen();
|
|
hi.m_prefix = "inurl";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
|
|
if ( ! hashString ( s,slen, &hi ) ) return false;
|
|
|
|
setStatus ( "hashing ip colon" );
|
|
|
|
//
|
|
// HASH ip:a.b.c.d
|
|
//
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
// copy it to save it
|
|
char ipbuf[64];
|
|
int32_t iplen = sprintf(ipbuf,"%s",iptoa(m_ip));
|
|
//char *tmp = iptoa ( m_ip );
|
|
//int32_t tlen = gbstrlen(tmp);
|
|
hi.m_prefix = "ip";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "ip2";
|
|
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
|
|
|
|
//
|
|
// HASH ip:a.b.c
|
|
//
|
|
char *end1 = ipbuf + iplen - 1;
|
|
while ( *end1 != '.' ) end1--;
|
|
if ( ! hashSingleTerm(ipbuf,end1-ipbuf,&hi) ) return false;
|
|
|
|
|
|
// . sanity check
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// get the boost
|
|
//floatboost1=(float)getBoostFromSiteNumInlinks(m_siteNumInlinks)/100.0
|
|
|
|
|
|
|
|
//
|
|
// HASH the url path plain as if in body
|
|
//
|
|
// get number of components in the path. does not include the filename
|
|
int32_t pathDepth = fu->getPathDepth(false);
|
|
// make it a density thing
|
|
//pathScore /= ( pathDepth + 1 );
|
|
// ensure score positive
|
|
//if ( pathScore <= 0 ) pathScore = 1;
|
|
// get it
|
|
char *path = fu->getPath();
|
|
int32_t plen = fu->getPathLen();
|
|
/*
|
|
// update it
|
|
float boost2 = (float)m_urlPathWeight / 100;
|
|
// again
|
|
float boost3 = 1.0 / ((float)pathDepth + 1.0) ;
|
|
// make a description
|
|
char tmp3[190];
|
|
sprintf( tmp3 ,
|
|
"path score = "
|
|
"siteInlinksBoost * "
|
|
"urlPathWeight * "
|
|
"pathDepthBoost * "
|
|
"256 = %.02f * %.02f * %.02f * 256 " ,
|
|
boost1 ,
|
|
boost2 ,
|
|
boost3 );
|
|
*/
|
|
//int32_t pathScore = (int32_t) (256.0 * boost1 * boost2 * boost3);
|
|
// update parms
|
|
//hi.m_desc = tmp3;
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "url path";
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
|
|
// if parm "index article content only" is true, do not index this!
|
|
//if ( m_eliminateMenus ) skipIndex=true;
|
|
|
|
setStatus ( "hashing gbpathdepth");
|
|
|
|
//
|
|
// HASH gbpathdepth:X
|
|
//
|
|
// xyz.com/foo --> 0
|
|
// xyz.com/foo/ --> 1
|
|
// xyz.com/foo/boo --> 1
|
|
// xyz.com/foo/boo/ --> 2
|
|
char buf[20];
|
|
int32_t blen = sprintf(buf,"%"INT32"",pathDepth);
|
|
// update parms
|
|
hi.m_prefix = "gbpathdepth";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// hash gbpathdepth:X
|
|
if ( ! hashString ( buf,blen,&hi) ) return false;
|
|
|
|
|
|
|
|
//
|
|
// HASH gbhopcount:X
|
|
//
|
|
setStatus ( "hashing gbhopcount");
|
|
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
|
|
blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount);
|
|
// update parms
|
|
hi.m_prefix = "gbhopcount";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// hash gbpathdepth:X
|
|
if ( ! hashString ( buf,blen,&hi) ) return false;
|
|
|
|
|
|
|
|
setStatus ( "hashing gbhasfilename");
|
|
|
|
//
|
|
// HASH gbhasfilename:0 or :1
|
|
//
|
|
char *hm;
|
|
if ( fu->getFilenameLen() ) hm = "1";
|
|
else hm = "0";
|
|
// update parms
|
|
hi.m_prefix = "gbhasfilename";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
|
|
// hash gbhasfilename:[0|1]
|
|
if ( ! hashString ( hm,1,&hi) ) return false;
|
|
|
|
setStatus ( "hashing gbiscgi");
|
|
|
|
//
|
|
// HASH gbiscgi:0 or gbiscgi:1
|
|
//
|
|
if ( fu->isCgi() ) hm = "1";
|
|
else hm = "0";
|
|
hi.m_prefix = "gbiscgi";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
|
|
if ( ! hashString ( hm,1,&hi) ) return false;
|
|
|
|
|
|
setStatus ( "hashing gbext");
|
|
|
|
//
|
|
// HASH gbhasext:0 or gbhasext:1 (does it have a fileextension)
|
|
//
|
|
// . xyz.com/foo --> gbhasext:0
|
|
// . xyz.com/foo.xxx --> gbhasext:1
|
|
if ( fu->getExtensionLen() ) hm = "1";
|
|
else hm = "0";
|
|
hi.m_prefix = "gbhasext";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
|
|
if ( ! hashString ( hm,1,&hi) ) return false;
|
|
|
|
//
|
|
// HASH the url's mid domain and host as they were in the body
|
|
//
|
|
setStatus ( "hashing site colon terms");
|
|
|
|
//
|
|
// HASH the site: terms
|
|
//
|
|
// . hash the pieces of the site
|
|
// . http://host.domain.com/~harry/level1/ should hash to:
|
|
// . site:host.domain.com/~harry/level1/
|
|
// . site:host.domain.com/~harry/
|
|
// . site:host.domain.com/~
|
|
// . site:host.domain.com/
|
|
// . site:domain.com/~harry/level1/
|
|
// . site:domain.com/~harry/
|
|
// . site:domain.com/~
|
|
// . site:domain.com/
|
|
// ensure score is positive
|
|
//if ( siteScore <= 0 ) siteScore = 1;
|
|
// get the hostname (later we set to domain name)
|
|
char *name = fu->getHost();
|
|
int32_t nameLen = fu->getHostLen();
|
|
// . point to the end of the whole thing, including port field
|
|
// . add in port, if non default
|
|
char *end3 = name + fu->getHostLen() + fu->getPortLen();
|
|
loop:
|
|
// now loop through the sub paths of this url's path
|
|
for ( int32_t i = 0 ; ; i++ ) {
|
|
// get the subpath
|
|
int32_t len = fu->getSubPathLen(i);
|
|
// FIX: always include first /
|
|
if ( len == 0 ) len = 1;
|
|
// write http://www.whatever.com/path into buf
|
|
char buf[MAX_URL_LEN+10];
|
|
char *p = buf;
|
|
gbmemcpy ( p , "http://" , 7 ); p += 7;
|
|
gbmemcpy ( p , name , nameLen ); p += nameLen;
|
|
gbmemcpy ( p , fu->getPath() , len ); p += len;
|
|
*p = '\0';
|
|
// update hash parms
|
|
hi.m_prefix = "site";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "site2";
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
// this returns false on failure
|
|
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
|
|
// break when we hash the root path
|
|
if ( len <=1 ) break;
|
|
}
|
|
// now keep moving the period over in the hostname
|
|
while ( name < end3 && *name != '.' ) { name++; nameLen--; }
|
|
// skip the '.'
|
|
name++; nameLen--;
|
|
	// if there is hostname left, hash the next (shorter) piece
|
|
if ( name < end3 ) goto loop;
|
|
|
|
setStatus ( "hashing ext colon");
|
|
|
|
//
|
|
// HASH ext: term
|
|
//
|
|
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
|
|
char *ext = fu->getExtension();
|
|
int32_t elen = fu->getExtensionLen();
|
|
// update hash parms
|
|
hi.m_prefix = "ext";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "ext2";
|
|
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
|
|
|
|
|
|
setStatus ( "hashing gbdocid" );
|
|
hi.m_prefix = "gbdocid";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
|
|
char buf2[32];
|
|
sprintf(buf2,"%"UINT64"",(m_docId) );
|
|
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
|
|
|
|
// if indexing a json diffbot object, index
|
|
// gbparenturl:xxxx of the original url from which the json was
|
|
// datamined. we use this so we can act as a diffbot json cache.
|
|
if ( m_isDiffbotJSONObject ) {
|
|
setStatus ( "hashing gbparenturl term");
|
|
char *p = fu->getUrl() + fu->getUrlLen() - 1;
|
|
// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
|
|
		// stop at the start of the url too in case no '-' is found
		for ( ; p > fu->getUrl() && *p != '-' ; p-- );
|
|
// set up the hashing parms
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "diffbot parent url";
|
|
// append a "www." as part of normalization
|
|
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
|
|
hi.m_prefix = "gbparenturl";
|
|
// no longer, we just index json now
|
|
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
|
|
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
|
return false;
|
|
}
|
|
|
|
//if ( isStatusDoc ) return true;
|
|
|
|
setStatus ( "hashing SiteGetter terms");
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->m_plen <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
|
|
char *host = fu->getHost ();
|
|
int32_t hlen = fu->getHostLen ();
|
|
|
|
// tags from here out
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_shardByTermId = true;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
hi.m_shardByTermId = false;
|
|
|
|
setStatus ( "hashing urlhashdiv10 etc");
|
|
|
|
//
|
|
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
|
|
//
|
|
// this is for proving how many docs are in the index
|
|
uint32_t h = hash32 ( s , slen );
|
|
blen = sprintf(buf,"%"UINT32"",h);
|
|
hi.m_prefix = "urlhash";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
blen = sprintf(buf,"%"UINT32"",h/10);
|
|
// update hashing parms
|
|
hi.m_prefix = "urlhashdiv10";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
blen = sprintf(buf,"%"UINT32"",h/100);
|
|
// update hashing parms
|
|
hi.m_prefix = "urlhashdiv100";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
|
|
|
|
setStatus ( "hashing url mid domain");
|
|
// the final score
|
|
//int32_t plainScore = (int32_t)(256.0 * boost1 * boost2 * fw);
|
|
// update parms
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "middle domain";//tmp3;
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
// if parm "index article content only" is true, do not index this!
|
|
//if ( m_eliminateMenus ) plainScore = 0;
|
|
//char *mid = fu->getMidDomain ();
|
|
//int32_t mlen = fu->getMidDomainLen();
|
|
//hi.m_desc = "url mid dom";
|
|
//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
|
|
//hi.m_desc = "url host";
|
|
if ( ! hashString ( host,hlen,&hi)) return false;
|
|
|
|
|
|
setStatus ( "hashing url path");
|
|
|
|
// hash the path plain
|
|
if ( ! hashString (path,plen,&hi) ) return false;
|
|
|
|
return true;
|
|
}
|
|
/////////////
|
|
//
|
|
// CHROME DETECTION
|
|
//
|
|
// we search for these terms we hash here in getSectionsWithDupStats()
|
|
// so we can remove chrome.
|
|
//
|
|
/////////////
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
|
bool XmlDoc::hashSections ( HashTableX *tt ) {
|
|
|
|
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( m_contentType == CT_HTML ) return true;
|
|
|
|
setStatus ( "hashing sections" );
|
|
|
|
if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
Sections *ss = &m_sections;
|
|
|
|
int32_t siteHash32 = *getSiteHash32();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
// the prefix is custom set for each section below
|
|
//hi.m_prefix = "gbsectionhash";
|
|
// put all guys with the same xpath/site on the same shard
|
|
hi.m_shardByTermId = true;
|
|
|
|
Section *si = ss->m_rootSection;
|
|
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . skip if empty
|
|
// . this needs to be like 48 bits because 32 bits is not
|
|
		// big enough!
|
|
//uint64_t ih64 = si->m_sentenceContentHash64;
|
|
|
|
// don't bother with the section if it doesn't have this set
|
|
// because this eliminates parent dupage to reduce amount
|
|
// of gbxpathsitehash123456 terms we index
|
|
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
|
|
continue;
|
|
|
|
// skip if sentence, only hash tags now i guess for diffbot
|
|
//if ( si->m_sentenceContentHash64 )
|
|
// continue;
|
|
|
|
// get hash of sentences this tag contains indirectly
|
|
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
|
|
if ( ! val32 )
|
|
continue;
|
|
|
|
// the termid is now the xpath and the sitehash, the "value"
|
|
// will be the hash of the innerhtml, m_sentenceContentHash64
|
|
uint64_t thash64 = (uint32_t)si->m_turkTagHash32;
|
|
// combine with site hash
|
|
thash64 ^= (uint32_t)siteHash32;
|
|
|
|
// this is a special hack we need to make it the
|
|
// hash of the inner html
|
|
//hi.m_sentHash32 = (uint32_t)ih64;
|
|
|
|
// . get section xpath & site hash
|
|
// . now if user does a gbfacets:gbxpathsitehashxxxxxx query
|
|
// he will get back a histogram of the values it hash,
|
|
// which are 32-bit hashes of the innerhtml for that
|
|
// xpath on this site.
|
|
char prefix[96];
|
|
sprintf(prefix,"gbxpathsitehash%"UINT64"",thash64);
|
|
|
|
// like a normal key but we store "ih64" the innerHTML hash
|
|
// of the section into the key instead of wordbits etc.
|
|
// similar to hashNumber*() functions.
|
|
//if ( ! hashSectionTerm ( term , &hi, (uint32_t)ih64 ) )
|
|
// return false;
|
|
|
|
// i guess use facets
|
|
hi.m_prefix = prefix;
|
|
|
|
// we already have the hash of the inner html of the section
|
|
hashFacet2 ( "gbfacetstr",
|
|
prefix,
|
|
//(int32_t)(uint32_t)ih64 ,
|
|
val32,
|
|
hi.m_tt ,
|
|
// shard by termId?
|
|
true );
|
|
}
|
|
|
|
return true;
|
|
}
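// How the gbxpathsitehash terms above get used: the term prefix encodes the
// xpath tag hash xor'd with the site hash, and the facet value stored with
// it is a 32-bit hash of the sentences contained under that xpath. A
// gbfacets:gbxpathsitehash<NNN> query (where <NNN> is whatever thash64 came
// out to) then returns a histogram of those innerhtml hashes across the
// site, which is what getSectionsWithDupStats() uses to spot xpaths whose
// content repeats on many pages, i.e. chrome.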
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
|
bool hashAnomalies ,
|
|
bool hashNonAnomalies ) {
|
|
|
|
// do not index ANY of the body if it is NOT a permalink and
|
|
// "menu elimination" technology is enabled.
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing link text" );
|
|
|
|
// . now it must have an rss item to be indexed in all its glory
|
|
// . but if it tells us it has an rss feed, toss it and wait for
|
|
// the feed.... BUT sometimes the rss feed outlink is 404!
|
|
// . NO, now we discard with ENORSS at Msg16.cpp
|
|
//if ( ! *getHasRSSItem() && m_eliminateMenus ) return true;
|
|
|
|
// sanity check
|
|
if ( hashAnomalies == hashNonAnomalies ) { char *xx = NULL; *xx =0; }
|
|
// display this note in page parser
|
|
char *note = "hashing incoming link text";
|
|
// sanity
|
|
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
|
|
// . finally hash in the linkText terms from the LinkInfo
|
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
|
// somewhere in TitleRec
|
|
// . otherwise, we generated it from merging a bunch of LinkInfos
|
|
// and storing them in this new TitleRec
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
LinkInfo **pinfo2 = getLinkInfo2 ();
|
|
LinkInfo *info2 = *pinfo2;
|
|
LinkInfo *linkInfo = info1;
|
|
// pick the one with the most inlinks with valid incoming link text,
|
|
// otherwise, we end up with major bias when we stop importing
|
|
// link text from another cluster, because some pages will have
|
|
// twice as many links as they should!
|
|
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
|
|
linkInfo = info2;
|
|
note = "hashing incoming link text from other cluster";
|
|
}
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// brought the following code in from LinkInfo.cpp
|
|
//
|
|
|
|
int32_t noteLen = 0;
|
|
if ( note ) noteLen = gbstrlen ( note );
|
|
// count "external" inlinkers
|
|
int32_t ecount = 0;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_useSynonyms = true;
|
|
// hashstring should update this like a cursor.
|
|
hi.m_startDist = 0;
|
|
|
|
// loop through the link texts and hash them
|
|
for ( Inlink *k = NULL; (k = linkInfo->getNextInlink(k)) ; ) {
|
|
// is this inlinker internal?
|
|
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// count external inlinks we have for indexing gbmininlinks:
|
|
if ( ! internal ) ecount++;
|
|
// get score
|
|
//int64_t baseScore = k->m_baseScore;
|
|
// get the weight
|
|
//int64_t ww ;
|
|
//if ( internal ) ww = m_internalLinkTextWeight;
|
|
//else ww = m_externalLinkTextWeight;
|
|
// modify the baseScore
|
|
//int64_t final = (baseScore * ww) / 100LL;
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// sanity check
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 2 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
// if it is anomalous, set this, we don't
|
|
//if ( k->m_isAnomaly )
|
|
// hi.m_hashIffNotUnique = true;
|
|
//hi.m_baseScore = final;
|
|
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
|
|
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
|
|
// store the siterank of the linker in this and use that
|
|
// to set the multiplier M bits i guess
|
|
hi.m_linkerSiteRank = k->m_siteRank;
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
k->m_wordPosStart = m_dist; // hi.m_startDist;
|
|
// . hash the link text into the table
|
|
// . returns false and sets g_errno on error
|
|
// . we still have the score punish from # of words though!
|
|
// . for inlink texts that are the same it should accumulate
|
|
// and use the reserved bits as a multiplier i guess...
|
|
if ( ! hashString ( txt,tlen,&hi) ) return false;
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
//k->m_wordPosEnd = hi.m_startDist;
|
|
// spread it out
|
|
hi.m_startDist += 20;
|
|
}
|
|
|
|
/*
|
|
// . hash gbkeyword:numinlinks where score is # of inlinks from 1-255
|
|
// . do not hash gbkeyword:numinlinks if we don't got any
|
|
if ( ecount <= 0 ) return true;
|
|
// limit it since our score can't be more than 255 (8-bits)
|
|
//if ( ecount > 255 ) ecount = 255;
|
|
// convert our 32 bit score to 8-bits so we trick it!
|
|
//int32_t score = score8to32 ( (uint8_t)ecount );
|
|
// watch out for wrap
|
|
//if ( score < 0 ) score = 0x7fffffff;
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "gbkeyword";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// for terms where word position/density/diversity is irrelevant,
|
|
// we can store this value...
|
|
hi.m_fakeValue = ecount;
|
|
// hash gbkeyword:numinlinks term
|
|
if ( ! hashString ( "numinlinks",10,&hi ) )return false;
|
|
*/
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
|
|
|
// seems like iffUnique is off, so do this
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing neighborhoods" );
|
|
|
|
//g_tt = table;
|
|
|
|
// . now we also hash the neighborhood text of each inlink, that is,
|
|
// the text surrounding the inlink text.
|
|
// . this is also destructive in that it will remove termids that
|
|
// were not in the document being linked to in order to save
|
|
// space in the titleRec
|
|
// . now we only do one or the other, not both
|
|
LinkInfo *info1 = getLinkInfo1 ();
|
|
LinkInfo **pinfo2 = getLinkInfo2 ();
|
|
LinkInfo *info2 = *pinfo2;
|
|
LinkInfo *linkInfo = info1;
|
|
|
|
char *note = " (internal cluster)";
|
|
// pick the one with the most inlinks with valid incoming link text
|
|
// otherwise, we end up with major bias when we stop importing
|
|
// link text from another cluster, because some pages will have
|
|
// twice as many links as they should!
|
|
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
|
|
linkInfo = info2;
|
|
note = " (external cluster)";
|
|
}
|
|
|
|
// loop over all the Inlinks
|
|
Inlink *k = NULL;
|
|
loop:
|
|
// get the next inlink
|
|
k = linkInfo->getNextInlink( k );
|
|
// break if done
|
|
if ( ! k ) return true;
|
|
|
|
// skip if internal, they often have the same neighborhood text
|
|
if ( (k->m_ip&0x0000ffff)==(m_ip&0x0000ffff) ) goto loop;
|
|
|
|
// get the left and right texts and hash both
|
|
char *s = k->getSurroundingText();
|
|
if ( ! s || k->size_surroundingText <= 1 ) goto loop;
|
|
|
|
//int32_t inlinks = *getSiteNumInlinks();
|
|
|
|
// HACK: to avoid having to pass a flag to TermTable, then to
|
|
// Words::hash(), Phrases::hash(), etc. just flip a bit in the
|
|
// table to make it not add anything unless it is already in there.
|
|
tt->m_addIffNotUnique = true;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "surrounding text";
|
|
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
|
|
|
|
// . hash that
|
|
// . this returns false and sets g_errno on error
|
|
int32_t len = k->size_surroundingText - 1;
|
|
if ( ! hashString ( s, len, &hi ) ) return false;
|
|
|
|
// now turn it back off
|
|
tt->m_addIffNotUnique = false;
|
|
|
|
// get the next Inlink
|
|
goto loop;
|
|
|
|
return true;
|
|
}
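// Note on the m_addIffNotUnique trick above: flipping that bit on the table
// makes hashString() bump only termids that the document itself already put
// in the table, so an inlink's surrounding text can reinforce terms the page
// contains but never introduces brand-new termids, keeping the termlist from
// growing with off-page vocabulary.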
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing rss info" );
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . finally hash in the linkText terms from the LinkInfo
|
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
|
// somewhere in TitleRec
|
|
// . otherwise, we generated it from merging a bunch of LinkInfos
|
|
// and storing them in this new TitleRec
|
|
LinkInfo *linkInfo = getLinkInfo1();
|
|
|
|
// get the xml of the first rss/atom item/entry referencing this url
|
|
Xml xml;
|
|
// . returns NULL if no item xml
|
|
// . this could also be a "channel" blurb now, so we index channel pgs
|
|
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;
|
|
|
|
if ( xml.isEmpty() )
|
|
// hash gbrss:0
|
|
return hashRSSTerm ( tt , false );
|
|
|
|
// parser info msg
|
|
//if ( m_pbuf ) {
|
|
// m_pbuf->safePrintf(
|
|
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
|
|
//}
|
|
|
|
// hash nothing if not a permalink and eliminating "menus"
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
// . IMPORTANT: you must be using the new link algo, so turn it on
|
|
// in the spider controls. this allows us to include LinkTexts from
|
|
// the same IP in our LinkInfo class in the TitleRec.
|
|
// . is it rss or atom? both use title tag, so doesn't matter
|
|
// . get the title tag
|
|
bool isHtmlEncoded;
|
|
int32_t titleLen;
|
|
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
|
|
char c = 0;
|
|
|
|
// sanity check
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
bool hashIffUnique = true;
|
|
// but if we had no content because we were an mp3 or whatever,
|
|
// do not worry about avoiding double hashing
|
|
if ( size_utf8Content <= 0 ) hashIffUnique = false;
|
|
|
|
// decode it?
|
|
// should we decode it? if they don't use [CDATA[]] then we should
|
|
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
|
|
// but most other feeds do not use it
|
|
if ( isHtmlEncoded && title && titleLen > 0 ) {
|
|
		// it is html encoded so that the <'s are encoded to &lt;'s so
|
|
// we must decode them back. this could turn latin1 into utf8
|
|
// though? no, because the &'s should have been encoded, too!
|
|
int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
|
|
// make sure we don't overflow the buffer
|
|
if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
|
|
// reassign the length
|
|
titleLen = newLen;
|
|
// NULL terminate it
|
|
c = title[titleLen];
|
|
title[titleLen] = '\0';
|
|
}
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
hi.m_desc = "rss title";
|
|
|
|
// . hash the rss title
|
|
// . only hash the terms if they are unique to stay balanced with docs
|
|
// that are not referenced by an rss feed
|
|
bool status = hashString ( title,titleLen,&hi ) ;
|
|
// pop the end back just in case
|
|
if ( c ) title[titleLen] = c;
|
|
// return false with g_errno set on error
|
|
if ( ! status ) return false;
|
|
|
|
// get the rss description
|
|
int32_t descLen;
|
|
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );
|
|
|
|
	// for advanced hashing
|
|
Xml xml2;
|
|
Words w;
|
|
//Scores scores;
|
|
Words *wordsPtr = NULL;
|
|
//Scores *scoresPtr = NULL;
|
|
c = 0;
|
|
// should we decode it? if they don't use [CDATA[]] then we should
|
|
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
|
|
// but most other feeds do not use it
|
|
if ( isHtmlEncoded && desc && descLen > 0 ) {
|
|
		// it is html encoded so that the <'s are encoded to &lt;'s so
|
|
// we must decode them back. this could turn latin1 into utf8
|
|
// though? no, because the &'s should have been encoded, too!
|
|
int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
|
|
// make sure we don't overflow the buffer
|
|
if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
|
|
// reassign the length
|
|
descLen = newLen;
|
|
}
|
|
|
|
// NULL terminate it
|
|
if ( desc ) {
|
|
c = desc[descLen];
|
|
desc[descLen] = '\0';
|
|
// set the xml class from the decoded html
|
|
if ( ! xml2.set ( desc ,
|
|
descLen ,
|
|
false , // own data?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
true , // set parents?
|
|
m_niceness ,
|
|
*ct ) )
|
|
return false;
|
|
// set the words class from the xml, returns false and sets
|
|
// g_errno on error
|
|
if ( ! w.set ( &xml2 ,
|
|
true , // compute Ids
|
|
true ))// has html ents? (WERE encoded twice!)
|
|
|
|
return false;
|
|
// pass it in to TermTable::hash() below
|
|
wordsPtr = &w;
|
|
}
|
|
|
|
// update hash parms
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "rss body";
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
// . hash the rss/atom description
|
|
// . only hash the terms if they are unique to stay balanced with docs
|
|
// that are not referenced by an rss feed
|
|
status = hashString ( desc, descLen, &hi );
|
|
// pop the end back just in case
|
|
if ( c ) desc[descLen] = c;
|
|
// return false with g_errno set
|
|
if ( ! status ) return false;
|
|
|
|
// hash gbrss:1
|
|
if ( ! hashRSSTerm ( tt , true ) ) return false;
|
|
|
|
// parser info msg
|
|
//if ( m_pbuf ) {
|
|
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
|
|
// "</b><br><br>");
|
|
//}
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
	// hash gbrss:0 or gbrss:1
	char *value;
	if ( inRSS ) value = "1";
	else value = "0";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_prefix = "gbinrss";
	hi.m_hashGroup = HASHGROUP_INTAG;

	// returns false and sets g_errno on error
	if ( ! hashString(value,1,&hi ) ) return false;

	// hash gbisrss:1 if we are an rss page ourselves
	if ( *getIsRSS() ) value = "1";
	else value = "0";
	// update hash parms
	hi.m_prefix = "gbisrss";
	// returns false and sets g_errno on error
	if ( ! hashString(value,1,&hi) ) return false;
	return true;
}
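// Net effect: every doc indexes both gbinrss:0|1 (was this doc referenced by
// an rss/atom item?) and gbisrss:0|1 (is this doc itself an rss feed?), so
// either condition can be required or negated at query time.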
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
// . the new Weights class hashes title as part of body now with a high weight
|
|
// given by "titleWeight" parm
|
|
bool XmlDoc::hashTitle ( HashTableX *tt ) {
|
|
// sanity check
|
|
if ( m_hashedTitle ) { char *xx=NULL ; *xx=0; }
|
|
|
|
setStatus ( "hashing title" );
|
|
|
|
// this has been called, note it
|
|
m_hashedTitle = true;
|
|
|
|
nodeid_t *tids = m_words.m_tagIds;
|
|
int32_t nw = m_words.m_numWords;
|
|
|
|
// find the first <title> tag in the doc
|
|
int32_t i ;
|
|
for ( i = 0 ; i < nw ; i++ )
|
|
if ( tids[i] == TAG_TITLE ) break;
|
|
|
|
// return true if no title
|
|
if ( i >= nw ) return true;
|
|
|
|
// skip tag
|
|
i++;
|
|
// mark it as start of title
|
|
int32_t a = i;
|
|
|
|
// limit end
|
|
int32_t max = i + 40;
|
|
if ( max > nw ) max = nw;
|
|
|
|
	// find end of title, either another <title> or a </title> tag
|
|
for ( ; i < max ; i++ )
|
|
if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break;
|
|
|
|
// ends on a <title> tag?
|
|
if ( i == a ) return true;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "title";
|
|
hi.m_useSynonyms= true;
|
|
|
|
// the new posdb info
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
|
|
// . hash it up! use 0 for the date
|
|
// . use XmlDoc::hashWords()
|
|
// . use "title" as both prefix and description
|
|
//if ( ! hashWords (a,i,&hi ) ) return false;
|
|
|
|
char **wptrs = m_words.getWords();
|
|
int32_t *wlens = m_words.getWordLens();
|
|
char *title = wptrs[a];
|
|
char *titleEnd = wptrs[i-1] + wlens[i-1];
|
|
int32_t titleLen = titleEnd - title;
|
|
if ( ! hashString ( title, titleLen, &hi) ) return false;
|
|
|
|
// now hash as without title: prefix
|
|
hi.m_prefix = NULL;
|
|
if ( ! hashString ( title, titleLen, &hi) ) return false;
|
|
|
|
return true;
|
|
}
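// Summary of the above: the text of the first <title> tag, capped at 40
// words, is hashed twice -- once with the "title:" prefix so title:foo
// queries match, and once with no prefix but still under HASHGROUP_TITLE so
// plain queries score title matches with the title weight.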
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
|
|
|
|
// do not index ANY of the body if it is NOT a permalink and
|
|
// "menu elimination" technology is enabled.
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing body" );
|
|
|
|
// if more than X% of words are spammed to some degree, index all
|
|
// words with a minimum score
|
|
//int64_t x[] = {30,40,50,70,90};
|
|
//int64_t y[] = {6,8,10,20,30};
|
|
//int32_t mp = getY ( *getSiteNumInlinks8() , x , y , 5 );
|
|
|
|
//int32_t nw = m_words.getNumWords();
|
|
|
|
// record this
|
|
m_bodyStartPos = m_dist;
|
|
m_bodyStartPosValid = true;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "body";
|
|
hi.m_useSynonyms= true;
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
// use NULL for the prefix
|
|
return hashWords (&hi );
|
|
}
|
|
|
|
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta keywords" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
|
|
int32_t mklen;
|
|
char *mk = getMetaKeywords( &mklen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta keywords";
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// call XmlDoc::hashString
|
|
return hashString ( mk , mklen , &hi);
|
|
}
|
|
|
|
|
|
// . hash the meta summary, description and keyword tags
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
|
|
|
|
// sanity check
|
|
if ( m_hashedMetas ) { char *xx=NULL ; *xx=0; }
|
|
|
|
// this has been called, note it
|
|
m_hashedMetas = true;
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta summary" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
|
|
int32_t mslen;
|
|
char *ms = getMetaSummary ( &mslen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
	// update hashing parms
|
|
hi.m_desc = "meta summary";
|
|
// hash it
|
|
if ( ! hashString ( ms , mslen , &hi )) return false;
|
|
|
|
|
|
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription ( &mdlen );
|
|
|
|
	// update hashing parms
|
|
hi.m_desc = "meta desc";
|
|
// . TODO: only hash if unique????? set a flag on ht then i guess
|
|
if ( ! hashString ( md , mdlen , &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
//bool XmlDoc::linksToGigablast ( ) {
|
|
// // check m_links for a link to gigablast.com or www.gigablast.com
|
|
// return m_links.linksToGigablast();
|
|
//}
|
|
|
|
bool XmlDoc::searchboxToGigablast ( ) {
|
|
// . they may have a form variable like
|
|
// . <form method=get action=http://www.gigablast.com/cgi/0.cgi name=f>
|
|
return m_xml.hasGigablastForm();
|
|
}
|
|
|
|
// . bring back support for dmoz integration
|
|
// . when clicking on a "search within this category" it does a gbpdcat:<catid>
|
|
// search to capture all pages that have that dmoz category as one of their
|
|
// parent topics
|
|
bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
|
|
|
|
getDmozTitles();
|
|
|
|
|
|
char *titlePtr = ptr_dmozTitles;
|
|
char *sumPtr = ptr_dmozSumms;
|
|
//char *anchPtr = ptr_dmozAnchors;
|
|
|
|
char buf[128];
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
|
|
int32_t *catIds = (int32_t *)ptr_catIds;
|
|
int32_t numCatIds = size_catIds / 4;
|
|
// go through the catIds and hash them
|
|
for (int32_t i = 0; i < numCatIds; i++) {
|
|
// write the catid as a string
|
|
sprintf(buf, "%"UINT32"", (uint32_t)catIds[i]);
|
|
// term prefix for hashing
|
|
hi.m_prefix = "gbcatid";
|
|
// hash it
|
|
hashString ( buf , gbstrlen(buf) , &hi );
|
|
// we also want to hash the parents
|
|
int32_t currCatId = catIds[i];
|
|
int32_t currParentId = catIds[i];
|
|
int32_t currCatIndex;
|
|
// loop to the Top, Top = 1
|
|
while ( currCatId > 1 ) {
|
|
// hash the parent
|
|
sprintf(buf, "%"UINT32"", (uint32_t)currParentId);
|
|
hi.m_prefix = "gbpcatid";
|
|
hashString ( buf , gbstrlen(buf), &hi );
|
|
// next cat
|
|
currCatId = currParentId;
|
|
// get the index for this cat
|
|
currCatIndex = g_categories->getIndexFromId(currCatId);
|
|
if ( currCatIndex <= 0 ) break;
|
|
// get the parent for this cat
|
|
currParentId =
|
|
g_categories->m_cats[currCatIndex].m_parentid;
|
|
}
|
|
|
|
// do not hash titles or summaries if "index article content
|
|
// only" parm is on
|
|
//if ( tr->eliminateMenus() ) continue;
|
|
|
|
// hash dmoz title
|
|
hi.m_prefix = NULL;
|
|
// call this DMOZ title as regular title i guess
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
// hash the DMOZ title
|
|
hashString ( titlePtr , gbstrlen(titlePtr), &hi );
|
|
// next title
|
|
titlePtr += gbstrlen(titlePtr) + 1;
|
|
|
|
// hash DMOZ summary
|
|
hi.m_prefix = NULL;
|
|
// call this DMOZ summary as body i guess
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
// hash the DMOZ summary
|
|
hashString ( sumPtr , gbstrlen(sumPtr), &hi );
|
|
// next summary
|
|
sumPtr += gbstrlen(sumPtr) + 1;
|
|
}
|
|
|
|
int32_t numIndCatIds = size_indCatIds / 4;
|
|
int32_t *indCatIds = (int32_t *)ptr_indCatIds;
|
|
// go through the INDIRECT catIds and hash them
|
|
for (int32_t i = 0 ; i < numIndCatIds; i++) {
|
|
|
|
// write the catid as a string
|
|
sprintf(buf, "%"UINT32"", (uint32_t)indCatIds[i]);
|
|
// use prefix
|
|
hi.m_prefix = "gbicatid";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// hash it
|
|
hashString ( buf , gbstrlen(buf), &hi );
|
|
|
|
// we also want to hash the parents
|
|
int32_t currCatId = indCatIds[i];
|
|
int32_t currParentId = indCatIds[i];
|
|
int32_t currCatIndex;
|
|
// loop to the Top, Top = 1
|
|
while (currCatId > 1) {
|
|
// hash the parent
|
|
sprintf(buf, "%"UINT32"", (uint32_t)currParentId);
|
|
// new prefix
|
|
hi.m_prefix = "gbipcatid";
|
|
// hash it
|
|
hashString ( buf , gbstrlen(buf), &hi );
|
|
// next cat
|
|
currCatId = currParentId;
|
|
// get the index for this cat
|
|
currCatIndex = g_categories->getIndexFromId(currCatId);
|
|
if ( currCatIndex <= 0 ) break;
|
|
// get the parent for this cat
|
|
currParentId =
|
|
g_categories->m_cats[currCatIndex].m_parentid;
|
|
}
|
|
}
|
|
return true;
|
|
}
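// Term layout produced above, with <id> a decimal dmoz category id:
//   gbcatid:<id>   - category the page is directly listed under
//   gbpcatid:<id>  - that category plus each ancestor up to and including Top
//   gbicatid:<id>  - category the page is only indirectly listed under
//   gbipcatid:<id> - that indirect category plus each of its ancestors
// so a "search within this category" style query on the parent-category term
// matches pages filed anywhere beneath that category.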
|
|
|
|
bool XmlDoc::hashLanguage ( HashTableX *tt ) {

	setStatus ( "hashing language" );

	int32_t langId = (int32_t)*getLangId();

	char s[32]; // numeric langid
	int32_t slen = sprintf(s, "%"INT32"", langId );

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gblang";

	if ( ! hashString ( s, slen, &hi ) ) return false;

	// try lang abbreviation
	sprintf(s , "%s ", getLangAbbr(langId) );
	// go back to broken way to try to fix parsing consistency bug
	// by adding hashLanguageString() function below
	//sprintf(s , "%s ", getLangAbbr(langId) );
	if ( ! hashString ( s, slen, &hi ) ) return false;

	return true;
}
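// Careful: the second hashString() above reuses slen from the numeric
// sprintf, so only that many leading bytes of the abbreviation string are
// actually hashed. Per the comment above, this is left broken on purpose to
// stay parsing-consistent with docs already in the index;
// hashLanguageString() below is the corrected version that hashes the full
// gblang:<abbr> term.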
|
|
|
|
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {

	setStatus ( "hashing language string" );

	int32_t langId = (int32_t)*getLangId();

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gblang";

	// try lang abbreviation
	char s[32];
	int32_t slen = sprintf(s , "%s ", getLangAbbr(langId) );
	// go back to broken way to try to fix parsing consistency bug
	if ( ! hashString ( s, slen, &hi ) ) return false;

	return true;
}
|
|
|
|
bool XmlDoc::hashCountry ( HashTableX *tt ) {

	setStatus ( "hashing country" );

	//uint16_t *cids = getCountryIds();
	//if ( ! cids ) return true;
	//if ( cids == (uint16_t *)-1 ) return false;
	uint16_t *cid = getCountryId();
	if ( ! cid || cid == (uint16_t *)-1 ) return false;

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbcountry";

	for ( int32_t i = 0 ; i < 1 ; i++ ) {
		// get the ith country id
		//int32_t cid = cids[i];
		// convert it
		char buf[32];
		int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
		// hash it
		if ( ! hashString ( buf, blen, &hi ) ) return false;
	}
	// all done
	return true;
}
|
|
|
|
bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {

	setStatus ( "hashing site num inlinks" );

	char s[32];
	int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() );

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbsitenuminlinks";

	// hack test
	// slen = sprintf(s,"%"UINT32"",
	//	((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
	// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);

	return hashString ( s, slen, &hi );
}
|
|
|
|
bool XmlDoc::hashCharset ( HashTableX *tt ) {

	setStatus ( "hashing charset" );

	char s[128]; // charset string
	int32_t slen;

	// hash the charset as a string
	if ( ! get_charset_str(*getCharset()))
		slen = sprintf(s, "unknown");
	else
		slen = sprintf(s, "%s", get_charset_str(*getCharset()));

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbcharset";

	if ( ! hashString ( s,slen, &hi ) ) return false;

	// hash charset as a number
	slen = sprintf(s, "%d", *getCharset());

	return hashString ( s,slen, &hi ) ;
}
|
|
|
|
|
|
// . only hash certain tags (single byte scores and ST_COMMENT)
|
|
// . do not hash clocks, ST_SITE, ST_COMMENT
|
|
// . term = gbtag:blog1 score=0-100
|
|
// . term = gbtag:blog2 score=0-100
|
|
// . term = gbtag:english1 score=0-100
|
|
// . term = gbtag:pagerank1 score=0-100, etc. ...
|
|
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
|
|
// . later we can support query like gbtag:english1>30
|
|
bool XmlDoc::hashTagRec ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing tag rec" );
|
|
|
|
//char *field = "gbtag:";
|
|
//int32_t fieldlen = gbstrlen(field);
|
|
//bool retval = true;
|
|
|
|
// . this tag rec does not have the ST_SITE tag in it to save space
|
|
// . it does not have clocks either?
|
|
TagRec *gr = getTagRec();
|
|
|
|
// count occurence of each tag id
|
|
//int16_t count [ LAST_TAG ];
|
|
//memset ( count , 0 , 2 * LAST_TAG );
|
|
|
|
// loop over all tags in the title rec
|
|
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get id
|
|
int32_t type = tag->m_type;
|
|
// skip tags we are not supposed to index, like
|
|
// ST_CLOCK, etc. or anything with a dataSize not 1
|
|
if ( ! tag->isIndexable() ) continue;
|
|
// hash these metas below
|
|
//if ( type == ST_META ) continue;
|
|
//if ( tag->isType("meta") ) continue;
|
|
// only single byters. this should have been covered by the
|
|
// isIndexable() function.
|
|
//if ( tag->getTagDataSize() != 1 ) continue;
|
|
// get the name
|
|
char *str = getTagStrFromType ( type );
|
|
// get data size
|
|
//uint8_t *data = (uint8_t *)tag->getTagData();
|
|
// make it a string
|
|
//char dataStr[6];
|
|
//sprintf ( dataStr , "%"INT32"",(int32_t)*data );
|
|
// skip if has non numbers
|
|
//bool num = true;
|
|
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
|
|
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
|
|
// skip if it has more than just digits, we are not indexing
|
|
// strings at this point
|
|
//if ( ! num ) continue;
|
|
// point to it, should be a NULL terminated string
|
|
char *dataStr = tag->getTagData();
|
|
// skip if number is too big
|
|
//int32_t val = atol ( dataStr );
|
|
// boost by one so we can index "0" score
|
|
//val++;
|
|
// we really only want to index scores from 0-255
|
|
//if ( val > 255 ) continue;
|
|
// no negatives
|
|
//if ( val <= 0 ) continue;
|
|
// count occurence
|
|
//count [ type ]++;
|
|
// . make the term name to hash after the gbtag:
|
|
// . we want to hash "gbtag:english3" for example, for the
|
|
// ST_ENGLISH tag id.
|
|
char prefix[64];
|
|
// . do not include the count for the first occurence
|
|
// . follows the gbruleset:36 convention
|
|
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
|
|
//if ( count[type] == 1 )
|
|
sprintf ( prefix , "gbtag%s",str);
|
|
// assume that is good enough
|
|
//char *prefix = tmp;
|
|
// store prefix into m_wbuf so XmlDoc::print() works!
|
|
//if ( m_pbuf ) {
|
|
// int32_t tlen = gbstrlen(tmp);
|
|
// m_wbuf.safeMemcpy(tmp,tlen+1);
|
|
// prefix = m_wbuf.getBuf() - (tlen+1);
|
|
//}
|
|
//else
|
|
// sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]);
|
|
// "unmap" it so when it is hashed it will have the correct
|
|
// 8-bit score. IndexList.cpp will convert it back to 8 bits
|
|
// in IndexList::set(table), which sets our termlist from
|
|
// this "table".
|
|
//int32_t score = score8to32 ( val );
|
|
// we already incorporate the score as a string when we hash
|
|
// gbtagtagname:tagvalue so why repeat it?
|
|
//int32_t score = 1;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = prefix;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
|
|
// meta is special now
|
|
if ( tag->isType("meta") ) {
|
|
hi.m_prefix = NULL;
|
|
}
|
|
|
|
// hash it. like "gbtagenglish:1" with a score of 1, etc.
|
|
// or "gbtagspam:33" with a score of 33. this would also
|
|
// hash gbtagclock:0xfe442211 type things as well.
|
|
int32_t dlen = gbstrlen(dataStr);
|
|
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
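// Net effect of the loop above: every indexable tag in the tagdb record is
// indexed as gbtag<tagname>:<tagvalue>, e.g. gbtagenglish:1 or gbtagspam:33
// (examples taken from the comments above); "meta" tags are the one
// exception and are hashed with no prefix at all.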
|
|
|
|
|
|
bool XmlDoc::hashPermalink ( HashTableX *tt ) {

	setStatus ( "hashing is permalink" );

	// put a colon in there so it can't be faked using a meta tag.
	char *s = "0";
	if ( *getIsPermalink() ) s = "1";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbpermalink";

	return hashString ( s,1,&hi );
}
|
|
|
|
|
|
//hash the tag pair vector, the gigabit vector and the sample vector
|
|
bool XmlDoc::hashVectors ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing vectors" );
|
|
|
|
int32_t score = *getSiteNumInlinks8() * 256;
|
|
if ( score <= 0 ) score = 1;
|
|
char buf[32];
|
|
uint32_t h;
|
|
//char *field;
|
|
//char *descr;
|
|
//h = m_tagVector.getVectorHash();
|
|
uint32_t tph = *getTagPairHash32();
|
|
int32_t blen = sprintf(buf,"%"UINT32"", tph);
|
|
//field = "gbtagvector";
|
|
//descr = "tag vector hash";
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbtagvector";
|
|
hi.m_desc = "tag vector hash";
|
|
hi.m_shardByTermId = true;
|
|
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen, &hi ) ) return false;
|
|
|
|
h = *getGigabitVectorScorelessHash();
|
|
blen = sprintf(buf,"%"UINT32"",(uint32_t)h);
|
|
	// update hash parms
|
|
hi.m_prefix = "gbgigabitvector";
|
|
hi.m_desc = "gigabit vector hash";
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi) ) return false;
|
|
|
|
// . dup checking uses the two hashes above, not this hash!!! MDW
|
|
// . i think this vector is just used to see if the page changed
|
|
// significantly since last spidering
|
|
// . it is used by getPercentChanged() and by Dates.cpp
|
|
// . sanity check
|
|
//if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
|
|
//int32_t *pc = m_pageSampleVec;
|
|
//h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE);
|
|
//blen = sprintf(buf,"%"UINT32"",(int32_t unsigned int)h);
|
|
//field = "gbsamplevector";
|
|
//descr = "sample vector hash";
|
|
// this returns false on failure
|
|
//if ( ! hashString ( tt,buf,blen,score,field,descr) )
|
|
// return false;
|
|
|
|
	// . hash combined for Dup Detection
|
|
// . must match XmlDoc::getDupList ( );
|
|
//uint64_t h1 = m_tagVector.getVectorHash();
|
|
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
|
|
//uint64_t h64 = hash64 ( h1 , h2 );
|
|
|
|
// take this out for now
|
|
/*
|
|
uint64_t *dh = getDupHash ( );
|
|
blen = sprintf(buf,"%"UINT64"", *dh );//h64);
|
|
//field = "gbduphash";
|
|
//descr = "dup vector hash";
|
|
// update hash parms
|
|
hi.m_prefix = "gbduphash";
|
|
hi.m_desc = "dup vector hash";
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
*/
|
|
|
|
// hash the wikipedia docids we match
|
|
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
|
|
for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) {
|
|
blen = sprintf(buf,"%"UINT64"",ptr_wikiDocIds[i]);
|
|
// convert to int32_t
|
|
//int32_t convScore = (int32_t)ptr_wikiScores[i];
|
|
// get score
|
|
//uint32_t ws = score8to32 ( convScore );
|
|
// update hash parms
|
|
hi.m_prefix = "gbwikidocid";
|
|
hi.m_desc = "wiki docid";
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashAds ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing ad ids" );
|
|
|
|
for(int32_t i = 0; i < size_adVector / 8 ; i++) {
|
|
int32_t score = *getSiteNumInlinks8() * 256;
|
|
if ( score <= 0 ) score = 1;
|
|
char buf[128];
|
|
char *field;
|
|
char *descr;
|
|
//buflen = snprintf(buf,128,"%s-%s",
|
|
// m_adProvider[i],m_adClient[i]);
|
|
snprintf(buf,128,"%"UINT64"",ptr_adVector[i] );
|
|
int32_t bufLen = gbstrlen(buf);
|
|
field = "gbad";
|
|
descr = "ad provider and id";
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbad";
|
|
hi.m_desc = "ad provider and id";
|
|
//log(LOG_WARN, "build: url %s indexing ad termid %s:%s",
|
|
// getFirstUrl()->getUrl(), field, buf);
|
|
//this returns false on failure
|
|
if ( ! hashString ( buf,bufLen,&hi ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Url *XmlDoc::getBaseUrl ( ) {
|
|
if ( m_baseUrlValid ) return &m_baseUrl;
|
|
// need this
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
|
|
Url *cu = getCurrentUrl();
|
|
if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
|
|
// no longer set addWWW to true since tmblr.co has an IP but
|
|
// www.tmblr.co does not
|
|
m_baseUrl.set ( cu , false ); // addWWW = true
|
|
// look for base url
|
|
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
|
|
// 12 is the <base href> tag id
|
|
if ( xml->getNodeId ( i ) != TAG_BASE ) continue;
|
|
// get the href field of this base tag
|
|
int32_t linkLen;
|
|
char *link = (char *) xml->getString ( i, "href", &linkLen );
|
|
// skip if not valid
|
|
if ( ! link || linkLen == 0 ) continue;
|
|
// set base to it. addWWW=true
|
|
m_baseUrl.set(link, linkLen, false);//true);
|
|
break;
|
|
}
|
|
// fix invalid <base href="/" target="_self"/> tag
|
|
if ( m_baseUrl.getHostLen () <= 0 || m_baseUrl.getDomainLen() <= 0 )
|
|
m_baseUrl.set ( cu , false );
|
|
|
|
m_baseUrlValid = true;
|
|
return &m_baseUrl;
|
|
}
|
|
|
|
// hash gbhasthumbnail:0|1
|
|
bool XmlDoc::hashImageStuff ( HashTableX *tt ) {

	setStatus ("hashing image stuff");

	char *val = "0";
	char **td = getThumbnailData();
	if ( *td ) val = "1";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbhasthumbnail";
	hi.m_desc = "has a thumbnail";

	// this returns false on failure
	if ( ! hashString ( val,1,&hi ) ) return false;

	return true;
}
|
|
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {

	setStatus ("hashing isadult");

	char *ia = getIsAdult();
	// this should not block or return error! should have been
	// set in prepareToMakeTitleRec() before hashAll() was called!
	if ( ! ia || ia == (void *)-1 ) {char *xx=NULL;*xx=0; }

	// index gbisadult:1 if adult or gbisadult:0 if not
	char *val;
	if ( *ia ) val = "1";
	else val = "0";

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_prefix = "gbisadult";
	hi.m_desc = "is document adult content";

	// this returns false on failure
	if ( ! hashString ( val,1,&hi ) ) return false;

	return true;
}
|
|
|
|
// hash destination urls for embedded gb search boxes
|
|
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing submit urls" );
|
|
|
|
Url *baseUrl = getBaseUrl();
|
|
if ( ! baseUrl || baseUrl == (Url *)-1) { char*xx=NULL;*xx=0;}
|
|
|
|
for ( int32_t i = 0 ; i < m_xml.getNumNodes() ; i++ ) {
|
|
// Find forms
|
|
if ( m_xml.getNodeId(i) != TAG_FORM ) continue;
|
|
if ( m_xml.isBackTag(i) ) continue;
|
|
int32_t score = *getSiteNumInlinks8() * 256;
|
|
if ( score <= 0 ) score = 1;
|
|
int32_t len;
|
|
char *s = m_xml.getString ( i , "action" , &len );
|
|
if (!s || len == 0) continue;
|
|
Url url; url.set(baseUrl, s, len, true);
|
|
|
|
char *buf = url.getUrl();
|
|
int32_t blen = url.getUrlLen();
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbsubmiturl";
|
|
hi.m_desc = "submit url for form";
|
|
|
|
// this returns false on failure
|
|
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
//
|
|
// STUFF IMPORTED FROM INDEXLIST.CPP
|
|
//
|
|
|
|
// we also assume all scores are above 256, too
|
|
uint8_t score32to8 ( uint32_t score ) {
|
|
// ensure score is > 0... no! not any more
|
|
if ( score <= 0 ) return (unsigned char) 0;
|
|
// extremely large scores need an adjustment to avoid wrapping
|
|
if ( score < (uint32_t)0xffffffff - 128 )
|
|
score += 128;
|
|
// scores are multiplied by 256 to preserve fractions, so undo that
|
|
score /= 256;
|
|
// ensure score is > 0
|
|
if ( score <= 0 ) return (unsigned char) 1;
|
|
// if score < 128 return it now
|
|
if ( score < 128 ) return (unsigned char) score;
|
|
// now shrink it so it's now from 1 upwards
|
|
score -= 127;
|
|
|
|
// . take NATURAL log of score now
|
|
// . PROBLEM: for low scores logscore may increase by close to 1.0
|
|
// for a score increase of 1.0. and since s_maxscore is about 22.0
|
|
// we end up moving 1.0/22.0 of 128 total pts causing a jump of
|
|
// 2 or more score points!! oops!!! to fix, let's add 10 pts
|
|
// to the score
|
|
score += 10;
|
|
double logscore = ::log ( (double)score );
|
|
// now the max it can be
|
|
//double maxscore = ::log ( (double)(0x00ffffff - 127));
|
|
static double s_maxscore = -1.0;
|
|
static double s_minscore = -1.0;
|
|
if ( s_maxscore == -1.0 ) {
|
|
uint32_t max = ((0xffffffff + 0)/256) - 127 + 10;
|
|
uint32_t min = ( 128 ) - 127 + 10;
|
|
s_maxscore = ::log((double)max);
|
|
s_minscore = ::log((double)min);
|
|
// adjust
|
|
s_maxscore -= s_minscore;
|
|
}
|
|
// adjust it
|
|
logscore -= s_minscore;
|
|
// scale it into [126,0] (add .5 for rounding)
|
|
double scaled = (logscore* 127.0) / s_maxscore + .5;
|
|
// sanity check
|
|
if ( (unsigned char)scaled >= 128 ) { char *xx=NULL;*xx=0; }
|
|
// . go into the 8 bit score now
|
|
// . set the hi bit so they know we took its log
|
|
unsigned char score8 = (unsigned char)scaled | 128;
|
|
return score8;
|
|
}
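// Shape of the 32-to-8 bit mapping above: scores are divided by 256 (they
// were scaled up by 256 to preserve fractions); results under 128 are kept
// as-is in the low half of the byte, while anything larger is compressed
// with a natural log and scaled into 7 bits with the high bit set to mark
// that the log was taken. s_scoreMap[] below holds a precomputed inverse
// for all 256 byte values.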
|
|
|
|
// for score8to32() below
|
|
static uint32_t s_scoreMap[] = {
|
|
0UL,
|
|
1UL,
|
|
385UL,
|
|
641UL,
|
|
897UL,
|
|
1153UL,
|
|
1409UL,
|
|
1665UL,
|
|
1921UL,
|
|
2177UL,
|
|
2433UL,
|
|
2689UL,
|
|
2945UL,
|
|
3201UL,
|
|
3457UL,
|
|
3713UL,
|
|
3969UL,
|
|
4225UL,
|
|
4481UL,
|
|
4737UL,
|
|
4993UL,
|
|
5249UL,
|
|
5505UL,
|
|
5761UL,
|
|
6017UL,
|
|
6273UL,
|
|
6529UL,
|
|
6785UL,
|
|
7041UL,
|
|
7297UL,
|
|
7553UL,
|
|
7809UL,
|
|
8065UL,
|
|
8321UL,
|
|
8577UL,
|
|
8833UL,
|
|
9089UL,
|
|
9345UL,
|
|
9601UL,
|
|
9857UL,
|
|
10113UL,
|
|
10369UL,
|
|
10625UL,
|
|
10881UL,
|
|
11137UL,
|
|
11393UL,
|
|
11649UL,
|
|
11905UL,
|
|
12161UL,
|
|
12417UL,
|
|
12673UL,
|
|
12929UL,
|
|
13185UL,
|
|
13441UL,
|
|
13697UL,
|
|
13953UL,
|
|
14209UL,
|
|
14465UL,
|
|
14721UL,
|
|
14977UL,
|
|
15233UL,
|
|
15489UL,
|
|
15745UL,
|
|
16001UL,
|
|
16257UL,
|
|
16513UL,
|
|
16769UL,
|
|
17025UL,
|
|
17281UL,
|
|
17537UL,
|
|
17793UL,
|
|
18049UL,
|
|
18305UL,
|
|
18561UL,
|
|
18817UL,
|
|
19073UL,
|
|
19329UL,
|
|
19585UL,
|
|
19841UL,
|
|
20097UL,
|
|
20353UL,
|
|
20609UL,
|
|
20865UL,
|
|
21121UL,
|
|
21377UL,
|
|
21633UL,
|
|
21889UL,
|
|
22145UL,
|
|
22401UL,
|
|
22657UL,
|
|
22913UL,
|
|
23169UL,
|
|
23425UL,
|
|
23681UL,
|
|
23937UL,
|
|
24193UL,
|
|
24449UL,
|
|
24705UL,
|
|
24961UL,
|
|
25217UL,
|
|
25473UL,
|
|
25729UL,
|
|
25985UL,
|
|
26241UL,
|
|
26497UL,
|
|
26753UL,
|
|
27009UL,
|
|
27265UL,
|
|
27521UL,
|
|
27777UL,
|
|
28033UL,
|
|
28289UL,
|
|
28545UL,
|
|
28801UL,
|
|
29057UL,
|
|
29313UL,
|
|
29569UL,
|
|
29825UL,
|
|
30081UL,
|
|
30337UL,
|
|
30593UL,
|
|
30849UL,
|
|
31105UL,
|
|
31361UL,
|
|
31617UL,
|
|
31873UL,
|
|
32129UL,
|
|
32385UL,
|
|
32641UL,
|
|
32897UL,
|
|
33488UL,
|
|
33842UL,
|
|
34230UL,
|
|
34901UL,
|
|
35415UL,
|
|
35979UL,
|
|
36598UL,
|
|
37278UL,
|
|
38025UL,
|
|
39319UL,
|
|
40312UL,
|
|
41404UL,
|
|
43296UL,
|
|
44747UL,
|
|
46343UL,
|
|
48098UL,
|
|
51138UL,
|
|
53471UL,
|
|
56037UL,
|
|
58859UL,
|
|
61962UL,
|
|
65374UL,
|
|
71287UL,
|
|
75825UL,
|
|
80816UL,
|
|
86305UL,
|
|
92342UL,
|
|
98982UL,
|
|
110492UL,
|
|
119326UL,
|
|
129042UL,
|
|
139728UL,
|
|
151481UL,
|
|
171856UL,
|
|
187496UL,
|
|
204699UL,
|
|
223622UL,
|
|
244437UL,
|
|
267333UL,
|
|
307029UL,
|
|
337502UL,
|
|
371022UL,
|
|
407893UL,
|
|
448450UL,
|
|
493062UL,
|
|
570408UL,
|
|
629783UL,
|
|
695095UL,
|
|
766938UL,
|
|
845965UL,
|
|
982981UL,
|
|
1088163UL,
|
|
1203862UL,
|
|
1331130UL,
|
|
1471124UL,
|
|
1625117UL,
|
|
1892110UL,
|
|
2097072UL,
|
|
2322530UL,
|
|
2570533UL,
|
|
2843335UL,
|
|
3143416UL,
|
|
3663697UL,
|
|
4063102UL,
|
|
4502447UL,
|
|
4985726UL,
|
|
5517332UL,
|
|
6439034UL,
|
|
7146599UL,
|
|
7924919UL,
|
|
8781070UL,
|
|
9722836UL,
|
|
10758778UL,
|
|
12554901UL,
|
|
13933735UL,
|
|
15450451UL,
|
|
17118838UL,
|
|
18954063UL,
|
|
20972809UL,
|
|
24472927UL,
|
|
27159874UL,
|
|
30115514UL,
|
|
33366717UL,
|
|
36943040UL,
|
|
43143702UL,
|
|
47903786UL,
|
|
53139877UL,
|
|
58899576UL,
|
|
65235244UL,
|
|
72204478UL,
|
|
84287801UL,
|
|
93563849UL,
|
|
103767501UL,
|
|
114991518UL,
|
|
127337936UL,
|
|
140918995UL,
|
|
164465962UL,
|
|
182542348UL,
|
|
202426372UL,
|
|
224298798UL,
|
|
248358466UL,
|
|
290073346UL,
|
|
322096762UL,
|
|
357322519UL,
|
|
396070851UL,
|
|
438694015UL,
|
|
485579494UL,
|
|
566869982UL,
|
|
629274552UL,
|
|
697919578UL,
|
|
773429105UL,
|
|
856489583UL,
|
|
947856107UL,
|
|
1106268254UL,
|
|
1227877095UL,
|
|
1361646819UL,
|
|
1508793514UL,
|
|
1670654878UL,
|
|
1951291651UL,
|
|
2166729124UL,
|
|
2403710344UL,
|
|
2664389686UL,
|
|
2951136962UL,
|
|
3266558965UL,
|
|
3813440635UL,
|
|
4233267317UL
|
|
};
|
|
|
|
uint32_t score8to32 ( uint8_t score8 ) {
|
|
|
|
/*
|
|
int32_t test = score32to8((uint32_t)0xffffffff);
|
|
static bool s_set = false;
|
|
if ( ! s_set ) {
|
|
s_set = true;
|
|
uint8_t lasts = 0;
|
|
int32_t step = 128;
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
for ( uint64_t i=1 ; i<(uint32_t)0xffffffff ; i+=step) {
|
|
// get the score
|
|
uint8_t s = score32to8(i);
|
|
// print it out now
|
|
if ( s != lasts ) {
|
|
fprintf(stderr,"\t%"UINT32"UL,\n",i);
|
|
}
|
|
// if no change, skip it
|
|
if (lasts != 0 && s == lasts ) {
|
|
if ( s > 128 )
|
|
step = (int32_t)((float)step * 1.1);
|
|
continue;
|
|
}
|
|
// otherwise set it
|
|
s_scoreMap[s] = i;
|
|
// reset
|
|
lasts = s;
|
|
}
|
|
// sanity test
|
|
for ( int32_t j = 1 ; j < 256 ; j++ ) {
|
|
uint32_t big = s_scoreMap[j];
|
|
if ( score32to8(big) != j ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
int64_t end = gettimeofdayInMilliseconds();
|
|
logf(LOG_DEBUG,
|
|
"gb: took %"INT64" ms to build score table.",
|
|
end-start);
|
|
|
|
}
|
|
// sanity test
|
|
static bool s_set = false;
|
|
if ( ! s_set ) {
|
|
for ( int32_t j = 1 ; j < 256 ; j++ ) {
|
|
uint32_t big = s_scoreMap[j];
|
|
uint8_t tt;
|
|
tt = score32to8(big);
|
|
if ( tt != j ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
s_set = true;
|
|
}
|
|
*/
|
|
|
|
return(s_scoreMap[score8]);
|
|
}
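// s_scoreMap[] above is the precomputed inverse of score32to8(): entry j is
// a 32-bit score that maps back to byte value j. The commented-out block in
// score8to32() is the generator and sanity check that originally built the
// table (it asserts score32to8(s_scoreMap[j]) == j for every j).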
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//
|
|
// Summary/Title generation for Msg20
|
|
//
|
|
////////////////////////////////////////////////////////////
|
|
|
|
void XmlDoc::set20 ( Msg20Request *req ) {
|
|
// clear it all out
|
|
reset();
|
|
// this too
|
|
m_reply.reset();
|
|
|
|
m_pbuf = NULL;//pbuf;
|
|
m_niceness = req->m_niceness;
|
|
// remember this
|
|
m_req = req;
|
|
// and this!
|
|
//m_coll = req->ptr_coll;
|
|
//setCollNum ( req->ptr_coll );
|
|
m_collnum = req->m_collnum;
|
|
m_collnumValid = true;
|
|
// make this stuff valid
|
|
if ( m_req->m_docId > 0 ) {
|
|
m_docId = m_req->m_docId;
|
|
m_docIdValid = true;
|
|
}
|
|
// set url too if we should
|
|
if ( m_req->size_ubuf > 1 )
|
|
setFirstUrl ( m_req->ptr_ubuf , false );
|
|
}
|
|
|
|
#define MAX_LINK_TEXT_LEN 512
|
|
#define MAX_RSSITEM_SIZE 30000
|
|
|
|
void getMsg20ReplyWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
	// return if it blocked
	if ( THIS->getMsg20Reply ( ) == (void *)-1 ) return;
	// otherwise, all done, call the caller callback
	if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
	else THIS->m_callback2 ( THIS->m_state );
}
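// Callback plumbing: getMsg20Reply() installs this wrapper as m_masterLoop,
// so whenever a getter it calls blocks (returns -1), the i/o completion
// eventually re-enters here, getMsg20Reply() is re-run from the top (steps
// already completed are cached in member vars and return immediately), and
// only when it yields a real reply or an error is the caller's
// m_callback1/m_callback2 invoked.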
|
|
|
|
// . returns NULL with g_errno set on error
|
|
// . returns -1 if blocked
|
|
Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
|
|
|
// return it right away if valid
|
|
if ( m_replyValid ) return &m_reply;
|
|
|
|
// . internal callback
|
|
// . so if any of the functions we end up calling directly or
|
|
// indirectly block, this callback will be called
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getMsg20ReplyWrapper;
|
|
m_masterState = this;
|
|
}
|
|
|
|
// used by Msg20.cpp to time this XmlDoc::getMsg20Reply() function
|
|
if ( ! m_startTimeValid && isClockInSync() ) {
|
|
m_startTime = gettimeofdayInMilliseconds();
|
|
m_startTimeValid = true;
|
|
}
|
|
|
|
	// caller should have the callback set
|
|
if ( ! m_callback1 && ! m_callback2 ) { char *xx=NULL;*xx=0; }
|
|
|
|
//char safeStack[100000];
|
|
//safeStack[0] = 0;
|
|
//safeStack[90000] = 0;
|
|
|
|
	// shortcut
|
|
Msg20Reply *reply = &m_reply;
|
|
|
|
m_niceness = m_req->m_niceness;
|
|
|
|
m_collnum = m_req->m_collnum;//cr->m_collnum;
|
|
m_collnumValid = true;
|
|
|
|
//char *coll = m_req->ptr_coll;
|
|
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
|
if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }
|
|
|
|
|
|
//CollectionRec *cr = getCollRec();
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// set this important member var
|
|
//if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll));
|
|
// return NULL with g_errno set on error
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// . cache it for one hour
|
|
// . this will set our ptr_ and size_ member vars
|
|
char **otr = getOldTitleRec ( );
|
|
if ( ! otr || otr == (void *)-1 ) return (Msg20Reply *)otr;
|
|
|
|
// must have a title rec in titledb
|
|
if ( ! *otr ) { g_errno = ENOTFOUND; return NULL; }
|
|
|
|
// sanity
|
|
if ( *otr != m_oldTitleRec ) { char *xx=NULL;*xx=0; }
|
|
|
|
// what is this?
|
|
int32_t maxSize = 0;
|
|
|
|
// . set our ptr_ and size_ member vars from it after uncompressing
|
|
// . returns false and sets g_errno on error
|
|
if ( ! m_setTr ) {
|
|
// . this completely resets us
|
|
// . this returns false with g_errno set on error
|
|
bool status = set2( *otr, maxSize, cr->m_coll, NULL,
|
|
m_niceness);
|
|
// sanity check
|
|
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// if there was an error, g_errno should be set.
|
|
if ( ! status ) return NULL;
|
|
m_setTr = true;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// init
|
|
reply->m_nextMerged = NULL;
|
|
|
|
reply->m_collnum = m_collnum;
|
|
|
|
// MsgE uses this one
|
|
if ( m_req->m_getTitleRec ) {
|
|
		// this is the original compressed titleRec, preceded
|
|
// by key and dataSize and followed by the data
|
|
reply-> ptr_tr = m_oldTitleRec;
|
|
reply->size_tr = m_oldTitleRecSize;
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
|
|
// if they provided a query with gbfacet*: terms then we have
|
|
// to get those facet values.
|
|
if ( ! m_gotFacets ) {
|
|
// only do this once
|
|
m_gotFacets = true;
|
|
// get facet term
|
|
char *qs = m_req->ptr_qbuf;
|
|
facetPrintLoop:
|
|
for ( ; qs && *qs ; qs++ ) {
|
|
if ( qs[0] != 'g' ) continue;
|
|
if ( qs[1] != 'b' ) continue;
|
|
if ( qs[2] != 'f' ) continue;
|
|
if ( strncasecmp(qs,"gbfacet",7) ) continue;
|
|
qs += 7;
|
|
// gbfacetstr: gbfacetint: gbfacetfloat:
|
|
if ( strncasecmp(qs,"str:" ,4) == 0 ) qs += 4;
|
|
else if ( strncasecmp(qs,"int:" ,4) == 0 ) qs += 4;
|
|
else if ( strncasecmp(qs,"float:",6) == 0 ) qs += 6;
|
|
else continue;
|
|
break;
|
|
}
|
|
// if we had a facet, get the values it has in the doc
|
|
if ( qs && *qs ) {
|
|
// need this for storeFacetValues() if we are json
|
|
if ( m_contentType == CT_JSON ||
|
|
// spider status docs are really json
|
|
m_contentType == CT_STATUS ) {
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1)
|
|
return (Msg20Reply *)jp;
|
|
}
|
|
if ( m_contentType == CT_HTML ||
|
|
m_contentType == CT_XML ) {
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml==(void *)-1)
|
|
return (Msg20Reply *)xml;
|
|
}
|
|
// find end of it
|
|
char *e = qs;
|
|
for ( ; *e && ! is_wspace_a(*e) ; e++ );
|
|
// tmp null it
|
|
char c = *e; *e = '\0';
|
|
			// this is zero if unspecified
|
|
FacetValHash_t fvh = m_req->m_facetValHash;
|
|
// . this will store facetField/facetValue pairs
|
|
// . stores into safebuf, m_tmpBuf2
|
|
// . it will terminate all stored strings with \0
|
|
// . we check meta tags for html docs
|
|
// . otherwise we check xml/json doc fields
|
|
// . returns false with g_errno set on error
|
|
bool ret = storeFacetValues ( qs , &m_tmpBuf2 , fvh ) ;
|
|
// revert the \0
|
|
*e = c;
|
|
// return NULL with g_errno set on error
|
|
if ( ! ret ) return NULL;
|
|
// advance
|
|
qs = e;
|
|
// do another one
|
|
goto facetPrintLoop;
|
|
}
|
|
// assign
|
|
reply-> ptr_facetBuf = m_tmpBuf2.getBufStart();
|
|
reply->size_facetBuf = m_tmpBuf2.length();
|
|
}
|
|
|
|
if ( m_req->m_justGetFacets ) {
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
if ( m_req->m_getTermListBuf ) {
|
|
// ensure content is recycled from title rec
|
|
m_recycleContent = true;
|
|
//xd->m_recycleLinkInfo = true;
|
|
// only get posdb keys really for this stuff
|
|
m_useTitledb = false;
|
|
m_useTagdb = false;
|
|
m_useClusterdb = false;
|
|
m_useSpiderdb = false;
|
|
m_useLinkdb = false;
|
|
// time it
|
|
if ( m_tlbufTimer == 0 )
|
|
m_tlbufTimer = gettimeofdayInMilliseconds();
|
|
// . shit limit content for speed!!!
|
|
// . this is for getting matching queries/relatedqueries
|
|
// anyway, so should be ok
|
|
if ( size_utf8Content > 150000 ) {
|
|
char *p = ptr_utf8Content + 150000 - 1;
|
|
char *pstart = ptr_utf8Content;
|
|
// back up until we hit punct
|
|
for ( ; p > pstart ; p-- )
|
|
if ( is_punct_utf8(p) ) break;
|
|
// set new size then
|
|
*p = '\0';
|
|
size_utf8Content = p - pstart + 1;
|
|
}
|
|
// hack: should be sorted by lower 32bits of termids
|
|
// so handleRequest8e does not have to sort before doing
|
|
// its query matching algo with queries in g_qbuf.
|
|
// but these termlists are really mostly used for doing
|
|
// the gbdocid:|xxxx queries in handleRequest8e.
|
|
SafeBuf *tbuf = getTermListBuf();
|
|
if ( ! tbuf || tbuf == (void *)-1 ) return (Msg20Reply *)tbuf;
|
|
SafeBuf *tibuf = getTermId32Buf();
|
|
if ( ! tibuf || tibuf == (void *)-1)return (Msg20Reply *)tibuf;
|
|
// time it
|
|
int64_t took = gettimeofdayInMilliseconds() - m_tlbufTimer;
|
|
log("seo: tlistbuf gen took %"INT64" ms for docid %"INT64"",
|
|
took,m_docId);
|
|
// just that
|
|
reply-> ptr_tlistBuf = tbuf->getBufStart();
|
|
reply->size_tlistBuf = tbuf->length();
|
|
reply-> ptr_tiBuf = tibuf->getBufStart();
|
|
reply->size_tiBuf = tibuf->length();
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
// lookup the tagdb rec fresh if setting for a summary. that way we
|
|
// can see if it is banned or not. but for getting m_getTermListBuf
|
|
// and stuff above, skip the tagrec lookup!
|
|
// save some time when SPIDERING/BUILDING by skipping fresh
|
|
// tagdb lookup and using tags in titlerec
|
|
if ( m_req && ! m_req->m_getLinkText && ! m_checkedUrlFilters )
|
|
m_tagRecDataValid = false;
|
|
|
|
// set and validate member vars
|
|
//if ( ! m_setFromTitleRec )
|
|
// // return NULL with g_errno set on error
|
|
// if ( ! set ( tr , NULL , m_niceness ) ) return NULL;
|
|
|
|
// if shard responsible for tagrec is dead, then
|
|
// just recycle!
|
|
if ( m_req && ! m_checkedUrlFilters && ! m_tagRecDataValid ) {
|
|
char *site = getSite();
|
|
TAGDB_KEY tk1 = g_tagdb.makeStartKey ( site );
|
|
TAGDB_KEY tk2 = g_tagdb.makeDomainStartKey ( &m_firstUrl );
|
|
uint32_t shardNum1 = g_hostdb.getShardNum(RDB_TAGDB,&tk1);
|
|
uint32_t shardNum2 = g_hostdb.getShardNum(RDB_TAGDB,&tk2);
|
|
// shardnum1 and shardnum2 are often different!
|
|
// log("db: s1=%i s2=%i",(int)shardNum1,(int)shardNum2);
|
|
if ( g_hostdb.isShardDead ( shardNum1 ) ) {
|
|
log("query: skipping tagrec lookup for dead shard "
|
|
"# %"INT32""
|
|
,shardNum1);
|
|
m_tagRecDataValid = true;
|
|
}
|
|
if ( g_hostdb.isShardDead ( shardNum2 ) && m_firstUrlValid ) {
|
|
log("query: skipping tagrec lookup for dead shard "
|
|
"# %"INT32""
|
|
,shardNum2);
|
|
m_tagRecDataValid = true;
|
|
}
|
|
}
|
|
|
|
|
|
// if we are showing sites that have been banned in tagdb, we dont
|
|
// have to do a tagdb lookup. that should speed things up.
|
|
TagRec *gr = NULL;
|
|
if ( cr && cr->m_doTagdbLookups ) {
|
|
gr = getTagRec();
|
|
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
|
|
}
|
|
|
|
//reply-> ptr_tagRec = (char *)gr;
|
|
//reply->size_tagRec = gr->getSize();
|
|
|
|
// we use this instead of nowGlobal
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this should be valid, it is stored in title rec
|
|
if ( m_contentHash32Valid ) reply->m_contentHash32 = m_contentHash32;
|
|
else reply->m_contentHash32 = 0;
|
|
|
|
// if this page is potential spam, toss it!
|
|
//char *isSpam = getIsSpam();
|
|
//if ( ! isSpam || isSpam == (char *)-1 ) return (Msg20Reply *)isSpam;
|
|
|
|
if ( ! m_checkedUrlFilters ) {
|
|
// do it
|
|
//int32_t *rn = getRegExpNum2(-1);
|
|
//if ( ! rn || rn == (int32_t *)-1 ) return (Msg20Reply *)rn;
|
|
// do not re-check
|
|
m_checkedUrlFilters = true;
|
|
|
|
// a non-www url?
|
|
/*
|
|
|
|
now we allow domain-only urls in the index, so this is
|
|
hurting us...
|
|
|
|
if ( ! m_req->m_getLinkText ) {
|
|
Url tmp;
|
|
tmp.set ( ptr_firstUrl );
|
|
if ( tmp.getHostLen() == tmp.getDomainLen() ) {
|
|
// set m_errno
|
|
reply->m_errno = EDOCFILTERED;
|
|
// tmp debug
|
|
log("xmldoc: filtering non www url %s",
|
|
ptr_firstUrl);
|
|
// and this
|
|
reply->m_isFiltered = true;
|
|
// give back the url at least
|
|
reply->ptr_ubuf = getFirstUrl()->getUrl();
|
|
reply->size_ubuf =getFirstUrl()->getUrlLen()+1;
|
|
// validate
|
|
m_replyValid = true;
|
|
// and return
|
|
return reply;
|
|
}
|
|
}
|
|
*/
|
|
|
|
// get this
|
|
//time_t nowGlobal = getTimeGlobal();
|
|
// get this
|
|
SpiderRequest sreq;
|
|
SpiderReply srep;
|
|
setSpiderReqForMsg20 ( &sreq , &srep );//, *isSpam );
|
|
int32_t spideredTime = getSpideredTime();
|
|
int32_t langIdArg = -1;
|
|
if ( m_langIdValid ) langIdArg = m_langId;
|
|
// get it
|
|
int32_t ufn;
|
|
ufn=::getUrlFilterNum(&sreq,&srep,spideredTime,true,
|
|
m_niceness,cr,
|
|
false, // isOutlink?
|
|
NULL ,
|
|
langIdArg);
|
|
// sanity check
|
|
if ( ufn < 0 ) {
|
|
log("msg20: bad url filter for url %s", sreq.m_url);
|
|
}
|
|
|
|
// save it
|
|
reply->m_urlFilterNum = ufn;
|
|
// get spider priority if ufn is valid
|
|
int32_t pr = 0;
|
|
//if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
|
|
if ( ufn >= 0 && cr->m_forceDelete[ufn] ) pr = -3;
|
|
|
|
// this is an automatic ban!
|
|
if ( gr && gr->getLong("manualban",0))
|
|
pr=-3;//SPIDER_PRIORITY_BANNED;
|
|
|
|
// is it banned
|
|
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
|
|
// set m_errno
|
|
reply->m_errno = EDOCBANNED;
|
|
// and this
|
|
reply->m_isBanned = true;
|
|
}
|
|
|
|
//
|
|
// for now always allow it until we can fix this better
|
|
// we probably should assume NOT filtered unless it matches
|
|
// a string match only url filter... but at least we will
|
|
// allow it to match "BANNED" filters for now...
|
|
//
|
|
pr = 0;
|
|
|
|
|
|
// if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
|
|
// // set m_errno
|
|
// reply->m_errno = EDOCFILTERED;
|
|
// // and this
|
|
// reply->m_isFiltered = true;
|
|
// }
|
|
// done if we got an error (banned) and are not showing banned docs
|
|
if ( reply->m_errno && ! m_req->m_showBanned ) {
|
|
// give back the url at least
|
|
reply->ptr_ubuf = getFirstUrl()->getUrl();
|
|
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// a special hack for XmlDoc::getRecommendedLinksBuf() so we exclude
|
|
// links that link to the main url's site/domain as well as a
|
|
// competitor url (aka related docid)
|
|
Links *links = NULL;
|
|
if ( m_req->m_ourHostHash32 || m_req->m_ourDomHash32 ) {
|
|
links = getLinks();
|
|
if ( ! links || links==(Links *)-1) return (Msg20Reply *)links;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// truncate content length if we should
|
|
// this was hurting our linkdb lookups! do not do it for those!
|
|
/*
|
|
if ( size_utf8Content > cr->m_contentLenMaxForSummary &&
|
|
// fix for link text fetching!
|
|
! req->m_getLinkText ) {
|
|
logf(LOG_DEBUG,"summary: truncating doc of len %"INT32" to %"INT32" for "
|
|
"generating summary",
|
|
size_utf8Content,cr->m_contentLenMaxForSummary);
|
|
size_utf8Content = cr->m_contentLenMaxForSummary ;
|
|
// null term just in case
|
|
ptr_utf8Content[size_utf8Content-1] = '\0';
|
|
}
|
|
*/
|
|
// do they want a summary?
|
|
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
|
|
char *hsum = getHighlightedSummary();
|
|
|
|
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
|
|
//Summary *s = getSummary();
|
|
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
|
|
//int32_t sumLen = m_finalSummaryBuf.length();
|
|
// is it size and not length?
|
|
int32_t hsumLen = 0;
|
|
// seems like it can return 0x01 if none...
|
|
if ( hsum == (char *)0x01 ) hsum = NULL;
|
|
// get len. this is the HIGHLIGHTED summary so it is ok.
|
|
if ( hsum ) hsumLen = gbstrlen(hsum);
|
|
// must be \0 terminated. not any more, it can be a subset
|
|
// of a larger summary used for deduping
|
|
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
|
|
// assume size is 0
|
|
//int32_t sumSize = 0;
|
|
// include the \0 in size
|
|
//if ( sum ) sumSize = sumLen + 1;
|
|
// do not get any more than "me" lines/excerpts of summary
|
|
//int32_t max = m_req->m_numSummaryLines;
|
|
// grab stuff from it!
|
|
//reply->m_proximityScore = s->getProximityScore();
|
|
reply-> ptr_displaySum = hsum;//s->getSummary();
|
|
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
|
|
// this is unhighlighted for deduping, and it might be longer
|
|
// . seems like we are not using this for deduping but using
|
|
// the gigabit vector in Msg40.cpp, so take out for now
|
|
//reply-> ptr_dedupSum = s->m_summary;
|
|
//reply->size_dedupSum = s->m_summaryLen+1;
|
|
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
|
|
//reply->m_diversity = s->getDiversity();
|
|
}
|
|
|
|
reply->m_numAlnumWords = 0;
|
|
if ( m_wordsValid )
|
|
reply->m_numAlnumWords = m_words.m_numAlnumWords;
|
|
|
|
// . we filter out search results that do not have all the query terms
|
|
// . Matches.cpp checks the link text, dmoz, etc. for all query terms
|
|
// . it must have gotten into the results from indexdb corruption
|
|
// . this filtering method is/was known as the "BIG HACK"
|
|
// . We also make sure that matches aren't based on
|
|
// . "anomalous" link text, where a doc has so many link texts
|
|
// . that most common dictionary terms appear in or around
|
|
// . a link to the site.
|
|
if ( m_req->size_qbuf > 1 ) {
|
|
Matches *mm = getMatches();
|
|
int32_t numInlinks = getLinkInfo1()->getNumLinkTexts( );
|
|
reply->m_hasAllQueryTerms = mm->docHasQueryTerms(numInlinks);
|
|
}
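// conceptually the check above asks: was every query term matched
// somewhere in the doc (body, title, link text, dmoz)? a stripped-down
// sketch of that test, assuming the caller has built a bitmask with bit i
// set when query term #i was matched (hypothetical helper, the real logic
// is in Matches.cpp):
//
//   static bool hasAllQueryTerms ( int64_t matchedMask , int32_t numTerms ) {
//           for ( int32_t i = 0 ; i < numTerms && i < 64 ; i++ )
//                   if ( ! ( matchedMask & ( 1LL << i ) ) )
//                           return false;
//           return true;
//   }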
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// copy the link info stuff?
|
|
if ( ! m_req->m_getLinkText ) {
|
|
reply->ptr_linkInfo = (char *)ptr_linkInfo1;
|
|
reply->size_linkInfo = size_linkInfo1;
|
|
}
|
|
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
bool getThatTitle = true;
|
|
if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
|
|
if ( reply->ptr_tbuf ) getThatTitle = false;
|
|
// if steve's requesting the inlink summary we will want to get
|
|
// the title of each linker even if they are spammy!
|
|
// only get title here if NOT getting link text otherwise
|
|
// we only get it down below if not a spammy voter, because
|
|
// this sets the damn slow sections class
|
|
if ( m_req->m_getLinkText &&
|
|
! m_useSiteLinkBuf &&
|
|
! m_usePageLinkBuf &&
|
|
// m_pbuf is used by pageparser.cpp now, not the other two things
|
|
// above this.
|
|
! m_pbuf )
|
|
getThatTitle = false;
|
|
|
|
// if steve is getting the inlinks, bad and good, for displaying
|
|
// then get the title here now... otherwise, if we are just spidering
|
|
// and getting the inlinks, do not bother getting the title because
|
|
// the inlink might be linkspam... and we check down below...
|
|
if ( ! m_req->m_onlyNeedGoodInlinks )
|
|
getThatTitle = true;
|
|
|
|
// ... no more seo so stop it... disable this for sp
|
|
if ( m_req->m_getLinkText )
|
|
getThatTitle = false;
|
|
|
|
if ( getThatTitle ) {
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
|
|
char *tit = ti->getTitle();
|
|
int32_t titLen = ti->getTitleLen();
|
|
reply-> ptr_tbuf = tit;
|
|
reply->size_tbuf = titLen + 1; // include \0
|
|
// sanity
|
|
if ( tit && tit[titLen] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
if ( ! tit || titLen <= 0 ) {
|
|
reply->ptr_tbuf = NULL;
|
|
reply->size_tbuf = 0;
|
|
}
|
|
}
|
|
|
|
// this is not documented because i don't think it will be popular
|
|
if ( m_req->m_getHeaderTag ) {
|
|
SafeBuf *htb = getHeaderTagBuf();
|
|
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
|
|
// . it should be null terminated
|
|
// . actually now it is a \0 separated list of the first
|
|
// few h1 tags
|
|
// . we call SafeBuf::pushChar(0) to add each one
|
|
reply->ptr_htag = htb->getBufStart();
|
|
reply->size_htag = htb->getLength();
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( m_req->m_getMatches && ! reply->ptr_mbuf ) {
|
|
MatchOffsets *mo = getMatchOffsets();
|
|
if ( ! mo || mo == (MatchOffsets *)-1) return (Msg20Reply *)mo;
|
|
reply-> ptr_mbuf = (char *)mo->m_matchOffsets;
|
|
reply->size_mbuf = mo->m_numMatches*4;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// get site
|
|
reply->ptr_site = ptr_site;
|
|
reply->size_site = size_site;
|
|
|
|
// assume unknown
|
|
reply->m_noArchive = 0;
|
|
// are we noarchive? only check this if not getting link text
|
|
if ( ! m_req->m_getLinkText ) {
|
|
char *na = getIsNoArchive();
|
|
if ( ! na || na == (char *)-1 ) return (Msg20Reply *)na;
|
|
reply->m_noArchive = *na;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
int32_t nowUTC2 = m_req->m_nowUTC;
|
|
if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet;
|
|
|
|
// . summary vector for deduping
|
|
// . does not compute anything if we should not! (svSize will be 0)
|
|
if ( ! reply->ptr_vbuf &&
|
|
m_req->m_getSummaryVector &&
|
|
cr->m_percentSimilarSummary > 0 &&
|
|
cr->m_percentSimilarSummary < 100 ) {
|
|
int32_t *sv = getSummaryVector ( );
|
|
if ( ! sv || sv == (void *)-1 ) return (Msg20Reply *)sv;
|
|
reply-> ptr_vbuf = (char *)m_summaryVec;
|
|
reply->size_vbuf = m_summaryVecSize;
|
|
}
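// the vector stored above gets compared against other results' vectors at
// merge time; two results dedup when they are >= m_percentSimilarSummary
// percent similar. a rough sketch of that kind of comparison on two
// sorted hash vectors (hypothetical helper, not the real dedup code):
//
//   static int32_t percentSimilar ( int32_t *a , int32_t na ,
//                                   int32_t *b , int32_t nb ) {
//           if ( na <= 0 || nb <= 0 ) return 0;
//           int32_t common = 0;
//           for ( int32_t i = 0 , j = 0 ; i < na && j < nb ; ) {
//                   if      ( a[i] < b[j] ) i++;
//                   else if ( a[i] > b[j] ) j++;
//                   else { common++; i++; j++; }
//           }
//           int32_t smaller = ( na < nb ) ? na : nb;
//           return ( common * 100 ) / smaller;
//   }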
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
if ( m_req->m_numSummaryLines > 0 ) {
|
|
// turn off for now since we added this to posdb
|
|
uint8_t *sl = getSummaryLangId();
|
|
if ( ! sl || sl == (void *)-1 ) return (Msg20Reply *)sl;
|
|
reply->m_summaryLanguage = *sl;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// returns values of specified meta tags
|
|
if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) {
|
|
int32_t dsize; char *d;
|
|
d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
|
|
if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d;
|
|
reply->ptr_dbuf = d;
|
|
reply->size_dbuf = dsize; // includes \0
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// . sample buffer for doing gigabit generation
|
|
// . Msg40.cpp calls intersectGigabits on all these samples from
|
|
// all the Msg20Replies it gets in the search results
|
|
//if ( ! reply->ptr_gigabitQuery && m_req->m_bigSampleMaxLen > 0 ) {
|
|
if ( ! reply->ptr_gigabitSample && m_req->m_bigSampleMaxLen > 0 ) {
|
|
// before we got a chunk of text from the doc
|
|
SafeBuf *gsbuf = getSampleForGigabits();
|
|
if ( ! gsbuf||gsbuf ==(void *)-1) return (Msg20Reply *)gsbuf;
|
|
reply->ptr_gigabitSample = gsbuf->getBufStart();
|
|
reply->size_gigabitSample = gsbuf->length();
|
|
// . now we use the gigabit query!
|
|
// . this is really used to find out what wikipedia pages
|
|
// we match the best...
|
|
// . this also sets the vector
|
|
/*
|
|
char *gq = getGigabitQuery();
|
|
if ( ! gq || gq == (char *)-1) return (Msg20Reply *)gq;
|
|
reply-> ptr_gigabitQuery = m_gigabitQuery;
|
|
reply->size_gigabitQuery = gbstrlen(m_gigabitQuery)+1;
|
|
reply-> ptr_gigabitScores = ptr_gigabitScores;
|
|
reply->size_gigabitScores = size_gigabitScores;
|
|
*/
|
|
}
|
|
|
|
// get full image url. but not if we already have a thumbnail...
|
|
if ( ! reply->ptr_imgUrl&&!reply->ptr_imgData&&!m_req->m_getLinkText){
|
|
// && m_req->m_getImageUrl ) {
|
|
char **iu = getImageUrl();
|
|
if ( ! iu || iu == (char **)-1 ) return (Msg20Reply *)iu;
|
|
reply-> ptr_imgUrl = *iu;
|
|
reply->size_imgUrl = 0;
|
|
if ( *iu ) reply->size_imgUrl = gbstrlen(*iu)+1;
|
|
}
|
|
|
|
// get thumbnail image DATA
|
|
if ( ! reply->ptr_imgData && ! m_req->m_getLinkText ) {
|
|
// && m_req->m_getImageUrl ) {
|
|
reply-> ptr_imgData = ptr_imageData;
|
|
reply->size_imgData = size_imageData;
|
|
}
|
|
|
|
// . adids contained in the doc
|
|
// . get from title rec rather than generating
|
|
// . but we need to generate to store in titleRec at index time
|
|
// . they are 32 bits each
|
|
int64_t **avp = getAdVector();
|
|
if ( ! avp || avp == (void *)-1 ) return (Msg20Reply *)avp;
|
|
|
|
// get firstip
|
|
int32_t *fip = getFirstIp();
|
|
if ( ! fip || fip == (void *)-1 ) return (Msg20Reply *)fip;
|
|
|
|
|
|
//Url **redir = getRedirUrl();
|
|
//if ( ! redir || redir == (Url **)-1 ) return (Msg20Reply *)redir;
|
|
//int32_t redirSize = 0;
|
|
//if ( *redir ) redirSize = (*redir)->getUrlLen() + 1;
|
|
//char *ru = NULL;
|
|
//if ( *redir ) ru = (*redir)->getUrl();
|
|
char *ru = ptr_redirUrl;
|
|
int32_t rulen = 0;
|
|
if ( ru ) rulen = gbstrlen(ru)+1;
|
|
|
|
// . Msg25.cpp uses m_adIdHash for restricting voting
|
|
// . these are 64 bit termids hashes
|
|
reply-> ptr_gbAdIds = (char *)*avp;
|
|
// this size is in bytes and includes the \0
|
|
reply->size_gbAdIds = size_adVector;
|
|
|
|
// need full cached page of each search result?
|
|
// include it always for spider status docs.
|
|
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
|
|
reply-> ptr_content = ptr_utf8Content;
|
|
reply->size_content = size_utf8Content;
|
|
}
|
|
|
|
// if ( m_req->m_getSectionVotingInfo && m_tmpBuf3.getCapacity() <=0) {
|
|
// Sections *ss = getSections();
|
|
// if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
|
|
// // will at least store a \0 in there, but will not count
|
|
// // as part of the m_tmpBuf.length()
|
|
// ss->printVotingInfoInJSON ( &m_tmpBuf3 );
|
|
// reply-> ptr_sectionVotingInfo = m_tmpBuf3.getBufStart();
|
|
// reply->size_sectionVotingInfo = m_tmpBuf3.length() + 1;
|
|
// }
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// do they want to know if this doc has an outlink to a url
|
|
// that has the provided site and domain hash, Msg20Request::
|
|
// m_ourHostHash32 and m_ourDomHash32?
|
|
int32_t nl = 0;
|
|
if ( links ) nl = links->getNumLinks();
|
|
// scan all outlinks we have on this page
|
|
int32_t i ; for ( i = 0 ; i < nl ; i++ ) {
|
|
// get the normalized url
|
|
//char *url = links->getLinkPtr(i);
|
|
// get the site. this will not block or have an error.
|
|
int32_t hh32 = (int32_t)((uint32_t)links->getHostHash64(i));
|
|
if ( hh32 == m_req->m_ourHostHash32 ) break;
|
|
int32_t dh32 = links->getDomHash32(i);
|
|
if ( dh32 == m_req->m_ourDomHash32 ) break;
|
|
}
|
|
reply->m_hasLinkToOurDomOrHost = false;
|
|
if ( i < nl )
|
|
reply->m_hasLinkToOurDomOrHost = true;
|
|
|
|
|
|
// easy ones
|
|
reply->m_isPermalink = m_isPermalink;
|
|
reply->m_ip = m_ip;
|
|
reply->m_firstIp = *fip;
|
|
reply->m_domHash = getDomHash32();//domHash;
|
|
reply->m_docId = m_docId;
|
|
reply->m_urlHash48 = getFirstUrlHash48();
|
|
reply->m_contentLen = size_utf8Content;
|
|
reply->m_lastSpidered = getSpideredTime();//m_spideredTime;
|
|
reply->m_datedbDate = m_pubDate;
|
|
reply->m_firstIndexedDate = m_firstIndexedDate;
|
|
reply->m_firstSpidered = m_firstIndexedDate;
|
|
reply->m_contentType = m_contentType;
|
|
reply->m_hostHash = getHostHash32a();
|
|
//reply->m_contentHash = *getContentHash32();
|
|
reply->m_language = m_langId;
|
|
reply->m_country = *getCountryId();
|
|
//reply->m_hasAllQueryTerms = false;
|
|
reply->m_hopcount = m_hopCount;
|
|
reply->m_siteRank = getSiteRank();
|
|
|
|
reply->ptr_ubuf = getFirstUrl()->getUrl();
|
|
reply->ptr_rubuf = ru;
|
|
reply->ptr_catIds = ptr_catIds;
|
|
reply->ptr_indCatIds = ptr_indCatIds;
|
|
reply->ptr_dmozTitles = ptr_dmozTitles;
|
|
reply->ptr_dmozSumms = ptr_dmozSumms;
|
|
reply->ptr_dmozAnchors = ptr_dmozAnchors;
|
|
reply->ptr_metadataBuf = ptr_metadata;
|
|
|
|
|
|
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
|
|
reply->size_rubuf = rulen;
|
|
reply->size_catIds = size_catIds;
|
|
reply->size_indCatIds = size_indCatIds;
|
|
reply->size_dmozTitles = size_dmozTitles;
|
|
reply->size_dmozSumms = size_dmozSumms;
|
|
reply->size_dmozAnchors = size_dmozAnchors;
|
|
reply->size_metadataBuf = size_metadata;
|
|
|
|
|
|
// breathe
|
|
QUICKPOLL( m_req->m_niceness );
|
|
|
|
/*
|
|
// truncate if necessary (buzz)
|
|
int32_t maxLen = 150000;
|
|
// truncate it?
|
|
bool trunc = true;
|
|
// not if getting link text
|
|
if ( req->m_getLinkText ) trunc = false;
|
|
// or outlinks
|
|
if ( req->m_getOutlinks ) trunc = false;
|
|
// or any niceness 1+ for that matter, that indicates a build operation
|
|
if ( req->m_niceness > 0 ) trunc = false;
|
|
// this is causing us to get EMISSINGQUERYTERMS errors!!!
|
|
trunc = false;
|
|
// MDW: shorten for speed test
|
|
//int32_t maxLen = 1000;
|
|
if ( trunc && contentLen > maxLen+1 ) {
|
|
contentLen = maxLen;
|
|
content [maxLen ] = '\0';
|
|
}
|
|
*/
|
|
|
|
// check the tag first
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
|
|
//Tag *tag1 = gr->getTag ("sitenuminlinks");
|
|
//Tag *tag2 = gr->getTag ("sitepop");
|
|
//int32_t sni = 0;
|
|
//int32_t spop = 0;
|
|
//if ( tag1 ) sni = atol(tag1->m_data);
|
|
//if ( tag2 ) spop = atol(tag2->m_data);
|
|
reply->m_siteNumInlinks = m_siteNumInlinks;
|
|
//reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
|
|
//reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
|
|
//reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
|
|
//reply->m_sitePop = m_sitePop;
|
|
|
|
// . get stuff from link info
|
|
// . this is so fast, just do it for all Msg20 requests
|
|
// . no! think about it -- this can be huge for pages like
|
|
// google.com!!!
|
|
LinkInfo *info1 = ptr_linkInfo1;
|
|
if ( info1 ) { // && m_req->m_getLinkInfo ) {
|
|
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
|
|
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
|
|
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;
|
|
reply->m_pageNumUniqueCBlocks = info1->m_numUniqueCBlocks;
|
|
reply->m_pageInlinksLastUpdated = info1->m_lastUpdated;
|
|
//reply->m_pagePop = 0;//info1->m_pagePop;
|
|
//reply->m_siteNumInlinks = info1->m_siteNumInlinks;
|
|
//reply->m_sitePop = info1->m_sitePop;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// getLinkText is true if we are getting the anchor text for a
|
|
// supplied url as part of the SPIDER process..
|
|
// this was done by Msg23 before
|
|
if ( ! m_req->m_getLinkText ) {
|
|
m_replyValid = true;
|
|
return &m_reply;
|
|
}
|
|
|
|
// use the first url of the linker by default
|
|
Url *linker = &m_firstUrl;
|
|
|
|
// the base url, used for doing links: terms, is the final url,
|
|
// just in case there were any redirects
|
|
Url redir;
|
|
if ( ru ) {
|
|
redir.set ( ru );
|
|
linker = &redir;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// . get score weight of link text
|
|
// . phase out the sitedb*.xml files
|
|
//int64_t x[] = {0,20,30,40,50,70,90,100}; qualities!
|
|
// map these siteNumInlinks (x) to a weight (y)
|
|
//int64_t x[] = {0,50,100,200,500,3000,10000,50000};
|
|
// these are the weights the link text will receive
|
|
//int64_t y[] = {10,30,2000,3000,4000,5000,6000,7000};
|
|
// sanity check
|
|
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
// shortcut
|
|
//int32_t sni = m_siteNumInlinks;// *getSiteNumInlinks();
|
|
// get the final link text weight as a percentage
|
|
//int32_t ltw = getY ( m_siteNumInlinks , x , y , 8 );
|
|
// store the weight in the reply
|
|
//reply->m_linkTextScoreWeight = ltw;
|
|
|
|
//log(LOG_DEBUG,"build: got score weight of %"INT32" for sni=%"INT32"",
|
|
// (int32_t)reply->m_linkTextScoreWeight, m_siteNumInlinks);
|
|
|
|
// breathe
|
|
//QUICKPOLL( m_niceness );
|
|
|
|
// . we need the mid doma hash in addition to the ip domain because
|
|
// chat.yahoo.com has different ip domain than www.yahoo.com , ...
|
|
// and we don't want them both to be able to vote
|
|
// . the reply is zeroed out in call the reply->reset() above so
|
|
// if this is not yet set it will be 0
|
|
if ( reply->m_midDomHash == 0 ) {
|
|
char *m = linker->getMidDomain();
|
|
int32_t mlen = linker->getMidDomainLen();
|
|
reply->m_midDomHash = hash32 ( m , mlen );
|
|
}
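// worked example of the mid domain idea: "chat.yahoo.com" and
// "www.yahoo.com" both have mid domain "yahoo", so they produce the same
// key and Msg25 will only let one of them vote:
//
//   int32_t h1 = hash32 ( "yahoo" , 5 );   // from chat.yahoo.com
//   int32_t h2 = hash32 ( "yahoo" , 5 );   // from www.yahoo.com
//   // h1 == h2, so the two linkers collapse into one vote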
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
|
|
// if not set from above, set it here
|
|
if ( ! links ) links = getLinks ( true ); // do quick set?
|
|
if ( ! links || links == (Links *)-1 ) return (Msg20Reply *)links;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Msg20Reply *)pos;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Msg20Reply *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Msg20Reply *)xml;
|
|
//Sections *ss = getSections();
|
|
//if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
|
|
|
|
// . is this page a dynamic page?
|
|
// . like a guestbook, access log stats, etc.
|
|
// . we don't like to count such pages for links analysis because
|
|
// they can be spammed so easily
|
|
// . TODO: guestbooks and message boards typically contain cgi links
|
|
// can we use that to identify?
|
|
// . the coll size includes the \0
|
|
//CollectionRec *cr ;
|
|
//cr = g_collectiondb.getRec ( m_req->ptr_coll,m_req->size_coll-1);
|
|
// g_errno should be ENOCOLLREC
|
|
//if ( ! cr ) return NULL;
|
|
|
|
// . we want link text for this url, "linkee"
|
|
// . TODO: true --> add "www" to see if that fixes our problem
|
|
// i guess Links.cpp does that with the outlinks, so when
|
|
// Linkdb::fillList() uses Links.cpp, the outlinks have "www"
|
|
// prepended on them...
|
|
//Url linkee;
|
|
//linkee.set ( m_req->ptr_linkee , m_req->size_linkee );
|
|
|
|
// get a ptr to the link in the content. will point to the
|
|
// stuff in the href field of the anchor tag. used for seeing if
|
|
// we have bad links or not.
|
|
int32_t linkNode = -1;
|
|
int32_t linkNum = -1;
|
|
// . get associated link text from the linker's document for our "url"
|
|
// . only gets from FIRST link to us
|
|
// . TODO: allow more link text from better quality pages?
|
|
// . TODO: limit score based on link text length?
|
|
// . should always be NULL terminated
|
|
// . should not break in the middle of a word
|
|
// . this will return the item/entry if we are extracting from an
|
|
// rss/atom feed
|
|
char *rssItem = NULL;
|
|
int32_t rssItemLen = 0;
|
|
// store link text in here
|
|
//char linkTextBuf[MAX_LINK_TEXT_LEN];
|
|
|
|
//
|
|
// TODO: for getting siteinlinks just match the site in the url
|
|
// not the full url... and maybe match the one with the shortest path.
|
|
//
|
|
|
|
// . get the link text
|
|
// . linkee might be a site if m_isSiteLinkInfo is true in which
|
|
// case we get the best inlink to that site, and linkee is
|
|
// something like blogspot.com/mary/ or some other site.
|
|
int32_t blen = links->getLinkText ( m_req->ptr_linkee ,//&linkee,
|
|
m_req->m_isSiteLinkInfo ,
|
|
m_linkTextBuf ,
|
|
MAX_LINK_TEXT_LEN-2 ,
|
|
&rssItem ,
|
|
&rssItemLen ,
|
|
&linkNode ,
|
|
&linkNum ,
|
|
m_niceness );
|
|
|
|
|
|
// . BUT this skips the news topic stuff too. bad?
|
|
// . THIS HAPPENED before because we were truncating the xml(see above)
|
|
if ( linkNode < 0 ) {
|
|
|
|
int64_t took = gettimeofdayInMilliseconds() - start;
|
|
if ( took > 100 )
|
|
log("build: took %"INT64" ms to get link text for "
|
|
"%s from linker %s",
|
|
took,
|
|
m_req->ptr_linkee,
|
|
m_firstUrl.m_url );
|
|
|
|
logf(LOG_DEBUG,"build: Got linknode = %"INT32" < 0. Cached "
|
|
"linker %s does not have outlink to %s like linkdb "
|
|
"says it should. page is probably too big and the "
|
|
"outlink is past our limit. contentLen=%"INT32". or "
|
|
"a sitehash collision, or an area tag link.",
|
|
linkNode,getFirstUrl()->getUrl(),m_req->ptr_linkee,
|
|
m_xml.getContentLen());
|
|
//g_errno = ECORRUPTDATA;
|
|
// do not let multicast forward to a twin! so use this instead
|
|
// of ECORRUTPDATA
|
|
g_errno = EBADENGINEER;
|
|
//char *xx=NULL;*xx=0;
|
|
return NULL;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
if ( ! verifyUtf8 ( m_linkTextBuf , blen ) ) {
|
|
log("xmldoc: bad OUT link text from url=%s for %s",
|
|
m_req->ptr_linkee,m_firstUrl.m_url);
|
|
m_linkTextBuf[0] = '\0';
|
|
blen = 0;
|
|
}
|
|
|
|
// verify for rss as well. seems like we end up coring because
|
|
// length/size get out of sync and [size-1] != '\0' sometimes
|
|
if ( ! verifyUtf8 ( rssItem , rssItemLen ) ) {
|
|
log("xmldoc: bad RSS ITEM text from url=%s for %s",
|
|
m_req->ptr_linkee,m_firstUrl.m_url);
|
|
rssItem[0] = '\0';
|
|
rssItemLen = 0;
|
|
}
|
|
|
|
// point to it, include the \0.
|
|
if ( blen > 0 ) {
|
|
reply->ptr_linkText = m_linkTextBuf;
|
|
// save the size into the reply, include the \0
|
|
reply->size_linkText = blen + 1;
|
|
// sanity check
|
|
if ( blen + 2 > MAX_LINK_TEXT_LEN ) { char *xx=NULL;*xx=0; }
|
|
// sanity check. null termination required.
|
|
if ( m_linkTextBuf[blen] ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// . the link we link to
|
|
// . important when getting site info because the link url
|
|
// can be different than the root url!
|
|
reply-> ptr_linkUrl = links->getLink (linkNum);
|
|
reply->size_linkUrl = links->getLinkLen(linkNum)+1;
|
|
|
|
// save the rss item in our state so we can point to it, include \0
|
|
//if(rssItemLen > MAX_RSSITEM_SIZE-2 ) rssItemLen = MAX_RSSITEM_SIZE-2;
|
|
//char rssItemBuf[MAX_RSSITEM_SIZE];
|
|
if ( rssItemLen > MAX_RSSITEM_SIZE )
|
|
rssItemLen = MAX_RSSITEM_SIZE;
|
|
if ( rssItemLen > 0) {
|
|
m_rssItemBuf.safeMemcpy ( rssItem , rssItemLen );
|
|
m_rssItemBuf.pushChar('\0');
|
|
// gbmemcpy ( rssItemBuf, rssItem , rssItemLen );
|
|
// // NULL terminate it
|
|
// rssItemBuf[rssItemLen] = 0;
|
|
}
|
|
|
|
// point to it, include the \0
|
|
if ( rssItemLen > 0 ) {
|
|
reply->ptr_rssItem = m_rssItemBuf.getBufStart();
|
|
reply->size_rssItem = m_rssItemBuf.getLength();
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
if ( ! m_req->m_doLinkSpamCheck )
|
|
reply->m_isLinkSpam = false;
|
|
|
|
if ( m_req->m_doLinkSpamCheck ) {
|
|
// reset to NULL to avoid gbstrlen segfault
|
|
char *note = NULL;
|
|
// need this
|
|
if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; }
|
|
// time it
|
|
//int64_t start = gettimeofdayInMilliseconds();
|
|
|
|
Url linkeeUrl;
|
|
linkeeUrl.set ( m_req->ptr_linkee );
|
|
|
|
// get it. does not block.
|
|
reply->m_isLinkSpam = ::isLinkSpam ( linker ,
|
|
m_ip ,
|
|
ptr_indCatIds ,
|
|
size_indCatIds / 4 ,
|
|
m_siteNumInlinks,
|
|
&m_xml,
|
|
links,
|
|
// if doc length more
|
|
// than 150k then consider
|
|
// it linkspam
|
|
// automatically so it
|
|
// can't vote
|
|
150000,//MAXDOCLEN//150000
|
|
¬e ,
|
|
&linkeeUrl , // url ,
|
|
linkNode ,
|
|
cr->m_coll ,
|
|
m_niceness );
|
|
// store it
|
|
if ( note ) {
|
|
// include the \0
|
|
reply->ptr_note = note;
|
|
reply->size_note = gbstrlen(note)+1;
|
|
}
|
|
// log the reason why it is a link spam page
|
|
if ( reply->m_isLinkSpam )
|
|
log(LOG_DEBUG,"build: linker %s: %s.",
|
|
linker->getUrl(),note);
|
|
// sanity
|
|
if ( reply->m_isLinkSpam && ! note )
|
|
log("linkspam: missing note for d=%"INT64"!",m_docId);
|
|
// store times... nah, might have yielded cpu!
|
|
reply->m_timeLinkSpam = 0;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// sanity check
|
|
if ( reply->ptr_rssItem &&
|
|
reply->size_rssItem>0 &&
|
|
reply->ptr_rssItem[reply->size_rssItem-1]!=0) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
|
|
//log ("nogl=%"INT32"",(int32_t)m_req->m_onlyNeedGoodInlinks );
|
|
|
|
// . skip all this junk if we are a spammy voter
|
|
// . we get the title above in "getThatTitle"
|
|
if ( reply->m_isLinkSpam ) {
|
|
m_replyValid = true; return reply; }
|
|
|
|
// . this vector is set from a sample of the entire doc
|
|
// . it is used to dedup voters in Msg25.cpp
|
|
// . this has pretty much been replaced by vector2, it was
|
|
// also saying a doc was a dup if all its words were
|
|
// contained by another, like if it was a small subset, which
|
|
// wasn't the best behaviour.
|
|
// . yeah neighborhood text is much better and this is setting
|
|
// the slow sections class, so i took it out
|
|
getPageSampleVector ();
|
|
// must not block or error out. sanity check
|
|
if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
|
|
//st->m_v1.setPairHashes ( ww , -1 , m_niceness );
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
//st->m_v2.setPairHashes ( ww,linkWordNum, m_niceness );
|
|
// . this vector is set from the text after the link text
|
|
// . it terminates at at a breaking tag
|
|
// . check it out in ~/fff/src/Msg20.cpp
|
|
getPostLinkTextVector ( linkNode );
|
|
// must not block or error out. sanity check
|
|
//if ( ! m_postLinkTextVecValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// set from the hashes of the tag id pairs
|
|
//st->m_v3.setTagPairHashes ( xml , m_niceness );
|
|
// get it
|
|
getTagPairHashVector();
|
|
// must not block or error out. sanity check
|
|
if ( ! m_tagPairHashVecValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// breathe
|
|
QUICKPOLL( m_niceness );
|
|
|
|
// this vector is set from the hashes of the path components
|
|
// with punctuation stripped out
|
|
//v4.set ( xml, NULL , linker, -1 ,buf4,size);
|
|
// . the 4th vector is provided, this will point to m_topIps[] buffer
|
|
// . this is temporarily disabled
|
|
// . this is the top 2 bytes of the ips of each inlink
|
|
// . we were looking this info up in linkdb
|
|
// . so if two good inlinkers had their inlinks from the same ip
|
|
// neighborhoods, then one would have its voting power "deduped".
|
|
// . see the old LinkText.cpp for the logic that read these from linkdb
|
|
//v5.set2 ( (char *)incomingIps , numIncomingIps );
|
|
|
|
// reference the vectors in our reply
|
|
reply-> ptr_vector1 = m_pageSampleVec;//(char *)&st->m_v1;
|
|
reply->size_vector1 = m_pageSampleVecSize;//st->m_v1.getSize();
|
|
reply-> ptr_vector2 = m_postVec;//(char *)&st->m_v2;
|
|
reply->size_vector2 = m_postVecSize;//st->m_v2.getSize();
|
|
reply-> ptr_vector3 = m_tagPairHashVec; // (char *)&st->m_v3;
|
|
reply->size_vector3 = m_tagPairHashVecSize;//st->m_v3.getSize();
|
|
|
|
// crap, we gotta bubble sort these i think
|
|
// but only tag pair hash vec
|
|
bool flag = true;
|
|
uint32_t *d = (uint32_t *)m_tagPairHashVec;
|
|
// exclude the terminating 0 int32_t
|
|
int32_t nd = (m_tagPairHashVecSize / 4) - 1;
|
|
while ( flag ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
flag = false;
|
|
for ( int32_t i = 1 ; i < nd ; i++ ) {
|
|
if ( d[i-1] <= d[i] ) continue;
|
|
uint32_t tmp = d[i-1];
|
|
d[i-1] = d[i];
|
|
d[i] = tmp;
|
|
flag = true;
|
|
}
|
|
}
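// note: since m_tagPairHashVec is just a flat array of uint32_t's, the
// bubble sort above could also be done with std::sort (illustrative only;
// it would drop the QUICKPOLL yields between passes):
//
//   #include <algorithm>
//   std::sort ( d , d + nd );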
|
|
|
|
|
|
// just always do it
|
|
//if ( ! req->m_getInlinkNeighborhoods ) return true;
|
|
|
|
// convert "linkNode" into a string ptr into the document
|
|
char *node = xml->getNodePtr(linkNode)->m_node;
|
|
// . find the word index, "n" for this node
|
|
// . this is INEFFICIENT!!
|
|
char **wp = ww->getWords();
|
|
int32_t nw = ww->getNumWords();
|
|
int32_t n;
|
|
for ( n = 0; n < nw && wp[n] < node ; n++ )
|
|
QUICKPOLL(m_niceness);
|
|
// sanity check
|
|
//if ( n >= nw ) { char *xx=NULL; *xx=0; }
|
|
if ( n >= nw ) {
|
|
log("links: crazy! could not get word before linknode");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
|
|
// get the ptrs to the sections, 1-1 with words
|
|
//Section **sp = NULL;
|
|
//if ( ss ) sp = ss->m_sectionPtrs;
|
|
// . even tags in the article section have positive scores
|
|
// . the scores array is 1-1 with the words in Words, not the nodes
|
|
// in Xml. so we had to do that conversion.
|
|
//if ( ! sp || !(sp[n]->m_flags & NOINDEXFLAGS) )
|
|
// reply->m_outlinkInContent = true;
|
|
|
|
//
|
|
// get the surrounding link text, around "linkNode"
|
|
//
|
|
// radius of 80 characters around n
|
|
char sbuf[1201];
|
|
int32_t radius = 80;
|
|
char *p = sbuf;
|
|
char *pend = sbuf + 600;
|
|
// . make a neighborhood in the "words" space [a,b]
|
|
// . radius is in characters, so "convert" into words by dividing by 5
|
|
int32_t a = n - radius / 5;
|
|
int32_t b = n + radius / 5;
|
|
if ( a < 0 ) a = 0;
|
|
if ( b > nw ) b = nw;
|
|
int32_t *pp = pos->m_pos;
|
|
int32_t len;
|
|
// if too big, shrink the bigger side, a or b?
|
|
while ( (len=pp[b]-pp[a]) >= 2 * radius + 1 ) {
|
|
// decrease the largest, a or b
|
|
if ( a<n && (pp[n]-pp[a])>(pp[b]-pp[n])) a++;
|
|
else if ( b>n ) b--;
|
|
}
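// worked example of the shrink loop: say radius is 80, so the window must
// fit in 2*80+1 = 161 chars. if pp[b]-pp[a] starts at 200 and the link
// word n sits closer to b than to a, then a gets bumped up first (its
// side is wider), and we keep trimming whichever side of n is wider until
// the span drops under 161.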
|
|
// only store it if we can
|
|
if ( p + len + 1 < pend ) {
|
|
// store it
|
|
// FILTER the html entities!!
|
|
int32_t len2 = pos->filter(p,pend,ww,a,b,NULL);//ss);
|
|
// ensure NULL terminated
|
|
p[len2] = '\0';
|
|
// store in reply. it will be serialized when sent.
|
|
// thanks to isj for finding this bug fix.
|
|
m_surroundingTextBuf.safeMemcpy ( p , len2 + 1 );
|
|
reply->ptr_surroundingText =m_surroundingTextBuf.getBufStart();
|
|
reply->size_surroundingText=m_surroundingTextBuf.getLength();
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// get title? its slow because it sets the sections class
|
|
if ( m_req->m_titleMaxLen > 0 && ! reply->ptr_tbuf &&
|
|
// don't get it anymore if getting link info because it
|
|
// is slow...
|
|
getThatTitle ) {
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
|
|
char *tit = ti->getTitle();
|
|
int32_t titLen = ti->getTitleLen();
|
|
reply-> ptr_tbuf = tit;
|
|
reply->size_tbuf = titLen + 1; // include \0
|
|
if ( ! tit || titLen <= 0 ) {
|
|
reply->ptr_tbuf = NULL;
|
|
reply->size_tbuf = 0;
|
|
}
|
|
}
|
|
|
|
int64_t took = gettimeofdayInMilliseconds() - start;
|
|
if ( took > 100 )
|
|
log("build: took %"INT64" ms to get link text for "
|
|
"%s from linker %s",
|
|
took,
|
|
m_req->ptr_linkee,
|
|
m_firstUrl.m_url );
|
|
|
|
|
|
m_replyValid = true;
|
|
return reply;
|
|
}
|
|
|
|
//static void gotMsg5ListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
|
|
// XmlDoc *THIS = (XmlDoc *)state;
|
|
// THIS->m_masterLoop ( THIS->m_masterState );
|
|
//}
|
|
|
|
|
|
char **XmlDoc::getDiffbotPrimaryImageUrl ( ) {
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (char **)jp;
|
|
|
|
JsonItem *ji = jp->getFirstItem();
|
|
|
|
// assume none
|
|
m_imageUrl2 = NULL;
|
|
m_imageUrl2Valid = true;
|
|
|
|
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
|
|
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not number or string
|
|
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
|
continue;
|
|
|
|
//char *topName = NULL;
|
|
// what name level are we?
|
|
// int32_t numNames = 1;
|
|
// JsonItem *pi = ji->m_parent;
|
|
// for ( ; pi ; pi = pi->m_parent ) {
|
|
// // empty name?
|
|
// if ( ! pi->m_name ) continue;
|
|
// if ( ! pi->m_name[0] ) continue;
|
|
// topName = pi->m_name;
|
|
// numNames++;
|
|
// }
|
|
|
|
char *name0 = ji->m_name;
|
|
char *name1 = NULL;
|
|
char *name2 = NULL;
|
|
if ( ji->m_parent )
|
|
name1 = ji->m_parent->m_name;
|
|
if ( ji->m_parent && ji->m_parent->m_parent )
|
|
name2 = ji->m_parent->m_parent->m_name;
|
|
|
|
// stop at first image for "images":[{ indicator
|
|
if ( strcmp(name0,"url") == 0 &&
|
|
name1 &&
|
|
strcmp(name1,"images") == 0 )
|
|
break;
|
|
|
|
|
|
// for products
|
|
if ( strcmp(name0,"link") == 0 &&
|
|
name1 &&
|
|
strcmp(name1,"media") == 0 )
|
|
break;
|
|
}
|
|
|
|
|
|
if ( ! ji )
|
|
return &m_imageUrl2;
|
|
|
|
int32_t vlen;
|
|
char *val = ji->getValueAsString( &vlen );
|
|
|
|
// ok, we got it, just copy that
|
|
m_imageUrlBuf2.safeMemcpy ( val , vlen );
|
|
m_imageUrlBuf2.nullTerm();
|
|
m_imageUrl2 = m_imageUrlBuf2.getBufStart();
|
|
return &m_imageUrl2;
|
|
}
|
|
|
|
// get the image url SPECIFIED by the page, so there is no guesswork here
|
|
// unlike with the Images.cpp class
|
|
char **XmlDoc::getImageUrl() {
|
|
// return if valid
|
|
if ( m_imageUrlValid ) return &m_imageUrl;
|
|
// get first url
|
|
Url *f = getFirstUrl();
|
|
if ( ! f || f == (Url *)-1 ) return (char **)f;
|
|
|
|
// assume none
|
|
m_imageUrl = NULL;
|
|
m_imageUrlValid = true;
|
|
|
|
// we use getDiffbotPrimaryImageUrl() above for doing thumbs
|
|
if ( m_isDiffbotJSONObject || m_contentType == CT_JSON )
|
|
return &m_imageUrl;
|
|
|
|
// all done if not youtube or meta cafe
|
|
char *host = f->getHost();
|
|
char found = 0;
|
|
if ( ! strncmp ( host , "www.youtube.com/" , 16 ) ) found = 1;
|
|
if ( ! strncmp ( host , "youtube.com/" , 12 ) ) found = 1;
|
|
if ( ! strncmp ( host , "www.metacafe.com/" , 17 ) ) found = 2;
|
|
if ( ! strncmp ( host , "metacafe.com/" , 13 ) ) found = 2;
|
|
if ( ! found ) return &m_imageUrl;
|
|
// char ptr
|
|
char *u = f->getUrl();
|
|
// make it
|
|
if ( found == 1 ) {
|
|
char *s = strstr(u,"v=");
|
|
// if url does not contain a "v=" then forget it
|
|
if ( ! s ) return &m_imageUrl;
|
|
// point to the id
|
|
s += 2;
|
|
//m_imageUrl = m_imageUrlBuf;
|
|
//char *p = m_imageUrlBuf;
|
|
m_imageUrlBuf.safeStrcpy("http://img.youtube.com/vi/");
|
|
// do not break
|
|
//char *pend = m_imageUrlBuf + 80;
|
|
// copy the id/number
|
|
//for ( ; is_digit(*s) && p < pend ; ) *p++ = *s++;
|
|
for ( ; is_digit(*s) ; s++ )
|
|
m_imageUrlBuf.pushChar(*s);
|
|
// wrap it up
|
|
m_imageUrlBuf.safeStrcpy ( "/2.jpg" );
|
|
// size includes \0;
|
|
//m_imageUrlSize = p - m_imageUrl ;
|
|
// sanity check
|
|
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
|
|
m_imageUrl = m_imageUrlBuf.getBufStart();
|
|
return &m_imageUrl;
|
|
}
|
|
// must be meta cafe now
|
|
// http://www.metacafe.com/watch/559561/surfer_girls_vol_2/
|
|
// http://s2.mcstatic.com/thumb/559561.jpg
|
|
// scan url path for first digit
|
|
for ( char *t = f->getPath() ; *t ; t++ ) {
|
|
// look for digit
|
|
if ( ! is_digit ( *t ) ) continue;
|
|
// grab that
|
|
int32_t id = atol ( t );
|
|
// skip if not good
|
|
if ( id <= 0 ) continue;
|
|
// make the url
|
|
//m_imageUrl = m_imageUrlBuf;
|
|
//char *p = m_imageUrlBuf;
|
|
//gbmemcpy ( p , "http://s2.mcstatic.com/thumb/" , 29 );
|
|
//p += 29;
|
|
//p += sprintf ( p , "%"INT32"" , id );
|
|
//gbmemcpy ( p , ".jpg\0" , 5 );
|
|
//p += 5;
|
|
m_imageUrlBuf.safePrintf("http://s2.mcstatic."
|
|
"com/thumb/%"INT32".jpg", id);
|
|
m_imageUrl = m_imageUrlBuf.getBufStart();
|
|
// size includes \0;
|
|
//m_imageUrlSize = p - m_imageUrl ;
|
|
// sanity check
|
|
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
|
|
break;
|
|
}
|
|
return &m_imageUrl;
|
|
}
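// illustrative examples of the two mappings above (ids shown are made up):
//   http://www.youtube.com/watch?v=123456
//     -> http://img.youtube.com/vi/123456/2.jpg
//   http://www.metacafe.com/watch/559561/surfer_girls_vol_2/
//     -> http://s2.mcstatic.com/thumb/559561.jpg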
|
|
|
|
|
|
MatchOffsets *XmlDoc::getMatchOffsets () {
|
|
// return it if it is set
|
|
if ( m_matchOffsetsValid ) return &m_matchOffsets;
|
|
|
|
// need a buncha crap
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (MatchOffsets *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (MatchOffsets *)xml;
|
|
Matches *mm = getMatches();
|
|
if ( ! mm || mm == (Matches *)-1 ) return (MatchOffsets *)mm;
|
|
|
|
m_matchOffsets.set ( xml , ww , mm , true ); // getMatches=true
|
|
m_matchOffsetsValid = true;
|
|
return &m_matchOffsets;
|
|
}
|
|
|
|
Query *XmlDoc::getQuery() {
|
|
if ( m_queryValid ) return &m_query;
|
|
// bail if no query
|
|
if ( ! m_req || ! m_req->ptr_qbuf ) {
|
|
m_queryValid = true;
|
|
return &m_query;
|
|
}
|
|
// return NULL with g_errno set on error
|
|
if ( ! m_query.set2( m_req->ptr_qbuf ,
|
|
m_req->m_langId ,
|
|
true ) ) return NULL;
|
|
m_queryValid = true;
|
|
return &m_query;
|
|
}
|
|
|
|
Matches *XmlDoc::getMatches () {
|
|
// return it if it is set
|
|
if ( m_matchesValid ) return &m_matches;
|
|
|
|
// if no query, matches are empty
|
|
if ( ! m_req->ptr_qbuf ) {
|
|
m_matchesValid = true;
|
|
return &m_matches;
|
|
}
|
|
|
|
// cache it for one hour
|
|
//XmlDoc *od = getOldXmlDoc ( 3600 );
|
|
//if ( ! od || od == (XmlDoc *)-1 ) return (Matches *)od;
|
|
//if ( od->isEmpty() ) od = NULL;
|
|
|
|
// need a buncha crap
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Matches *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Matches *)xml;
|
|
Bits *bits = getBitsForSummary();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (Matches *)bits;
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss == (void *)-1) return (Matches *)ss;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Matches *)pos;
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Matches *)ti;
|
|
//Synonyms *syn = getSynonyms();
|
|
//if ( ! syn || syn == (void *)-1 ) return (Matches *)syn;
|
|
Phrases *phrases = getPhrases();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (Matches *)phrases;
|
|
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (Matches *)q;
|
|
|
|
// set it up
|
|
m_matches.setQuery ( q );
|
|
// returns false and sets g_errno on error
|
|
if ( ! m_matches.set ( this ,
|
|
ww ,
|
|
//syn ,
|
|
phrases ,
|
|
ss ,
|
|
bits ,
|
|
pos ,
|
|
xml ,
|
|
ti ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
// we got it
|
|
m_matchesValid = true;
|
|
return &m_matches;
|
|
}
|
|
|
|
// sender wants meta description, custom tags, etc.
|
|
char *XmlDoc::getDescriptionBuf ( char *displayMetas , int32_t *dsize ) {
|
|
// return the buffer if we got it
|
|
if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
// now get the content of the requested display meta tags
|
|
//char dbuf [ 1024*64 ];
|
|
char *dbufEnd = m_dbuf + 1024;//1024*64;
|
|
char *dptr = m_dbuf;
|
|
char *pp = displayMetas;
|
|
char *ppend = pp + gbstrlen(displayMetas);
|
|
// loop over the list of requested meta tag names
|
|
while ( pp < ppend && dptr < dbufEnd ) {
|
|
// skip initial spaces. meta tag names are ascii always i guess
|
|
while ( *pp && is_wspace_a(*pp) ) pp++;
|
|
// that's the start of the meta tag name
|
|
char *s = pp;
|
|
// . find end of that meta tag name
|
|
// . can end in :<integer> which specifies max len
|
|
while ( *pp && ! is_wspace_a(*pp) && *pp != ':' ) pp++;
|
|
// assume no max length to the content of this meta tag
|
|
int32_t maxLen = 0x7fffffff;
|
|
// save current char
|
|
char c = *pp;
|
|
// . NULL terminate the name
|
|
// . before, overflowed the request buffer and caused core!
|
|
// . seems like it is already NULL terminated
|
|
if ( *pp ) *pp = '\0';
|
|
// always advance regardless though
|
|
pp++;
|
|
// if ':' was specified, get the max length
|
|
if ( c == ':' ) {
|
|
if ( is_digit(*pp) ) maxLen = atoi ( pp );
|
|
// skip over the digits
|
|
while ( *pp && ! is_wspace_a (*pp) ) pp++;
|
|
}
|
|
// don't exceed our total buffer size (save room for \0 at end)
|
|
int32_t avail = dbufEnd - dptr - 1;
|
|
if ( maxLen > avail ) maxLen = avail;
|
|
// store the content at "dptr" (do not exceed "maxLen" bytes)
|
|
int32_t wlen = xml->getMetaContent ( dptr , // write buf
|
|
maxLen , // buf length
|
|
s , // name value
|
|
gbstrlen(s) , // name len
|
|
"name" , // http-equiv/name
|
|
false );// convert &#'s?
|
|
dptr[wlen] = '\0';
|
|
|
|
// test it out
|
|
if ( ! verifyUtf8 ( dptr ) ) {
|
|
log("xmldoc: invalid utf8 content for meta tag %s.",s);
|
|
continue;
|
|
}
|
|
|
|
// advance and NULL terminate
|
|
dptr += wlen;
|
|
*dptr++ = '\0';
|
|
// bitch if we truncated
|
|
if ( dptr >= dbufEnd )
|
|
log("query: More than %"INT32" bytes of meta tag "
|
|
"content "
|
|
"was encountered. Truncating.",
|
|
(int32_t)(dbufEnd-m_dbuf));
|
|
}
|
|
// what is the size of the content of displayed meta tags?
|
|
m_dbufSize = dptr - m_dbuf;
|
|
m_dbufValid = true;
|
|
*dsize = m_dbufSize;
|
|
return m_dbuf;
|
|
}
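// example of the displayMetas format parsed above: a whitespace-separated
// list of meta tag names, each optionally followed by ":<maxlen>", e.g.
//
//   "description:256 keywords author:64"
//
// asks for the content of <meta name=description>, <meta name=keywords>
// and <meta name=author>, capping description at 256 bytes and author at
// 64. the returned buffer holds the values \0-separated in that order.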
|
|
|
|
SafeBuf *XmlDoc::getHeaderTagBuf() {
|
|
if ( m_htbValid ) return &m_htb;
|
|
|
|
Sections *ss = getSections();
|
|
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
|
|
|
|
int32_t count = 0;
|
|
|
|
// scan sections
|
|
Section *si = ss->m_rootSection;
|
|
|
|
moreloop:
|
|
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
if ( si->m_tagId != TAG_H1 ) continue;
|
|
// if it contains no text, this will be -1
|
|
// so give up on it
|
|
if ( si->m_firstWordPos < 0 ) continue;
|
|
if ( si->m_lastWordPos < 0 ) continue;
|
|
// ok, it works, get it
|
|
break;
|
|
}
|
|
// if no h1 tag then make buf empty
|
|
if ( ! si ) {
|
|
m_htb.nullTerm();
|
|
m_htbValid = true;
|
|
return &m_htb;
|
|
}
|
|
// otherwise, set it
|
|
char *a = m_words.m_words[si->m_firstWordPos];
|
|
char *b = m_words.m_words[si->m_lastWordPos] ;
|
|
b += m_words.m_wordLens[si->m_lastWordPos];
|
|
|
|
// copy it
|
|
m_htb.safeMemcpy ( a , b - a );
|
|
m_htb.pushChar('\0');
|
|
|
|
si = si->m_next;
|
|
|
|
// add more?
|
|
if ( count++ < 3 ) goto moreloop;
|
|
|
|
m_htbValid = true;
|
|
return &m_htb;
|
|
}
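// the buffer built above is a \0-separated list of the first few <h1>
// contents (at most 4, per the count check). a caller could walk it like
// this (sketch, "xd" is a hypothetical XmlDoc ptr):
//
//   SafeBuf *htb = xd->getHeaderTagBuf();
//   char *p    = htb->getBufStart();
//   char *pend = p + htb->getLength();
//   for ( ; p < pend && *p ; p += gbstrlen(p) + 1 )
//           log("query: h1: %s",p);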
|
|
|
|
|
|
Title *XmlDoc::getTitle ( ) {
|
|
if ( m_titleValid ) return &m_title;
|
|
// need a buncha crap
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml;
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Title *)ww;
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) return (Title *)sections;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Title *)pos;
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (Title *)q;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
int32_t titleMaxLen = cr->m_titleMaxLen;
|
|
if ( m_req ) titleMaxLen = m_req->m_titleMaxLen;
|
|
// limit for speed, some guys have a 100k word title!
|
|
if ( titleMaxLen > 256 ) titleMaxLen = 256;
|
|
|
|
m_titleValid = true;
|
|
if ( ! m_title.setTitle ( this ,
|
|
xml ,
|
|
ww ,
|
|
sections ,
|
|
pos ,
|
|
titleMaxLen ,
|
|
0xffff ,
|
|
NULL ,
|
|
q ,
|
|
cr ,
|
|
m_niceness ) )
|
|
return NULL;
|
|
return &m_title;
|
|
}
|
|
|
|
|
|
Summary *XmlDoc::getSummary () {
|
|
if ( m_summaryValid ) return &m_summary;
|
|
|
|
// xml and json docs have empty summaries for now
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (Summary *)ct;
|
|
|
|
if ( *ct == CT_JSON || *ct == CT_XML ) {
|
|
m_summaryValid = true;
|
|
return &m_summary;
|
|
}
|
|
|
|
// need a buncha crap
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (Summary *)xml;
|
|
Bits *bits = getBitsForSummary();
|
|
if ( ! bits || bits == (Bits *)-1 ) return (Summary *)bits;
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) return (Summary *)sections;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (Summary *)pos;
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (Summary *)site;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Summary *)d;
|
|
Matches *mm = getMatches();
|
|
if ( ! mm || mm == (Matches *)-1 ) return (Summary *)mm;
|
|
Title *ti = getTitle();
|
|
if ( ! ti || ti == (Title *)-1 ) return (Summary *)ti;
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (Summary *)q;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . get the highest number of summary lines that we need
|
|
// . the summary vector we generate for doing summary-based deduping
|
|
// typically has more lines in it than the summary we generate for
|
|
// displaying to the user
|
|
int32_t numLines = m_req->m_numSummaryLines;
|
|
if ( cr->m_percentSimilarSummary > 0 &&
|
|
cr->m_percentSimilarSummary < 100 &&
|
|
m_req->m_getSummaryVector &&
|
|
cr->m_summDedupNumLines > numLines )
|
|
// request more lines than we will display
|
|
numLines = cr->m_summDedupNumLines;
|
|
|
|
// int16_tcut
|
|
Summary *s = &m_summary;
|
|
|
|
// time cpu set time
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
m_cpuSummaryStartTime = start;
|
|
|
|
// make sure summary does not include title
|
|
char *tbuf = ti->m_title;
|
|
// this does not include the terminating \0
|
|
int32_t tbufLen = ti->m_titleBytes;
|
|
|
|
// compute the summary
|
|
bool status;
|
|
status = s->set2( xml ,
|
|
ww ,
|
|
bits ,
|
|
sections ,
|
|
pos ,
|
|
q ,
|
|
(int64_t *)m_req->ptr_termFreqs ,
|
|
(float *)m_req->ptr_affWeights ,
|
|
false , // doStemming
|
|
m_req->m_summaryMaxLen ,
|
|
numLines ,
|
|
// . displayLines, # lines we are displaying
|
|
// . Summary::getDisplayLen() will return the
|
|
// length of the summary to display
|
|
m_req->m_numSummaryLines ,
|
|
m_req->m_summaryMaxNumCharsPerLine,
|
|
m_req->m_ratInSummary ,
|
|
getFirstUrl() ,
|
|
//&reply->m_queryProximityScore ,
|
|
mm ,
|
|
tbuf ,
|
|
tbufLen );
|
|
|
|
// error, g_errno should be set!
|
|
if ( ! status ) return NULL;
|
|
|
|
m_summaryValid = true;
|
|
|
|
return &m_summary;
|
|
}
|
|
|
|
char *XmlDoc::getHighlightedSummary ( ) {
|
|
|
|
if ( m_finalSummaryBufValid ) {
|
|
//char *fsum = m_finalSummaryBuf.getBufStart();
|
|
//if ( ! fsum ) fsum = (char *)0x01;
|
|
return m_finalSummaryBuf.getBufStart();
|
|
}
|
|
|
|
Summary *s = getSummary();
|
|
|
|
if ( ! s || s == (void *)-1 ) return (char *)s;
|
|
|
|
Query *q = getQuery();
|
|
if ( ! q ) return (char *)q;
|
|
|
|
// get the summary
|
|
char *sum = s->getSummary();
|
|
//int32_t sumLen = s->getSummaryLen();
|
|
int32_t sumLen = s->getSummaryDisplayLen();
|
|
|
|
//sum[sumLen] = 0;
|
|
|
|
// assume no highlighting?
|
|
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
|
|
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
|
|
m_finalSummaryBuf.nullTerm();
|
|
m_finalSummaryBufValid = true;
|
|
return m_finalSummaryBuf.getBufStart();
|
|
//char *fsum = m_finalSummaryBuf.getBufStart();
|
|
//if ( ! fsum ) fsum = (char *)0x01;
|
|
//return fsum;
|
|
}
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
Highlight hi;
|
|
StackBuf(hb);
|
|
// highlight the query in it
|
|
int32_t hlen = hi.set ( &hb,
|
|
sum,
|
|
sumLen,
|
|
m_langId,
|
|
q,
|
|
false , // doStemming?
|
|
false , //click&scroll?
|
|
NULL , // base url
|
|
"<b>" , // front tag
|
|
"</b>" , // back tag
|
|
0,
|
|
m_niceness );
|
|
|
|
|
|
// highlight::set() returns 0 on error
|
|
if ( hlen < 0 ) {
|
|
log("build: highlight class error = %s",mstrerror(g_errno));
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
return NULL;
|
|
}
|
|
|
|
// store into our safebuf then
|
|
m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 );
|
|
m_finalSummaryBufValid = true;
|
|
m_finalSummaryBuf.nullTerm();
|
|
|
|
return m_finalSummaryBuf.getBufStart();
|
|
|
|
//char *fsum = m_finalSummaryBuf.getBufStart();
|
|
//if ( ! fsum ) fsum = (char *)0x01;
|
|
//return fsum;
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// GET GIGABIT SAMPLE
|
|
//
|
|
//
|
|
// This will get samples surrounding all the query terms for purposes
|
|
// of gigabits generation. We don't just generate gigabits from the
|
|
// WHOLE document because it takes much longer?? is that still true?
|
|
// We assume that the first call to getTopLines() above set
|
|
// matches/numMatches. We use those arrays to
|
|
// skip directly to just the query terms in the document and save time.
|
|
// We may have to reset the Scores array here if we want to use it ltr.
|
|
//
|
|
// aka getGigabitSample. get gigabit sample
|
|
//
|
|
SafeBuf *XmlDoc::getSampleForGigabits ( ) {
|
|
|
|
|
|
if ( m_gsbufValid ) return &m_gsbuf;
|
|
|
|
// assume empty
|
|
//m_gsbuf = NULL;
|
|
|
|
// basically, exit now if no sample needed
|
|
if ( m_req->m_bigSampleMaxLen <= 0 ||
|
|
m_req->m_bigSampleRadius <= 0 ) {
|
|
m_gsbufValid = true;
|
|
return &m_gsbuf;
|
|
}
|
|
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
|
|
|
|
|
|
// if it is json then only return the json fields that are strings
|
|
// and json decode them... separate each field with a \0.
|
|
if ( *ct == CT_JSON )
|
|
return getSampleForGigabitsJSON();
|
|
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
|
|
|
|
// just send back the whole page, but separate each section
|
|
// with \0. make only sentences end with ? ! or ., headers
|
|
// not with anything, and no menu items
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) return (SafeBuf *)sections;
|
|
Section *sp = sections->m_rootSection;
|
|
SafeBuf reply;
|
|
reply.setLabel("gbtrepbuf");
|
|
// m_contentLen is invalid, don't use that here use size_utf8Content
|
|
if ( ! reply.reserve ( size_utf8Content + 1000 ) ) return NULL;
|
|
// scan the sections of the document
|
|
for ( ; sp ; sp = sp->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// do not allow menu crap
|
|
if ( sp->m_flags & ( SEC_MENU |
|
|
SEC_MENU_SENTENCE |
|
|
SEC_MENU_HEADER ) )
|
|
continue;
|
|
// must be sentence or header
|
|
bool ok = false;
|
|
if ( sp->m_flags & SEC_SENTENCE ) ok = true;
|
|
// headings are ok, just don't use as sentences...
|
|
if ( sp->m_flags & SEC_HEADING ) ok = true;
|
|
if ( ! ok ) continue;
|
|
|
|
// store without tags
|
|
char *p = ww->m_words[sp->m_a];
|
|
// include period after final word in section
|
|
int32_t b = sp->m_b - 1;
|
|
char *e = ww->m_words[b] + ww->m_wordLens[b];
|
|
|
|
// if 3+ commas and one comma for every 4 words, forget it,
|
|
// it is probably a list! well, process it, but make sure it
|
|
// does not end in a period so we do not display it
|
|
// as a fast fact, but we use it for gigabits.
|
|
bool isList = false;
|
|
int32_t commaCount = 0;
|
|
int32_t bracketCount = 0;
|
|
for ( char *z = p ; z < e ; z++ ) {
|
|
if ( *z == ',' ) commaCount++;
|
|
// fix ] [AllTheWeb] [Gigablast] [Google] [HotBot]...
|
|
if ( *z == '[' ) bracketCount++;
|
|
}
|
|
int32_t naw = (b - sp->m_a) / 2;
|
|
|
|
// just skip even for gigabits if too long. most likely
|
|
// a spammy list of nouns.
|
|
if ( naw >= 130 ) continue;
|
|
|
|
if ( commaCount >= 3 && commaCount *4 >= naw )
|
|
isList = true;
|
|
if ( commaCount >= 10 )
|
|
isList = true;
|
|
if ( bracketCount >= 3 )
|
|
isList = true;
|
|
|
|
// too much uppercase?
|
|
bool yelling = false;
|
|
int32_t upper = 0;
|
|
int32_t lower = 0;
|
|
char cs = 0;
|
|
for ( char *z = p ; z < e ; z += cs ) {
|
|
cs = getUtf8CharSize(z);
|
|
if ( ! is_alpha_utf8(z) ) continue;
|
|
if ( is_upper_utf8(z) ) upper++;
|
|
if ( is_lower_utf8(z) ) lower++;
|
|
}
|
|
if ( upper > lower ) yelling = true;
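// compact restatement of the two heuristics above (hypothetical helper,
// illustrative only): a section is a "list" if it has 3+ commas and at
// least one comma per 4 alnum words, 10+ commas, or 3+ '['s; it is
// "yelling" if uppercase letters outnumber lowercase ones.
//
//   static bool looksLikeList ( int32_t commaCount ,
//                               int32_t bracketCount ,
//                               int32_t naw ) {
//           if ( commaCount   >= 3 && commaCount * 4 >= naw ) return true;
//           if ( commaCount   >= 10 )                         return true;
//           if ( bracketCount >= 3  )                         return true;
//           return false;
//   }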
|
|
|
|
|
|
|
|
// ending ) or ]
|
|
if ( e[0] == ')' ) e++;
|
|
else if ( e[0] == ']' ) e++;
|
|
|
|
// incorporate period etc.
|
|
if ( e[0] == '.' ) e++;
|
|
else if ( e[0] == '!' ) e++;
|
|
else if ( e[0] == '?' ) e++;
|
|
else if ( e[0] == ';' ) e++;
|
|
|
|
|
|
// must end in a period, or .) or .]
|
|
bool endsInPeriod = false;
|
|
if ( e-2 >= p &&
|
|
( e[-1] =='.' ||
|
|
e[-1] =='!' ||
|
|
e[-1] =='?' ) )
|
|
endsInPeriod = true;
|
|
if ( (e[-1] == ')' ||
|
|
e[-1] == ']' ) &&
|
|
(e[-2] == '.' ||
|
|
e[-2] == '?' ||
|
|
e[-2] == '!' ) )
|
|
endsInPeriod = true;
|
|
|
|
//int32_t off = reply.length();
|
|
|
|
// filter out tags and \n's and \r's and store into "reply"
|
|
if ( ! reply.safePrintFilterTagsAndLines ( p , e-p ,false ) )
|
|
return NULL;
|
|
|
|
// if a sentence and does not end in period, toss one in
|
|
//if ( sp->m_flags & SEC_SENTENCE ) {
|
|
// if ( e[-1] !='.' &&
|
|
// e[-1] !='!' &&
|
|
// e[-1] !='?' &&
|
|
// e[-1] !=']' &&
|
|
// e[-1] !=')' )
|
|
// reply.pushChar('.');
|
|
//}
|
|
|
|
// too huge? if # of ALNUM words > 70 it's too big.
|
|
bool isHuge = false;
|
|
if ( naw > 70 ) isHuge = true;
|
|
|
|
|
|
// ending in a * indicates a printable sentence for fast facts
|
|
if ( (sp->m_flags & SEC_SENTENCE) &&
|
|
! isList &&
|
|
! isHuge &&
|
|
! yelling &&
|
|
endsInPeriod )
|
|
reply.pushChar('*');
|
|
|
|
// delineate sentences/headers/sections with | now so
|
|
// we can still allow a word to be a gigabit even if it is
|
|
// not in a sentence with a query term
|
|
//reply.pushChar('\0');
|
|
reply.pushChar('|');
|
|
char *pc = reply.getBufStart() + reply.length() - 1;
|
|
*pc = '\0';
|
|
|
|
// debug
|
|
//char *x = reply.getBufStart() + off;
|
|
// turn off fast fact debug for now
|
|
//log("fastfact: fastfact: %s",x);
|
|
// revert back to |
|
|
*pc = '|';
|
|
|
|
// stop? this fixes the query 'lesbain vedeo porno' on
|
|
// my cluster taking 10 seconds to get gigabits for.
|
|
// bigsamplemaxlen is 1000 as of 12/4/2013.
|
|
if ( reply.length() >= m_req->m_bigSampleMaxLen )
|
|
break;
|
|
}
|
|
// a final \0
|
|
reply.pushChar('\0');
|
|
// move it over to m_gsbuf now
|
|
m_gsbuf.stealBuf ( &reply );
|
|
// we are valid
|
|
m_gsbufValid = true;
|
|
// success
|
|
return &m_gsbuf;
|
|
|
|
|
|
|
|
|
|
|
|
// need a buncha crap
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
|
|
Pos *pos = getPos();
|
|
if ( ! pos || pos == (Pos *)-1 ) return (SafeBuf *)pos;
|
|
Matches *mm = getMatches();
|
|
if ( ! mm || mm == (Matches *)-1 ) return (SafeBuf *)mm;
|
|
|
|
// convert length to number of words
|
|
int32_t bigSampleRadius = m_req->m_bigSampleRadius / 5;
|
|
// at least 1
|
|
if ( bigSampleRadius <= 0 ) bigSampleRadius = 1;
|
|
|
|
// alloc for whole document?
|
|
int32_t max = xml->getContentLen() ;
|
|
// do not exceed
|
|
if ( max > m_req->m_bigSampleMaxLen ) max = m_req->m_bigSampleMaxLen;
|
|
// make sure we have something in words too. i guess no sample?
|
|
if ( max <= 2 ) { m_gsbufValid = true; return &m_gsbuf; }
|
|
// a flag so we don't overlap samples...
|
|
int32_t lastb = -1;
|
|
// . set m_buf to where we write the sample
|
|
// . add a byte for the terminating \0
|
|
int32_t gsbufAllocSize = max + 1;
|
|
// temp hack
|
|
//m_gsbuf = (char *)mmalloc(m_gsbufAllocSize,"gsbuf");
|
|
if ( ! m_gsbuf.reserve ( gsbufAllocSize, "gsbuf" ) ) return NULL;
|
|
// g_errno should be set...
|
|
//if ( ! m_gsbuf ) return NULL;
|
|
//m_freeBuf = true;
|
|
// set our pointer
|
|
char *pstart = m_gsbuf.getBufStart();
|
|
char *p = pstart;
|
|
char *pend = pstart + max;
|
|
|
|
int32_t nw = ww->m_numWords;
|
|
|
|
// skip to first query term
|
|
for ( int32_t i = 0 ; i < mm->m_numMatches ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the match
|
|
Match *m = &mm->m_matches[i];
|
|
// break out if match is not from the document's Words class
|
|
if ( m->m_words != ww ) break;
|
|
// the word #
|
|
int32_t n = m->m_wordNum;
|
|
// got a match, add this samplet, [a,b]
|
|
int32_t a = n - bigSampleRadius;
|
|
int32_t b = n + bigSampleRadius;
|
|
if ( a < 0 ) a = 0;
|
|
if ( b > nw ) b = nw;
|
|
if ( a < lastb ) a = lastb;
|
|
// ensure the samples are separated by \0
|
|
else if ( p > pstart && p + 2 < pend ) {
|
|
*p++ = '\0';
|
|
}
|
|
Pos *pos = m->m_pos;
|
|
int32_t *pp = pos->m_pos;
|
|
int32_t len = pp[b+1] - pp[a];
|
|
// if match would send us over, we are done
|
|
if ( p + len >= pend ) break;
|
|
len = pos->filter(p,pend,m->m_words,a,b,m->m_sections);
|
|
// for debug (mdw)
|
|
//log("query: gigabitsample#%"INT32"=%s",i,p);
|
|
p += len;
|
|
// we are the new lastb
|
|
lastb = b;
|
|
}
|
|
// always null terminate
|
|
*p++ = '\0';
|
|
// . set sample size
|
|
// . this includes terminating \0's in this case
|
|
//int32_t gsbufSize = p - m_gsbuf;
|
|
m_gsbuf.setLength( p - m_gsbuf.getBufStart() );
|
|
// we are valid
|
|
m_gsbufValid = true;
|
|
// for debug (mdw)
|
|
//log("query: finalgigabitsample=%s",m_gsbuf);
|
|
// success
|
|
return &m_gsbuf;
|
|
}
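// A rough sketch of the m_gsbuf layout built above (descriptive only, based on
// the code here): each sentence/header/section is filtered of tags and
// newlines, sentences that qualify as printable "fast facts" get a trailing
// '*', sections are delimited by '|', and the whole buffer ends with a final
// '\0'. The downstream gigabit/fast-fact code keys off those markers (see the
// '*' handling referenced in the JSON variant below).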
|
|
|
|
// if it is json then only return the json fields that are strings
|
|
// and json decode them... separate each field with a \0.
|
|
SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
|
|
|
|
SafeBuf tmp;
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (SafeBuf *)jp;
|
|
JsonItem *ji = jp->getFirstItem();
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not string
|
|
if ( ji->m_type != JT_STRING )
|
|
continue;
|
|
// store field value
|
|
char *val = ji->getValue();
|
|
int valLen = ji->getValueLen();
|
|
// if it contains html then skip it as a gigabit candidate.
|
|
// otherwise our fast facts end up including html tags in them
|
|
// in computeFastFacts() in Msg40.cpp
|
|
int i;
|
|
for ( i = 0 ; i < valLen ; i++ )
|
|
if ( val[i] == '<' ) break;
|
|
if ( i < valLen ) continue;
|
|
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
// if ( ! tmp.safePrintf("<p>"))
|
|
// return NULL;
|
|
|
|
|
|
// decode the json
|
|
//SafeBuf xx;
|
|
if ( ! tmp.safeDecodeJSONToUtf8(val,valLen,m_niceness))
|
|
return NULL;
|
|
|
|
// escape out the html
|
|
// if ( ! tmp.htmlEncode ( xx.getBufStart() ))
|
|
// return NULL;
|
|
|
|
// two new lines
|
|
if ( ! tmp.safePrintf("<hr>"))
|
|
return NULL;
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
if ( ! tmp.pushChar('\n') )
|
|
return NULL;
|
|
}
|
|
|
|
if ( ! tmp.nullTerm() )
|
|
return NULL;
|
|
|
|
Xml xml;
|
|
if ( ! xml.set ( tmp.getBufStart() ,
|
|
tmp.length() ,
|
|
false , // ownData?
|
|
0 , // allocSize
|
|
false , // pure xml?
|
|
m_version ,
|
|
false , // setParentsArg?
|
|
m_niceness ,
|
|
CT_HTML ) ) // *ct ) )
|
|
return NULL;
|
|
Words ww;
|
|
if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL;
|
|
Bits bb;
|
|
if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL;
|
|
Phrases pp;
|
|
if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL;
|
|
// this uses the sectionsReply to see which sections are
|
|
// "text", etc. rather than compute it expensively
|
|
Sections sec;
|
|
if ( !sec.set ( &ww ,
|
|
&pp ,
|
|
&bb ,
|
|
getFirstUrl() ,
|
|
0,//*d ,
|
|
0,//*sh64 , // 64 bits
|
|
"",//cr->m_coll ,
|
|
m_niceness ,
|
|
NULL,//m_masterState , // state
|
|
NULL,//m_masterLoop , // callback
|
|
CT_JSON, // *ct ,
|
|
NULL,//&m_dates ,
|
|
NULL , // sd // sections data
|
|
true , // sections data valid?
|
|
NULL , // sv // for m_nsvt
|
|
NULL , // buf
|
|
0 )) { // bufSize
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// now add each sentence section into the buffer
|
|
// scan the sentences if we got those
|
|
char **wptrs = ww.getWords();
|
|
int32_t *wlens = ww.getWordLens();
|
|
Section *ss = sec.m_firstSent;
|
|
for ( ; ss ; ss = ss->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// count of the alnum words in sentence
|
|
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
|
|
// start with one word!
|
|
count--;
|
|
// how can it be less than one alnum word
|
|
if ( count < 0 ) continue;
|
|
// store it
|
|
char *wp1 = wptrs[ss->m_senta];
|
|
char *wp2 = wptrs[ss->m_sentb-1] + wlens[ss->m_sentb-1];
|
|
|
|
bool gotTerm = (wp2[0]=='.' || wp2[0]=='?' || wp2[0]=='!' ) ;
|
|
|
|
//if ( ! gotTerm ) continue;
|
|
|
|
if ( ! m_gsbuf.safeMemcpy ( wp1 , wp2 - wp1 ) )
|
|
return NULL;
|
|
|
|
// puncty?
|
|
if ( gotTerm && ! m_gsbuf.pushChar(wp2[0]))
|
|
return NULL;
|
|
|
|
// to indicate end of header or sentence, in order to
|
|
// qualify as a fast fact, we must add a '*'. see
|
|
// PageResults.cpp, search for ''*''
|
|
if ( gotTerm && ! m_gsbuf.pushChar('*') )
|
|
return NULL;
|
|
if ( ! m_gsbuf.pushChar('\0') )
|
|
return NULL;
|
|
}
|
|
m_gsbufValid = true;
|
|
return &m_gsbuf;
|
|
}
|
|
|
|
|
|
// . good sites sometimes have hacked pages
|
|
// . try to identify those
|
|
char *XmlDoc::getIsCompromised ( ) {
|
|
if ( m_isCompromisedValid ) return &m_isCompromised;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// assume compromised
|
|
m_isCompromised = true;
|
|
m_isCompromisedValid = true;
|
|
// scan the font tags for suspicious zero height/width styles
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a font tag
if ( nodes[i].m_nodeId != TAG_FONT ) continue;
// get the style attribute, if any
int32_t stlen;
|
|
char *style = nodes[i].getFieldValue ( "style" , &stlen );
|
|
// skip if none
|
|
if ( ! style || stlen <= 6 ) continue;
|
|
// NULL term
|
|
char c = style[stlen];
|
|
style[stlen] = '\0';
|
|
char *hc = strstr(style,"height");
|
|
char *wc = strstr(style,"width");
|
|
// skip if neither
if ( ! hc && ! wc ) continue;
// advance past "height"/"width"
if ( hc ) hc += 6;
if ( wc ) wc += 5;
// guard against NULL before dereferencing
while ( hc && is_wspace_a(*hc) ) hc++;
while ( wc && is_wspace_a(*wc) ) wc++;
if ( hc && *hc == ':' ) hc++;
if ( wc && *wc == ':' ) wc++;
while ( hc && is_wspace_a(*hc) ) hc++;
while ( wc && is_wspace_a(*wc) ) wc++;
style[stlen] = c;
// a zero height or width is a signal of invisible text and of
// our syzygy compromised-site-to-compromised-site spammer
if ( hc && *hc == '0' ) return &m_isCompromised;
if ( wc && *wc == '0' ) return &m_isCompromised;
|
|
}
|
|
m_isCompromised = false;
|
|
return &m_isCompromised;
|
|
}
|
|
|
|
// <meta name=robots content=noarchive>
// <meta name=gigabot content=noarchive>
|
|
char *XmlDoc::getIsNoArchive ( ) {
|
|
if ( m_isNoArchiveValid ) return &m_isNoArchive;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
|
|
m_isNoArchive = false;
|
|
m_isNoArchiveValid = true;
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// find the meta tags
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != TAG_META ) continue;
|
|
// get robots attribute
|
|
int32_t alen; char *att;
|
|
// <meta name=robots content=noarchive>
|
|
att = nodes[i].getFieldValue ( "name" , &alen );
|
|
// need a name!
|
|
if ( ! att ) continue;
|
|
// get end
|
|
char *end = att + alen;
|
|
// skip leading spaces
|
|
while ( att < end && *att && is_wspace_a(*att) ) att++;
|
|
// must be robots or gigabot. skip if not
|
|
if ( strncasecmp(att,"robots" ,6) &&
|
|
strncasecmp(att,"gigabot",7) ) continue;
|
|
// get the content value
|
|
att = nodes[i].getFieldValue("content",&alen);
|
|
// skip if none
|
|
if ( ! att ) continue;
|
|
// get end
|
|
end = att + alen;
|
|
// skip leading spaces
|
|
while ( att < end && *att && is_wspace_a(*att) ) att++;
|
|
// is it noarchive? skip if no such match
|
|
if ( strncasecmp(att,"noarchive",9) ) continue;
|
|
// ok, we got it
|
|
m_isNoArchive = true;
|
|
break;
|
|
}
|
|
// return what we got
|
|
return &m_isNoArchive;
|
|
}
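// Example tags this matches (a sketch based on the checks above; quoting and
// casing can vary since the comparisons are case-insensitive prefixes, and the
// content value need only start with "noarchive"):
//   <meta name="robots"  content="noarchive">
//   <meta name="gigabot" content="noarchive, noindex">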
|
|
|
|
// this vector's components are 64-bit, not the usual 32-bit
|
|
int64_t **XmlDoc::getAdVector ( ) {
|
|
if ( m_adVectorValid ) return &ptr_adVector;
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (int64_t **)xml;
|
|
setStatus ( "parsing out ad ids");
|
|
// assume valid
|
|
m_adVectorValid = true;
|
|
int32_t na = 0;
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes();
|
|
// find the meta tags
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// continue if not a script tag
|
|
if ( nodes[i].m_nodeId != TAG_SCRIPT ) continue; // 83
|
|
// must be a front tag, not a back tag
|
|
if ( xml->isBackTag ( i ) ) continue;
|
|
// find the back tag for it
|
|
int32_t j;
|
|
for ( j = i ; j < n ; j++ ) {
// another script tag
if ( nodes[j].m_nodeId != TAG_SCRIPT ) continue;
// must be a back tag this time
if ( ! xml->isBackTag ( j ) ) continue;
|
|
// ok, we got it
|
|
break;
|
|
}
|
|
// if no back tag, give up
|
|
if ( j == n ) break;
|
|
|
|
// buf/len defines the script area
|
|
char *buf = xml->getNode(i);
|
|
int32_t len = xml->getNode(j) - buf;
|
|
|
|
// skip this script tag for next loop
|
|
i = j;
|
|
|
|
bool found = false;
|
|
|
|
// start off looking for google
|
|
char *needles[3] =
|
|
{ "google_ad_client" ,
|
|
"ctxt_ad_partner",
|
|
"http://ad" };
|
|
char *providers[3] =
|
|
{ "google" ,
|
|
"yahoo",
|
|
"doubleclick" };
|
|
|
|
for ( int32_t k = 0 ; k < 3 ; k++ ) {
|
|
// try to match this needle
|
|
char *match = needles[k];
|
|
// try to get a match
|
|
char *p = strnstr ( buf, match , len );
|
|
// go again
|
|
if ( ! p ) continue;
|
|
// do not exceed the script area
|
|
char *pend = buf + len;
|
|
|
|
// it is in quotes
|
|
// pub-uint64_t for google ad, uint32_t for yahoo
|
|
|
|
// check for double or single quote
|
|
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
|
|
// it must have them!... i guess
|
|
if ( p >= pend ) continue;
|
|
|
|
// point to after the quote
|
|
char *pbegin = ++p;
|
|
// find the ending quote
|
|
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
|
|
// if none, bail
|
|
if ( p >= pend ) continue;
|
|
// get length of the ad client id between the quotes
|
|
int32_t adClientLen = p - pbegin;
|
|
|
|
if ( k == 2 ) {
|
|
p = strnstr(p,".doubleclick.net/",pend-p);
|
|
if ( ! p ) continue;
|
|
p += 17;
|
|
// look for doubleclick ads
|
|
// user name is the second element of the path
|
|
while(p < pend && *p != '/') p++;
|
|
pbegin = ++p;
|
|
while(p < pend && *p != '/') p++;
|
|
if(p >= pend) continue;
|
|
adClientLen = p - pbegin;
|
|
found = true;
|
|
}
|
|
|
|
char *f = pbegin;
|
|
char *fend = pbegin + adClientLen;
|
|
for ( ; f < fend ; f++ ) {
|
|
if ( is_alnum_a ( *f ) ) continue;
|
|
if ( *f == '-' || *f == '_' || *f == '.' )
|
|
continue;
|
|
break;
|
|
}
|
|
if ( f < fend ) continue;
|
|
if ( adClientLen >= 400 ) continue;
|
|
if ( adClientLen < 4 ) continue;
|
|
// null term temp
|
|
char c = *fend;
|
|
*fend = '\0';
|
|
// hash it
|
|
char buf[512];
|
|
sprintf(buf,"gbad:%s-%s",providers[k],pbegin);
|
|
// put it back
|
|
*fend = c;
|
|
// . make the query term id
|
|
// . first hash the field
|
|
uint64_t h = hash64 ( "gbad" , 4 );
|
|
// then add in the other junk
|
|
h = hash64 ( buf , gbstrlen(buf) , h );
|
|
// . now we will index that as-is
|
|
// . and Msg25/LinkInfo can use to dedup voters!
|
|
m_adIds[na++] = h;
|
|
// stop if too many. save room for NULL termination.
|
|
if ( na + 1 >= XD_MAX_AD_IDS ) break;
|
|
}
|
|
//look for another if not found or not ok.
|
|
}
|
|
// null term it like a good vector! no, those are 32-bit components,
|
|
// we are a 64-bit component vector
|
|
//m_adIds[na++] = 0;
|
|
// point to where we should put them
|
|
ptr_adVector = m_adIds;
|
|
// store this i guess
|
|
size_adVector = na * 8;
|
|
// *lastNode = nn;
|
|
return &ptr_adVector;
|
|
}
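// A sketch of the ad-id term produced above, with a made-up publisher id:
//   buf    = "gbad:google-pub-1234567890123456"
//   termId = hash64 ( buf , gbstrlen(buf) , hash64("gbad",4) )
// That 64-bit hash is what lands in m_adIds[] / ptr_adVector, and per the
// comment above it lets Msg25/LinkInfo dedup voters that run the same ads.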
|
|
|
|
|
|
|
|
char *XmlDoc::getIsLinkSpam ( ) {
|
|
if ( m_isLinkSpamValid ) return &m_isLinkSpam2;
|
|
|
|
setStatus ( "checking if linkspam" );
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
Links *links = getLinks();
|
|
if ( ! links || links == (Links *)-1 ) return (char *)links;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
|
|
int32_t **pici = getIndCatIds();
|
|
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
|
|
//LinkInfo *info1 = getLinkInfo1();
|
|
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// reset note
|
|
m_note = NULL;
|
|
|
|
// . if a doc is "link spam" then it cannot vote, or its
|
|
// voting power is reduced
|
|
// . look for indications that the link is from a guestbook
|
|
// . doc length over 100,000 bytes consider it link spam
|
|
m_isLinkSpamValid = true;
|
|
m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
|
|
*ip ,
|
|
ptr_indCatIds ,
|
|
size_indCatIds / 4 ,
|
|
*sni ,
|
|
xml,
|
|
links,
|
|
150000,//MAXDOCLEN,//maxDocLen ,
|
|
&m_note ,
|
|
NULL , // &linkee , // url ,
|
|
-1 , // linkNode ,
|
|
cr->m_coll ,
|
|
m_niceness );
|
|
// set shadow
|
|
m_isLinkSpam2 = (bool)m_isLinkSpam;
|
|
return &m_isLinkSpam2;
|
|
}
|
|
|
|
|
|
|
|
void *zliballoc ( void *opaque , unsigned int items , unsigned int size ) {
|
|
//log("db: got zlib alloc");
|
|
return (void *)mmalloc ( items * size , "zlib" );
|
|
}
|
|
|
|
void zlibfree ( void *opaque , void *address ) {
|
|
//log("db: got zlib free");
|
|
// -1 will tell Mem.cpp to look it up in the table
|
|
mfree ( address , -1 , "zlib" );
|
|
}
|
|
|
|
void *malloc_replace (void *pf , unsigned int nitems , unsigned int size ) {
|
|
return g_mem.gbmalloc(size*nitems,"malloc_replace");
|
|
}
|
|
|
|
void free_replace ( void *pf , void *s ) {
|
|
// -1 means we don't know the size
|
|
g_mem.gbfree(s,-1,"free_replace");
|
|
}
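// Note: these shims exist so zlib's internal allocations are routed through
// g_mem and show up in Gigablast's memory accounting; the -1 size passed on
// free tells Mem.cpp to look the allocation size up in its table, per the
// comment in zlibfree() above.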
|
|
|
|
int gbuncompress ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ) {
|
|
z_stream stream;
|
|
int err;
|
|
|
|
stream.next_in = (Bytef*)source;
|
|
stream.avail_in = (uInt)sourceLen;
|
|
// Check for source > 64K on 16-bit machine:
|
|
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
|
|
|
|
stream.next_out = dest;
|
|
stream.avail_out = (uInt)*destLen;
|
|
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
|
|
|
|
//stream.zalloc = (alloc_func)0;
|
|
//stream.zfree = (free_func)0;
|
|
stream.zalloc = malloc_replace;//zliballoc;
|
|
stream.zfree = free_replace;//zlibfree;
|
|
|
|
// this calls memcpy so make sure Profiler.cpp doesn't crash
|
|
// since when it calls backtrace() that calls memcpy() too
|
|
// and it's not async safe
|
|
g_inMemcpy = 2;
|
|
|
|
//we can be gzip or deflate
|
|
err = inflateInit2(&stream, 47);
|
|
|
|
g_inMemcpy = 0;
|
|
|
|
if (err != Z_OK) return err;
|
|
|
|
err = inflate(&stream, Z_FINISH);
|
|
if (err != Z_STREAM_END) {
|
|
inflateEnd(&stream);
|
|
if (err == Z_NEED_DICT ||
|
|
(err == Z_BUF_ERROR && stream.avail_in == 0))
|
|
return Z_DATA_ERROR;
|
|
return err;
|
|
}
|
|
*destLen = stream.total_out;
|
|
|
|
err = inflateEnd(&stream);
|
|
return err;
|
|
}
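// A minimal (hypothetical) usage sketch -- *destLen is in/out: pass the
// capacity of "dest" in, and on Z_OK it holds the uncompressed byte count:
//
//   uint32_t outLen = outBufCapacity;            // capacity going in
//   int zerr = gbuncompress ( (unsigned char *)outBuf ,
//                             &outLen ,
//                             (unsigned char *)compressedData ,
//                             compressedLen );
//   if ( zerr != Z_OK ) { /* handle error, e.g. Z_BUF_ERROR */ }
//   // outLen is now the uncompressed length
//
// ("outBuf", "compressedData", etc. are placeholder names, not real members.)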
|
|
|
|
void deflateQuickPoll ( ) {
|
|
QUICKPOLL(1);
|
|
}
|
|
|
|
int gbcompress ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ,
|
|
int32_t encoding ) {
|
|
|
|
int level = Z_DEFAULT_COMPRESSION;
|
|
z_stream stream;
|
|
int err;
|
|
int method = Z_DEFLATED;
|
|
//lots of mem, faster, more compressed, see zlib.h
|
|
int windowBits = 31;
|
|
int memLevel = 8;
|
|
int strategy = Z_DEFAULT_STRATEGY;
|
|
|
|
stream.next_in = (Bytef*)source;
|
|
stream.avail_in = (uInt)sourceLen;
|
|
#ifdef MAXSEG_64K
|
|
// Check for source > 64K on 16-bit machine:
|
|
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
|
|
#endif
|
|
stream.next_out = dest;
|
|
stream.avail_out = (uInt)*destLen;
|
|
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
|
|
|
|
//stream.zalloc = (alloc_func)0;
|
|
//stream.zfree = (free_func)0;
|
|
stream.zalloc = malloc_replace;//zliballoc;
|
|
stream.zfree = free_replace;//zlibfree;
|
|
|
|
stream.opaque = (voidpf)0;
|
|
|
|
//we can be gzip or deflate
|
|
if(encoding == ET_DEFLATE) err = deflateInit (&stream, level);
|
|
else err = deflateInit2(&stream, level,
|
|
method, windowBits,
|
|
memLevel, strategy);
|
|
if (err != Z_OK) {
|
|
// zlib's incompatible version error?
|
|
if ( err == -6 ) {
|
|
log("zlib: zlib did you forget to add #pragma pack(4) to "
|
|
"zlib.h when compiling libz.a so it aligns on 4-byte "
|
|
"boundaries because we have that pragma in "
|
|
"gb-include.h so its used when including zlib.h");
|
|
}
|
|
return err;
|
|
}
|
|
|
|
// cygwin uses the system libz.a which is not hacked for our quickpoll
|
|
#ifndef CYGWIN
|
|
// tell deflate() to call quickpoll
|
|
|
|
// MDW: 11/14/2014 don't do this for the 64bit zlib for now just to
|
|
// save some time. do it later when it proves to be an issue.
|
|
//setQuickPoll ( (char *)&g_loop.m_needsToQuickPoll, deflateQuickPoll);
|
|
#endif
|
|
|
|
// this calls memcpy so make sure Profiler.cpp doesn't crash
|
|
// since when it calls backtrace() that calls memcpy() too
|
|
// and it's not async safe
|
|
g_inMemcpy = 3;
|
|
|
|
err = deflate(&stream, Z_FINISH);
|
|
|
|
g_inMemcpy = 0;
|
|
|
|
if (err != Z_STREAM_END) {
|
|
deflateEnd(&stream);
|
|
return err == Z_OK ? Z_BUF_ERROR : err;
|
|
}
|
|
*destLen = stream.total_out;
|
|
|
|
err = deflateEnd(&stream);
|
|
return err;
|
|
}
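// Note on the parameters above (standard zlib behavior, stated here as an aid
// rather than a spec for the locally patched libz): deflateInit() writes a
// plain zlib/deflate stream for ET_DEFLATE, while deflateInit2() with
// windowBits = 31 (15 + 16) writes a gzip wrapper; that pairs with the "47"
// windowBits used in gbuncompress(), which auto-detects either format.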
|
|
|
|
//
|
|
// NO NO: don't use until we replace in[64] with SafeBuf for "in" and "out" below
|
|
//
|
|
int gbcompress7 ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ,
|
|
bool compress ) {
|
|
|
|
//int32_t id = 1;
|
|
// pass the input to the program through this file
|
|
// rather than a pipe, since popen() seems broken
|
|
char in[64];
|
|
if ( compress ) sprintf ( in , "%s/in.7z", g_hostdb.m_dir );
|
|
else sprintf ( in , "%s/out.7z", g_hostdb.m_dir );
|
|
unlink ( in );
|
|
// collect the output from the filter from this file
|
|
char out[64];
|
|
if ( compress ) sprintf ( out , "%s/out.7z", g_hostdb.m_dir );
|
|
else sprintf ( out , "%s/in.7z", g_hostdb.m_dir );
|
|
if ( ! compress )
|
|
unlink ( out );
|
|
// ignore errno from those unlinks
|
|
errno = 0;
|
|
// open the input file
|
|
retry11:
|
|
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry11;
|
|
log("build: Could not open file %s for writing: %s.",
|
|
in,mstrerror(errno));
|
|
return -1;
|
|
}
|
|
|
|
retry12:
|
|
// write the content into the input file
|
|
int32_t w = write ( fd , source , sourceLen );
|
|
// valgrind
|
|
if ( w < 0 && errno == EINTR ) goto retry12;
|
|
// did we get an error
|
|
if ( w != (int32_t)sourceLen ) {
|
|
log("build: Error writing to %s: %s.",in,mstrerror(errno));
|
|
close(fd);
|
|
return -1;
|
|
}
|
|
// close the file
|
|
close ( fd );
|
|
|
|
// . build the command line for the external 7za binary
// . compress:   "7za a <out> <in>"       adds <in> to the archive <out>
// . decompress: "7za -o<dir> -y e <in>"  extracts <in> (-y = yes to all prompts)
//char cmd[2048];
SafeBuf cmd;
|
|
if ( compress )
|
|
// 7za a out.7z in.7z
|
|
cmd.safePrintf( "%s7za a %s %s > /dev/null",
|
|
g_hostdb.m_dir , out,in);
|
|
else
|
|
// -y = yes on all. so we overwrite "in.7z"
|
|
cmd.safePrintf( "%s7za -o%s -y e %s > /dev/null",
|
|
g_hostdb.m_dir,g_hostdb.m_dir , in);//,in);
|
|
// breach sanity check
|
|
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// execute it
|
|
int retVal = gbsystem ( cmd.getBufStart() );
|
|
if ( retVal == -1 )
|
|
log("gb: system(%s) : %s",cmd.getBufStart(),
|
|
mstrerror(g_errno));
|
|
|
|
// all done with input file
|
|
// clean up the binary input file from disk
|
|
//if ( unlink ( in ) != 0 ) {
|
|
// // log error
|
|
// log("gbfilter: unlink (%s): %s\n",in,strerror(errno));
|
|
// // ignore it, since it was not a processing error per se
|
|
// errno = 0;
|
|
//}
|
|
|
|
retry13:
|
|
fd = open ( out , O_RDONLY );
|
|
if ( fd < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry13;
|
|
log("7zip: Could not open file %s for reading: %s.",
|
|
out,mstrerror(errno));
|
|
return -1;
|
|
}
|
|
// to read - leave room for \0
|
|
//int32_t toRead = MAXDOCLEN + 1000;
|
|
int32_t toRead = 150000 + 1000;
|
|
retry14:
|
|
// read right from pipe descriptor
|
|
int32_t r = read (fd, dest,toRead);
|
|
// note errors
|
|
if ( r < 0 ) {
|
|
// valgrind
|
|
if ( errno == EINTR ) goto retry14;
|
|
log("7zip: reading output: %s",mstrerror(errno));
|
|
// this is often bad fd from an oom error, so ignore it
|
|
errno = 0;
|
|
r = 0;
|
|
}
|
|
// clean up shop
|
|
close ( fd );
|
|
// delete output file
|
|
//unlink ( out );
|
|
if ( r > (int32_t)*destLen ) { char *xx=NULL;*xx=0; }
|
|
// assign
|
|
*destLen = r;
|
|
// debug for now
|
|
char *pre = "";
|
|
if ( ! compress ) pre = "un";
|
|
log("7zip: %scompressed %"UINT32" to %"UINT32" bytes"
|
|
, pre,sourceLen , *destLen );
|
|
return Z_OK;
|
|
}
|
|
|
|
int gbuncompress7 ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ) {
|
|
return gbcompress7(dest,destLen,source,sourceLen,false);
|
|
}
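// Caveat (observed from the code above, not a documented contract):
// gbcompress7() and gbuncompress7() round-trip through fixed temp files
// ("in.7z"/"out.7z" in g_hostdb.m_dir) and an external "7za" binary, so
// concurrent callers would clobber each other's files; the "NO NO: don't use"
// warning above still applies.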
|
|
|
|
/*
|
|
bool XmlDoc::hashSingleTerm ( int64_t termId , HashInfo *hi ) {
|
|
// combine with a non-NULL prefix
|
|
if ( hi->m_prefix ) {
|
|
int64_t prefixHash = hash64b ( hi->m_prefix );
|
|
// sanity test, make sure it is in supported list
|
|
if ( getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
|
|
char *xx=NULL;*xx=0; }
|
|
termId = hash64 ( termId , prefixHash );
|
|
}
|
|
|
|
// save it?
|
|
if ( m_wts && ! ::storeTerm ( "binary",6,termId,hi,0,0,
|
|
MAXDENSITYRANK,
|
|
MAXDIVERSITYRANK,
|
|
MAXWORDSPAMRANK,
|
|
hi->m_hashGroup,
|
|
false,&m_wbuf,m_wts,false) )
|
|
return false;
|
|
|
|
// int16_tcut
|
|
HashTableX *dt = hi->m_tt;
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key_t) ) { char *xx=NULL;*xx=0; }
|
|
// make the key like we do in hashWords()
|
|
key96_t k;
|
|
k.n1 = hi->m_date;
|
|
k.n0 = termId;
|
|
// get current score for this wordid
|
|
int32_t slot = dt->getSlot ( &k );
|
|
// does this termid/date already exist?
|
|
if ( slot >= 0 ) {
|
|
// done
|
|
return true;
|
|
}
|
|
// otherwise, add a new slot
|
|
char val = 1;
|
|
if ( ! hi->m_tt->addKey ( (char *)k , &val ) )
|
|
return false;
|
|
// return true on success
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
bool storeTerm ( char *s ,
|
|
int32_t slen ,
|
|
int64_t termId ,
|
|
HashInfo *hi ,
|
|
int32_t wordNum ,
|
|
int32_t wordPos ,
|
|
char densityRank,
|
|
char diversityRank ,
|
|
char wordSpamRank ,
|
|
char hashGroup,
|
|
//bool isPhrase ,
|
|
SafeBuf *wbuf ,
|
|
HashTableX *wts ,
|
|
char synSrc ,
|
|
char langId ,
|
|
POSDBKEY key ) {
|
|
|
|
// store prefix
|
|
int32_t poff = wbuf->length();
|
|
// shortcut
|
|
char *p = hi->m_prefix;
|
|
// add the prefix too!
|
|
if ( p && ! wbuf->safeMemcpy(p,gbstrlen(p)+1)) return false;
|
|
// none?
|
|
if ( ! p ) poff = -1;
|
|
|
|
|
|
// store description
|
|
int32_t doff = wbuf->length();
|
|
// shortcut
|
|
char *d = hi->m_desc;
|
|
// add the desc too!
|
|
if ( d && ! wbuf->safeMemcpy(d,gbstrlen(d)+1) ) return false;
|
|
// none?
|
|
if ( ! d ) doff = -1;
|
|
|
|
// store term
|
|
int32_t toff = wbuf->length();
|
|
// add it
|
|
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
|
|
|
|
// make this
|
|
TermDebugInfo ti;
|
|
ti.m_termOff = toff;
|
|
ti.m_termLen = slen;
|
|
ti.m_descOff = doff;
|
|
ti.m_prefixOff = poff;
|
|
ti.m_date = hi->m_date;
|
|
ti.m_shardByTermId = hi->m_shardByTermId;
|
|
ti.m_termId = termId;
|
|
//ti.m_weight = 1.0;
|
|
//ti.m_spam = -1.0;
|
|
ti.m_diversityRank = diversityRank;
|
|
ti.m_densityRank = densityRank;
|
|
ti.m_wordSpamRank = wordSpamRank;
|
|
ti.m_hashGroup = hashGroup;
|
|
ti.m_wordNum = wordNum;
|
|
ti.m_wordPos = wordPos;
|
|
ti.m_langId = langId;
|
|
ti.m_key = key;
|
|
|
|
// was sitehash32
|
|
//ti.m_facetVal32 = hi->m_facetVal32;//sentHash32 = hi->m_sentHash32;
|
|
|
|
// save for printing out an asterisk
|
|
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
|
|
|
|
// get language bit vec
|
|
ti.m_langBitVec64 = g_speller.getLangBits64(&termId);
|
|
|
|
//if ( isPhrase ) ti.m_synSrc = SOURCE_NGRAM;
|
|
|
|
/*
|
|
// the weight vec for the words and phrases
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) ti.m_rv[j] = 1.0;
|
|
|
|
int32_t *wscores = NULL;
|
|
|
|
if ( weights && ! isPhrase ) wscores = weights->m_ww;
|
|
if ( weights && isPhrase ) wscores = weights->m_pw;
|
|
|
|
// int16_tcut
|
|
int32_t i = wordNum;
|
|
|
|
if ( weights && ! weights->m_rvw ) { char *xx=NULL;*xx=0; }
|
|
if ( weights && ! weights->m_rvp ) { char *xx=NULL;*xx=0; }
|
|
|
|
float *rv = NULL;
|
|
if ( weights && ! isPhrase ) rv = &weights->m_rvw[i*MAX_RULES];
|
|
if ( weights && isPhrase ) rv = &weights->m_rvp[i*MAX_RULES];
|
|
|
|
if ( weights ) ti.m_weight = (float)wscores[i] / (float)DW;
|
|
|
|
if ( weights )
|
|
gbmemcpy ( &ti.m_rv, rv , MAX_RULES*sizeof(float));
|
|
|
|
// no, because if this is zero we force it up to 1!
|
|
//if ( weights )
|
|
// ti.m_score32 = (int32_t)((float)ti.m_score32 * ti.m_weight);
|
|
ti.m_score32 = score;
|
|
|
|
if ( isSynonym )
|
|
ti.m_score32 = score;
|
|
*/
|
|
|
|
// make the key
|
|
key96_t k;
|
|
k.n1 = 0; // date
|
|
k.n0 = termId;
|
|
|
|
// store it
|
|
return wts->addKey ( &k , &ti ) ;
|
|
}
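// Quick summary (descriptive only): storeTerm() records byte offsets into
// "wbuf" (m_termOff/m_prefixOff/m_descOff) rather than raw pointers, along
// with the ranks, hash group, synonym source and the raw posdb key, and files
// it all under the termId in "wts" for the PageParser.cpp term display.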
|
|
|
|
|
|
|
|
bool XmlDoc::hashSingleTerm ( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ) {
|
|
// empty?
|
|
if ( slen <= 0 ) return true;
|
|
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
|
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
|
|
//
|
|
//if ( ! m_storeTermListInfo )
|
|
// return true;
|
|
|
|
|
|
// a single blob hash
|
|
int64_t termId = hash64 ( s , slen );
|
|
// combine with prefix
|
|
int64_t final = termId;
|
|
// combine with a non-NULL prefix
|
|
int64_t prefixHash = 0LL;
|
|
if ( hi->m_prefix ) {
|
|
prefixHash = hash64b ( hi->m_prefix );
|
|
final = hash64 ( termId , prefixHash );
|
|
}
|
|
// call the other guy now
|
|
//return hashSingleTerm ( final , hi );
|
|
|
|
// shortcut
|
|
HashTableX *dt = hi->m_tt;
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key144_t) ) { char *xx=NULL;*xx=0; }
|
|
// make the key like we do in hashWords()
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
final,
|
|
0LL, // docid
|
|
0, // dist
|
|
MAXDENSITYRANK, // density rank
|
|
MAXDIVERSITYRANK, // diversity rank
|
|
MAXWORDSPAMRANK, // wordspamrank
|
|
0, // siterank
|
|
hi->m_hashGroup,
|
|
// we set to docLang in final hash loop
|
|
langUnknown,// langid
|
|
0, // multiplier
|
|
0, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//
|
|
// HACK: mangle the key if its a gbsitehash:xxxx term
|
|
// used for doing "facets" like stuff on section xpaths.
|
|
//
|
|
// no longer do this because we just hash the term
|
|
// gbxpathsitehash1234567 where 1234567 is that hash.
|
|
// but
|
|
//
|
|
//static int64_t s_gbsectionhash = 0LL;
|
|
//if ( ! s_gbsectionhash ) s_gbsectionhash = hash64b("gbsectionhash");
|
|
//if ( prefixHash == s_gbsectionhash )
|
|
// g_posdb.setSectionSentHash32 ( &k, hi->m_sentHash32 );
|
|
|
|
// . otherwise, add a new slot
|
|
// . key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
if ( ! dt->addTerm144 ( &k ) ) return false;
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( m_wts && ! storeTerm ( s,slen,final,hi,
|
|
0, // wordnum
|
|
0, // wordPos,
|
|
MAXDENSITYRANK,
|
|
MAXDIVERSITYRANK,
|
|
MAXWORDSPAMRANK,
|
|
hi->m_hashGroup,
|
|
//false,
|
|
&m_wbuf,
|
|
m_wts,
|
|
SOURCE_NONE, // synsrc
|
|
langUnknown,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
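// A sketch of the term-id composition used above (field and value are made up):
//   int64_t termId     = hash64  ( "some value" , 10 );     // raw blob hash
//   int64_t prefixHash = hash64b ( "gbsomefield" );         // field prefix
//   int64_t final      = hash64  ( termId , prefixHash );   // prefixed term
// With no prefix, "final" is just the raw blob hash, and the posdb key is
// built with maxed-out density/diversity/word-spam ranks as shown above.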
|
|
|
|
bool XmlDoc::hashString ( char *s, HashInfo *hi ) {
|
|
return hashString ( s , gbstrlen(s), hi ); }
|
|
|
|
bool XmlDoc::hashString ( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ) {
|
|
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
|
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t *sni = getSiteNumInlinks();
|
|
return hashString3( s ,
|
|
slen ,
|
|
hi ,
|
|
&m_countTable ,
|
|
m_pbuf ,
|
|
m_wts ,
|
|
&m_wbuf ,
|
|
m_version ,
|
|
*sni ,
|
|
m_niceness );
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashString3( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ,
|
|
HashTableX *countTable ,
|
|
SafeBuf *pbuf ,
|
|
HashTableX *wts ,
|
|
SafeBuf *wbuf ,
|
|
int32_t version ,
|
|
int32_t siteNumInlinks ,
|
|
int32_t niceness ) {
|
|
Words words;
|
|
Bits bits;
|
|
Phrases phrases;
|
|
//Weights weights;
|
|
//Synonyms synonyms;
|
|
if ( ! words.set ( s , slen , version , true , niceness ) )
|
|
return false;
|
|
if ( ! bits.set ( &words , version , niceness ) )
|
|
return false;
|
|
if ( ! phrases.set(&words,&bits,true,false,version,niceness ) )
|
|
return false;
|
|
|
|
// use primary langid of doc
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// words
|
|
//SafeBuf myLangVec;
|
|
//if ( ! setLangVec ( &words , &myLangVec , m_niceness ) )
|
|
// return false;
|
|
//char *langVec = (char *)myLangVec.getBufStart();
|
|
|
|
/*
|
|
// debugBuf for synonyms? yes if we are debugging
|
|
SafeBuf synDebugBuf;
|
|
SafeBuf *sdbp = NULL;
|
|
if ( pbuf || m_storeTermListInfo ) sdbp = &synDebugBuf;
|
|
// now we can set it...
|
|
if ( hi->m_useSynonyms && !synonyms.set(&words,
|
|
NULL, // langVec,
|
|
m_langId,
|
|
&phrases,
|
|
niceness,
|
|
sdbp))
|
|
return false;
|
|
*/
|
|
|
|
// set weights because of count table
|
|
//if ( countTable && ! weights.set ( &words ,
|
|
/*
|
|
if ( hi->m_useWeights &&
|
|
! weights.set ( &words ,
|
|
&phrases ,
|
|
&bits ,
|
|
NULL ,
|
|
pbuf ,
|
|
false ,
|
|
false ,
|
|
version ,
|
|
100 , // titleWeight
|
|
100 , // headerWeight
|
|
countTable ,
|
|
false , // isLinkText
|
|
false , // isCntTable?
|
|
siteNumInlinks ,
|
|
niceness ) )
|
|
return false;
|
|
|
|
Weights *wp = &weights;
|
|
if ( ! hi->m_useWeights ) wp = NULL;
|
|
*/
|
|
|
|
//Synonyms *sp = NULL;
|
|
//if ( hi->m_useSynonyms ) sp = &synonyms;
|
|
|
|
return hashWords3 ( //0 ,
|
|
//words.getNumWords() ,
|
|
hi ,
|
|
&words ,
|
|
&phrases ,
|
|
NULL,//sp , synonyms
|
|
NULL , // sections
|
|
countTable ,
|
|
NULL , // fragvec
|
|
NULL , // wordspamvec
|
|
NULL , // langvec
|
|
langUnknown , // default langid doclangid
|
|
pbuf ,
|
|
wts ,
|
|
wbuf ,
|
|
niceness );
|
|
}
|
|
|
|
bool XmlDoc::hashWords ( //int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
HashInfo *hi ) {
|
|
// sanity checks
|
|
if ( ! m_wordsValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_phrasesValid ) { char *xx=NULL; *xx=0; }
|
|
if ( hi->m_useCountTable &&!m_countTableValid){char *xx=NULL; *xx=0; }
|
|
if ( ! m_bitsValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_sectionsValid) { char *xx=NULL; *xx=0; }
|
|
//if ( ! m_synonymsValid) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_fragBufValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_wordSpamBufValid ) { char *xx=NULL; *xx=0; }
|
|
if ( m_wts && ! m_langVectorValid ) { char *xx=NULL; *xx=0; }
|
|
if ( ! m_langIdValid ) { char *xx=NULL; *xx=0; }
|
|
// . is the word repeated in a pattern?
|
|
// . this should only be used for document body, for meta tags,
|
|
// inlink text, etc. we should make sure words are unique
|
|
char *wordSpamVec = getWordSpamVec();
|
|
char *fragVec = m_fragBuf.getBufStart();
|
|
char *langVec = m_langVec.getBufStart();
|
|
|
|
return hashWords3( //wordStart ,
|
|
//wordEnd ,
|
|
hi ,
|
|
&m_words ,
|
|
&m_phrases ,
|
|
NULL,//&m_synonyms ,
|
|
&m_sections ,
|
|
&m_countTable ,
|
|
fragVec ,
|
|
wordSpamVec ,
|
|
langVec ,
|
|
m_langId , // defaultLangId docLangId
|
|
m_pbuf ,
|
|
m_wts ,
|
|
&m_wbuf ,
|
|
m_niceness );
|
|
}
|
|
|
|
// . this now uses posdb exclusively
|
|
bool XmlDoc::hashWords3 ( //int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
HashInfo *hi ,
|
|
Words *words ,
|
|
Phrases *phrases ,
|
|
Synonyms *synonyms ,
|
|
Sections *sectionsArg ,
|
|
HashTableX *countTable ,
|
|
char *fragVec ,
|
|
char *wordSpamVec ,
|
|
char *langVec ,
|
|
char docLangId , // default lang id
|
|
//Weights *weights ,
|
|
SafeBuf *pbuf ,
|
|
HashTableX *wts ,
|
|
SafeBuf *wbuf ,
|
|
int32_t niceness ) {
|
|
|
|
//
|
|
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
|
|
//
|
|
//if ( ! m_storeTermListInfo )
|
|
// return true;
|
|
|
|
Sections *sections = sectionsArg;
|
|
// for getSpiderStatusDocMetaList() we don't use sections it'll
|
|
// mess us up
|
|
if ( ! hi->m_useSections ) sections = NULL;
|
|
|
|
// shortcuts
|
|
uint64_t *wids = (uint64_t *)words->getWordIds();
|
|
//nodeid_t *tids = words->m_tagIds;
|
|
uint64_t *pids2 = (uint64_t *)phrases->m_phraseIds2;
|
|
//uint64_t *pids3 = (uint64_t *)phrases->m_phraseIds3;
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// . sanity checks
|
|
// . posdb just uses the full keys with docid
|
|
if ( dt->m_ks != 18 ) { char *xx=NULL;*xx=0; }
|
|
if ( dt->m_ds != 4 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if provided...
|
|
if ( wts ) {
|
|
if ( wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
|
|
if ( wts->m_ds != sizeof(TermDebugInfo)){char *xx=NULL;*xx=0; }
|
|
if ( ! wts->m_allowDups ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// ensure caller set the hashGroup
|
|
if ( hi->m_hashGroup < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// handy
|
|
char **wptrs = words->getWordPtrs();
|
|
int32_t *wlens = words->getWordLens();
|
|
|
|
// hash in the prefix
|
|
uint64_t prefixHash = 0LL;
|
|
int32_t plen = 0;
|
|
if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && plen ) {
|
|
// we gotta make this case insensitive, and skip spaces
|
|
// because if it is 'focal length' we can't search
|
|
// 'focal length:10' because that comes across as TWO terms.
|
|
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
|
|
// . sanity test, make sure it is in supported list
|
|
// . hashing diffbot json output of course fails this so
|
|
// skip in that case if diffbot
|
|
//if ( ! m_isDiffbotJSONObject &&
|
|
// getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
|
|
// if (hi->m_desc&&strcmp(hi->m_desc,"custom meta tag")) {
|
|
// char *xx=NULL;*xx=0; }
|
|
//}
|
|
}
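// Illustrative example (reusing the field name from the comment above): for a
// prefix like "focal length", prefixHash is computed case-insensitively with
// spaces skipped, and each word below is indexed as
//   h = hash64 ( wids[i] , prefixHash );
// so a query like 'focal length:10' can still hit the right term even though
// it tokenizes as two words.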
|
|
|
|
bool hashIffUnique = false;
|
|
//if ( hi->m_hashGroup == HASHGROUP_INLINKTEXT ) hashIffUnique = true;
|
|
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
|
|
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
|
|
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,niceness,"uqtbl");
|
|
|
|
///////
|
|
//
|
|
// diversity rank vector.
|
|
//
|
|
///////
|
|
// the final diversity which is a multiplier
|
|
// is converted into a rank from 0-15 i guess.
|
|
// so 'mexico' in "new mexico" should receive a low word score but high
|
|
// phrase score. thus, a search for 'mexico' should not bring up
|
|
// the page for university of new mexico!
|
|
SafeBuf dwbuf;
|
|
if(!getDiversityVec ( words,phrases,countTable,&dwbuf,niceness))
|
|
return false;
|
|
char *wdv = dwbuf.getBufStart();
|
|
|
|
int32_t nw = words->getNumWords();
|
|
|
|
/////
|
|
//
|
|
// calculate density ranks
|
|
//
|
|
/////
|
|
//
|
|
// this now varies depending on the length of the sentence/header etc.
|
|
// so if the hashGroup is not title, link text or meta tag, we have to
|
|
// use a safebuf.
|
|
SafeBuf densBuf;
|
|
// returns false and sets g_errno on error
|
|
if ( ! getDensityRanks((int64_t *)wids,
|
|
nw,//wordStart,
|
|
//wordEnd,
|
|
hi->m_hashGroup,
|
|
&densBuf,
|
|
sections,
|
|
m_niceness))
|
|
return false;
|
|
// a handy ptr
|
|
char *densvec = (char *)densBuf.getBufStart();
|
|
|
|
////////////
|
|
//
|
|
// get word positions
|
|
//
|
|
///////////
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
SafeBuf wpos;
|
|
if ( ! getWordPosVec ( words ,
|
|
sections,
|
|
//wordStart,
|
|
//wordEnd,
|
|
m_dist, // hi->m_startDist,
|
|
fragVec,
|
|
niceness,
|
|
&wpos) ) return false;
|
|
// a handy ptr
|
|
int32_t *wposvec = (int32_t *)wpos.getBufStart();
|
|
|
|
/*
|
|
// show that for debug
|
|
if ( m_docId == 192304365235LL ) {
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
char buf[1000];
|
|
int32_t len = wlens[i];
|
|
if ( len > 900 ) len = 900;
|
|
gbmemcpy(buf,wptrs[i],len);
|
|
buf[len]='\0';
|
|
log("seopipe: wptr=%s pos[%"INT32"]=%"INT32"",buf,i,wposvec[i]);
|
|
}
|
|
}
|
|
*/
|
|
|
|
//int32_t wc = 0;
|
|
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
|
|
|
|
int32_t i;
|
|
for ( i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
if ( ! wids[i] ) continue;
|
|
// ignore if in repeated fragment
|
|
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
|
|
// ignore if in style section
|
|
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
|
|
|
|
// do not breach wordpos bits
|
|
if ( wposvec[i] > MAXWORDPOS ) break;
|
|
|
|
// . hash the startHash with the wordId for this word
|
|
// . we must mask it before adding it to the table because
|
|
// this table is also used to hash IndexLists into that come
|
|
// from LinkInfo classes (incoming link text). And when
|
|
// those IndexLists are hashed they used masked termIds.
|
|
// So we should too...
|
|
//uint64_t h = g_indexdb.getTermId ( startHash , wids[i] ) ;
|
|
uint64_t h ;
|
|
if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash );
|
|
else h = wids[i];
|
|
|
|
// . get word spam rank. 0 means not spammed
|
|
// . just mod Weights class to use a weight rank...
|
|
// . and diversity rank
|
|
// . need to separate weights by spam vs. diversity.
|
|
// . maybe just have a diversity class and a pattern class
|
|
// and leave the poor weights class alone
|
|
//int32_t wsr = 0;
|
|
|
|
int32_t hashGroup = hi->m_hashGroup;
|
|
|
|
Section *sx = NULL;
|
|
if ( sp ) {
|
|
sx = sp[i];
|
|
// . this is taken care of in hashTitle()
|
|
// . it is slightly different if the title is
|
|
// multiple sentences because when hashing the
|
|
// body the density rank is per sentence, but in
|
|
// hashTitle we count all the words in the title
|
|
// towards the density rank even if they are
|
|
// in different sentences
|
|
if ( sx->m_flags & SEC_IN_TITLE )
|
|
//hashGroup = HASHGROUP_TITLE;
|
|
continue;
|
|
if ( sx->m_flags & SEC_IN_HEADER )
|
|
hashGroup = HASHGROUP_HEADING;
|
|
if ( sx->m_flags & ( SEC_MENU |
|
|
SEC_MENU_SENTENCE |
|
|
SEC_MENU_HEADER ) )
|
|
hashGroup = HASHGROUP_INMENU;
|
|
}
|
|
|
|
// this is for link text and meta tags mostly
|
|
if ( hashIffUnique ) {
|
|
// skip if already did it
|
|
if ( ut.isInTable ( &h ) ) continue;
|
|
if ( ! ut.addKey ( &h ) ) return false;
|
|
}
|
|
|
|
char ws = 15;
|
|
if ( wordSpamVec ) ws = wordSpamVec[i];
|
|
|
|
// HACK:
|
|
// if this is inlink text, use the wordspamrank to hold the
|
|
// inlinker's site rank!
|
|
if ( hashGroup == HASHGROUP_INLINKTEXT )
|
|
ws = hi->m_linkerSiteRank;
|
|
|
|
// default to the document's primary language if it is not
|
|
// clear what language this word belongs to.
|
|
// if the word is only in german it should be german,
|
|
// otherwise it will be the document's primary language.
|
|
char langId = langUnknown;
|
|
if ( m_wts && langVec ) langId = langVec[i];
|
|
// keep it as the original vector. i'm not sure we use
|
|
// this for anything but for display, so show the user
|
|
// how we made our calculation of the document's primary lang
|
|
//if ( langId == langUnknown ) langId = docLangId;
|
|
|
|
char wd;
|
|
if ( hi->m_useCountTable ) wd = wdv[i];
|
|
else wd = MAXDIVERSITYRANK;
|
|
|
|
// if using posdb
|
|
key144_t k;
|
|
// if ( i == 11429 )
|
|
// log("foo");
|
|
g_posdb.makeKey ( &k ,
|
|
h ,
|
|
0LL,//docid
|
|
wposvec[i], // dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
wd, // diversityRank 0-15
|
|
ws, // wordSpamRank 0-15
|
|
0, // siterank
|
|
hashGroup ,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
false , // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
// get the one we lost
|
|
// char *kstr = KEYSTR ( &k , sizeof(POSDBKEY) );
|
|
// if (!strcmp(kstr,"0x0ca3417544e400000000000032b96bf8aa01"))
|
|
// log("got lost key");
|
|
|
|
// key should NEVER collide since we are always incrementing
|
|
// the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
|
|
|
|
// . make the m_wordPosInfoBuf here because we need to set
|
|
// WordPosInfo::m_wordPtr/m_wordLen.
|
|
// . could also use instead of the "wts" buffer?
|
|
if ( m_doingSEO ) {
|
|
// alloc in 10k chunks
|
|
if ( m_wordPosInfoBuf.getAvail() <
|
|
(int32_t)sizeof(WordPosInfo) ) {
|
|
int32_t newSize = m_wordPosInfoBuf.length();
|
|
newSize += 10000;
|
|
if ( ! m_wordPosInfoBuf.reserve ( newSize ) )
|
|
return false;
|
|
}
|
|
// make it
|
|
WordPosInfo wi;
|
|
wi.m_wordPtr = wptrs[i];
|
|
wi.m_wordLen = wlens[i];
|
|
wi.m_wordPos = wposvec[i];
|
|
wi.m_densityRank = densvec[i];
|
|
wi.m_wordSpamRank = ws;
|
|
wi.m_diversityRank = wd;//v[i];
|
|
wi.m_hashGroup = hashGroup;
|
|
wi.m_trafficGain = 0;
|
|
int32_t cs = sizeof(WordPosInfo);
|
|
if(!m_wordPosInfoBuf.safeMemcpy(&wi,cs)) return false;
|
|
}
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( wts ) {
|
|
if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i,
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
wd,//v[i],
|
|
ws,
|
|
hashGroup,
|
|
//false, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
SOURCE_NONE, // synsrc
|
|
langId ,
|
|
k))
|
|
return false;
|
|
}
|
|
|
|
//
|
|
// STRIP POSSESSIVE WORDS for indexing
|
|
//
|
|
// . for now do simple stripping here
|
|
// . if word is "bob's" hash "bob"
|
|
//
|
|
if ( wlens[i] >= 3 &&
|
|
wptrs[i][wlens[i]-2] == '\'' &&
|
|
to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
|
|
int64_t nah ;
|
|
nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 );
|
|
if ( plen>0 ) nah = hash64 ( nah , prefixHash );
|
|
g_posdb.makeKey ( &k ,
|
|
nah,
|
|
0LL,//docid
|
|
wposvec[i], // dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
wd,//v[i], // diversityRank ,
|
|
ws, // wordSpamRank ,
|
|
0, //siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
// keep going if not debug
|
|
if ( ! wts ) continue;
|
|
// print the synonym
|
|
if ( ! storeTerm(wptrs[i], // synWord,
|
|
wlens[i] -2, // gbstrlen(synWord),
|
|
nah, // termid
|
|
hi,
|
|
i, // wordnum
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
wd,//v[i],
|
|
ws,
|
|
hashGroup,
|
|
//false, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
SOURCE_GENERATED,
|
|
langId,
|
|
k) )
|
|
return false;
|
|
}
|
|
|
|
|
|
/////////////
|
|
//
|
|
// synonyms (alt words,morphs,synonyms)
|
|
//
|
|
/////////////
|
|
/*
|
|
int64_t *aids = NULL;
|
|
int16_t naids = 0;
|
|
int64_t syh;
|
|
if ( synonyms ) {
|
|
aids = synonyms->getAltIds (i);
|
|
naids = synonyms->getNumAlts(i);
|
|
//ascore = saved / 4;
|
|
//if ( ascore <= 0 ) ascore = 1;
|
|
//asaved = ascore;
|
|
}
|
|
for ( int32_t j = 0 ; j < naids ; j++ ) {
|
|
// skip if same as original
|
|
if ( (uint64_t)aids[j] == wids[i] ) continue;
|
|
// . hash it with the prefix if any
|
|
// . fixes gbwhere:galleries bug...
|
|
if ( plen>0 ) syh = hash64 ( aids[j] , prefixHash );
|
|
else syh = aids[j];
|
|
g_posdb.makeKey ( &k ,
|
|
syh ,
|
|
0LL,//docid
|
|
wposvec[i], // dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
wdv[i], // diversityRank ,
|
|
ws, // wordSpamRank ,
|
|
0, //siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false ); // delkey?
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
|
|
// keep going if not debug
|
|
if ( ! wts ) continue;
|
|
// get the junk
|
|
char *synWord = synonyms->getStringFromId(&aids[j]);
|
|
// sanity
|
|
if ( ! synWord ) { char *xx=NULL;*xx=0; }
|
|
// print the synonym
|
|
if ( ! storeTerm(synWord,
|
|
gbstrlen(synWord),
|
|
syh, // termid
|
|
hi,
|
|
i, // wordnum
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
wdv[i],
|
|
ws,
|
|
hashGroup,
|
|
//false, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
synonyms->m_source[i], // synsrc
|
|
langId) )
|
|
return false;
|
|
}
|
|
*/
|
|
|
|
////////
|
|
//
|
|
// two-word phrase
|
|
//
|
|
////////
|
|
|
|
int64_t npid = pids2[i];
|
|
int32_t npw = 2;
|
|
uint64_t ph2 = 0;
|
|
|
|
// repeat for the two word hash if different!
|
|
if ( npid ) {
|
|
// hash with prefix
|
|
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
|
|
else ph2 = npid;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0LL,//docid
|
|
wposvec[i],//dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK, //phrase
|
|
ws, // wordSpamRank ,
|
|
0,//siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
}
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( wts && npid ) {
|
|
// get phrase as a string
|
|
int32_t plen;
|
|
char *phr=phrases->getPhrase(i,&plen,npw);
|
|
// store it
|
|
if ( ! storeTerm ( phr,plen,ph2,hi,i,
|
|
wposvec[i], // wordPos
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK,//phrase
|
|
ws,
|
|
hashGroup,
|
|
//true,
|
|
wbuf,
|
|
wts,
|
|
SOURCE_BIGRAM, // synsrc
|
|
langId,
|
|
k) )
|
|
return false;
|
|
}
|
|
|
|
////////
|
|
//
|
|
// three-word phrase
|
|
//
|
|
////////
|
|
/*
|
|
npid = pids3[i];
|
|
npw = 3;
|
|
|
|
// repeat for the two word hash if different!
|
|
if ( npid ) {
|
|
// hash with prefix
|
|
uint64_t ph2 ;
|
|
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
|
|
else ph2 = npid;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0LL,//docid
|
|
wposvec[i],//dist,
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK, //phrase
|
|
ws, // wordSpamRank ,
|
|
0,//siterank
|
|
hashGroup,
|
|
// we set to docLang final hash loop
|
|
langUnknown, // langid
|
|
0 , // multiplier
|
|
true , // syn?
|
|
false ); // delkey?
|
|
// key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
dt->addTerm144 ( &k );
|
|
}
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( wts && npid ) {
|
|
// get phrase as a string
|
|
int32_t plen;
|
|
char *phr=phrases->getPhrase(i,&plen,npw);
|
|
// store it
|
|
if ( ! storeTerm ( phr,plen,ph2,hi,i,
|
|
wposvec[i], // wordpos
|
|
densvec[i],// densityRank , // 0-15
|
|
MAXDIVERSITYRANK,//phrase
|
|
ws,
|
|
hashGroup,
|
|
//true, // is phrase?
|
|
wbuf,
|
|
wts,
|
|
SOURCE_TRIGRAM, // synsrc
|
|
langId ) )
|
|
return false;
|
|
}
|
|
*/
|
|
// update for hashIncomingLinkText()
|
|
//hi->m_startDist = wposvec[i];
|
|
|
|
// debug point
|
|
//if ( ph2 == (uint64_t)-233869093807964777LL ) {
|
|
// log("hey slot=%"INT32" date=%"UINT32" n0=%"INT64" score=%"INT32"",
|
|
// slot,
|
|
// k.n1,k.n0,
|
|
// score);
|
|
// //char *xx=NULL;*xx=0;
|
|
//}
|
|
|
|
//
|
|
// NUMERIC SORTING AND RANGES
|
|
//
|
|
|
|
// only store numbers in fields this way
|
|
if ( prefixHash == 0 ) continue;
|
|
|
|
// this may or may not be numeric.
|
|
if ( ! is_digit ( wptrs[i][0] ) ) continue;
|
|
|
|
// this might have to "back up" before any '.' or '-' symbols
|
|
if ( ! hashNumber ( wptrs[0] ,
|
|
wptrs[i] ,
|
|
wlens[i] ,
|
|
hi ) )
|
|
return false;
|
|
}
|
|
|
|
// hash a single term so they can do gbfacet:ext or
|
|
// gbfacet:siterank or gbfacet:price. a field on a field.
|
|
if ( prefixHash && words->m_numWords )
|
|
// hash gbfacet:price with and store the price in the key
|
|
hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi );
|
|
|
|
// between calls? i.e. hashTitle() and hashBody()
|
|
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
|
|
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
|
|
|
|
return true;
|
|
}
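// Rough recap of what each loop iteration above encodes into a posdb key
// (descriptive, not a spec): the prefixed term hash, the word position from
// wposvec[], the density rank from densvec[], the diversity rank (or
// MAXDIVERSITYRANK for bigrams), the word-spam rank (reused to hold the
// linker's site rank for HASHGROUP_INLINKTEXT), and the hash group; docid,
// siterank and langid are left for the final hash loop per the comments above.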
|
|
|
|
// just like hashNumber*() functions but we use "gbfacet" as the
|
|
// primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby,
|
|
// gbsortbyint, gbrevsortby, gbrevsortbyint
|
|
bool XmlDoc::hashFacet1 ( char *term ,
|
|
Words *words ,
|
|
HashTableX *tt ) {
|
|
|
|
// need a prefix
|
|
//if ( ! hi->m_prefix ) return true;
|
|
|
|
// hash the ENTIRE content, all words as one blob
|
|
int32_t nw = words->getNumWords();
|
|
char *a = words->m_words[0];
|
|
char *b = words->m_words[nw-1]+words->m_wordLens[nw-1];
|
|
// hash the whole string as one value, the value of the facet
|
|
int32_t val32 = hash32 ( a , b - a );
|
|
|
|
if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;
|
|
|
|
//
|
|
// why do this if we already do it for hashNumber() using gbsortby: ?
|
|
//
|
|
|
|
/*
|
|
// if it's a number hash as float and int
|
|
if ( nw != 1 ) return true;
|
|
char **wptrs = words->m_words;
|
|
if ( ! is_digit ( wptrs[0][0] ) ) return true;
|
|
|
|
// hash with a float val
|
|
float f = atof(wptrs[0]);
|
|
int32_t vf32 = *(int32_t *)&f;
|
|
if ( ! hashFacet2 ( "gbfacetfloat",term, vf32 , tt ) ) return false;
|
|
|
|
// and an int val
|
|
int32_t vi32 = atoi(wptrs[0]);
|
|
if ( ! hashFacet2 ( "gbfacetint",term, vi32 , tt ) ) return false;
|
|
*/
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashFacet2 ( char *prefix,
|
|
char *term ,
|
|
int32_t val32 ,
|
|
HashTableX *tt ,
|
|
// we only use this for gbxpathsitehash terms:
|
|
bool shardByTermId ) {
|
|
|
|
// need a prefix
|
|
//if ( ! hi->m_prefix ) return true;
|
|
//int32_t plen = gbstrlen ( hi->m_prefix );
|
|
//if ( plen <= 0 ) return true;
|
|
// we gotta make this case insensitive, and skip spaces
|
|
// because if it is 'focal length' we can't search
|
|
// 'focal length:10' because that comes across as TWO terms.
|
|
//int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen);
|
|
|
|
// now any field has to support gbfacet:thatfield
|
|
// and store the 32-bit termid into where we normally put
|
|
// the word position bits, etc.
|
|
//static int64_t s_facetPrefixHash = 0LL;
|
|
//if ( ! s_facetPrefixHash )
|
|
// s_facetPrefixHash = hash64n ( "gbfacet" );
|
|
|
|
// this is case-sensitive
|
|
int64_t prefixHash = hash64n ( prefix );
|
|
|
|
// term is like something like "object.price" or whatever.
|
|
// it is the json field itself, or the meta tag name, etc.
|
|
int64_t termId64 = hash64n ( term );
|
|
|
|
// combine with the "gbfacet" prefix. old prefix hash on right.
|
|
// like "price" on right and "gbfacetfloat" on left... see Query.cpp.
|
|
int64_t ph2 = hash64 ( termId64, prefixHash );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
g_posdb.setInt ( &k , val32 );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
HashTableX *dt = tt;//hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
bool isFloat = false;
|
|
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
|
|
|
|
// store in buffer for display on pageparser.cpp output
|
|
char buf[130];
|
|
if ( isFloat )
|
|
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
|
|
*(float *)&val32);
|
|
else
|
|
snprintf(buf,128,"facetField=%s facetVal32=%"UINT32"",
|
|
term,(uint32_t)val32);
|
|
int32_t bufLen = gbstrlen(buf);
|
|
|
|
// make a special hashinfo for this facet
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
// the full prefix
|
|
char fullPrefix[66];
|
|
snprintf(fullPrefix,64,"%s:%s",prefix,term);
|
|
hi.m_prefix = fullPrefix;//"gbfacet";
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
ph2, // prefixHash, // s_facetPrefixHash,
|
|
&hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
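// A sketch with a made-up field name: for a JSON field "product.color",
//   int64_t ph2 = hash64 ( hash64n("product.color") , hash64n("gbfacetstr") );
// and the 32-bit facet value goes into the key via g_posdb.setInt(), with the
// alignment bit cleared so addTerm144() treats it as a numeric posdb key and
// leaves the siterank/langid bits alone (see the HACK comment above).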
|
|
|
|
bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
|
|
|
|
HashTableX *tt = hi->m_tt;
|
|
|
|
uint64_t val64 = hash64 ( val , vlen );
|
|
|
|
// term is like something like "object.price" or whatever.
|
|
// it is the json field itself, or the meta tag name, etc.
|
|
uint64_t middlePrefix = hash64n ( hi->m_prefix );
|
|
|
|
// hash "This is a new product." with "object.desc".
|
|
// "object.desc" (termId64) is case-sensitive.
|
|
uint64_t composite = hash64 ( val64 , middlePrefix );
|
|
|
|
// hash that with "gbfieldmatch"
|
|
char *prefix = "gbfieldmatch";
|
|
uint64_t prefixHash = hash64n ( prefix );
|
|
uint64_t ph2 = hash64 ( composite , prefixHash );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
false ) ; // shardByTermId? no, by docid.
|
|
|
|
HashTableX *dt = tt;//hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer for display on pageparser.cpp output
|
|
char buf[128];
|
|
int32_t bufLen ;
|
|
bufLen = sprintf(buf,"gbfieldmatch:%s:%"UINT64"",hi->m_prefix,val64);
|
|
|
|
// make a special hashinfo for this facet
|
|
HashInfo hi2;
|
|
hi2.m_tt = tt;
|
|
// the full prefix
|
|
char fullPrefix[64];
|
|
snprintf(fullPrefix,62,"%s:%s",prefix,hi->m_prefix);
|
|
hi2.m_prefix = fullPrefix;//"gbfacet";
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
ph2, // prefixHash, // s_facetPrefixHash,
|
|
&hi2,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
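// Illustrative sketch (added for clarity, not part of the original code and
// not called anywhere): a query-side helper would arrive at the same
// composite hash that hashFieldMatchTerm() above uses as the termid, by
// hashing in the same order: value hash, then field-name hash, then the
// "gbfieldmatch" prefix hash. The helper name is hypothetical; hash64()
// and hash64n() are the same hashers used above (from hash.h).
static uint64_t exampleFieldMatchTermId ( char *field , char *val ,
					  int32_t vlen ) {
	char *prefix = "gbfieldmatch";
	uint64_t val64        = hash64  ( val , vlen );
	uint64_t middlePrefix = hash64n ( field );
	uint64_t composite    = hash64  ( val64 , middlePrefix );
	uint64_t prefixHash   = hash64n ( prefix );
	return hash64 ( composite , prefixHash );
}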
|
|
|
|
|
|
// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of the
//   posdb key
// . the termid is the hash of the preceding field
// . in json docs a field is like "object.details.price"
// . in meta tags it is just the meta tag name
// . credit card numbers are 16 digits. we'd need like 58 bits to store those
//   so we can't do that here, but we can approximate as a float
// . the binary representation of floating point numbers is ordered in the
//   same order as the floating points themselves! so we are lucky and can
//   keep our usual KEYCMP sorting algos to keep the floats in order.
bool XmlDoc::hashNumber ( char *beginBuf ,
			  char *buf ,
			  int32_t bufLen ,
			  HashInfo *hi ) {

	if ( ! is_digit(buf[0]) ) return true;

	char *p = buf;
	char *bufEnd = buf + bufLen;

	// back up over any '.'
	if ( p > beginBuf && p[-1] == '.' ) p--;

	// negative sign?
	if ( p > beginBuf && p[-1] == '-' ) p--;

	// . convert it to a float
	// . this now allows for commas in numbers like "1,500.62"
	float f = atof2 ( p , bufEnd - p );

	// debug
	//log("build: hashing %s %f",hi->m_prefix,f);

	if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
		return false;

	// also hash in reverse order for sorting from low to high
	f = -1.0 * f;

	if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
		return false;

	//
	// also hash as a 4-byte integer so our lastSpidered timestamps
	// don't lose 128 seconds of resolution
	//

	int32_t i = (int32_t) atoll2 ( p , bufEnd - p );

	if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) )
		return false;

	// also hash in reverse order for sorting from low to high
	i = -1 * i;

	if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) )
		return false;

	return true;
}
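// Minimal sketch (illustration only, not called anywhere) of the property
// the comment above hashNumber() relies on: for non-negative IEEE-754
// floats the raw 32-bit patterns compare in the same order as the values
// themselves, which is why posdb's ordinary key compare keeps a gbsortby
// termlist in numeric order without any float-aware comparison code.
static bool exampleFloatBitsKeepOrder ( float a , float b ) {
	// assumes 0 <= a <= b
	uint32_t abits ; memcpy ( &abits , &a , 4 );
	uint32_t bbits ; memcpy ( &bbits , &b , 4 );
	// holds for any pair of non-negative floats
	return abits <= bbits;
}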
|
|
|
|
// . THIS IS NOW replaced by ::hashFacet2() being called by hashSections()
//   above. it is a more generic, faceted approach.
// . the term is gbxpathsite123456, the prefix is gbfacet, the val32
//   stored in the posdb key is the inner html hash of the section, and
//   the "123456" is the hash of the xpath and site. so the field names
//   are very custom, not your typical "ext" or "title"
// . CHROME DETECTION
// . hash a special "gbxpathsitehash12345678" term which has the hash of the
//   innerHTML content embedded in it.
// . we do this for doing gbfacetstr:gbxpathsitehash12345678 etc. on every
//   section with innerHTML so we can figure out the histogram of each
//   section on this page relative to its subdomain, i.e. the distribution
//   of the innerHTML for this section as it appears on other pages from
//   this site. this allows killer CHROME DETECTION!!!!
/*
|
|
bool XmlDoc::hashSectionTerm ( char *term , HashInfo *hi , int32_t sentHash32 ) {
|
|
|
|
int64_t termId = hash64 ( term , gbstrlen(term) );
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
termId,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
g_posdb.setInt ( &k , sentHash32 );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
// sanity
|
|
int t = g_posdb.getInt ( &k );
|
|
if ( t != sentHash32 ) { char *xx=NULL;*xx=0; }
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer
|
|
//char buf[128];
|
|
//int32_t bufLen = sprintf(buf,"%"UINT32"",sentHash32);
|
|
|
|
// if no gbmin or gbmax or gbsorty or gbrevsortby we need gbfacet
|
|
//int64_t truePrefix64 = hash64n ( "gbfacet" );
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( term,//buf,
|
|
gbstrlen(term),//bufLen,
|
|
0LL,//truePrefix64,
|
|
hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
|
|
|
|
// prefix is something like price. like the meta "name" or
|
|
// the json name with dots in it like "product.info.price" or something
|
|
int64_t nameHash = 0LL;
|
|
int32_t nameLen = 0;
|
|
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && nameLen )
|
|
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
|
|
// need a prefix for hashing numbers... for now
|
|
else { char *xx=NULL; *xx=0; }
|
|
|
|
// combine prefix hash with a special hash to make it unique to avoid
|
|
// collisions. this is the "TRUE" prefix.
|
|
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
|
|
// hash with the "TRUE" prefix
|
|
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
g_posdb.setFloat ( &k , f );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
// sanity
|
|
float t = g_posdb.getFloat ( &k );
|
|
if ( t != f ) { char *xx=NULL;*xx=0; }
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer
|
|
char buf[128];
|
|
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
|
|
int32_t bufLen = gbstrlen(buf);
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
truePrefix64,
|
|
hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
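// Illustrative sketch (not called anywhere): how a query-side
// "gbsortby:<field>" operator would recompute the termid that
// hashNumber2() above just indexed. The helper name is hypothetical; the
// hashing order simply mirrors the code above.
static int64_t exampleSortByTermId ( char *field , char *sortByStr ) {
	int64_t nameHash     = hash64Lower_utf8_nospaces ( field ,
							   gbstrlen(field) );
	int64_t truePrefix64 = hash64n ( sortByStr ); // e.g. "gbsortby"
	return hash64 ( nameHash , truePrefix64 );
}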
|
|
|
|
bool XmlDoc::hashNumber3 ( int32_t n , HashInfo *hi , char *sortByStr ) {
|
|
|
|
// prefix is something like price. like the meta "name" or
|
|
// the json name with dots in it like "product.info.price" or something
|
|
int64_t nameHash = 0LL;
|
|
int32_t nameLen = 0;
|
|
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && nameLen )
|
|
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
|
|
// need a prefix for hashing numbers... for now
|
|
else { char *xx=NULL; *xx=0; }
|
|
|
|
// combine prefix hash with a special hash to make it unique to avoid
|
|
// collisions. this is the "TRUE" prefix.
|
|
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
|
|
// hash with the "TRUE" prefix
|
|
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
|
|
|
|
// . now store it
|
|
// . use field hash as the termid. normally this would just be
|
|
// a prefix hash
|
|
// . use mostly fake value otherwise
|
|
key144_t k;
|
|
g_posdb.makeKey ( &k ,
|
|
ph2 ,
|
|
0,//docid
|
|
0,// word pos #
|
|
0,// densityRank , // 0-15
|
|
0 , // MAXDIVERSITYRANK
|
|
0 , // wordSpamRank ,
|
|
0 , //siterank
|
|
0 , // hashGroup,
|
|
// we set to docLang final hash loop
|
|
//langUnknown, // langid
|
|
// unless already set. so set to english here
|
|
// so it will not be set to something else
|
|
// otherwise our floats would be ordered by langid!
|
|
// somehow we have to indicate that this is a float
|
|
// termlist so it will not be mangled any more.
|
|
//langEnglish,
|
|
langUnknown,
|
|
0 , // multiplier
|
|
false, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
//int64_t final = hash64n("products.offerprice",0);
|
|
//int64_t prefix = hash64n("gbsortby",0);
|
|
//int64_t h64 = hash64 ( final , prefix);
|
|
//if ( ph2 == h64 )
|
|
// log("hey: got offer price");
|
|
|
|
// now set the float in that key
|
|
//g_posdb.setFloat ( &k , f );
|
|
g_posdb.setInt ( &k , n );
|
|
|
|
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
|
// so that we can b-step into a posdb list and make sure
|
|
// we are aligned on a 6 byte or 12 byte key, since they come
|
|
// in both sizes. but for this, hack it off to tell
|
|
// addTable144() that we are a special posdb key, a "numeric"
|
|
// key that has a float stored in it. then it will NOT
|
|
// set the siterank and langid bits which throw our sorting
|
|
// off!!
|
|
g_posdb.setAlignmentBit ( &k , 0 );
|
|
|
|
// sanity
|
|
//float t = g_posdb.getFloat ( &k );
|
|
int32_t x = g_posdb.getInt ( &k );
|
|
if ( x != n ) { char *xx=NULL;*xx=0; }
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// the key may indeed collide, but that's ok for this application
|
|
if ( ! dt->addTerm144 ( &k ) )
|
|
return false;
|
|
|
|
if ( ! m_wts )
|
|
return true;
|
|
|
|
// store in buffer
|
|
char buf[128];
|
|
snprintf(buf,126,"%s:%s int32=%"INT32"",sortByStr, hi->m_prefix,n);
|
|
int32_t bufLen = gbstrlen(buf);
|
|
|
|
// add to wts for PageParser.cpp display
|
|
// store it
|
|
if ( ! storeTerm ( buf,
|
|
bufLen,
|
|
truePrefix64,
|
|
hi,
|
|
0, // word#, i,
|
|
0, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, // MAXDIVERSITYRANK,//phrase
|
|
0, // ws,
|
|
0, // hashGroup,
|
|
//true,
|
|
&m_wbuf,
|
|
m_wts,
|
|
// a hack for display in wts:
|
|
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
|
langUnknown ,
|
|
k ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
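// Why hashNumber() also indexes a separate 4-byte integer form through
// this function (the "gbsortbyint"/"gbrevsortbyint" calls): a 32-bit
// float only has a 24-bit mantissa, so values the size of current unix
// timestamps (~1.4e9) round to multiples of 128. Small sketch,
// illustration only, not called anywhere.
static int32_t exampleFloatResolutionLoss ( int32_t lastSpidered ) {
	float f = (float)lastSpidered;     // e.g. 1400000123
	return lastSpidered - (int32_t)f;  // often non-zero for such values
}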
|
|
|
|
// . many many websites got hijacked pages in them...
// . revkim.org/mcdrt/mgntf/sata/sata.htm
// . collegefootballweekly.net/hswsj/riime/sata/sata.htm
char *XmlDoc::getIsHijacked() {
	bool hj = false;
	if ( ! hj ) hj = isHijackerFormat ( ptr_firstUrl );
	if ( ! hj ) hj = isHijackerFormat ( ptr_redirUrl );
	if ( ! hj ) {
		m_isHijacked = false;
		m_isHijackedValid = true;
		return &m_isHijacked;
	}
	uint32_t *h1 = getTagPairHash32();
	if ( ! h1 || h1 == (void *)-1 ) return (char *)h1;
	// TODO: check it for the malicious tag formats here!!
	m_isHijacked = false;
	m_isHijackedValid = true;
	return &m_isHijacked;
}
|
|
|
|
// is it a custom error page? ppl do not always use status 404!
|
|
char *XmlDoc::getIsErrorPage ( ) {
|
|
if ( m_isErrorPageValid ) return &m_isErrorPage;
|
|
|
|
setStatus ( "getting is error page");
|
|
|
|
// need a buncha crap
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
|
|
// get local link info
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// error or blocked
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
|
|
// get remote link info
|
|
LinkInfo **pinfo2 = getLinkInfo2();
|
|
// error or blocked
|
|
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
|
|
// convenience
|
|
LinkInfo *info2 = *pinfo2;
|
|
|
|
// default
|
|
LinkInfo *li = info1;
|
|
|
|
// we have to be more sophisticated with longer pages because they
// could actually be talking about an error message.
|
|
//if(xml->getContentLen() > 4096) return false;
|
|
|
|
|
|
// assume not
|
|
m_isErrorPage = false;
|
|
m_isErrorPageValid = true;
|
|
|
|
int32_t nn = xml->getNumNodes();
|
|
int32_t i;
|
|
|
|
char* s;
|
|
int32_t len;
|
|
int32_t len2;
|
|
|
|
char* errMsg = NULL;
|
|
|
|
int32_t numChecked = 0;
|
|
// check the first header and title tag
|
|
// limit it to first 32 nodes
|
|
if(nn > 32) nn = 32;
|
|
for ( i = 0 ; i < nn ; i++ ) {
|
|
switch(xml->getNodeId(i)) {
|
|
case TAG_TITLE:
|
|
case TAG_H1:
|
|
case TAG_H2:
|
|
case TAG_H3:
|
|
case TAG_SPAN:
|
|
char* p = xml->getString(i,true,&len);
|
|
if(len == 0 || len > 1024) continue;
|
|
char* pend = p + len;
|
|
errMsg = matchErrorMsg(p, pend );
|
|
++numChecked;
|
|
break;
|
|
}
|
|
if(errMsg || numChecked > 1) break;
|
|
}
|
|
if(!errMsg) return &m_isErrorPage;
|
|
len = gbstrlen(errMsg);
|
|
|
|
// make sure the error message was not present in the link text
|
|
loop:
|
|
if ( li && li->getNumGoodInlinks() > 5 ) return &m_isErrorPage;
|
|
for (Inlink *k=NULL;li && (k=li->getNextInlink(k)); ) {
|
|
//int32_t nli = li->getNumLinkTexts();
|
|
//if we can index some link text from the page, then do it
|
|
//if(nli > 5) return false;
|
|
//for ( int32_t i = 0 ; i < nli ; i++ ) {
|
|
s = k->getLinkText();
|
|
len2 = k->size_linkText - 1; // exclude \0
|
|
//if(!s) break;
|
|
//allow error msg to contain link text or vice versa
|
|
if(len < len2) {
|
|
if(strncasestr(errMsg, s,len,len2) != NULL)
|
|
return &m_isErrorPage;
|
|
}
|
|
else {
|
|
if(strncasestr(s, errMsg,len2,len) != NULL)
|
|
return &m_isErrorPage;
|
|
}
|
|
}
|
|
|
|
if ( li ) { li = info2; info2 = NULL; goto loop; }
|
|
|
|
m_isErrorPage = true;
|
|
return &m_isErrorPage;
|
|
}
|
|
|
|
|
|
char* XmlDoc::matchErrorMsg(char* p, char* pend ) {
|
|
char utf8Buf[1024];
|
|
// int32_t utf8Len = 0;
|
|
int32_t len = pend - p;
|
|
|
|
if(len > 1024) len = 1024;
|
|
pend = p + len;
|
|
char* tmp = utf8Buf;
|
|
while(p < pend) {
|
|
*tmp = to_lower_a(*p);
|
|
tmp++; p++;
|
|
}
|
|
|
|
p = utf8Buf;
|
|
pend = p + len;
|
|
|
|
char* errMsg = NULL;
|
|
|
|
while(p < pend) {
|
|
int32_t r = pend - p;
|
|
switch (*p) { //sorted by first letter, then by frequency
|
|
case '4':
|
|
errMsg = "404 error";
|
|
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
|
|
errMsg = "403 forbidden";
|
|
if(r>=13&&strncmp(p, errMsg, 13) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'd':
|
|
errMsg = "detailed error information follows";
|
|
if(r>=34&&strncmp(p, errMsg, 34) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'e':
|
|
errMsg = "error 404";
|
|
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
|
|
errMsg = "error was encountered while processing "
|
|
"your request";
|
|
if(r>=51&&strncmp(p, errMsg,51) == 0) return errMsg;
|
|
|
|
errMsg = "error occurred while processing request";
|
|
if(r>=39&&strncmp(p, errMsg, 39) == 0) return errMsg;
|
|
errMsg = "exception error has occurred";
|
|
if(r>=28&&strncmp(p, errMsg,28) == 0) return errMsg;
|
|
errMsg = "error occurred";
|
|
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
|
|
//http://www.gnu.org/fun/jokes/unix.errors.html
|
|
//errMsg = "error message";
|
|
//if(strncmp(p, errMsg, 13) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'f':
|
|
errMsg = "file not found";
|
|
if(r>=14&&strncmp(p, errMsg, 14) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'h':
|
|
errMsg = "has moved";
|
|
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'n':
|
|
errMsg = "no referrer";
|
|
if(r>=11&&strncmp(p, errMsg,11) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'o':
|
|
errMsg = "odbc error code = ";
|
|
if(r>=18&&strncmp(p, errMsg,18) == 0) return errMsg;
|
|
errMsg = "object not found";
|
|
if(r>=16&&strncmp(p, errMsg,16) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'p':
|
|
errMsg = "page not found";
|
|
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
|
|
break;
|
|
|
|
case 's':
|
|
errMsg = "system error";
|
|
if(r>=12&&strncmp(p, errMsg, 12) == 0) return errMsg;
|
|
break;
|
|
case 't':
|
|
errMsg = "the application encountered an "
|
|
"unexpected problem";
|
|
if(r>=49&&strncmp(p, errMsg, 49) == 0) return errMsg;
|
|
errMsg = "the page you requested has moved";
|
|
if(r>=32&&strncmp(p, errMsg, 32) == 0) return errMsg;
|
|
errMsg = "this page has moved";
|
|
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
|
|
break;
|
|
|
|
case 'u':
|
|
errMsg = "unexpected problem has occurred";
|
|
if(r>=31&&strncmp(p, errMsg, 31) == 0) return errMsg;
|
|
errMsg = "unexpected error has occurred";
|
|
if(r>=29&&strncmp(p, errMsg, 29) == 0) return errMsg;
|
|
errMsg = "unexpected problem occurred";
|
|
if(r>=27&&strncmp(p, errMsg, 27) == 0) return errMsg;
|
|
errMsg ="unexpected error occurred";
|
|
if(r>=25&&strncmp(p, errMsg, 25) == 0) return errMsg;
|
|
errMsg ="unexpected result has occurred";
|
|
if(r>=33&&strncmp(p, errMsg, 33) == 0) return errMsg;
|
|
errMsg ="unhandled exception";
|
|
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
|
|
|
|
break;
|
|
|
|
case 'y':
|
|
errMsg = "you have been blocked";
|
|
if(r>=21&&strncmp(p, errMsg, 21) == 0) return errMsg;
|
|
break;
|
|
}
|
|
//skip to the beginning of the next word
|
|
while(p < pend && !is_wspace_a(*p)) p++;
|
|
while(p < pend && is_wspace_a(*p)) p++;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
#include "Spider.h"
|
|
|
|
static SafeBuf *s_wbuf = NULL;
|
|
|
|
// . this is used by gbsort() above
|
|
// . sorts TermInfos alphabetically by their TermInfo::m_term member
|
|
int cmptp (const void *v1, const void *v2) {
|
|
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
|
|
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
|
|
|
|
char *start = s_wbuf->getBufStart();
|
|
|
|
// prefix first
|
|
char *ps1 = start + t1->m_prefixOff;
|
|
char *ps2 = start + t2->m_prefixOff;
|
|
if ( t1->m_prefixOff < 0 ) ps1 = NULL;
|
|
if ( t2->m_prefixOff < 0 ) ps2 = NULL;
|
|
int32_t plen1 = 0; if ( ps1 ) plen1 = gbstrlen(ps1);
|
|
int32_t plen2 = 0; if ( ps2 ) plen2 = gbstrlen(ps2);
|
|
int32_t pmin = plen1;
|
|
if ( plen2 < pmin ) pmin = plen2;
|
|
int32_t pn = strncmp ( ps1 , ps2 , pmin );
|
|
if ( pn ) return pn;
|
|
if ( plen1 != plen2 ) return ( plen1 - plen2 );
|
|
|
|
// return if groups differ
|
|
int32_t len1 = t1->m_termLen;
|
|
int32_t len2 = t2->m_termLen;
|
|
int32_t min = len1;
|
|
if ( len2 < min ) min = len2;
|
|
char *s1 = start + t1->m_termOff;
|
|
char *s2 = start + t2->m_termOff;
|
|
int32_t n = strncasecmp ( s1 , s2 , min );
|
|
if ( n ) return n;
|
|
// . if length same, we are tied
|
|
// . otherwise, prefer the int16_ter
|
|
return ( len1 - len2 );
|
|
}
|
|
|
|
// . this is used by gbsort() above
|
|
// . sorts TermDebugInfos by their TermDebugInfo::m_wordPos member
|
|
int cmptp2 (const void *v1, const void *v2) {
|
|
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
|
|
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
|
|
// word position first
|
|
int32_t d = t1->m_wordPos - t2->m_wordPos;
|
|
if ( d ) return d;
|
|
// secondly drop back to hashgroup i guess
|
|
//d = t1->m_hashGroup - t2->m_hashGroup;
|
|
d = t1->m_synSrc - t2->m_synSrc;
|
|
if ( d ) return d;
|
|
// word len
|
|
d = t1->m_termLen - t2->m_termLen;
|
|
if ( d ) return d;
|
|
return 0;
|
|
}
|
|
|
|
bool printLangBits ( SafeBuf *sb , TermDebugInfo *tp ) {

	char printed = false;
	if ( tp->m_synSrc ) {
		sb->safePrintf(" ");
		printed = true;
	}
	int32_t j = 0;
	if ( printed ) j = MAX_LANGUAGES;
	for ( ; j < MAX_LANGUAGES ; j++ ) {
		int64_t mask = 1LL << j;
		//if ( j == tp->m_langId )
		//	sb->safePrintf("[%s]",
		//		       getLangAbbr(tp->m_langId));
		if ( ! (tp->m_langBitVec64 & mask) ) continue;
		char langId = j+1;
		// match in langvec? that means even if the
		// word is in multiple languages we put it in
		// this language because we intersect its lang bit
		// vec with its neighbors in the sliding window
		// algo in setLangVector.
		if ( langId == tp->m_langId )
			sb->safePrintf("<b>");
		sb->safePrintf("%s ", getLangAbbr(langId) );
		if ( langId == tp->m_langId )
			sb->safePrintf("</b>");
		printed = true;
	}
	if ( ! printed ) {
		sb->safePrintf("??");
	}
	return true;
}
|
|
|
|
bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
|
|
|
if ( ! sb ) return true;
|
|
|
|
Url *u = getFirstUrl();
|
|
// hash the url into 64 bits
|
|
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());
|
|
|
|
|
|
// shortcut
|
|
char *fu = ptr_firstUrl;
|
|
|
|
char *allowed = "???";
|
|
if ( m_isAllowedValid && m_isAllowed ) allowed = "yes";
|
|
else if ( m_isAllowedValid ) allowed = "no";
|
|
|
|
int32_t ufn = -1;
|
|
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
|
|
time_t spideredTime = getSpideredTime();
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
|
|
"content=\"text/html; charset=utf-8\">"
|
|
|
|
"<table cellpadding=3 border=0>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">docId</td>"
|
|
"<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">uh48</td>"
|
|
"<td>%"UINT64"</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">uh64</td>"
|
|
"<td>%"UINT64"</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>index error code</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>http status</td>"
|
|
"<td>%i</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>url filter num</td>"
|
|
"<td>%"INT32"</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>other - errno</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>robots.txt allows</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>metalist size</td>"
|
|
"<td>%"INT32"</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
|
|
,
|
|
cr->m_coll,
|
|
m_docId ,
|
|
m_docId ,
|
|
getFirstUrlHash48(), // uh48
|
|
getFirstUrlHash64(), // uh64
|
|
|
|
mstrerror(m_indexCode),
|
|
m_httpStatus,
|
|
ufn,
|
|
mstrerror(g_errno),
|
|
allowed,
|
|
|
|
m_metaListSize,
|
|
|
|
fu,
|
|
fu
|
|
|
|
);
|
|
|
|
if ( ptr_redirUrl )
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>redir url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
,ptr_redirUrl
|
|
,ptr_redirUrl
|
|
);
|
|
else
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>redir url</td>"
|
|
"<td>--</td>"
|
|
"</tr>\n"
|
|
);
|
|
|
|
|
|
sb->safePrintf("<tr><td>hostHash64</td><td>0x%"XINT64"</td></tr>",
|
|
(uint64_t)getHostHash32a());
|
|
sb->safePrintf("<tr><td>site</td><td>");
|
|
sb->safeMemcpy(ptr_site,size_site-1);
|
|
sb->safePrintf("</td></tr>\n");
|
|
if ( m_siteHash32Valid )
|
|
sb->safePrintf("<tr><td>siteHash32</td><td>0x%"XINT32"</td></tr>\n",
|
|
m_siteHash32);
|
|
if ( m_domHash32Valid )
|
|
sb->safePrintf("<tr><td>domainHash32</td><td>0x%"XINT32"</td></tr>\n",
|
|
m_domHash32);
|
|
sb->safePrintf ( "<tr>"
|
|
"<td>domainHash8</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"</tr>\n"
|
|
,
|
|
(int32_t)g_titledb.getDomHash8FromDocId(m_docId)
|
|
);
|
|
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>coll</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>spidered date</td>"
|
|
"<td>%s UTC</td>"
|
|
"</tr>\n"
|
|
,
|
|
cr->m_coll,
|
|
asctime(gmtime ( &spideredTime ))
|
|
);
|
|
|
|
|
|
/*
|
|
char *ms = "-1";
|
|
if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
|
|
sb->safePrintf (
|
|
"<tr>"
|
|
"<td>min pub date</td>"
|
|
"<td>%s UTC</td>"
|
|
"</tr>\n" , ms );
|
|
|
|
ms = "-1";
|
|
if ( m_maxPubDate != -1 ) ms = asctime(gmtime ( &m_maxPubDate ));
|
|
sb->safePrintf (
|
|
"<tr>"
|
|
"<td>max pub date</td>"
|
|
"<td>%s UTC</td>"
|
|
"</tr>\n" , ms );
|
|
*/
|
|
|
|
// our html template fingerprint
|
|
sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
|
|
if ( m_tagPairHash32Valid )sb->safePrintf("%"UINT32"",
|
|
(uint32_t)m_tagPairHash32);
|
|
else sb->safePrintf("invalid");
|
|
sb->safePrintf("</td></tr>\n" );
|
|
|
|
|
|
// print list we added to delete stuff
|
|
if ( m_indexCode && m_oldDocValid && m_oldDoc ) {
|
|
// skip debug printing for now...
|
|
//return true;
|
|
sb->safePrintf("</table><br>\n");
|
|
sb->safePrintf("<h2>Delete Meta List</h2>");
|
|
printMetaList ( m_metaList , m_metaList + m_metaListSize ,sb);
|
|
}
|
|
|
|
|
|
if ( m_indexCode || g_errno ) {
|
|
printMetaList ( m_metaList , m_metaList + m_metaListSize, sb );
|
|
}
|
|
|
|
if ( m_indexCode ) return true;
|
|
if ( g_errno ) return true;
|
|
|
|
|
|
// sanity check
|
|
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
sb->safePrintf("<tr><td>next spider date</td>"
|
|
"<td>%s UTC</td></tr>\n"
|
|
|
|
"<tr><td>next spider priority</td>"
|
|
"<td>%"INT32"</td></tr>\n" ,
|
|
asctime(gmtime( &m_nextSpiderTime )) ,
|
|
(int32_t)m_nextSpiderPriority );
|
|
*/
|
|
|
|
// must always start with http i guess!
|
|
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
|
|
// show the host that should spider it
|
|
//int32_t domLen ; char *dom = getDomFast ( fu , &domLen , true );
|
|
//int32_t hostId;
|
|
if ( m_sreqValid ) {
|
|
// must not block
|
|
SpiderRequest *oldsr = &m_sreq;
|
|
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
|
|
sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
|
|
"</td>\n"
|
|
"<td><b>%"UINT32"</b></td></tr>\n",shard);
|
|
}
|
|
|
|
time_t ts = m_firstIndexedDate;
|
|
sb->safePrintf("<tr><td>first indexed date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
|
|
ts = m_outlinksAddedDate;
|
|
sb->safePrintf("<tr><td>outlinks last added date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
|
|
// hop count
|
|
sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td></tr>\n",
|
|
(int32_t)m_hopCount);
|
|
|
|
// thumbnails
|
|
ThumbnailArray *ta = (ThumbnailArray *) ptr_imageData;
|
|
if ( ta ) {
|
|
int32_t nt = ta->getNumThumbnails();
|
|
sb->safePrintf("<tr><td># thumbnails</td>"
|
|
"<td>%"INT32"</td></tr>\n",nt);
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
ThumbnailInfo *ti = ta->getThumbnailInfo(i);
|
|
sb->safePrintf("<tr><td>thumb #%"INT32"</td>"
|
|
"<td>%s (%"INT32"x%"INT32",%"INT32"x%"INT32") "
|
|
, i
|
|
, ti->getUrl()
|
|
, ti->m_origDX
|
|
, ti->m_origDY
|
|
, ti->m_dx
|
|
, ti->m_dy
|
|
);
|
|
ti->printThumbnailInHtml ( sb , 100,100,true,NULL) ;
|
|
// end the row for this thumbnail
|
|
sb->safePrintf("</td></tr>\n");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
char *ddd;
|
|
time_t datedbDate = (time_t)m_pubDate;
|
|
if ( datedbDate != -1 ) ddd = asctime ( gmtime(&datedbDate ));
|
|
else ddd = "---";
|
|
|
|
char strLanguage[128];
|
|
languageToString(m_langId, strLanguage);
|
|
|
|
// print tags
|
|
//if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
|
|
SafeBuf tb;
|
|
|
|
TagRec *ogr = NULL;
|
|
if ( m_tagRecValid ) ogr = &m_tagRec;
|
|
if ( ogr ) ogr->printToBufAsHtml ( &tb , "old tag" );
|
|
|
|
SafeBuf *ntb = NULL;
|
|
if ( m_newTagBufValid ) ntb = getNewTagBuf();
|
|
if ( ntb ) {
|
|
// this is just a sequence of tags like an rdblist
|
|
char *pt = ntb->getBufStart();
|
|
char *ptend = pt + ntb->length();
|
|
for ( ; pt < ptend ; ) {
|
|
// skip rdbid
|
|
pt++;
|
|
// cast it
|
|
Tag *tag = (Tag *)pt;
|
|
// skip it
|
|
pt += tag->getRecSize();
|
|
// print tag out
|
|
tag->printToBufAsHtml ( &tb, "new tag");
|
|
}
|
|
}
|
|
|
|
|
|
// prevent (null) from being displayed
|
|
tb.pushChar('\0');
|
|
|
|
|
|
//Tag *tag1 = gr->getTag ("sitenuminlinks");
|
|
//Tag *tag2 = gr->getTag ("sitepop");
|
|
//int32_t sni = 0;
|
|
//int32_t spop = 0;
|
|
//if ( tag1 ) sni = atol(tag1->m_data);
|
|
//if ( tag2 ) spop = atol(tag2->m_data);
|
|
int32_t sni = m_siteNumInlinks;
|
|
//int32_t spop = m_sitePop;
|
|
|
|
LinkInfo *info1 = ptr_linkInfo1;
|
|
//LinkInfo *info2 = ptr_linkInfo2;
|
|
//int32_t sni ;
|
|
//int32_t extrapolated = 0;
|
|
//if ( info1 ) extrapolated = info1->m_numInlinksExtrapolated;
|
|
//if ( info1 ) sni = info1->m_siteNumInlinks;
|
|
|
|
char *ipString = iptoa(m_ip);
|
|
char *estimated = "";
|
|
if ( datedbDate & 0x01 ) // tr->datedbDateIsEstimated() )
|
|
estimated = "<nobr><b>[estimated from bisection]</b></nobr>";
|
|
|
|
//char *ls = getIsLinkSpam();
|
|
Links *links = getLinks();
|
|
// sanity check. should NEVER block!
|
|
if ( links == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// this is all to get "note"
|
|
//char *note = NULL;
|
|
// make it a URL
|
|
Url uu; uu.set ( ptr_firstUrl , false );
|
|
// sanity check
|
|
Xml *xml = getXml();
|
|
// sanity check
|
|
if ( xml == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
sb->safePrintf (
|
|
"<tr><td>datedb date</td><td>%s UTC (%"UINT32")%s"
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>compressed size</td><td>%"INT32" bytes</td></tr>\n"
|
|
|
|
"<tr><td>original charset</td><td>%s</td></tr>\n"
|
|
|
|
//"<tr><td>site num inlinks</td><td><b>%"INT32"%</b></td></tr>\n"
|
|
|
|
//"<tr><td>total extrapolated linkers</td><td>%"INT32"</td></tr>\n"
|
|
|
|
"<tr><td><b>title rec version</b></td><td><b>%"INT32"</b>"
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>adult bit</td><td>%"INT32"</td></tr>\n"
|
|
|
|
//"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n"
|
|
|
|
"<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n"
|
|
"<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n"
|
|
//"<tr><td>index article only?</td><td>%"INT32"</td></tr>\n"
|
|
"%s\n"
|
|
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
|
|
"%s</td></tr>\n"
|
|
"<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n"
|
|
"<tr><td>content truncated</td><td>%"INT32"</td></tr>\n"
|
|
|
|
"<tr><td>content type</td><td>%"INT32" (%s)</td></tr>\n"
|
|
"<tr><td>language</td><td>%"INT32" (%s)</td></tr>\n"
|
|
"<tr><td>country</td><td>%"INT32" (%s)</td></tr>\n"
|
|
"<tr><td>time axis used</td><td>%"INT32"</td></tr>\n"
|
|
"<tr><td>metadata</td><td>%s</td></tr>\n"
|
|
"</td></tr>\n",
|
|
|
|
ddd ,
|
|
(uint32_t)datedbDate ,
|
|
estimated ,
|
|
|
|
m_oldTitleRecSize,
|
|
|
|
get_charset_str(m_charset),
|
|
|
|
//sni ,
|
|
|
|
//ptr_linkInfo1->m_numInlinksExtrapolated,
|
|
|
|
(int32_t)m_version ,
|
|
|
|
(int32_t)m_isAdult,
|
|
|
|
//(int32_t)m_isLinkSpam,
|
|
//m_note,
|
|
|
|
(int32_t)m_isPermalink,
|
|
|
|
(int32_t)m_isRSS,
|
|
|
|
|
|
//(int32_t)m_eliminateMenus,
|
|
|
|
|
|
// tag rec
|
|
tb.getBufStart(),
|
|
|
|
ipString,
|
|
cr->m_coll,
|
|
ipString,
|
|
size_utf8Content - 1,
|
|
(int32_t)m_isContentTruncated,
|
|
|
|
(int32_t)m_contentType,
|
|
g_contentTypeStrings[(int)m_contentType] ,
|
|
|
|
(int32_t)m_langId,
|
|
strLanguage,
|
|
|
|
(int32_t)m_countryId,
|
|
g_countryCode.getName(m_countryId),
|
|
m_useTimeAxis,
|
|
ptr_metadata);
|
|
|
|
|
|
/*
|
|
int32_t boost1 = getBoostFromSiteNumInlinks ( sni );
|
|
|
|
sb->safePrintf (
|
|
"<tr><td><b>title weight</b></td>"
|
|
"<td><b>%"UINT32"%%</b></td></tr>\n"
|
|
|
|
"<tr><td>header weight</td>"
|
|
"<td>%"UINT32"%%</td></tr>\n"
|
|
|
|
"<tr><td>url path weight</td>"
|
|
"<td>%"UINT32"%%</td></tr>\n"
|
|
|
|
"<tr><td>external link text weight</td>"
|
|
"<td>%"UINT32"%%</td></tr>\n"
|
|
|
|
"<tr><td>internal link text weight</td>"
|
|
"<td>%"UINT32"%%</td></tr>\n"
|
|
|
|
"<tr><td>concept weight</td>"
|
|
"<td>%"UINT32"%%</td></tr>\n"
|
|
|
|
"<tr><td>score boost from site num inlinks</td>"
|
|
"<td>%"INT32"%%</td>"
|
|
"</tr>\n",
|
|
|
|
(int32_t)m_titleWeight,
|
|
(int32_t)m_headerWeight,
|
|
(int32_t)m_urlPathWeight,
|
|
(int32_t)m_externalLinkTextWeight,
|
|
(int32_t)m_internalLinkTextWeight,
|
|
(int32_t)m_conceptWeight ,
|
|
boost1 );
|
|
*/
|
|
|
|
// print title
|
|
//sb->safePrintf( "<tr><td>title</td><td>%s</td></tr>\n" ,
|
|
// ti->m_title );
|
|
|
|
// print the new, unstored, gigabit vector
|
|
if ( size_gigabitHashes ) {
|
|
// get gigabit vector
|
|
int32_t *vec = ptr_gigabitHashes;
|
|
// point to scores
|
|
int32_t *ss = ptr_gigabitScores;
|
|
int32_t count = 0;
|
|
int32_t total = 0;
|
|
sb->safePrintf ( "<tr><td>stored gigabit vector</td><td>");
|
|
while ( *vec ) {
|
|
sb->safePrintf ( "%08"XINT32" ", *vec );
|
|
sb->safePrintf ( "(%05"INT32") ", *ss );
|
|
vec++;
|
|
ss++;
|
|
count++;
|
|
total++;
|
|
//if ( total >= GIGABITS_IN_VECTOR ) break;
|
|
if ( count < 4 ) continue;
|
|
count = 0;
|
|
sb->safePrintf ( "<br>\n");
|
|
}
|
|
sb->safePrintf ( "</tr>\n");
|
|
}
|
|
|
|
// print dmoz stuff
|
|
int32_t numCatIds = size_catIds/4;
|
|
int32_t numIndCatIds = size_indCatIds/4;
|
|
sb->safePrintf( "<tr><td>Number of Category IDs</td>"
|
|
"<td>%"INT32"</td></tr>\n", numCatIds );
|
|
char *dtp = ptr_dmozTitles;
|
|
char *dsp = ptr_dmozSumms;
|
|
char *dap = ptr_dmozAnchors;
|
|
for (int32_t i = 0; i < numCatIds; i++) {
|
|
// print the ID
|
|
sb->safePrintf( "<tr><td>ID #%"INT32"</td><td>%"INT32"</td></tr>\n",
|
|
i, ptr_catIds[i]);
|
|
// print the title
|
|
if ( dtp ) {
|
|
sb->safePrintf( "<tr><td>Title #%"INT32" </td><td>",i);
|
|
sb->safeMemcpy( dtp,gbstrlen(dtp) );
|
|
sb->safePrintf( "</td></tr>\n");
|
|
dtp += gbstrlen(dtp) + 1;
|
|
}
|
|
// print the summary
|
|
if ( dsp ) {
|
|
sb->safePrintf( "<tr><td>Summary #%"INT32"</td><td>", i);
|
|
sb->safeMemcpy( dsp , gbstrlen(dsp ) ) ;
|
|
sb->safePrintf( "</td></tr>\n");
|
|
dsp += gbstrlen ( dsp ) + 1;
|
|
}
|
|
// print the anchor
|
|
if ( dap ) {
|
|
sb->safePrintf( "<tr><td>Anchor #%"INT32"</td><td>",i);
|
|
sb->safeMemcpy( dap , gbstrlen(dap) );
|
|
sb->safePrintf( "</td></tr>\n");
|
|
dap += gbstrlen ( dap ) + 1;
|
|
}
|
|
}
|
|
sb->safePrintf( "<tr><td>Number of Indirect Category IDs</td>"
|
|
"<td>%"INT32"</td></tr>\n", numIndCatIds);
|
|
|
|
for (int32_t i = 0; i < numIndCatIds; i++) {
|
|
// print the ID
|
|
sb->safePrintf( "<tr><td>Indirect ID #%"INT32"</td>"
|
|
"<td>%"INT32"</td></tr>\n",
|
|
i, ptr_indCatIds[i]);
|
|
}
|
|
|
|
if ( info1 ) {
|
|
//sb->safePrintf("<tr><td>page pop</td><td>%"INT32"</td></tr>\n",
|
|
// info1->m_pagePop );
|
|
//sb->safePrintf("<tr><td>whole site pop</td>"
|
|
// "<td>%"INT32"</td></tr>\n",
|
|
// spop );
|
|
sb->safePrintf("<tr><td>num GOOD links to whole site</td>"
|
|
"<td>%"INT32"</td></tr>\n",
|
|
sni );
|
|
}
|
|
|
|
// close the table
|
|
sb->safePrintf ( "</table></center><br>\n" );
|
|
|
|
//
|
|
// convert document into json representing multiple documents
|
|
// if it makes sense. sometimes a single url contains multiple
|
|
// subdocuments that each should have their own url, but do not,
|
|
// so we fix that here.
|
|
//
|
|
SafeBuf *dbr = getDiffbotReply();
|
|
if ( dbr->length() ) {
|
|
sb->safePrintf("<b>START EXACT DIFFBOT REPLY</b><br>\n");
|
|
sb->safePrintf("<pre>");
|
|
sb->safeMemcpy ( dbr );
|
|
sb->safePrintf("</pre>");
|
|
sb->safePrintf("<b>END EXACT DIFFBOT REPLY</b><br><br>\n");
|
|
}
|
|
|
|
// print outlinks
|
|
links->print( sb );
|
|
|
|
//
|
|
// PRINT ADDRESSES (prints streets first)
|
|
//
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (Addresses *)-1 ) { char *xx=NULL;*xx=0;}
|
|
aa->print(sb,uh64);
|
|
|
|
|
|
|
|
//
|
|
// PRINT PUB DATE CANDIDATES
|
|
//
|
|
|
|
// print stored pub date candidates which we indexed as clock
|
|
// or not clock!
|
|
Dates *dp = getDates() ;
|
|
// should never block!
|
|
if ( dp == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// print it out
|
|
if ( dp ) dp->printDates ( sb );
|
|
|
|
//return true;
|
|
|
|
//
|
|
// PRINT SECTIONS
|
|
//
|
|
Sections *sections = getSections();
|
|
if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;}
|
|
//SectionVotingTable *nsvt = getNewSectionVotingTable();
|
|
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
|
|
|
|
// these are nice
|
|
//HashTableX *pt = dp->getPhoneTable();
|
|
//HashTableX *et = dp->getEmailTable();
|
|
//HashTableX *at = aa->getPlaceTable();
|
|
//HashTableX *tt = dp->getTODTable();
|
|
//HashTableX *rt = ev->getRegistrationTable();
|
|
//HashTableX *priceTable = dp->getPriceTable();
|
|
|
|
//sections->print ( sb , pt , et , NULL , at , tt , priceTable );
|
|
|
|
// try the new print function
|
|
//sections->print2 ( sb , NULL, NULL , NULL , false );
|
|
|
|
printRainbowSections ( sb , NULL );
|
|
|
|
//nsvt->print ( sb , "NEW Sections Voting Table" );
|
|
|
|
//osvt->print ( sb , "OLD Sections Voting Table" );
|
|
|
|
|
|
//
|
|
// PRINT LINKINFO
|
|
//
|
|
|
|
//if ( info1 )
|
|
// info1->print ( sb , cr->m_coll );
|
|
|
|
//if ( info2 ) {
|
|
// sb->safePrintf ( "<tr><td><b>IMPORTED LINK INFO:"
|
|
// "</b></td></tr>" );
|
|
// info2->print ( sb , cr->m_coll );
|
|
//}
|
|
|
|
|
|
// cut it int16_t for debugging
|
|
logf(LOG_DEBUG,"xmldoc: FIX ME remove return");
|
|
|
|
//return true;
|
|
|
|
//
|
|
// PRINT LINKINFO
|
|
//
|
|
|
|
char *p = m_pageLinkBuf.getBufStart();
|
|
int32_t plen = m_pageLinkBuf.length();
|
|
sb->safeMemcpy ( p , plen );
|
|
|
|
|
|
//
|
|
// PRINT SITE LINKINFO
|
|
//
|
|
p = m_siteLinkBuf.getBufStart();
|
|
plen = m_siteLinkBuf.length();
|
|
sb->safeMemcpy ( p , plen );
|
|
|
|
|
|
//
|
|
// BEGIN PRINT GIGABITS
|
|
//
|
|
|
|
// print out for PageParser.cpp
|
|
const char *help =
|
|
"The <i>Gigabits</i> are words extracted from the document "
|
|
"that are deemed to best represent it. The <i>Pop</i> column "
|
|
"is the popularity of the word and it ranges from 0 to 1000 "
|
|
"and is how many documents out of a sample of 1000 that "
|
|
"contained that word. The <i>Score</i> of each Gigabit is "
|
|
"based on the popularity and how many times the word appeared "
|
|
"in the document. Higher scores are deemed more "
|
|
"representative of the document. The hashes of these Gigabits "
|
|
"are stored with the cached copy of the document as numeric "
|
|
"hashes for purposes of topic clustering. You can see these "
|
|
"hashes by clicking on the <i>[info]</i> link next to "
|
|
"any search result.<br><br>";
|
|
|
|
if ( m_numTop > 0 )
|
|
sb->safePrintf( "<table width=100%%>"
|
|
"<td bgcolor=pink>\n"
|
|
"%s"
|
|
"<table>"
|
|
"<tr><td>#</td><td>"
|
|
"<b>%"INT32" Gigabits</b></td><td><b>Score</b>"
|
|
"</td>"
|
|
"<td><b>Pop</b></td>"
|
|
"<td><b>Hash</b></td>"
|
|
"</tr>\n",
|
|
help,m_numTop);
|
|
|
|
// . print out the top gigabits we harvested
|
|
// . start with the highest scoring node first, the last node since
|
|
// nodes are ranked by lowest to highest key
|
|
int32_t total = 0;
|
|
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
|
|
// get the info
|
|
GigabitInfo *gi = m_top[i];
|
|
// print row
|
|
sb->safePrintf("<tr><td>%"INT32"</td><td>",i);
|
|
// print gigabit
|
|
sb->safeMemcpy(gi->m_ptr , gi->m_len );
|
|
// get 32 bit hash
|
|
uint32_t h = gi->m_hash & 0xffffffff;
|
|
// never allow 0
|
|
if ( h == 0 ) h = 1;
|
|
// if unicode, pop's hi bit is set
|
|
sb->safePrintf( "</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>%08"XINT32"</td>"
|
|
"</tr>\n",
|
|
(int32_t)gi->m_pts,
|
|
(int32_t)gi->m_pop,
|
|
(int32_t)h );
|
|
// add up all scores
|
|
total += gi->m_pts;
|
|
}
|
|
|
|
// close table
|
|
if ( m_numTop > 0 ) {
|
|
sb->safePrintf("<tr><td></td><td></td><td>"
|
|
"<b>%"INT32"</b></td></tr>\n",total);
|
|
sb->safePrintf("</table>\n");
|
|
}
|
|
|
|
|
|
//
|
|
// END PRINT GIGABITS
|
|
//
|
|
|
|
|
|
// note this
|
|
sb->safePrintf("<h2>NEW Meta List</h2>");
|
|
|
|
printMetaList ( m_metaList , m_metaList + m_metaListSize , sb );
|
|
|
|
|
|
// all done if no term table to print out
|
|
if ( ! m_wts ) return true;
|
|
|
|
|
|
// print out the rules in Weights.cpp
|
|
/*
|
|
sb->safePrintf ("<br>"
|
|
"<table border=1 cellpadding=0>"
|
|
|
|
"<tr><td>Rule #3</td>"
|
|
"<td>First 40 words in ()'s.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #4</td>"
|
|
"<td>Adjacent to bad punct.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #5</td>"
|
|
"<td>In a link.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #6</td>"
|
|
"<td>First occurence in a section. Actual weight "
|
|
"depends on section word count.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #7</td>"
|
|
"<td>In a header tag. h1 is most weight.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #8</td>"
|
|
"<td>In a \"ul\" list.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #9</td>"
|
|
"<td>Repeated occurence in the same fragment or "
|
|
"sentence.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #10</td>"
|
|
"<td>In a comma-separated list.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #11</td>"
|
|
"<td>Promoted isolated capitalized words, demote "
|
|
"if it is in a capitalized phrase.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #13</td>"
|
|
"<td>First occurence in document.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #15</td>"
|
|
"<td>Word to phrase ratio weight.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #16</td>"
|
|
"<td>At the beginning of a fragment or sentence."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>Rule #17</td>"
|
|
"<td>If immediately after a quote, iff not "
|
|
"promoted by Rule #18.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #18</td>"
|
|
"<td>Promote phrase if capitalized. Demote phrase "
|
|
"if mixed case without hypehn.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #22</td>"
|
|
"<td>Demote phrases containing bad punct.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #23</td>"
|
|
"<td>In script, style, select or marquee tag. "
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>Rule #23</td>"
|
|
"<td>Follows a number.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #25</td>"
|
|
"<td>Demote non-hyphenated phrases that would split "
|
|
"adjacent hyphenated phrases.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #26</td>"
|
|
"<td>Demote if in a repeated fragment.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #27</td>"
|
|
"<td>Demote if in a menu section.</td></tr>\n"
|
|
|
|
"<tr><td>Rule #28</td>"
|
|
"<td>Pattern spam detector.</td></tr>\n"
|
|
|
|
"</table>\n"
|
|
"<br>"
|
|
);
|
|
*/
|
|
|
|
|
|
//
|
|
// BEGIN PRINT HASHES TERMS
|
|
//
|
|
|
|
// int16_tcut
|
|
HashTableX *wt = m_wts;
|
|
|
|
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
|
|
TermDebugInfo **tp = NULL;
|
|
// add them with this counter
|
|
int32_t nt = 0;
|
|
|
|
int32_t nwt = 0;
|
|
if ( wt ) {
|
|
nwt = wt->m_numSlots;
|
|
tp = (TermDebugInfo **)wt->m_keys;
|
|
}
|
|
|
|
// now print the table we stored all we hashed into
|
|
for ( int32_t i = 0 ; i < nwt ; i++ ) {
|
|
// skip if empty
|
|
if ( wt->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
//QUICKPOLL(m_niceness);
|
|
// get its key, date=32bits termid=64bits
|
|
//key96_t *k = (key96_t *)wt->getKey ( i );
|
|
// get the TermDebugInfo
|
|
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
|
|
// point to it for sorting
|
|
tp[nt++] = ti;
|
|
}
|
|
|
|
// set this for cmptp
|
|
s_wbuf = &m_wbuf;
|
|
|
|
// sort them alphabetically by Term
|
|
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
|
|
|
|
// determine how many non 1.0 weight fields we got in the vectors
|
|
/*
|
|
int32_t count [ MAX_RULES ];
|
|
memset ( count , 0 , MAX_RULES * 4 );
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
TermDebugInfo *ti = tp[i];
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ )
|
|
if ( ti->m_rv[j] != 1.0 ) count[j]++;
|
|
}
|
|
// count the counts
|
|
char fbuf[9024];
|
|
char *fp = fbuf;
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
|
|
if ( ! count[j] ) continue;
|
|
fp += sprintf(fp ,"<td><b>R#%"INT32"</b></td>",j);
|
|
}
|
|
*/
|
|
|
|
// print them out in a table
|
|
char hdr[1000];
|
|
sprintf(hdr,
|
|
"<table border=1 cellpadding=0>"
|
|
"<tr>"
|
|
// this messes up Test.cpp diff'ing
|
|
//"<td><b>#</b></td>"
|
|
"<td><b>Prefix</b></td>"
|
|
"<td><b>WordNum</b></td>"
|
|
"<td><b>Lang</b></td>"
|
|
"<td><b>Term</b></td>"
|
|
|
|
//"%s"
|
|
|
|
//"<td><b>Weight</b></td>"
|
|
//"<td><b>Spam</b></td>"
|
|
|
|
"<td><b>Desc</b></td>"
|
|
"<td><b>TermId/TermHash48</b></td>"
|
|
"<td><b>ShardByTermId?</b></td>"
|
|
"<td><b>Note</b></td>"
|
|
"</tr>\n"
|
|
//,fbuf
|
|
);
|
|
|
|
sb->safePrintf("%s",hdr);
|
|
|
|
char *start = m_wbuf.getBufStart();
|
|
int32_t rcount = 0;
|
|
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
|
|
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS) == 0 )
|
|
sb->safePrintf("<!--ignore--></table>%s",hdr);
|
|
|
|
char *prefix = " ";
|
|
if ( tp[i]->m_prefixOff >= 0 )
|
|
prefix = start + tp[i]->m_prefixOff;
|
|
|
|
bool isFacet = false;
|
|
if ( prefix &&
|
|
prefix[0]=='g' &&
|
|
strncmp(prefix,"gbfacet",7)== 0 )
|
|
isFacet = true;
|
|
|
|
sb->safePrintf ( "<tr>"
|
|
//"<td><b>%"INT32"</b></td>"
|
|
"<td>%s</td>"
|
|
//i ,
|
|
, prefix
|
|
);
|
|
|
|
if ( isFacet )
|
|
sb->safePrintf("<td>--</td>");
|
|
else
|
|
sb->safePrintf( "<td>%"INT32"</td>"
|
|
, tp[i]->m_wordNum );
|
|
|
|
|
|
// print lang
|
|
//char langId = tp[i]->m_langId;
|
|
|
|
// print out all langs word is in if it's not clear
|
|
// what language it is. we use a sliding window to
|
|
// resolve some ambiguity, but not all, so print out
|
|
// the possible langs here
|
|
sb->safePrintf("<td>");
|
|
if ( isFacet )
|
|
sb->safePrintf("--");
|
|
else
|
|
printLangBits ( sb , tp[i] );
|
|
sb->safePrintf("</td>");
|
|
|
|
|
|
// print the term
|
|
sb->safePrintf("<td><nobr>");
|
|
|
|
if ( tp[i]->m_synSrc )
|
|
sb->pushChar('*');
|
|
|
|
char *term = start + tp[i]->m_termOff;
|
|
int32_t termLen = tp[i]->m_termLen;
|
|
sb->safeMemcpy ( term , termLen );
|
|
|
|
/*
|
|
char *dateStr = " ";
|
|
int32_t ddd = tp[i]->m_date;
|
|
uint8_t *tddd = (uint8_t *)&ddd;
|
|
char tbbb[32];
|
|
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
|
|
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
|
|
sprintf(tbbb,"evIds %"INT32"-%"INT32"",
|
|
(int32_t)tddd[1],(int32_t)tddd[0]);
|
|
dateStr = tbbb;
|
|
}
|
|
else if ( ddd )
|
|
dateStr = asctime ( gmtime(&ddd ));
|
|
*/
|
|
|
|
//char ss[30];
|
|
//if ( tp[i]->m_spam == -1.0 ) sprintf(ss," ");
|
|
//else if ( tp[i]->m_spam == 0.0 ) sprintf(ss,"--");
|
|
//else sprintf ( ss , "%.03f",1.0-tp[i]->m_spam);
|
|
|
|
|
|
sb->safePrintf ( "</nobr></td>"
|
|
);
|
|
|
|
// print the weight vector before Weight and Spam
|
|
/*
|
|
float prod = 1.0;
|
|
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
|
|
if ( ! count[j] ) continue;
|
|
if ( tp[i]->m_isSynonym )
|
|
sb->safePrintf("<td> </td>" );
|
|
else if ( tp[i]->m_rv[j] == 1.0 )
|
|
sb->safePrintf("<td> </td>" );
|
|
else sb->safePrintf("<td>%.02f</td>",tp[i]->m_rv[j] );
|
|
// product up
|
|
prod *= tp[i]->m_rv[j];
|
|
}
|
|
|
|
// sanity check
|
|
// maybe look into this at some point, but not a big deal!!
|
|
//float err = prod - tp[i]->m_weight;
|
|
//if ( err > .05 )
|
|
// logf(LOG_DEBUG,"weights: prod was %.02f should be "
|
|
// "%.02f",prod,tp[i]->m_weight);
|
|
*/
|
|
|
|
//char *desc = " ";
|
|
//if ( tp[i]->m_descOff >= 0 )
|
|
// desc = start + tp[i]->m_descOff;
|
|
|
|
/*
|
|
// synonyms are always 1/4 weight of original
|
|
if ( tp[i]->m_isSynonym )
|
|
sb->safePrintf("<td> </td>" );
|
|
else
|
|
sb->safePrintf("<td>%.03f</td>", tp[i]->m_weight );
|
|
*/
|
|
|
|
sb->safePrintf ( //"<td>%s</td>"
|
|
//"<td><b>%"UINT32"</b></td>"
|
|
//"<td><nobr>%s</nobr></td>"
|
|
"<td><nobr>%s",
|
|
getHashGroupString(tp[i]->m_hashGroup)
|
|
);
|
|
|
|
//if ( tp[i]->m_synSrc ) {
|
|
// char ss = tp[i]->m_synSrc;
|
|
// sb->safePrintf(" - %s",g_synonyms.getSourceString(ss));
|
|
//}
|
|
|
|
sb->safePrintf ( "</nobr></td>" );
|
|
|
|
sb->safePrintf ( "<td>%016"UINT64"</td>"
|
|
,
|
|
//ss ,
|
|
//(uint32_t)tp[i]->m_score32 ,
|
|
//dateStr ,
|
|
//desc, // start + tp[i]->m_descOff ,
|
|
(uint64_t)(tp[i]->m_termId & TERMID_MASK) );
|
|
|
|
if ( tp[i]->m_shardByTermId ) sb->safePrintf("<td><b>1</b></td>" );
|
|
else sb->safePrintf("<td>0</td>" );
|
|
|
|
|
|
sb->safePrintf("<td>");
|
|
|
|
// there is no prefix for such terms now
|
|
// TODO: store actual key in there i guess?? or just this bit.
|
|
int32_t val32 = 0;
|
|
if ( strncmp(prefix,"gbfacet",7) == 0 )
|
|
val32 = g_posdb.getInt(&tp[i]->m_key);
|
|
|
|
// . this is like gbxpathsitehash1234567
|
|
// . the number following it is the hash
|
|
// . the value stored in the posdb key is the hash of the
|
|
// inner html content of that xpath/site for this page
|
|
if ( strncmp(term,"facetField=gbxpathsitehash",26)==0)
|
|
sb->safePrintf("<b>Term</b> is a 32-bit hash of the "
|
|
"X-path of "
|
|
"a section XOR'ed with the 32-bit "
|
|
"hash of this document's subdomain. "
|
|
"[%"UINT32"] is the 32-bit hash of the "
|
|
"Inner HTML of this section stored "
|
|
"in the posdb key instead of "
|
|
"the usual stuff. This is also "
|
|
"sharded by termId!",
|
|
(uint32_t)val32
|
|
//(int32_t)tp[i]->m_sentHash32
|
|
);
|
|
|
|
sb->safePrintf("</td>");
|
|
|
|
|
|
sb->safePrintf("</tr>\n");
|
|
}
|
|
|
|
|
|
sb->safePrintf("</table><br>\n");
|
|
|
|
//
|
|
// END PRINT HASHES TERMS
|
|
//
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printMenu ( SafeBuf *sb ) {
|
|
|
|
// encode it
|
|
SafeBuf ue;
|
|
ue.urlEncode ( ptr_firstUrl );
|
|
|
|
// get
|
|
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
|
|
"content=\"text/html; charset=utf-8\">" );
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
/*
|
|
char *coll = cr->m_coll;
|
|
|
|
int64_t d = m_docId;
|
|
|
|
// print links at top
|
|
sb->safePrintf(
|
|
//"<a href=/print?c=%s&u=%s&page=1>general info</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=2>page inlinks</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=3>site inlinks</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=4>sections</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=5>indexed terms</a> | "
|
|
// the breakdown of when it was spidered and when it
|
|
// is due to be spidered again. and any errors
|
|
// encountered when spidering
|
|
//"<a href=/print?c=%s&u=%s&page=6>spider stats</a> | "
|
|
//"<a href=/print?c=%s&u=%s&page=7>cached page</a>"
|
|
"<a href=/print?c=%s&d=%"INT64"&page=1>general info</a> | "
|
|
"<a href=/print?c=%s&d=%"INT64"&page=2&recompute=1>"
|
|
"page inlinks</a> | "
|
|
"<a href=/print?c=%s&d=%"INT64"&page=3>site inlinks</a> | "
|
|
//"<a href=/print?c=%s&d=%"INT64"&page=4>sections</a> | "
|
|
"<a href=/print?c=%s&d=%"INT64"&page=5>indexed terms</a>"
|
|
// the breakdown of when it was spidered and when it
|
|
// is due to be spidered again. and any errors
|
|
// encountered when spidering
|
|
//"<a href=/print?c=%s&d=%"INT64"&page=6>spider stats</a> |"
|
|
//" <a href=/print?c=%s&d=%"INT64"&page=7>cached page</a>"
|
|
"<br>"
|
|
"<br>"
|
|
,coll,d//ue.getBufStart()
|
|
,coll,d//ue.getBufStart()
|
|
,coll,d//ue.getBufStart()
|
|
//,coll,d//ue.getBufStart()
|
|
,coll,d//ue.getBufStart()
|
|
//,coll,d//ue.getBufStart()
|
|
//,coll,d//ue.getBufStart()
|
|
);
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
// if printDocForProCog, an entry function, blocks, we gotta re-call it
|
|
static void printDocForProCogWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in print doc for pro cog wrapper" );
|
|
// get it
|
|
bool status = THIS->printDocForProCog ( THIS->m_savedSb ,
|
|
THIS->m_savedHr );
|
|
// return if it blocked
|
|
if ( ! status ) return;
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
}
|
|
|
|
// in PageRoot.cpp
|
|
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
|
|
bool printGigablast );
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno and returns true on error
|
|
bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
if ( ! sb ) return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
m_masterLoop = printDocForProCogWrapper;
|
|
m_masterState = this;
|
|
|
|
m_savedSb = sb;
|
|
m_savedHr = hr;
|
|
|
|
// if we are generating site or page inlinks info for a
|
|
// non docid based url, then store that info in the respective
|
|
// safe bufs
|
|
m_useSiteLinkBuf = true;
|
|
m_usePageLinkBuf = true;
|
|
|
|
|
|
int32_t page = hr->getLong("page",1);
|
|
|
|
|
|
// for some reason sections page blocks forever in browser
|
|
if ( page != 7 && ! m_printedMenu ) { // && page != 5 )
|
|
printFrontPageShell ( sb , "search" , cr , false );
|
|
m_printedMenu = true;
|
|
//printMenu ( sb );
|
|
}
|
|
|
|
|
|
if ( page == 1 )
|
|
return printGeneralInfo(sb,hr);
|
|
|
|
if ( page == 2 )
|
|
return printPageInlinks(sb,hr);
|
|
|
|
if ( page == 3 )
|
|
return printSiteInlinks(sb,hr);
|
|
|
|
if ( page == 4 )
|
|
return printRainbowSections(sb,hr);
|
|
|
|
if ( page == 5 )
|
|
return printTermList(sb,hr);
|
|
|
|
if ( page == 6 )
|
|
return printSpiderStats(sb,hr);
|
|
|
|
if ( page == 7 )
|
|
return printCachedPage(sb,hr);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// int16_tcut
|
|
char *fu = ptr_firstUrl;
|
|
|
|
// sanity check
|
|
Xml *xml = getXml();
|
|
// blocked?
|
|
if ( xml == (void *)-1 ) return false;
|
|
// error?
|
|
if ( ! xml ) return true;
|
|
|
|
char *ict = getIsContentTruncated();
|
|
if ( ! ict ) return true; if ( ict == (char *)-1 ) return false;
|
|
char *at = getIsAdult();
|
|
if ( ! at ) return true; if ( at == (void *)-1 ) return false;
|
|
char *ls = getIsLinkSpam();
|
|
if ( ! ls ) return true; if ( ls == (void *)-1 ) return false;
|
|
uint8_t *ct = getContentType();
|
|
if ( ! ct ) return true; if ( ct == (void *)-1 ) return false;
|
|
uint16_t *cs = getCharset ( );
|
|
if ( ! cs ) return true; if ( cs == (uint16_t *)-1 ) return false;
|
|
char *pl = getIsPermalink();
|
|
if ( ! pl ) return true; if ( pl == (char *)-1 ) return false;
|
|
char *isRSS = getIsRSS();
|
|
if ( ! isRSS ) return true; if ( isRSS == (char *)-1 ) return false;
|
|
int32_t *ip = getIp();
|
|
if ( ! ip ) return true; if ( ip == (int32_t *)-1 ) return false;
|
|
uint8_t *li = getLangId();
|
|
if ( ! li ) return true; if ( li == (uint8_t *)-1 ) return false;
|
|
uint16_t *cid = getCountryId();
|
|
if ( ! cid ) return true; if ( cid == (uint16_t *)-1 ) return false;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 ) return true; if ( info1 == (void *)-1 ) return false;
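// the getters above all follow the usual XmlDoc tri-state convention:
// NULL means error (g_errno is set), -1 means the call blocked and will
// re-enter through m_masterLoop, anything else is a valid pointer.
// a minimal sketch of the pattern (generic, not tied to one getter):
//
//   T *v = getSomething();              // hypothetical getter
//   if ( ! v          ) return true;    // error, g_errno set
//   if ( v == (T *)-1 ) return false;   // blocked, we will be re-called
//   // ... v is valid, use it ...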
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
//char *ls = getIsLinkSpam();
|
|
//Links *links = getLinks();
|
|
// blocked?
|
|
//if ( links == (void *)-1 ) { char *xx=NULL;*xx=0;}//return false;
|
|
// error?
|
|
//if ( ! links ) return true;
|
|
|
|
// make it a URL
|
|
Url uu; uu.set ( fu , false );
|
|
|
|
|
|
|
|
char *allowed = "???";
|
|
int32_t allowedInt = 1;
|
|
if ( m_isAllowedValid && m_isAllowed ) {
|
|
allowed = "yes";
|
|
allowedInt = 1;
|
|
}
|
|
else if ( m_isAllowedValid ) {
|
|
allowed = "no";
|
|
allowedInt = 0;
|
|
}
|
|
|
|
int32_t ufn = -1;
|
|
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
|
|
|
|
char *es = mstrerror(m_indexCode);
|
|
if ( ! m_indexCode ) es = mstrerror(g_errno);
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
//int32_t groupId = g_hostdb.getGroupIdFromDocId(m_docId);
|
|
//Host *group = g_hostdb.getGroup(groupId);
|
|
int32_t shardNum = getShardNumFromDocId ( m_docId );
|
|
Host *hosts = g_hostdb.getShard ( shardNum );
|
|
Host *h = &hosts[0];
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf (
|
|
"<table cellpadding=3 border=0>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">docId</td>"
|
|
"<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td width=\"25%%\">on host #</td>"
|
|
"<td>%"INT32"</td>"
|
|
"</tr>\n"
|
|
|
|
"<tr>"
|
|
"<td>index error code</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>robots.txt allows</td>"
|
|
"<td>%s</td>"
|
|
"</tr>\n"
|
|
|
|
|
|
"<tr>"
|
|
"<td>url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
|
|
,
|
|
cr->m_coll,
|
|
m_docId ,
|
|
m_docId ,
|
|
|
|
h->m_hostId,
|
|
|
|
es,
|
|
allowed,
|
|
|
|
fu,
|
|
fu
|
|
|
|
);
|
|
else
|
|
sb->safePrintf (
|
|
"<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
"\t<coll><![CDATA[%s]]></coll>\n"
|
|
"\t<docId>%"INT64"</docId>\n"
|
|
"\t<indexError><![CDATA[%s]]></indexError>\n"
|
|
"\t<robotsTxtAllows>%"INT32""
|
|
"</robotsTxtAllows>\n"
|
|
"\t<url><![CDATA[%s]]></url>\n"
|
|
,
|
|
cr->m_coll,
|
|
m_docId ,
|
|
es,
|
|
allowedInt,//(int32_t)m_isAllowed,
|
|
fu
|
|
);
|
|
|
|
char *redir = ptr_redirUrl;
|
|
if ( redir && ! isXml ) {
|
|
sb->safePrintf(
|
|
"<tr>"
|
|
"<td>redir url</td>"
|
|
"<td><a href=\"%s\">%s</a></td>"
|
|
"</tr>\n"
|
|
,redir
|
|
,redir );
|
|
}
|
|
else if ( redir ) {
|
|
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
|
|
"</redirectUrl>\n" ,redir );
|
|
}
|
|
|
|
|
|
if ( m_indexCode || g_errno ) {
|
|
if ( ! isXml ) sb->safePrintf("</table><br>\n");
|
|
else sb->safePrintf("</response>\n");
|
|
return true;
|
|
}
|
|
|
|
|
|
// must always start with http i guess!
|
|
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
|
|
|
|
time_t ts = (time_t)m_firstIndexedDate;
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>first indexed date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts)) );
|
|
else
|
|
sb->safePrintf("\t<firstIndexedDateUTC>%"UINT32""
|
|
"</firstIndexedDateUTC>\n",
|
|
(uint32_t)m_firstIndexedDate );
|
|
|
|
ts = m_spideredTime;
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>last indexed date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
else
|
|
sb->safePrintf("\t<lastIndexedDateUTC>%"UINT32""
|
|
"</lastIndexedDateUTC>\n",
|
|
(uint32_t)m_spideredTime );
|
|
|
|
ts = m_outlinksAddedDate;
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>outlinks last added date</td>"
|
|
"<td>%s UTC</td></tr>\n" ,
|
|
asctime(gmtime(&ts )) );
|
|
else
|
|
sb->safePrintf("\t<outlinksLastAddedUTC>%"UINT32""
|
|
"</outlinksLastAddedUTC>\n",
|
|
(uint32_t)m_outlinksAddedDate );
|
|
|
|
// hop count
|
|
if ( ! isXml )
|
|
sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td>"
|
|
"</tr>\n",
|
|
(int32_t)m_hopCount);
|
|
else
|
|
sb->safePrintf("\t<hopCount>%"INT32"</hopCount>\n",
|
|
(int32_t)m_hopCount);
|
|
|
|
|
|
char strLanguage[128];
|
|
languageToString(m_langId, strLanguage);
|
|
|
|
// print tags
|
|
//SafeBuf tb;
|
|
int32_t sni = m_siteNumInlinks;
|
|
|
|
char *ipString = iptoa(m_ip);
|
|
|
|
//int32_t sni = info1->getNumGoodInlinks();
|
|
|
|
time_t tlu = info1->getLastUpdated();
|
|
struct tm *timeStruct3 = gmtime ( &tlu );//info1->m_lastUpdated );
|
|
char tmp3[64];
|
|
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
|
|
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf (
|
|
"<tr><td>original charset</td><td>%s</td></tr>\n"
|
|
"<tr><td>adult bit</td><td>%"INT32"</td></tr>\n"
|
|
//"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n"
|
|
"<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n"
|
|
"<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n"
|
|
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
|
|
"%s</td></tr>\n"
|
|
"<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n"
|
|
"<tr><td>content truncated</td><td>%"INT32"</td></tr>\n"
|
|
"<tr><td>content type</td><td>%s</td></tr>\n"
|
|
"<tr><td>language</td><td>%s</td></tr>\n"
|
|
"<tr><td>country</td><td>%s</td></tr>\n"
|
|
|
|
"<tr><td><b>good inlinks to site</b>"
|
|
"</td><td>%"INT32"</td></tr>\n"
|
|
|
|
// "<tr><td>unique IP inlinks to site"
|
|
// "</td><td>%"INT32"</td></tr>\n"
|
|
|
|
// "<tr><td>unique CBlock inlinks to site"
|
|
// "</td><td>%"INT32"</td></tr>\n"
|
|
|
|
"<tr><td><b>site rank</b></td><td>%"INT32"</td></tr>\n"
|
|
|
|
"<tr><td>good inlinks to page"
|
|
"</td><td>%"INT32"</td></tr>\n"
|
|
|
|
// "<tr><td>unique IP inlinks to page"
|
|
// "</td><td>%"INT32"</td></tr>\n"
|
|
|
|
// "<tr><td>unique CBlock inlinks to page"
|
|
// "</td><td>%"INT32"</td></tr>\n"
|
|
|
|
// "<tr><td>total inlinks to page"
|
|
// "</td><td>%"INT32"</td></tr>\n"
|
|
|
|
"<tr><td><nobr>page inlinks last computed</nobr></td>"
|
|
"<td>%s</td></tr>\n"
|
|
"</td></tr>\n",
|
|
get_charset_str(m_charset),
|
|
(int32_t)m_isAdult,
|
|
//(int32_t)m_isLinkSpam,
|
|
//m_note,
|
|
(int32_t)m_isPermalink,
|
|
(int32_t)m_isRSS,
|
|
ipString,
|
|
cr->m_coll,
|
|
ipString,
|
|
size_utf8Content - 1,
|
|
(int32_t)m_isContentTruncated,
|
|
g_contentTypeStrings[(int)m_contentType] ,
|
|
strLanguage,
|
|
g_countryCode.getName(m_countryId) ,
|
|
sni,
|
|
//m_siteNumInlinksUniqueIp,
|
|
//m_siteNumInlinksUniqueCBlock,
|
|
::getSiteRank(sni),
|
|
//info1->getNumTotalInlinks(),
|
|
info1->getNumGoodInlinks(),
|
|
// info1->m_numUniqueIps,
|
|
// info1->m_numUniqueCBlocks,
|
|
// info1->m_totalInlinkingDocIds,
|
|
|
|
tmp3
|
|
);
|
|
else {
|
|
sb->safePrintf (
|
|
"\t<charset><![CDATA[%s]]></charset>\n"
|
|
"\t<isAdult>%"INT32"</isAdult>\n"
|
|
"\t<isLinkSpam>%"INT32"</isLinkSpam>\n"
|
|
"\t<siteRank>%"INT32"</siteRank>\n"
|
|
|
|
"\t<numGoodSiteInlinks>%"INT32"</numGoodSiteInlinks>\n"
|
|
//"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
|
|
// "\t<numUniqueIpsLinkingToSite>%"INT32""
|
|
// "</numUniqueIpsLinkingToSite>\n"
|
|
// "\t<numUniqueCBlocksLinkingToSite>%"INT32""
|
|
// "</numUniqueCBlocksLinkingToSite>\n"
|
|
|
|
|
|
|
|
|
|
// how many inlinks, external and internal, we have
|
|
// to this page not filtered in any way!!!
|
|
//"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
|
|
// how many inlinking ips we got, including our own if
|
|
// we link to ourself
|
|
// "\t<numUniqueIpsLinkingToPage>%"INT32""
|
|
// "</numUniqueIpsLinkingToPage>\n"
|
|
// how many inlinking cblocks we got, including our own
|
|
// if we link to ourself
|
|
// "\t<numUniqueCBlocksLinkingToPage>%"INT32""
|
|
// "</numUniqueCBlocksLinkingToPage>\n"
|
|
|
|
|
|
"\t<numGoodPageInlinks>%"INT32"</numGoodPageInlinks>\n"
|
|
"\t<pageInlinksLastComputed>%"INT32""
|
|
"</pageInlinksLastComputed>\n"
|
|
|
|
,get_charset_str(m_charset)
|
|
,(int32_t)m_isAdult
|
|
,(int32_t)m_isLinkSpam
|
|
,::getSiteRank(sni)
|
|
,sni
|
|
// ,m_siteNumInlinksTotal
|
|
// ,m_siteNumInlinksUniqueIp
|
|
// ,m_siteNumInlinksUniqueCBlock
|
|
|
|
//,info1->m_totalInlinkingDocIds
|
|
//,info1->m_numUniqueIps
|
|
//,info1->m_numUniqueCBlocks
|
|
|
|
,info1->getNumGoodInlinks()
|
|
//,tmp3
|
|
,(int32_t)info1->m_lastUpdated
|
|
);
|
|
//if ( m_note )
|
|
// sb->safePrintf("\t<isLinkSpamReason><![CDATA[%s]]>"
|
|
// "</isLinkSpamReason>\n"
|
|
// , m_note );
|
|
sb->safePrintf("\t<isPermalink>%"INT32"</isPermalink>\n"
|
|
"\t<isRSSFeed>%"INT32"</isRSSFeed>\n"
|
|
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
|
|
"\t<contentLenInBytes>%"INT32""
|
|
"</contentLenInBytes>\n"
|
|
"\t<isContentTruncated>%"INT32""
|
|
"</isContentTruncated>\n"
|
|
"\t<contentType><![CDATA[%s]]></contentType>\n"
|
|
"\t<language><![CDATA[%s]]></language>\n"
|
|
"\t<country><![CDATA[%s]]></country>\n",
|
|
(int32_t)m_isPermalink,
|
|
(int32_t)m_isRSS,
|
|
ipString,
|
|
size_utf8Content - 1,
|
|
(int32_t)m_isContentTruncated,
|
|
g_contentTypeStrings[(int)m_contentType] ,
|
|
strLanguage,
|
|
g_countryCode.getName(m_countryId) );
|
|
}
|
|
|
|
//sb->safePrintf("<tr><td>site</td><td>");
|
|
//sb->safeMemcpy(ptr_site,size_site-1);
|
|
//sb->safePrintf("</td></tr>\n");
|
|
|
|
|
|
TagRec *ogr = NULL;
|
|
if ( m_tagRecDataValid && m_version >= 118 ) {
|
|
ogr = getTagRec(); // &m_tagRec;
|
|
// sanity. should be set from titlerec, so no blocking!
|
|
if ( ! ogr || ogr == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
|
|
else if ( ogr ) ogr->printToBufAsXml ( sb );
|
|
|
|
// show the good inlinks we used when indexing this
|
|
if ( ! isXml )
|
|
info1->print(sb,cr->m_coll);
|
|
|
|
// close the table
|
|
if ( ! isXml )
|
|
sb->safePrintf ( "</table></center><br>\n" );
|
|
else
|
|
sb->safePrintf("</response>\n");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// use msg25 to hit linkdb and give us a link info class i guess
|
|
// but we need paging functionality so we can page through like
|
|
// 100 links at a time. clustered by c-class ip.
|
|
|
|
// do we need to mention how many from each ip c-class then? because
|
|
// then we'd have to read the whole termlist, might be several
|
|
// separate disk reads.
|
|
|
|
// we need to re-get both if either is NULL
|
|
LinkInfo *sinfo = getSiteLinkInfo();
|
|
// block or error?
|
|
if ( ! sinfo ) return true; if ( sinfo == (LinkInfo *)-1) return false;
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
|
|
|
|
sb->safeMemcpy ( &m_siteLinkBuf );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("</response>\n" );
|
|
|
|
// just print that
|
|
//sinfo->print ( sb , cr->m_coll );
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// we need to re-get both if either is NULL
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// block or error?
|
|
if ( ! info1 ) return true; if ( info1 == (LinkInfo *)-1) return false;
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
|
|
int32_t recompute = hr->getLong("recompute",0);
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
// i guess we need this
|
|
if ( ! recompute ) // m_setFromTitleRec )
|
|
info1->print ( sb , cr->m_coll );
|
|
else
|
|
sb->safeMemcpy ( &m_pageLinkBuf );
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("</response>\n" );
|
|
|
|
return true;
|
|
}
|
|
|
|
static void getInlineSectionVotingBufWrapper ( void *state ) {
|
|
XmlDoc *xd = (XmlDoc *)state;
|
|
SafeBuf *vb = xd->getInlineSectionVotingBuf();
|
|
// return if blocked
|
|
if ( vb == (void *)-1 ) return;
|
|
// error?
|
|
if ( ! vb ) log("xmldoc: error getting inline section votes: %s",
|
|
mstrerror(g_errno));
|
|
// all done then. call original entry callback
|
|
log("xmldoc: returning control to original caller");
|
|
xd->m_callback1 ( xd->m_state );
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true with g_errno set on error
|
|
// . this actually returns the page content with inserted information
|
|
// based on sectiondb data
|
|
// . for example, <div id=poo> --> <div id=poo d=5 n=20>
|
|
// means that the section is repeated on 20 pages from this site, 5 of
|
|
// which have the same innerHtml as us
|
|
SafeBuf *XmlDoc::getInlineSectionVotingBuf ( ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// . if we block anywhere below we want to come back here until done
|
|
// . this can be a main entry point, so set m_masterLoop
|
|
if ( ! m_masterLoop ) {
|
|
m_masterLoop = getInlineSectionVotingBufWrapper;
|
|
m_masterState = this;
|
|
log("xmldoc: getting section voting info from coll=%s",
|
|
cr->m_coll);
|
|
}
|
|
|
|
if ( m_inlineSectionVotingBufValid )
|
|
return &m_inlineSectionVotingBuf;
|
|
|
|
Sections *sections = getSectionsWithDupStats();
|
|
if ( ! sections || sections == (void *)-1 ) return (SafeBuf *)sections;
|
|
Words *words = getWords();
|
|
if ( ! words || words == (void *)-1 ) return (SafeBuf *)words;
|
|
HttpMime *mime = getMime();
|
|
if ( ! mime || mime == (void *)-1 ) return (SafeBuf *)mime;
|
|
|
|
int32_t siteHash32 = *getSiteHash32();
|
|
|
|
//int32_t nw = words->getNumWords();
|
|
//int64_t *wids = words->getWordIds();
|
|
|
|
SafeBuf *sb = &m_inlineSectionVotingBuf;
|
|
|
|
// store mime first then content
|
|
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// we no longer use this through a proxy, so take this out
|
|
//sb->safeMemcpy ( m_httpReply , mime->getMimeLen() );
|
|
// but hack the Content-Length: field to something alien
|
|
// because we mark up the html and the length will be different...
|
|
//sb->nullTerm();
|
|
|
|
// we no longer use this through a proxy so take this out
|
|
//char *cl = strstr(sb->getBufStart(),"\nContent-Length:");
|
|
//if ( cl ) cl[1] = 'Z';
|
|
|
|
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
|
|
|
|
// just print out each word
|
|
// map the word to a section.
|
|
// if it's the first time we've printed the section then we
|
|
// can inject the stuff
|
|
// set a printed bit to indicate when we print out a section so
|
|
// we do not re-print it...
|
|
|
|
// these are 1-1 with words
|
|
Section **sptrs = sections->m_sectionPtrs;
|
|
int32_t nw = words->getNumWords();
|
|
char **wptrs = words->m_words;
|
|
int32_t *wlens = words->m_wordLens;
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
char *a = wptrs[i];
|
|
// skip if not a front tag
|
|
if ( *a != '<' || a[1] == '/' ) {
|
|
sb->safeMemcpy(a,wlens[i]);
|
|
continue;
|
|
}
|
|
Section *sa = sptrs[i];
|
|
// straight copy if no stats
|
|
if ( ! sa || ! sa->m_stats.m_totalEntries ) {
|
|
sb->safeMemcpy ( a , wlens[i] );
|
|
continue;
|
|
}
|
|
// should be tag then
|
|
char *e = a;
|
|
for ( ; *e && *e != '>' && ! is_wspace_a(*e) ; e++);
|
|
// copy that
|
|
sb->safeMemcpy ( a , e-a);
|
|
|
|
// the hash of the turktaghash and sitehash32 combined
|
|
// so you can do gbfacetstr:gbxpathsitehash12345
|
|
// where the 12345 is this h32 value.
|
|
uint32_t h32 = sa->m_turkTagHash32 ^ siteHash32;
|
|
|
|
// insert our stuff into the tag
|
|
//sb->safePrintf("<!--");
|
|
//sb->safePrintf("<font color=red>");
|
|
SectionStats *sx = &sa->m_stats;
|
|
// # docs from our site had the same innerHTML?
|
|
sb->safePrintf(" _s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32"",
|
|
// total # of docs that had an xpath with
|
|
// our same innerHtml
|
|
(int32_t)sx->m_totalMatches,
|
|
// # of docids with this facet
|
|
(int32_t)sx->m_totalDocIds,
|
|
// . total # of times this xpath occurred
|
|
// . can be multiple times per doc
|
|
(int32_t)sx->m_totalEntries,
|
|
// unique values in the xpath innerhtml
|
|
(int32_t)sx->m_numUniqueVals,
|
|
// xpathsitehash
|
|
h32 );
|
|
// copy the rest of the tag
|
|
sb->safeMemcpy( e, wlens[i]-(e-a) );
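// so a tag like <div id=nav> comes out roughly as (numbers made up):
//   <div _s=M5D20n22u3h2876383112 id=nav>
// i.e. 5 docs matched our innerHtml, 20 docs had this xpath, 22 total
// occurrences of the xpath, 3 unique values, xpath/site hash 2876383112.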
|
|
//sb->safePrintf("-->");
|
|
//sb->safePrintf("</font>");
|
|
// print it here
|
|
}
|
|
sb->nullTerm();
|
|
m_inlineSectionVotingBufValid = true;
|
|
return &m_inlineSectionVotingBuf;
|
|
}
|
|
|
|
bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// what wordposition to scroll to and blink?
|
|
int32_t hiPos = -1;
|
|
if ( hr ) hiPos = hr->getLong("hipos",-1);
|
|
|
|
//
|
|
// PRINT SECTIONS
|
|
//
|
|
Sections *sections ;
|
|
// hr is NULL if being called from page parser which does not have the
|
|
// dup stats! and we core if we block here!
|
|
if ( hr ) sections = getSectionsWithDupStats();
|
|
else sections = getSections();
|
|
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
|
|
//SectionVotingTable *nsvt = getNewSectionVotingTable();
|
|
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
|
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
|
Words *words = getWords();
|
|
if ( ! words ) return true; if ( words == (Words *)-1 ) return false;
|
|
Phrases *phrases = getPhrases();
|
|
if ( ! phrases ) return true; if (phrases == (void *)-1 ) return false;
|
|
HashTableX *cnt = getCountTable();
|
|
if ( ! cnt ) return true; if ( cnt == (void *)-1 ) return false;
|
|
|
|
|
|
int32_t nw = words->getNumWords();
|
|
//int32_t wordStart = 0;
|
|
//int32_t wordEnd = nw;
|
|
int64_t *wids = words->getWordIds();
|
|
|
|
int32_t isXml = false;
|
|
if ( hr ) isXml = (bool)hr->getLong("xml",0);
|
|
|
|
//if ( ! isXml ) printMenu ( sb );
|
|
|
|
// now complement, cuz bigger is better in the ranking world
|
|
//int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY );
|
|
SafeBuf densBuf;
|
|
// returns false and sets g_errno on error
|
|
if ( ! getDensityRanks((int64_t *)wids,
|
|
nw,
|
|
HASHGROUP_BODY,//hi->m_hashGroup,
|
|
&densBuf,
|
|
sections,
|
|
m_niceness))
|
|
return true;
|
|
// a handy ptr
|
|
char *densityVec = (char *)densBuf.getBufStart();
|
|
|
|
|
|
/*
|
|
if ( ! isXml )
|
|
sb->safePrintf("<br><b>density rank of body = %"INT32"</b> "
|
|
"(out of %"INT32")"
|
|
"<br>"
|
|
"<br>"
|
|
, densityRank
|
|
, (int32_t)MAXDENSITYRANK
|
|
);
|
|
*/
|
|
|
|
|
|
char *wordSpamVec = getWordSpamVec();
|
|
char *fragVec = m_fragBuf.getBufStart();
|
|
|
|
SafeBuf dwbuf;
|
|
if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true;
|
|
char *diversityVec = dwbuf.getBufStart();
|
|
|
|
// hack fack debug
|
|
//m_bodyStartPos =2136;
|
|
|
|
SafeBuf wpos;
|
|
if ( ! getWordPosVec ( words ,
|
|
sections,
|
|
//wordStart,
|
|
//wordEnd,
|
|
// we save this in the titlerec, when we
|
|
// start hashing the body. we have the url
|
|
// terms before the body, so this is necessary.
|
|
m_bodyStartPos,//0, // hi->m_startDist,
|
|
fragVec,
|
|
m_niceness,
|
|
&wpos) ) return true;
|
|
// a handy ptr
|
|
int32_t *wposVec = (int32_t *)wpos.getBufStart();
|
|
|
|
if ( ! isXml ) {
|
|
// put url in for steve to parse out
|
|
sb->safePrintf("%s\n",
|
|
m_firstUrl.m_url);
|
|
sb->safePrintf("<font color=black>w</font>"
|
|
"/"
|
|
"<font color=purple>x</font>"
|
|
//"/"
|
|
//"<font color=green>y</font>"
|
|
"/"
|
|
"<font color=red>z</font>"
|
|
": "
|
|
"w=wordPosition "
|
|
"x=densityRank "
|
|
//"y=diversityRank "
|
|
"z=wordSpamRank "
|
|
"<br>"
|
|
"<br>"
|
|
""
|
|
);
|
|
}
|
|
|
|
if ( ! isXml ) {
|
|
// try the new print function
|
|
sections->print2 ( sb ,
|
|
hiPos,
|
|
wposVec,
|
|
densityVec,
|
|
diversityVec,
|
|
wordSpamVec,
|
|
fragVec,
|
|
NULL,
|
|
NULL ,
|
|
&m_addresses ,
|
|
true );
|
|
return true;
|
|
}
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
|
|
Section *si = sections->m_rootSection;
|
|
|
|
sec_t mflags = SEC_SENTENCE | SEC_MENU;
|
|
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// print it out
|
|
sb->safePrintf("\t<section>\n");
|
|
// get our offset in the array of sections
|
|
int32_t num = si - sections->m_sections;
|
|
sb->safePrintf("\t\t<id>%"INT32"</id>\n",num);
|
|
Section *parent = si->m_parent;
|
|
if ( parent ) {
|
|
int32_t pnum = parent - sections->m_sections;
|
|
sb->safePrintf("\t\t<parent>%"INT32"</parent>\n",pnum);
|
|
}
|
|
char *byte1 = words->m_words[si->m_a];
|
|
char *byte2 = words->m_words[si->m_b-1] +
|
|
words->m_wordLens[si->m_b-1];
|
|
int32_t off1 = byte1 - words->m_words[0];
|
|
int32_t size = byte2 - byte1;
|
|
sb->safePrintf("\t\t<byteOffset>%"INT32"</byteOffset>\n",off1);
|
|
sb->safePrintf("\t\t<numBytes>%"INT32"</numBytes>\n",size);
|
|
if ( si->m_flags & mflags ) {
|
|
sb->safePrintf("\t\t<flags><![CDATA[");
|
|
bool printed = false;
|
|
if ( si->m_flags & SEC_SENTENCE ) {
|
|
sb->safePrintf("sentence");
|
|
printed = true;
|
|
}
|
|
if ( si->m_flags & SEC_MENU ) {
|
|
if ( printed ) sb->pushChar(' ');
|
|
sb->safePrintf("ismenu");
|
|
printed = true;
|
|
}
|
|
sb->safePrintf("]]></flags>\n");
|
|
}
|
|
int32_t bcolor = (int32_t)si->m_colorHash& 0x00ffffff;
|
|
int32_t fcolor = 0x000000;
|
|
//int32_t rcolor = 0x000000;
|
|
uint8_t *bp = (uint8_t *)&bcolor;
|
|
bool dark = false;
|
|
if ( bp[0]<128 && bp[1]<128 && bp[2]<128 )
|
|
dark = true;
|
|
// or if two are less than 50
|
|
if ( bp[0]<100 && bp[1]<100 ) dark = true;
|
|
if ( bp[1]<100 && bp[2]<100 ) dark = true;
|
|
if ( bp[0]<100 && bp[2]<100 ) dark = true;
|
|
// if bg color is dark, make font color light
|
|
if ( dark ) {
|
|
fcolor = 0x00ffffff;
|
|
//rcolor = 0x00ffffff;
|
|
}
|
|
sb->safePrintf("\t\t<bgColor>%06"XINT32"</bgColor>\n",bcolor);
|
|
sb->safePrintf("\t\t<textColor>%06"XINT32"</textColor>\n",fcolor);
|
|
// count stats
|
|
uint64_t ch64 = (int32_t)si->m_sentenceContentHash64;
|
|
if ( ! ch64 ) {
|
|
sb->safePrintf("\t</section>\n");
|
|
continue;
|
|
}
|
|
/* take this out for now it is not quite right any more.
|
|
we now use the xpath hash and site hash as the key
|
|
and the "value" is the sentence/innerHtml hash
|
|
sb->safePrintf("\t\t<numOnSitePagesThatDuplicateContent>%"INT32""
|
|
"</numOnSitePagesThatDuplicateContent>\n",
|
|
(int32_t)si->m_stats.m_onSiteDocIds);
|
|
sb->safePrintf("\t\t<numOffSitePagesThatDuplicateContent>%"INT32""
|
|
"</numOffSitePagesThatDuplicateContent>\n",
|
|
(int32_t)si->m_stats.m_offSiteDocIds);
|
|
sb->safePrintf("\t\t<numSitesThatDuplicateContent>%"INT32""
|
|
"</numSitesThatDuplicateContent>\n",
|
|
(int32_t)si->m_stats.m_numUniqueSites);
|
|
*/
|
|
// you can do a sitehash:xxxxx this number to see who the
|
|
// dups are!
|
|
sb->safePrintf("\t\t<innerContentHash64>%"UINT64""
|
|
"</innerContentHash64>\n",
|
|
si->m_sentenceContentHash64);
|
|
sb->safePrintf("\t</section>\n");
|
|
}
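// a sample <section> element as emitted by the loop above (values are
// illustrative only; sections with no sentence content hash close right
// after <textColor>):
//
//   <section>
//           <id>12</id>
//           <parent>3</parent>
//           <byteOffset>2048</byteOffset>
//           <numBytes>157</numBytes>
//           <flags><![CDATA[sentence ismenu]]></flags>
//           <bgColor>a3c2f0</bgColor>
//           <textColor>000000</textColor>
//           <innerContentHash64>123456789012345</innerContentHash64>
//   </section>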
|
|
|
|
// now print out the entire page content so the offsets make sense!
|
|
sb->safePrintf("\t<utf8Content><![CDATA[");
|
|
if ( ptr_utf8Content )
|
|
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,false);
|
|
sb->safePrintf("]]></utf8Content>\n");
|
|
|
|
// end xml response
|
|
sb->safePrintf("</response>\n");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
// set debug buffer
|
|
m_storeTermListInfo = true;
|
|
|
|
// default to sorting by wordpos
|
|
m_sortTermListBy = hr->getLong("sortby",1);
|
|
|
|
// cores in getNewSpiderReply() if we do not have this and provide
|
|
// the docid...
|
|
m_useSpiderdb = false;
|
|
|
|
char *metaList = getMetaList ( );
|
|
if ( ! metaList ) return true; if (metaList==(char *) -1) return false;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( isXml ) {
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
sb->safePrintf(
|
|
"\t<maxDens>%"INT32"</maxDens>\n"
|
|
//"\t<maxDiv>%"INT32"</maxDiv>\n"
|
|
"\t<maxSpam>%"INT32"</maxSpam>\n"
|
|
, (int32_t)MAXDENSITYRANK
|
|
//, (int32_t)MAXDIVERSITYRANK
|
|
, (int32_t)MAXWORDSPAMRANK
|
|
);
|
|
}
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! isXml ) {
|
|
//printMenu ( sb );
|
|
//sb->safePrintf("<i>* indicates word is a synonym or "
|
|
// "alternative word form<br><br>");
|
|
sb->safePrintf("N column = DensityRank (0-%"INT32")<br>"
|
|
//"V column = DiversityRank (0-%"INT32")<br>"
|
|
"S column = WordSpamRank (0-%"INT32") "
|
|
"[or linker "
|
|
"siterank if its offsite link text]<br>"
|
|
|
|
"Lang column = language used for purposes "
|
|
"of detecting the document's primary language "
|
|
"using a simple majority vote"
|
|
"<br>"
|
|
|
|
"</i>"
|
|
"<br>"
|
|
"Document Primary Language: <b>%s</b> (%s)"
|
|
"<br>"
|
|
"<br>"
|
|
, (int32_t)MAXDENSITYRANK
|
|
//, (int32_t)MAXDIVERSITYRANK
|
|
, (int32_t)MAXWORDSPAMRANK
|
|
, getLanguageString (m_langId)
|
|
, getLangAbbr(m_langId)
|
|
);
|
|
// encode it
|
|
SafeBuf ue;
|
|
ue.urlEncode ( ptr_firstUrl );
|
|
|
|
sb->safePrintf("Sort by: " );
|
|
if ( m_sortTermListBy == 0 )
|
|
sb->safePrintf("<b>Term</b>");
|
|
else
|
|
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
|
|
"sortby=0>"
|
|
"Term</a>"
|
|
, cr->m_coll
|
|
, ue.getBufStart()
|
|
);
|
|
sb->safePrintf(" | ");
|
|
if ( m_sortTermListBy == 1 )
|
|
sb->safePrintf("<b>WordPos</b>");
|
|
else
|
|
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
|
|
"sortby=1>"
|
|
"WordPos</a>"
|
|
, cr->m_coll
|
|
, ue.getBufStart()
|
|
);
|
|
sb->safePrintf("<br>"
|
|
"<br>"
|
|
);
|
|
}
|
|
|
|
|
|
//
|
|
// BEGIN PRINT HASHES TERMS (JUST POSDB)
|
|
//
|
|
|
|
// shortcut
|
|
HashTableX *wt = m_wts;
|
|
|
|
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
|
|
TermDebugInfo **tp = NULL;
|
|
// add them with this counter
|
|
int32_t nt = 0;
|
|
|
|
int32_t nwt = 0;
|
|
if ( wt ) {
|
|
nwt = wt->m_numSlots;
|
|
tp = (TermDebugInfo **)wt->m_keys;
|
|
}
|
|
|
|
// now print the table we stored all we hashed into
|
|
for ( int32_t i = 0 ; i < nwt ; i++ ) {
|
|
// skip if empty
|
|
if ( wt->m_flags[i] == 0 ) continue;
|
|
// breathe
|
|
//QUICKPOLL(m_niceness);
|
|
// get its key, date=32bits termid=64bits
|
|
//key96_t *k = (key96_t *)wt->getKey ( i );
|
|
// get the TermDebugInfo
|
|
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
|
|
// point to it for sorting
|
|
tp[nt++] = ti;
|
|
}
|
|
|
|
// set this for cmptp
|
|
s_wbuf = &m_wbuf;
|
|
|
|
if ( m_sortTermListBy == 0 )
|
|
// sort them alphabetically
|
|
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
|
|
else
|
|
// sort by word pos
|
|
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp2 , m_niceness );
|
|
|
|
|
|
// print the weight tables
|
|
//printLocationWeightsTable(sb,isXml);
|
|
//printDiversityWeightsTable(sb,isXml);
|
|
//printDensityWeightsTable(sb,isXml);
|
|
//printWordSpamWeightsTable(sb,isXml);
|
|
|
|
// print them out in a table
|
|
char hdr[1000];
|
|
sprintf(hdr,
|
|
"<table border=1 cellpadding=0>"
|
|
"<tr>"
|
|
// this messes up Test.cpp diff'ing
|
|
//"<td><b>#</b></td>"
|
|
"<td><b>Prefix</b></td>"
|
|
"<td><b>WordPos</b></td>"
|
|
"<td><b>Lang</b></td>"
|
|
|
|
"<td><b>Term</b></td>"
|
|
|
|
//"%s"
|
|
|
|
//"<td><b>Weight</b></td>"
|
|
//"<td><b>Spam</b></td>"
|
|
|
|
"<td><b>Desc</b></td>"
|
|
|
|
"<td><b>N</b></td>"
|
|
//"<td><b>V</b></td>" // diversityRank
|
|
"<td><b>S</b></td>"
|
|
"<td><b>Score</b></td>"
|
|
|
|
//"<td><b>Date</b></td>"
|
|
//"<td><b>Desc</b></td>"
|
|
//"<td><b>TermId</b></td>"
|
|
"</tr>\n"
|
|
//,fbuf
|
|
);
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("%s",hdr);
|
|
|
|
char *start = m_wbuf.getBufStart();
|
|
int32_t rcount = 0;
|
|
|
|
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
|
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS) == 0 && ! isXml )
|
|
sb->safePrintf("<!--ignore--></table>%s",hdr);
|
|
|
|
char *prefix = NULL;//" ";
|
|
if ( tp[i]->m_prefixOff >= 0 )
|
|
prefix = start + tp[i]->m_prefixOff;
|
|
|
|
if ( isXml ) sb->safePrintf("\t<term>\n");
|
|
|
|
if ( isXml && prefix )
|
|
sb->safePrintf("\t\t<prefix><![CDATA[%s]]>"
|
|
"</prefix>\n",prefix);
|
|
|
|
if ( ! isXml ) {
|
|
sb->safePrintf ( "<tr>");
|
|
if ( prefix )
|
|
sb->safePrintf("<td>%s:</td>",prefix);
|
|
else
|
|
sb->safePrintf("<td> </td>");
|
|
}
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf("<td>%"INT32""
|
|
"/%"INT32""
|
|
"</td>" ,
|
|
tp[i]->m_wordPos
|
|
,tp[i]->m_wordNum
|
|
);
|
|
|
|
//char *abbr = getLangAbbr(tp[i]->m_langId);
|
|
//if ( tp[i]->m_langId == langTranslingual ) abbr ="??";
|
|
//if ( tp[i]->m_langId == langUnknown ) abbr ="--";
|
|
//if ( tp[i]->m_synSrc ) abbr = "";
|
|
|
|
|
|
// print out all langs word is in if it's not clear
|
|
// what language it is. we use a sliding window to
|
|
// resolve some ambiguity, but not all, so print out
|
|
// the possible langs here
|
|
if ( ! isXml ) {
|
|
sb->safePrintf("<td>");
|
|
printLangBits ( sb , tp[i] );
|
|
sb->safePrintf("</td>");
|
|
}
|
|
|
|
|
|
//if ( ! isXml && abbr[0] )
|
|
// sb->safePrintf("<td>%s</td>", abbr );
|
|
//else if ( ! isXml )
|
|
// sb->safePrintf("<td> </td>" );
|
|
//else if ( abbr[0] )
|
|
// sb->safePrintf("\t\t<lang><![CDATA["
|
|
// "]]>%s</lang>\n", abbr );
|
|
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<s><![CDATA[");
|
|
|
|
if ( ! isXml )
|
|
sb->safePrintf ("<td><nobr>" );
|
|
|
|
//if ( tp[i]->m_synSrc )
|
|
// sb->pushChar('*');
|
|
|
|
sb->safeMemcpy_nospaces ( start + tp[i]->m_termOff ,
|
|
tp[i]->m_termLen );
|
|
|
|
/*
|
|
char *dateStr = " ";
|
|
int32_t ddd = tp[i]->m_date;
|
|
uint8_t *tddd = (uint8_t *)&ddd;
|
|
char tbbb[32];
|
|
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
|
|
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
|
|
sprintf(tbbb,"evIds %"INT32"-%"INT32"",
|
|
(int32_t)tddd[1],(int32_t)tddd[0]);
|
|
dateStr = tbbb;
|
|
}
|
|
else if ( ddd )
|
|
dateStr = asctime ( gmtime(&ddd ));
|
|
|
|
char tmp[20];
|
|
if ( tp[i]->m_noSplit ) sprintf ( tmp,"<b>1</b>" );
|
|
else sprintf ( tmp,"0" );
|
|
*/
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("]]></s>\n");
|
|
else
|
|
sb->safePrintf ( "</nobr></td>" );
|
|
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<wordPos>%"INT32"</wordPos>\n",
|
|
tp[i]->m_wordPos);
|
|
|
|
char *desc = NULL;
|
|
if ( tp[i]->m_descOff >= 0 )
|
|
desc = start + tp[i]->m_descOff;
|
|
|
|
// use hashgroup
|
|
int32_t hg = tp[i]->m_hashGroup;
|
|
if ( ! desc || ! strcmp(desc,"body") )
|
|
desc = getHashGroupString(hg);
|
|
|
|
if ( isXml && desc )
|
|
sb->safePrintf("\t\t<loc>%s</loc>\n", desc);
|
|
else if ( ! isXml ) {
|
|
if ( ! desc ) desc = " ";
|
|
sb->safePrintf ( "<td>%s", desc );
|
|
char ss = tp[i]->m_synSrc;
|
|
if ( ss )
|
|
sb->safePrintf(" - %s",
|
|
getSourceString(ss));
|
|
sb->safePrintf("</td>");
|
|
}
|
|
|
|
int32_t dn = (int32_t)tp[i]->m_densityRank;
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<dens>%"INT32"</dens>\n",dn);
|
|
|
|
if ( ! isXml && dn >= MAXDENSITYRANK )
|
|
sb->safePrintf("<td>%"INT32"</td>\n",dn);
|
|
else if ( ! isXml )
|
|
sb->safePrintf("<td><font color=purple>%"INT32"</font>"
|
|
"</td>",dn);
|
|
|
|
// the diversityrank/wordspamrank
|
|
/*
|
|
int32_t ds = (int32_t)tp[i]->m_diversityRank;
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<div>%"INT32"</div>\n",ds);
|
|
if ( ! isXml && ds >= MAXDIVERSITYRANK )
|
|
sb->safePrintf("<td>%"INT32"</td>\n",ds);
|
|
else if ( ! isXml )
|
|
sb->safePrintf("<td><font color=green>%"INT32"</font>"
|
|
"</td>",ds);
|
|
*/
|
|
|
|
int32_t ws = (int32_t)tp[i]->m_wordSpamRank;
|
|
|
|
if ( isXml && hg == HASHGROUP_INLINKTEXT )
|
|
sb->safePrintf("\t\t<linkerSiteRank>%"INT32""
|
|
"</linkerSiteRank>\n",ws);
|
|
else if ( isXml )
|
|
sb->safePrintf("\t\t<spam>%"INT32"</spam>\n",ws);
|
|
|
|
if ( ! isXml && ws >= MAXWORDSPAMRANK )
|
|
sb->safePrintf("<td>%"INT32"</td>",ws);
|
|
else if ( ! isXml )
|
|
sb->safePrintf("<td><font color=red>%"INT32"</font></td>",
|
|
ws);
|
|
|
|
float score = 1.0;
|
|
// square this like we do in the query ranking algo
|
|
score *= getHashGroupWeight(hg) * getHashGroupWeight(hg);
|
|
//score *= getDiversityWeight(tp[i]->m_diversityRank);
|
|
score *= getDensityWeight(tp[i]->m_densityRank);
|
|
if ( tp[i]->m_synSrc ) score *= SYNONYM_WEIGHT;
|
|
if ( hg == HASHGROUP_INLINKTEXT ) score *= getLinkerWeight(ws);
|
|
else score *= getWordSpamWeight(ws);
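// in other words the per-term score shown in this table is roughly:
//
//   score = hashGroupWeight(hg)^2
//         * densityWeight(densityRank)
//         * SYNONYM_WEIGHT            (only if the term is a synonym)
//         * linkerWeight(ws)          (for inlink text)
//           or wordSpamWeight(ws)     (for everything else)
//
// which approximates the weighting used at query time.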
|
|
if ( isXml )
|
|
sb->safePrintf("\t\t<score>%.02f</score>\n",score);
|
|
else
|
|
sb->safePrintf("<td>%.02f</td>\n",score);
|
|
|
|
if ( isXml )
|
|
sb->safePrintf("\t</term>\n");
|
|
else
|
|
sb->safePrintf("</tr>\n");
|
|
}
|
|
|
|
|
|
if ( isXml )
|
|
sb->safePrintf ("</response>\n" );
|
|
else
|
|
sb->safePrintf("</table><br>\n");
|
|
|
|
//
|
|
// END PRINT HASHES TERMS
|
|
//
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
if ( ! isXml ) printMenu ( sb );
|
|
|
|
sb->safePrintf("<b>Coming Soon</b>");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {
|
|
|
|
char **c = getUtf8Content();
|
|
if ( ! c ) return true; if ( c==(void *)-1) return false;
|
|
|
|
int32_t isXml = hr->getLong("xml",0);
|
|
|
|
int32_t raw = hr->getLong("raw",0);
|
|
|
|
if ( ! isXml && ! raw ) printMenu ( sb );
|
|
|
|
if ( ! isXml ) {
|
|
// just copy it otherwise
|
|
if ( ptr_utf8Content )
|
|
sb->safeMemcpy ( ptr_utf8Content ,size_utf8Content -1);
|
|
return true;
|
|
}
|
|
|
|
sb->safePrintf ("<?xml version=\"1.0\" "
|
|
"encoding=\"UTF-8\" ?>\n"
|
|
"<response>\n"
|
|
);
|
|
sb->safePrintf("\t<utf8Content><![CDATA[");
|
|
if ( ptr_utf8Content )
|
|
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,
|
|
false);
|
|
sb->safePrintf("]]></utf8Content>\n");
|
|
// end xml response
|
|
sb->safePrintf("</response>\n");
|
|
return true;
|
|
}
|
|
|
|
|
|
// . get the possible titles of the root page
|
|
// . includes the title tag text
|
|
// . includes various inlink text
|
|
// . used to match the VERIFIED place name 1 or 2 of addresses on this
|
|
// site in order to set Address::m_flags's AF_VENUE_DEFAULT bit which
|
|
// indicates the address is the address of the website (a venue website)
|
|
char **XmlDoc::getRootTitleBuf ( ) {
|
|
|
|
// return if valid
|
|
if ( m_rootTitleBufValid ) return (char **)&m_rootTitleBuf;
|
|
|
|
// get it from the tag rec first
|
|
setStatus ( "getting root title buf");
|
|
|
|
// sanity check, root must have been indexed
|
|
//if ( ! m_sreq.m_rootIndexed ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . update it first before reading it!
|
|
// . do not update it here, just update it in getTitleRec() because
|
|
// this makes doConsistencyCheck() block and core
|
|
//bool *status2 = updateSiteTitleBuf();
|
|
//if ( ! status2 || status2 == (void *)-1 ) return (char **)status2;
|
|
|
|
// get it from the tag rec if we can
|
|
TagRec *gr = getTagRec ();
|
|
if ( ! gr || gr == (void *)-1 ) return (char **)gr;
|
|
|
|
// clear this if not set from title rec
|
|
//if ( ! m_setFromTitleRec ) {
|
|
// ptr_siteTitleBuf = NULL;
|
|
// size_siteTitleBuf = 0;
|
|
//}
|
|
|
|
// PROBLEM: new title rec is the only thing which has sitetitles tag
|
|
// sometimes and we do not store that in the title rec. in this case
|
|
// we should maybe store ptr_siteTitleBuf/size_siteTitleBuf in the
|
|
// title rec?
|
|
Tag *tag = gr->getTag("roottitles");
|
|
|
|
char *src = NULL;
|
|
int32_t srcSize = 0;
|
|
|
|
if ( ptr_rootTitleBuf || m_setFromTitleRec ) {
|
|
src = ptr_rootTitleBuf;
|
|
srcSize = size_rootTitleBuf;
|
|
}
|
|
else if ( tag ) {
|
|
src = tag->getTagData();
|
|
srcSize = tag->getTagDataSize();
|
|
// no need to add to title rec since already in the tag so
|
|
// make sure we did not double add
|
|
if ( ptr_rootTitleBuf ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
else {
|
|
// . get the root doc
|
|
// . allow for a one hour cache of the titleRec
|
|
XmlDoc **prd = getRootXmlDoc( 3600 );
|
|
if ( ! prd || prd == (void *)-1 ) return (char **)prd;
|
|
// shortcut
|
|
XmlDoc *rd = *prd;
|
|
// . if no root doc, then assume no root title
|
|
// . this happens if we are injecting because we do not want
|
|
// to download the root page for speed purposes
|
|
if ( ! rd ) {
|
|
m_rootTitleBuf[0] = '\0';
|
|
m_rootTitleBufSize = 0;
|
|
m_rootTitleBufValid = true;
|
|
return (char **)&m_rootTitleBuf;
|
|
}
|
|
// . ONLY do this if root doc was NOT set from titleRec to
|
|
// avoid that core in updateSiteTitleBuf(). this can happen
|
|
// if the root doc had no title! (or no content)
|
|
//if ( rd->m_setFromTitleRec ) {
|
|
// // emptyt
|
|
// m_siteTitleBuf[0] = '\0';
|
|
// // set the size of it
|
|
// m_siteTitleBufSize = 0;
|
|
// // validate it
|
|
// m_siteTitleBufValid = true;
|
|
// // return a ptr to it
|
|
// return (char **)&m_siteTitleBuf;
|
|
//}
|
|
|
|
// a \0 separated list
|
|
char **rtl = rd->getTitleBuf();
|
|
if ( ! rtl || rtl == (void *)-1 ) return (char **)rtl;
|
|
|
|
// ptr
|
|
src = rd->m_titleBuf;
|
|
srcSize = rd->m_titleBufSize;
|
|
}
|
|
|
|
int32_t max = (int32_t)ROOT_TITLE_BUF_MAX - 5;
|
|
// sanity
|
|
if ( srcSize >= max ) {
|
|
// truncate
|
|
srcSize = max;
|
|
// back up so we split on a space
|
|
for ( ; srcSize>0 && ! is_wspace_a(src[srcSize]); srcSize--);
|
|
// null term
|
|
src[srcSize] = '\0';
|
|
// include it
|
|
srcSize++;
|
|
}
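// note: the truncation above backs up to a whitespace boundary and writes
// the '\0' directly into the *source* buffer (the tag data or the root
// doc's title buf), so the gbmemcpy below always copies a cleanly
// null-terminated list.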
|
|
|
|
// copy that over in case root is destroyed
|
|
gbmemcpy ( m_rootTitleBuf , src , srcSize );
|
|
m_rootTitleBufSize = srcSize;
|
|
|
|
// sanity check, must include the null in the size
|
|
if ( m_rootTitleBufSize > 0 &&
|
|
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
|
|
log("build: bad root titlebuf size not end in null char for "
|
|
"collnum=%i",(int)m_collnum);
|
|
ptr_rootTitleBuf = NULL;
|
|
size_rootTitleBuf = 0;
|
|
m_rootTitleBufValid = true;
|
|
return (char **)&m_rootTitleBuf;
|
|
char *xx=NULL;*xx=0;
|
|
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
|
|
//m_rootTitleBufSize++;
|
|
}
|
|
|
|
// sanity check - breach check
|
|
if ( m_rootTitleBufSize > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0;}
|
|
|
|
// serialize into our titlerec
|
|
ptr_rootTitleBuf = m_rootTitleBuf;
|
|
size_rootTitleBuf = m_rootTitleBufSize;
|
|
|
|
m_rootTitleBufValid = true;
|
|
|
|
return (char **)&m_rootTitleBuf;
|
|
}
|
|
|
|
|
|
char **XmlDoc::getFilteredRootTitleBuf ( ) {
|
|
|
|
if ( m_filteredRootTitleBufValid )
|
|
return (char **)&m_filteredRootTitleBuf;
|
|
|
|
// get unfiltered. m_rootTitleBuf should be set from this call.
|
|
char **rtbp = getRootTitleBuf();
|
|
if ( ! rtbp || rtbp == (void *)-1 ) return (char **)rtbp;
|
|
|
|
/*
|
|
// assume none
|
|
m_filteredRootTitleBuf[0] = '\0';
|
|
m_filteredRootTitleBufSize = 0;
|
|
m_filteredRootTitleBufValid = true;
|
|
return (char **)&m_filteredRootTitleBuf;
|
|
*/
|
|
|
|
// filter all the punct to \0 so that something like
|
|
// "walmart.com : live better" is reduced to 3 potential
|
|
// names, "walmart", "com" and "live better"
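// the output is a '\0'-separated list of candidate names, same layout as
// m_rootTitleBuf. a minimal consumer sketch (hypothetical, not from this
// file):
//
//   char *p    = m_filteredRootTitleBuf;
//   char *pend = p + m_filteredRootTitleBufSize;
//   while ( p < pend ) {
//           int32_t len = gbstrlen(p);   // one candidate name
//           // ... use (p,len) ...
//           p += len + 1;                // skip the terminating \0
//   }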
|
|
char *src = m_rootTitleBuf;
|
|
char *srcEnd = src + m_rootTitleBufSize;
|
|
char *dst = m_filteredRootTitleBuf;
|
|
// save some room to add a \0, so subtract 5
|
|
char *dstEnd = dst + ROOT_TITLE_BUF_MAX - 5;
|
|
//char *src = tag->getTagData();
|
|
//char *srcEnd = src + tag->getTagDataSize();
|
|
int32_t size = 0;
|
|
bool lastWasPunct = true;
|
|
for ( ; src < srcEnd && dst < dstEnd ; src += size ) {
|
|
// set the char size
|
|
size = getUtf8CharSize(src);
|
|
// space?
|
|
if ( is_wspace_a (*src) ||
|
|
// allow periods too
|
|
*src=='.' ) {
|
|
// no back to back punct
|
|
if ( lastWasPunct ) continue;
|
|
// flag it
|
|
lastWasPunct = true;
|
|
// add it in
|
|
*dst++ = '.';
|
|
// that's it
|
|
continue;
|
|
}
|
|
// x'y or x-y
|
|
if ( ( *src == '\'' ||
|
|
*src == '.' ||
|
|
*src == '-' ) &&
|
|
! lastWasPunct &&
|
|
is_alnum_a(src[1]) ) {
|
|
// add it in
|
|
*dst++ = *src;
|
|
// that's it
|
|
continue;
|
|
}
|
|
// x & y is ok
|
|
if ( *src == '&' ) {
|
|
// assume not punct (stands for and)
|
|
lastWasPunct = false;
|
|
// add it in
|
|
*dst++ = *src;
|
|
// that's it
|
|
continue;
|
|
}
|
|
// store alnums right in
|
|
if ( is_alnum_a(*src) ) {
|
|
// flag it
|
|
lastWasPunct = false;
|
|
// copy it over
|
|
gbmemcpy ( dst , src , size );
|
|
// skip what we copied
|
|
dst += size;
|
|
continue;
|
|
}
|
|
// if punct and haven't stored anything, just skip it
|
|
if ( lastWasPunct ) dst[-1] = '\0';
|
|
// store it
|
|
else *dst++ = '\0';
|
|
}
|
|
// make sure we end on a \0
|
|
if ( dst > m_filteredRootTitleBuf && dst[-1] != '\0' )
|
|
*dst++ = '\0';
|
|
|
|
// shortcut
|
|
char *str = m_filteredRootTitleBuf;
|
|
int32_t strSize = dst - m_filteredRootTitleBuf;
|
|
|
|
// copy that over in case root is destroyed
|
|
gbmemcpy ( m_filteredRootTitleBuf , str , strSize );
|
|
m_filteredRootTitleBufSize = strSize;
|
|
|
|
// sanity check, must include the null in the size
|
|
if ( m_filteredRootTitleBufSize > 0 &&
|
|
m_filteredRootTitleBuf [ m_filteredRootTitleBufSize - 1 ] ) {
|
|
char *xx=NULL;*xx=0;
|
|
//m_filteredRootTitleBuf [ m_filteredRootTitleBufSize-1]='\0';
|
|
//m_filteredRootTitleBufSize++;
|
|
}
|
|
|
|
// sanity check - breach check
|
|
if ( m_filteredRootTitleBufSize > ROOT_TITLE_BUF_MAX ) {
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
m_filteredRootTitleBufValid = true;
|
|
|
|
// make this static to avoid compiler warning
|
|
static char *fp = m_filteredRootTitleBuf;
|
|
|
|
return (char **)&fp;
|
|
//return (char **)&m_filteredRootTitleBuf;
|
|
}
|
|
|
|
//static bool s_dummyBool = 1;
|
|
|
|
class Binky {
|
|
public:
|
|
char *m_text;
|
|
int32_t m_textLen;
|
|
int32_t m_score;
|
|
int64_t m_hash;
|
|
};
|
|
|
|
|
|
int cmpbk ( const void *v1, const void *v2 ) {
|
|
Binky *b1 = (Binky *)v1;
|
|
Binky *b2 = (Binky *)v2;
|
|
return b1->m_score - b2->m_score;
|
|
}
|
|
|
|
char **XmlDoc::getTitleBuf ( ) {
|
|
if ( m_titleBufValid ) return (char **)&m_titleBuf;
|
|
|
|
// recalc this every time the root page is indexed
|
|
setStatus ( "getting title buf on root");
|
|
|
|
// are we a root?
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
|
|
// this should only be called on the root!
|
|
// . if the site changed for us, but the title rec of what we
|
|
// think is now the root thinks that it is not the root because
|
|
// it is using the old site, then it cores here!
|
|
// . i.e. if the new root is www.xyz.com/user/ted/ and the old root
|
|
// is www.xyz.com and the old root is stored in ptr_site for
|
|
// the title rec for www.xyz.com/user/ted/ then we core here,
|
|
// . so take this sanity check out
|
|
// . but if the title rec does not think he is the site root yet
|
|
// then just wait until he does so we can get his
|
|
// ptr_rootTitleBuf below
|
|
if ( ! *isRoot ) {
|
|
m_titleBuf[0] = '\0';
|
|
m_titleBufSize = 0;
|
|
m_titleBufValid = true;
|
|
return (char **)&m_titleBuf;
|
|
}
|
|
|
|
// sanity check
|
|
if ( m_setFromTitleRec ) {
|
|
gbmemcpy(m_titleBuf, ptr_rootTitleBuf, size_rootTitleBuf );
|
|
m_titleBufSize = size_rootTitleBuf;
|
|
m_titleBufValid = true;
|
|
return (char **)&m_titleBuf;
|
|
}
|
|
|
|
char *mysite = getSite();
|
|
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
|
|
// get link info first
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
// error or blocked
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char **)info1;
|
|
|
|
// sanity check
|
|
Xml *xml = getXml();
|
|
// return -1 if it blocked
|
|
if ( xml == (void *)-1 ) return (char **)-1;
|
|
// set up for title
|
|
int32_t tlen ;
|
|
char *title ;
|
|
// on error, ignore it to avoid hammering the root!
|
|
if ( xml == (void *)NULL ) {
|
|
// log it
|
|
log("build: error downloading root xml: %s",
|
|
mstrerror(g_errno));
|
|
// clear it
|
|
g_errno = 0;
|
|
// make it 0
|
|
tlen = 0;
|
|
title = NULL;
|
|
}
|
|
else {
|
|
// get the title
|
|
title = m_xml.getTextForXmlTag ( 0,
|
|
999999 ,
|
|
"title" ,
|
|
&tlen ,
|
|
true ); // skip leading spaces
|
|
}
|
|
|
|
// truncate to 100 chars
|
|
//for ( ; tlen>0 && (tlen > 100 || is_alnum_a(title[tlen])) ; tlen-- )
|
|
// if ( tlen == 0 ) break;
|
|
if ( tlen > 100 ) {
|
|
char *tpend = title + 100;
|
|
char *prev = getPrevUtf8Char ( tpend , title );
|
|
// make that the end so we don't split a utf8 char
|
|
tlen = prev - title;
|
|
}
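// e.g. if byte 100 lands in the middle of a multi-byte utf8 sequence,
// getPrevUtf8Char() backs the end pointer up to the start of that
// character, so the truncated title never ends on a partial codepoint.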
|
|
|
|
// store tag in here
|
|
char tmp[1024];
|
|
// point to it
|
|
char *ptmp = tmp;
|
|
// set this
|
|
char *pend = tmp + 1024;
|
|
// add that in
|
|
gbmemcpy ( ptmp, title, tlen); ptmp += tlen;
|
|
// null terminate it
|
|
*ptmp++ = '\0';
|
|
|
|
// two votes per internal inlink
|
|
int32_t internalCount = 0;
|
|
// count inlinkers
|
|
int32_t linkNum = 0;
|
|
Binky bk[1000];
|
|
// init this
|
|
//char stbuf[2000];
|
|
//HashTableX scoreTable;
|
|
//scoreTable.set(8,4,64,stbuf,2000,false,m_niceness,"xmlscores");
|
|
// scan each link in the link info
|
|
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
|
|
// do not breach
|
|
if ( linkNum >= 1000 ) break;
|
|
// is this inlinker internal?
|
|
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
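// note: assuming ips are stored in network byte order, masking with
// 0x0000ffff compares the first two octets, so "internal" here roughly
// means the inlinker lives in the same /16 as we do.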
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// skip corrupted
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 4 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.m_url);
|
|
continue;
|
|
}
|
|
// store these
|
|
// zero out hash
|
|
bk[linkNum].m_hash = 0;
|
|
bk[linkNum].m_text = txt;
|
|
bk[linkNum].m_textLen = tlen;
|
|
bk[linkNum].m_score = 0;
|
|
// internal count
|
|
if ( internal && ++internalCount >= 3 ) continue;
|
|
// it's good
|
|
bk[linkNum].m_score = 1;
|
|
linkNum++;
|
|
/*
|
|
// set into words
|
|
Words w;
|
|
// return NULL on error with g_errno set
|
|
if ( ! w.setx ( txt , tlen , m_niceness ) ) return NULL;
|
|
// shortcut
|
|
int64_t *wids = w.getWordIds();
|
|
// init hash
|
|
int64_t h = 0LL;
|
|
// hash all words together
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if not hashable
|
|
if ( ! wids[i] ) continue;
|
|
// mix it up
|
|
h <<= 1LL;
|
|
// xor it in
|
|
h ^= wids[i];
|
|
}
|
|
// update hash
|
|
bk[linkNum].m_hash = h;
|
|
// store in table, return NULL with g_errno set on error
|
|
if ( ! scoreTable.addTerm ( &h ) ) return NULL;
|
|
*/
|
|
}
|
|
// init this
|
|
char dtbuf[1000];
|
|
HashTableX dupTable;
|
|
dupTable.set(8,0,64,dtbuf,1000,false,m_niceness,"xmldup");
|
|
// now set the scores and isdup
|
|
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
|
|
// skip if ignored
|
|
if ( bk[i].m_score == 0 ) continue;
|
|
// get hash
|
|
int64_t h = bk[i].m_hash;
|
|
// assume a dup
|
|
bk[i].m_score = 0;
|
|
// skip if zero'ed out
|
|
if ( ! h ) continue;
|
|
// only do each hash once!
|
|
if ( dupTable.isInTable(&h) ) continue;
|
|
// add to it. return NULL with g_errno set on error
|
|
if ( ! dupTable.addKey(&h) ) return NULL;
|
|
// is it in there?
|
|
bk[i].m_score = 1; // scoreTable.getScore ( &h );
|
|
}
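// note: with the word-hashing block above commented out, bk[i].m_hash is
// always 0, so every surviving entry has its score reset to 0 and then
// hits the "if ( ! h ) continue" above. as written, no inlink text makes
// it into the copy loop below and m_titleBuf ends up holding just the
// <title> tag text.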
|
|
// now sort the bk array by m_score
|
|
//gbsort ( bk , linkNum , sizeof(Binky), cmpbk , m_niceness );
|
|
|
|
// sanity check - make sure sorted right
|
|
//if ( linkNum >= 2 && bk[0].m_score < bk[1].m_score ) {
|
|
// char *xx=NULL; *xx=0; }
|
|
|
|
// . now add the winners to the buffer
|
|
// . skip if score is 0
|
|
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
|
|
// skip if score is zero
|
|
if ( bk[i].m_score == 0 ) continue;
|
|
// skip if too big
|
|
if ( bk[i].m_textLen + 1 > pend - ptmp ) continue;
|
|
// store it
|
|
gbmemcpy ( ptmp , bk[i].m_text , bk[i].m_textLen );
|
|
// advance
|
|
ptmp += bk[i].m_textLen;
|
|
// null terminate it
|
|
*ptmp++ = '\0';
|
|
}
|
|
|
|
// sanity
|
|
int32_t size = ptmp - tmp;
|
|
if ( size > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0; }
|
|
|
|
gbmemcpy ( m_titleBuf , tmp , ptmp - tmp );
|
|
m_titleBufSize = size;
|
|
m_titleBufValid = true;
|
|
// ensure null terminated
|
|
if ( size > 0 && m_titleBuf[size-1] ) { char *xx=NULL;*xx=0; }
|
|
//ptr_siteTitleBuf = m_siteTitleBuf;
|
|
//size_siteTitleBuf = m_siteTitleBufSize;
|
|
return (char **)&m_titleBuf;
|
|
}
|
|
|
|
|
|
// . now we just get all the tagdb rdb recs to add using this function
|
|
// . then we just use the metalist to update tagdb
|
|
SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
|
|
|
if ( m_newTagBufValid ) return &m_newTagBuf;
|
|
|
|
setStatus ( "getting new tags");
|
|
|
|
int32_t *ic = getIndexCode();
|
|
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get our ip
|
|
int32_t *ip = getIp();
|
|
// this must not block to avoid re-computing "addme" above
|
|
if ( ip == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! ip || ip == (int32_t *)-1) return (SafeBuf *)ip;
|
|
|
|
// . do not bother if there is a problem
|
|
// . otherwise if our ip is invalid (0 or 1) we core in
|
|
// getNumSiteInlinks() which requires a valid ip
|
|
// . if its robots.txt disallowed, then indexCode will be set, but we
|
|
// still want to cache our sitenuminlinks in tagdb! delicious.com was
|
|
// recomputing the sitelinkinfo each time because we were not storing
|
|
// these tags in tagdb!!
|
|
if ( ! *ip || *ip == -1 ) { // *ic ) {
|
|
m_newTagBuf.reset();
|
|
m_newTagBufValid = true;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
// get the tags already in tagdb
|
|
TagRec *gr = getTagRec ( );
|
|
if ( ! gr || gr == (void *)-1 ) return (SafeBuf *)gr;
|
|
|
|
// get our site
|
|
char *mysite = getSite();
|
|
// this must not block to avoid re-computing "addme" above
|
|
if ( mysite == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! mysite || mysite == (char *)-1 ) return (SafeBuf *)mysite;
|
|
|
|
// age of tag in seconds
|
|
int32_t timestamp;
|
|
|
|
// always just use the primary tagdb so we can cache our sitenuminlinks
|
|
char rdbId = RDB_TAGDB;
|
|
//if ( m_useSecondaryRdbs ) rdbId = RDB2_TAGDB2;
|
|
//else rdbId = RDB_TAGDB;
|
|
|
|
// sitenuminlinks special for repair
|
|
if ( m_useSecondaryRdbs &&
|
|
// and not rebuilding titledb
|
|
! m_useTitledb ) {
|
|
m_newTagBuf.reset();
|
|
m_newTagBufValid = true;
|
|
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,×tamp);
|
|
if ( old1 == m_siteNumInlinks &&
|
|
old1 != -1 &&
|
|
! m_updatingSiteLinkInfoTags )
|
|
return &m_newTagBuf;
|
|
int32_t now = getTimeGlobal();
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"",
|
|
mysite,m_siteNumInlinks);
|
|
if ( ! m_newTagBuf.addTag2(mysite,"sitenuminlinks",now,
|
|
"xmldoc",
|
|
*ip,m_siteNumInlinks,rdbId) )
|
|
return NULL;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
// if doing consistency check, this buf is for adding to tagdb
|
|
// so just ignore those. we use ptr_tagRecData in getTagRec() function
|
|
// but this is really for updating tagdb.
|
|
if ( m_doingConsistencyCheck ) {
|
|
m_newTagBuf.reset();
|
|
m_newTagBufValid = true;
|
|
return &m_newTagBuf;
|
|
}
|
|
|
|
Xml *xml = getXml();
|
|
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
|
|
|
|
char *isIndexed = getIsIndexed();
|
|
if ( !isIndexed || isIndexed==(char *)-1 ) return (SafeBuf *)isIndexed;
|
|
|
|
char *isRoot = getIsSiteRoot();
|
|
if ( ! isRoot || isRoot == (char *)-1 ) return (SafeBuf *)isRoot;
|
|
|
|
int32_t *siteNumInlinks = getSiteNumInlinks();
|
|
if ( ! siteNumInlinks ) return NULL;
|
|
if ( siteNumInlinks == (int32_t *)-1) return (SafeBuf *)-1;
|
|
|
|
// ok, get the sites of the external outlinks and they must
|
|
// also be NEW outlinks, added to the page since the last time
|
|
// we spidered it...
|
|
Links *links = getLinks ();
|
|
if ( ! links || links == (Links *)-1 ) return (SafeBuf *)links;
|
|
|
|
// our next slated spider priority
|
|
char *spiderLinks = getSpiderLinks();
|
|
if ( ! spiderLinks || spiderLinks == (char *)-1 )
|
|
return (SafeBuf *)spiderLinks;
|
|
|
|
// . get ips of all outlinks.
|
|
// . use m_msgeForIps class just for that
|
|
// . it sucks if the outlink's ip is a dns timeout, then we never
|
|
// end up being able to store it in tagdb; that is why when
|
|
// rebuilding we need to skip adding firstip tags for the outlinks
|
|
int32_t **ipv = NULL;
|
|
TagRec ***grv = NULL;
|
|
bool addLinkTags = true;
|
|
if ( ! *spiderLinks ) addLinkTags = false;
|
|
if ( ! m_useSpiderdb ) addLinkTags = false;
|
|
if ( addLinkTags ) {
|
|
ipv = getOutlinkFirstIpVector ();
|
|
if ( ! ipv || ipv == (void *)-1 ) return (SafeBuf *)ipv;
|
|
// . uses m_msgeForTagRecs for this one
|
|
grv = getOutlinkTagRecVector();
|
|
if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
|
|
}
|
|
|
|
// get root langid of root page
|
|
uint8_t *rl = getRootLangId();
|
|
if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
|
|
|
|
char *hci = getHasContactInfo();
|
|
if ( ! hci || hci == (char *)-1 ) return (SafeBuf *)hci;
|
|
|
|
// get the address class
|
|
Addresses *aa = getAddresses ();
|
|
if ( ! aa || aa == (Addresses *)-1 ) return (SafeBuf *)aa;
|
|
|
|
// get comma separated list of email address on page
|
|
char *emails = getEmailBuf ( );
|
|
if ( ! emails || emails == (void *)-1 ) return (SafeBuf *)emails;
|
|
|
|
#ifdef _USETURKS_
|
|
//HashTableX *tvt = getTurkVotingTable ();
|
|
//if ( ! tvt || tvt == (void *)-1 ) return (SafeBuf *)tvt;
|
|
#endif
|
|
|
|
//
|
|
// init stuff
|
|
//
|
|
|
|
// . this gets the root doc and parses titles out of it
|
|
// . sets our m_rootTitleBuf/m_rootTitleBufSize
|
|
char **rtbufp = getRootTitleBuf();
|
|
if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf *)rtbufp;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// overwrite "getting root title buf" status
|
|
setStatus ("computing new tags");
|
|
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: adding tags for mysite=%s",mysite);
|
|
|
|
// shortcut
|
|
//TagRec *tr = &m_newTagRec;
|
|
// current time
|
|
int32_t now = getTimeGlobal();
|
|
// actually, use spider download time if we can. that way
|
|
// Test.cpp's injection runs will be more consistent!
|
|
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
now = getSpideredTime();//m_spideredTime;
|
|
}
|
|
// store tags into here
|
|
SafeBuf *tbuf = &m_newTagBuf;
|
|
// allocate space to hold the tags we will add
|
|
Tag *tag;
|
|
int32_t need = 512;
|
|
// add in root title buf in case we add it too
|
|
need += m_rootTitleBufSize;
|
|
// reserve it all now
|
|
if ( ! tbuf->reserve(need) ) return NULL;
|
|
|
|
|
|
|
|
//
|
|
// add root langid if we need to
|
|
//
|
|
char *oldrl = gr->getString("rootlang",NULL,×tamp);
|
|
// assume no valid id
|
|
int32_t oldrlid = -99;
|
|
// convert to id
|
|
if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
|
|
|
|
// if not in old tag, or changed from what was in tag, or it has
|
|
// been 10 days or more, then update tagdb with this tag.
|
|
bool addRootLang = false;
|
|
if ( ! oldrl ) addRootLang = true;
|
|
if ( oldrlid != *rl ) addRootLang = true;
|
|
if ( now-timestamp > 10*86400 ) addRootLang = true;
|
|
// injects do not download the root doc for speed reasons, so do not
|
|
// bother for them unless the doc itself is the root.
|
|
if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
|
|
// . get the two letter (usually) language code from the id
|
|
// . i think the two chinese languages are 5 letters
|
|
char *newrl = NULL;
|
|
if ( addRootLang )
|
|
// i've seen this return NULL because *rl is a corrupt 215
|
|
// for some reason
|
|
newrl = getLanguageAbbr( *rl );
|
|
|
|
if ( newrl )
|
|
tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
|
|
|
|
//
|
|
// add hascontactinfo if we need to
|
|
//
|
|
int32_t oldhci = gr->getLong("hascontactinfo",-1,NULL,×tamp);
|
|
if ( oldhci == -1 || oldhci != *hci || now-timestamp > 10 *86400 ) {
|
|
char *val = "0";
|
|
if ( m_hasContactInfo ) val = "1";
|
|
tbuf->addTag3 (mysite,"hascontactinfo",now,"xmldoc",*ip,val,
|
|
rdbId);
|
|
}
|
|
//
|
|
// add "site" tag
|
|
//
|
|
char *oldsite = gr->getString("site",NULL);
|
|
if ( ! oldsite || strcmp(oldsite,mysite) || now-timestamp > 10*86400)
|
|
tbuf->addTag3(mysite,"site",now,"xmldoc",*ip,mysite,rdbId);
|
|
|
|
//
|
|
// add firstip if not there at all
|
|
//
|
|
char *oldfip = gr->getString("firstip",NULL);
|
|
// convert it
|
|
int32_t ip3 = 0;
|
|
if ( oldfip ) ip3 = atoip(oldfip);
|
|
// if not there or if bogus, add it!! should override bogus firstips
|
|
if ( ! ip3 || ip3 == -1 ) {
|
|
char *ipstr = iptoa(m_ip);
|
|
//if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0; }
|
|
//int32_t iplen = gbstrlen(ipstr);
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
tbuf->addTag3(mysite,"firstip",now,"xmldoc",*ip,ipstr,
|
|
rdbId);
|
|
}
|
|
|
|
//if ( strncmp(m_firstUrl.m_url,"http://delicious.com/",21)==0 )
|
|
// log("boo");
|
|
|
|
// sitenuminlinks
|
|
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,×tamp);
|
|
if ( old1 == -1 || old1 != m_siteNumInlinks ||
|
|
m_updatingSiteLinkInfoTags ) {
|
|
if ( g_conf.m_logDebugLinkInfo )
|
|
log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"",
|
|
mysite,m_siteNumInlinks);
|
|
if ( ! tbuf->addTag2(mysite,"sitenuminlinks",now,"xmldoc",
|
|
*ip,m_siteNumInlinks,rdbId) )
|
|
return NULL;
|
|
}
|
|
|
|
//int32_t old2, old3, old4;
|
|
|
|
// if running for diffbot crawlbot then isCustomCrawl is true
|
|
// so do not update the siteinlink info already in tagdb since i
|
|
// imported it from my main collection. we do not want to overwrite it.
|
|
// NO, because for single site crawls we bottleneck on msg25
|
|
// when there are millions of urls. we only skip this
|
|
// for the global-index and if already in tagdb!
|
|
// No, let's just not invalidate the sitenuminlinks* tags
|
|
// in XmlDoc::getSiteNumInlinks()
|
|
//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks;
|
|
|
|
// sitenuminlinksfresh
|
|
// old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,×tamp);
|
|
// if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
|
|
// m_updatingSiteLinkInfoTags )
|
|
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
|
|
// now,"xmldoc",
|
|
// *ip,m_siteNumInlinksUniqueIp,rdbId))
|
|
// return NULL;
|
|
// // sitepop
|
|
// old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
|
|
// ×tamp);
|
|
// if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
|
|
// m_updatingSiteLinkInfoTags )
|
|
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
|
|
// now,"xmldoc",
|
|
// *ip,m_siteNumInlinksUniqueCBlock,rdbId))
|
|
// return NULL;
|
|
// // total site inlinks
|
|
// old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
|
|
// ×tamp);
|
|
// if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
|
|
// m_updatingSiteLinkInfoTags )
|
|
// if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
|
|
// now,"xmldoc",
|
|
// *ip,m_siteNumInlinksTotal,rdbId))
|
|
// return NULL;
|
|
|
|
// skipSiteInlinks:
|
|
|
|
// get root title buf from old tag
|
|
char *data = NULL;
|
|
int32_t dsize = 0;
|
|
Tag *rt = gr->getTag("roottitles");
|
|
if ( rt ) {
|
|
data = rt->getTagData();
|
|
dsize = rt->getTagDataSize();
|
|
}
|
|
|
|
bool addRootTitle = false;
|
|
// store the root title buf if we need to. if we had no tag yet...
|
|
if ( ! rt )
|
|
addRootTitle = true;
|
|
// or if differs in size
|
|
else if ( dsize != m_rootTitleBufSize )
|
|
addRootTitle = true;
|
|
// or if differs in content
|
|
else if ( memcmp(data,m_rootTitleBuf,m_rootTitleBufSize))
|
|
addRootTitle =true;
|
|
// or if it is 10 days old or more
|
|
if ( now-timestamp > 10*86400 ) addRootTitle = true;
|
|
// but not if injected
|
|
if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false;
|
|
// add it then
|
|
if ( addRootTitle &&
|
|
! tbuf->addTag(mysite,"roottitles",now,"xmldoc",
|
|
*ip,m_rootTitleBuf,m_rootTitleBufSize,
|
|
rdbId,true) )
|
|
return NULL;
//
|
|
// add the VENUEADDRESS tags
|
|
//
|
|
|
|
// init the dedup table so we do not add the same address many times
|
|
char dtbuf[1000];
|
|
HashTableX dt;
|
|
dt.set(8,0,32,dtbuf,1000,false,m_niceness,"xmldt");
|
|
// reset counts
|
|
int32_t numContactAddressTags = 0;
|
|
int32_t numContactEmailTags = 0;
|
|
int32_t tagType2 = getTagTypeFromStr ( "contactaddress" );
|
|
int32_t tagType3 = getTagTypeFromStr ( "contactemails" );
|
|
// before we add the sitevenue to the tagrec let's make sure it is
|
|
// not a dup.. i.e. that we do not already have this address
|
|
// in there.
|
|
int32_t tagType = getTagTypeFromStr ( "venueaddress" );
|
|
// start at the first tag
|
|
tag = gr->getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
|
|
// count current contact addresses we have
|
|
if ( tag->m_type == tagType2 ) numContactAddressTags++;
|
|
if ( tag->m_type == tagType3 ) numContactEmailTags++;
|
|
// skip if not a venueaddress tag
|
|
if ( tag->m_type != tagType ) continue;
|
|
// point to the serialized address
|
|
char *data = tag->getTagData();
|
|
// get that address hash i guess
|
|
uint64_t ah = getHashFromAddr ( data );
|
|
// add to dedup table - return NULL with g_errno set on error
|
|
if ( ! dt.addKey ( &ah ) ) return NULL;
|
|
}
|
|
int32_t na = aa->getNumAddresses();
|
|
// add up to 10 for now
|
|
for ( int32_t i = 0 ; i < na ; i++ ) {
|
|
// get it
|
|
Address *a = (Address *)aa->m_am.getPtr(i);
|
|
// check if venue
|
|
if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue;
|
|
// must have street on the page, not pointing into a tagrec
|
|
// from tagdb... otherwise we keep re-adding
|
|
if ( a->m_street->m_a < 0 ) continue;
|
|
// dedup! dedup against
|
|
// addresses in tagdb for venueaddress tag. can we use
|
|
// the dc[] array from Address.cpp... we need another
|
|
// set of bit flags for address class:
|
|
if ( dt.isInTable ( &a->m_hash ) ) continue;
|
|
// sanity
|
|
if ( a->m_hash == 0 ) { char *xx=NULL;*xx=0; }
|
|
// . serialize it
|
|
// . TODO: get rid of Address::addToTagRec() functions
|
|
char abuf[5000];
|
|
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
|
|
// store in safebuf of tags
|
|
if ( ! tbuf->addTag3 (mysite,"venueaddress",now,"xmldoc",
|
|
*ip,abuf,rdbId) ) return NULL;
|
|
// only add once
|
|
if ( ! dt.addKey (&a->m_hash) ) return NULL;
|
|
}
|
|
|
|
//
|
|
//
|
|
// contact info stuff
|
|
//
|
|
//
|
|
|
|
// ensure m_numContactAddresses etc. are valid
|
|
Address **ca = getContactAddresses();
|
|
// blocked?
|
|
if ( ! ca || ca == (void *)-1 ) return (SafeBuf *)ca;
|
|
|
|
// do not do this for root if multiple addresses. this
|
|
// fixes http://obits.abqjournal.com/
|
|
if ( *isRoot && aa->m_uniqueStreetHashes > 1 ) na = 0;
|
|
|
|
// do not store more than 2 contact addresses, or 2 contact emails
|
|
// to avoid tagdb bloat. and also because we do not need that many.
|
|
|
|
// . store contact address if we had one
|
|
// . this is a buffer of Address ptrs
|
|
for ( int32_t i = 0 ; i < m_numContactAddresses ; i++ ) {
|
|
// stop on breach
|
|
if ( numContactAddressTags >= 2 ) break;
|
|
// inc it
|
|
numContactAddressTags++;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *a = ca[i];
|
|
// . serialize it
|
|
// . TODO: get rid of Address::addToTagRec() functions
|
|
char abuf[5000];
|
|
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
|
|
// store in safebuf of tags
|
|
if ( ! tbuf->addTag3 (mysite,"contactaddress",now,"xmldoc",
|
|
*ip,abuf,rdbId) ) return NULL;
|
|
}
|
|
|
|
// . add email addresses and submission forms to tag
|
|
// . this does not block, so make sure only called once!
|
|
// . contact emails. comma separated list
|
|
if ( emails && numContactEmailTags <= 1 ) {
|
|
numContactEmailTags++;
|
|
if ( ! tbuf->addTag3 (mysite,"contactemails",now,"xmldoc",
|
|
*ip,emails,rdbId) ) return NULL;
|
|
}
//
|
|
//
|
|
// NOW add tags for our outlinks
|
|
//
|
|
//
|
|
|
|
bool oldHighQualityRoot = true;
|
|
// if we are new, do not add anything, because we only add a tagdb
|
|
// rec entry for "new" outlinks that were added to the page since
|
|
// the last time we spidered it
|
|
if ( ! *isIndexed ) oldHighQualityRoot = false;
|
|
// special tags for google search results pages for scraping
|
|
char inGoogle = false;
|
|
if ( strstr(mysite,"google.com") ) inGoogle = true;
|
|
// no updating if we are not root
|
|
if ( ! inGoogle && ! *isRoot ) oldHighQualityRoot = false;
|
|
// must be high quality, too
|
|
if ( ! inGoogle && *siteNumInlinks < 500 ) oldHighQualityRoot = false;
|
|
// . if we are a google url then add tags for each outlink!
|
|
// . more google special tags to replace Scraper.cpp
|
|
char *fu = m_firstUrl.getUrl();
|
|
//char *name = NULL;
|
|
bool inGoogleBlogs = false;
|
|
bool inGoogleNews = false;
|
|
if ( ! strncmp ( fu , "http://www.google.com/blogsearch?", 33 ) )
|
|
inGoogleBlogs = true;
|
|
if ( ! strncmp ( fu , "http://blogsearch.google.com/blogsearch?", 40 ))
|
|
inGoogleBlogs = true;
|
|
if ( ! strncmp ( fu , "http://news.google.com/", 23 ))
|
|
inGoogleNews = true;
|
|
// only do once per site
|
|
char buf[1000];
|
|
HashTableX ht; ht.set (4,0,-1 , buf , 1000 ,false,m_niceness,"sg-tab");
|
|
// get site of outlink
|
|
SiteGetter siteGetter;
|
|
// . must be from an EXTERNAL DOMAIN and must be new
|
|
// . we should already have its tag rec, if any, since we have msge
|
|
int32_t n = links->getNumLinks();
|
|
// not if not spidering links
|
|
if ( ! addLinkTags ) n = 0;
|
|
// get the flags
|
|
linkflags_t *flags = links->m_linkFlags;
|
|
// scan all outlinks we have on this page
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
// get its tag rec
|
|
TagRec *gr = (*grv)[i];
|
|
|
|
// does this hostname have a "firstIp" tag?
|
|
char *ips = gr->getString("firstip",NULL);
|
|
|
|
bool skip = false;
|
|
// skip if we are not "old" high quality root
|
|
if ( ! oldHighQualityRoot ) skip = true;
|
|
// . skip if not external domain
|
|
// . we added this above, so just "continue"
|
|
if ( flags[i] & LF_SAMEDOM ) continue;//skip = true;
|
|
// skip links in the old title rec
|
|
if ( flags[i] & LF_OLDLINK ) skip = true;
|
|
// skip if determined to be link spam! should help us
|
|
// with the text ads we hate so much
|
|
if ( links->m_spamNotes[i] ) skip = true;
|
|
|
|
// if we should skip, and they have firstip already...
|
|
if ( skip && ips ) continue;
|
|
|
|
// get the normalized url
|
|
char *url = links->getLinkPtr(i);
|
|
// get the site. this will not block or have an error.
|
|
siteGetter.getSite(url,gr,timestamp,cr->m_collnum,m_niceness);
|
|
// these are now valid and should reference into
|
|
// Links::m_buf[]
|
|
char *site = siteGetter.m_site;
|
|
int32_t siteLen = siteGetter.m_siteLen;
|
|
|
|
int32_t linkIp = (*ipv)[i];
|
|
|
|
// get site hash
|
|
uint32_t sh = hash32 ( site , siteLen );
|
|
// ensure site is unique
|
|
if ( ht.getSlot ( &sh ) >= 0 ) continue;
|
|
// add it. returns false and sets g_errno on error
|
|
if ( ! ht.addKey ( &sh ) ) return NULL;
|
|
|
|
// . need to add firstip tag for this link's subdomain?
|
|
// . this was in Msge1.cpp but now we do it here
|
|
if ( ! ips && linkIp && linkIp != -1 ) {
|
|
// make it
|
|
char *ips = iptoa(linkIp);
|
|
if (!tbuf->addTag3(site,"firstip",now,"xmldoc",*ip,ips,
|
|
rdbId))
|
|
return NULL;
|
|
}
|
|
|
|
if ( skip ) continue;
|
|
|
|
// if outlink is a .gov or .edu site, do not bother, because
|
|
// getIsSpam() always returns false for those
|
|
// TODO: verify this
|
|
//if ( flags[i] & LF_EDUTLD ) continue;
|
|
//if ( flags[i] & LF_GOVTLD ) continue;
|
|
// this must be valid
|
|
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
|
//int32_t timestamp = m_spideredTime;
|
|
|
|
// how much avail for adding tags?
|
|
int32_t avail = tbuf->getAvail();
|
|
// reserve space
|
|
int32_t need = 512;
|
|
// make sure enough
|
|
if ( need > avail && ! tbuf->reserve ( need ) ) return NULL;
|
|
|
|
// add tag for this outlink
|
|
if ( inGoogle ) {// && ! gr->getTag("ingoogle") ) {
|
|
if ( ! tbuf->addTag(site,"ingoogle",now,"xmldoc",
|
|
*ip,"1",2,rdbId,true) )
|
|
return NULL;
|
|
}
|
|
if ( inGoogleBlogs && //! gr->getTag("ingoogleblogs") &&
|
|
!tbuf->addTag(site,"ingoogleblogs",now,"xmldoc",*ip,"1",2,
|
|
rdbId,true))
|
|
return NULL;
|
|
if ( inGoogleNews && //! gr->getTag("ingooglenews") &&
|
|
!tbuf->addTag(site,"ingooglenews",now,"xmldoc",*ip,"1",2,
|
|
rdbId,true))
|
|
return NULL;
|
|
// link is linked to by a high quality site! 500+ inlinks.
|
|
if ( gr->getNumTagTypes("authorityinlink") < 5 &&
|
|
! tbuf->addTag(site,"authorityinlink",now,"xmldoc",
|
|
*ip,"1",2,rdbId,true) )
|
|
return NULL;
|
|
}
|
|
|
|
m_newTagBufValid = true;
|
|
return &m_newTagBuf;
|
|
}
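
// . the tag updates above all follow the same freshness rule: re-add a tag
//   if it is missing, if its value changed, or if the stored copy is 10+
//   days old
// . a minimal sketch of that rule (illustrative only, kept out of the build;
//   the real checks are inlined per-tag so they can also consult things like
//   m_updatingSiteLinkInfoTags)
#if 0
static bool needsTagRefresh ( bool haveOldTag ,
			      bool valueChanged ,
			      int32_t tagTimestamp ,
			      int32_t now ) {
	if ( ! haveOldTag ) return true;
	if ( valueChanged ) return true;
	// 10 days, in seconds
	if ( now - tagTimestamp > 10*86400 ) return true;
	return false;
}
#endif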
//
|
|
//
|
|
// BEGIN OLD SPAM.CPP class
|
|
//
|
|
//
|
|
|
|
#define WTMPBUFSIZE (MAX_WORDS *21*3)
|
|
|
|
// . RULE #28, repetitive word/phrase spam detector
|
|
// . set's the "spam" member of each word from 0(no spam) to 100(100% spam)
|
|
// . "bits" describe each word in phrasing terminology
|
|
// . if more than maxPercent of the words are spammed to some degree then we
|
|
// consider all of the words to be spammed, and give each word the minimum
|
|
// score possible when indexing the document.
|
|
// . returns false and sets g_errno on error
|
|
char *XmlDoc::getWordSpamVec ( ) {
|
|
|
|
if ( m_wordSpamBufValid ) {
|
|
char *wbuf = m_wordSpamBuf.getBufStart();
|
|
if ( ! wbuf ) return (char *)0x01;
|
|
return wbuf;
|
|
}
|
|
|
|
setStatus("getting word spam vec");
|
|
|
|
// assume not the repeat spammer
|
|
m_isRepeatSpammer = false;
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (char *)words;
|
|
|
|
m_wordSpamBuf.purge();
|
|
|
|
int32_t nw = words->getNumWords();
|
|
if ( nw <= 0 ) {
|
|
m_wordSpamBufValid = true;
|
|
return (char *)0x01;
|
|
}
|
|
|
|
Phrases *phrases = getPhrases ();
|
|
if ( ! phrases || phrases == (void *)-1 ) return (char *)phrases;
|
|
Bits *bits = getBits();
|
|
if ( ! bits ) return (char *)NULL;
|
|
|
|
m_wordSpamBufValid = true;
|
|
|
|
//if ( m_isLinkText ) return true;
|
|
//if ( m_isCountTable ) return true;
|
|
|
|
// shortcuts
|
|
//Words *words = m_words;
|
|
//Bits *bits = m_bits;
|
|
|
|
// if a word makes up 20% or more of the doc, call all but its first occurrence spam
|
|
m_numRepeatSpam = 20;
|
|
|
|
// shortcut
|
|
int32_t sni = m_siteNumInlinks;
|
|
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// set "m_maxPercent"
|
|
int32_t maxPercent = 6;
|
|
if ( sni > 10 ) maxPercent = 8;
|
|
if ( sni > 30 ) maxPercent = 10;
|
|
if ( sni > 100 ) maxPercent = 20;
|
|
if ( sni > 500 ) maxPercent = 30;
|
|
// fix this a bit so we're not always totally spammed
|
|
maxPercent = 25;
|
|
|
|
// assume not totally spammed
|
|
m_totallySpammed = false;
|
|
// get # of words we have to set spam for
|
|
int32_t numWords = words->getNumWords();
|
|
|
|
// set up the size of the hash table (number of buckets)
|
|
int32_t size = numWords * 3;
|
|
|
|
// . add a tmp buf as a scratch pad -- will be freed right after
|
|
// . allocate this second to avoid mem fragmentation more
|
|
// . * 2 for double the buckets
|
|
char tmpBuf [ WTMPBUFSIZE ];
|
|
char *tmp = tmpBuf;
|
|
int32_t need = (numWords * 21) * 3 + numWords;
|
|
if ( need > WTMPBUFSIZE ) {
|
|
tmp = (char *) mmalloc ( need , "Spam" );
|
|
if ( ! tmp ) {
|
|
log("build: Failed to allocate %"INT32" more "
|
|
"bytes for spam detection: %s.",
|
|
need,mstrerror(g_errno));
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
// set up ptrs
|
|
char *p = tmp;
|
|
// first this
|
|
unsigned char *spam = (unsigned char *)p; p += numWords ;
|
|
// . this allows us to make linked lists of indices of words
|
|
// . i.e. next[13] = 23--> word #23 FOLLOWS word #13 in the linked list
|
|
int32_t *next = (int32_t *)p; p += size * 4;
|
|
// hash of this word's stem (or word itself if useStem if false)
|
|
int64_t *bucketHash = (int64_t *)p; p += size * 8;
|
|
// that word's position in document
|
|
int32_t *bucketWordPos = (int32_t *)p; p += size * 4;
|
|
// profile of a word
|
|
int32_t *profile = (int32_t *)p; p += size * 4;
|
|
// is it a common word?
|
|
char *commonWords = (char *)p; p += size * 1;
|
|
|
|
// sanity check
|
|
if ( p - tmp > need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// clear all our spam percentages for these words
|
|
memset ( spam , 0 , numWords );
|
|
|
|
int32_t np;
|
|
// clear the hash table
|
|
int32_t i;
|
|
for ( i = 0 ; i < size ; i++ ) {
|
|
bucketHash [i] = 0;
|
|
bucketWordPos[i] = -1;
|
|
commonWords [i] = 0;
|
|
}
|
|
|
|
// count position since Words class can now have tags in it
|
|
//
|
|
//int32_t pos = 0;
|
|
//bool usePos = false;
|
|
//if ( words->m_tagIds ) usePos = true;
|
|
|
|
int64_t *wids = words->getWordIds();
|
|
|
|
// . loop through each word
|
|
// . hash their stems and place in linked list
|
|
// . if no stemming then don't do stemming
|
|
for ( i = 0 ; i < numWords ; i++ ) {
|
|
// . skip punctuation
|
|
// . this includes tags now , too i guess
|
|
//if ( words->isPunct(i) ) continue;
|
|
if ( wids[i] == 0 ) continue;
|
|
// skip if will not be indexed cuz score is too low
|
|
//if ( wscores && wscores[i] <= 0 ) continue;
|
|
QUICKPOLL(m_niceness);
|
|
// TODO: get phrase stem if stemming is on
|
|
// store the phrase stem for this word into the buffer
|
|
// blen = words->getPhraseStem(i,buf,100);
|
|
// if (blen<=0) continue;
|
|
// get the hash of the ith word
|
|
int64_t h = words->getWordId(i);
|
|
// use secondary wordId if available
|
|
//if ( words->getStripWordId(i) )
|
|
// h = words->getStripWordId(i);
|
|
// "j" is the bucket index
|
|
int32_t j = (uint64_t)h % size;
|
|
// make sure j points to the right bucket
|
|
while (bucketHash[j]) {
|
|
if ( h == bucketHash[j] ) break;
|
|
if (++j == size) j = 0;
|
|
}
|
|
// if this bucket is occupied by a word then replace it but
|
|
// make sure it adds onto the "linked list"
|
|
if (bucketHash[j]) {
|
|
// if Words class contain tags as words, do this
|
|
//if ( usePos ) {
|
|
// next [pos] = bucketWordPos[j];
|
|
// bucketWordPos[ j] = pos++;
|
|
//}
|
|
//else {
|
|
// add onto linked list for the ith word
|
|
next[i] = bucketWordPos[j];
|
|
// replace bucket with index to this word
|
|
bucketWordPos[j] = i;
|
|
//}
|
|
}
|
|
// otherwise, we have a new occurrence of this word
|
|
else {
|
|
bucketHash [j] = h;
|
|
// if Words class contain tags as words, do this
|
|
//if ( usePos ) {
|
|
// bucketWordPos[ j] = pos++;
|
|
// next [pos] = -1;
|
|
//}
|
|
//else {
|
|
// store our position # (i) in bucket
|
|
bucketWordPos[j] = i;
|
|
// no next occurrence of the ith word yet
|
|
next[i] = -1;
|
|
//}
|
|
}
|
|
// if stop word or number then mark it
|
|
if ( bits->isStopWord(i) ) commonWords[j] = 1;
|
|
if ( words->isNum ( i ) ) commonWords[j] = 1;
|
|
}
|
|
// count distinct candidates that had spam and did not have spam
|
|
int32_t spamWords = 0;
|
|
int32_t goodWords = 0;
|
|
// . now cruise down the hash table looking for filled buckets
|
|
// . grab the linked list of indices and make a "profile"
|
|
for ( i = 0 ; i < size ; i++ ) {
|
|
// skip empty buckets
|
|
if (bucketHash[i] == 0) continue;
|
|
np=0;
|
|
// word #j is in bucket #i
|
|
int32_t j = bucketWordPos[i];
|
|
// . cruise down the linked list for this word
|
|
while ( j!=-1) {
|
|
// store position of occurrence of this word in profile
|
|
profile [ np++ ] = j;
|
|
// get the position of the next occurrence of this word
|
|
j = next[ j ];
|
|
}
|
|
// if 2 or fewer occurrences of this word, don't check for spam
|
|
if ( np < 3 ) { goodWords++; continue; }
|
|
|
|
//
|
|
// set m_isRepeatSpammer
|
|
//
|
|
// look for a word repeated in phrases, in a big list,
|
|
// where each phrase is different
|
|
//
|
|
int32_t max = 0;
|
|
int32_t count = 0;
|
|
int32_t knp = np;
|
|
// must be 3+ letters, not a stop word, not a number
|
|
if ( words->m_wordLens[profile[0]] <= 2 || commonWords[i] )
|
|
knp = 0;
|
|
// scan to see if they are a tight list
|
|
for ( int32_t k = 1 ; k < knp ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// are they close together? if not, bail
|
|
if ( profile[k-1] - profile[k] >= 25 ) {
|
|
count = 0;
|
|
continue;
|
|
}
|
|
// otherwise inc it
|
|
count++;
|
|
// must have another word in between or tag
|
|
int32_t a = profile[k];
|
|
int32_t b = profile[k-1];
|
|
bool gotSep = false;
|
|
bool inLink = false;
|
|
for ( int32_t j = a+1 ; j <b ; j++ ) {
|
|
// if in link do not count, chinese spammer
|
|
// does not have his crap in links
|
|
if ( words->m_words[j][0] == '<' &&
|
|
words->m_wordLens[j]>=3 ) {
|
|
// get the next char after the <
|
|
char nc;
|
|
nc=to_lower_a(words->m_words[j][1]);
|
|
// now check it for anchor tag
|
|
if ( nc == 'a' ) {
|
|
inLink = true; break; }
|
|
}
|
|
if ( words->m_words[j][0] == '<' )
|
|
gotSep = true;
|
|
if ( is_alnum_a(words->m_words[j][0]) )
|
|
gotSep = true;
|
|
}
|
|
// . the chinese spammer always has a separator,
|
|
// usually another tag
|
|
// . and fix "BOW BOW BOW..." which has no separators
|
|
if ( ! gotSep ) count--;
|
|
else if ( inLink ) count--;
|
|
// get the max
|
|
if ( count > max ) max = count;
|
|
}
|
|
// a count of 50 such monsters indicates the chinese spammer
|
|
if ( max >= 50 )
|
|
m_isRepeatSpammer = true;
|
|
//
|
|
// end m_isRepeatSpammer detection
|
|
//
|
|
|
|
// . determine the probability this word was spammed by looking
|
|
// at the distribution of its positions in the document
|
|
// . sets "spam" member of each word in this profile
|
|
// . don't check if word occurred 2 or less times
|
|
// . TODO: what about TORA! TORA! TORA!
|
|
// . returns true if 1+ occurrences were considered spam
|
|
QUICKPOLL(m_niceness);
|
|
bool isSpam = setSpam ( profile , np , numWords , spam );
|
|
// don't count stop words or numbers towards this threshold
|
|
if ( commonWords[i] ) continue;
|
|
// tally them up
|
|
if ( isSpam ) spamWords++;
|
|
else goodWords++;
|
|
}
|
|
// what percent of distinct candidate words were spammed?
|
|
int32_t totalWords = spamWords + goodWords;
|
|
// if no or very few words return true
|
|
int32_t percent;
|
|
if ( totalWords <= 10 ) goto done;
|
|
percent = ( spamWords * 100 ) / totalWords;
|
|
// if more than maxPercent of words were spammed, punish everybody now to 100% spam
|
|
// if we had < 100 candidates and < 20% spam, don't bother
|
|
//if ( percent < 5 ) goto done;
|
|
if ( percent <= maxPercent ) goto done;
|
|
// set flag so linkspam.cpp can see if all is spam and will not allow
|
|
// this page to vote
|
|
m_totallySpammed = true;
|
|
// now only set to 99 so each singleton usually gets hashed
|
|
for ( i = 0 ; i < numWords ; i++ )
|
|
if ( words->getWordId(i) && spam[i] < 99 )
|
|
spam[i] = 99;
|
|
done:
|
|
|
|
// update the weights for the words
|
|
//for ( i = 0 ; i < numWords ; i++ ) {
|
|
// m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
|
|
//}
|
|
|
|
// TODO: use the min word spam algo as in Phrases.cpp for this!
|
|
//for ( i = 0 ; i < numWords ; i++ ) {
|
|
// m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100;
|
|
//}
|
|
|
|
// convert from percent spammed into rank.. from 0 to 10 i guess
|
|
for ( i = 0 ; i < numWords ; i++ )
|
|
spam[i] = (MAXWORDSPAMRANK * (100 - spam[i])) / 100;
|
|
|
|
// copy into our buffer
|
|
if ( ! m_wordSpamBuf.safeMemcpy ( (char *)spam , numWords ) )
|
|
return NULL;
|
|
|
|
// free our temporary table stuff
|
|
if ( tmp != tmpBuf ) mfree ( tmp , need , "Spam" );
|
|
|
|
return m_wordSpamBuf.getBufStart();
|
|
}
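
// . getWordSpamVec() above chains every occurrence of a word together with
//   the parallel bucketWordPos[]/next[] arrays and then walks that chain to
//   build a "profile" (the positions where the word occurs, newest first)
// . a stripped-down sketch of that chaining idea for a single word id
//   (illustrative only, kept out of the build; the real code hashes all
//   words into one open-addressed table)
#if 0
static int32_t collectPositionsSketch ( int64_t *wids , int32_t nw ,
					int64_t targetWid ,
					int32_t *profile ,
					int32_t maxProfile ) {
	int32_t next[1024];            // next[i] = previous occurrence of wids[i]
	int32_t head = -1;             // like bucketWordPos[j] above
	if ( nw > 1024 ) nw = 1024;
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		if ( wids[i] != targetWid ) continue;
		next[i] = head;        // chain onto the prior occurrence
		head    = i;           // newest occurrence becomes the head
	}
	// walk the chain; positions come out in descending order, which is
	// why the profile[k-1] - profile[k] >= 25 test above works
	int32_t np = 0;
	for ( int32_t j = head ; j != -1 && np < maxProfile ; j = next[j] )
		profile[np++] = j;
	return np;
}
#endif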
// . a "profile" is an array of all the positions of a word in the document
|
|
// . a "position" is just the word #, like first word, word #8, etc...
|
|
// . we map "each" subProfile to a probability of spam (from 0 to 100)
|
|
// . if the profile is really big we get really slow (O(n^2)) iterating through
|
|
// many subProfiles
|
|
// . so after the first 25 words, it's automatically considered spam
|
|
// . return true if one word was spammed w/ probability > 20%
|
|
bool XmlDoc::setSpam ( int32_t *profile, int32_t plen , int32_t numWords ,
|
|
unsigned char *spam ) {
|
|
// don't bother detecting spam if there are 2 or fewer occurrences of the word
|
|
if ( plen < 3 ) return false;
|
|
int32_t i;
|
|
// if we have more than 10 words and this word is 20% or more of
|
|
// them then all but the first occurrence is spammed
|
|
//log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam);
|
|
if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) {
|
|
for (i=1; i<plen; i++) spam[profile[i]] = 100;
|
|
return true ;
|
|
}
|
|
// . over 50 repeated words is ludicrous
|
|
// . set all past 50 to spam and continue detecting
|
|
// . no, our doc length based weight takes care of that kind of thing
|
|
//if (plen > 50 && m_version < 93 ) {
|
|
// // TODO: remember, profile[i] is in reverse order!! we should
|
|
// // really do i=0;i<plen-50, but this is obsolete anyway...
|
|
// for (i=50; i<plen;i++) m_spam[profile[i]] = 100;
|
|
// plen = 50;
|
|
//}
|
|
|
|
|
|
// we have to do this otherwise it takes FOREVER to do for plens in
|
|
// the thousands, like i saw a plen of 8338!
|
|
if ( plen > 50 ) { // && m_version >= 93 ) {
|
|
// . set all but the last 50 to a spam of 100%
|
|
// . the last 50 actually occur as the first 50 in the doc
|
|
for (i=0; i<plen-50;i++) spam[profile[i]] = 100;
|
|
// we now have only 50 occurences
|
|
plen = 50;
|
|
// we want to skip the first plen-50 because they actually
|
|
// occur at the END of the document
|
|
profile += plen - 50;
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
// higher quality docs allow more "freebies", but only starting with
|
|
// version 93... (see Titledb.h)
|
|
// profile[i] is actually in reverse order so we subtract off from wlen
|
|
//int32_t off ;
|
|
//if ( m_version >= 93 ) {
|
|
// off = (m_docQuality - 30) / 3;
|
|
// if ( off < 0 ) off = 0;
|
|
//}
|
|
// just use 40% "quality"
|
|
int32_t off = 3;
|
|
|
|
// . now the nitty-gritty part
|
|
// . compute all sub sequences of the profile
|
|
// . similar to a compression scheme (wavelets?)
|
|
// . TODO: word positions should count by two's since punctuation is
|
|
// not included so start step @ 2 instead of 1
|
|
// . if "step" is 1 we look at every word position in the profile
|
|
// . if "step" is 2 we look at every other word position
|
|
// . if "step" is 3 we look at every 3rd word position, etc...
|
|
int32_t maxStep = plen / 4;
|
|
if ( maxStep > 4 ) maxStep = 4;
|
|
// . loop through all possible tuples
|
|
int32_t window, wlen, step, prob;
|
|
for ( step = 1 ; step <= maxStep ; step++ ) {
|
|
for ( window = 0 ; window + 3 < plen ; window+=1) {
|
|
for (wlen = 3; window+wlen <= plen ; wlen+=1) {
|
|
// continue if step isn't aligned with window
|
|
// length
|
|
if (wlen % step != 0) continue;
|
|
// . get probability that this tuple is spam
|
|
// . returns 0 to 100
|
|
prob = getProbSpam ( profile + window ,
|
|
wlen , step);
|
|
// printf("(%i,%i,%i)=%i\n",step,window,
|
|
// wlen,prob);
|
|
// . if the probability is too low continue
|
|
// . was == 100
|
|
if ( prob <= 20 ) continue;
|
|
// set the spammed words spam to "prob"
|
|
// only if it's bigger than their current spam
|
|
for (i=window; i<window+wlen;i++) {
|
|
// first occurences can have immunity
|
|
// due to doc quality being high
|
|
if ( i >= plen - off ) break;
|
|
if (spam[profile[i]] < prob)
|
|
spam[profile[i]] = prob;
|
|
}
|
|
QUICKPOLL(m_niceness);
|
|
}
|
|
|
|
}
|
|
}
|
|
// was this word spammed at all?
|
|
bool hadSpam = false;
|
|
for (i=0;i<plen;i++) if ( spam[profile[i]] > 20 ) hadSpam = true;
|
|
// make sure at least one word survives
|
|
for (i=0;i<plen;i++) if ( spam[profile[i]] == 0) return hadSpam;
|
|
// clear the spam level on this guy
|
|
spam[profile[0]] = 0;
|
|
// return true if we had spam, false if not
|
|
return hadSpam;
|
|
}
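
// . setSpam() above scores every (step,window,length) sub-sequence of the
//   profile with getProbSpam() and keeps the max probability per position
// . the distilled loop shape, using the same getProbSpam() helper called
//   above (illustrative only, kept out of the build; it skips the small
//   first-occurrence immunity window and the QUICKPOLL calls)
#if 0
static void scoreSubSequencesSketch ( int32_t *profile , int32_t plen ,
				      unsigned char *spam ) {
	int32_t maxStep = plen / 4;
	if ( maxStep > 4 ) maxStep = 4;
	for ( int32_t step = 1 ; step <= maxStep ; step++ )
	for ( int32_t window = 0 ; window + 3 < plen ; window++ )
	for ( int32_t wlen = 3 ; window + wlen <= plen ; wlen++ ) {
		// only look at windows aligned with this step size
		if ( wlen % step != 0 ) continue;
		// probability 0-100 that this tuple is repetition spam
		int32_t prob = getProbSpam ( profile + window , wlen , step );
		if ( prob <= 20 ) continue;
		// raise each covered position to at least "prob"
		for ( int32_t i = window ; i < window + wlen ; i++ )
			if ( spam[profile[i]] < prob )
				spam[profile[i]] = (unsigned char)prob;
	}
}
#endif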
|
|
|
|
bool getWordPosVec ( Words *words ,
|
|
Sections *sections,
|
|
//int32_t wordStart,
|
|
//int32_t wordEnd,
|
|
int32_t startDist, // m_dist
|
|
char *fragVec,
|
|
int32_t niceness ,
|
|
SafeBuf *wpos ) {
|
|
|
|
int32_t dist = startDist; // 0;
|
|
Section *lastsx = NULL;
|
|
int32_t tagDist = 0;
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
nodeid_t *tids = words->m_tagIds;
|
|
int64_t *wids = words->m_wordIds;
|
|
int32_t *wlens = words->getWordLens();
|
|
char **wptrs = words->getWords();
|
|
int32_t nw = words->getNumWords();
|
|
|
|
if ( ! wpos->reserve ( nw * 4 ) ) return false;
|
|
int32_t *wposvec = (int32_t *)wpos->getBufStart();
|
|
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
|
|
// save it
|
|
wposvec[i] = dist;
|
|
|
|
// tags affect the distance/wordposition cursor
|
|
if ( tids && tids[i] ) {
|
|
// tag distance affects
|
|
nodeid_t tid = tids[i] & BACKBITCOMP;
|
|
if ( isBreakingTagId ( tid ) ) tagDist += SENT_UNITS;
|
|
dist++;
|
|
continue;
|
|
}
|
|
// . and so do sequences of punct
|
|
// . must duplicate this code in Query.cpp for setting
|
|
// QueryWord::m_posNum
|
|
if ( ! wids[i] ) {
|
|
// simple space or sequence of just white space
|
|
if ( words->isSpaces(i) )
|
|
dist++;
|
|
// 'cd-rom'
|
|
else if ( wptrs[i][0]=='-' && wlens[i]==1 )
|
|
dist++;
|
|
// 'mr. x'
|
|
else if ( wptrs[i][0]=='.' && words->isSpaces2(i,1))
|
|
dist++;
|
|
// animal (dog)
|
|
else
|
|
dist += 2;
|
|
continue;
|
|
}
|
|
// ignore if in repeated fragment
|
|
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) {
|
|
dist++; continue; }
|
|
|
|
Section *sx = NULL;
|
|
if ( sp ) {
|
|
sx = sp[i];
|
|
// ignore if in style tag, etc. and do not
|
|
// increment the distance
|
|
if ( sx->m_flags & NOINDEXFLAGS )
|
|
continue;
|
|
}
|
|
|
|
// different sentence?
|
|
if ( sx &&
|
|
( ! lastsx ||
|
|
sx->m_sentenceSection != lastsx->m_sentenceSection ) ) {
|
|
// separate different sentences with 30 units
|
|
dist += SENT_UNITS; // 30;
|
|
// limit this!
|
|
if ( tagDist > 120 ) tagDist = 120;
|
|
// and add in tag distances as well here, otherwise
|
|
// we do not want "<br>" to really increase the
|
|
// distance if the separated words are in the same
|
|
// sentence!
|
|
dist += tagDist;
|
|
// new last then
|
|
lastsx = sx;
|
|
// store the vector AGAIN
|
|
wposvec[i] = dist;
|
|
}
|
|
|
|
tagDist = 0;
|
|
|
|
dist++;
|
|
}
|
|
return true;
|
|
}
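
// . getWordPosVec() above advances the position cursor by different amounts
//   per token: +1 for a tag, simple whitespace, a lone hyphen or a "mr. x"
//   style period, +2 for other punctuation runs, and +SENT_UNITS (plus the
//   capped tag distance) when a new sentence section starts
// . a hedged sketch of just the punctuation rule (illustrative only, kept
//   out of the build; the period case is an approximation of the
//   isSpaces2() check above)
#if 0
static int32_t punctAdvanceSketch ( char *w , int32_t wlen , bool isSpaces ) {
	if ( isSpaces )                 return 1; // "a b"
	if ( w[0] == '-' && wlen == 1 ) return 1; // "cd-rom"
	if ( w[0] == '.' )              return 1; // "mr. x"
	return 2;                                 // "animal (dog)"
}
#endif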
|
|
|
|
bool getDensityRanks ( int64_t *wids ,
|
|
int32_t nw ,
|
|
int32_t hashGroup ,
|
|
SafeBuf *densBuf ,
|
|
Sections *sections ,
|
|
int32_t niceness ) {
|
|
|
|
//int32_t nw = wordEnd - wordStart;
|
|
|
|
// make the vector
|
|
if ( ! densBuf->reserve ( nw ) ) return false;
|
|
|
|
// convenience
|
|
char *densVec = densBuf->getBufStart();
|
|
|
|
// clear i guess
|
|
memset ( densVec , 0 , nw );
|
|
|
|
if ( hashGroup != HASHGROUP_BODY &&
|
|
hashGroup != HASHGROUP_HEADING )
|
|
sections = NULL;
|
|
|
|
// scan the sentences if we got those
|
|
Section *ss = NULL;
|
|
if ( sections ) ss = sections->m_firstSent;
|
|
// sanity
|
|
//if ( sections && wordStart != 0 ) { char *xx=NULL;*xx=0; }
|
|
for ( ; ss ; ss = ss->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// count of the alnum words in sentence
|
|
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
|
|
// start with one word!
|
|
count--;
|
|
// how can it be less than one alnum word
|
|
if ( count < 0 ) continue;
|
|
// . base density rank on that
|
|
// . count is 0 for one alnum word now
|
|
int32_t dr = MAXDENSITYRANK - count;
|
|
// ensure not negative. make it at least 1. zero means un-set.
|
|
if ( dr < 1 ) dr = 1;
|
|
// mark all in sentence then
|
|
for ( int32_t i = ss->m_senta ; i < ss->m_sentb ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// assign
|
|
densVec[i] = dr;
|
|
}
|
|
}
|
|
// all done if using sections
|
|
if ( sections ) return true;
|
|
|
|
|
|
// count # of alphanumeric words in this string
|
|
int32_t na = 0;
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) if ( wids[i] ) na++;
|
|
// a single alnum word should map to an "na" of 0
|
|
na--;
|
|
// wtf?
|
|
if ( na < 0 ) return true;
|
|
// compute density rank
|
|
int32_t dr = MAXDENSITYRANK - na ;
|
|
// at least 1 to not be confused with 0 which means un-set
|
|
if ( dr < 1 ) dr = 1;
|
|
// assign
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// assign
|
|
densVec[i] = dr;
|
|
}
|
|
return true;
|
|
}
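
// . the density rank computed above is just MAXDENSITYRANK minus the number
//   of alnum words beyond the first, floored at 1 since 0 means "un-set"
// . minimal sketch of that formula (illustrative only, kept out of the
//   build)
#if 0
static char densityRankForCountSketch ( int32_t alnumWordCount ) {
	int32_t extra = alnumWordCount - 1; // a single word costs nothing
	if ( extra < 0 ) extra = 0;
	int32_t dr = MAXDENSITYRANK - extra;
	if ( dr < 1 ) dr = 1;               // never 0, 0 means un-set
	return (char)dr;
}
#endif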
|
|
|
|
// . called by hashString() for hashing purposes, i.e. creating posdb keys
|
|
// . string is usually the document body or inlink text of an inlinker or
|
|
// perhaps meta keywords. it could be anything. so we need to create this
|
|
// vector based on that string, which is represented by words/phrases here.
|
|
bool getDiversityVec ( Words *words ,
|
|
Phrases *phrases ,
|
|
HashTableX *countTable ,
|
|
SafeBuf *sbWordVec ,
|
|
//SafeBuf *sbPhraseVec ,
|
|
int32_t niceness ) {
|
|
|
|
int64_t *wids = words->getWordIds ();
|
|
//nodeid_t *tids = words->getTagIds ();
|
|
int32_t nw = words->getNumWords();
|
|
int64_t *pids = phrases->getPhraseIds2();
|
|
|
|
// . make the vector
|
|
// . it will be diversity ranks, so one float per word for now
|
|
// cuz we convert to rank below though, one byte rank
|
|
if ( ! sbWordVec ->reserve ( nw*4 ) ) return false;
|
|
//if ( ! sbPhraseVec->reserve ( nw*4 ) ) return false;
|
|
|
|
// get it
|
|
float *ww = (float *)sbWordVec ->getBufStart();
|
|
//float *pw = (float *)sbPhraseVec->getBufStart();
|
|
|
|
int32_t nexti = -10;
|
|
int64_t pidLast = 0;
|
|
|
|
// . now consider ourselves the last word in a phrase
|
|
// . adjust the score of the first word in the phrase to be
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// yield
|
|
QUICKPOLL ( niceness );
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) { ww[i] = 0.0; continue; }
|
|
// try to inline this
|
|
int64_t nextWid = 0;
|
|
int64_t lastPid = 0;
|
|
// how many words in the bigram?
|
|
int32_t nwp = phrases->getNumWordsInPhrase2(i);
|
|
if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ;
|
|
if ( i == nexti ) lastPid = pidLast;
|
|
// get current pid
|
|
int64_t pid = pids[i];
|
|
// get the word and phrase weights for term #i
|
|
float ww2;
|
|
//float pw2;
|
|
getWordToPhraseRatioWeights ( lastPid , // pids[i-1],
|
|
wids[i] ,
|
|
pid ,
|
|
nextWid , // wids[i+1] ,
|
|
&ww2 ,
|
|
//&pw2 ,
|
|
countTable ,
|
|
1);//m_version );
|
|
// 0 to 1.0
|
|
if ( ww2 < 0 || ww2 > 1.0 ) { char *xx=NULL;*xx=0; }
|
|
// save the last phrase id
|
|
if ( nwp > 0 ) {
|
|
nexti = i + nwp - 1;
|
|
pidLast = pid; // pids[i] ;
|
|
}
|
|
// . apply the weights
|
|
// . do not hit all the way down to zero though...
|
|
// . Words.cpp::hash() will not index it then...
|
|
//if ( ww[i] > 0 ) {
|
|
ww[i] = ww2;
|
|
//}
|
|
/*
|
|
//if ( pw[i] > 0 ) {
|
|
pw[i] = (int32_t)(pw[i] * pw2);
|
|
if ( pw[i] <= 0 ) pw[i] = 1;
|
|
//}
|
|
|
|
// MDW: why was this here?
|
|
//if ( isLinkText ) continue;
|
|
|
|
// do not demote all the way to 0
|
|
//if ( ww[i] <= 0 ) ww[i] = 2;
|
|
|
|
// skip if phrase score is 0
|
|
if ( ! pw[i] ) continue;
|
|
|
|
if ( pid == 0 ) { pw[i] = 0; continue; }
|
|
// skip if does not start phrase
|
|
if ( nwp <= 0 ) continue;
|
|
// sanity check
|
|
if ( nwp == 99 ) { char *xx = NULL; *xx = 0; }
|
|
// now mod the score
|
|
float avg = pw[i];
|
|
// weight by punct in between
|
|
//for ( int32_t j = i+1 ; j < i+nwp ; j++ ) {
|
|
// if ( wids[j] ) continue;
|
|
// avg = (avg * (int64_t)pw[j]) / DW;
|
|
//}
|
|
// do not demote all the way to zero, we still want to index it
|
|
// and when normalized on a 100 point scale, like when printed
|
|
// out by PageParser.cpp, a score of 1 here gets normalized to
|
|
// 0, so make sure it is at least 2.
|
|
if ( avg < 2 )
|
|
avg = 2;
|
|
// set that as our new score
|
|
pw[i] = avg;
|
|
*/
|
|
}
|
|
|
|
// overwrite the array of floats with an array of chars (ranks)
|
|
char *nww = (char *)ww;
|
|
//char *npw = (char *)pw;
|
|
|
|
// convert from float into a rank from 0-15
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
if ( ! ww[i] ) { nww[i] = 0; continue; }
|
|
// 2.50 is max in getWordToPhraseRatioWeights() function
|
|
char wrank = (char) ((ww[i] * ((float)MAXDIVERSITYRANK))/.55);
|
|
// sanity
|
|
if ( wrank > MAXDIVERSITYRANK ) wrank = MAXDIVERSITYRANK;
|
|
if ( wrank < 0 ) { char *xx=NULL;*xx=0; }
|
|
//char prank = (char) ((pw[i] * 15.0) / 2.50);
|
|
// assign now
|
|
nww[i] = wrank;
|
|
//npw[i] = prank;
|
|
}
|
|
|
|
return true;
|
|
}
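
// . getDiversityVec() above converts each float weight in [0,1] into a small
//   integer rank by scaling against MAXDIVERSITYRANK and the .55 ceiling,
//   then clamping
// . sketch of just that conversion (illustrative only, kept out of the
//   build)
#if 0
static char diversityRankFromWeightSketch ( float ww ) {
	if ( ww <= 0.0 ) return 0; // 0 means no weight / not an alnum word
	char wrank = (char) ((ww * ((float)MAXDIVERSITYRANK)) / .55);
	if ( wrank > MAXDIVERSITYRANK ) wrank = MAXDIVERSITYRANK;
	return wrank;
}
#endif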
|
|
|
|
// match word sequences of NUMWORDS or more words
|
|
#define NUMWORDS 5
|
|
|
|
// . repeated sentence frags
|
|
// . 1-1 with words in body of doc
|
|
char *XmlDoc::getFragVec ( ) {
|
|
|
|
if ( m_fragBufValid ) {
|
|
char *fb = m_fragBuf.getBufStart();
|
|
if ( ! fb ) return (char *)0x01;
|
|
return fb;
|
|
}
|
|
|
|
setStatus("getting frag vec");
|
|
|
|
Words *words = getWords();
|
|
if ( ! words || words == (Words *)-1 ) return (char *)words;
|
|
Bits *bits = getBits();
|
|
if ( ! bits ) return NULL;
|
|
|
|
m_fragBuf.purge();
|
|
|
|
// ez vars
|
|
int64_t *wids = words->getWordIds ();
|
|
int32_t nw = words->getNumWords();
|
|
|
|
// if no words, nothing to do
|
|
if ( nw == 0 ) {
|
|
m_fragBufValid = true;
|
|
return (char *)0x01;//true;
|
|
}
|
|
|
|
// truncate for performance reasons. i've seen this be over 4M
|
|
// and it was VERY VERY SLOW... over 10 minutes...
|
|
// - i saw this take over 200MB for an alloc for
|
|
// WeightsSet3 below, so lower from 200k to 50k. this will probably
|
|
// make parsing inconsistencies for really large docs...
|
|
if ( nw > MAXFRAGWORDS ) nw = MAXFRAGWORDS;
|
|
|
|
int64_t ringWids [ NUMWORDS ];
|
|
int32_t ringPos [ NUMWORDS ];
|
|
int32_t ringi = 0;
|
|
int32_t count = 0;
|
|
uint64_t h = 0;
|
|
|
|
// . make the hash table
|
|
// . make it big enough so there are gaps, so chains are not too long
|
|
int32_t minBuckets = (int32_t)(nw * 1.5);
|
|
uint32_t nb = 2 * getHighestLitBitValue ( minBuckets ) ;
|
|
int32_t need = nb * (8+4+4);
|
|
char *buf = NULL;
|
|
char tmpBuf[50000];
|
|
if ( need < 50000 ) buf = tmpBuf;
|
|
else buf = (char *)mmalloc ( need , "WeightsSet3" );
|
|
char *ptr = buf;
|
|
uint64_t *hashes = (uint64_t *)ptr; ptr += nb * 8;
|
|
int32_t *vals = (int32_t *)ptr; ptr += nb * 4;
|
|
float *ww = (float *)ptr; ptr += nb * 4;
|
|
if ( ! buf ) return NULL;
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) ww[i] = 1.0;
|
|
|
|
if ( ptr != buf + need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make the mask
|
|
uint32_t mask = nb - 1;
|
|
|
|
// clear the hash table
|
|
memset ( hashes , 0 , nb * 8 );
|
|
|
|
// clear ring of hashes
|
|
memset ( ringWids , 0 , NUMWORDS * 8 );
|
|
|
|
// for sanity check
|
|
int32_t lastStart = -1;
|
|
|
|
// . hash EVERY NUMWORDS-word sequence in the document
|
|
// . if we get a match look and see what sequences it matches
|
|
// . we allow multiple instances of the same hash to be stored in
|
|
// the hash table, so keep checking for a matching hash until you
|
|
// chain to a 0 hash, indicating the chain ends
|
|
// . check each matching hash to see if more than NUMWORDS words match
|
|
// . get the max words that matched from all of the candidates
|
|
// . demote the word and phrase weights based on the total/max
|
|
// number of words matching
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// yield
|
|
QUICKPOLL ( m_niceness );
|
|
// add new to the 5 word hash
|
|
h ^= wids[i];
|
|
// . remove old from 5 word hash before adding new...
|
|
// . initial ring wids are 0, so should be benign at startup
|
|
h ^= ringWids[ringi];
|
|
// add to ring
|
|
ringWids[ringi] = wids[i];
|
|
// save our position
|
|
ringPos[ringi] = i;
|
|
// wrap the ring ptr if we need to, that is why we are a ring
|
|
if ( ++ringi >= NUMWORDS ) ringi = 0;
|
|
// this 5-word sequence starts with word # "start"
|
|
int32_t start = ringPos[ringi];
|
|
// need at least NUMWORDS words in ring buffer to do analysis
|
|
if ( ++count < NUMWORDS ) continue;
|
|
// . skip if it starts with a word which can not start phrases
|
|
// . that way "a new car" being repeated a lot will not
|
|
// decrease the weight of the phrase term "new car"
|
|
// . setCountTable() calls set3() with this set to NULL
|
|
//if ( bits && ! bits->canStartPhrase(start) ) continue;
|
|
// sanity check
|
|
if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
|
|
// reset max matched
|
|
int32_t max = 0;
|
|
// look up in the hash table
|
|
uint32_t n = h & mask;
|
|
// sanity breach check
|
|
if ( n >= nb ) { char *xx=NULL;*xx=0; }
|
|
loop:
|
|
// all done if empty
|
|
if ( ! hashes[n] ) {
|
|
// sanity check
|
|
//if ( n >= nb ) { char *xx = NULL; *xx = 0; }
|
|
// add ourselves to the hash table now
|
|
hashes[n] = h;
|
|
// sanity check
|
|
//if ( wids[start] == 0 ) { char *xx = NULL; *xx = 0; }
|
|
// this is where the 5-word sequence starts
|
|
vals [n] = start;
|
|
// save it
|
|
lastStart = start;
|
|
// debug point
|
|
//if ( start == 7948 )
|
|
// log("heystart");
|
|
// do not demote words if less than NUMWORDS matched
|
|
if ( max < NUMWORDS ) continue;
|
|
// . how much should we demote
|
|
// . 10 matching words pretty much means 0 weights
|
|
float demote = 1.0 - ((max-5)*.10);
|
|
if ( demote >= 1.0 ) continue;
|
|
if ( demote < 0.0 ) demote = 0.0;
|
|
|
|
// . RULE #26 ("long" phrases)
|
|
// . if we got 3, 4 or 5 in our matching sequence
|
|
// . basically divide by the # of *phrase* terms
|
|
// . multiply by 1/(N-1)
|
|
// . HOWEVER, should we also look at HOW MANY other
|
|
// sequences match this too!???
|
|
//float demote = 1.0 / ((float)max-1.0);
|
|
// set3() is still called from setCountTable() to
|
|
// discount the effects of repeated fragments, and
|
|
// the count table only understands score or no score
|
|
//if ( max >= 15 ) demote = 0.0;
|
|
|
|
// demote the next "max" words
|
|
int32_t mc = 0;
|
|
int32_t j;
|
|
for ( j = start ; mc < max ; j++ ) {
|
|
// sanity
|
|
if ( j >= nw ) { char *xx=NULL;*xx=0; }
|
|
if ( j < 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip if not an alnum word
|
|
if ( ! wids[j] ) continue;
|
|
// count it
|
|
mc++;
|
|
// demote it
|
|
ww[j] = (int32_t)(ww[j] * demote);
|
|
if ( ww[j] <= 0 ) ww[j] = 2;
|
|
}
|
|
// save the original i
|
|
int32_t mini = i;
|
|
// advance i, it will be incremented by 1 immediately
|
|
// after hitting the "continue" statement
|
|
i = j - 1;
|
|
// must be at least the original i, we are monotonic
|
|
// otherwise ringPos[] will not be monotonic and core
|
|
// dump ultimately cuz j and k will be equal below
|
|
// and we increment matched++ forever.
|
|
if ( i < mini ) i = mini;
|
|
// get next word
|
|
continue;
|
|
}
|
|
// get next in chain if hash does not match
|
|
if ( hashes[n] != h ) {
|
|
// wrap around the hash table if we hit the end
|
|
if ( ++n >= nb ) n = 0;
|
|
// check out bucket #n now
|
|
goto loop;
|
|
}
|
|
// how many words match so far
|
|
int32_t matched = 0;
|
|
// . we have to check starting at the beginning of each word
|
|
// sequence since the XOR compositional hash is order
|
|
// independent
|
|
// . see what word offset this guy has
|
|
int32_t j = vals[n] ;
|
|
// k becomes the start of the current 5-word sequence
|
|
int32_t k = start;
|
|
// sanity check
|
|
if ( j == k ) { char *xx = NULL; *xx = 0; }
|
|
// skip to next in chain to check later
|
|
if ( ++n >= nb ) n = 0;
|
|
// keep advancing k and j as long as the words match
|
|
matchLoop:
|
|
// get next wid for k and j
|
|
while ( k < nw && ! wids[k] ) k++;
|
|
while ( j < nw && ! wids[j] ) j++;
|
|
if ( k < nw && wids[k] == wids[j] ) {
|
|
matched++;
|
|
k++;
|
|
j++;
|
|
goto matchLoop;
|
|
}
|
|
// keep track of the max matched for i0
|
|
if ( matched > max ) max = matched;
|
|
// get another matching string of words, if possible
|
|
goto loop;
|
|
}
|
|
|
|
if ( nw <= 0 ) { char *xx=NULL;*xx=0;}
|
|
|
|
// make space
|
|
if ( ! m_fragBuf.reserve ( nw ) ) {
|
|
// save it
|
|
int32_t saved = g_errno;
|
|
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
|
|
// reinstate it
|
|
g_errno = saved;
|
|
return NULL;
|
|
}
|
|
// validate
|
|
m_fragBufValid = true;
|
|
// handy ptr
|
|
char *ff = m_fragBuf.getBufStart();
|
|
|
|
// convert from floats into frag score, 0 or 1 really
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
if ( ww[i] <= 0.0 ) ff[i] = 0;
|
|
else ff[i] = 1;
|
|
}
|
|
|
|
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
|
|
|
|
// wtf?
|
|
if ( ! ff ) { char *xx=NULL;*xx=0; }
|
|
|
|
return ff;
|
|
}
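
// . getFragVec() above detects repeated fragments with a rolling hash: XOR
//   in the new word id, XOR out the id that falls off the NUMWORDS ring, so
//   "h" always covers exactly the last NUMWORDS alnum words (and, because
//   XOR is order independent, matching hashes must be re-verified word by
//   word, which is what the matchLoop above does)
// . minimal sketch of the rolling part (illustrative only, kept out of the
//   build)
#if 0
static void rollingHashSketch ( int64_t *wids , int32_t nw ) {
	uint64_t h = 0;
	int64_t ring[NUMWORDS];
	int32_t ringi = 0;
	memset ( ring , 0 , sizeof(ring) );
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		if ( ! wids[i] ) continue; // alnum words only
		h ^= wids[i];              // add the new word
		h ^= ring[ringi];          // drop the word NUMWORDS back (0 at startup)
		ring[ringi] = wids[i];
		if ( ++ringi >= NUMWORDS ) ringi = 0;
		// "h" now hashes the last NUMWORDS alnum words
	}
}
#endif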
|
|
|
|
float g_wtab[30][30];
|
|
|
|
// . inline this for speed
|
|
// . if a word repeats in different phrases, promote the word
|
|
// and demote the phrase
|
|
// . if a word repeats in pretty much the same phrase, promote
|
|
// the phrase and demote the word
|
|
// . if you have the window of text "new mexico good times"
|
|
// and word #i is mexico, then:
|
|
// pid1 is "new mexico"
|
|
// wid1 is "mexico"
|
|
// pid2 is "mexico good"
|
|
// wid2 is "good"
|
|
// . we store sliderParm in titleRec so we can update it along
|
|
// with title and header weights on the fly from the spider controls
|
|
void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
|
|
int64_t wid1 ,
|
|
int64_t pid2 ,
|
|
int64_t wid2 , // post word
|
|
float *retww ,
|
|
//float *retpw ,
|
|
HashTableX *tt1 ,
|
|
int32_t titleRecVersion ) {
|
|
|
|
static float s_fsp;
|
|
// from 0 to 100
|
|
char sliderParm = g_conf.m_sliderParm;
|
|
// i'm not too keen on putting this as a parm in the CollectionRec
|
|
// because it is so cryptic...
|
|
//static char sliderParm = 25;
|
|
|
|
// . to support RULE #15 (word to phrase ratio)
|
|
// . these weights are based on the ratio of word to phrase count
|
|
// for a particular word
|
|
static char s_sp = -1;
|
|
if ( s_sp != sliderParm ) {
|
|
// . set it to the newly updated value
|
|
// . should range from 0 up to 100
|
|
s_sp = sliderParm;
|
|
// the float version
|
|
s_fsp = (float)sliderParm / 100.0;
|
|
// sanity test
|
|
if ( s_fsp < 0.0 || s_fsp > 1.0 ) { char *xx = NULL; *xx = 0; }
|
|
// i is the word count, how many times a particular word
|
|
// occurs in the document
|
|
for ( int32_t i = 0 ; i < 30 ; i++ ) {
|
|
// . k is the phrase count, how many times a particular phrase
|
|
// occurs in the document
|
|
// . k can be GREATER than i because we index only phrase terms
|
|
// sometimes when indexing neighborhoods, and not the
|
|
// single words that compose them
|
|
for ( int32_t k = 0 ; k < 30 ; k++ ) {
|
|
// do not allow phrase count to be greater than
|
|
// word count, even though it can happen since we
|
|
// add imported neighborhood pwids to the count table
|
|
int32_t j = k;
|
|
if ( k > i ) j = i;
|
|
// get ratio
|
|
//float ratio = (float)phrcount / (float)wrdcount;
|
|
float ratio = (float)j/(float)i;
|
|
// it should be impossible that this can be over 1.0
|
|
// but might happen due to hash collisions
|
|
if ( ratio > 1.0 ) ratio = 1.0;
|
|
// restrict the range we can weight a word or phrase
|
|
// based on the word count
|
|
//float r = 1.0;
|
|
//if ( i >= 20 ) r = 2.1;
|
|
//else if ( i >= 10 ) r = 1.8;
|
|
//else if ( i >= 4 ) r = 1.5;
|
|
//else r = 1.3;
|
|
//g_ptab[i][k] = 1.00;
|
|
g_wtab[i][k] = 1.00;
|
|
if ( i <= 1 ) continue;
|
|
// . we used to have a sliding bar between 0.0 and 1.0.
|
|
// word is weighted (1.0 - x) and phrase is weighted
|
|
// by (x). however, x could go all the way to 1.0
|
|
// even when i = 2, so we need to restrict x.
|
|
// . x is actually "ratio"
|
|
// . when we have 8 or less word occurences, do not
|
|
// remove more than 80% of its score, a 1/5 penalty
|
|
// is good enough for now. but for words that occur
|
|
// a lot in the link text or pwids, go to town...
|
|
if ( i <= 2 && ratio >= .50 ) ratio = .50;
|
|
else if ( i <= 4 && ratio >= .60 ) ratio = .60;
|
|
else if ( i <= 8 && ratio >= .80 ) ratio = .80;
|
|
else if ( i <= 12 && ratio >= .95 ) ratio = .95;
|
|
// round up, so many "new mexico" phrases but only
|
|
// make it up to 95%...
|
|
if ( ratio >= .95 ) ratio = 1.00;
|
|
// if word's phrase is repeated 3 times or more then
|
|
// is a pretty good indication that we should weight
|
|
// the phrase more and the word itself less
|
|
//if ( k >= 3 && ratio < .90 ) ratio = .90;
|
|
// compute the weights
|
|
//float pw = 2.0 * ratio;
|
|
//float ww = 2.0 * (1.0 - ratio);
|
|
float ww = (1.0 - ratio);
|
|
|
|
// . punish words a little more
|
|
// . if we got 50% ratio, words should not get as much
|
|
// weight as the phrase
|
|
//ww *= .45;
|
|
// do not weight to 0, no less than .15
|
|
if ( ww < 0.0001 ) ww = 0.0001;
|
|
//if ( pw < 0.0001 ) pw = 0.0001;
|
|
// do not overpromote either
|
|
//if ( ww > 2.50 ) ww = 2.50;
|
|
//if ( pw > 2.50 ) pw = 2.50;
|
|
// . do a sliding weight of the weight
|
|
// . a "ww" of 1.0 means to do no weight
|
|
// . can't do this for ww cuz we use "mod" below
|
|
//float newWW = s_fsp*ww + (1.0-s_fsp)*1.00;
|
|
//float newPW = s_fsp*pw + (1.0-s_fsp)*1.00;
|
|
// limit how much we promote a word because it
|
|
// may occur 30 times total, but have a phrase count
|
|
// of only 1. however, the other 29 times it occurs it
|
|
// is in the same phrase, just not this particular
|
|
// phrase.
|
|
//if ( ww > 2.0 ) ww = 2.0;
|
|
g_wtab[i][k] = ww;
|
|
//g_ptab[i][k] = newPW;
|
|
//logf(LOG_DEBUG,"build: wc=%"INT32" pc=%"INT32" ww=%.2f "
|
|
//"pw=%.2f",i,k,g_wtab[i][k],g_ptab[i][k]);
|
|
}
|
|
}
|
|
}
|
|
|
|
int32_t phrcount1 = 0;
|
|
int32_t phrcount2 = 0;
|
|
int32_t wrdcount1 = 0;
|
|
int32_t wrdcount2 = 0;
|
|
if ( tt1->m_numSlotsUsed > 0 ) {
|
|
if (pid1) phrcount1 = tt1->getScore(&pid1);
|
|
if (pid2) phrcount2 = tt1->getScore(&pid2);
|
|
if (wid1) wrdcount1 = tt1->getScore(&wid1);
|
|
if (wid2) wrdcount2 = tt1->getScore(&wid2);
|
|
}
|
|
// if we are always ending the same phrase, like "Mexico"
|
|
// in "New Mexico"... get the most popular phrase this word is
|
|
// in...
|
|
int32_t phrcountMax = phrcount1;
|
|
int32_t wrdcountMin = wrdcount1;
|
|
// these must actually exist to be part of the selection
|
|
if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2;
|
|
if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2;
|
|
|
|
|
|
// . but if we are 'beds' and in a popular phrase like 'dog beds'
|
|
// there may be a lot of other phrases mentioned that have 'beds'
|
|
// in them like 'pillow beds', 'pet beds', but we need to assume
|
|
// that if phrcountMax is high enough, we should not give much weight to
|
|
// the word... otherwise you can subvert this algorithm by just
|
|
// adding other random phrases with the word 'bed' in them.
|
|
// . BUT, if a page has 'X beds' with a lot of different X's then you
|
|
// still want to index 'beds' with a high score!!! we are trying to
|
|
// balance those 2 things.
|
|
// . do this up here before you truncate phrcountMax below!!
|
|
float mod = 1.0;
|
|
if ( phrcountMax <= 6 ) mod = 0.50;
|
|
else if ( phrcountMax <= 8 ) mod = 0.20;
|
|
else if ( phrcountMax <= 10 ) mod = 0.05;
|
|
else if ( phrcountMax <= 15 ) mod = 0.03;
|
|
else mod = 0.01;
|
|
|
|
// scale wrdcount1/phrcountMax down for the g_wtab table
|
|
if ( wrdcount1 > 29 ) {
|
|
float ratio = (float)phrcountMax / (float)wrdcount1;
|
|
phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
|
|
wrdcount1 = 29;
|
|
}
|
|
if ( phrcountMax > 29 ) {
|
|
float ratio = (float)wrdcount1 / (float)phrcountMax;
|
|
wrdcount1 = (int32_t)((29.0 * ratio) + 0.5);
|
|
phrcountMax = 29;
|
|
}
|
|
|
|
// . sanity check
|
|
// . neighborhood.cpp does not always have wid/pid pairs
|
|
// that match up right for some reason... so we can't do this
|
|
//if ( phrcount1 > wrdcount1 ) { char *xx = NULL; *xx = 0; }
|
|
//if ( phrcount2 > wrdcount2 ) { char *xx = NULL; *xx = 0; }
|
|
|
|
// apply the weights from the table we computed above
|
|
*retww = mod * g_wtab[wrdcount1][phrcountMax];
|
|
|
|
// slide it
|
|
*retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00;
|
|
|
|
// ensure we do not punish too hard
|
|
if ( *retww <= 0.0 ) *retww = 0.01;
|
|
|
|
if ( *retww > 1.0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
/*
|
|
if ( phrcountMax >= 0 ) {
|
|
int64_t sh = getPrefixHash ( (char *)NULL , 0 , NULL , 0 );
|
|
int64_t tid = g_indexdb.getTermId ( sh , wid1 );
|
|
logf(LOG_DEBUG,"build: phrcountMax=%"INT32" wrdCount1=%"INT32" "
|
|
"*ww=%.4f for word with tid=%"UINT64"",
|
|
phrcountMax,wrdcount1,(float)*ww,tid);
|
|
//if ( phrcountMax < 10 && tid == 16944700235015LL )
|
|
// log("hey");
|
|
}
|
|
*/
|
|
|
|
// sanity check
|
|
//if ( *ww == 0.0 ) { char *xx = NULL; *xx = 0; }
|
|
|
|
/*
|
|
// scale wrdcountMin/phrcount down for the g_ptab table
|
|
if ( wrdcountMin > 29 ) {
|
|
float ratio = (float)phrcount2 / (float)wrdcountMin;
|
|
phrcount2 = (int32_t)((29.0 * ratio) + 0.5);
|
|
wrdcountMin = 29;
|
|
}
|
|
if ( phrcount2 > 29 ) {
|
|
float ratio = (float)wrdcountMin / (float)phrcount2;
|
|
wrdcountMin = (int32_t)((29.0 * ratio) + 0.5);
|
|
phrcount2 = 29;
|
|
}
|
|
*/
|
|
// . if the word is Mexico in 'New Mexico good times' then
|
|
// phrase term #i which is, say, "Mexico good" needs to
|
|
// get the min word count when doings its word to phrase
|
|
// ratio.
|
|
// . it has two choices, it can use the word count of
|
|
// "Mexico" or it can use the word count of "good".
|
|
// . say, each is pretty high in the document so the phrase
|
|
// ends up getting penalized heavily, which is good because
|
|
// it is a nonsense phrase.
|
|
// . if we had "united socialist soviet republic" repeated
|
|
// a lot, the phrase "socialist soviet" would score high
|
|
// and the individual words would score low. that is good.
|
|
// . try to seek the highest weight possible for this phrase
|
|
// by choosing the lowest word count possible
|
|
// . NO LONGER AFFECT phrase weights because just because the
|
|
// words occur a lot in the document and this may be the only
|
|
// occurence of this phrase, does not mean we should punish
|
|
// the phrase. -- MDW
|
|
//*retpw = 1.0;
|
|
return;
|
|
|
|
// do it the old way...
|
|
//*pw = g_ptab[wrdcountMin][phrcount2];
|
|
|
|
// sanity check
|
|
//if ( *pw == 0.0 ) { char *xx = NULL; *xx = 0; }
|
|
}
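
// . the final word weight above is a linear blend between the g_wtab table
//   weight and 1.0, controlled by the slider parm (0 to 100)
// . sketch of the blend, assuming the same 0-100 convention (illustrative
//   only, kept out of the build)
#if 0
static float blendWithSliderSketch ( float tableWeight , char sliderParm ) {
	float fsp = (float)sliderParm / 100.0; // 0.0 = ignore table, 1.0 = full effect
	float w   = fsp * tableWeight + (1.0 - fsp) * 1.00;
	if ( w <= 0.0 ) w = 0.01;              // never punish all the way to zero
	return w;
}
#endif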
|
|
|
|
// for registerSleepCallback
|
|
static void clockSyncWaitWrapper ( int fd , void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . a special call
|
|
// . returns -1 if blocked, 1 otherwise, 0 on error
|
|
char XmlDoc::waitForTimeSync ( ) {
|
|
// unregister?
|
|
if ( isClockInSync() && m_alreadyRegistered ) {
|
|
// note it
|
|
log("build: clock now synced for %s",m_firstUrl.m_url);
|
|
g_loop.unregisterSleepCallback(m_masterState,
|
|
clockSyncWaitWrapper);
|
|
}
|
|
// return 1 if synced!
|
|
if ( isClockInSync() ) return 1;
|
|
// already registered? wait another 1000ms
|
|
if ( m_alreadyRegistered ) return -1;
|
|
// flag it
|
|
m_alreadyRegistered = true;
|
|
// note it
|
|
log("build: waiting for clock to sync for %s",m_firstUrl.m_url);
|
|
// this should mean it is re-called later
|
|
if ( g_loop.registerSleepCallback ( 1000 , // 1000 ms
|
|
m_masterState ,
|
|
clockSyncWaitWrapper ,
|
|
m_niceness ))
|
|
// wait for it, return -1 since we blocked
|
|
return -1;
|
|
// if was not able to register, ignore delay
|
|
log("doc: failed to register clock wait callback");
|
|
return 0;
|
|
}
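
// . waitForTimeSync() is the usual "poll until a global condition holds"
//   shape in this file: register a 1000ms sleep callback that re-enters
//   m_masterLoop, and unregister it on the first call where the condition
//   is true
// . a generalized sketch of that shape with a hypothetical condition
//   function (illustrative only, kept out of the build)
#if 0
static char waitUntilSketch ( bool (*conditionMet)() ,
			      void *state ,
			      void (*wrapper)(int fd,void *state) ,
			      bool *alreadyRegistered ,
			      int32_t niceness ) {
	if ( conditionMet() && *alreadyRegistered )
		g_loop.unregisterSleepCallback ( state , wrapper );
	if ( conditionMet() ) return 1;      // done
	if ( *alreadyRegistered ) return -1; // still waiting on the callback
	*alreadyRegistered = true;
	if ( g_loop.registerSleepCallback ( 1000 , state , wrapper , niceness ) )
		return -1;                   // blocked; wrapper will re-enter
	return 0;                            // could not register
}
#endif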
|
|
|
|
////////////////////////////
|
|
//
|
|
// SCRAPING TOOLS
|
|
//
|
|
////////////////////////////
|
|
|
|
void doInjectLoopWrapper ( void *state ) {
|
|
XmlDoc *XD = (XmlDoc *)state;
|
|
// if it blocked, wait
|
|
if ( ! XD->doInjectLoop ( ) ) return;
|
|
// . if we did not inject any links, i guess we are done!
|
|
// . this happens if the ahrefs.com doc had the same outlinks
|
|
// as the ahrefs.com doc for another search result, they are all
|
|
// deduped and it does not block.
|
|
XD->m_finalCallback ( XD->m_finalState );
|
|
}
|
|
|
|
// . return false if blocks, true otherwise
|
|
// . return true and set error on error, with no blocks outstanding
|
|
// . TODO: make this work for ahrefs.com list of links in xml feed
|
|
bool XmlDoc::injectLinks (HashTableX *linkDedupTablePtr ,
|
|
HashTableX *domDedupTablePtr,
|
|
void *finalState ,
|
|
void (* finalCallback)(void *)) {
|
|
|
|
// INJECT 10 at a time. xmldoc is 1MB.
|
|
int32_t i; for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
|
|
XmlDoc *nd;
|
|
// continue if already set it. this was overwriting it
|
|
// and causing a mem leak before
|
|
if ( m_xmlDocs[i] ) continue;
|
|
try { nd = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
break;
|
|
}
|
|
mnew ( nd , sizeof(XmlDoc),"xmldocarr");
|
|
m_xmlDocs[i] = nd;
|
|
}
|
|
|
|
// all null?
|
|
if ( i < (int32_t)MAX_XML_DOCS ) {
|
|
log("scrape: one xmldoc alloc failed");
|
|
return true;
|
|
}
|
|
|
|
m_masterLoop = doInjectLoopWrapper;
|
|
m_masterState = this;
|
|
|
|
m_finalState = finalState;
|
|
m_finalCallback = finalCallback;
|
|
|
|
// note it
|
|
//log("xmldoc: injecting outlinks of %s",m_firstUrl.getUrl());
|
|
|
|
m_linkDedupTablePtr = linkDedupTablePtr;
|
|
m_domDedupTablePtr = domDedupTablePtr;
|
|
|
|
// loop over all links
|
|
m_i = 0;
|
|
m_blocked = 0;
|
|
memset ( m_used , 0 , (int32_t)MAX_XML_DOCS );
|
|
|
|
return doInjectLoop();
|
|
}
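
// . injectLinks()/doInjectLoop() run at most MAX_XML_DOCS injections at once
//   by keeping a parallel m_used[] byte per pre-allocated XmlDoc and picking
//   the first free slot for each outlink
// . sketch of that slot search (illustrative only, kept out of the build)
#if 0
static int32_t firstFreeSlotSketch ( char *used , int32_t n ) {
	for ( int32_t j = 0 ; j < n ; j++ )
		if ( ! used[j] ) return j;
	return -1; // all slots busy; caller waits for a callback to free one
}
#endif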

void doneInjectingWrapper ( void *state ) {
	XmlDoc *xd = (XmlDoc *)state;
	XmlDoc *XD = (XmlDoc *)xd->m_hack;
	XD->doneInjecting ( xd );
}
|
|
|
|
// . return false if blocks, true otherwise
|
|
// . return true and set error on error, with no blocks outstanding
|
|
bool XmlDoc::doInjectLoop ( ) {
|
|
|
|
setStatus("inject outlinks");
|
|
|
|
//Links *links = getLinks();
|
|
//if ( ! links ) return (m_blocked == 0);
|
|
//if ( links == (void *)-1 ) return false;
|
|
Sections *sections = getSections();
|
|
if ( ! sections ) return (m_blocked == 0);
|
|
if ( sections == (void *)-1 ) return false;
|
|
Links *links = getLinks();
|
|
if ( ! links ) return (m_blocked == 0);
|
|
if ( links == (void *)-1 ) return false;
|
|
Words *words = getWords();
|
|
if ( ! words ) return (m_blocked == 0);
|
|
if ( words == (void *)-1 ) return false;
|
|
Bits *bp = getBits();
|
|
if ( ! bp ) return (m_blocked == 0);
|
|
if ( bp == (void *)-1 ) return false;
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
int32_t n = links->getNumLinks();
|
|
Url tmpUrl;
|
|
Section *prev = NULL;
|
|
// scan the links now
|
|
for ( ; m_i < n ; ) {
|
|
// get xml doc then
|
|
int32_t j; for ( j = 0 ; j < MAX_XML_DOCS ; j++ )
|
|
if ( ! m_used[j] ) break;
|
|
// none? return false if blocked.
|
|
if ( j >= MAX_XML_DOCS ) return false;
|
|
// get the m_ith link
|
|
char *link = links->getLink ( m_i );
|
|
int32_t linkLen = links->getLinkLen ( m_i );
|
|
// temp term
|
|
if ( link[linkLen] ) { char *xx=NULL;*xx=0; }
|
|
// skip to next link to index
|
|
m_i++;
|
|
// skip injecting if its an internal bing/google outlink
|
|
if ( strncmp(link,"http://www.bing.com/",20) == 0 )
|
|
continue;
|
|
// skip youtube query links. they contain our exact
|
|
// query!! so almost always come up #1
|
|
if ( strstr(link,".youtube.com/") && strstr(link,"&q="))
|
|
continue;
|
|
if ( strstr(link,".msn.com/") )
|
|
continue;
|
|
if ( strstr(link,".microsoft.com/") )
|
|
continue;
|
|
if ( strstr(link,".discoverbing.com/") )
|
|
continue;
|
|
if ( strstr(link,".googleusercontent.com/") )
|
|
continue;
|
|
//if(!strncmp(link,"http://webcache.googleusercontent.com/",38)
|
|
if(!strncmp(link,"http://www.google.com/url?q=http",32)){
|
|
// grab the real url from that
|
|
char *embed = strstr(link,"url?q=http");
|
|
if ( ! embed ) continue;
|
|
link = embed+6;
|
|
char *end = embed;
|
|
for ( ; *end && *end != '&' ; end++) {
|
|
// google appends query to url.. strange
|
|
//if ( end[0] == '%' &&
|
|
// end[1] == '2' &&
|
|
// to_lower_a(end[2]) == 'b' )
|
|
// break;
|
|
}
|
|
SafeBuf mbuf;
|
|
mbuf.reserve ( end - link + 100 );
|
|
int32_t dlen;
|
|
char *bs = mbuf.getBufStart();
|
|
dlen=urlDecode(bs,link , end - link );
|
|
bs[dlen] = '\0';
|
|
tmpUrl.set ( bs );
|
|
link = tmpUrl.getUrl();
|
|
linkLen = tmpUrl.getUrlLen();
|
|
}
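		// Example of what the block above does (illustrative values
		// only): a SERP link like
		//   http://www.google.com/url?q=http%3A%2F%2Fexample.com%2Fpage&sa=U&...
		// has the portion after "url?q=" up to the first '&'
		// url-decoded into a stack buffer, so "link" ends up pointing
		// at the normalized target, e.g.
		//   http://example.com/page
		// held by tmpUrl for the rest of this loop iteration.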
|
|
// skip maps.google.com etc.
|
|
if ( strstr(link,".google.com/") )
|
|
continue;
|
|
|
|
// ok, point to title and summary for this result!
|
|
// go up to prev node for first non-clickable text which
|
|
// should be summary
|
|
//Section **sp = sections->m_sectionPtrs;
|
|
// get the section
|
|
int32_t ln = links->getNodeNum(m_i-1);
|
|
// get node ptr
|
|
XmlNode *node = m_xml.getNodePtr(ln);
|
|
char *ptr = node->m_node;
|
|
// find section that contains it i guess
|
|
Section *sx = sections->m_rootSection;
|
|
Section *last = NULL;
|
|
char **wptrs = words->getWords();
|
|
//nodeid_t *tids = words->getTagIds();
|
|
for ( ; sx ; sx = sx->m_next ) {
|
|
// get section ptr
|
|
char *sw = wptrs[sx->m_b-1];
|
|
if ( sw < ptr ) continue;
|
|
// over?
|
|
sw = wptrs[sx->m_a];
|
|
if ( sw > ptr ) break;
|
|
last = sx;
|
|
}
|
|
// assign
|
|
sx = last;
|
|
// telescope section up one i guess
|
|
//sx = sx->m_parent;
|
|
// int16_tcut
|
|
wbit_t *bits = bp->m_bits;
|
|
// if still same first alnum, go another
|
|
//for ( ; sx ; sx = sx->m_parent ) {
|
|
// // skip if same word starts this section
|
|
// //if ( sx->m_firstWordPos == fa ) continue;
|
|
// // must have alnum
|
|
// if ( sx->m_firstWordPos <= 1 ) continue;
|
|
// // must be in link! should be the result TITLE
|
|
// if ( bits[sx->m_firstWordPos] & D_IN_LINK ) break;
|
|
// // word must not be "cached" or whatever...
|
|
//}
|
|
// if in bold tag, should telescope up some more
|
|
//if ( sx && sx->m_tagId == TAG_B ) sx = sx->m_parent;
|
|
//if ( sx && sx->m_tagId == TAG_STRONG ) sx = sx->m_parent;
|
|
// save
|
|
//int32_t fa = sx->m_firstWordPos;
|
|
// that's the title so telescope up as int32_t as that is the
|
|
// first alnum!!!
|
|
for ( ; sx ; sx = sx->m_parent ) {
|
|
//Section *ps = sx->m_parent;
|
|
// do we have a next brother? stop then! that means
|
|
// we are in a list!
|
|
//if ( sx->m_nextBrother ) break;
|
|
//if ( ps->m_firstWordPos != fa ) break;
|
|
// stop when we hit a result delimeter!!
|
|
if ( sx->m_tagId == TAG_LI ) {
|
|
// bing...
|
|
if ( strncmp(wptrs[sx->m_a],
|
|
"<li class=\"sa_wr\">",
|
|
17) == 0 ) {
|
|
break;
|
|
}
|
|
// google...
|
|
if ( strncmp(wptrs[sx->m_a],
|
|
"<li class=\"g\">",
|
|
13) == 0 ) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
// if no indicator, bail
|
|
if ( ! sx ) continue;
|
|
// skip link if contained in prev section
|
|
if ( prev == sx )
|
|
continue;
|
|
// save it
|
|
prev = sx;
|
|
// record search result details
|
|
Section *title = NULL;
|
|
Section *cite = NULL;
|
|
Section *summary = NULL;
|
|
// . that is probably the full result then...
|
|
// . title is first sentence
|
|
for ( ; sx ; sx = sx->m_next ) {
|
|
// only sentences
|
|
if ( ! ( sx->m_flags & SEC_SENTENCE ) ) continue;
|
|
// grab it
|
|
if ( ! title ) {
|
|
title = sx;
|
|
continue;
|
|
}
|
|
// skip section if in link
|
|
if ( bits[sx->m_firstWordPos] & D_IN_LINK ) continue;
|
|
// we are sentence section so fix it so we are one
|
|
// above!
|
|
Section *rs = sx; // ->m_parent;
|
|
// telescope up to a div or whatever...
|
|
//for ( ; rs ; rs = rs->m_parent ) {
|
|
// if ( rs->m_tagId == TAG_DIV ) break;
|
|
// if ( rs->m_tagId == TAG_P ) break;
|
|
//}
|
|
// and out of bold
|
|
if ( rs && rs->m_tagId == TAG_B ) rs = rs->m_parent;
|
|
if ( rs && rs->m_tagId == TAG_STRONG) rs=rs->m_parent;
|
|
// bail if no good!
|
|
if ( ! rs ) continue;
|
|
// then site if google
|
|
if ( ! cite ) {
|
|
cite = rs;
|
|
continue;
|
|
}
|
|
// then summary
|
|
summary = rs;
|
|
break;
|
|
}
|
|
m_serpBuf.safePrintf("\t\t<result>\n");
|
|
// print <title> tag
|
|
if ( title ) printSerpFiltered(title,"title");
|
|
// print <sum> tag
|
|
if ( summary ) printSerpFiltered(summary,"sum");
|
|
m_serpBuf.safePrintf("\t\t\t<url>");
|
|
m_serpBuf.safeMemcpy ( link , linkLen );
|
|
m_serpBuf.safePrintf("</url>\n");
|
|
m_serpBuf.safePrintf("\t\t</result>\n");
|
|
|
|
|
|
// if not injecting, skip
|
|
//continue;
|
|
if ( ! m_reallyInjectLinks ) continue;
|
|
|
|
// dedup
|
|
int32_t linkHash32 = hash32 ( link , linkLen );
|
|
if ( m_linkDedupTablePtr &&
|
|
m_linkDedupTablePtr->isInTable (&linkHash32) ) continue;
|
|
// add it otherwise
|
|
if ( m_linkDedupTablePtr )
|
|
m_linkDedupTablePtr->addKey ( &linkHash32 );
|
|
|
|
// we use this when injecting ahrefs links
|
|
if ( m_domDedupTablePtr ) {
|
|
int32_t domLen;
|
|
char *dom = getDomFast ( link , &domLen );
|
|
int32_t dh32 = hash32 ( dom , domLen );
|
|
if ( m_domDedupTablePtr->isInTable (&dh32) ) continue;
|
|
m_domDedupTablePtr->addKey ( &dh32 );
|
|
}
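		// Note on the two dedup passes above: the link table keys on
		// hash32() of the full URL so the same outlink is injected at
		// most once per session, and the optional domain table keys
		// on hash32() of getDomFast()'s domain so at most one link
		// per domain gets injected (used when walking ahrefs.com
		// inlink lists). A 32-bit hash collision simply makes the
		// later link look like a dup and skips it.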
|
|
|
|
// get it
|
|
XmlDoc *xd = m_xmlDocs[j];
|
|
|
|
if ( ! xd ) { char *xx=NULL;*xx=0; }
|
|
|
|
// add www to it
|
|
Url lu;
|
|
lu.set ( link , linkLen , true );
|
|
|
|
char *wwwLink = lu.getUrl();
|
|
|
|
// this can go on the stack since set4() copies it
|
|
SpiderRequest sreq;
|
|
sreq.reset();
|
|
// index this link!
|
|
strcpy(sreq.m_url,wwwLink);
|
|
// parentdocid of 0
|
|
int32_t firstIp = hash32n(wwwLink);
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
sreq.setKey( firstIp,0LL, false );
|
|
sreq.m_isInjecting = 1;
|
|
sreq.m_isPageInject = 1;
|
|
sreq.m_hopCount = 0;//hopCount;
|
|
sreq.m_hopCountValid = 1;
|
|
sreq.m_fakeFirstIp = 1;
|
|
sreq.m_firstIp = firstIp;
|
|
|
|
setStatus("injecting an outlink");
|
|
|
|
// . use the enormous power of our new XmlDoc class
|
|
// . this returns false with g_errno set on error
|
|
if ( ! xd->set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// give it a niceness of 1, we have to be
|
|
// careful since we are a niceness of 0!!!!
|
|
m_niceness, // 1 ,
|
|
// inject this content
|
|
NULL, // content ,
|
|
false, // deleteFromIndex ,
|
|
0, // forcedIp ,
|
|
0, // contentType ,
|
|
0, // lastSpidered ,
|
|
false )) { // hasMime
|
|
// . g_errno should be set if that returned false
|
|
// . return true if does not need to block
|
|
log("xmldoc: outlink inject: %s",mstrerror(g_errno));
|
|
break;
|
|
}
|
|
|
|
xd->m_hack = this;
|
|
|
|
// make this our callback in case something blocks
|
|
xd->setCallback ( xd , doneInjectingWrapper );
|
|
// . set xd from the old title rec if recycle is true
|
|
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
|
xd->m_recycleContent = false;//true;
|
|
|
|
// avoid looking up ip of each outlink to add "firstip" tag to
|
|
// tagdb because that can be slow!!!!!!!
|
|
xd->m_spiderLinks = false;
|
|
xd->m_spiderLinks2 = false;
|
|
xd->m_spiderLinksValid = true;
|
|
|
|
// . newOnly is true --> do not inject if document is already
|
|
// indexed!
|
|
// . maybe just set indexCode
|
|
xd->m_newOnly = true;//false;//newOnly;
|
|
// need to refresh it!!
|
|
//xd->m_newOnly = false;//newOnly;
|
|
|
|
// turn off robots.txt lookups
|
|
xd->m_isAllowed = true;
|
|
xd->m_isAllowedValid = true;
|
|
xd->m_crawlDelay = -1; // unknown
|
|
xd->m_crawlDelayValid = true;
|
|
|
|
// log it now
|
|
log("inject: indexing outlink %s (hash=%"UINT32")",wwwLink,
|
|
(uint32_t)linkHash32);
|
|
|
|
// costs one API unit, which is one cent. but if we do
|
|
// top 50 on google, top 50 on procog, it can be like
|
|
// $1 every time we do this.
|
|
//xd->injectAhrefsLinks();
|
|
|
|
bool status = true;
|
|
|
|
// this will tell it to index ahrefs first before indexing
|
|
// the doc. but do NOT do this if we are from ahrefs.com
|
|
// ourselves to avoid recursive explosion!!
|
|
xd->m_downloadLevel = m_downloadLevel + 1;
|
|
xd->m_useAhrefs = m_useAhrefs;
|
|
|
|
// inherit dedup tables as well!
|
|
xd->m_linkDedupTablePtr = m_linkDedupTablePtr;
|
|
|
|
// . now tell it to index
|
|
// . this returns false if blocked
|
|
status = xd->indexDoc ( );
|
|
|
|
// log it. i guess only for errors when it does not block?
|
|
// because xmldoc.cpp::indexDoc calls logIt()
|
|
if ( status ) xd->logIt();
|
|
// otherwise, it blocks
|
|
else {
|
|
m_blocked++;
|
|
log("xmldoc: blockedout=%"INT32" slotj=%"INT32" "
|
|
"(this=0x%"PTRFMT",xd=0x%"PTRFMT")",
|
|
m_blocked,j,(PTRTYPE)this,(PTRTYPE)xd);
|
|
m_used[j] = true;
|
|
}
|
|
}
|
|
|
|
// return true if all done
|
|
return (m_blocked == 0);
|
|
}

void XmlDoc::doneInjecting ( XmlDoc *xd ) {
	// find it in our list
	int32_t i;
	for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
		if ( ! m_used[i] ) continue;
		if ( m_xmlDocs[i] != xd ) continue;
		break;
	}
	// core if not found in our list, it must be there
	if ( i >= MAX_XML_DOCS ) { char *xx=NULL;*xx=0; }
	// free it up now!
	m_used[i] = 0;
	// free it up
	//mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
	//delete ( m_xmlDocs[i] );
	//m_xmlDocs[i] = NULL;
	m_xmlDocs[i]->reset();
	// uncount it as being outstanding
	m_blocked--;
	// log debug
	log("xmldoc: blockedin=%"INT32" (this=0x%"PTRFMT")",
	    m_blocked,(PTRTYPE)this);
	// return if still blocked
	if ( ! doInjectLoop() ) return;
	// log debug
	log("xmldoc: final callback");
	// ok, all have been indexed
	m_finalCallback ( m_finalState );
}
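// Slot lifecycle recap: doInjectLoop() marks m_used[j] and bumps m_blocked
// only when an inject actually blocks; doneInjectingWrapper() routes the
// child's callback back here via m_hack, and this function resets the pooled
// XmlDoc (rather than deleting it) so the slot can be reused by the next
// doInjectLoop() pass. m_finalCallback only fires once m_blocked reaches 0
// and doInjectLoop() reports no more work.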
|
|
|
|
bool XmlDoc::injectAhrefsLinks ( ) {
|
|
|
|
setStatus("get inlinks from ahrefs.com");
|
|
|
|
// skip for now
|
|
//return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
|
|
// make the ahrefs urls
|
|
try { m_ahrefsDoc = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return true;
|
|
}
|
|
mnew ( m_ahrefsDoc , sizeof(XmlDoc),"xmldocah");
|
|
// make the url
|
|
SafeBuf ubuf;
|
|
// turn count down to 10 for now
|
|
ubuf.safePrintf("http://api.ahrefs.com/get_backlinks.php?count=350&mode=exact&output=xml&AhrefsKey=0452f27fd5a7fec5e9702e23ba4af223&target=");
|
|
//ubuf.safePrintf("http://www.gigablast.com/?q=poo&u=");
|
|
ubuf.urlEncode (m_firstUrl.getUrl() );
|
|
Url url;
|
|
url.set ( ubuf.getBufStart() );
|
|
char *up = url.getUrl();
|
|
// set by url i guess
|
|
SpiderRequest sreq;
|
|
sreq.reset();
|
|
strcpy(sreq.m_url,up);
|
|
// parentdocid of 0
|
|
int32_t firstIp = hash32n(up);
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
sreq.setKey( firstIp,0LL, false );
|
|
sreq.m_isInjecting = 1;
|
|
sreq.m_isPageInject = 1;
|
|
sreq.m_hopCount = 0;//hopCount;
|
|
sreq.m_hopCountValid = 1;
|
|
sreq.m_fakeFirstIp = 1;
|
|
sreq.m_firstIp = firstIp;
|
|
// int16_tcut
|
|
XmlDoc *ah = m_ahrefsDoc;
|
|
|
|
// . use the enormous power of our new XmlDoc class
|
|
// . this returns false with g_errno set on error
|
|
if ( ! ah->set4 ( &sreq ,
|
|
NULL ,
|
|
cr->m_coll ,
|
|
NULL , // pbuf
|
|
// give it a niceness of 1, we have to be
|
|
// careful since we are a niceness of 0!!!!
|
|
m_niceness, // 1 ,
|
|
// inject this content
|
|
NULL, // content ,
|
|
false, // deleteFromIndex ,
|
|
0, // forcedIp ,
|
|
0, // contentType ,
|
|
0, // lastSpidered ,
|
|
false )) { // hasMime
|
|
log("xmldoc: ahref doc error %s",mstrerror(g_errno));
|
|
// g_errno should be set if that returned false
|
|
return true;
|
|
}
|
|
// do not re-call the set
|
|
//m_needsSet = false;
|
|
// make this our callback in case something blocks
|
|
//ah->setCallback ( state , callback );
|
|
// do not re-lookup the robots.txt
|
|
ah->m_isAllowed = true;
|
|
ah->m_isAllowedValid = true;
|
|
ah->m_crawlDelay = -1; // unknown
|
|
ah->m_crawlDelayValid = true;
|
|
|
|
ah->m_downloadLevel = m_downloadLevel + 1;
|
|
|
|
// reset domain table for deduping ahref's links by domain
|
|
// before injecting them... only inject one per domain
|
|
if ( ! m_domDedupTablePtr ) {
|
|
m_domDedupTable.set(4,0,512,NULL,0,false,m_niceness,"dmtab2");
|
|
m_domDedupTablePtr = &m_domDedupTable;
|
|
}
|
|
|
|
// log it now
|
|
//log("inject: indexing injected doc %s",url);
|
|
|
|
// if we are a url like api.ahrefs.com/get_backlinks... then
|
|
// our links can use our table for deduping based on domain, AND
|
|
// they can use our link dedup table in case one outlink is also
|
|
// a search result on google's page...
|
|
if ( ! ah->injectLinks ( m_linkDedupTablePtr,
|
|
m_domDedupTablePtr,
|
|
m_masterState ,
|
|
m_masterLoop ) )
|
|
return false;
|
|
|
|
return true;
|
|
}

bool XmlDoc::printSerpFiltered ( Section *sx , char *tagName ) {
	//int64_t *wids = m_words.getWordIds();
	char **wptrs = m_words.getWords();
	int32_t *wlens = m_words.getWordLens();
	int32_t fa = sx->m_firstWordPos;
	nodeid_t *tids = m_words.getTagIds();
	if ( fa > 0 && tids[fa-1] == TAG_B ) fa--;
	if ( fa > 0 && tids[fa-1] == TAG_STRONG ) fa--;
	int32_t la = sx->m_b;
	int32_t nw = m_words.getNumWords();
	if ( la+1 < nw && tids[la+1] == (TAG_B|BACKBIT) ) la++;
	if ( la+1 < nw && tids[la+1] == (TAG_STRONG|BACKBIT) ) la++;

	// advance la even more if regular words or br tags or b or strong tags
	for ( ; la < nw ; la++ ) {
		if ( ! tids[la] ) continue;
		if ( (tids[la]&BACKBITCOMP) == TAG_BR ) continue;
		if ( (tids[la]&BACKBITCOMP) == TAG_STRONG ) continue;
		if ( tids[la] == TAG_BR ) continue;
		break;
	}

	m_serpBuf.safePrintf("\t\t\t<%s>",tagName);
	// cdata!
	m_serpBuf.safePrintf("<![CDATA[");
	// subtract 1 from sx->m_b to avoid ending tag
	for ( int32_t i = fa ; i < la ; i++ ) {
		// skip if br
		if ( tids[i] == TAG_BR ) continue;
		m_serpBuf.cdataEncode ( wptrs[i] , wlens[i] );
	}
	// cdata!
	m_serpBuf.safePrintf("]]>");
	m_serpBuf.safePrintf("</%s>\n",tagName);
	return true;
}
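// Shape of the XML this appends to m_serpBuf for each scraped result
// (illustrative text; the tag layout comes from doInjectLoop() and the
// CDATA wrapping above):
//
//   <result>
//     <title><![CDATA[Example page title]]></title>
//     <sum><![CDATA[First non-link sentence used as the summary...]]></sum>
//     <url>http://example.com/page</url>
//   </result>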

//////////
//
// BEGIN NEW SEO MATCHING QUERIES TOOL CODE
//
//////////

static void loadTitleRecFromDiskOrSpiderWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	if ( ! THIS->loadTitleRecFromDiskOrSpider() ) return;
	THIS->m_callback1 ( THIS->m_state );
}

// . if we can't load titlerec from titledb, spider it, index it and
//   use that new titlerec
// . returns false if blocks
// . returns true and sets g_errno on error
bool XmlDoc::loadTitleRecFromDiskOrSpider() {

	if ( ! m_masterLoop ) {
		m_masterState = this;
		m_masterLoop = loadTitleRecFromDiskOrSpiderWrapper;
	}

	// fix a core when getTermListBuf() calls getMetaList()
	// which calls getNewSpiderReply() which calls
	// getDownloadEndTime() and tries to download the page
	// even though we have a valid titlerec!
	if ( ! m_downloadEndTimeValid ) {
		m_downloadEndTimeValid = true;
		m_downloadEndTime = 0;
	}

	// . try to recycle the content first
	// . try to load it from title rec first
	// . we have to do this otherwise our ptr_linkInfo link texts
	//   will be somewhat random and cause us to get different scores
	//   for the queries we match!!
	// . so do this not just for speed, but to be consistent.
	if ( ! loadFromOldTitleRec() ) return false;

	// if we got the old titlerec we are done. otherwise fall through,
	// ignore the not-found error and just index it.
	if ( m_oldTitleRecValid && m_oldTitleRec )
		return true;

	// ok, we gotta index it
	if ( ! m_loggedMsg3 ) {
		m_loggedMsg3 = true;
		log("xmldoc: url %s not in titledb, spidering and indexing",
		    m_firstUrl.m_url);
	}

	// clear that
	g_errno = 0;

	// turn off recycling i guess since we don't have it
	m_recycleContent = false;

	// first index it, but only if not already indexed
	// did it block?
	// error indexing doc? indexCode should be set then
	if ( ! indexDoc() ) return false;

	// no blocking
	return true;
}
|
|
|
|
/*
|
|
void getSEOQueryInfoWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// note it
|
|
THIS->setStatus ( "seoqueryinfowrapper" );
|
|
// make sure has not been freed from under us!
|
|
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
|
|
// note it
|
|
THIS->setStatus ( "in seo query info wrapper" );
|
|
// return if it blocked
|
|
if ( THIS->getSEOQueryInfo( ) == (void *)-1 ) return;
|
|
// print any error
|
|
if ( g_errno )
|
|
log("seopipe: getSeoQueryInfo error: %s",mstrerror(g_errno));
|
|
// all done
|
|
else
|
|
log("seopipe: getSeoQueryInfo is done");
|
|
// show timing info
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - THIS->m_beginSEOTime;
|
|
log("seopipe: time: getSeoQueryInfo took %"INT64"ms",took);
|
|
// otherwise, all done, call the caller callback
|
|
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
|
|
else THIS->m_callback2 ( THIS->m_state );
|
|
return;
|
|
}
|
|
|
|
void getSEOQueryInfoWrapper2 ( int fd , void *state ) {
|
|
// just pump! otherwise we might re-launch a msg3a request while
|
|
// one is outstanding causing a core in Multicast::reset()
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// debug log
|
|
THIS->setStatus ("getseoqueryinfowrapper2");
|
|
// if we are waiting just on the pump i guess we are all done!
|
|
if ( ! THIS->m_socketWriteBufValid ) {
|
|
log("seopipe: pumping socket");
|
|
THIS->pumpSocketWriteBuf();
|
|
return;
|
|
}
|
|
// not pumping?
|
|
log("seopipe: pumping socket ready wrapper");
|
|
// otherwise, let it call the callback
|
|
getSEOQueryInfoWrapper ( state );
|
|
}
|
|
|
|
// . return safebuf of xml containing matching and related queries and
|
|
// related urls/titles
|
|
// . this transmits the xml as it generates it to "m_seoSocket" if non-null
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . stores the xml in the m_socketWriteBuf SafeBuf
|
|
// . will keep blocking (returning -1) until the xml is delivered to socket
|
|
// if it is non-NULL
|
|
SafeBuf *XmlDoc::getSEOQueryInfo ( ) {
|
|
|
|
setStatus ( "seo query info" );
|
|
|
|
// only set to valid once it has been all written out!!
|
|
if ( m_socketWriteBufValid ) {
|
|
// all done?
|
|
if ( ! m_seoSocket ) return &m_socketWriteBuf;
|
|
// pump
|
|
pumpSocketWriteBuf();
|
|
// if socket not done being pumped... we block. it's
|
|
// ready wrappers should re-call our wrapper.
|
|
if ( m_socketWriteBufSent >= m_socketWriteBuf.length() )
|
|
return &m_socketWriteBuf;
|
|
// wait for write to finish
|
|
return (SafeBuf *)-1;
|
|
}
|
|
|
|
// the g_errno could be a title rec not found reply coming back
|
|
// so do not process that here! it needs to be processed
|
|
// by the function whose request resulted in an error reply.
|
|
// for instances, the getTitle() call below needs to set g_errno
|
|
// when we call it now, responding to its msg22 reply.
|
|
//if ( g_errno ) return NULL;
|
|
|
|
// a good place to init stuff we need here
|
|
if ( ! m_masterState ) {
|
|
m_printedQueries = false;
|
|
m_printedRelatedDocIds = false;
|
|
m_printedRelatedQueries = false;
|
|
m_printedRecommendedLinks = false;
|
|
m_printedScoredInsertableTerms = false;
|
|
//m_docIndexed = false;
|
|
// time it
|
|
m_beginSEOTime = gettimeofdayInMilliseconds();
|
|
// for our m_masterLoop function, it uses this as the state
|
|
m_masterState = this;
|
|
// this is a main entry point function so anything that blocks
|
|
// should re-call this function
|
|
m_masterLoop = getSEOQueryInfoWrapper;
|
|
// assume indexed
|
|
m_docIndexed = true;
|
|
// fix a core when getTermListBuf() calls getMetaList()
|
|
// which calls getNewSpiderReply() which calls
|
|
// getDownloadEndTime() and tries to download the page
|
|
// even though we have a valid titlerec!
|
|
if ( ! m_downloadEndTimeValid ) {
|
|
m_downloadEndTimeValid = true;
|
|
m_downloadEndTime = 0;
|
|
}
|
|
}
|
|
|
|
// . try to load it from title rec first
|
|
// . we have to do this otherwise our ptr_linkInfo link texts
|
|
// will be somewhat random and cause us to get different scores
|
|
// for the queries we match!!
|
|
// . so do this not just for speed, but to be consistent.
|
|
if ( m_recycleContent && ! loadFromOldTitleRec()) return (SafeBuf *)-1;
|
|
|
|
// did that fail? i.e. not found!?!?! ignore and just indexx it
|
|
if ( m_oldTitleRecValid && ! m_oldTitleRec && m_recycleContent ) {
|
|
// just skip this asshole then
|
|
log("xmldoc: url %s load3 failed",m_firstUrl.m_url);
|
|
// clear that
|
|
g_errno = 0;
|
|
// need to index it
|
|
m_docIndexed = false;
|
|
}
|
|
|
|
// first index it, but only if not already indexed
|
|
if ( ! m_docIndexed ) {
|
|
// turn off recycling i guess since we don't have it
|
|
m_recycleContent = false;
|
|
// did it block?
|
|
// eror indexing doc? indexCode should be set then
|
|
if ( ! indexDoc() ) return (SafeBuf *)-1;
|
|
// do not re-call
|
|
m_docIndexed = true;
|
|
}
|
|
|
|
|
|
// was indexing successful?
|
|
int32_t *indexCode = getIndexCode();
|
|
if ( ! indexCode || indexCode == (void *)-1 )
|
|
return (SafeBuf *)indexCode;
|
|
|
|
// if not successfully indexed send back error msg
|
|
if ( *indexCode && m_seoSocket ) {
|
|
m_socketWriteBuf.safePrintf(
|
|
"\t<errorMsg><![CDATA[%s]]>"
|
|
"</errorMsg>\n"
|
|
"</response>"
|
|
, mstrerror(*indexCode) );
|
|
// send on socket
|
|
pumpSocketWriteBuf();
|
|
// if socket not done being pumped... we block
|
|
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
|
|
return (SafeBuf *)-1;
|
|
// otherwise, we are done sending
|
|
return &m_socketWriteBuf;
|
|
}
|
|
|
|
|
|
// seo.cpp needs this in printDupSentences
|
|
Sections *sections = getSectionsWithDupStats();
|
|
if ( ! sections || sections == (void *)-1) return (SafeBuf *)sections;
|
|
|
|
// seo.cpp needs this now when it calls getSiteRank()
|
|
int32_t *sni = getSiteNumInlinks();
|
|
if ( ! sni || sni == (void *)-1 ) return (SafeBuf *)sni;
|
|
|
|
// . find all logged queries that this document matches
|
|
// . this will launch msg99 requests to each host in the network
|
|
// . then it scores them
|
|
// . don't worry about sending back in real-time for this since it
|
|
// should be fast
|
|
SafeBuf *qpbuf = getMatchingQueriesScored();
|
|
if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
|
|
|
|
// . how many queries do we have that match this url?
|
|
// . they should be sorted by our url's score
|
|
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
|
|
// int16_tcut
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
// cast the msg99 reply ptrs, i.e. query ptrs
|
|
Msg99Reply **queryPtrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
|
|
// store each one as xml then into m_headerBuf99
|
|
if ( ! m_printedQueries && m_seoSocket ) {
|
|
m_printedQueries = true;
|
|
// do not flood the socket! so limit to 1000 queries
|
|
// they should be sorted by queryImportance!
|
|
// cheatcodes.com has like 50,000 matching queries.
|
|
int32_t max = numQueryPtrs;
|
|
if ( max > 1000 ) max = 1000;
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
// int16_tcut
|
|
Msg99Reply *qp = queryPtrs[i];
|
|
// sometimes queries like 'gallery-view' are
|
|
// hard-phrased and do not show up for us, so skip.
|
|
// they should be at the very end so we should be
|
|
// trimming the tail for them, so don't worry about
|
|
// <queryNum> having holes in it.
|
|
if ( qp->m_myDocId == 0LL && qp->m_myScore == 0.0 )
|
|
continue;
|
|
// int16_tcut
|
|
QueryLogEntry *qe = &qp->m_queryLogEntry;
|
|
sb->safePrintf("\t<seoQuery>\n"
|
|
"\t\t<queryNum>%"INT32"</queryNum>\n"
|
|
"\t\t<query><![CDATA[%s]]></query>\n"
|
|
"\t\t<queryTrafficPerDay>%"INT32""
|
|
"</queryTrafficPerDay>\n"
|
|
// our url's score
|
|
"\t\t<myDocId>%"INT64"</myDocId>\n"
|
|
"\t\t<myScore>%f</myScore>\n"
|
|
//"\t\t<mySiteHash32>%"UINT32""
|
|
//"</mySiteHash32>\n"
|
|
"\t\t<queryImportance>%f"
|
|
"</queryImportance>\n"
|
|
|
|
|
|
"\t</seoQuery>\n"
|
|
, i
|
|
, qp->m_queryStr
|
|
// x 10 to estimate google?
|
|
, qe->m_gigablastTraffic *
|
|
GB_TRAFFIC_MODIFIER
|
|
, qp->m_myDocId
|
|
, qp->m_myScore
|
|
//, qp->m_mySiteHash32
|
|
, qp->m_queryImportance
|
|
//,qp->m_queryInfo.m_numUniqueWordForms
|
|
//,qp->m_queryInfo.m_numRepeatWordForms
|
|
//qp->m_queryInfo.m_smallestNormTermFreq
|
|
);
|
|
}
|
|
}
|
|
|
|
// pump it some. i.e. send m_socketWriteBuf contents back to
|
|
// m_seoSocket if it is non-NULL
|
|
pumpSocketWriteBuf();
|
|
|
|
// . now instead try getting the top "imax" queries scored on the
|
|
// whole index
|
|
// . transmit them back on m_seoSocket AS WE GET THEM by calling
|
|
// pumpSocketWriteBuf() function and storing into m_socketWriteBuf
|
|
//qpbuf = getMatchingQueriesScoredForFullQuery ( );
|
|
//if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
|
|
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
// how many related docids do we have?
|
|
int32_t nr = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
//
|
|
// print out the related urls
|
|
//
|
|
if ( ! m_printedRelatedDocIds && nr && m_seoSocket ) {
|
|
m_printedRelatedDocIds = true;
|
|
int32_t max = 200; // m_maxRelatedUrls;
|
|
if ( max == -1 ) max = nr;
|
|
if ( nr < max ) max = nr;
|
|
sb->safePrintf("\t<relatedUrls>\n");
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
RelatedDocId *rd = &rds[i];
|
|
// fix for titlerec not found errors
|
|
char *title = rd->ptr_rd_title;
|
|
char *url = rd->ptr_rd_url;
|
|
if ( ! title ) title = "";
|
|
if ( ! url ) url = "";
|
|
// print it out
|
|
sb->safePrintf("\t\t<relatedUrl>\n"
|
|
"\t\t\t<urlNum>%"INT32"</urlNum>\n"
|
|
"\t\t\t<url><![CDATA[%s]]></url>\n"
|
|
"\t\t\t<docId>%"INT64"</docId>\n"
|
|
"\t\t\t<siteHash32>%"UINT32"</siteHash32>\n"
|
|
"\t\t\t<title><![CDATA["
|
|
, i
|
|
, url
|
|
, rd->m_docId
|
|
, rd->m_siteHash32
|
|
);
|
|
// encode CDATA stuff in title
|
|
sb->cdataEncode(title);
|
|
sb->safePrintf("]]></title>\n"
|
|
"\t\t\t<queriesInCommon>%"INT32""
|
|
"</queriesInCommon>\n"
|
|
"\t\t\t<similarityScore>%f"
|
|
"</similarityScore>\n"
|
|
, rd->m_numCommonQueries
|
|
, rd->m_dotProduct // similarityScore
|
|
);
|
|
// print the actualy querynums in common
|
|
int32_t firstOff = rd->m_firstCommonQueryNumOff;
|
|
int32_t offset = firstOff;
|
|
sb->safePrintf("\t\t\t<queriesInCommon>\n");
|
|
for ( ; offset >= 0 ; ) {
|
|
// get that node
|
|
char *buf = m_commonQueryNumBuf.getBufStart();
|
|
// and offset
|
|
buf += offset;
|
|
// then cast
|
|
QueryNumLinkedNode *qn;
|
|
qn = (QueryNumLinkedNode *)buf;
|
|
// print that
|
|
sb->safePrintf("\t\t\t\t<queryNum>%"INT32""
|
|
"</queryNum>\n"
|
|
, qn->m_queryNum );
|
|
// advance. will be -1 when done
|
|
offset = qn->m_nextOff;
|
|
}
|
|
sb->safePrintf("\t\t\t</queriesInCommon>\n");
|
|
sb->safePrintf("\t\t</relatedUrl>\n");
|
|
}
|
|
sb->safePrintf("\t</relatedUrls>\n");
|
|
}
|
|
|
|
|
|
//
|
|
// recommended inlinks!
|
|
//
|
|
|
|
// pump it some. i.e. send m_socketWriteBuf contents back to
|
|
// m_seoSocket if it is non-NULL
|
|
pumpSocketWriteBuf();
|
|
|
|
SafeBuf *kbuf = getRecommendedLinksBuf();
|
|
if ( ! kbuf || kbuf == (void *)-1 ) return kbuf;
|
|
|
|
// print out the recommended links in xml
|
|
if ( ! m_printedRecommendedLinks && m_seoSocket ) {
|
|
sb->safePrintf("\t<recommendedLinks>\n");
|
|
char *p = kbuf->getBufStart();
|
|
char *pend = kbuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
// cast it
|
|
RecommendedLink *ri = (RecommendedLink *)p;
|
|
// skip it
|
|
p += ri->getSize();
|
|
// print it out
|
|
sb->safePrintf("\t\t<link>\n"
|
|
"\t\t\t<url><![CDATA[%s]]></url>\n"
|
|
"\t\t\t<title><![CDATA[%s]]></title>\n"
|
|
"\t\t\t<score>%f</score>\n"
|
|
"\t\t\t<siteRank>%"INT32"</siteRanke>\n"
|
|
,ri->getUrl(kbuf)
|
|
,ri->getTitle(kbuf)
|
|
,ri->m_totalRecommendedScore
|
|
,(int32_t)ri->m_siteRank
|
|
);
|
|
}
|
|
sb->safePrintf("\t</recommendedLinks>\n");
|
|
m_printedRecommendedLinks = true;
|
|
}
|
|
|
|
|
|
//
|
|
// related queries
|
|
//
|
|
|
|
// write out
|
|
pumpSocketWriteBuf();
|
|
|
|
|
|
SafeBuf *relBuf = getRelatedQueryBuf();
|
|
if ( ! relBuf || relBuf == (void *)-1 ) return relBuf;
|
|
QueryRel **rels = (QueryRel **)relBuf->getBufStart();
|
|
int32_t numRels = relBuf->length() / sizeof(QueryRel *);
|
|
|
|
//
|
|
// print out the related queries
|
|
//
|
|
if ( ! m_printedRelatedQueries && numRels && m_seoSocket ) {
|
|
sb->safePrintf("\t<relatedQueries>\n");
|
|
int32_t max = 200; // m_maxRelatedQueries;
|
|
if ( max == -1 ) max = numRels;
|
|
if ( numRels < max ) max = numRels;
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
QueryRel *rel = rels[i];
|
|
// must be a first!
|
|
if ( ! rel->m_isFirst ) { char *xx=NULL;*xx=0; }
|
|
// int16_tcut
|
|
//QueryInfo *qi = &rel->m_queryInfo;
|
|
// print it out
|
|
sb->safePrintf("\t\t<relatedQuery>\n"
|
|
"\t\t\t<query><![CDATA[%s]]></query>\n"
|
|
"\t\t\t<relatedDocIdsInCommon>%"INT32""
|
|
"</relatedDocIdsInCommon>\n"
|
|
|
|
"\t\t\t<relatedQueryImportance>%f"
|
|
"</relatedQueryImportance>\n"
|
|
|
|
//"\t</relatedUrl>\n"
|
|
, rel->m_queryStr
|
|
, rel->m_docIdVotes
|
|
|
|
//, qi->m_numUniqueWordForms
|
|
//, qi->m_numRepeatWordForms
|
|
//, qi->m_smallestNormTermFreq
|
|
|
|
, rel->m_totalRelatedQueryImportance
|
|
//, qi->m_myScoreRelated
|
|
);
|
|
// print details!
|
|
sb->safePrintf("\t\t\t<matchingDocIds>\n");
|
|
// linked list of Msg99Replies for the related queries.
|
|
// all in linked list are for the same query but
|
|
// restricted to a different docid!
|
|
for ( ; rel ; rel = rel->m_next ) {
|
|
// get his related docid
|
|
RelatedDocId *rd = rel->m_relatedDocId;
|
|
// print that
|
|
sb->safePrintf("\t\t\t\t<match>\n"
|
|
"\t\t\t\t\t<relatedDocId>%"INT64""
|
|
"</relatedDocId>\n"
|
|
"\t\t\t\t\t<siteHash32>%"UINT32""
|
|
"</siteHash32>\n"
|
|
//"\t\t\t\t\t"
|
|
//"<queryImportance>%f"
|
|
//"</queryImportance>\n"
|
|
"\t\t\t\t\t<docIdSimilarity>%f"
|
|
"</docIdSimilarity>\n"
|
|
"\t\t\t\t\t<docIdScore>%f"
|
|
"</docIdScore>\n"
|
|
"\t\t\t\t</match>\n"
|
|
, rd->m_docId
|
|
, rd->m_siteHash32
|
|
//, rd->m_similarityScore
|
|
, rd->m_dotProduct
|
|
, rel->m_myScore
|
|
);
|
|
}
|
|
sb->safePrintf("\t\t\t</matchingDocIds>\n");
|
|
sb->safePrintf("\t\t</relatedQuery>\n");
|
|
|
|
}
|
|
sb->safePrintf("\t</relatedQueries>\n");
|
|
m_printedRelatedQueries = true;
|
|
}
|
|
|
|
|
|
// write out
|
|
pumpSocketWriteBuf();
|
|
|
|
// this is the Keyword Insertion Tool data (KIT data)
|
|
SafeBuf *sits = getScoredInsertableTerms();
|
|
if ( ! sits || sits == (void *)-1 ) return sits;
|
|
|
|
// try to store into cachedb in case user clicks a different
|
|
// insertable term and we have to update the wordposinfo::m_rankChange
|
|
// stuff in the html src display
|
|
//if ( ! storeIntoCachedb() )
|
|
// // return -1 if it blocked and wait for store to complete
|
|
// return (SafeBuf *)-1;
|
|
|
|
|
|
// print out query changes
|
|
if ( ! m_printedScoredInsertableTerms && m_seoSocket ) {
|
|
// dump out each insertable term and it's corresponding
|
|
// QueryChanges
|
|
if ( ! printScoredInsertableTerms ( sb ) )
|
|
return NULL;
|
|
m_printedScoredInsertableTerms = true;
|
|
// end of xml response?
|
|
sb->safePrintf("</response>\n");
|
|
}
|
|
|
|
// even if not fully pumped, set it to valid here
|
|
m_socketWriteBufValid = true;
|
|
|
|
if ( ! m_seoSocket ) return &m_socketWriteBuf;
|
|
|
|
// write out
|
|
pumpSocketWriteBuf();
|
|
|
|
// if socket not done being pumped... we block
|
|
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
|
|
return (SafeBuf *)-1;
|
|
|
|
// ok, we are done
|
|
return &m_socketWriteBuf;
|
|
}
|
|
*/
|
|
|
|
// have the smallest twids on top!
int twidcmp ( const void *a, const void *b ) {
	TermInfo *ua = (TermInfo *)a;
	TermInfo *ub = (TermInfo *)b;
	//uint32_t ua = *(uint32_t *)a;
	//uint32_t ub = *(uint32_t *)b;
	// HACKY: sort by lower 32 bits of the 64 bit termids so
	// seo.cpp can use them with its QueryLogEntries which use 32 bit
	// termids to save mem.
	uint32_t ta = (uint32_t)ua->m_termId64;
	uint32_t tb = (uint32_t)ub->m_termId64;
	// lower first
	if ( ta > tb ) return 1; // swap
	if ( ta < tb ) return -1;
	return 0;
}
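// Concrete effect of the truncation (made-up termids for illustration): a
// 64-bit termid like 0x00000002fffffff0 sorts AFTER 0x0000000900000001,
// because only the low 32 bits (0xfffffff0 vs 0x00000001) are compared.
// That is the order seo.cpp expects, since its QueryLogEntry termids are
// already truncated to 32 bits to save memory.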

// . 1. make a vector of the words in the title, headers, page-inlink-text,
//      and site-inlink-text
//
// . 2. pass that word vector to every machine in the network to see what
//      queries in the query logs we match. use Msg99.cpp. it should
//      initialize on startup and load in its share of the query logs. the
//      query log file should be sorted, then sorted by filtered query, then
//      split. should also remove queries from the most aggressive IPs
//      (bots). we would need a program, filterquerylog.cpp, to do all that
//      on gk37, our query log storage server. it needs to store the # of
//      times each query was done, too. all queries should have back-to-back
//      spaces removed and be made lowercase. remove queries that have double
//      quotes or colon operators in them. index each query term in the query
//      log into a HashTableX, which will point to the query in the buffer.
//      then we just store the termlist in a SafeBuf that we save on disk.
//      40GB of queries split 256 ways is still like 175MB per server!
//      (if one server is dead, skip it)
//
// . 3. merge all queries received from all hosts and sort by traffic.
//
// . 4. perform the queries on procog and cache the scores of the top 10
//      results for each query. should be cached on the machine that houses
//      the query. try a 60-day cache max age.
//
// . 5. now redo the queries but with a "url:thisurl |" to get this page's
//      score for each query. if the min score of the query on procog is
//      well beyond our grasp, we could just skip it.
//
// . 6. then determine the # of inlinks we need to add to get more traffic
//      for each query. assume a siterank of 0 per inlink. if that would be
//      impossible then increment the siterank until it gets us into the
//      top 10.
//

// just use getTopTermsVector
HashTableX *XmlDoc::getTermIdBufDedupTable32 ( ) {
	SafeBuf *tiBuf = getTermInfoBuf();
	if ( ! tiBuf || tiBuf == (void *)-1 ) return (HashTableX *)tiBuf;
	return &m_tidTable32;
}

// . used by handleRequest8e() which uses msg20::getSummary() with
//   m_getTermListBuf to call this in the local host msg20 handler.
// . this buf is used to determine what queries this document matches
SafeBuf *XmlDoc::getTermId32Buf() {

	if ( m_termId32BufValid )
		return &m_termId32Buf;

	SafeBuf *tiBuf = getTermInfoBuf ();
	if ( ! tiBuf || tiBuf == (void *) -1 ) return tiBuf;

	int32_t need = 4 * (tiBuf->length() / sizeof(TermInfo));
	if ( ! m_termId32Buf.reserve(need) ) return NULL;

	// scan those
	char *p = tiBuf->getBufStart();
	char *pend = tiBuf->getBuf();
	uint32_t last = 0;
	for ( ; p < pend ; ) {
		TermInfo *ti = (TermInfo *)p;
		p += sizeof(TermInfo);
		uint32_t tid32 = (uint32_t)(ti->m_termId64);
		m_termId32Buf.pushLong(tid32);
		// sanity
		if ( last && tid32 <= last ) { char *xx=NULL;*xx=0; }
		last = tid32;
	}

	m_termId32BufValid = true;
	return &m_termId32Buf;
}
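// Because both this buffer and the query-side termid lists are sorted
// ascending by their low 32 bits, a linear merge is enough to find matches.
// A minimal sketch of that kind of intersection (qids/nq are hypothetical;
// the real matching lives in seo.cpp's handleRequest8e()):
//
//   uint32_t *doc = (uint32_t *)m_termId32Buf.getBufStart();
//   int32_t   nd  = m_termId32Buf.length() / 4;
//   // qids/nq = one query's 32-bit termids, also sorted ascending
//   int32_t di = 0, qi = 0, matches = 0;
//   while ( di < nd && qi < nq ) {
//           if      ( doc[di] < qids[qi] ) di++;
//           else if ( doc[di] > qids[qi] ) qi++;
//           else { matches++; di++; qi++; }
//   }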
|
|
|
|
// . used by getTermId32Buf() for getting this document's matching queries
|
|
// . serialize the words in the title and inlink text into a vector
|
|
// . SafeBuf is filled with class TermInfos! defined in seo.h. currently
|
|
// just a int64_t m_termId64 though!
|
|
// . get synonyms of each word too!
|
|
// . we sort them by the 32-bit termid so handleRequest8e() can do its fast
|
|
// compare algo to find matching queries which are also sorted by the lower
|
|
// 32 bits of terms in the query.
|
|
SafeBuf *XmlDoc::getTermInfoBuf ( ) {
|
|
|
|
setStatus ( "getterminfobuf" );
|
|
|
|
if ( m_termInfoBufValid ) return &m_termInfoBuf;
|
|
|
|
bool includeSynonyms = true;
|
|
|
|
Words *ww = getWords();
|
|
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
|
|
LinkInfo *info1 = getLinkInfo1();
|
|
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
|
|
uint8_t *langId = getLangId();
|
|
if ( ! langId || langId == (uint8_t *)-1 ) return (SafeBuf *)langId;
|
|
|
|
|
|
if (!m_tidTable32.set(4,0,16384,NULL,0,false,m_niceness,"twidtabl"))
|
|
return NULL;
|
|
|
|
//
|
|
// add document body words now to m_twbuf
|
|
//
|
|
|
|
if ( ! addUniqueWordsToBuf ( &m_termInfoBuf ,
|
|
&m_tidTable32 , // dedup table
|
|
NULL, // filter table
|
|
NULL, // mincounttable
|
|
false ,
|
|
ww ,
|
|
includeSynonyms) )
|
|
return NULL;
|
|
|
|
//
|
|
// store count of each term we hash after this into "TMP"
|
|
//
|
|
HashTableX TMP;
|
|
if(!TMP.set(4,4,4096,NULL,0,false,m_niceness,"tmttt") )
|
|
return NULL;
|
|
|
|
//
|
|
// hash meta desc into TMP table
|
|
//
|
|
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription( &mdlen );
|
|
if ( md ) {
|
|
Words ww3;
|
|
ww3.setx ( md , mdlen , m_niceness );
|
|
if (!addUniqueWordsToBuf(NULL,
|
|
NULL , // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable
|
|
true, // store counts?
|
|
&ww3,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
//
|
|
// hash meta keywords into TMP table
|
|
//
|
|
|
|
int32_t mklen;
|
|
char *mk = getMetaKeywords( &mklen );
|
|
if ( mk ) {
|
|
Words ww4;
|
|
ww4.setx ( mk , mklen , m_niceness );
|
|
if (!addUniqueWordsToBuf(NULL,
|
|
NULL, // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable
|
|
true, // store counts?
|
|
&ww4,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
//
|
|
// hash each link text into TMP table
|
|
//
|
|
|
|
// loop over every link text to this page
|
|
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the link text
|
|
if ( k->size_linkText <= 1 ) continue;
|
|
// set Url
|
|
Url u;
|
|
u.set ( k->getUrl() , k->size_urlBuf );
|
|
// do not allow anomalous link text to match query
|
|
//if ( k->m_isAnomaly ) continue;
|
|
char *p = k-> getLinkText();
|
|
int32_t plen = k->size_linkText - 1;
|
|
if ( ! verifyUtf8 ( p , plen ) ) {
|
|
log("title: set4 bad link text from url=%s",
|
|
k->getUrl());
|
|
continue;
|
|
}
|
|
// debug
|
|
//log("seo: counttable for link text '%s'",k->getLinkText());
|
|
// now the words.
|
|
Words ww2;
|
|
if ( ! ww2.set ( k->getLinkText() ,
|
|
k->size_linkText-1, // len
|
|
TITLEREC_CURRENT_VERSION ,
|
|
true , // computeIds
|
|
m_niceness ))// niceness
|
|
// g_errno set on error, return NULL
|
|
return NULL;
|
|
// int16_tcuts on link text
|
|
if ( ! addUniqueWordsToBuf( NULL,
|
|
NULL, // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable
|
|
true, // store counts?
|
|
&ww2,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
//
|
|
// now only add link texts to main table and buffer if it occurs
|
|
// already in the body, or occurs TWICE in "TMP"
|
|
//
|
|
|
|
|
|
// loop over every link text to this page
|
|
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the link text
|
|
if ( k->size_linkText <= 1 ) continue;
|
|
// set Url
|
|
Url u;
|
|
u.set ( k->getUrl() , k->size_urlBuf );
|
|
// do not allow anomalous link text to match query
|
|
//if ( k->m_isAnomaly ) continue;
|
|
char *p = k-> getLinkText();
|
|
int32_t plen = k->size_linkText - 1;
|
|
if ( ! verifyUtf8 ( p , plen ) ) {
|
|
log("title: set4 bad link text from url=%s",
|
|
k->getUrl());
|
|
continue;
|
|
}
|
|
// now the words.
|
|
Words ww2;
|
|
if ( ! ww2.set ( k->getLinkText() ,
|
|
k->size_linkText-1, // len
|
|
TITLEREC_CURRENT_VERSION ,
|
|
true , // computeIds
|
|
m_niceness ))// niceness
|
|
// g_errno set on error, return NULL
|
|
return NULL;
|
|
|
|
if ( !addUniqueWordsToBuf( &m_termInfoBuf,
|
|
&m_tidTable32, // dedup table
|
|
NULL, // filter table
|
|
&TMP, // mincounttable, >=2 counts
|
|
false, // store counts?
|
|
&ww2,
|
|
includeSynonyms))
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
|
|
// how many 32-bit twids do we got?
|
|
//m_numTwids = m_twbuf.length() / 4;
|
|
//m_twids = (int32_t *)m_twbuf.getBufStart();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . sort that buf now
|
|
// . HACK: only sorts by last 32 bits of termid!!!!
|
|
qsort ( m_termInfoBuf.getBufStart(),
|
|
m_termInfoBuf.length() / sizeof(TermInfo),
|
|
sizeof(TermInfo), // 32-bit twids = 4 bytes
|
|
twidcmp );
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// if no twids then return a -2 ptr, not NULL, that means error
|
|
// not -1 that means blocked!
|
|
//if ( m_numTwids == 0 ) m_twids = (int32_t *)-2;
|
|
// do not repeat this logic
|
|
//m_twidsValid = true;
|
|
m_termInfoBufValid = true;
|
|
// return the vector
|
|
return &m_termInfoBuf;
|
|
}
|
|
|
|
// . just like getTermInfoBuf but also includes terms from related queries
|
|
// that our document does not have!
|
|
// . we do it this way because for seo.cpp::handleRequest95() it finds
|
|
// matching queries locally based on getNewTermInfoBuf()'s m_newTermInfoBuf.
|
|
SafeBuf *XmlDoc::getNewTermInfoBuf ( ) {
|
|
|
|
setStatus ( "getnewterminfobuf" );
|
|
|
|
if ( m_newTermInfoBufValid ) return &m_newTermInfoBuf;
|
|
|
|
SafeBuf *oldBuf = getTermInfoBuf ();
|
|
if ( ! oldBuf || oldBuf == (void *) -1 ) return oldBuf;
|
|
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
|
|
|
|
|
|
// this should be valid automatically
|
|
HashTableX *oldDedupTable = getTermIdBufDedupTable32 ( );
|
|
|
|
|
|
// get old guy
|
|
if ( ! m_newTermInfoBuf.safeMemcpy ( oldBuf ) )
|
|
return NULL;
|
|
|
|
// a dedup table on stack
|
|
HashTableX newDedup32;
|
|
if (! newDedup32.set(4,0,16384,NULL,0,false,m_niceness,"newdtabl"))
|
|
return NULL;
|
|
|
|
// now scan the insertable terms buf
|
|
char *p = itBuf->getBufStart();
|
|
char*pend = itBuf->getBuf();
|
|
// scan each "term" which might be one or more words
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
char *term = it->getTerm();
|
|
Words ww;
|
|
ww.set9 ( term , m_niceness );
|
|
// we add entries to the dedup table, "newDedup32",
|
|
// but only filter and not add to "oldDedupTable"
|
|
if ( ! addUniqueWordsToBuf ( &m_newTermInfoBuf,
|
|
&newDedup32 , // dedup table
|
|
oldDedupTable, // filter table
|
|
NULL, // mincounttable
|
|
false,
|
|
&ww ,
|
|
true ) )
|
|
return NULL;
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . sort that buf now.
|
|
// . HACK: only sorts by last 32 bits of termid!!!!
|
|
qsort ( m_newTermInfoBuf.getBufStart(),
|
|
m_newTermInfoBuf.length() / sizeof(TermInfo),
|
|
sizeof(TermInfo), // 32-bit twids = 4 bytes
|
|
twidcmp );
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
/*
|
|
// set the term freq of each one
|
|
p = m_newTermInfoBuf.getBufStart();
|
|
pend = m_newTermInfoBuf.getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
TermInfo *ti = (TermInfo *)p;
|
|
p += sizeof(TermInfo);
|
|
// look it up
|
|
int64_t tf = g_posdb.getTermFreq (cr->m_coll,ti->m_termId64);
|
|
// store it
|
|
ti->m_termFreq64 = tf;
|
|
}
|
|
*/
|
|
|
|
// do not repeat this logic
|
|
m_newTermInfoBufValid = true;
|
|
// return the vector
|
|
return &m_newTermInfoBuf;
|
|
}
|
|
|
|
bool XmlDoc::addUniqueWordsToBuf ( SafeBuf *termInfoBuf ,
|
|
HashTableX *dedupTable ,
|
|
HashTableX *filterTable ,
|
|
HashTableX *minCountTable ,
|
|
bool storeCounts,
|
|
Words *ww ,
|
|
bool getSynonyms ) {
|
|
|
|
int32_t nw = ww->getNumWords ();
|
|
uint64_t *wids = (uint64_t *)ww->getWordIds ();
|
|
//nodeid_t *tids = ww->getTagIds ();
|
|
uint8_t *langId = getLangId();
|
|
// this should have been set by parent caller
|
|
if ( ! langId || langId == (uint8_t *)-1 ) {char *xx=NULL;*xx=0; }
|
|
// store the langId here
|
|
uint8_t useLangId = *langId;
|
|
// default that to english i guess if unknown
|
|
if ( useLangId == langUnknown ) {
|
|
static XmlDoc *s_lastPrint = NULL;
|
|
if ( s_lastPrint != this ) {
|
|
log("seopipe: langid of page is unknown for twid "
|
|
"synonyms. assuming english.");
|
|
s_lastPrint = this;
|
|
}
|
|
useLangId = langEnglish;
|
|
}
|
|
|
|
Synonyms syn;
|
|
|
|
//bool inTitle = false;
|
|
|
|
// scan for title
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// out of a link
|
|
//if(tids && tids[i] == TAG_TITLE ) inTitle = true;
|
|
//if(tids && tids[i] == (TAG_TITLE | BACKBIT)) inTitle = false;
|
|
// count it, limit to 30
|
|
//if ( inTitle ) tw++;
|
|
// skip if not alnumword
|
|
if ( ! wids[i] ) continue;
|
|
// make it 32 bit
|
|
uint32_t wid32 = (uint32_t)wids[i];
|
|
// filter table
|
|
if ( filterTable && filterTable->isInTable(&wid32) ) continue;
|
|
/*
|
|
// debug
|
|
if ( minCountTable && storeCounts ) {
|
|
int32_t wlen = ww->m_wordLens[i];
|
|
char *wptr = ww->m_words[i];
|
|
char c= wptr[wlen];
|
|
wptr[wlen] = '\0';
|
|
log("seo: storecount wid=%"UINT32" word=%s",
|
|
(uint32_t)((uint64_t)wids[i]),wptr);
|
|
wptr[wlen] = c;
|
|
}
|
|
*/
|
|
// to avoid link text anomalies, the word must have been
|
|
// repeated in another link text or a meta tag. should
|
|
// fix ibm.com from getting 'lincoln' or 'unc' as high-scoring
|
|
// matching queries. should fix artdaily.com from getting
|
|
// that foreign language phrase in danish. (bedste pa nettet)
|
|
// (best of the web)
|
|
if ( minCountTable &&
|
|
! storeCounts &&
|
|
minCountTable->getScore32(&wid32) <= 1 )
|
|
continue;
|
|
// get slot
|
|
if ( dedupTable && dedupTable->isInTable(&wid32) ) continue;
|
|
// count it!
|
|
if ( storeCounts && ! minCountTable->addTerm32(&wid32) )
|
|
return false;
|
|
// show it
|
|
//if ( wid32 == 1174583722 && storeCounts ) {
|
|
// log("seo: storing occurence. current count=%"INT32"",
|
|
// (int32_t)minCountTable->getScore32(&wid32) );
|
|
//}
|
|
// add it to vector
|
|
TermInfo ti;
|
|
ti.m_termId64 = wids[i];
|
|
//ti.m_termFreq64 = -1;
|
|
if ( termInfoBuf && !
|
|
termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
|
|
return false;
|
|
// add it then
|
|
if ( dedupTable && ! dedupTable->addKey ( &wid32 ) )
|
|
return false;
|
|
// do synonyms now?
|
|
if ( ! getSynonyms ) continue;
|
|
// get its synonyms into tmpBuf
|
|
char tmpBuf[TMPSYNBUFSIZE];
|
|
int32_t naids = syn.getSynonyms(ww,i,useLangId,tmpBuf,m_niceness);
|
|
for ( int32_t j = 0 ; j < naids ; j++ ) {
|
|
// get it
|
|
uint32_t aid32 = (uint32_t)syn.m_aids[j];
|
|
// get slot
|
|
if ( dedupTable && dedupTable->isInTable(&aid32) )
|
|
continue;
|
|
// add it to vector
|
|
TermInfo ti;
|
|
ti.m_termId64 = syn.m_aids[j]; // 64 bit version
|
|
//ti.m_termFreq64 = -1;
|
|
if ( termInfoBuf &&
|
|
! termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
|
|
return false;
|
|
// add it then
|
|
if ( dedupTable && ! dedupTable->addKey(&aid32) )
|
|
return false;
|
|
// count it!
|
|
if ( storeCounts && ! minCountTable->addTerm32(&aid32))
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
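// How getTermInfoBuf() drives this helper: the body words go in first (dedup
// table only), then the meta description, meta keywords and every inlink
// text are counted into the TMP table with storeCounts=true, and finally the
// inlink texts are walked a second time with TMP as the minCountTable, so a
// link-text term is only appended to m_termInfoBuf if TMP saw it at least
// twice. Terms already harvested from the body are dropped here by the dedup
// table. This is the anomaly filter described in the comment above -- a
// phrase that appears in a single inlink cannot by itself become a candidate
// matching-query term.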
|
|
|
|
/*
|
|
static void gotMsg99ReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->gotMsg99Reply ( slot );
|
|
}
|
|
|
|
void XmlDoc::gotMsg99Reply ( UdpSlot *slot ) {
|
|
// get replying hostid
|
|
int32_t hostId = slot->m_hostId;
|
|
// log
|
|
setStatus ( "gotmsg99reply" );
|
|
// sanity
|
|
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
|
|
// save it
|
|
int32_t i = m_numMsg99Replies;
|
|
m_msg99ReplyPtrs [i] = slot->m_readBuf;
|
|
m_msg99ReplySizes[i] = slot->m_readBufSize;
|
|
m_msg99ReplyAlloc[i] = slot->m_readBufMaxSize;
|
|
m_msg99HostIds [i] = hostId;
|
|
// steal it so it doesn't free it
|
|
slot->m_readBuf = NULL;
|
|
// note it
|
|
//log("seopipe: got msg99 reply from host #%"INT32" i=%"INT32" alloc=%"INT32"",
|
|
// hostId,i,slot->m_readBufMaxSize);
|
|
// inc the counter
|
|
m_numMsg99Replies++;
|
|
// sanity!
|
|
if ( m_numMsg99Replies > m_numMsg99Requests ) { char *xx=NULL;*xx=0; }
|
|
if ( m_numMsg99Replies > g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
|
|
// don't free the sendbuf, it is shared between all hosts UNLESS
|
|
// we are the last reply received!!!
|
|
if ( m_numMsg99Replies < g_hostdb.m_numHosts )
|
|
slot->m_sendBufAlloc = NULL;
|
|
// return control to transmit function. it will call m_callback1
|
|
// if the function is done. but if a different parent function than
|
|
// transmit called us then we call that. it just depends on the
|
|
// intial entry function that called getMatchingQueries()
|
|
m_masterLoop ( m_masterState );
|
|
}
|
|
*/
|
|
/*
|
|
float getQueryImportance2 ( QueryInfo *qi , float myScore ) {
|
|
// now divide by the top score (or 50th score) for the query
|
|
// so we can see how high we score relatively speaking...
|
|
// although, if all search results for this query have the
|
|
// same score this method kinda sux...
|
|
float imp = myScore / qe->m_minTop50Score;
|
|
return imp;
|
|
// mod because one word query terms get higher scores than
|
|
// multi-word queries because they are divided by distance in
|
|
// the search algo.
|
|
// this hurts 'gigablast' query.
|
|
if ( qi->m_numUniqueWordForms <= 1 ) score /= 10.0;
|
|
// multiply by it?
|
|
score *= qi->m_numUniqueWordForms;
|
|
// until we have the code to fix things like 'coast to coast'
|
|
// where the term is repeated, we have to punish...
|
|
if ( qi->m_numRepeatWordForms >= 1 ) score /= 30.0;
|
|
// kill 'search+engine+search+engine'
|
|
if ( qi->m_numRepeatWordForms >= 2 ) score /= 30.0;
|
|
// if every word in query is repeated... push it down
|
|
// try to fix 'bot+bot' and 'search+search' 'http+http'
|
|
if ( qi->m_numUniqueWordForms == qi->m_numRepeatWordForms )
|
|
score /= 2000.0;
|
|
// fix 'web search search'
|
|
if ( qi->m_numRepeatWordForms > 0 &&
|
|
qi->m_numUniqueWordForms == qi->m_numRepeatWordForms + 1 )
|
|
score /= 200.0;
|
|
// try to kill those queries that are just a single stop word
|
|
// or forms of stop words.
|
|
// this hurts 'gigablast' query, so make it > .9. no, then crap like
|
|
// 'web' and 'http' come up too high...
|
|
if ( qi->m_numUniqueWordForms == 1 ) {
|
|
score *= (1.1 - qi->m_smallestNormTermFreq);
|
|
score *= (1.1 - qi->m_smallestNormTermFreq);
|
|
}
|
|
// http is very common! so make the 'http' or 'http+http' queries
|
|
// very low importance
|
|
if ( qi->m_numControlWordForms == qi->m_numUniqueWordForms )
|
|
score /= 1000000.0;
|
|
// TODO: if query is a single term and it's exact syn min
|
|
// hash is that for 'and' then kill it. fix 'anding'
|
|
|
|
// boost it for more accuracy since we gotta make it into anint
|
|
//score *= 1000;
|
|
return score;
|
|
}
|
|
|
|
// set Msg99Reply::m_queryImportance for all msg99replies
|
|
void setQueryImportance ( Msg99Reply **qptrs , int32_t numQueryPtrs ) {
|
|
}
|
|
|
|
void setQueryImportanceRelated ( QueryRel **qptrs , int32_t numQueryPtrs ) {
|
|
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
|
|
QueryRel *qrel = qptrs[i];
|
|
float score = qrel->m_queryInfo.m_myScoreRelated;
|
|
QueryInfo *qi = &qrel->m_queryInfo;
|
|
float imp = getQueryImportance2 ( qi , score );
|
|
qi->m_queryImportance = imp;
|
|
}
|
|
}
|
|
*/
|
|
/*
|
|
int qp99cmp ( const void *a, const void *b ) {
|
|
Msg99Reply *qa = *(Msg99Reply **)a;
|
|
Msg99Reply *qb = *(Msg99Reply **)b;
|
|
// make sure manually added queries are on top
|
|
if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
|
|
if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
|
|
//QueryInfo *qia = &qa->m_queryInfo;
|
|
//QueryInfo *qib = &qb->m_queryInfo;
|
|
// get scores
|
|
float scorea = qa->m_queryImportance;
|
|
float scoreb = qb->m_queryImportance;
|
|
if ( scorea < scoreb ) return 1;
|
|
if ( scorea > scoreb ) return -1;
|
|
// fallback to traffic otherwise i guess
|
|
int32_t traffica = qa->m_queryLogEntry.m_gigablastTraffic;
|
|
int32_t trafficb = qb->m_queryLogEntry.m_gigablastTraffic;
|
|
if ( qa->m_queryLogEntry.m_googleTraffic != -1 )
|
|
traffica = qa->m_queryLogEntry.m_googleTraffic;
|
|
if ( qb->m_queryLogEntry.m_googleTraffic != -1 )
|
|
trafficb = qb->m_queryLogEntry.m_googleTraffic;
|
|
if ( traffica < trafficb ) return 1;
|
|
if ( traffica > trafficb ) return -1;
|
|
// fallback alphabetical otherwise?
|
|
char *qsa = qa->m_queryStr;
|
|
char *qsb = qb->m_queryStr;
|
|
if ( ! qsa ) return 0;
|
|
if ( ! qsb ) return 0;
|
|
return strcmp( qsa , qsb );
|
|
//return 0;
|
|
}
|
|
*/
|
|
|
|
#include "Cachedb.h"
|
|
|
|
// . only check cachedb once per url
|
|
// . return false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
bool XmlDoc::checkCachedb ( ) {
|
|
|
|
|
|
if ( ! m_readFromCachedb ) return true;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// already set?
|
|
//if ( m_seoInfoSetFromCache )
|
|
// return true;
|
|
|
|
// return -1 if this blocked
|
|
if ( ! m_checkedCachedb ) {
|
|
// we now use the contenthash as part of the key because the
|
|
// data we cache is dependent on the content. i guess we don't
|
|
// need to use the user id then...
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
// first check cachedb. enum type cr_MatchingQueries
|
|
int32_t uh32 ;
|
|
uh32 =(uint32_t)((uint64_t)getFirstUrlHash64());
|
|
key_t sk = g_cachedb.makeStartKey ( uh32 , ch32 );
|
|
key_t ek = g_cachedb.makeEndKey ( uh32 , ch32 );
|
|
// debug
|
|
log("seo: checking cachedb uh32=%"UINT32" ch32=%"UINT32"",
|
|
(uint32_t)uh32,
|
|
(uint32_t)ch32);
|
|
// do not repeat
|
|
m_checkedCachedb = true;
|
|
// . get it from the appropriate host
|
|
// . get cachedb rec for all types of safebufs for this
|
|
// url/content
|
|
// . then we will set safebufs based on what recs we find
|
|
// in the returned list
|
|
if ( ! m_msg0.getList ( -1, // hostid
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxcacheage
|
|
false, // addtocache?
|
|
RDB_CACHEDB,
|
|
cr->m_collnum ,
|
|
&m_cacheList,
|
|
(char *)&sk ,
|
|
(char *)&ek ,
|
|
30000000, // minrecsizes 30MB
|
|
m_masterState,
|
|
m_masterLoop,
|
|
m_niceness ) )
|
|
// return FALSE if this blocks
|
|
return false;
|
|
}
|
|
|
|
if ( m_processedCachedbReply ) return true;
|
|
|
|
// only scan list once
|
|
m_processedCachedbReply = true;
|
|
|
|
// if empty, that was easy
|
|
if ( m_cacheList.isEmpty() ) return true;
|
|
|
|
// we might have one rec set from cache and another not, and we
|
|
// still want to cache the one that is not in storeIntoCachedb()!
|
|
//m_seoInfoSetFromCache = true;
|
|
|
|
// otherwise, parse out the cache recs
|
|
for ( ; ! m_cacheList.isExhausted() ; m_cacheList.skipCurrentRec() ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
char *rec = m_cacheList.getCurrentRec();
|
|
// . get type of cached rec
|
|
// . enum types cr_MatchingQueries etc. as in Cachedb.h
|
|
char recType = g_cachedb.getTypeFromKey(rec);
|
|
int32_t dataSize = m_cacheList.getCurrentDataSize();
|
|
// sanity. must at least have the cached date
|
|
if ( dataSize < 4 ) { char *xx=NULL;*xx=0; }
|
|
char *data = m_cacheList.getCurrentData ();
|
|
// in data, first int32_t is the cached time in utc
|
|
//int32_t cachedDate = *(int32_t *)data;
|
|
// skip the TIMESTAMP!
|
|
//int32_t timestamp = *(int32_t *)data;
|
|
data += 4;
|
|
dataSize -= 4;
|
|
// and version
|
|
data += 4;
|
|
dataSize -= 4;
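// . every cachedb payload handled below has the same shape:
//   [cachedTime(4)][version(4)] then one or more [len(4)][len bytes]
//   sections, each pointed into in place (owndata=false) by setBuf()
// . a minimal illustrative sketch of reading one such section
//   (readSection is a hypothetical helper, kept commented out):
/*
static char *readSection ( char *p , SafeBuf *dst ) {
        int32_t len = *(int32_t *)p; p += 4;
        // point at the bytes in place; owndata=false so we never free them
        dst->setBuf ( p , len , len , false , 0 );
        return p + len;
}
*/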
|
|
|
|
|
|
// . 1
|
|
// . is it a cached rec for matching queries?
|
|
// . getSeoQueryInfo() needs this
|
|
if (recType == cr_MatchingQueries && !m_matchingQueryBufValid){
|
|
// debug
|
|
log("seo: found matching queries");
|
|
// total size of the msg99replies (totalMsg99ReplySize)
|
|
int32_t size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// just point into the list itself. we will
|
|
// free m_cacheList on reset then.
|
|
m_matchingQueryBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
data += size1;
|
|
// now the m_queryLinkStringBuf
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_matchingQueryStringBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding
|
|
data += size1;
|
|
m_matchingQueryBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
// . 2
// . is it a cached rec for related docids with titles?
// . getSeoQueryInfo() calls getRelatedDocIdsWithTitles()
// . m_relatedDocIdBuf SafeBuf is a buf of RelatedDocId classes
// . m_relatedTitleBuf is a buf of titles and urls referenced
//   by those classes
if ( recType == cr_RelatedDocIds &&
! m_relatedDocIdsWithTitlesValid ) {
|
|
// debug
|
|
log("seo: found related docids");
|
|
// first is the safebuf of RelatedDocId classes
|
|
int32_t size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// point into it
|
|
//char *p = data;
|
|
//char *pend = data + size1;
|
|
// just point into the list itself. we will
|
|
// free m_cacheList on reset then.
|
|
m_relatedDocIdBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// save this
|
|
//char *rtbuf = data;
|
|
// now the string buffer
|
|
m_relatedTitleBuf.setBuf ( data ,
|
|
size1 ,
|
|
size1 ,
|
|
false ,
|
|
0 );
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// now the string buffer
|
|
m_commonQueryNumBuf.setBuf ( data ,
|
|
size1 ,
|
|
size1 ,
|
|
false ,
|
|
0 );
|
|
|
|
// now the RelatedDocId::ptr_url/ptr_rd_title members
|
|
// were hacked to be offsets into this for storage
|
|
// into the cache!
|
|
/*
|
|
for ( ; p < pend ; p += sizeof(RelatedDocId) ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
RelatedDocId *rd = (RelatedDocId *)p;
|
|
// get offsets
|
|
int32_t off1 = (int32_t)rd->ptr_rd_title;
|
|
int32_t off2 = (int32_t)rd->ptr_rd_url;
|
|
int32_t off3 = (int32_t)rd->ptr_rd_site;
|
|
// normalize/store back
|
|
rd->ptr_rd_title = rtbuf + off1;
|
|
rd->ptr_rd_url = rtbuf + off2;
|
|
rd->ptr_rd_site = rtbuf + off3;
|
|
}
|
|
*/
|
|
m_relatedDocIdsWithTitlesValid = true;
|
|
m_relatedTitleBufValid = true;
|
|
m_relatedDocIdBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
// . 3
// . is it a cached rec for related queries?
// . getSeoQueryInfo() calls getRelatedQueryBuf()
if ( recType == cr_RelatedQueries && ! m_queryLinkBufValid ) {
|
|
// we changed the format of relatedquerystringbuf
|
|
// to be a bunch of QueryLogEntries now. so ignore
|
|
// if old format.
|
|
//if ( timestamp <= 1367704324 ) continue;
|
|
// debug
|
|
log("seo: found related queries");
|
|
int32_t size1;
|
|
// first is the safebuf m_queryLinkBuf of QueryLinks
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_relatedQueryBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
data += size1;
|
|
// now the m_queryLinkStringBuf
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_relatedQueryStringBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding
|
|
data += size1;
|
|
/*
|
|
// now the ptrs, sorted
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_relPtrs.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// test sorting
|
|
char *p = m_relPtrs.getBufStart();
|
|
char *pend = m_relPtrs.getBuf();
|
|
char *base = m_queryLinkBuf.getBufStart();
|
|
QueryLink *lastqr = NULL;
|
|
for ( ; p < pend ; p += 4 ) {
|
|
QUICKPOLL(m_niceness);
|
|
int32_t qkOff = *(int32_t *)p;
|
|
QueryLink *qr = (QueryRel *)(base+qkOff);
|
|
// no, longer, it is more complicated because
|
|
// if m_uniqueRound scoring addition
|
|
//if ( lastqr &&
|
|
// lastqr->m_totalRelatedQueryImportance <
|
|
// qr ->m_totalRelatedQueryImportance ) {
|
|
// char *xx=NULL;*xx=0;}
|
|
lastqr = qr;
|
|
}
|
|
*/
|
|
// validate
|
|
//m_relPtrsValid = true;
|
|
//m_queryLinkStringBufValid = true;
|
|
m_relatedQueryBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
// if it is debug and we are not, skip it!!
|
|
//if(recType == cr_ScoredInsertableTermsDebug && ! m_seoDebug )
|
|
// continue;
|
|
|
|
// or if we are debug and it is not, skip it!
|
|
//if (recType == cr_ScoredInsertableTerms && m_seoDebug )
|
|
// continue;
|
|
|
|
/*
|
|
if ( (recType == cr_MissingTermBuf ) &&
|
|
! m_missingTermBufValid ) {
|
|
// debug
|
|
log("seo: found missingtermbuf");
|
|
int32_t size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_missingTermBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
m_missingTermBufValid = true;
|
|
}
|
|
*/
|
|
|
|
// 3b
|
|
if ( (recType == cr_WordPosInfoBuf ) &&
|
|
! m_wordPosInfoBufValid ) {
|
|
// debug
|
|
log("seo: found wordposinfo");
|
|
int32_t size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_wordPosInfoBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// WordPosInfo::m_wordPtr was stored relative to ptr_utf8Content
|
|
char *p = m_wordPosInfoBuf.getBufStart();
|
|
char *pend = m_wordPosInfoBuf.getBuf();
|
|
for ( ; p < pend ; p += sizeof(WordPosInfo) ) {
|
|
QUICKPOLL(m_niceness);
|
|
WordPosInfo *wp = (WordPosInfo *)p;
|
|
int64_t off = (int64_t)wp->m_wordPtr;
|
|
char *ptr = ptr_utf8Content + off;
|
|
if ( off == -1 ) ptr = NULL;
|
|
wp->m_wordPtr = ptr;
|
|
}
|
|
m_wordPosInfoBufValid = true;
|
|
}
|
|
|
|
// . 4
|
|
// . and the insertable terms buffer with its querychanges
|
|
// linked lists!
|
|
if ( recType == cr_ScoredInsertableTerms &&
|
|
! m_scoredInsertableTermsBufValid ) {
|
|
// debug
|
|
log("seo: found scored insertable terms");
|
|
int32_t size1;
|
|
// first is the safebuf m_queryLinkBuf of QueryLinks
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// just point into the list itself. we will
|
|
// free m_cacheList on reset then.
|
|
m_insertableTermsBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// now the buffer of query changes
|
|
// these are normally just referenced by
|
|
// InsertableTerm and in the linked list directly
|
|
// into the Msg95Reply::ptr_queryChanges, but for
|
|
// caching we have to use a new safebuf
|
|
m_queryChangeBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_queryLogBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
/*
|
|
// skip that
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_itStrBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
*/
|
|
/*
|
|
// debug scoring. QueryChange::m_debugScoreInfoOffset
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_debugScoreInfoBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
// debug scoring. QueryChange::m_origScoreInfoOffset
|
|
data += size1;
|
|
size1 = *(int32_t *)data;
|
|
data += 4;
|
|
m_origScoreInfoBuf.setBuf ( data ,
|
|
size1 , // size
|
|
size1 , // allocated
|
|
false , // owndata?
|
|
0 ); // encoding none
|
|
*/
|
|
// insertable terms deserialization logic
|
|
char *p = m_insertableTermsBuf.getBufStart();
|
|
char *pend = m_insertableTermsBuf.getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// normalize m_firstQueryChange
|
|
int64_t off =(int64_t)(it->m_firstQueryChange);
|
|
// fix this
|
|
char *buf = m_queryChangeBuf.getBufStart();
|
|
// shortcut
|
|
QueryChange *fqc = (QueryChange *)(buf+off);
|
|
// -1 means NULL
|
|
if ( off == -1 ) fqc = NULL;
|
|
// put back
|
|
it->m_firstQueryChange = fqc;
|
|
// terms
|
|
//off = (int32_t)it->m_termStr;
|
|
// to this
|
|
//buf = m_itStrBuf.getBufStart();
|
|
// cast it
|
|
//it->m_termStr = (char *)(buf+off);
|
|
}
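// . InsertableTerms are variable-length records laid end to end, which
//   is why the walk above advances by it->getSize() and not sizeof()
// . minimal sketch of walking such a flat buffer (Rec here is a
//   hypothetical stand-in for InsertableTerm, kept commented out):
/*
for ( char *p = buf ; p < bufEnd ; ) {
        Rec *r = (Rec *)p;
        // each record reports its own total size, header + payload
        p += r->getSize();
}
*/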
|
|
// . now we set QueryChange::m_next and
|
|
// InsertableTerm::m_firstQueryChange to be offsets
|
|
// into the new m_queryChangeBuf before we stored
|
|
// into the cache....
|
|
p = m_queryChangeBuf.getBufStart();
|
|
pend = m_queryChangeBuf.getBuf();
|
|
for ( ; p < pend ; p += sizeof(QueryChange) ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
QueryChange *qc = (QueryChange *)p;
|
|
// normalize m_next
|
|
int64_t off = (int64_t)qc->m_next;
|
|
// offset into this
|
|
char *buf = m_queryChangeBuf.getBufStart();
|
|
// put back
|
|
qc->m_next = (QueryChange *)(buf + off);
|
|
// -1 means NULL
|
|
if ( off == -1 ) qc->m_next = NULL;
|
|
}
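// . the two fixup loops above undo what
//   storeScoredInsertableTermsIntoCachedb() did before caching:
//   pointers into a SafeBuf are stored as offsets, with -1 standing
//   in for NULL, then patched back into real pointers on load
// . minimal sketch of that round trip, assuming a generic singly
//   linked node type (Node and both helpers are hypothetical):
/*
struct Node { Node *m_next; };
static void ptrToOff ( Node *n , char *base ) {
        int64_t off = (char *)n->m_next - base;
        if ( ! n->m_next ) off = -1;
        n->m_next = (Node *)off;
}
static void offToPtr ( Node *n , char *base ) {
        int64_t off = (int64_t)n->m_next;
        n->m_next = ( off == -1 ) ? NULL : (Node *)(base + off);
}
*/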
|
|
// now all ptrs should be set correctly
|
|
m_scoredInsertableTermsBufValid = true;
|
|
m_insertableTermsBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
// . 5
// . is it a cached rec for the recommended links buf?
// . m_recommendedLinksBuf is set to point into it below
if ( recType == cr_RecommendedLinks &&
! m_recommendedLinksBufValid ) {
|
|
// debug
|
|
log("seo: found recommended links buf");
|
|
// first is the safebuf of RelatedDocId classes
|
|
int32_t size1 = *(int32_t *)data;
|
|
data += 4;
|
|
// now the string buffer
|
|
m_recommendedLinksBuf.setBuf ( data ,
|
|
size1 ,
|
|
size1 ,
|
|
false ,
|
|
0 );
|
|
m_recommendedLinksBufValid = true;
|
|
continue;
|
|
}
|
|
|
|
}
|
|
return true;
|
|
}
|
|
|
|
#define CACHEDB_CURRENT_VERSION 1

// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . flush the msg4 until it completes i guess
bool XmlDoc::storeMatchingQueriesIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
// all these things should already be validated so they should
|
|
// not block or have errors
|
|
//SafeBuf *qpbuf = getMatchingQueriesScored();
|
|
//SafeBuf *qpbuf = &m_queryPtrs;
|
|
if ( ! m_matchingQueryBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
//int32_t totalMsg99ReplySize = 0;
|
|
//int32_t numQueryPtrs = 0;
|
|
//Msg99Reply **qptrs = NULL;
|
|
|
|
// 1. msg99replies for matchingQueries
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize+cacheDate(now)+ver
|
|
need += 4 + m_matchingQueryBuf.length();
|
|
need += 4 + m_matchingQueryStringBuf.length();
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: mq listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
//
|
|
// 1. first add the matching queries, msg99 replies
|
|
//
|
|
k = g_cachedb.makeKey ( uh32, ch32 , cr_MatchingQueries );
|
|
|
|
// note it
|
|
log("seo: cachedb storing matchingqueries "
|
|
"uh32=%"UINT32" ch32=%"UINT32""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_matchingQueryBuf.length();
|
|
dataSize += 4 + m_matchingQueryStringBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
listBuf.pushLong ( m_matchingQueryBuf.length() );
|
|
listBuf.safeMemcpy ( &m_matchingQueryBuf );
|
|
listBuf.pushLong ( m_matchingQueryStringBuf.length() );
|
|
listBuf.safeMemcpy ( &m_matchingQueryStringBuf );
|
|
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding matching query list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
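// . every store*IntoCachedb() function here builds its record payload
//   with the same "push the length, then the raw bytes" idiom
// . a minimal sketch of that idiom as a helper (appendLengthPrefixed
//   is hypothetical and kept commented out; the real code inlines it):
/*
static bool appendLengthPrefixed ( SafeBuf *dst , SafeBuf *src ) {
        // 4-byte length header followed by the raw bytes of src
        if ( ! dst->pushLong ( src->length() ) ) return false;
        if ( ! dst->safeMemcpy ( src ) ) return false;
        return true;
}
*/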
|
|
|
|
bool XmlDoc::storeRelatedDocIdsIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_queryPtrsWholeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
if ( ! m_relatedDocIdsWithTitlesValid ) { char *xx=NULL;*xx=0;}
|
|
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0;}
|
|
|
|
// 2. related docids
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_relatedDocIdBuf.length();
|
|
need += 4 + m_relatedTitleBuf.length();
|
|
need += 4 + m_commonQueryNumBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: rd listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
char *p1;
|
|
char *p2;
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// 2. then add related docids
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedDocIds );
|
|
|
|
// note it
|
|
log("seo: cachedb storing relateddocids "
|
|
"uh32=%"UINT32" ch32=%"UINT32""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_relatedDocIdBuf.length();
|
|
dataSize += 4 + m_relatedTitleBuf.length();
|
|
dataSize += 4 + m_commonQueryNumBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
listBuf.pushLong ( m_relatedDocIdBuf.length() );
|
|
p1 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_relatedDocIdBuf );
|
|
p2 = listBuf.getBuf();
|
|
listBuf.pushLong ( m_relatedTitleBuf.length() );
|
|
listBuf.safeMemcpy ( &m_relatedTitleBuf );
|
|
//char *tbuf = m_relatedTitleBuf.getBufStart();
|
|
listBuf.pushLong ( m_commonQueryNumBuf.length() );
|
|
listBuf.safeMemcpy ( &m_commonQueryNumBuf );
|
|
|
|
// make ptrs into offsets into m_relatedTitleBuf
|
|
/*
|
|
for ( ; p1 < p2 ; p1 += sizeof(RelatedDocId )) {
|
|
QUICKPOLL(m_niceness);
|
|
RelatedDocId *rd = (RelatedDocId *)p1;
|
|
int32_t off;
|
|
off = rd->ptr_rd_url - tbuf;
|
|
rd->ptr_rd_url = (char *)off;
|
|
off = rd->ptr_rd_title - tbuf;
|
|
rd->ptr_rd_title = (char *)off;
|
|
off = rd->ptr_rd_site - tbuf;
|
|
rd->ptr_rd_site = (char *)off;
|
|
}
|
|
*/
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding related docids list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::storeRecommendedLinksBuf ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
if ( ! m_recommendedLinksBufValid ) { char *xx=NULL;*xx=0;}
|
|
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_recommendedLinksBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: reclnx listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// then add the recommended links buf
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr_RecommendedLinks );
|
|
|
|
// note it
|
|
log("seo: cachedb storing recommendedlinksbuf "
|
|
"uh32=%"UINT32" ch32=%"UINT32""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_recommendedLinksBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_recommendedLinksBuf.length() );
|
|
listBuf.safeMemcpy ( &m_recommendedLinksBuf );
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding recommendedlinksbuf list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::storeRelatedQueriesIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_relatedQueryBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
//SafeBuf *relBuf = NULL;
|
|
//if ( m_relPtrsValid ) relBuf = &m_relPtrs;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
// 3. related queries. buf of QueryLinks
|
|
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_relatedQueryBuf.length();
|
|
need += 4 + m_relatedQueryStringBuf.length();
|
|
//need += 4 + m_relPtrs.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: rq listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
//
|
|
// 3. then related queries (STORED by m_queryImportanceRelated)
|
|
//
|
|
//int32_t sizeRels = (m_relPtrs.length() / 4) * sizeof(QueryLink);
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedQueries );
|
|
|
|
// note it
|
|
log("seo: cachedb storing relatedqueries "
|
|
"uh32=%"UINT32" ch32=%"UINT32""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_relatedQueryBuf.length(); // sizeRels;
|
|
dataSize += 4 + m_relatedQueryStringBuf.length();
|
|
//dataSize += 4 + m_relPtrs.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_relatedQueryBuf.length() );
|
|
//char *p3 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_relatedQueryBuf );
|
|
//char *p4 = listBuf.getBuf();
|
|
listBuf.pushLong ( m_relatedQueryStringBuf.length() );
|
|
listBuf.safeMemcpy ( &m_relatedQueryStringBuf );
|
|
//listBuf.pushLong ( m_relPtrs.length() );
|
|
//char *p5 = listBuf.getBuf();
|
|
//listBuf.safeMemcpy ( &m_relPtrs );
|
|
// sanity tests
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding related queries list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::storeWordPosInfoBufIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_wordPosInfoBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_wordPosInfoBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: wpi listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
// 3b. then add the word position info buf
|
|
uint8_t cr8 = cr_WordPosInfoBuf;
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
|
|
|
|
// note it
|
|
log("seo: cachedb storing wordposinfobuf "
|
|
"uh32=%"UINT32" ch32=%"UINT32""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_wordPosInfoBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_wordPosInfoBuf.length() );
|
|
char *p8 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_wordPosInfoBuf );
|
|
char *p9 = listBuf.getBuf();
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// make WordPosInfo::m_wordPtr relative to html ptr_utf8Content!
|
|
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
|
|
QUICKPOLL(m_niceness);
|
|
WordPosInfo *wp = (WordPosInfo *)p8;
|
|
int64_t off = wp->m_wordPtr - ptr_utf8Content;
|
|
// if its a tag or fielded term it won't be in the
|
|
// html like ext:html or filetype:html
|
|
if ( wp->m_wordPtr< ptr_utf8Content )
|
|
off = -1;
|
|
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
|
|
off = -1;
|
|
wp->m_wordPtr = (char *)off;
|
|
}
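// . a word pointer is only cacheable as an offset if it actually points
//   into ptr_utf8Content; fielded or synthetic terms (like ext:html or
//   filetype:html) live outside the doc buffer and get -1, which the
//   load side in checkCachedb() turns back into a NULL pointer
// . minimal sketch of that bounds test (inRange is hypothetical):
/*
static bool inRange ( char *p , char *buf , int32_t bufSize ) {
        return ( p >= buf && p < buf + bufSize );
}
*/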
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding wordposinfobuf list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
bool XmlDoc::storeMissingTermBufIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_missingTermBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_missingTermBuf.length();
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: wpi listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
|
|
// 4. then the insertable terms and their query changes and log buf
|
|
// mangle key a little if in debug mode because that is the only
|
|
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
|
|
uint8_t cr = cr_MissingTermBuf;
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr );
|
|
|
|
// note it
|
|
log("seo: cachedb storing missingtermbuf "
|
|
"uh32=%"UINT32" ch32=%"UINT32"",uh32,ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_missingTermBuf.length();
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
listBuf.pushLong ( m_missingTermBuf.length() );
|
|
listBuf.safeMemcpy ( &m_missingTermBuf );
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding missingtermbuf list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . flush the msg4 until it completes i guess
bool XmlDoc::storeScoredInsertableTermsIntoCachedb ( ) {
|
|
|
|
if ( ! m_writeToCachedb ) return true;
|
|
|
|
if ( ! m_scoredInsertableTermsBufValid ) return true;
|
|
|
|
int32_t *ch32p = getContentHash32();
|
|
if ( ! ch32p ) return true;
|
|
if ( ch32p == (void *)-1 ) return false;
|
|
int32_t ch32 = *ch32p;
|
|
// include spider date now in case indexed copy changes
|
|
// site rank, tags, etc.
|
|
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return true;
|
|
|
|
int32_t now = getTimeGlobal();
|
|
|
|
// calc how much space we need
|
|
int32_t need = 0;
|
|
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
|
|
need += 4 + m_insertableTermsBuf.length();
|
|
// InsertableTerm::m_firstQueryChange:
|
|
need += 4 + m_queryChangeBuf.length();
|
|
//4 QueryChange::m_replyQueryOffset :
|
|
need += 4 + m_queryLogBuf.length();
|
|
//InsertableTerm::m_termStr reference
|
|
//need += 4 + m_itStrBuf.length();
|
|
//need += 4 + m_wordPosInfoBuf.length();
|
|
// TOO BIG to store into cachedb!
|
|
//need += 4 + m_debugScoreInfoBuf.length(); // debug only
|
|
//need += 4 + m_origScoreInfoBuf.length(); // debug only
|
|
|
|
// sanity
|
|
if ( need > 20000000 ) {
|
|
log("cachedb: listsize %"INT32" too big for cachedb",need);
|
|
return true;
|
|
}
|
|
|
|
SafeBuf listBuf;
|
|
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
|
|
// does not trigger a reserve
|
|
if ( ! listBuf.reserve ( need + 4 ) ) return true;
|
|
|
|
// ensure no reallocating - that would screw logic below up
|
|
char *orig = listBuf.getBufStart();
|
|
|
|
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
key_t k;
|
|
|
|
int32_t dataSize = 0;
|
|
char *p1;
|
|
char *p2;
|
|
|
|
// 4. then the insertable terms and their query changes and log buf
|
|
// mangle key a little if in debug mode because that is the only
|
|
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
|
|
uint8_t cr8 = cr_ScoredInsertableTerms;
|
|
//if ( m_seoDebug ) cr = cr_ScoredInsertableTermsDebug;
|
|
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
|
|
|
|
// note it
|
|
log("seo: cachedb storing scoredinsertableterms "
|
|
"uh32=%"UINT32" ch32=%"UINT32""
|
|
,(uint32_t)uh32,(uint32_t)ch32);
|
|
|
|
listBuf.safeMemcpy ( &k , sizeof(key_t) );
|
|
dataSize = 0;
|
|
dataSize += 4; // timestamp
|
|
dataSize += 4; // version
|
|
dataSize += 4 + m_insertableTermsBuf.length();
|
|
dataSize += 4 + m_queryChangeBuf.length();
|
|
dataSize += 4 + m_queryLogBuf.length();
|
|
//dataSize += 4 + m_itStrBuf.length();
|
|
//dataSize += 4 + m_wordPosInfoBuf.length();
|
|
//dataSize += 4 + m_debugScoreInfoBuf.length(); // debug only
|
|
//dataSize += 4 + m_origScoreInfoBuf .length(); // debug only
|
|
listBuf.pushLong ( dataSize );
|
|
listBuf.pushLong ( now ); // cached date
|
|
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
|
|
|
|
// m_insertableTermsBuf
|
|
listBuf.pushLong ( m_insertableTermsBuf.length() );
|
|
p1 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_insertableTermsBuf );
|
|
char *p1End = listBuf.getBuf();
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_queryChangeBuf
|
|
listBuf.pushLong ( m_queryChangeBuf.length() );
|
|
p2 = listBuf.getBuf();
|
|
listBuf.safeMemcpy ( &m_queryChangeBuf );
|
|
char *p2End = listBuf.getBuf();
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_queryLogBuf
|
|
listBuf.pushLong ( m_queryLogBuf.length() );
|
|
listBuf.safeMemcpy ( &m_queryLogBuf );
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_itStrBuf referenced by InsertableTerm::m_termStr
|
|
//listBuf.pushLong ( m_itStrBuf.length() );
|
|
//listBuf.safeMemcpy ( &m_itStrBuf );
|
|
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// m_itStrBuf referenced by InsertableTerm::m_termStr
|
|
//listBuf.pushLong ( m_wordPosInfoBuf.length() );
|
|
//char *p8 = listBuf.getBuf();
|
|
//listBuf.safeMemcpy ( &m_wordPosInfoBuf );
|
|
//char *p9 = listBuf.getBuf();
|
|
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
// debug buffers, QueryChange::m_*Offset parms ref them if
|
|
// m_seoDebug is true. TOO BIG TO STORE INTO CACHEDB!
|
|
//listBuf.pushLong ( m_debugScoreInfoBuf.length() );
|
|
//listBuf.safeMemcpy ( &m_debugScoreInfoBuf );
|
|
//listBuf.pushLong ( m_origScoreInfoBuf.length() );
|
|
//listBuf.safeMemcpy ( &m_origScoreInfoBuf );
|
|
// make the InsertableTerm::m_firstQueryChange parms into
|
|
// offsets
|
|
for ( ; p1 < p1End ; ) { // p1 += sizeof(InsertableTerm) ) {
|
|
QUICKPOLL(m_niceness);
|
|
InsertableTerm *it = (InsertableTerm *)p1;
|
|
p1 += it->getSize();
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
int64_t qoff =(char *)qc - m_queryChangeBuf.getBufStart();
|
|
if ( qc == NULL ) qoff = -1;
|
|
it->m_firstQueryChange = (QueryChange *)qoff;
|
|
// and m_termStr
|
|
//int32_t off = it->m_termStr - m_itStrBuf.getBufStart();
|
|
//it->m_termStr = (char *)off;
|
|
}
|
|
// make QueryChange::m_next ptrs into offsets as well
|
|
for ( ; p2 < p2End ; p2 += sizeof(QueryChange) ) {
|
|
QUICKPOLL(m_niceness);
|
|
QueryChange *qc = (QueryChange *)p2;
|
|
QueryChange *next = qc->m_next;
|
|
int64_t noff =(char *)next-m_queryChangeBuf.getBufStart();
|
|
if ( next == NULL ) noff = -1;
|
|
qc->m_next = (QueryChange *)noff;
|
|
}
|
|
// WordPosInfo::m_term relative to html ptr_utf8Content!
|
|
/*
|
|
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
|
|
QUICKPOLL(m_niceness);
|
|
WordPosInfo *wp = (WordPosInfo *)p8;
|
|
int32_t off = wp->m_wordPtr - ptr_utf8Content;
|
|
// if its a tag or fielded term it won't be in the
|
|
// html like ext:html or filetype:html
|
|
if ( wp->m_wordPtr< ptr_utf8Content )
|
|
off = -1;
|
|
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
|
|
off = -1;
|
|
wp->m_wordPtr = (char *)off;
|
|
}
|
|
*/
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure list did not realloc, that would screw up everything!
|
|
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
|
|
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
|
|
|
|
// . list is ready now
|
|
// . this only returns when each record has been added
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
log("xmldoc: adding insertable terms list of %"INT32" bytes to cachedb",
|
|
m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
#define MAX_TOP_MATCHING_QUERIES 300
|
|
|
|
/*
|
|
// returns -1 if blocked, NULL with g_errno set on error
|
|
SafeBuf *XmlDoc::getMatchingQueriesScored ( ) {
|
|
|
|
setStatus ( "getmatchingqueriesscored" );
|
|
|
|
// try to set m_queryPtrs from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
// just re-use the same m_queryPtrs SafeBuf we used above but we
|
|
// set the Msg99Reply::m_myScore here and sort them by that
|
|
if ( m_queryPtrsSortedValid )
|
|
return &m_queryPtrs;
|
|
|
|
// get the queries from msg99 replies first
|
|
SafeBuf *mq = getMatchingQueries(false,-1);
|
|
if ( mq == NULL || mq == (void *)-1 ) return mq;
|
|
|
|
// time it
|
|
if ( ! m_beginTimeMatchUrl )
|
|
m_beginTimeMatchUrl = gettimeofdayInMilliseconds();
|
|
|
|
// i'm assuming these are query ptrs!?!?!
|
|
int32_t numQueryPtrs = mq->length() / sizeof(Msg99Reply *);
|
|
|
|
// get the qptrs
|
|
Msg99Reply **qptrs = (Msg99Reply **)mq->getBufStart();
|
|
|
|
// score them in parallel over all hosts in network
|
|
if ( ! scoreDocIdRestrictedQueries ( qptrs,NULL,numQueryPtrs) )
|
|
return (SafeBuf *)-1;
|
|
// error?
|
|
if ( g_errno ) return NULL;
|
|
|
|
// total pages indexed!
|
|
int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
|
|
// take 25% of that. i think 'the', the most common term, is in about
|
|
// 25% of those pages
|
|
numPagesIndexed /= 4;
|
|
|
|
//
|
|
// SET QUERY IMPORTANCE
|
|
//
|
|
// . set the m_queryImportance float and sort by that
|
|
// . how important is the matching query for the main url?
|
|
// . just divide the main url's score by the
|
|
// QueryLogEntry::m_mintop50Score for the query to normalize it
|
|
// . however, when we compute RelatedDocId::m_dotProduct we normalize
|
|
// using the score of the #1 result because we executed the full
|
|
// query, so keep that in mind. we can't mix the two.
|
|
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
|
|
Msg99Reply *qp = qptrs[i];
|
|
// shortcut
|
|
QueryLogEntry *qe = &qp->m_queryLogEntry;
|
|
// get # results
|
|
int64_t numResults = qe->m_numTotalResultsInSlice;
|
|
// fix it to be global
|
|
numResults *= (int64_t)g_hostdb.getNumGroups();
|
|
// big indexes did the "slice logic" restricting docid
|
|
// range to MAX_DOCID * .10 when setting this!
|
|
if ( numPagesIndexed > 10000000 ) numResults *= 10;
|
|
// point to query
|
|
char *qstr = qp->m_queryStr;
|
|
// if not processed assume like 1M?
|
|
if ( numResults < 0 ) {
|
|
log("seo: guessing query importance for '%s' from "
|
|
"hostid #%"INT32"",
|
|
qstr,(int32_t)qp->m_replyingHostId);
|
|
qp->m_queryImportance = 0.0;
|
|
continue;
|
|
}
|
|
// zero means make it 1 to avoid div by zero below
|
|
if ( numResults == 0 ) numResults = 1;
|
|
|
|
// and also weight by traffic! the more traffic the
|
|
// more important perhaps...
|
|
// NO! with this we get 'www' 'view' etc for
|
|
// jezebelgallery.com coming up in the top 50 matching
|
|
// queries by importance. crap, but it hurts cheatcodes.com
|
|
// then.
|
|
// fix
|
|
|
|
//if ( strcmp(qstr,"search engine") == 0 )
|
|
// log("poo");
|
|
|
|
// adjust since numPagesIndexed is actually a quarter of
|
|
// the # of pages indexed since 'the' is only in about
|
|
// 1/4 of the pages and it is the most common term
|
|
if ( numResults > numPagesIndexed )
|
|
numResults = numPagesIndexed;
|
|
|
|
// try doubling this to get rid of www problem for
|
|
// jezebelgallery.com. it put www and view down some more.
|
|
float popRatio = (float)numResults / (float)numPagesIndexed;
|
|
|
|
// stuff like 'www' and 'view' will be near 1.0
|
|
float weight = 1.0 - popRatio;//(popRatio * popRatio);
|
|
// go crazy
|
|
weight *= weight;
|
|
weight *= weight;
|
|
weight *= weight;
|
|
weight *= weight;
|
|
|
|
// do not let this be 1.0 because 'web page searching' is
|
|
// getting 1.0 for it and getting a weight of 0.0 and making
|
|
// it the same as the ignored matching queries for
|
|
// gigablast.com, so we end up using the ignored common
|
|
// word matching queries for getting competitor pages and it
|
|
// is bad! we need to fix that to not use such queries if
|
|
// their importance is 0!
|
|
if ( weight < .01 ) weight = .01;
|
|
|
|
|
|
// because you are in the top 50
|
|
//numResults = (int32_t)powf ( (float)numResults , .4 );
|
|
//if ( numResults == 0 )
|
|
// imp /= 1;
|
|
// otherwise, normalize by division
|
|
//else
|
|
// imp /= numResults;
|
|
// boost it!
|
|
//imp *= 10000;
|
|
//QueryInfo *qi = &qp->m_queryInfo;
|
|
//float imp = getQueryImportance2 ( qi , score );
|
|
|
|
// just try sorting by your serp score, hopefully we remove
|
|
// shit like 'www' because isCommonQueryWordInEnglish()
|
|
// takes care of it below.
|
|
// consider *= weight on this
|
|
|
|
// the idea is to ignore the top serp score because
|
|
// you do not want terms that you may be able to be #1
|
|
// for but are not really relevant for your doc. so for this
|
|
// let's focus on just getting the queries that best represent
|
|
// your doc...
|
|
double imp = qp->m_myScore * weight;
|
|
|
|
|
|
qp->m_queryImportance = (float)imp;
|
|
// just use this!!!
|
|
//qp->m_queryImportance = qp->m_myScore /
|
|
// (float)(numResults*numResults);
|
|
// set importance to 0 for queries with minus sign in them
|
|
// that indicates negative terms...
|
|
for ( char *p = qstr; *p ; p++ ) {
|
|
if ( *p != ' ' ) continue;
|
|
if ( p[1] != '-' ) continue;
|
|
// 'a - b' is ok
|
|
if ( p[2] == ' ' ) continue;
|
|
qp->m_queryImportance = 0.00;
|
|
log("seo: ignoring query '%s' with minus sign", qstr);
|
|
break;
|
|
}
|
|
// avoid common queries with just common words in them:
|
|
// http web www com org us we 1 2 3 by on i https one page
|
|
Words ww;
|
|
ww.set3 ( qstr );
|
|
int32_t i; for ( i = 0 ; i < ww.m_numWords ; i++ ) {
|
|
int64_t wid = ww.m_wordIds[i];
|
|
if ( wid == 0 ) continue;
|
|
if ( ! isCommonQueryWordInEnglish ( wid ) ) break;
|
|
}
|
|
if ( i >= ww.m_numWords ) {
|
|
qp->m_queryImportance = 0.00;
|
|
log("seo: ignoring common query '%s'", qstr);
|
|
}
|
|
// skip debug for now
|
|
if ( ! m_seoDebug ) continue;
|
|
// note it
|
|
log("seo: "
|
|
"imp=%f "
|
|
"numresults=%"INT64" "
|
|
"numpagesindexed=%"INT64" "
|
|
"popweight=%f "
|
|
"myscore=%f "
|
|
"topscore=%f "
|
|
"qstr=%s",
|
|
qp->m_queryImportance,
|
|
numResults,
|
|
numPagesIndexed,
|
|
weight,
|
|
qp->m_myScore,
|
|
qe->m_topSERPScore,
|
|
qstr);
|
|
}
|
|
|
|
|
|
// let's sort them first
|
|
qsort ( qptrs ,
|
|
numQueryPtrs ,
|
|
sizeof(Msg99Reply *),
|
|
qp99cmp );
|
|
|
|
|
|
|
|
// log for debug
|
|
int32_t maxk = numQueryPtrs;
|
|
// limit to logging 300 to avoid log spam
|
|
if ( maxk > MAX_TOP_MATCHING_QUERIES )
|
|
maxk = MAX_TOP_MATCHING_QUERIES; // 300;
|
|
|
|
// limit to top 300 dammit, otherwise we can't store all
|
|
// into cachedb!!!
|
|
int32_t newLen = maxk * sizeof(Msg99Reply *);
|
|
m_queryPtrs.setLength ( newLen );
|
|
|
|
for ( int32_t k = 0 ; k < maxk ; k++ ) {
|
|
Msg99Reply *kp = qptrs[k];
|
|
log("seopipe: newquery=\"%s\" myscore=%f imp=%f",
|
|
kp->m_queryStr,
|
|
kp->m_myScore,
|
|
kp->m_queryImportance);
|
|
}
|
|
|
|
// time it
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginTimeMatchUrl;
|
|
log("seopipe: time: matchingscoredqueries took %"INT64" ms",took);
|
|
|
|
m_queryPtrsSortedValid = true;
|
|
|
|
if ( ! storeMatchingQueriesIntoCachedb() )
|
|
// return -1 if it blocked and wait for store to complete
|
|
return (SafeBuf *)-1;
|
|
|
|
return mq;
|
|
}
|
|
|
|
*/
|
|
|
|
static void gotMsg3aReplyForFullQueryWrapper ( void *state ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
THIS->setStatus ( "gotmsg3areplyforfullquerywrapper" );
|
|
THIS->gotMsg3aReplyForFullQuery();
|
|
// . go back to the main entry function
|
|
// . make sure g_errno is clear from a msg3a g_errno before calling
|
|
// this lest it abandon the loop
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
/*
|
|
void XmlDoc::gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
|
|
Msg99Reply *qp ) {
|
|
|
|
// try again for next guy
|
|
m_triedCache = false;
|
|
|
|
char *p = cachedRec;
|
|
// # docids
|
|
int32_t numDocIds = *(int32_t *)p;
|
|
p += 4;
|
|
// total # results
|
|
int32_t numTotalResults = *(int32_t *)p;
|
|
p += 4;
|
|
// docids
|
|
int64_t *docIds = (int64_t *)p;
|
|
p += 8 * numDocIds;
|
|
// scores
|
|
float *scores = (float *)p;
|
|
p += sizeof(float) * numDocIds;
|
|
// site hashes
|
|
int32_t *siteHashes = (int32_t *)p;
|
|
p += 4 * numDocIds;
|
|
|
|
// store score info into this class
|
|
TopDocIds *td = qp->m_topDocIds;
|
|
|
|
// store reply info, like # docids, in the query ptr
|
|
int32_t max = numDocIds;
|
|
if ( max > (int32_t)NUM_TOP_RESULTS ) max = (int32_t)NUM_TOP_RESULTS;
|
|
td->m_numDocIds = max;
|
|
|
|
// count replies
|
|
m_numMsg3aReplies++;
|
|
|
|
// log to log as well
|
|
char tmp[50000];
|
|
p = tmp;
|
|
p += sprintf(p,
|
|
"seopipe: got full results CACHED "
|
|
"qrynum=%"INT32"of%"INT32" docids=%"INT32" "
|
|
"query=\"%s\" ",
|
|
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
|
|
m_maxFullQueries ,
|
|
td->m_numDocIds,
|
|
qp->m_queryStr );
|
|
// log each docid
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
//float score = m_msg3a->getScores()[i];
|
|
int64_t d = docIds[i];
|
|
//int32_t sh32 = m_msg3a->getSiteHash32(i);
|
|
p += sprintf(p,"d%"INT32"=%"INT64" ",i,d);
|
|
}
|
|
log(tmp);
|
|
|
|
|
|
// shortcut. pumpSocket() sends the contents of this to m_seoSocket
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
|
|
sb->safePrintf(
|
|
"\t<seoQueryScoreInfo>\n"
|
|
"\t\t<queryNum>%"INT32"</queryNum>\n"
|
|
"\t\t<numTotalEstimatedSearchResults>%"INT32""
|
|
"</numTotalEstimatedSearchResults>\n"
|
|
"\t\t<numDocIds>%"INT32"</numDocIds>\n"
|
|
, m_msg3a->m_hackQNum
|
|
, numTotalResults
|
|
, numDocIds
|
|
);
|
|
// print the top 50 scores
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
float score = scores[i];
|
|
int64_t d = docIds[i];
|
|
int32_t sh32 = siteHashes[i];
|
|
sb->safePrintf("\t\t<searchResult>\n");
|
|
sb->safePrintf("\t\t\t<rank>%"INT32"</rank>\n",i+1);
|
|
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
|
|
sb->safePrintf("\t\t\t<docId>%"INT64"</docId>\n",d);
|
|
sb->safePrintf("\t\t\t<siteHash32>%"UINT32"</siteHash32>\n",sh32);
|
|
sb->safePrintf("\t\t</searchResult>\n");
|
|
// store results for this Msg99Reply
|
|
td->m_topDocIds[i] = d;
|
|
td->m_topScores[i] = score;
|
|
td->m_topSiteHashes[i] = sh32;
|
|
}
|
|
// reset rest so it prints pretty on gdb debug print cmd
|
|
for ( int32_t i = max ; i < (int32_t)NUM_TOP_RESULTS ; i++ ) {
|
|
td->m_topDocIds[i] = 0LL;
|
|
td->m_topScores[i] = 0.0;
|
|
td->m_topSiteHashes[i] = 0;
|
|
}
|
|
|
|
sb->safePrintf("\t</seoQueryScoreInfo>\n");
|
|
|
|
// pump m_socketWriteBuf to m_seoSocket
|
|
pumpSocketWriteBuf ( );
|
|
}
|
|
*/
|
|
|
|
// . this is the msg3a reply for related docids only
// . the full replies we get for determining ranks from scores for the
//   HTML simulator are handled in seo.cpp using State95::m_msg3a
void XmlDoc::gotMsg3aReplyForFullQuery ( ) {
|
|
|
|
int32_t err = g_errno;
|
|
|
|
// save it so we know related docid generation had an error...
|
|
if ( g_errno && ! m_msg3aErrno )
|
|
m_msg3aErrno = g_errno;
|
|
|
|
setStatus ( "gotmsg3areplyforfullquery" );
|
|
|
|
if ( g_errno ) {
|
|
log("seopipe: got msg3a reply error: %s",mstrerror(g_errno));
|
|
g_errno = 0;
|
|
}
|
|
|
|
// try again for next guy
|
|
//m_triedCache = false;
|
|
|
|
// how many docids in the search results were returned to us?
|
|
int32_t numDocIds = m_msg3a->getNumDocIds();
|
|
// total # search results estimated
|
|
//int32_t numTotalResults = m_msg3a->getNumTotalEstimatedHits();
|
|
// get the query as we received it in the msg99 reply
|
|
//Msg99Reply *qp = (Msg99Reply *)m_msg3a->m_hackQPtr;
|
|
int32_t queryNum = (int32_t)m_msg3a->m_hackQNum;
|
|
|
|
// . point to the empty class we reserved in the buf
|
|
// . store score info into this class
|
|
//TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBuf();//Start();
|
|
// ensure enough room
|
|
//if ( m_topDocIdsBuf.getAvail() < sizeof(TopDocIds) )
|
|
// m_topDocIdsBuf.reserve(sizeof(TopDocIds) )
|
|
|
|
// get next available spot to store this
|
|
TopDocIds *td = (TopDocIds *)m_topDocIdsBuf.getBuf();
|
|
int32_t tdnum = m_topDocIdsBuf.length() / sizeof(TopDocIds);
|
|
m_topDocIdsBuf.incrementLength(sizeof(TopDocIds));
|
|
if ( m_topDocIdsBuf.length() > m_topDocIdsBuf.m_capacity ) {
|
|
char *xx=NULL;*xx=0; }
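// . m_topDocIdsBuf was reserve()'d up front for maxFullQueries
//   TopDocIds (see getMatchingQueriesScoredForFullQuery below), and
//   each msg3a reply carves the next fixed-size slot off its end
// . minimal sketch of that bump-allocation idiom for a generic record
//   type T (nextSlot is hypothetical, kept commented out):
/*
template <class T> static T *nextSlot ( SafeBuf *sb ) {
        T *slot = (T *)sb->getBuf();       // first unused byte
        sb->incrementLength ( sizeof(T) );
        if ( sb->length() > sb->m_capacity ) return NULL; // overflow
        return slot;
}
*/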
|
|
|
|
QueryLink *qks = (QueryLink *)m_matchingQueryBuf.getBufStart();
|
|
QueryLink *qk = &qks[queryNum];
|
|
|
|
// the relateddocidnum hack
|
|
if ( tdnum > 32000 ) { char *xx=NULL;*xx=0; }
|
|
qk->m_relatedDocIdNum = tdnum;
|
|
|
|
// store reply info, like # docids, in the query ptr
|
|
int32_t max = numDocIds;
|
|
if ( max > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS )
|
|
max = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
|
|
td->m_numDocIds = max;
|
|
|
|
// QueryLink # in the m_matchingQueryBuf buffer we represent
|
|
td->m_queryNum = queryNum;
|
|
|
|
// keep it clean
|
|
//qp->m_docIdVotes = 0;
|
|
|
|
// get the query base hash and use that to
|
|
// dedup. the query base hash ignores common
|
|
// words and converts words to their synonym
|
|
// with the smallest hash
|
|
//int64_t qbh = getQueryBaseHash(qstr);
|
|
|
|
//m_msg3a->m_hackQNum = m_queryNum;
|
|
//m_msg3a->m_hackQPtr = (char *)qp;
|
|
|
|
// count replies
|
|
m_numMsg3aReplies++;
|
|
|
|
// log to log as well
|
|
//char tmp[50000];
|
|
SafeBuf tmp;
|
|
//char *p = tmp;
|
|
tmp.safePrintf(
|
|
"seopipe: got list of %"INT32" related docids for "
|
|
"qrynum=%"INT32" "
|
|
//"of%"INT32""
|
|
"numDocids=%"INT32" "
|
|
"query=\"",
|
|
numDocIds,
|
|
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
|
|
//m_maxFullQueries ,
|
|
td->m_numDocIds);
|
|
char *qqq = qk->getQueryString(&m_matchingQueryStringBuf);
|
|
tmp.safeStrcpy(qqq);
|
|
tmp.safePrintf("\" (err=%s)",
|
|
mstrerror(err));
|
|
// log each docid
|
|
//for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
// //float score = m_msg3a->getScores()[i];
|
|
// int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
|
|
// //int32_t sh32 = m_msg3a->getSiteHash32(i);
|
|
// p += sprintf(p,"d%"INT32"=%"INT64" ",i,d);
|
|
//}
|
|
char *msg = tmp.getBufStart();
|
|
log("%s",msg);
|
|
|
|
/*
|
|
// int16_tcut. pumpSocket() sends the contents of this to m_seoSocket
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
|
|
sb->safePrintf(
|
|
"\t<seoQueryScoreInfo>\n"
|
|
"\t\t<queryNum>%"INT32"</queryNum>\n"
|
|
"\t\t<numTotalEstimatedSearchResults>%"INT32""
|
|
"</numTotalEstimatedSearchResults>\n"
|
|
"\t\t<numDocIds>%"INT32"</numDocIds>\n"
|
|
, m_msg3a->m_hackQNum
|
|
, numTotalResults
|
|
, numDocIds
|
|
);
|
|
*/
|
|
// print the top 50 scores
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
float score = m_msg3a->m_scores[i];//getScores()[i];
|
|
int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
|
|
int32_t sh26 = m_msg3a->getSiteHash26(i);
|
|
/*
|
|
sb->safePrintf("\t\t<searchResult>\n");
|
|
sb->safePrintf("\t\t\t<rank>%"INT32"</rank>\n",i+1);
|
|
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
|
|
sb->safePrintf("\t\t\t<docId>%"INT64"</docId>\n",d);
|
|
sb->safePrintf("\t\t\t<siteHash32>%"UINT32"</siteHash32>\n",sh32);
|
|
sb->safePrintf("\t\t</searchResult>\n");
|
|
*/
|
|
// store results for this Msg99Reply
|
|
td->m_topDocIds[i] = d;
|
|
td->m_topScores[i] = score;
|
|
td->m_topSiteHashes26[i] = sh26;
|
|
}
|
|
// reset rest so it prints pretty on gdb debug print cmd
|
|
for ( int32_t i = max ; i < (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; i++ ) {
|
|
td->m_topDocIds[i] = 0LL;
|
|
td->m_topScores[i] = 0.0;
|
|
td->m_topSiteHashes26[i] = 0;
|
|
}
|
|
|
|
/*
|
|
sb->safePrintf("\t</seoQueryScoreInfo>\n");
|
|
*/
|
|
|
|
// give front-end the progress bar info
|
|
if ( m_seoSocket && m_progressBar ) {
|
|
// tmp buf
|
|
char tmp[16];
|
|
float percent = (float)m_numMsg3aReplies ;
|
|
//percent /= (float)m_maxFullQueries;
|
|
percent *= 100.0;
|
|
// these are 80% of the pipeline if getting competitor
|
|
// backlinks
|
|
if ( m_progressBar == 2 ) percent *= .80;
|
|
int32_t percentLong = (int32_t)percent;
|
|
if ( percentLong >= 100 ) percentLong = 99;
|
|
int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong);
|
|
if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen);
|
|
// try a send on non-blocking socket
|
|
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
|
|
if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n);
|
|
// forget error
|
|
errno = 0;
|
|
}
|
|
}
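// Illustrative sketch (kept commented out, not part of the original code):
// the progress-bar branch above clamps the percentage to 99 before
// formatting so the token written to the socket is always exactly three
// bytes ("00%" .. "99%"), which is what the front end expects to read off
// the non-blocking send. formatProgress() is a made-up name used only for
// illustration.
/*
#include <cstdio>
#include <stdint.h>

static int32_t formatProgress ( char *out , float percent ) {
	int32_t p = (int32_t)percent;
	if ( p >= 100 ) p = 99;   // keep it two digits
	if ( p <    0 ) p = 0;
	// "%02d%%" always yields three characters for 0..99
	return (int32_t)sprintf ( out , "%02d%%" , p );
}
*/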
|
|
|
|
bool XmlDoc::clientClosedConnection ( ) {
|
|
|
|
if ( ! m_seoSocket ) return false;
|
|
|
|
if ( m_clientClosed ) return true;
|
|
|
|
if ( g_now - m_lastCheckTime < 50 ) return m_clientClosed;
|
|
|
|
m_lastCheckTime = g_now;
|
|
|
|
char buffer[100];
|
|
if ( recv(m_seoSocket->m_sd,buffer,99,MSG_PEEK|MSG_DONTWAIT) == 0 ) {
|
|
m_clientClosed = true;
|
|
log("xmldoc: CLIENT CLOSED CONNECTION!!");
|
|
}
|
|
|
|
return m_clientClosed;
|
|
}
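// Illustrative sketch (kept commented out, not part of the original code):
// clientClosedConnection() above relies on recv() returning 0 for a
// MSG_PEEK|MSG_DONTWAIT read to detect that the browser hit the stop sign
// and closed its end of the socket. a standalone version of that check
// could look like this; peerHungUp() is a made-up name, and unlike the
// code above it also treats a hard socket error as a dead connection.
/*
#include <sys/socket.h>   // recv(), MSG_PEEK, MSG_DONTWAIT
#include <errno.h>

static bool peerHungUp ( int sd ) {
	char probe[64];
	ssize_t n = recv ( sd , probe , sizeof(probe) , MSG_PEEK | MSG_DONTWAIT );
	if ( n == 0 ) return true;    // orderly shutdown (FIN) from the peer
	if ( n  > 0 ) return false;   // data is still queued for us
	// n < 0: EAGAIN/EWOULDBLOCK just means nothing to read yet
	return ! ( errno == EAGAIN || errno == EWOULDBLOCK );
}
*/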
|
|
|
|
// . returns -1 if blocked, NULL with g_errno set on error
|
|
// . we do this to get related docids
|
|
SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
|
|
|
|
setStatus ( "getmatchingqueriesscoredforfullquery" );
|
|
|
|
// just re-use the same m_queryPtrs SafeBuf we used above but we
|
|
// set the Msg99Reply::m_myScore here and sort them by that
|
|
if ( m_queryPtrsWholeValid )
|
|
return &m_matchingQueryBuf;
|
|
|
|
// get the queries sorted by the url: | scores for our main url
|
|
SafeBuf *mq = getMatchingQueryBuf();
|
|
if ( mq == NULL || mq == (void *)-1 ) return mq;
|
|
|
|
// setup timer
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
if ( ! m_beginTimeFullQueries )
|
|
m_beginTimeFullQueries = gettimeofdayInMilliseconds();
|
|
|
|
// this buffer holds a ptr to each query in each msg99 reply we
|
|
// received from all hosts in the network
|
|
QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
int32_t nks = mq->length()/sizeof(QueryLink);
|
|
|
|
int32_t maxFullQueries = 50;
|
|
int32_t tneed = maxFullQueries * sizeof(TopDocIds);
|
|
if ( m_topDocIdsBuf.length() == 0 && ! m_topDocIdsBuf.reserve(tneed) )
|
|
return NULL;
|
|
|
|
// . now launch msg3as at them
|
|
// . this is 60k so new it here
|
|
if ( ! m_msg3a ) {
|
|
// reset the query # we are processing
|
|
m_queryNum = 0;
|
|
m_numMsg3aRequests = 0;
|
|
m_numMsg3aReplies = 0;
|
|
if ( ! m_fullQueryDedup.set(8,0,256,NULL,0,
|
|
false,m_niceness,"fqdd"))
|
|
return NULL;
|
|
try { m_msg3a = new ( Msg3a ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_msg3a, sizeof(Msg3a),"xdmsg3a");
|
|
// need this too now i guess since it is 65k
|
|
try { m_query3a = new ( Query ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
return NULL;
|
|
}
|
|
mnew ( m_query3a, sizeof(Query),"xdqry3a");
|
|
}
|
|
|
|
|
|
loop:
|
|
|
|
	// breathe in case we hit all cache
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// have we launched all the requests we need to
|
|
bool exhausted = false;
|
|
if ( m_queryNum >= nks ) exhausted = true;
|
|
if ( m_numMsg3aRequests >= maxFullQueries ) exhausted = true;
|
|
// if client closed browser connection by hitting the stop sign
|
|
// then stop here!
|
|
if ( clientClosedConnection() ) m_hadMatchError = ESOCKETCLOSED;
|
|
if ( m_hadMatchError ) exhausted = true;
|
|
|
|
// if nothing to launch
|
|
if ( exhausted &&
|
|
// and all replies received
|
|
m_numMsg3aReplies >= m_numMsg3aRequests ) {
|
|
// nuke the msg3a to save mem
|
|
mdelete ( m_msg3a, sizeof(Msg3a) , "msg3a" );
|
|
delete ( m_msg3a );
|
|
m_msg3a = NULL;
|
|
mdelete ( m_query3a , sizeof(Query), "qry3a" );
|
|
delete ( m_query3a );
|
|
m_query3a = NULL;
|
|
// time it
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginTimeFullQueries;
|
|
log("seopipe: time: fullqueries took %"INT64" ms",took);
|
|
// force closed?
|
|
if ( m_hadMatchError ) return NULL;
|
|
// we are done!
|
|
m_queryPtrsWholeValid = true;
|
|
return &m_matchingQueryBuf;//queryPtrs;
|
|
}
|
|
|
|
// if nothing to launch wait for all replies
|
|
if ( exhausted )
|
|
return (SafeBuf *)-1;
|
|
|
|
// get the current query to process
|
|
//Msg99Reply *qp = queryPtrs[m_queryNum];
|
|
QueryLink *qk = &qks[m_queryNum];
|
|
|
|
int32_t savedQueryNum = m_queryNum;
|
|
|
|
QueryLogEntry *qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
|
|
|
|
	// shortcut
|
|
//int64_t h64 = qk->m_querySynBaseHash64;
|
|
int64_t h64 = getSynBaseHash64 ( qe->getQueryString(),qe->m_langId);
|
|
|
|
// . if we already did a similar query, then skip it
|
|
// . Msg99Reply::m_topDocIds will be NULL so getRelatedDocIds() will
|
|
// know we skipped this query and to ignore it
|
|
if ( m_fullQueryDedup.isInTable(&h64) ) {
|
|
m_queryNum++;
|
|
goto loop;
|
|
}
|
|
|
|
// or if importance is 0, which means to ignore!
|
|
if ( qk->m_queryImportance <= 0.0 ) {
|
|
m_queryNum++;
|
|
goto loop;
|
|
}
|
|
|
|
	// shortcut
|
|
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
|
|
|
|
// sanity
|
|
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
|
|
// this is required for synonyms!
|
|
// TODO: use whatever language the query is!!!
|
|
uint8_t langId = langEnglish;
|
|
|
|
	// shortcut
|
|
int32_t qlen = gbstrlen(qstr);
|
|
|
|
//int32_t collLen = gbstrlen(cr->m_coll);
|
|
// set the request
|
|
m_mr2.reset();
|
|
m_mr2.ptr_query = qstr;
|
|
m_mr2.size_query = qlen+1;
|
|
//m_mr2.ptr_coll = cr->m_coll;
|
|
//m_mr2.size_coll = collLen+1;
|
|
m_mr2.m_collnum = cr->m_collnum;
|
|
m_mr2.m_queryExpansion = 1;
|
|
m_mr2.m_language = langId;
|
|
m_mr2.m_niceness = m_niceness;
|
|
// . get top 50 results now
|
|
// . then related docids will have to be in there
|
|
m_mr2.m_docsToGet = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
|
|
m_mr2.m_useSeoResultsCache = true;
|
|
// we do not need this, we just want the related docids/scores
|
|
m_mr2.m_getDocIdScoringInfo = false;
|
|
// use cache for 7 days since it is just for getting related docids
|
|
// right now. make sure that that cache saves to disk.
|
|
// MDW: why is this not working?
|
|
//m_mr2.m_maxAge = 86400 * 7;
|
|
//m_mr2.m_addToCache = true;
|
|
//m_mr2.m_debug = 1;
|
|
// prepend to the query?
|
|
int32_t ulen = m_firstUrl.m_ulen;
|
|
// go to next guy if this query is too big already
|
|
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
|
|
m_queryNum++;
|
|
goto loop;
|
|
}
|
|
|
|
// support for the new TopDocIds class which holds detailed search
|
|
	// results for selected matching-query QueryLinks
|
|
//int32_t maxt = numQueryPtrs;
|
|
//if ( maxt > m_maxQueries ) maxt = m_maxQueries;
|
|
//if ( ! maxt ) { char *xx=NULL;*xx=0; }
|
|
// we also need the top docids
|
|
//if ( ! m_topDocIdsBuf.m_capacity ) {
|
|
// int32_t need = sizeof(TopDocIds) * (int32_t)MAX_MATCHING_QUERIES;
|
|
// if ( ! m_topDocIdsBuf.reserve ( need ,"tdbuf" ) ) return NULL;
|
|
// //m_nextAvailTopDocIdsOffset = 0;// = m_topDocIdsBuf;
|
|
//}
|
|
// make matching query, "qk", point to the topdocids that we
|
|
// will fill in when we execute this query in full below
|
|
// sanity!
|
|
//int32_t off3 = m_nextAvailTopDocIdsOffset ;
|
|
//if ( off3/(int32_t)sizeof(TopDocIds)>=maxt){char *xx=NULL;*xx=0;}
|
|
// seo.cpp's handleRequest99() should have set it to -1
|
|
//if ( qp->m_topDocIdsBufOffset != -1 ) { char *xx=NULL;*xx=0; }
|
|
// assign this TopDocIds class to this query ptr now
|
|
//qp->m_topDocIdsBufOffset = m_nextAvailTopDocIdsOffset;
|
|
// get that ptr to reset its count to 0
|
|
//TopDocIds *ttt = qp->getTopDocIds(&m_topDocIdsBuf);
|
|
//ttt->m_numDocIds = 0;
|
|
// inc it
|
|
//m_nextAvailTopDocIdsOffset += sizeof(TopDocIds);
|
|
// update length since we store topdocids buf based on its m_length
|
|
//m_topDocIdsBuf.setLength ( m_nextAvailTopDocIdsOffset );
|
|
|
|
// advance for next guy
|
|
m_queryNum++;
|
|
|
|
// add it to dedup table
|
|
if ( ! m_fullQueryDedup.addKey(&h64) ) {
|
|
m_hadMatchError = g_errno;
|
|
goto loop;
|
|
}
|
|
|
|
// mark it out
|
|
m_numMsg3aRequests++;
|
|
|
|
// . set the query class for msg3a
|
|
// . queryExpansion = true
|
|
m_query3a->set2 ( qstr , langId , true );
|
|
|
|
// a debug thing
|
|
m_query3a->m_containingParent = (void *)this;
|
|
|
|
// secret variable latchon
|
|
m_msg3a->m_hack = this;
|
|
|
|
m_msg3a->m_hackQNum = savedQueryNum;
|
|
m_msg3a->m_hackQPtr = NULL;//(char *)qp;
|
|
|
|
// note it
|
|
setStatus("launching msg3a");
|
|
|
|
// . get the docIds
|
|
// . this sets m_msg3a.m_clusterLevels[] for us
|
|
// . it sends a msg39 request to each alive host in the network
|
|
bool status = m_msg3a->getDocIds ( &m_mr2,
|
|
m_query3a,
|
|
this,//m_msg3a , // this ,
|
|
gotMsg3aReplyForFullQueryWrapper);
|
|
// return false if msg3a blocked
|
|
if ( ! status ) return (SafeBuf *)-1;
|
|
// error?
|
|
if ( g_errno ) {
|
|
m_hadMatchError = g_errno;
|
|
m_numMsg3aReplies++;
|
|
goto loop;
|
|
}
|
|
// i guess did not block... can this happen? cached?
|
|
//log("xmldoc: msg3a did not block");
|
|
// not supported yet. we need to process reply.
|
|
//char *xx=NULL;*xx=0;
|
|
// yeah, msg17 in there can cache in seoresults cache now
|
|
gotMsg3aReplyForFullQuery();
|
|
// try looping
|
|
goto loop;
|
|
}
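// Illustrative sketch (kept commented out, not part of the original code):
// the loop above skips a candidate query whenever getSynBaseHash64() of its
// string collides with a query already launched, so only one of a family of
// near-duplicate queries gets the full msg3a treatment. launchUnique() and
// the baseHash callback are made-up names showing just that dedup step.
/*
#include <stdint.h>
#include <string>
#include <unordered_set>
#include <vector>

static int launchUnique ( const std::vector<std::string> &queries ,
			  uint64_t (*baseHash) ( const std::string & ) ) {
	std::unordered_set<uint64_t> seen;
	int launched = 0;
	for ( size_t i = 0 ; i < queries.size() ; i++ ) {
		uint64_t h = baseHash ( queries[i] );
		// already did a similar query, skip it
		if ( ! seen.insert ( h ).second ) continue;
		// ... launch the full query for queries[i] here ...
		launched++;
	}
	return launched;
}
*/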
|
|
|
|
static int rdCmp ( const void *a, const void *b ) {
|
|
RelatedDocId *da = (RelatedDocId *)a;
|
|
RelatedDocId *db = (RelatedDocId *)b;
|
|
// get scores
|
|
float scorea = da->m_relatedWeight;//dotProduct;//similarityScore;
|
|
float scoreb = db->m_relatedWeight;//dotProduct;//similarityScore;
|
|
if ( scorea < scoreb ) return 1;
|
|
if ( scorea > scoreb ) return -1;
|
|
return 0;
|
|
}
|
|
|
|
static int lkCmp ( const void *a, const void *b ) {
|
|
QueryNumLinkedNode *ka = *(QueryNumLinkedNode **)a;
|
|
QueryNumLinkedNode *kb = *(QueryNumLinkedNode **)b;
|
|
// get scores
|
|
int32_t ra = ka->m_relatedDocIdRank;
|
|
int32_t rb = kb->m_relatedDocIdRank;
|
|
if ( ra >= 0 && rb >= 0 ) {
|
|
if ( ra < rb ) return -1;
|
|
if ( ra > rb ) return 1; // swap
|
|
}
|
|
if ( ra >= 0 ) return -1;
|
|
if ( rb >= 0 ) return 1; // swap
|
|
// if neither ranked, go by serp score i guess
|
|
float sa = ka->m_relatedDocIdSerpScore;
|
|
float sb = kb->m_relatedDocIdSerpScore;
|
|
if ( sa > sb ) return -1;
|
|
if ( sa < sb ) return 1; // swap
|
|
return 0;
|
|
}
|
|
|
|
// buf is an array of RelatedDocId members
|
|
SafeBuf *XmlDoc::getRelatedDocIds ( ) {
|
|
|
|
setStatus ( "getrelateddocids" );
|
|
|
|
if ( m_relatedDocIdBufValid )
|
|
return &m_relatedDocIdBuf;
|
|
|
|
// get the full replies with the top 50 docids and scores listed
|
|
// for each query. should be sorted by m_myScore.
|
|
SafeBuf *mq = getMatchingQueriesScoredForFullQuery ( );
|
|
if ( ! mq || mq == (void *)-1 ) return mq;
|
|
|
|
// . how many queries do we have that match this url?
|
|
// . they should be sorted by our url's score
|
|
//QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
//int32_t nks = mq->length()/sizeof(QueryLink);
|
|
|
|
|
|
int32_t *sh32 = getSiteHash32();
|
|
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SafeBuf *)sh32;
|
|
|
|
int32_t dh32 = getDomHash32();
|
|
|
|
//if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
int32_t ourSiteHash26 = *sh32 & 0x03ffffff;
|
|
int32_t ourDomHash26 = dh32 & 0x03ffffff;
|
|
|
|
// for deduping queries with the same "base hash" we do not want
|
|
// them to count twice for RelatedDocId::m_numCommonQueries
|
|
//HashTableX dedup;
|
|
//if ( ! dedup.set(8,0,1024,NULL,0,false,0,"dddtab"))
|
|
// return NULL;
|
|
|
|
// scan the top docids
|
|
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
|
|
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
|
|
for ( int32_t i = 0 ; i < ntds ; i++ ) {
|
|
TopDocIds *td = &tds[i];
|
|
int32_t queryNum = td->m_queryNum;
|
|
//QueryLink *qk = &qks[queryNum];
|
|
// sanity
|
|
int32_t nd = td->m_numDocIds;
|
|
if( nd < 0) { char *xx=NULL;*xx=0; }
|
|
if( nd > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS){
|
|
char *xx=NULL;*xx=0;}
|
|
// get main url score for query
|
|
//float ourScore = qp->m_myScore;
|
|
// and the score of the top result
|
|
//float normScore = td->m_topScores[0];
|
|
// norm main url score
|
|
//ourScore /= normScore;
|
|
// scan the top 50 (or more) docids for this query
|
|
for ( int32_t j = 0 ; j < nd ; j++ ) {
|
|
// . do not allow a related docid (aka competitor page)
//   to be from our own site. that also excludes our url
//   itself. otherwise "competitor backlinks" would include
//   links that already point at us, and we do not care
//   about those since we already have them. we only want
//   to see recommended backlinks we do not yet have, so
//   we can go get them.
// . skip it if from our same sitehash26
|
|
if ( td->m_topSiteHashes26[j] == ourSiteHash26 )
|
|
continue;
|
|
// fix cheatcodes.com being a competitor page when
|
|
// our main url is www.cheatcodes.com
|
|
if ( td->m_topSiteHashes26[j] == ourDomHash26 )
|
|
continue;
|
|
// skip twitter facebook, etc
|
|
int64_t docId = td->m_topDocIds[j];
|
|
if ( docId == 114607849462LL || // https://www.twitter
|
|
docId == 273941610476LL || // twitter.com
|
|
docId == 1628437294LL || // facebook.com
|
|
docId == 146394931444LL ) // cnn.com/video/
|
|
continue;
|
|
// add RelatedDocId into m_relatedDocIdBuf and/or
|
|
// augment its linked list of query/score pairs
|
|
addRelatedDocIdInfo ( td->m_topDocIds[j],
|
|
queryNum ,
|
|
td->m_topScores[j], // score
|
|
j , // rank
|
|
td->m_topSiteHashes26[j] );
|
|
}
|
|
}
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// this is now in getRelatedDocIdsScored()!!!!!!!
|
|
/*
|
|
char *rdbuf = m_relatedDocIdBuf.getBufStart();
|
|
int32_t numDocIds = m_relatedDocIdBuf.length()/sizeof(RelatedDocId);
|
|
// now sort by RelatedDocId::m_relatedWeight
|
|
qsort ( rdbuf , numDocIds, sizeof(RelatedDocId),qp99docIdCmp );
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// limit to top MAX_RELATED_DOCIDS related docids
|
|
// will take longer to get titles/urls and related queries the
|
|
// higher this number is, but we will have more competitor backlinks
|
|
// and terms etc.
|
|
int32_t maxLen = sizeof(RelatedDocId) * MAX_RELATED_DOCIDS;
|
|
int32_t currentLen = m_relatedDocIdBuf.length();
|
|
if ( currentLen > maxLen ) currentLen = maxLen;
|
|
m_relatedDocIdBuf.setLength(currentLen);
|
|
numDocIds = currentLen / sizeof(RelatedDocId);
|
|
*/
|
|
|
|
int32_t numDocIds = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
|
|
/*
|
|
// log out for debug
|
|
char *rdbuf = m_relatedDocIdBuf.getBufStart();
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf;
|
|
for ( int32_t i = 0 ; g_conf.m_logDebugSEO && i < numDocIds ; i++ ) {
|
|
log("seopipe: related docId #%"INT32" docid=%"INT64" "
|
|
"score=?? common=%"INT32"",
|
|
i,
|
|
rds[i].m_docId,
|
|
//rds[i].m_relatedWeight,//dotProduct, // similarityScore,
|
|
rds[i].m_numCommonQueries);
|
|
}
|
|
*/
|
|
|
|
log("seo: got %"INT32" related docids in buf",numDocIds);
|
|
|
|
m_relatedDocIdBufValid = true;
|
|
return &m_relatedDocIdBuf;
|
|
}
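// Illustrative sketch (kept commented out, not part of the original code):
// conceptually, every (docid,query,score) triple harvested from the top
// results above gives that docid one more "common query" vote, which is
// what addRelatedDocIdInfo() accumulates into m_relatedDocIdBuf. the
// RelatedTally struct and tallyRelatedDocId() below are made-up names that
// only model that bookkeeping, not the real RelatedDocId layout.
/*
#include <stdint.h>
#include <map>
#include <vector>

struct RelatedTally {
	int32_t            numCommonQueries;
	std::vector<float> serpScores;   // one serp score per query it ranked for
	RelatedTally() : numCommonQueries(0) {}
};

static void tallyRelatedDocId ( std::map<int64_t,RelatedTally> &tally ,
				int64_t docId , float serpScore ) {
	RelatedTally &t = tally[docId];  // inserted on first sighting
	t.numCommonQueries++;
	t.serpScores.push_back ( serpScore );
}
*/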
|
|
|
|
|
|
// used as part of the msg4f request
|
|
SafeBuf *XmlDoc::getTopMatchingQueryBuf ( ) {
|
|
|
|
if ( m_topMatchingQueryBufValid )
|
|
return &m_topMatchingQueryBuf;
|
|
|
|
// scan matching queries that we evaluated fully using msg3a
|
|
SafeBuf *qkbuf = getMatchingQueriesScoredForFullQuery ( );
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
|
|
//int32_t nks = qkbuf->length()/sizeof(QueryLink);
|
|
|
|
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
|
|
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
|
|
for ( int32_t i = 0 ; i < ntds ; i++ ) {
|
|
TopDocIds *td = &tds[i];
|
|
int32_t queryNum = td->m_queryNum;
|
|
QueryLink *qk = &qks[queryNum];
|
|
// ok, get it
|
|
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
|
|
int32_t qlen = gbstrlen(qstr);
|
|
// store query #
|
|
if ( ! m_topMatchingQueryBuf.pushLong(queryNum) )
|
|
return NULL;
|
|
// then query
|
|
if ( ! m_topMatchingQueryBuf.safeMemcpy(qstr,qlen+1))
|
|
return NULL;
|
|
}
|
|
|
|
m_topMatchingQueryBufValid = true;
|
|
return &m_topMatchingQueryBuf;
|
|
}
|
|
|
|
|
|
|
|
static void gotMsg4fReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
// a bit of a hack
|
|
THIS->m_savedSlot = slot;
|
|
// ultimately, getRelatedDocIdsScored() will be called from this
|
|
THIS->m_masterLoop ( THIS->m_masterState );
|
|
}
|
|
|
|
// . lets just put everything in this one function
|
|
// . launch a msg4f request for each relateddocid
|
|
// . get the msg4f reply back and add the positive scoring queries to the
|
|
// related docids linked list of QueryNumLinkedNodes in the
|
|
// m_commonQueryNumBuf, avoid dups.
|
|
// . then score each related docid by calling setRelatedDocIdScores()
|
|
SafeBuf *XmlDoc::getRelatedDocIdsScored ( ) {
|
|
|
|
setStatus ( "getrelateddocidsscored");
|
|
|
|
if ( m_relatedDocIdsScoredBufValid ) {
|
|
// and return the buf of RelatedDocIds
|
|
return &m_relatedDocIdBuf;
|
|
}
|
|
|
|
// what docids share our TOP-scoring matching queries?
|
|
SafeBuf *rdbuf = getRelatedDocIds();
|
|
if ( ! rdbuf || rdbuf == (void *)-1) return (SafeBuf *) rdbuf;
|
|
|
|
SafeBuf *tmq = getTopMatchingQueryBuf();
|
|
if ( ! tmq || tmq == (void *)-1) return (SafeBuf *) tmq;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// the top 50 or so matching queries will each be scored for
|
|
// every related docid we have in m_relatedDocIdBuf. these are
|
|
// the same queries we got the full results for above!!!
|
|
// we have to score them for each related docid here because we only
|
|
// get the top 300 or so results above for each one. so if the
|
|
// related docid matched the query but was not in the top 300 results,
|
|
// it would have appeared to NOT match the query. bad. that was
|
|
// causing google to come up high in related docids because it
|
|
// ranked high for so many generic queries. and the other good
|
|
// related docids did not rank in the top 300 for those same
|
|
// generic queries. so at least this logic will show that the
|
|
// related docids do indeed match those generic queries, too.
|
|
// and they will get higher scores (RelatedDocId::m_relatedWeight)
|
|
|
|
	// if we already sent out the requests, this call must be handling an incoming reply
|
|
if ( m_numMsg4fRequests > 0 ) {
|
|
// increment our reply counter
|
|
m_numMsg4fReplies++;
|
|
// . m_savedSlot is a hack
|
|
// . now parse the reply and add QueryNumLinkedNode
|
|
// into m_commonQueryNumBuf.
|
|
char *p = m_savedSlot->m_readBuf;
|
|
char *pend = m_savedSlot->m_readBufSize + p;
|
|
// now scan the reply
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// the queryNum is relative to the m_queryPtrs array
|
|
// which has all the matching queries of this document,
|
|
// not just the "top" 50 matching queries by score.
|
|
int32_t queryNum = *(int32_t *)p;
|
|
// sanity
|
|
if ( queryNum<0 ) {char *xx=NULL;*xx=0; }
|
|
p += 4;
|
|
// then docid of related docid that had this score
|
|
int64_t docId = *(int64_t *)p;
|
|
p += 8;
|
|
// then score
|
|
float score = *(float *)p;
|
|
p += 4;
|
|
// this will add the query/score pair into the
|
|
// related docid buf. it will not add dups if already
|
|
// ranked!
|
|
addRelatedDocIdInfo ( docId ,
|
|
queryNum ,
|
|
score ,
|
|
-1 , // rank unknown
|
|
-1 ); // sitehash26 unknown
|
|
}
|
|
|
|
// return if awaiting more replies
|
|
if ( m_numMsg4fReplies < m_numMsg4fRequests )
|
|
return (SafeBuf *)-1;
|
|
|
|
// point to buffer of related docids
|
|
char *rdbuf = m_relatedDocIdBuf.getBufStart();
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf;
|
|
int32_t nr = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
|
|
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
|
	// shortcut
|
|
RelatedDocId *rd = &rds[i];
|
|
// now score it since we have all the serpscores for
|
|
// all top matching queries.
|
|
setRelatedDocIdWeightAndRank(rd);
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// now sort by RelatedDocId::m_relatedWeight
|
|
qsort ( rdbuf , nr , sizeof(RelatedDocId),rdCmp );
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// limit to top MAX_RELATED_DOCIDS related docids
|
|
// will take longer to get titles/urls and related queries the
|
|
// higher this number is, but we will have more competitor
|
|
// backlinks and terms etc. less space in cachedb too!
|
|
int32_t maxLen = MAX_RELATED_DOCIDS * sizeof(RelatedDocId);
|
|
int32_t newLen = m_relatedDocIdBuf.length();
|
|
if ( newLen > maxLen ) newLen = maxLen;
|
|
m_relatedDocIdBuf.setLength(newLen);
|
|
|
|
//
|
|
// make a new buffer for m_commonQueryNumBuf just for the
|
|
// related docids we picked, and sort them by rel docid rank.
|
|
// so it will be smaller and sorted.
|
|
//
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( m_commonQueryNumBuf.length() ) )
|
|
return NULL;
|
|
// scan each related docid in the top 300 or so
|
|
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
|
	// shortcut
|
|
RelatedDocId *rd = &rds[i];
|
|
// store ptrs to query nums so we can sort them
|
|
QueryNumLinkedNode *links[1024];
|
|
int32_t nn = 0;
|
|
int32_t fo = rd->m_firstCommonQueryNumOff;
|
|
char *base = m_commonQueryNumBuf.getBufStart();
|
|
// scan down the linked list and store ptrs to links[]
|
|
for ( ; fo >= 0 ; ) {
|
|
// cast it
|
|
QueryNumLinkedNode *qn;
|
|
qn = (QueryNumLinkedNode *)(base + fo);
|
|
// point to next
|
|
fo = qn->m_nextOff;
|
|
// store this guy for sorting
|
|
links[nn] = qn;
|
|
nn++;
|
|
if ( nn >= 1024 ) break;
|
|
}
|
|
// now sort them by m_relatedDocIdRank
|
|
qsort( links, nn,sizeof(QueryNumLinkedNode *),lkCmp);
|
|
// point to our new linked list in tmpBuf, we will
|
|
// store them here.
|
|
rd->m_firstCommonQueryNumOff = tmpBuf.length();
|
|
QueryNumLinkedNode *prev = NULL;
|
|
// now store into tmpbuf
|
|
for ( int32_t k = 0 ; k < nn ; k++ ) {
|
|
QueryNumLinkedNode *qn = links[k];
|
|
int32_t size = sizeof(QueryNumLinkedNode);
|
|
if ( !tmpBuf.reserve(size) ) return NULL;
|
|
QueryNumLinkedNode *newNode ;
newNode = (QueryNumLinkedNode *)tmpBuf.getBuf();
int32_t clen = tmpBuf.length();
tmpBuf.safeMemcpy(qn,size);
// we are the previous guy's next node
if ( prev ) prev->m_nextOff = clen;
// assume nobody follows us
newNode->m_nextOff = -1;
// we are now next guy's prev
prev = newNode;
|
|
}
|
|
}
|
|
|
|
// now steal tmpbuf, and free our old stuff
|
|
m_commonQueryNumBuf.stealBuf ( &tmpBuf );
|
|
|
|
// i guess we are done now!
|
|
m_relatedDocIdsScoredBufValid = true;
|
|
return &m_relatedDocIdBuf;
|
|
}
|
|
|
|
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
|
|
// . there's a massive # of related docids at this point
|
|
// . possibly 50 x 300 = 15,000
|
|
// . so launch one msg4f for each host in our network
|
|
// . just specify all the related docids in the msg4f request and have
|
|
// the handleRequest4f() function in seo.cpp get the title rec.
|
|
// . make sure all docids are local to that host
|
|
// . dispatch the msg4f request to the machine that has that docid
|
|
// local so it can just hit disk
|
|
// . handleRequest4f() can follow the same logic as in
|
|
// getRelatedQueryLinks() which make a new xmldoc. then it can
|
|
// call newxd->getTermListBuf() instead of us passing it in.
|
|
// . so each host has a bin, a host bin
|
|
//#ifdef __APPLE__
|
|
SafeBuf hostBin[MAX_HOSTS];
|
|
//#else
|
|
//SafeBuf hostBin[g_hostdb.m_numHosts];
|
|
//#endif
|
|
|
|
// scan the related docids and send the requests if we have not already
|
|
for ( int32_t i = 0 ; ! m_sentMsg4fRequests && i < numRelated ; i++ ) {
|
|
RelatedDocId *rd = &rds[i];
|
|
//uint32_t gid=g_hostdb.getGroupIdFromDocId (rd->m_docId);
|
|
// pick host in that group
|
|
//Host *group = g_hostdb.getGroup ( gid );
|
|
int32_t shardNum = getShardNumFromDocId ( rd->m_docId );
|
|
Host *group = g_hostdb.getShard ( shardNum );
|
|
int32_t nh = g_hostdb.m_numHostsPerShard;
|
|
int32_t hostNum = rd->m_docId % nh;
|
|
Host *h = &group[hostNum];
|
|
int32_t hostId = h->m_hostId;
|
|
// skip if dead
|
|
int32_t count = 0;
|
|
while ( g_hostdb.isDead(hostId) && h->m_wasEverAlive ) {
|
|
// increment hostnum if that one is dead
|
|
if ( ++hostNum >= nh ) hostNum = 0;
|
|
// set these again
|
|
h = &group[hostNum];
|
|
hostId = h->m_hostId;
|
|
// if all dead, just pick this one i guess
|
|
if ( ++count >= nh ) break;
|
|
}
|
|
	// shortcut
|
|
SafeBuf *hbin = &hostBin[hostId];
|
|
// if bin is empty initialize
|
|
if ( hbin->length() == 0 ) {
|
|
// provide only collection to handleRequest4f()
|
|
if ( ! hbin->safeMemcpy(cr->m_coll,
|
|
gbstrlen(cr->m_coll)+1) )
|
|
return NULL;
|
|
// . store the queries we want it to evaluate
|
|
	// . these are null-terminated query strings preceded
|
|
// by their corresponding query number in our
|
|
// m_queryPtrs[] array which pts to a Msg99Reply
|
|
if ( ! hbin->pushLong(tmq->length()))
|
|
return NULL;
|
|
if ( ! hbin->safeMemcpy(tmq))
|
|
return NULL;
|
|
}
|
|
// store this new docid, which is local to this host
|
|
if ( ! hbin->pushLongLong(rd->m_docId) ) return NULL;
|
|
}
|
|
|
|
// shotgun out the msg4f requests now
|
|
for ( int32_t i = 0 ;
|
|
! m_sentMsg4fRequests && i < g_hostdb.getNumHosts() ; i++ ) {
|
|
	// shortcut
|
|
SafeBuf *hbin = &hostBin[i];
|
|
// get that host
|
|
Host *host = g_hostdb.getHost(i);
|
|
// make a copy for sending out
|
|
SafeBuf copy;
|
|
if ( ! copy.safeMemcpy ( hbin ) ) continue;
|
|
// get the bin copy
|
|
char *req = copy.getBufStart();
|
|
int32_t reqSize = copy.length();
|
|
// detach it so udpserver can free it when done transmitting
|
|
copy.detachBuf ();
|
|
// free this guy now i guess
|
|
hbin->purge();
|
|
// count as launched
|
|
m_numMsg4fRequests++;
|
|
// launch it
|
|
if ( ! g_udpServer.sendRequest ( req ,
|
|
reqSize,
|
|
0x4f , // msgtype
|
|
host->m_ip , // ip
|
|
host->m_port , // port
|
|
host->m_hostId,
|
|
NULL, // retslot
|
|
this,
|
|
gotMsg4fReplyWrapper,
|
|
10000 , // timeout
|
|
-1 , // backoff
|
|
-1 , // maxwait
|
|
NULL, // replybuf
|
|
0, // replybufmaxsize
|
|
m_niceness // niceness
|
|
)) {
|
|
// let admin know about error
|
|
log("seopipe: sendRequest 4f had error: %s",
|
|
mstrerror(g_errno));
|
|
// count it as replied then
|
|
m_numMsg4fReplies++;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// do not re-send the requests
|
|
m_sentMsg4fRequests = true;
|
|
|
|
// wait for all replies to come in
|
|
if ( m_numMsg4fRequests > m_numMsg4fReplies ) return (SafeBuf *)-1;
|
|
|
|
// how can they all be done? all errors!
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
|
|
return NULL;
|
|
}
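// Illustrative sketch (kept commented out, not part of the original code):
// the per-host msg4f request assembled into hostBin[] above is laid out as
//   coll\0 | int32 length of the top-matching-query buf | that buf verbatim
//   (itself a run of int32 queryNum + NUL-terminated query string) |
//   one int64 per docid that lives on that host,
// and the reply walked near the top of this function is a flat run of
//   int32 queryNum | int64 docId | float serpScore.
// buildMsg4fRequest() is a made-up serializer shown only to document that
// layout; the real code appends straight into a SafeBuf and seo.cpp's
// handleRequest4f() does the parsing on the other side.
/*
#include <stdint.h>
#include <string>
#include <vector>

static std::string buildMsg4fRequest ( const std::string &coll ,
					const std::string &tmq ,
					const std::vector<int64_t> &docIds ) {
	std::string req;
	req.append ( coll.c_str() , coll.size() + 1 );          // keep the '\0'
	int32_t tmqLen = (int32_t)tmq.size();
	req.append ( (const char *)&tmqLen , sizeof(tmqLen) );
	req.append ( tmq );                                     // queryNum+string pairs
	for ( size_t i = 0 ; i < docIds.size() ; i++ )
		req.append ( (const char *)&docIds[i] , sizeof(int64_t) );
	return req;
}
*/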
|
|
|
|
|
|
|
|
// remote host will alloc an xmldoc, about 1MB each...
|
|
#define MAX_OUT_MSG20S 30
|
|
|
|
// . like getRelatedDocIds() but with titles, etc.
|
|
// . return a list of competing docids/titles/etc.
|
|
SafeBuf *XmlDoc::getRelatedDocIdsWithTitles ( ) {
|
|
|
|
setStatus ( "getrelateddocidswithtitles" );
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_relatedDocIdsWithTitlesValid )
|
|
return &m_relatedDocIdBuf;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
SafeBuf *rdbuf = getRelatedDocIdsScored();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
	// now look up each docid in titledb and store the url and title
	// into the m_relatedTitleBuf safebuf, recording RelatedDocId::
	// rd_title_off and rd_url_off as offsets into it. store offsets for
	// now and convert them into full pointers when done, in case
	// m_relatedTitleBuf reallocs along the way.
|
|
|
|
if ( ! m_msg20Buf.length() ) {
|
|
int32_t need = sizeof(Msg20) * MAX_OUT_MSG20S ;
|
|
if ( ! m_msg20Buf.reserve ( need,"m20buf" ) ) return NULL;
|
|
// mark it all in use
|
|
m_msg20Buf.setLength(need);
|
|
// init them
|
|
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
|
|
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
|
|
// reset cursor to start with first related docid
|
|
m_rdCursor = 0;
|
|
m_relatedDocIdError = 0;
|
|
m_numMsg20Replies = 0;
|
|
}
|
|
|
|
// point to buffer of related docids
|
|
	RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
|
|
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
|
|
|
|
// scan the msg20s we allocated to see if any got a reply
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
	// shortcut
|
|
Msg20 *msg20 = &mp[i];
|
|
// skip if never launched
|
|
if ( ! msg20->m_launched ) continue;
|
|
// skip if it is in progress, awaiting its reply
|
|
if ( msg20->m_inProgress ) continue;
|
|
// get the reply from it (might be NULL iff g_errno is set)
|
|
Msg20Reply *reply = msg20->getReply(); // m_r
|
|
// get the corresponding related docid
|
|
int32_t hisCursor = msg20->m_hack2;
|
|
	// shortcut
|
|
RelatedDocId *rd = &rds[hisCursor];
|
|
// ok, it has a reply. could be NULL if g_errno was set.
|
|
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , reply ) )
|
|
m_relatedDocIdError = g_errno;
|
|
	// reset it for later use... or not...
|
|
msg20->reset();
|
|
// count reply as back now
|
|
m_numMsg20Replies++;
|
|
}
|
|
|
|
// launch more if we can. one launch per msg20.
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// no more related docids left to launch?
|
|
if ( m_rdCursor >= numRelated ) break;
|
|
	// shortcut
|
|
Msg20 *msg20 = &mp[i];
|
|
// skip if already launched/inuse
|
|
if ( msg20->m_inProgress ) continue;
|
|
// get current related docid
|
|
RelatedDocId *rd = &rds[m_rdCursor];
|
|
// make the request
|
|
Msg20Request req;
|
|
//req.ptr_coll = cr->m_coll;
|
|
//req.size_coll = gbstrlen(cr->m_coll)+1;
|
|
req.m_collnum = cr->m_collnum;
|
|
req.m_docId = rd->m_docId;
|
|
req.m_expected = true;
|
|
req.m_niceness = m_niceness;
|
|
req.m_state = m_masterState;
|
|
req.m_callback2 = m_masterLoop;
|
|
// do not get summary stuff. too slow.
|
|
req.m_numSummaryLines = 0;
|
|
// if it has an outlink to our site/domain set
|
|
// Msg20Reply::m_hasLinkToOurDomOrHost
|
|
req.m_ourHostHash32 = getHostHash32a();
|
|
req.m_ourDomHash32 = getDomHash32();
|
|
// store cursor in msg20 itself so we know what rd it's using
|
|
msg20->m_hack2 = m_rdCursor;
|
|
// advance cursor!!!
|
|
m_rdCursor++;
|
|
// launch it
|
|
if ( ! msg20->getSummary ( &req ) ) continue;
|
|
// it did not block... wtf? g_errno might be set. ENOMEM?
|
|
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , NULL ) )
|
|
m_relatedDocIdError = g_errno;
|
|
// reset it
|
|
msg20->reset();
|
|
// count reply as back now
|
|
m_numMsg20Replies++;
|
|
// it is not launched
|
|
i--;
|
|
}
|
|
|
|
// wait for one reply per related docid
|
|
if ( m_numMsg20Replies < numRelated )
|
|
return (SafeBuf *)-1;
|
|
|
|
// call msg20 destructor
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
Msg20 *msg20 = &mp[i];
|
|
msg20->destructor();
|
|
}
|
|
// purge the mem they used
|
|
m_msg20Buf.purge();
|
|
|
|
// now we are done
|
|
m_relatedDocIdsWithTitlesValid = true;
|
|
m_relatedTitleBufValid = true;
|
|
|
|
// store it in cachedb
|
|
if ( ! storeRelatedDocIdsIntoCachedb( ))
|
|
return (SafeBuf *)-1;
|
|
|
|
return &m_relatedDocIdBuf;
|
|
}
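// Illustrative sketch (kept commented out, not part of the original code):
// getRelatedDocIdsWithTitles() above keeps at most MAX_OUT_MSG20S title
// lookups in flight: one pass reaps any Msg20 that finished, a second pass
// refills the freed slots from m_rdCursor. pumpPool() below is a made-up,
// generic version of that reap-then-refill step; Slot stands in for Msg20
// and the issue/harvest callbacks stand in for getSummary() and the reply
// handling.
/*
#include <stdint.h>
#include <vector>

struct Slot { bool inUse; int32_t jobIdx; Slot() : inUse(false), jobIdx(-1) {} };

// issue(j)  : start job j, return true if it is now in flight
// harvest(j): return true if job j has finished and its result was consumed
template < class IssueFn , class HarvestFn >
static int32_t pumpPool ( std::vector<Slot> &slots , int32_t &cursor ,
			  int32_t numJobs , IssueFn issue , HarvestFn harvest ) {
	int32_t completed = 0;
	for ( size_t i = 0 ; i < slots.size() ; i++ ) {          // reap
		Slot &s = slots[i];
		if ( s.inUse && harvest ( s.jobIdx ) ) { s.inUse = false; completed++; }
	}
	for ( size_t i = 0 ; i < slots.size() ; i++ ) {          // refill
		Slot &s = slots[i];
		if ( s.inUse || cursor >= numJobs ) continue;
		s.jobIdx = cursor++;
		s.inUse  = issue ( s.jobIdx );   // false = finished synchronously
		if ( ! s.inUse ) completed++;
	}
	return completed;                        // caller keeps pumping until all jobs done
}
*/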
|
|
|
|
|
|
bool XmlDoc::setRelatedDocIdInfoFromMsg20Reply ( RelatedDocId *rd ,
|
|
Msg20Reply *reply ) {
|
|
|
|
// get error. g_errno can be ENOTFOUND if titlerec not found
|
|
int32_t error = g_errno;
|
|
// . or could be EDOCBANNED/EDOCFILTERED etc.
|
|
// . if reply is NULL then g_errno MUST be set
|
|
if ( ! error ) error = reply->m_errno;
|
|
|
|
	// shortcuts
|
|
char *urlStr = NULL;
|
|
char *titleStr = NULL;
|
|
char *siteStr = NULL;
|
|
|
|
if ( reply ) {
|
|
urlStr = reply->ptr_ubuf;
|
|
titleStr = reply->ptr_tbuf;
|
|
siteStr = reply->ptr_site;
|
|
}
|
|
|
|
// did that fail? i.e. docid not found!?!?!
|
|
if ( error ) {
|
|
// . just skip this asshole then
|
|
// . might be EDOCBANNED or EDOCFILTERED!
|
|
// . some are filtered because they are domain-only urls
|
|
// which should not be in the index because we force
|
|
// a "www." prepend on all urls now.
|
|
log("seo: msg20 reply for docid=%"INT64" url=%s had "
|
|
"error: %s", rd->m_docId,urlStr,mstrerror(error));
|
|
// clear that
|
|
g_errno = 0;
|
|
ignoreRelatedDocId:
|
|
// mark them offsets as not-founds
|
|
rd->rd_title_off = -1;
|
|
rd->rd_url_off = -1;
|
|
rd->rd_site_off = -1;
|
|
return true;
|
|
}
|
|
|
|
// bar facebook.com and twitter.com roots... too popular for all!
|
|
// was coming up for jezebelgallery.com
|
|
if ( strcmp(urlStr,"http://www.twitter.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"https://www.twitter.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"http://www.facebook.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
// "/home.php?" or "home.*"
|
|
if ( strncmp(urlStr,"http://www.facebook.com/home.",29) == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"https://www.facebook.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
if ( strcmp(urlStr,"http://www.cnn.com/video/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
// fix robothits.com competitor pages
|
|
if ( strcmp(urlStr,"http://www.google.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
if ( strcmp(urlStr,"http://www.msn.com/") == 0 )
|
|
goto ignoreRelatedDocId;
|
|
|
|
// null means no title i guess
|
|
if ( ! titleStr ) titleStr = "";
|
|
|
|
// or if he links to us
|
|
if ( reply->m_hasLinkToOurDomOrHost ) {
|
|
log("seo: related docid=%"INT64" url=%s links to our domain",
|
|
reply->m_docId,
|
|
urlStr);
|
|
goto ignoreRelatedDocId;
|
|
}
|
|
|
|
|
|
// store title
|
|
int32_t titleOffset = m_relatedTitleBuf.length();
|
|
if ( ! m_relatedTitleBuf.safeStrcpy ( titleStr ) ) return false;
|
|
m_relatedTitleBuf.pushChar('\0');
|
|
|
|
// then url
|
|
int32_t urlOffset = m_relatedTitleBuf.length();
|
|
if ( ! m_relatedTitleBuf.safeStrcpy ( urlStr ) ) return false;
|
|
m_relatedTitleBuf.pushChar('\0');
|
|
|
|
// then site
|
|
int32_t siteOffset = m_relatedTitleBuf.length();
|
|
if ( ! m_relatedTitleBuf.safeStrcpy ( siteStr ) ) return false;
|
|
m_relatedTitleBuf.pushChar('\0');
|
|
|
|
// then linkinfo
|
|
//int32_t linkInfo1Offset = m_relatedTitleBuf.length();
|
|
//if(!m_relatedTitleBuf.safeMemcpy(info1,info1->getSize()))return NULL;
|
|
|
|
// store as offset for easy serialization for storage into cachedb
|
|
//rd->m_linkInfo1Offset = linkInfo1Offset;
|
|
rd->m_relatedFirstIp = reply->m_firstIp;
|
|
rd->m_relatedCurrentIp = reply->m_ip;
|
|
rd->m_rd_siteRank = reply->m_siteRank;
|
|
rd->m_rd_langId = reply->m_language;
|
|
|
|
rd->m_rd_siteHash32 = 0;
|
|
if ( reply->ptr_site )
|
|
rd->m_rd_siteHash32 = hash32n ( reply->ptr_site );
|
|
|
|
// record the offsets of title/url/site in the m_relatedTitleBuf
|
|
rd->rd_title_off = titleOffset;
|
|
rd->rd_url_off = urlOffset;
|
|
rd->rd_site_off = siteOffset;
|
|
|
|
SafeBuf *rdbuf = getRelatedDocIds();
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
// log out for debug
|
|
log(LOG_DEBUG,
|
|
"seopipe: related docid (%"INT32"of%"INT32") docid=%"INT64" score=%f "
|
|
"title=\"%s\" url=\"%s\"",
|
|
m_numMsg20Replies,
|
|
numRelated-1,
|
|
rd->m_docId,
|
|
rd->m_relatedWeight,
|
|
titleStr,
|
|
urlStr);
|
|
|
|
return true;
|
|
}
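// Illustrative sketch (kept commented out, not part of the original code):
// the rd_title_off/rd_url_off/rd_site_off fields set above are byte
// offsets, not pointers, because m_relatedTitleBuf can realloc while more
// titles are appended (and offsets also serialize directly into cachedb).
// StringPool below is a made-up stand-in showing the same append-by-offset,
// resolve-later idea.
/*
#include <stdint.h>
#include <string.h>
#include <vector>

class StringPool {
public:
	// append a NUL-terminated copy and return its offset
	int32_t add ( const char *s ) {
		int32_t off = (int32_t)m_buf.size();
		m_buf.insert ( m_buf.end() , s , s + strlen(s) + 1 );
		return off;
	}
	// only safe to dereference once no more add() calls will happen
	const char *resolve ( int32_t off ) const { return &m_buf[off]; }
private:
	std::vector<char> m_buf;
};
*/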
|
|
/*
|
|
HashTableX *XmlDoc::getMatchingQueryHashTable ( ) {
|
|
|
|
setStatus ( "getmatchingqueryhashtable" );
|
|
|
|
if ( m_queryHashTableValid )
|
|
return &m_queryHashTable;
|
|
|
|
SafeBuf *qpbuf = getMatchingQueries(false);
|
|
if ( ! qpbuf || qpbuf == (void *)-1) return (HashTableX *)qpbuf;
|
|
|
|
// how many queries do we have that match this url?
|
|
Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
|
|
// init it
|
|
if ( ! m_queryHashTable.set(8,
|
|
0,
|
|
numQueryPtrs*4,
|
|
NULL,
|
|
0,
|
|
false,
|
|
m_niceness,
|
|
"qdht") )
|
|
return NULL;
|
|
|
|
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
|
|
// cast it
|
|
Msg99Reply *qp = qptrs[i];
|
|
		// shortcut
|
|
int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
|
|
// hash it up
|
|
if ( ! m_queryHashTable.addKey ( &eh64 ) )
|
|
return NULL;
|
|
}
|
|
|
|
// all done
|
|
m_queryHashTableValid = true;
|
|
return &m_queryHashTable;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
HashTableX *XmlDoc::getMatchingQueryOffsetTable ( ) {
|
|
|
|
setStatus ( "getmatchingqueryoffsettable" );
|
|
|
|
if ( m_queryOffsetTableValid )
|
|
return &m_queryOffsetTable;
|
|
|
|
SafeBuf *qkbuf = getMatchingQueryBuf();
|
|
if ( ! qkbuf || qkbuf == (void *)-1) return (HashTableX *)qkbuf;
|
|
|
|
// how many queries do we have that match this url?
|
|
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
|
|
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
|
|
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
|
|
int32_t nks = qkbuf->length()/sizeof(QueryLink);
|
|
|
|
|
|
// init it
|
|
if ( ! m_queryOffsetTable.set(8,
|
|
0,
|
|
nks*4,
|
|
NULL,
|
|
0,
|
|
false,
|
|
m_niceness,
|
|
"qdot") )
|
|
return NULL;
|
|
|
|
for ( int32_t i = 0 ; i < nks ; i++ ) {
|
|
// cast it
|
|
QueryLink *qk = &qks[i];
|
|
		// shortcut
|
|
//int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
|
|
int64_t eh64 = qp->m_replyingHostId;
|
|
eh64 <<= 32;
|
|
eh64 |= qp->m_qbufOffset;
|
|
// hash it up
|
|
if ( ! m_queryOffsetTable.addKey ( &eh64 ) )
|
|
return NULL;
|
|
}
|
|
|
|
// all done
|
|
m_queryOffsetTableValid = true;
|
|
return &m_queryOffsetTable;
|
|
}
|
|
|
|
//static char *s_base = NULL;
|
|
|
|
// related QUERY compare
|
|
int qp99relatedCmp ( const void *a, const void *b ) {
|
|
// these are offsets
|
|
//int32_t offa = *(int32_t *)a;
|
|
//int32_t offb = *(int32_t *)b;
|
|
QueryLink *qa = *(QueryLink **)a;
|
|
QueryLink *qb = *(QueryLink **)b;
|
|
// make sure manually added queries are on top
|
|
//if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
|
|
//if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
|
|
//QueryInfo *qia = &qa->m_queryInfo;
|
|
//QueryInfo *qib = &qb->m_queryInfo;
|
|
// get scores
|
|
float scorea = qa->m_rq_totalScore;
|
|
float scoreb = qb->m_rq_totalScore;
|
|
if ( scorea < scoreb ) return 1;
|
|
if ( scorea > scoreb ) return -1;
|
|
//return 0;
|
|
// let docidsincommon break ties
|
|
return qb->m_docIdVotes - qa->m_docIdVotes;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
static int qlCmp ( const void *a, const void *b ) {
|
|
QueryLink *qa = (QueryLink *)a;
|
|
QueryLink *qb = (QueryLink *)b;
|
|
|
|
// let docid break ties
|
|
int64_t da = qa->getRelatedDocId(s_rdBuf)->m_docId;
|
|
int64_t db = qb->getRelatedDocId(s_rdBuf)->m_docId;
|
|
|
|
//int64_t da = qa->m_relatedDocId->m_docId;
|
|
//int64_t db = qb->m_relatedDocId->m_docId;
|
|
|
|
// always niceness 1 i guess
|
|
QUICKPOLL(1);
|
|
|
|
if ( da > db )
|
|
return 1; // 1 means to swap!
|
|
if ( da < db )
|
|
return -1;
|
|
return 0;
|
|
}
|
|
*/
|
|
|
|
#include <math.h> // sqrtf()
|
|
|
|
// now we can do square roots in gdb by calling this
|
|
float gbsqrt ( float x ) {
|
|
return sqrtf(x);
|
|
}
|
|
|
|
|
|
/*
|
|
// sort the related query links intersected buf by docid
|
|
QueryLink *ptrs;
|
|
ptrs = (QueryLink *)m_relatedQueryLinksIntersected.getBufStart();
|
|
int32_t nk = m_relatedQueryLinksIntersected.length() / sizeof(QueryLink);
|
|
qsort ( ptrs ,
|
|
nk,
|
|
sizeof(QueryLink),
|
|
qlCmp );
|
|
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - start;
|
|
log("seopipe: time: relatedqueryintersection took %"INT64" ms",took);
|
|
*/
|
|
|
|
/*
|
|
void XmlDoc::gotMsg98Reply ( UdpSlot *slot ) {
|
|
// get replying hostid
|
|
int32_t hostId = slot->m_hostId;
|
|
// log
|
|
setStatus ( "gotmsg98reply" );
|
|
// sanity
|
|
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
|
|
// point to it
|
|
char *p = slot->m_readBuf;
|
|
char *pend = p + slot->m_readBufSize;
|
|
	// shortcuts
|
|
QueryLink *qks = (QueryLink *)m_tmpBuf5.getBufStart();
|
|
// sanity, i guess if oom
|
|
int32_t maxLinkOff = m_tmpBuf5.length() ;
|
|
maxLinkOff /= sizeof(QueryLink);
|
|
// make some space
|
|
int32_t need = slot->m_readBufSize;
|
|
if ( ! m_tmpStringBuf5.reserve(need,"rqdbuf") ) {
|
|
m_msg98ReplyError = g_errno;
|
|
// do not bother scanning the reply
|
|
p = pend;
|
|
}
|
|
|
|
// init table
|
|
if ( m_qstringTable.m_numSlots == 0 ) {
|
|
// 1M slots!
|
|
if ( ! m_qstringTable.set(4,4,1000000,NULL,0,false,
|
|
m_niceness,"qstrtbl") ) {
|
|
m_msg98ReplyError = g_errno;
|
|
// do not bother scanning the reply
|
|
p = pend;
|
|
}
|
|
}
|
|
|
|
|
|
//int32_t numQueryLinks = m_relatedQueryLinksIntersected.length() ;
|
|
//numQueryLinks /= sizeof(QueryLink);
|
|
// put strings into m_tmpStringBuf5
|
|
// parse these strings
|
|
// maybe index so we can assign to QueryLinks::m_queryStringOffset
|
|
// maybe include querylink # so we can assign quickly!
|
|
QueryLink *qk;
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// offset of query link
|
|
int32_t queryLinkOff = *(int32_t *)p;
|
|
p += 4;
|
|
// crazy? maybe we went oom on m_relatedQueryLinksIntersected
|
|
if ( queryLinkOff >= maxLinkOff ) {
|
|
log("seopipe: msg98 reply link off breach %"INT32">=%"INT32"",
|
|
queryLinkOff,maxLinkOff);
|
|
m_msg98ReplyError = ENOMEM;
|
|
break;
|
|
}
|
|
|
|
// get that
|
|
QueryLogEntry *qe = (QueryLogEntry *)p;
|
|
// skip it
|
|
p += qe->getSize();
|
|
|
|
// point to it
|
|
qk = &qks[queryLinkOff];
|
|
|
|
// do not duplicate query strings!
|
|
int32_t qh32 = hash32n ( qe->getQueryString() );
|
|
int32_t slot = m_qstringTable.getSlot ( &qh32 );
|
|
if ( slot >= 0 ) {
|
|
int32_t qeOff;
|
|
qeOff =*(int32_t *)m_qstringTable.getValueFromSlot(slot);
|
|
qk->m_queryStringOffset = qeOff;
|
|
qk->m_queryHostId = -1;
|
|
continue;
|
|
}
|
|
|
|
		// get offset of string in string buf
|
|
int32_t stringOff = m_tmpStringBuf5.length();
|
|
// store good serp score
|
|
if ( ! m_tmpStringBuf5.safeMemcpy(qe,qe->getSize() ) ) {
|
|
m_msg98ReplyError = g_errno;
|
|
break;
|
|
}
|
|
|
|
// add to table
|
|
if ( ! m_qstringTable.addKey(&qh32,&stringOff) ) {
|
|
m_msg98ReplyError = g_errno;
|
|
break;
|
|
}
|
|
|
|
|
|
// show it
|
|
//log("seopipe: DEBUG. mapped remote off %"INT32" (hostid%"INT32") to "
|
|
// "local off %"INT32" (%s)"
|
|
// ,qk->m_queryStringOffset,qk->m_queryHostId,stringOff,qstr);
|
|
// . save string offset
|
|
// . THIS OVERWRITES the g_qbuf offset that was in there!!!
|
|
qk->m_queryStringOffset = stringOff;
|
|
// to indicate that this QueryLink::m_queryStringOffset is now
|
|
// an offset into m_relatedQueryStringBuf and no longer an
|
|
// offset into g_qbuf of the specific hostid, we set hostid
|
|
// to -1
|
|
qk->m_queryHostId = -1;
|
|
}
|
|
// steal it so it doesn't free it
|
|
//slot->m_readBuf = NULL;
|
|
// inc the counter
|
|
m_numMsg98Replies++;
|
|
// return control to transmit function. it will call m_callback1
|
|
// if the function is done. but if a different parent function than
|
|
// transmit called us then we call that. it just depends on the
|
|
	// initial entry function that called getMatchingQueries()
|
|
m_masterLoop ( m_masterState );
|
|
}
|
|
|
|
|
|
|
|
|
|
static void gotMsg3fReplyWrapper ( void *state , void *state2 ) {
|
|
XmlDoc *THIS = (XmlDoc *)state;
|
|
//Multicast *m = (Multicast *)state2;
|
|
Bin *bin = (Bin *)state2;
|
|
THIS->gotMsg3fReply ( bin ); // m
|
|
}
|
|
*/
|
|
|
|
static int mtCmp ( const void *a, const void *b ) {
|
|
MissingTerm *wa = *(MissingTerm **)a;
|
|
MissingTerm *wb = *(MissingTerm **)b;
|
|
if ( wb->m_importance > wa->m_importance ) return 1; // swap
|
|
if ( wb->m_importance < wa->m_importance ) return -1;
|
|
if ( wb->m_votes > wa->m_votes ) return 1; // swap
|
|
if ( wb->m_votes < wa->m_votes ) return -1;
|
|
if ( (int64_t)b < (int64_t)a ) return 1; // swap
|
|
if ( (int64_t)b > (int64_t)a ) return -1;
|
|
return 0;
|
|
}
|
|
|
|
// . called by getMissingTermBuf() and getMatchingTermBuf()
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::addTermsFromQuery ( char *qstr,
|
|
uint8_t queryLangId,
|
|
int32_t gigablastTraffic,
|
|
int32_t googleTraffic2,
|
|
//QueryLogEntry *qe ,
|
|
int32_t hackqoff,
|
|
SafeBuf *tmpBuf ,
|
|
HashTableX *scoreTable ,
|
|
HashTableX *topTermsTable ,
|
|
float imp, // importance
|
|
bool isRelatedQuery ) {
|
|
|
|
// sanity
|
|
if ( hackqoff < 0 ) { char *xx=NULL;*xx=0; }
|
|
	// parse the query so we can pull out its terms; for related
	// queries we skip terms our doc already contains
|
|
Query qq;
|
|
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
|
|
// doQueryExpansion = false
|
|
//char *qstr = qe->getQueryString ( );
|
|
qq.set2 ( qstr , queryLangId , false );
|
|
int32_t lastStart = -1;
|
|
for ( int32_t k = 0 ; k < qq.m_numWords ; k++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
QueryWord *qw = &qq.m_qwords[k];
|
|
int32_t tid32 = qw->m_wordId & 0xffffffff;
|
|
// is it not contained by our doc
|
|
if ( ! tid32 ) continue;
|
|
// skip if we contain it already
|
|
if ( isRelatedQuery && topTermsTable->isInTable ( &tid32 ) )
|
|
continue;
|
|
// skip if common word like "on" "at" etc.
|
|
if ( isCommonQueryWordInEnglish(tid32) ) continue;
|
|
// get start of wikipedia phrase it is in
|
|
int32_t start = qw->m_wikiPhraseStart;
|
|
int32_t nwk = qw->m_numWordsInWikiPhrase;
|
|
// if not in wiki phrase at all, just use single word
|
|
if ( qw->m_wikiPhraseId == 0 ) {
|
|
start = k;
|
|
nwk = 1;
|
|
}
|
|
// do not re-do any words in here
|
|
if ( start == lastStart ) continue;
|
|
lastStart = start;
|
|
// hash each wordid in the term into the th64 hash
|
|
int64_t th64 = 0LL;
|
|
//int32_t alnumWordCount = 0;
|
|
for ( int32_t j = start ; j < start + nwk ; j++ ) {
|
|
			// shortcut
|
|
QueryWord *qw = &qq.m_qwords[j];
|
|
// skip punct
|
|
if ( qw->m_wordId == 0 ) continue;
|
|
// hash otherwise
|
|
th64 ^= qw->m_wordId;
|
|
// count it
|
|
//alnumWordCount++;
|
|
}
|
|
|
|
// get traffic of related query
|
|
int32_t traffic = gigablastTraffic;
|
|
// make gb traffic into google monthly traffic
|
|
traffic *= GB_TRAFFIC_MODIFIER;
|
|
		// use google numbers if we have them, more accurate
|
|
int32_t googleTraffic = googleTraffic2;
|
|
if ( googleTraffic >= 0 ) traffic = googleTraffic;
|
|
|
|
|
|
// now score that term
|
|
int32_t slot = scoreTable->getSlot ( &th64 );
|
|
if ( slot >= 0 ) {
|
|
int32_t off;
|
|
off=*(int32_t *)scoreTable->getValueFromSlot(slot);
|
|
char *base = tmpBuf->getBufStart();
|
|
MissingTerm *pt=(MissingTerm *)(base + off);
|
|
pt->m_importance += imp;
|
|
pt->m_votes++;
|
|
pt->m_traffic += traffic;
|
|
// store first 10 related query strings
|
|
// we got this term from
|
|
for ( int32_t x = 1 ; x < 10 ; x++ ) {
|
|
if ( pt->m_hackQueryOffsets[x] != -1 )
|
|
continue;
|
|
// grab it. querylogentry ptr!!
|
|
pt->m_hackQueryOffsets[x] = hackqoff;
|
|
break;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
|
|
// set a class to store in safebuf
|
|
MissingTerm mt;
|
|
mt.m_importance = imp;
|
|
//mt.m_numAlnumWords = alnumWordCount;
|
|
mt.m_synOf = NULL;
|
|
mt.m_votes = 1;
|
|
mt.m_traffic = traffic;
|
|
mt.m_hackQueryOffsets[0] = hackqoff;
|
|
// if not a missing term, we are a MATCHING term
|
|
mt.m_isMissingTerm = isRelatedQuery;
|
|
// invalidate the remaining 9 query offsets
|
|
for ( int32_t x = 1 ; x < 10 ; x++ )
|
|
mt.m_hackQueryOffsets[x] = -1;
|
|
int32_t offset = tmpBuf->length();
|
|
int32_t toCopy = sizeof(MissingTerm);
|
|
if ( ! tmpBuf->safeMemcpy(&mt,toCopy))
|
|
return false;
|
|
// for calculating length of stored term string
|
|
int32_t startLen = tmpBuf->length();
|
|
// . if first time in scoretable, add stuff
|
|
// . store the string, each word separately
|
|
for ( int32_t j = start ; j < start + nwk ; j++ ) {
|
|
			// shortcut
|
|
QueryWord *qw = &qq.m_qwords[j];
|
|
// point to word as string
|
|
char *str = qw->m_word;
|
|
int32_t len = qw->m_wordLen;
|
|
// make all punct a space
|
|
if ( qw->m_wordId == 0 ) {
|
|
str = " ";
|
|
len = 1;
|
|
}
|
|
// store term string after MissingTerm class
|
|
if ( ! tmpBuf->safeMemcpy(str,len) )
|
|
return false;
|
|
}
|
|
tmpBuf->pushChar('\0');
|
|
// record MissingTerm::m_termSize
|
|
int32_t delta = tmpBuf->length() - startLen;
|
|
char *base = tmpBuf->getBufStart();
|
|
MissingTerm *pmt = (MissingTerm *)(base + offset);
|
|
pmt->m_termSize = delta;
|
|
// now score table entry
|
|
if ( ! scoreTable->addKey ( &th64 , &offset ) )
|
|
return false;
|
|
}
|
|
return true;
|
|
}
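// Illustrative sketch (kept commented out, not part of the original code):
// addTermsFromQuery() above keys each term (a single word or a whole wiki
// phrase) by XOR-ing its word ids, then either bumps the importance, vote
// and traffic counters of the MissingTerm already stored at that key's
// offset, or appends a new MissingTerm plus its string on first sighting.
// TermTally and tallyTerm() are made-up names that model just that
// accumulation; the real code keeps byte offsets into a SafeBuf in a
// HashTableX instead of a std::unordered_map.
/*
#include <stdint.h>
#include <string>
#include <unordered_map>
#include <vector>

struct TermTally {
	std::string term;
	float       importance;
	int32_t     votes;
	int32_t     traffic;
	TermTally() : importance(0.0f), votes(0), traffic(0) {}
};

static void tallyTerm ( std::unordered_map<uint64_t,TermTally> &table ,
			const std::vector<uint64_t> &wordIds ,
			const std::string &termStr ,
			float imp , int32_t traffic ) {
	uint64_t key = 0;
	for ( size_t i = 0 ; i < wordIds.size() ; i++ ) key ^= wordIds[i];
	TermTally &t = table[key];
	if ( t.votes == 0 ) t.term = termStr;   // first sighting keeps the string
	t.importance += imp;
	t.votes      += 1;
	t.traffic    += traffic;
}
*/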
|
|
|
|
// this is used to sort the MissingTerm instances in a safeBuf,
// missingTermBuf. it is now also used to sort the matching terms
// from getMatchingTermBuf().
|
|
bool XmlDoc::sortTermsIntoBuf ( HashTableX *scoreTable ,
|
|
SafeBuf *tmpBuf ,
|
|
SafeBuf *missingTermBuf ) {
|
|
|
|
// make ptrs for sorting
|
|
int32_t numTerms = scoreTable->getNumUsedSlots();
|
|
int32_t need = numTerms * 4;
|
|
SafeBuf ptrBuf;
|
|
if ( ! ptrBuf.reserve ( need ,"srtbuf") ) return false;
|
|
char *p = tmpBuf->getBufStart();
|
|
char *pend = tmpBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
MissingTerm *mt = (MissingTerm *)p;
|
|
p += mt->getSize();
|
|
ptrBuf.pushPtr ( mt );
|
|
}
|
|
gbqsort ( ptrBuf.getBufStart(),
|
|
numTerms,
|
|
sizeof(MissingTerm *),
|
|
mtCmp,
|
|
m_niceness);
|
|
|
|
// now write the missingTerm instances into m_missingTermBuf
|
|
int32_t need2 = tmpBuf->length();
|
|
if ( ! missingTermBuf->reserve ( need2 ,"mtbuf") ) return false;
|
|
// now write back into the real buf
|
|
MissingTerm **pp = (MissingTerm **)ptrBuf.getBufStart();
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
|
|
MissingTerm *mt = pp[i];
|
|
missingTermBuf->safeMemcpy ( mt , mt->getSize() );
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . now this uses the related queries
|
|
// . use logic from getInsertableTerms()!!!
|
|
SafeBuf *XmlDoc::getMissingTermBuf ( ) {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_missingTermBufValid )
|
|
return &m_missingTermBuf;
|
|
|
|
SafeBuf *qkbuf = getRelatedQueryBuf ();
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
|
|
HashTableX *topTermsTable = getTermIdBufDedupTable32();
|
|
if ( ! topTermsTable || topTermsTable == (void *)-1 )
|
|
return (SafeBuf *)topTermsTable;
|
|
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( 100000 ,"t3buf" ) ) return NULL;
|
|
|
|
// maps 64-bit term hash (can be multiple words in a term) to
|
|
// an offset into tmpBuf.
|
|
HashTableX scoreTable;
|
|
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
|
|
return NULL;
|
|
|
|
//
|
|
// taken from seo.cpp's printRelatedQueries() function
|
|
//
|
|
//int32_t *qrOffs = (int32_t *)relBuf->getBufStart();
|
|
//int32_t numRels = relBuf->length() / sizeof(int32_t);
|
|
//char *base = m_queryRelBuf.getBufStart();
|
|
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
|
|
|
|
int32_t nks = qkbuf->length() / sizeof(QueryLink);
|
|
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
|
|
int32_t i;
|
|
for ( i = 0 ; i < nks ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
// stop at 300?
|
|
//if ( i >= 300 ) break;
|
|
QueryLink *qk = &qks[i];
|
|
int32_t qkOff = (char *)qk - qkbuf->getBufStart();
|
|
//int32_t relOff = qrOffs[i];
|
|
//QueryRel *rel = (QueryRel *)(base+relOff);
|
|
// skip if not head of a linked list
|
|
if ( ! qk->m_isFirst ) continue;
|
|
QueryLogEntry *qe ;
|
|
qe = qk->getQueryLogEntry(&m_relatedQueryStringBuf);
|
|
// relative to rqsb! m_relatedQueryStringBuf
|
|
float imp = qk->m_totalQueryImportance;
|
|
// modify by unique round? not yet...
|
|
//imp -= rel->m_uniqueRound * 1000;
|
|
// now use this function
|
|
if ( ! addTermsFromQuery ( qe->getQueryString() ,
|
|
qe->m_langId,
|
|
qe->m_gigablastTraffic,
|
|
qe->m_googleTraffic,
|
|
qkOff, // hackqoff
|
|
&tmpBuf ,
|
|
&scoreTable ,
|
|
topTermsTable ,
|
|
imp ,
|
|
true ) ) // is related query?
|
|
return NULL;
|
|
}
|
|
|
|
// sort MissingTerms from tmpBuf into m_missingTermBuf by
|
|
// MissingTerm::m_importance
|
|
if ( ! sortTermsIntoBuf ( &scoreTable,
|
|
&tmpBuf,
|
|
&m_missingTermBuf ) )
|
|
return NULL;
|
|
|
|
m_missingTermBufValid = true;
|
|
|
|
//m_numMissingTerms = i;
|
|
|
|
// store it
|
|
//if ( ! storeMissingTermBufIntoCachedb() )
|
|
// return (SafeBuf *)-1;
|
|
|
|
return &m_missingTermBuf;
|
|
}
|
|
|
|
|
|
|
|
// . now get the best terms from our matching queries
|
|
// . basically the exact same algo as getMissingTermBuf
|
|
SafeBuf *XmlDoc::getMatchingTermBuf ( ) {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_matchingTermBufValid )
|
|
return &m_matchingTermBuf;
|
|
|
|
SafeBuf *mq = getMatchingQueryBuf();
|
|
if ( mq == NULL || mq == (void *)-1 ) return mq;
|
|
|
|
|
|
HashTableX *topTermsTable = getTermIdBufDedupTable32();
|
|
if ( ! topTermsTable || topTermsTable == (void *)-1 )
|
|
return (SafeBuf *)topTermsTable;
|
|
|
|
// tmpBuf will hold the MissingTerms we add.
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( 100000 ,"t4buf") ) return NULL;
|
|
|
|
// maps 64-bit term hash (can be multiple words in a term) to
|
|
// an offset into tmpBuf. tmpBuf holds the missing terms, so we
|
|
// use scoreTable to accumulate MissingTerm::m_importance for
|
|
// the same term in different queries.
|
|
HashTableX scoreTable;
|
|
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
|
|
return NULL;
|
|
|
|
// scan the queries this doc matches and add MissingTerms for them
|
|
// into tmpBuf
|
|
int32_t nks = mq->length() / sizeof(QueryLink);
|
|
QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
|
|
|
|
int32_t i; for ( i = 0 ; i < nks ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
QueryLink *qk = &qks[i];
|
|
// stop at 300?
|
|
if ( i >= 300 ) break;
|
|
// "matching terms" have different hackqoff than missing terms
|
|
int32_t qkOff = (char *)qk - mq->getBufStart();
|
|
// relative to rqsb! m_relatedQueryStringBuf
|
|
float imp = qk->m_queryImportance;
|
|
// querylogentry does not have string info here! it is
|
|
// just the basic class
|
|
QueryLogEntry *qe ;
|
|
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
|
|
// . now use this function
|
|
if ( ! addTermsFromQuery ( qe->getQueryString(),
|
|
qe->m_langId,
|
|
qe->m_gigablastTraffic,
|
|
qe->m_googleTraffic,
|
|
qkOff, // hackqoff
|
|
&tmpBuf ,
|
|
&scoreTable ,
|
|
topTermsTable ,
|
|
imp ,
|
|
false ) ) // is related query?
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// sort MatchingTerms from tmpBuf into m_matchingTermBuf by
|
|
// MatchingTerm::m_importance
|
|
if ( ! sortTermsIntoBuf ( &scoreTable,
|
|
&tmpBuf,
|
|
&m_matchingTermBuf ) )
|
|
return NULL;
|
|
|
|
m_matchingTermBufValid = true;
|
|
|
|
//m_numMatchingTerms = i;
|
|
|
|
// store it
|
|
//if ( ! storeMatchingTermBufIntoCachedb() )
|
|
// return (SafeBuf *)-1;
|
|
|
|
return &m_matchingTermBuf;
|
|
}
|
|
/*
|
|
// . max # of outstanding msg3f requests we can send to one host
|
|
// . now just make it 1 since it is msg3f NOT msg39
|
|
#define MAXOUT 1
|
|
|
|
//#define BINSIZE 100000
|
|
|
|
class Bin {
|
|
public:
|
|
// the current position for adding queries into m_buf
|
|
int32_t m_cursor;
|
|
int32_t m_maxCursor;
|
|
int32_t m_allocSize;
|
|
// some hack storage
|
|
Host *m_hackHost;
|
|
bool m_hackIsMsg99ReplyPtr;
|
|
// for sending the m_buf to its host
|
|
Multicast m_mcast;
|
|
// allocates size of BINSIZE bytes
|
|
char m_buf[0];
|
|
};
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . returns true on successful launch of request, it will block always
|
|
bool XmlDoc::sendBin ( int32_t i ) {
|
|
|
|
Bin *bin = m_currentBinPtrs[i];
|
|
|
|
// get host
|
|
Host *h = g_hostdb.getHost(i);
|
|
|
|
// copy it
|
|
//int32_t reqSize = p - tmpBuf;
|
|
//char *req = mdup ( tmpBuf , reqSize , "3freq" );
|
|
//if ( ! req ) return true;
|
|
|
|
// increment outstanding requests he has
|
|
h->m_numOutstandingRequests++;
|
|
|
|
// this could be a ptr to a msg99reply or a querylink
|
|
Multicast *mcast = &bin->m_mcast;
|
|
|
|
//bin->m_hackxd = this;
|
|
//bin->m_hackPtrCursor = firstPtrCursor;
|
|
bin->m_hackHost = h;
|
|
|
|
// get his group id
|
|
uint32_t groupId = h->m_groupId;
|
|
|
|
char *req = bin->m_buf;
|
|
int32_t reqSize = bin->m_cursor;
|
|
|
|
// disown it so mcast can free it when its udpslot is destroyed
|
|
m_currentBinPtrs[i] = NULL;
|
|
|
|
// note that
|
|
setStatus("launching msg3f");
|
|
// log it too
|
|
//log("seopipe: launching msg3f request of %"INT32" gbdocid queries to "
|
|
// "score to host %"INT32"", queryCount,h->m_hostId);
|
|
// get the docIds for this query using msg3f.cpp's handleRequest3f()
|
|
bool status = mcast->send ( req ,
|
|
reqSize,
|
|
0x3f ,
|
|
false, // mcast frees request? no!!!
|
|
groupId, // group to send to
|
|
false, // send to whole group?
|
|
0 , // query hash for host in group select
|
|
this , // state1
|
|
bin,//mcast, // state2
|
|
gotMsg3fReplyWrapper,
|
|
86401, // timeout in seconds. LONG TIME!
|
|
m_niceness,
|
|
false, // realtime?
|
|
h->m_hostId // firsthostid to try
|
|
);
|
|
// mark it out
|
|
m_numMsg3fRequests++;
|
|
// if this is true then it was a success and we BLOCKED
|
|
if ( status ) {
|
|
// must BE IN USE!
|
|
if ( ! mcast->m_inUse ) { char *xx=NULL;*xx=0; }
|
|
// success
|
|
return true;
|
|
}
|
|
// it came back?
|
|
m_numMsg3fReplies++;
|
|
// undo this
|
|
h->m_numOutstandingRequests--;
|
|
// errno should be set
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// set error
|
|
m_binError = g_errno;
|
|
// note it
|
|
log("seopipe: mcast had error: %s", mstrerror(g_errno));
|
|
// free that bin i guess
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
// return false on error
|
|
return false;
|
|
}
|
|
|
|
|
|
// . this is called from two places:
|
|
// 1. getMatchingQueriesScored() (Msg99Reply ptrs)
|
|
// 2. getRelatedQueryBuf() (QueryLink ptrs)
|
|
// . this can take Msg99Reply ptrs or it can take QueryLink ptrs
|
|
// . it will glean the docid from either of these two ptr types as well
|
|
// as glean the pointer to the query string.
|
|
// . THEN it can create a 'gbdocid:xxxx | <queryString>' query which
|
|
// it will send to a host in the network.
|
|
// . it will try to keep each host in the network answering 5 such queries
|
|
// at any one time. bins are no longer used.
|
|
// . we need to implement heavy termlist caching remotely and locally to
|
|
// ensure optimal speed
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true with g_errno set on error
|
|
bool XmlDoc::scoreDocIdRestrictedQueries ( Msg99Reply **replyPtrs ,
|
|
QueryLink *linkPtrs ,
|
|
int32_t numPtrs ) {
|
|
|
|
//log("debug: entered scoredocidrestrictedqueries");
|
|
|
|
if ( numPtrs == 0 ) return true;
|
|
|
|
// . sanity check
|
|
// . you can only score your Msg99Replies or your QueryLinks
|
|
// . score your Msg99Replies for queries that match the main url
|
|
// . score your QueryLinks for queries that match a related docid
|
|
if ( ! replyPtrs && ! linkPtrs ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( replyPtrs && m_setForReplyPtrs ) return true;
|
|
if ( linkPtrs && m_setForLinkPtrs ) return true;
|
|
|
|
// we now send the termlistbuf to each host receiving a msg3f
|
|
// request so when it performs the msg39 on a query we provide it
|
|
// will set QueryTerm::m_posdbListPtr to point to the termlists we
|
|
// provided only, just for this docid
|
|
SafeBuf *termListBuf = NULL;
|
|
if ( ! linkPtrs ) {
|
|
termListBuf = getTermListBuf();
|
|
if ( ! termListBuf ) return true;
|
|
if ( termListBuf==(void *)-1 ) return false;
|
|
}
|
|
|
|
// force to ten for debug
|
|
//numPtrs = 20;
|
|
|
|
sendLoop:
|
|
|
|
//
|
|
// cleanup if got all replies we can
|
|
//
|
|
if ( m_numMsg3fReplies == m_numMsg3fRequests &&
|
|
((m_qcursor >= numPtrs) || m_binError) ) {
|
|
|
|
//log("debug: cleanup");
|
|
|
|
// there might be remnant bins if we stopped trying to
|
|
// call sendBin because we hit m_binError
|
|
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
|
// see if that bin is still around
|
|
Bin *bin = m_currentBinPtrs[i];
|
|
if ( ! bin ) continue;
|
|
// this will core if the multicast is in use
|
|
bin->m_mcast.destructor();
|
|
// now nuke it then
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
// now make it null
|
|
m_currentBinPtrs[i] = NULL;
|
|
}
|
|
// nuke this too!
|
|
if ( m_newxd2 ) {
|
|
mdelete ( m_newxd2 , sizeof(XmlDoc) , "newxd2" );
|
|
delete ( m_newxd2 );
|
|
m_newxd2 = NULL;
|
|
}
|
|
// free table's mem if used
|
|
m_tmpDupTable.reset();
|
|
// do not repeat this logic!
|
|
if ( replyPtrs ) {
|
|
m_setForReplyPtrs = true;
|
|
m_binErrorForReplyPtrs = m_binError;
|
|
}
|
|
if ( linkPtrs ) {
|
|
m_setForLinkPtrs = true;
|
|
m_binErrorForLinkPtrs = m_binError;
|
|
}
|
|
// inherit error? pass it on to caller
|
|
//if ( m_binError ) g_errno = m_binError;
|
|
// reset for another call to this function since we call
|
|
// if from two different places above
|
|
m_numMsg3fRequests = 0;
|
|
m_numMsg3fReplies = 0;
|
|
m_qcursor = 0;
|
|
m_binError = 0;
|
|
// all done!
|
|
g_errno = 0;
|
|
return true;
|
|
}
|
|
|
|
// shortcut
|
|
char *base = m_tmpStringBuf5.getBufStart();
|
|
|
|
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// store the queries in our buffer into the various bins and send
|
|
// a bin off when it gets full
|
|
queryLoop:
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// nothing left to do except wait for replies?
|
|
if ( m_qcursor >= numPtrs ) return false;
|
|
|
|
// assume ptr is good
|
|
bool good = true;
|
|
// set these
|
|
int64_t docId;
|
|
// the query as a string
|
|
char *qstr = NULL;
|
|
// for passing to mcast::m_hackQPtrs
|
|
void *vptr;
|
|
// get the ith QueryLink?
|
|
if ( linkPtrs ) {
|
|
QueryLink *qk = &linkPtrs[m_qcursor];
|
|
// skip if was not successfully processed above
|
|
// because its hostid was dead perhaps?
|
|
if ( qk->m_queryHostId != -1 ) good = false;
|
|
// get from related docid in this case
|
|
SafeBuf *rdbuf = &m_relatedDocIdBuf;
|
|
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
|
|
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
|
|
docId = rd->m_docId;
|
|
// get it
|
|
QueryLogEntry *qe ;
|
|
qe = (QueryLogEntry *)(qk->m_queryStringOffset + base);
|
|
// and this. skip over goodserpscore, gigablastTraffic and
|
|
// googleTraffic
|
|
qstr = qe->getQueryString();
|
|
// save it
|
|
vptr = qk;
|
|
}
|
|
// make a new one for the first time
|
|
if ( linkPtrs && ! m_newxd2 ) {
|
|
try { m_newxd2 = new ( XmlDoc ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
mnew ( m_newxd2, sizeof(XmlDoc),"newxd2");
|
|
}
|
|
// set the xmldoc to this new docid, if it is new...
|
|
if ( linkPtrs && m_newxd2->m_docId != docId ) {
|
|
// a good stopping point?
|
|
if ( clientClosedConnection() ) {
|
|
m_binError = ESOCKETCLOSED;
|
|
goto sendLoop;
|
|
}
|
|
// set it from related doc's docid
|
|
if ( ! m_newxd2->set3 ( docId ,cr->m_coll, m_niceness ) ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// alloc space for tablen
|
|
if ( m_tmpDupTable.getNumSlots() <= 0 &&
|
|
! m_tmpDupTable.set ( 8,0,1024,NULL,0,false,m_niceness,
|
|
"tdtbl") ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// must not be in there already!
|
|
if ( m_tmpDupTable.isInTable ( &docId ) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// add it
|
|
if ( ! m_tmpDupTable.addKey ( &docId ) ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// ensure content is recycled from title rec
|
|
m_newxd2->m_recycleContent = true;
|
|
// newxd2 needs to use our master functions. so
|
|
// anytime one of its internal functions blocks, then
|
|
// our m_masterLoop will be called
|
|
// and we'll end up right here again!
|
|
m_newxd2->m_masterLoop = m_masterLoop;
|
|
m_newxd2->m_masterState = m_masterState;
|
|
// only get posdb keys really for this stuff
|
|
m_newxd2->m_useTitledb = false;
|
|
m_newxd2->m_useTagdb = false;
|
|
m_newxd2->m_useClusterdb = false;
|
|
m_newxd2->m_useSpiderdb = false;
|
|
m_newxd2->m_useLinkdb = false;
|
|
// debug
|
|
log("seopipe: setting newxd2 docid=%"INT64"",docId);
|
|
}
|
|
// pump this
|
|
if ( linkPtrs && ! m_newxd2->m_loaded ) {
|
|
|
|
// . CRAP, blocking here sucks because when this function
|
|
// is re-entered it can also be from a Msg3f reply
|
|
// not because this document is back from msg22a...
|
|
|
|
//log("debug: loading newxd2");
|
|
|
|
// try to set from title rec first. return false if blocks.
|
|
if ( ! m_newxd2->loadFromOldTitleRec() ) {
|
|
m_newxd2Blocked = true;
|
|
//log("debug: newxd2 blocked");
|
|
return false;
|
|
}
|
|
}
|
|
// i guess no longer out
|
|
if ( linkPtrs && m_newxd2->m_loaded )
|
|
m_newxd2Blocked = false;
|
|
|
|
//if ( linkPtrs )
|
|
// log("debug: newxd2 loaded=%"INT32"",(int32_t)m_newxd2->m_loaded);
|
|
|
|
// sanity check
|
|
if ( linkPtrs && ! m_newxd2->m_oldTitleRecValid ) {
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// . did that fail? i.e. docid not found!?!?!
|
|
// . do not increment m_qcursor if m_binError is set
|
|
if ( linkPtrs && ! m_newxd2->m_oldTitleRec && ! m_binError ) {
|
|
// just skip this asshole then
|
|
if ( m_lastPrintedDocId != docId ) {
|
|
log("seopipe: related docid %"INT64" titlerec "
|
|
"load failed99",
|
|
docId);
|
|
}
|
|
m_lastPrintedDocId = docId;
|
|
// clear that
|
|
g_errno = 0;
|
|
// skip it
|
|
m_qcursor++;
|
|
// try the next one
|
|
goto queryLoop;
|
|
}
|
|
if ( linkPtrs ) {
|
|
|
|
// . CRAP, blocking here sucks because when this function
|
|
// is re-entered it can also be from a Msg3f reply
|
|
// not because it has the termlistbuf ready
|
|
|
|
// . use termlist buf of related docid
|
|
// . we need to ENSURE that the QueryLinks are clustered
|
|
// by related docid so this logic is efficient here
|
|
termListBuf = m_newxd2->getTermListBuf();
|
|
// return false if it blocked
|
|
if ( termListBuf == (void *)-1 ) {
|
|
//log("debug: newxd2 blocked in termlistbuf");
|
|
m_newxd2Blocked = true;
|
|
return false;
|
|
}
|
|
// this sucks. error!
|
|
if ( ! termListBuf ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
}
|
|
// i guess no longer out
|
|
if ( linkPtrs ) {
|
|
//log("debug: newxd2 UNblocked in termlistbuf");
|
|
m_newxd2Blocked = false;
|
|
}
|
|
|
|
// wait for replies to come in so we can stop even if m_qcursor
|
|
// did not complete its scan!
|
|
// shit, but what if we are a msg22 coming in for m_newxd2? that
|
|
// is why i moved this check down here so we can set m_newxd2Blocked
|
|
// to false and allow the msg3f replies to come back in and free
|
|
// all the bins. this is kinda fucked up because everything is
|
|
// asynchronous.
|
|
if ( m_binError ) return false;
|
|
|
|
// otherwise the Msg99Reply
|
|
if ( ! linkPtrs ) {
|
|
Msg99Reply *qp = replyPtrs[m_qcursor];
|
|
// tis us!
|
|
docId = m_docId;
|
|
// sanity
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
// and query string
|
|
qstr = qp->m_queryStr;
|
|
// save it
|
|
vptr = qp;
|
|
}
|
|
|
|
int32_t qlen = gbstrlen(qstr);
|
|
|
|
// mark as bad if this query is too big already
|
|
if ( m_firstUrl.m_ulen + qlen + 10 > MAX_QUERY_LEN )
|
|
good = false;
|
|
|
|
// if ptr was bad, do not evaluate at all
|
|
if ( ! good ) {
|
|
m_qcursor++;
|
|
goto queryLoop;
|
|
}
|
|
|
|
// sanity
|
|
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . get hash of query to determine bin
|
|
// . this keeps our term freqs consistent since every query goes
|
|
// back TO THE SAME HOST!!! thus our scores remain consistent.
|
|
// each host has a slightly different TermFreq/Weight for the
|
|
// exact same query because the termfreq is based on the termlist
|
|
// length for that termid. and each host has a different set of
|
|
// docids in its index for the most part.
|
|
uint32_t h32 = hash32n ( qstr );
|
|
int32_t numHosts = g_hostdb.getNumHosts();
|
|
// do not send to host #0 if we got a lot of hosts
|
|
if ( g_hostdb.getNumHosts() >= 8 ) numHosts--;
|
|
int32_t hostNum = h32 % numHosts;
|
|
// skip host #0 which is us i guess!
|
|
if ( g_hostdb.getNumHosts() >= 8 ) hostNum++;
|
|
// sanity for that
|
|
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
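
// worked example (assuming a 64-host cluster): numHosts becomes 63,
// hostNum = h32 % 63 lands in [0,62], and the ++ shifts it to [1,63],
// so host #0 (the one running this seo pipeline, per the sanity check
// above) never serves msg3f requests once we have 8 or more hosts.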
|
|
|
|
// get the current bin for that host
|
|
Bin *bin = m_currentBinPtrs [ hostNum ];
|
|
|
|
// alloc on demand
|
|
if ( ! bin ) {
|
|
// how big is the termlistbuf?
|
|
int32_t tsize = termListBuf->length();
|
|
int32_t collLen = gbstrlen(cr->m_coll);
|
|
// how much space do we need for a good bin?
|
|
int32_t alloc = sizeof(Bin) + 8 +1+ collLen + 1 + tsize + 100000;
|
|
// make that
|
|
char *mem = (char *)mmalloc ( alloc ,"binreq" );
|
|
if ( ! mem ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// cast it
|
|
bin = (Bin *)mem;
|
|
// store it
|
|
m_currentBinPtrs [ hostNum ] = bin;
|
|
// this includes a Multicast in the Bin
|
|
bin->m_mcast.constructor();
|
|
// for freeing
|
|
bin->m_allocSize = alloc;
|
|
// the end of it
|
|
char *memEnd = mem + alloc;
|
|
// reset offset into Bin::m_buf
|
|
bin->m_cursor = 0;
|
|
// is it to a msg99reply? so the reply handler knows how to
|
|
// handle mcast::m_hackQPtr and what action to take. it is
|
|
// slightly different.
|
|
if ( linkPtrs ) bin->m_hackIsMsg99ReplyPtr = 0;
|
|
else bin->m_hackIsMsg99ReplyPtr = 1;
|
|
// . before we add any queries, store langid of QUERY
|
|
// . crap just use doc langid for now
|
|
char *bp = bin->m_buf;
|
|
// first is docid. if doing QueryLinks this is the docid
|
|
// of the related docid, otherwise, it is that of our main doc
|
|
*(int64_t *)bp = docId; bp += 8;
|
|
// then langid
|
|
*bp = m_langId; bp++;
|
|
// then the coll
|
|
gbmemcpy ( bp , cr->m_coll , collLen );
|
|
bp += collLen;
|
|
*bp++ = '\0';
|
|
// sanity!
|
|
if ( bp >= memEnd ) { char *xx=NULL;*xx=0; }
|
|
// the size of the termlist buf
|
|
*(int32_t *)bp = tsize; bp += 4;
|
|
// then the termlistbuf that has all the termlists for our docid
|
|
gbmemcpy ( bp , termListBuf->getBufStart(), tsize ); bp += tsize;
|
|
// update bin's cursor
|
|
bin->m_cursor = bp - bin->m_buf;
|
|
// for breach detection. send off Bin when breach happens.
|
|
bin->m_maxCursor = alloc - sizeof(Bin);
|
|
}
|
|
|
|
// can we store the current query into this bin?
|
|
bool storeInBin = true;
|
|
|
|
// is there enough room for this query in the bin?
|
|
int32_t need = qlen + 40;
|
|
if ( bin->m_cursor + need >= bin->m_maxCursor )
|
|
storeInBin = false;
|
|
|
|
// does docid of bin match?
|
|
int64_t binDocId = *(int64_t *)(bin->m_buf);
|
|
if ( docId != binDocId )
|
|
storeInBin = false;
|
|
|
|
// if we can't store this query into the bin, send it off now
|
|
if ( ! storeInBin ) {
|
|
// use its multicast to send this bin off if too full
|
|
if ( ! sendBin ( hostNum ) ) {
|
|
m_binError = g_errno;
|
|
goto sendLoop;
|
|
}
|
|
// . now the current bin should have been emptied
|
|
// . go back to top to realloc Bin::m_buf to hold this query
|
|
goto queryLoop;
|
|
}
|
|
|
|
char *p = bin->m_buf + bin->m_cursor;
|
|
|
|
// first store the offset from the buf so we can return it
|
|
// in the reply which is a list of scores basically and we know
|
|
// what score goes with what m_qcursor
|
|
*(int32_t *)p = m_qcursor;
|
|
p += 4;
|
|
|
|
// now store queries in the request buf for the msg3f
|
|
p += sprintf(p,"gbdocid:%"UINT64" | %s",docId,qstr);
|
|
*p++ = '\0';
|
|
|
|
// update cursor
|
|
bin->m_cursor = p - bin->m_buf;
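
// at this point one Bin::m_buf holds (a sketch of the layout written
// above, fields stored via raw casts in host byte order):
//
//   int64_t docId          // docid these queries are restricted to
//   char    langId
//   char    coll[]         // NUL-terminated collection name
//   int32_t tsize          // size of the posdb termlist buf that follows
//   char    termListBuf[tsize]
//   then repeated, one per query added to the bin:
//   int32_t qcursor        // index so the reply can be matched back up
//   char    query[]        // NUL-terminated "gbdocid:<docId> | <qstr>"
//
// handleRequest3f() is assumed to parse the request back out in this
// same order.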
|
|
|
|
// skip to next query/docid to evaluate
|
|
m_qcursor++;
|
|
|
|
// if we have more queries left, add them to bins now
|
|
if ( m_qcursor < numPtrs ) goto queryLoop;
|
|
|
|
// now send every bin, we have no queries left.
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! m_currentBinPtrs[i] ) continue;
|
|
// this will transfer the request buffer over to mcast
|
|
// so it will be freed when mcast returns
|
|
sendBin ( i );
|
|
}
|
|
|
|
goto sendLoop;
|
|
}
|
|
|
|
// we got back the score for each query link in
|
|
// the bin that we sent out for the docid specified in the bin header request
|
|
void XmlDoc::gotMsg3fReply ( Bin *bin ) { // Multicast *mcast ) {
|
|
|
|
setStatus ( "gotmsg3freply" );
|
|
|
|
// do some housekeeping
|
|
Host *h = bin->m_hackHost;
|
|
h->m_numOutstandingRequests--;
|
|
|
|
m_numMsg3fReplies++;
|
|
|
|
// sanity
|
|
Multicast *mcast = &bin->m_mcast;
|
|
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get the reply
|
|
bool freeIt = false;
|
|
int32_t replySize = 0;
|
|
int32_t replyMaxSize;
|
|
char *rbuf = mcast->getBestReply ( &replySize ,
|
|
&replyMaxSize ,
|
|
&freeIt ,
|
|
true ); // steal it?
|
|
|
|
// log it too
|
|
//log("seopipe: got msg3f reply from host %"INT32" size=%"INT32" bytes",
|
|
// h->m_hostId,replySize);
|
|
|
|
// cast it
|
|
//Msg3fReply *mr = (Msg3fReply *)rbuf;
|
|
// in case of mem-leak this helps
|
|
//if ( rbuf ) relabel(rbuf,replyMaxSize,"xx-rb");
|
|
// . we must be able to free it... we must own it
|
|
// . this is true if we should free it, but we should not have
|
|
// to free it since it is owned by the slot?
|
|
if ( freeIt ) {
|
|
log(LOG_LOGIC,"query: msg3f: Steal failed.");
|
|
char *xx = NULL; *xx=0;
|
|
}
|
|
|
|
// if it failed for some reason i guess just bail
|
|
if ( ! rbuf ) {
|
|
// clean up the bin and the multicast and the request buffer
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
g_errno = EBADREPLYSIZE;
|
|
log(LOG_LOGIC,"seopipe: bad msg3f empty reply");
|
|
return;
|
|
}
|
|
|
|
// reply is a sequence of (qcursor, docId, score) tuples
|
|
char *rp = rbuf;
|
|
char *rpEnd = rbuf + replySize;
|
|
|
|
//int32_t firstCursor = bin->m_hackPtrCursor;
|
|
|
|
// scan the msg99 replies and insert the scores we got for each
|
|
// query from the msg3f reply in "rbuf"
|
|
for ( ; rp < rpEnd ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . first is index, what query # in the request are we
|
|
// processing now, might not be in order because we launch
|
|
// a bunch of msg39s in parallel in handleRequest3f()'s call
|
|
// to processQueries()
|
|
// . but the corresponding msg99reply is reply # "qcursor"
|
|
int32_t qcursor = *(int32_t *)rp;
|
|
rp += 4;
|
|
int64_t docId = *(int64_t *)rp;
|
|
rp += 8;
|
|
float score = *(float *)rp;
|
|
rp += 4;
|
|
// . if this is true that means qcursor is referencing a
|
|
// msg99reply and we should set the score of that msg99
|
|
// reply to what the handlerequest3f provided
|
|
// . so store the docid and score for our url for this query
|
|
if ( bin->m_hackIsMsg99ReplyPtr ) {
|
|
SafeBuf *mqbuf = getMatchingQueries(false,-1);
|
|
Msg99Reply **qptrs=(Msg99Reply **)mqbuf->getBufStart();
|
|
Msg99Reply *qr = qptrs[qcursor];
|
|
qr->m_myScore = score;
|
|
qr->m_myDocId = docId;
|
|
int32_t numQueryPtrs=mqbuf->length()/sizeof(Msg99Reply *);
|
|
// if too many skip some
|
|
if ( numQueryPtrs > 1000 && (qcursor%1000)!=0)continue;
|
|
// if too many skip some
|
|
if ( numQueryPtrs > 400 && (qcursor%100) !=0)continue;
|
|
char *qstr = qr->m_queryStr;
|
|
log("seopipe: got query #%"INT32"of%"INT32" score=%f qstr=%s"
|
|
,qcursor+1
|
|
,numQueryPtrs
|
|
,score
|
|
,qstr
|
|
);
|
|
continue;
|
|
}
|
|
// might be storing in a QueryLink (doing related docids)
|
|
//SafeBuf *ibuf = getRelatedQueryLinksWithStrings();
|
|
QueryLink *qks =(QueryLink *)m_tmpBuf5.getBufStart();
|
|
//int32_t numQueryLinks = ibuf->length() / sizeof(QueryLink);
|
|
QueryLink *qk = &qks[qcursor];
|
|
// sanity. make sure qk->m_queryStringOffset is related to our
|
|
// local m_tmpStringBuf5 and not relative to the
|
|
// g_qbuf of the hostid that sent back the msg99 reply.
|
|
if ( qk->m_queryHostId != -1 ) { char *xx=NULL;*xx=0; }
|
|
// how many related query links do we got? for logging.
|
|
int32_t nks = m_tmpBuf5.length()/sizeof(QueryLink);
|
|
// shortcuts
|
|
char *base = m_tmpStringBuf5.getBufStart();
|
|
// skip over gigablastTraffic and googleTraffic
|
|
QueryLogEntry *qe;
|
|
qe = (QueryLogEntry *)(base + qk->m_queryStringOffset);
|
|
SafeBuf *rdbuf = &m_relatedDocIdBuf;
|
|
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
|
|
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
|
|
// note it
|
|
if ( (qcursor % 1000) == 0 ) // || qcursor < 100 )
|
|
log("seopipe: got msg3f reply for related query "
|
|
"#%"INT32"of%"INT32" "
|
|
"query \"gbdocid:%"INT64" | %s\" gigablasttraffic=%"INT32" "
|
|
"googletraffic=%"INT32" serpscore=%f goodscore=%f"
|
|
,qcursor+1
|
|
,nks
|
|
,rd->m_docId
|
|
,qe->getQueryStr()
|
|
,qe->m_gigablastTraffic
|
|
,qe->m_googleTraffic
|
|
,score
|
|
,qe->m_topSERPScore // of a docid slice on 1 host
|
|
);
|
|
//
|
|
// no longer used queryrel!
|
|
//
|
|
// if we are scoring QueryLinks then we add a QueryRel
|
|
//QueryRel qr;
|
|
// clear that mem to zero
|
|
//memset ( &qr , 0 , sizeof(QueryRel));
|
|
// then add the info we know
|
|
//qr.m_relatedDocId = qk->m_relatedDocId;
|
|
//char *base2 = m_relatedDocIdBuf.getBufStart();
|
|
//int32_t rdOff = (char *)qk->m_relatedDocId - base2;
|
|
//qr.m_relatedDocIdOff = rdOff;
|
|
//qr.m_offsetIntoRelQStrBuf = qk->m_queryStringOffset;
|
|
//qr.m_myScore = score;
|
|
//qr.m_nextOff = -1;
|
|
//qr.m_tailOff = -1;
|
|
qk->m_serpScore = score;
|
|
// save that. WHAT IF THIS ERRORS?!?!?!
|
|
//if ( ! m_queryRelBuf.safeMemcpy(&qr,sizeof(QueryRel)) ) {
|
|
// m_binError = g_errno;
|
|
// log("xmldoc: panic. failed to store query rel");
|
|
// break;
|
|
//}
|
|
// debug test
|
|
//m_binError = EBADENGINEER;
|
|
//log("xmldoc: panic2. failed to store query rel");
|
|
//break;
|
|
}
|
|
|
|
// ok, we got the docid and score, now free it
|
|
mfree ( rbuf , replyMaxSize , "fmsg3f" );
|
|
|
|
// clean up the bin and the multicast and the request buffer
|
|
mfree ( bin , bin->m_allocSize, "delbin" );
|
|
|
|
//if ( m_newxd2Blocked )
|
|
// log("debug: got reply, but returning because newxd2 "
|
|
// "had blocked");
|
|
|
|
// prevent double entry bug from entering scoreDocIdRestrictedQueries()
|
|
// from a newxd2 function blocking and coming in through msg22
|
|
// callback or whatever, vs. coming in from here
|
|
if ( m_newxd2Blocked ) return;
|
|
|
|
//log("debug: got reply and calling masterloop");
|
|
|
|
// go back to the transmit function
|
|
m_masterLoop ( m_masterState );
|
|
|
|
// if not done, just return... otherwise we double enter
|
|
// scoreDocIdRestrictedQueries() along with its call to
|
|
// getTermListBuf()... and all hell breaks loose
|
|
return;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// send contents of m_socketWriteBuf to m_seoSocket
|
|
void XmlDoc::pumpSocketWriteBuf ( ) {
|
|
|
|
if ( ! m_seoSocket ) return;
|
|
|
|
setStatus ( "pumpsocketwritebuf" );
|
|
|
|
SafeBuf *sb = &m_socketWriteBuf;
|
|
|
|
// insert http header into m_socketWriteBuf if not there
|
|
char *wbuf = sb->getBufStart();
|
|
bool insertIt = false;
|
|
if ( ! wbuf ) insertIt = true;
|
|
if ( wbuf && strncmp(wbuf,"HTTP/1.0 ",9 ) ) insertIt = true;
|
|
// add http header first
|
|
if ( insertIt ) {
|
|
// reset # bytes sent
|
|
m_socketWriteBufSent = 0;
|
|
m_registeredSocketCallback = false;
|
|
// xml-itize each query reply without scoring info
|
|
sb->insert("HTTP/1.0 200 OK\r\n"
|
|
"Content-Type: text/xml ; "
|
|
"charset=utf-8\r\n"
|
|
"\r\n"
|
|
"<response>\n",0);
|
|
}
|
|
|
|
// come back here to do another send
|
|
sendLoop:
|
|
|
|
// try sending out our xml buffer on the socket
|
|
// the very first things we do is send the queries over without
|
|
// the ranking info which we compute by calling msg39 on each query,
|
|
// so at least we can display something quite quickly.
|
|
if ( m_socketWriteBufSent < sb->length() ) {
|
|
int32_t sd = m_seoSocket->m_sd;
|
|
// just in case
|
|
if ( m_registeredSocketCallback ) {
|
|
g_loop.unregisterWriteCallback(sd,this,
|
|
getSEOQueryInfoWrapper2);
|
|
m_registeredSocketCallback = false;
|
|
}
|
|
// send that off
|
|
int32_t sendLen = sb->length();
|
|
char *sendStr = sb->getBufStart();
|
|
char *sendEnd = sendStr + sendLen;
|
|
// if we sent SOME last time, skip over that
|
|
sendStr += m_socketWriteBufSent;
|
|
// how much left?
|
|
int32_t remaining = sendEnd - sendStr;
|
|
// wtf?
|
|
if ( remaining <= 0 ) { char *xx=NULL;*xx=0; }
|
|
// try a send on non-blocking socket
|
|
int32_t n = ::send ( sd , sendStr , remaining , 0 );
|
|
// did we send something?
|
|
if ( n > 0 ) {
|
|
m_socketWriteBufSent += n;
|
|
goto sendLoop;
|
|
}
|
|
// maybe it sent 0 because it was waiting for something
|
|
// so set our callback for when the socket is ready for
|
|
// writing again. try sending more later.
|
|
g_loop.registerWriteCallback ( sd ,
|
|
this ,
|
|
getSEOQueryInfoWrapper2,
|
|
0 ); // niceness = 0
|
|
// flag it so we don't leak these
|
|
m_registeredSocketCallback = true;
|
|
}
|
|
}
|
|
*/
|
|
|
|
bool XmlDoc::getIsInjecting ( ) {
	bool isInjecting = false;
	//if ( g_inPageInject ) isInjecting = true;
	if ( m_sreqValid && m_sreq.m_isInjecting ) isInjecting = true;
	if ( m_isInjecting && m_isInjectingValid ) isInjecting = true;
	return isInjecting;
}
|
|
|
|
|
|
int posdbKeyCmp ( const void *a, const void *b ) {
	char *ka = (char *)a;
	char *kb = (char *)b;
	//int64_t tid64a = g_posdb.getTermId(ka);
	//int64_t tid64b = g_posdb.getTermId(kb);
	// a bit of a hack so handleRequest8e already has these
	// guys sorted by their lower 32-bits of termids so it can
	// match this doc to queries without having to sort first.
	//uint32_t tid32a = (uint32_t)tid64a;
	//uint32_t tid32b = (uint32_t)tid64b;
	//if ( tid32a < tid32b ) return -1;
	//if ( tid32a > tid32b ) return 1; // swap
	//if ( tid64a < tid64b ) return -1;
	//if ( tid64a > tid64b ) return 1; // swap
	char val = KEYCMP(ka,kb,sizeof(POSDBKEY));
	if ( val > 0 ) return 1;
	if ( val < 0 ) return -1;
	return 0;
}
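
// note: the comparator above sorts raw 18-byte posdb keys; since the termid
// occupies the most significant bits of a posdb key (see Posdb.h for the
// exact layout), a straight key sort also groups the keys by termid, which
// is what getTermIdSortedPosdbListBuf() below relies on.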
|
|
|
|
|
|
// . used by XmlDoc::getTermListBuf() below
|
|
// . sorted by posdb key straight up
|
|
SafeBuf *XmlDoc::getTermIdSortedPosdbListBuf ( ) {
|
|
|
|
if ( m_sortedPosdbListBufValid )
|
|
return &m_sortedPosdbListBuf;
|
|
|
|
// get the lists. forDelete = false.
|
|
char *metaList = getMetaList ( false );
|
|
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
|
|
|
|
// sanity
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make a tmp buf to hold posdb keys
|
|
//SafeBuf tmp;
|
|
if ( ! m_sortedPosdbListBuf.reserve(m_metaListSize,"spbuf"))
|
|
return NULL;
|
|
// point into it
|
|
char *dst = m_sortedPosdbListBuf.getBufStart();
|
|
|
|
// debug test
|
|
//verifyMetaList ( m_metaList ,
|
|
// m_metaList + m_metaListSize ,
|
|
// false );
|
|
|
|
// scan the meta list for posdb keys
|
|
char *p = metaList;
|
|
char *pend = p + m_metaListSize;
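
// each meta-list record parsed below looks like this (a sketch inferred
// from the loop itself and from getMetaList()):
//
//   char    rdbId        // low 7 bits = rdbId, high bit = a flag
//   char    key[ks]      // ks = getKeySizeFromRdbId(rdbId)
//   int32_t dataSize     // only present if getDataSizeFromRdbId() is -1
//                        // and the key is positive (low bit set)
//   char    data[dataSize]
//
// we only keep positive RDB_POSDB keys and copy them into
// m_sortedPosdbListBuf for sorting.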
|
|
// stole this loop from getMetaList()
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save it with the flag
|
|
char byte = *p;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// get key
|
|
char *key = p;
|
|
// skip that
|
|
p += ks;
|
|
// get datasize
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
// assume we do not store the datasize
|
|
//bool neg = false;
|
|
// . if key is negative, no data is present
|
|
// . the doledb key is negative for us here
|
|
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
|
|
// if datasize variable, read it in
|
|
if ( ds == -1 ) {
|
|
// get data size
|
|
ds = *(int32_t *)p;
|
|
// skip data size int32_t
|
|
p += 4;
|
|
}
|
|
// point to data
|
|
//char *data = p;
|
|
// skip data if not zero
|
|
p += ds;
|
|
// if not posdb skip rec
|
|
if ( rdbId != RDB_POSDB ) continue;
|
|
// skip negative keys
|
|
if ( (key[0] & 0x01) == 0x00 ) continue;
|
|
// add to new buf now
|
|
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
|
|
// advance
|
|
dst += sizeof(POSDBKEY);
|
|
}
|
|
char *start = m_sortedPosdbListBuf.getBufStart();
|
|
// update tmp
|
|
m_sortedPosdbListBuf.incrementLength ( dst - start );
|
|
// sanity
|
|
if ( m_sortedPosdbListBuf.length() > m_metaListSize ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// point
|
|
char *pbuf = m_sortedPosdbListBuf.getBufStart();
|
|
int32_t numKeys = m_sortedPosdbListBuf.length()/sizeof(POSDBKEY);
|
|
|
|
// sort keys by termid
|
|
gbqsort ( pbuf ,
|
|
numKeys,
|
|
sizeof(POSDBKEY),
|
|
posdbKeyCmp,
|
|
m_niceness );
|
|
|
|
m_sortedPosdbListBufValid = true;
|
|
return &m_sortedPosdbListBuf;
|
|
}
|
|
|
|
|
|
#define TLBUFSIZE 5000
|
|
|
|
// . used by the seo pipeline
|
|
// . this is a list of posdb termlists, one termlist per termid.
|
|
// . we store each termlist in this termlistbuf into g_termListCache
|
|
// . we use g_termListCache for evaluating gbdocid:xxx| restricted queries
|
|
// very quickly without having to hit disk because all the posdb termlists
|
|
// for that docid should be in g_termListCache
|
|
SafeBuf *XmlDoc::getTermListBuf ( ) {
|
|
|
|
if ( m_termListBufValid )
|
|
return &m_termListBuf;
|
|
|
|
// . ensure content is recycled from title rec
|
|
// . no, because if we had to download the doc fresh for the first
|
|
// time, this caused us headaches around line 30657 and we ended
|
|
// up setting m_docIndexed to false there and calling logIt() twice!
|
|
//m_recycleContent = true;
|
|
//m_recycleLinkInfo = true;
|
|
|
|
// try to set from title rec first. return false if it blocked.
|
|
//if ( ! loadFromOldTitleRec() ) return (SafeBuf *)-1;
|
|
|
|
// did that fail? i.e. docid not found!?!?!
|
|
//if ( m_oldTitleRecValid && ! m_oldTitleRec ) {
|
|
// g_errno = ENOTFOUND;
|
|
// return NULL;
|
|
//}
|
|
|
|
// only get posdb keys in getMetaList()
|
|
/*
|
|
m_useTitledb = false;
|
|
m_useTagdb = false;
|
|
m_useClusterdb = false;
|
|
m_useSpiderdb = false;
|
|
m_useLinkdb = false;
|
|
*/
|
|
|
|
// . these are FULL 18-byte keys, no compression
|
|
// . sorted by posdbkeys straight up, so by termid
|
|
SafeBuf *posdbBuf = getTermIdSortedPosdbListBuf ();
|
|
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
|
|
|
|
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
|
|
|
|
// . reserve mem for new termlistbuf
|
|
// . include 4 bytes for listsize
|
|
// . this buffer will be a list of lists
|
|
int32_t need = numKeys * (sizeof(POSDBKEY) + 4);
|
|
if ( ! m_termListBuf.reserve ( need ,"tlstbuf" ) )
|
|
return NULL;
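
// the buffer built below is a flat "list of lists"; a sketch of how a
// consumer could walk it, given the layout written by the loop below:
//
//   char *tp    = m_termListBuf.getBufStart();
//   char *tpEnd = m_termListBuf.getBuf();
//   while ( tp < tpEnd ) {
//           int32_t listSize = *(int32_t *)tp; tp += 4;
//           // listSize bytes of full 18-byte POSDBKEYs follow, all
//           // sharing one termid and all for this docid
//           tp += listSize;
//   }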
|
|
|
|
|
|
int64_t lastTermId = -1LL;
|
|
/*
|
|
char tmpBuf[TLBUFSIZE];
|
|
// build termlists from the posdb records
|
|
RdbList termList;
|
|
// stolen from RdbList::set
|
|
termList.m_list = tmpBuf;
|
|
termList.m_listSize = 0;
|
|
termList.m_listEnd = tmpBuf;
|
|
termList.m_alloc = tmpBuf;
|
|
termList.m_allocSize = TLBUFSIZE;
|
|
termList.m_ownData = false;
|
|
termList.m_ks = sizeof(POSDBKEY);
|
|
termList.m_fixedDataSize = 0;
|
|
termList.m_ownData = false;
|
|
termList.m_useHalfKeys = true;
|
|
termList.resetListPtr();
|
|
bool breakOut = false;
|
|
*/
|
|
// start a size bookmark
|
|
int32_t *bookmark = NULL;
|
|
// scan all the sorted posdb keys and build posdb termlists and
|
|
// store the termlists into "m_termListBuf"
|
|
char *p = posdbBuf->getBufStart();
|
|
char *pend = p + posdbBuf->length();
|
|
for ( ; p < pend ; ) {
|
|
// get the key
|
|
char *key = p;
|
|
// must be full 18 byte keys!
|
|
if ( p[0] & 0x06 ) { char *xx=NULL;*xx=0; }
|
|
// skip it
|
|
p += sizeof(POSDBKEY);
|
|
// get key termid
|
|
int64_t termId = g_posdb.getTermId ( key );
|
|
// sanity
|
|
int64_t docId = g_posdb.getDocId ( key );
|
|
if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
|
|
// sanity. is it sorted by termid?
|
|
if ( termId < lastTermId && lastTermId != -1 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// log it for debug
|
|
//if ( docId == 192304365235LL )
|
|
// log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
|
|
// docId,
|
|
// termId,
|
|
// g_posdb.getWordPos(key));
|
|
// . store size of keys following that have same termid
|
|
// . assume just one for now!
|
|
if ( termId != lastTermId ) {
|
|
bookmark = (int32_t *)m_termListBuf.getBuf();
|
|
m_termListBuf.pushLong(sizeof(POSDBKEY));
|
|
}
|
|
// store the key
|
|
m_termListBuf.safeMemcpy ( key , sizeof(POSDBKEY) );
|
|
// if not first in the list, update size
|
|
if ( termId == lastTermId ) *bookmark += sizeof(POSDBKEY);
|
|
// . cache currently made list then
|
|
// . set startkey/endkey
|
|
//char startKey[sizeof(POSDBKEY)];
|
|
//char endKey [sizeof(POSDBKEY)];
|
|
//g_posdb.makeStartKey(startKey,lastTermId,m_docId);
|
|
//g_posdb.makeEndKey (endKey,lastTermId,m_docId);
|
|
// update it for next list
|
|
lastTermId = termId;
|
|
// . add to ongoing list? will use compression bit.
|
|
// . return true with g_errno set on error
|
|
// . use g_termListCache in Msg0.cpp
|
|
//if(!addToTermListCache(cr->m_coll,startKey,endKey,&termList))
|
|
// return true;
|
|
// first store the list size
|
|
//m_termListBuf.pushLong(termList.m_listSize);
|
|
// then the list data itself
|
|
//m_termListBuf.safeMemcpy(termList.m_list,termList.m_listSize)
|
|
// now reset
|
|
//termList.m_listSize = 0;
|
|
//termList.m_list = tmpBuf;
|
|
//termList.m_listEnd = tmpBuf;//ermList.m_list;
|
|
//termList.resetListPtr();
|
|
// if we are a loopback, bail
|
|
//if ( breakOut ) break;
|
|
// are we the last record?
|
|
//if ( p >= pend ) breakOut = true;
|
|
// add fresh to the new termlist
|
|
//goto addIt;
|
|
}
|
|
|
|
// sanity
|
|
if ( m_termListBuf.length() &&
|
|
g_posdb.getDocId(m_termListBuf.getBufStart()+4) != m_docId ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
m_termListBufValid = true;
|
|
|
|
return &m_termListBuf;
|
|
// print timing
|
|
//int64_t now = gettimeofdayInMilliseconds();
|
|
//int64_t took = now - m_cacheStartTime;
|
|
//log("seopipe: took %"INT64" ms to parse docid %"INT64"",took,m_docId);
|
|
// . flag it as being completely cached now
|
|
// . returns false and sets g_errno on error
|
|
//return addDocIdToTermListCache ( m_docId , cr->m_coll );
|
|
}
|
|
|
|
|
|
//int32_t XmlDoc::getNumInsertableTerms ( ) {
|
|
// // make sure they called getInsertableTerms() first!
|
|
// if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0;}
|
|
// return m_insertableTermsBuf.length() / sizeof(InsertableTerm);
|
|
//}
|
|
|
|
// . return a list of InsertableTerms
|
|
// . these are just terms we will try to insert into the document in every
|
|
// possible place to see how they affect ranking of this document for
|
|
// all the applicable queries
|
|
// . then when we call getScoredInsertableTerms() it will fill in the
|
|
// m_queryChangeBuf array
|
|
SafeBuf *XmlDoc::getInsertableTerms ( ) {
|
|
|
|
if ( m_insertableTermsBufValid )
|
|
return &m_insertableTermsBuf;
|
|
|
|
// make sure related query string buf is valid
|
|
//SafeBuf *rrr = getRelatedQueryLinksWithStrings();
|
|
//if ( ! rrr || rrr == (void *)-1 ) return rrr;
|
|
|
|
// just use this now
|
|
SafeBuf *mtBuf = getMissingTermBuf();
|
|
if ( ! mtBuf || mtBuf == (void *)-1 ) return mtBuf;
|
|
|
|
// get buffer of ptrs to the msg99 replies for this url
|
|
//SafeBuf *mqbuf = getMatchingQueries ( false );
|
|
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
|
|
|
|
// just use the MissingTerm class for these as well!!
|
|
SafeBuf *maBuf = getMatchingTermBuf();
|
|
if ( ! maBuf || maBuf == (void *)-1 ) return maBuf;
|
|
|
|
|
|
|
|
//
|
|
// alloc space for the insertable terms in its safebuf
|
|
//
|
|
int32_t need = 0;
|
|
char *p;
|
|
char *pend;
|
|
p = mtBuf->getBufStart();
|
|
pend = mtBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
MissingTerm *mt = (MissingTerm *)p;
|
|
p += mt->getSize();
|
|
need += sizeof(InsertableTerm);
|
|
need += mt->getTermSize();
|
|
}
|
|
// these are the matching terms, but use the same MissingTerm class
|
|
p = maBuf->getBufStart();
|
|
pend = maBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
MissingTerm *mt = (MissingTerm *)p;
|
|
p += mt->getSize();
|
|
need += sizeof(InsertableTerm);
|
|
need += mt->getTermSize();
|
|
}
|
|
if ( ! m_insertableTermsBuf.reserve ( need ,"itblbuf" ) ) return NULL;
|
|
|
|
//
|
|
// now interleave the matching terms with the related terms
|
|
//
|
|
|
|
char *p1 = mtBuf->getBufStart();
|
|
char *p1End = mtBuf->getBuf();
|
|
|
|
char *p2 = maBuf->getBufStart();
|
|
char *p2End = maBuf->getBuf();
|
|
|
|
// shortcut
|
|
SafeBuf *ib = &m_insertableTermsBuf;
|
|
|
|
int32_t count; for ( count = 0 ; ; count++ ) {
|
|
// . just get top 50 insertable terms
|
|
// . use #define MAX_INSERTABLE_TERMS 50?
|
|
if ( count >= 50 ) break;
|
|
bool add1 = false;
|
|
bool add2 = false;
|
|
if ( ( count % 2 ) == 0 && p1 < p1End ) add1 = true;
|
|
if ( ( count % 2 ) == 1 && p2 < p2End ) add2 = true;
|
|
if ( ! add1 && ! add2 ) break;
|
|
MissingTerm *mt;
|
|
if ( add1 ) {
|
|
mt = (MissingTerm *)p1;
|
|
p1 += mt->getSize();
|
|
}
|
|
if ( add2 ) {
|
|
mt = (MissingTerm *)p2;
|
|
p2 += mt->getSize();
|
|
}
|
|
// make an insertable term
|
|
InsertableTerm it;
|
|
if ( add1 ) it.m_isRelatedTerm = true;
|
|
else it.m_isRelatedTerm = false;
|
|
// sum of traffic of the queries that contained this term
|
|
it.m_trafficSum = mt->m_traffic;
|
|
// hash it up
|
|
char *term = mt->getTerm();
|
|
int32_t termSize = mt->getTermSize();
|
|
it.m_termHash64 = hash64 ( term , termSize - 1 );
|
|
it.m_termSize = termSize;
|
|
// reset this for later use
|
|
it.m_bestTrafficGain = -1;
|
|
it.m_bestInsertPos = -1;
|
|
// store that insertable term
|
|
ib->safeMemcpy(&it,sizeof(InsertableTerm));
|
|
// then the term string itself follows for easy serialization
|
|
// into cachedb...
|
|
ib->safeMemcpy(term,termSize);
|
|
}
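
// so the resulting buffer alternates sources while both last, e.g.
// missing#0, matching#0, missing#1, matching#1, ... capped at 50 loop
// iterations total (an iteration whose source is already exhausted adds
// nothing but still counts toward the 50).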
|
|
|
|
if ( ib->length() > need ) { char *xx=NULL;*xx=0; }
|
|
|
|
//m_numInsertableTerms = count;
|
|
|
|
m_insertableTermsBufValid = true;
|
|
return &m_insertableTermsBuf;
|
|
}
|
|
|
|
|
|
static void gotMsg95ReplyWrapper ( void *state , UdpSlot *slot ) {
	XmlDoc *THIS = (XmlDoc *)state;
	THIS->gotMsg95Reply( slot );
}

void XmlDoc::gotMsg95Reply ( UdpSlot *slot ) {
	// count it
	m_numMsg95Replies++;
	// . store each msg95reply BEFORE the early return below, otherwise
	//   every reply but the last is lost when its slot frees the buffer
	// . TODO: do we need m_msg95ReplyAlloc[] like m_msg99 has?
	m_msg95ReplyPtrs [slot->m_hostId] = slot->m_readBuf;
	m_msg95ReplySizes[slot->m_hostId] = slot->m_readBufSize;
	// do not let it free it, we will free it
	slot->m_readBuf = NULL;
	// return if still waiting on other hosts
	if ( m_numMsg95Replies < m_numMsg95Requests ) return;
	// all done! should call getScoredInsertableTerms() indirectly
	m_masterLoop ( m_masterState );
}
|
|
|
|
#include "seo.h" // for Msg95Request class
|
|
|
|
/*
|
|
// return a buffer of WordFreqInfo instances for every word in the
|
|
// insertable terms buffer. we use this so the msg95 handler can get the
|
|
// term freqs of any term in any matching query consistently, because
|
|
// we are host #0 calling this presumably. msg95 handler will use these
|
|
// to set the termfreqs in the Msg39Request when calling msg39.
|
|
// TODO: run through related queries as well! why didn't insertable terms
|
|
// work!?!?! it should...
|
|
SafeBuf *XmlDoc::getInsertableWordFreqInfoBuf ( ) {
|
|
|
|
// must always be host 0 or it's twin! we have to ensure
|
|
// consistency always when calling getTermFreq()...
|
|
if ( g_hostdb.m_groupId != 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( m_iwfiBufValid )
|
|
return &m_iwfiBuf;
|
|
|
|
// get the same top word ids we pass to the msg95 request,
|
|
// because handleRequest95() uses those to get the queries
|
|
// that we match, and it evaluates each of those queries on each
|
|
// insertion we do.
|
|
// So that is the ptr_twid32Buf, which MUST include all
|
|
// insertable terms as well, like those insertable terms that are
|
|
// new to us!!
|
|
|
|
// scan list of insertable terms
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
|
|
|
|
// . true means to get synonyms
|
|
// . itBuf non-null will append new insertable terms we don't have
|
|
int32_t *twids = getTopTermsVectorWithNewTerms ( true , itBuf );
|
|
if ( ! twids || twids==(void *)-1 ) return (SafeBuf *)twids;
|
|
|
|
// shortcut
|
|
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
|
|
//int32_t ni = itBuf->length() / sizeof(InsertableTerm);
|
|
|
|
// get buffer of ptrs to the msg99 replies for this url
|
|
//SafeBuf *mqbuf = getMatchingQueries ( false );
|
|
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
|
|
//Msg99Reply **mrp = (Msg99Reply **)mqbuf->getBufStart();
|
|
//int32_t nmrp = mqbuf->length() / 4;
|
|
|
|
|
|
// use table to dedup so we do not store dups
|
|
HashTableX dups;
|
|
if ( ! dups.set ( 8,0,8192,NULL,0,false,m_niceness,"iwfidup") )
|
|
return NULL;
|
|
|
|
// . first store the langid in the buf!!!
|
|
// . then the wordfreqinfos follow!
|
|
if ( ! m_iwfiBuf.safeMemcpy ( &docLangId , 1 ) )
|
|
return NULL;
|
|
|
|
char *p = itBuf->getBufStart();
|
|
char*pend = itBuf->getBuf();
|
|
|
|
// scan each "term" which might be one or more words
|
|
for ( ; p < pend ; ) {
|
|
//for ( int32_t i = 0 ; i < nmrp ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// add it in
|
|
if ( ! addTermFreqsForTerm ( it->getTerm() , &dups ) )
|
|
return NULL;
|
|
}
|
|
|
|
// do the same for all words and bigram terms in doc as well
|
|
|
|
|
|
m_iwfiBufValid = true;
|
|
return &m_iwfiBuf;
|
|
}
|
|
|
|
bool XmlDoc::addTermFreqsForTerm ( char *term , HashTableX *dups ) {
|
|
|
|
// we need this for synonyms
|
|
//uint8_t langId = langEnglish;
|
|
uint8_t *langIdPtr = getLangId();
|
|
// this should have been set by parent caller
|
|
if ( ! langIdPtr || langIdPtr == (uint8_t *)-1 ) {char *xx=NULL;*xx=0;}
|
|
// get the language this doc is in
|
|
uint8_t docLangId = *langIdPtr;
|
|
// if unknown, use english!
|
|
if ( docLangId == langUnknown ) docLangId = langEnglish;
|
|
|
|
|
|
//Msg99Reply *mr = mrp[i];
|
|
//Words ww;
|
|
//ww.set3 ( it->m_termStr );
|
|
//ww.set3(it->getTerm() );//mr->m_queryStr );//it->m_termStr );
|
|
Query qq;
|
|
// false = query expansion? i.e. use synonyms?
|
|
//qq.set2 ( it->getTerm(),docLangId,true);
|
|
qq.set2 ( term,docLangId,true);
|
|
//if ( strstr ( mr->m_queryStr, "bio wagner"))
|
|
// log("hey");
|
|
log("adding %s",term);
|
|
//int64_t *wids = ww.getWordIds();
|
|
// scan each word for term freq
|
|
for ( int32_t j = 0 ; j < qq.m_numTerms ; j++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qq.m_qterms[j];
|
|
// get the full 64-bit hash of the word
|
|
int64_t wid = qt->m_rawTermId;
|
|
// skip if punct
|
|
if ( ! wid ) continue;
|
|
// dup?
|
|
if ( dups->isInTable ( &wid ) ) continue;
|
|
// add it
|
|
int64_t tf = g_posdb.getTermFreq ( cr->m_coll, wid );
|
|
if ( ! dups->addKey ( &wid ) ) return NULL;
|
|
WordFreqInfo wfi;
|
|
wfi.m_wordId64 = wid;
|
|
wfi.m_wordFreq64 = tf;
|
|
// note it
|
|
SafeBuf bb;
|
|
bb.safePrintf("seo: tf for term=\"");
|
|
bb.safeMemcpy ( qt->m_term, qt->m_termLen);
|
|
bb.safePrintf("\" = %"INT64"",tf);
|
|
log("seo: %s",bb.getBufStart());
|
|
// store it
|
|
if(!m_iwfiBuf.safeMemcpy(&wfi,sizeof(WordFreqInfo)))
|
|
return NULL;
|
|
}
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// 2. now transmit all the insertable terms to each host in the network. each
|
|
// host will evaluate each term in the list for every query that that
|
|
// host has in its memory for every new word position. kick this process
|
|
// off with the getNewRanks() function which returns a list of
|
|
// query terms where each query term has a wordposition/trafficgain
|
|
// array. [try to also insert entire phrases not just words]
|
|
// Each host will return an InsertedTerm class for each term. But then
|
|
// WE have to merge the InsertedTerm classes together for a particular
|
|
// term. That can be a bit tricky since we do not list a wordposition
|
|
// if it's traffic gain was the same as its previous wordposition.
|
|
// PASS in the entire doc's termlist with each request in case not in cache
|
|
// so it can evaluate each query's scores very quickly!
|
|
//
|
|
// . send a msg95 request to each host consisting of a list of terms to
|
|
// insert, and the entire termlists of this document.
|
|
// . then merge the replies into a final list of InsertedTerms.
|
|
// . returned is buffer of InsertableTerms
|
|
SafeBuf *XmlDoc::getScoredInsertableTerms ( ) {
|
|
|
|
setStatus ( "getscoredinsertableterms" );
|
|
|
|
if ( m_scoredInsertableTermsBufValid )
|
|
return &m_insertableTermsBuf;
|
|
|
|
uint8_t *langIdPtr = getLangId();
|
|
if ( ! langIdPtr || langIdPtr == (void *)-1 )
|
|
return (SafeBuf *)langIdPtr;
|
|
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
|
|
|
|
// these are the posdb keys of our document, makes it fast
|
|
// and easy for msg39 to return a serp score restricted to our docid
|
|
SafeBuf *termListBuf = getTermListBuf();
|
|
if ( ! termListBuf || termListBuf==(void *)-1 )
|
|
return termListBuf;
|
|
|
|
|
|
// this has all our documents terms and their synonyms in it,
|
|
// as well as the new terms we plan to insert that our doc does not
|
|
// have, from the getMissingTerms() buffer. in addition it
|
|
// has the term freq of each one!
|
|
SafeBuf *ntiBuf = getNewTermInfoBuf();
|
|
if ( ! ntiBuf || ntiBuf == (void *)-1 ) return (SafeBuf *)ntiBuf;
|
|
|
|
// get list of TermFreqInfo instances for all words in the
|
|
// lits of insertable terms
|
|
//SafeBuf *wfib = getInsertableWordFreqInfoBuf ( );
|
|
//if ( ! wfib || wfib == (void *)-1 ) return wfib;
|
|
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) return wpib;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// if still waiting for replies to come in, return -1
|
|
if ( m_numMsg95Requests > 0 && m_numMsg95Replies < m_numMsg95Requests )
|
|
return (SafeBuf *)-1;
|
|
|
|
top:
|
|
|
|
// otherwise, we are done!
|
|
if ( m_numMsg95Requests > 0 && m_numMsg95Replies >=m_numMsg95Requests){
|
|
// . calculate the best insertable position for each
|
|
// Insertable Term.
|
|
// . we get a QueryChange array back from each host for
|
|
// the same term, but for queries local on that host,
|
|
// so add them all up here and set
|
|
// InsertableTerm::m_bestTrafficGain/m_bestTermPosition
|
|
// . queries that did not have us in the top 50 will not
|
|
// be in the reply
|
|
processMsg95Replies();
|
|
// show how long it took
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginMsg95s;
|
|
log("seopipe: time: getscoredinsertableterms took %"INT64" ms",
|
|
took);
|
|
// return the list of InsertableTerms, scored
|
|
m_scoredInsertableTermsBufValid = true;
|
|
// cache it! if it blocks that is ok, since it is valid now
|
|
// disable for debug... MDW!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
if ( ! storeScoredInsertableTermsIntoCachedb() )
|
|
return (SafeBuf *)-1;
|
|
return &m_insertableTermsBuf;
|
|
}
|
|
|
|
|
|
// now send every term in this list to every host in the
|
|
// network so it can evaluate with each of the queries it contains
|
|
// in memory from the query log for every position in the doc.
|
|
// then it will return InsertableTerm::m_wordPositions/m_trafficGain
|
|
// arrays for each InsertableTerm.
|
|
|
|
// time how long this whole thing takes
|
|
m_beginMsg95s = gettimeofdayInMilliseconds();
|
|
// reset this crap i guess
|
|
m_numMsg95Requests = 0;
|
|
m_numMsg95Replies = 0;
|
|
|
|
// from seo.h
|
|
Msg95Request mr;
|
|
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
mr.m_docId = m_docId;
|
|
mr.m_docLangId = *langIdPtr;
|
|
mr.m_seoDebug = m_seoDebug;
|
|
|
|
mr.ptr_posdbTermList = termListBuf->getBufStart();
|
|
// a buffer of TermInfos. used to set the termFreq of each term
|
|
// and used to determine what queries match the doc and should be
|
|
// evaluated for every insertion.
|
|
mr.ptr_termInfoBuf = ntiBuf->getBufStart();
|
|
mr.ptr_coll = cr->m_coll;
|
|
//mr.ptr_wordFreqInfoBuf = wfib->getBufStart();
|
|
mr.ptr_wordPosInfoBuf = wpib->getBufStart();
|
|
// why do we need this? doesn't termInfoBuf have all that? no,
|
|
// because we limit insertableterms to like the top 300 highest
|
|
// scoring, so they are separate. the termInfoBuf is sorted by
|
|
// termid (lower 32-bits) and has a termfreq and is used to
|
|
// get the matching queries in seo.cpp:handlerequest95()
|
|
mr.ptr_insertableTerms = m_insertableTermsBuf.getBufStart();
|
|
|
|
mr.size_posdbTermList = termListBuf->length();
|
|
mr.size_termInfoBuf = ntiBuf->length();//m_numTwids * 4;
|
|
mr.size_coll = gbstrlen(cr->m_coll)+1;
|
|
//mr.size_wordFreqInfoBuf = wfib->length();
|
|
mr.size_wordPosInfoBuf = wpib->length();
|
|
mr.size_insertableTerms = m_insertableTermsBuf.length();
|
|
|
|
int32_t requestSize;
|
|
char *req = serializeMsg ( sizeof(Msg95Request),
|
|
&mr.size_posdbTermList ,// firstSizeParm
|
|
&mr.size_insertableTerms,//lastSizeP
|
|
&mr.ptr_posdbTermList ,// firststrptr
|
|
&mr ,// thisPtr
|
|
&requestSize ,
|
|
NULL ,
|
|
0 ,
|
|
true );
|
|
|
|
if ( ! req ) return NULL;
|
|
|
|
int32_t numHosts = g_hostdb.m_numHosts;
|
|
// do not re-send if we already did this!
|
|
if ( m_numMsg95Requests > 0 ) numHosts = 0;
|
|
|
|
// send one msg95 request to each host. skip if dead.
|
|
for ( int32_t i = 0; i < numHosts ; i++ ) {
|
|
// get ptr to the host
|
|
Host *host = g_hostdb.getHost(i);
|
|
// get hostid of host #i
|
|
int32_t hostId = host->m_hostId;
|
|
// count it
|
|
m_numMsg95Requests++;
|
|
// skip if dead. i guess no queries from that guy. we can't
|
|
// send to a twin because the twin does not have the same
|
|
// queries in its in-memory query log. once we get more
|
|
// machines we should probably make the twin have the same
|
|
// copy so we can be redundant.
|
|
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive ) {
|
|
log("seo: warning. host %"INT32" is dead so we could "
|
|
"not do the keyword tool right",hostId);
|
|
m_numMsg95Replies++;
|
|
continue;
|
|
}
|
|
// . send our posdb termlist to each host so it can
|
|
// call msg39 restricted to our docid very quickly
|
|
// . also send a ALL of the insertable terms to each
|
|
// host so they can evaluate the insertion for all of the
|
|
// relevant queries.
|
|
// . each host should be smart enough to realize that some
|
|
// queries need not be performed for an insertion because
|
|
// it is impossible to break the minimum score to be in the
|
|
// top 50 for that query. but we'll only have a minimum
|
|
// score for each query once we run a batch to eval
|
|
// each query at least partially to get a rough idea of
|
|
// the score needed to be in the top 50.
|
|
// . reply should be an array of QueryChanges for each
|
|
// insertable term for every query that matches this doc
|
|
// in the g_qlog buffer.
|
|
// . in most cases these arrays will be empty because we are
|
|
// not in the top 50 for that query
|
|
if ( ! g_udpServer.sendRequest ( req ,
|
|
requestSize ,
|
|
0x95 , // msgtype
|
|
host->m_ip , // ip
|
|
host->m_port , // port
|
|
hostId,
|
|
NULL, // retslot
|
|
this,
|
|
gotMsg95ReplyWrapper,
|
|
10000 , // timeout
|
|
-1 , // backoff
|
|
-1 , // maxwait
|
|
NULL, // replybuf
|
|
0, // replybufmaxsize
|
|
m_niceness // niceness
|
|
)) {
|
|
// let admin know about error
|
|
log("seopipe: sendRequest 95 had error: %s",
|
|
mstrerror(g_errno));
|
|
// count it as replied then
|
|
m_numMsg95Replies++;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// wait for all msg95 replies to come in
|
|
if ( m_numMsg95Requests > m_numMsg95Replies )
|
|
return (SafeBuf *)-1;
|
|
|
|
// somehow we finished without blocking
|
|
goto top;
|
|
|
|
// dummy return
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// now sort the huge ptr buffer to QueryChanges first by:
|
|
// 1: QueryChange::m_termHash64
|
|
// 2: QueryChange::m_queryHash32
|
|
// 3: QueryChange::m_insertPos
|
|
int queryChangeCmp ( const void *a, const void *b ) {
	QueryChange *qa = *(QueryChange **)a;
	QueryChange *qb = *(QueryChange **)b;
	// smallest term hash should be at the head of the list
	if ( qa->m_termHash64  < qb->m_termHash64  ) return -1;
	if ( qa->m_termHash64  > qb->m_termHash64  ) return  1;
	if ( qa->m_queryHash32 < qb->m_queryHash32 ) return -1;
	if ( qa->m_queryHash32 > qb->m_queryHash32 ) return  1;
	if ( qa->m_insertPos   < qb->m_insertPos   ) return -1;
	if ( qa->m_insertPos   > qb->m_insertPos   ) return  1;
	return 0;
}
|
|
|
|
|
|
// . make each InsertableTerm point to a linked list of QueryChanges for it.
|
|
// . each QueryChange is a word position and a rank change
|
|
// . the linked list will be sorted by QueryChange::m_insertPos
|
|
// . there can be multiple QueryChanges for a single m_insertPos, but
|
|
// they will be for different queries.
|
|
bool XmlDoc::processMsg95Replies() {
|
|
|
|
int32_t need = 0;
|
|
// each reply is a list of QueryChanges
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// get reply
|
|
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
|
|
// skip if empty, error?
|
|
if ( ! mr ) continue;
|
|
// deserialize the msg95replies first
|
|
deserializeMsg ( sizeof(Msg95Reply) ,
|
|
(int32_t *)&mr->size_queryChangeBuf,//1stszparm
|
|
(int32_t *)&mr->size_queryLogBuf,//lastszparm
|
|
(char **)&mr->ptr_queryChangeBuf,//1ststrptr
|
|
mr->m_buf );
|
|
// scan the QueryChanges
|
|
//QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
|
|
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
|
|
need += ncs * sizeof(QueryChange *);
|
|
}
|
|
// alloc now
|
|
SafeBuf hugePtrBuf;
|
|
if ( ! hugePtrBuf.reserve ( need ,"hpbuf" ) ) return false;
|
|
|
|
// how big are all query log bufs?
|
|
int32_t sumTotal = 0;
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// get reply
|
|
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
|
|
// skip if empty, error?
|
|
if ( ! mr ) continue;
|
|
// how big
|
|
sumTotal += mr->size_queryLogBuf;
|
|
}
|
|
m_queryLogBuf.reset();
|
|
if ( ! m_queryLogBuf.reserve ( sumTotal ,"qlogbuf") ) return false;
|
|
char *orig = m_queryLogBuf.getBufStart();
|
|
|
|
int32_t ongoingOffset = 0;
|
|
int32_t ongoingDebugOffset = 0;
|
|
int32_t ongoingOrigOffset = 0;
|
|
|
|
// . fill up hugePtrBuf for sorting below
|
|
// . also fill up m_queryLogBuf now for store*IntoCachedb()
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// get reply
|
|
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
|
|
// skip if empty, error?
|
|
if ( ! mr ) continue;
|
|
// ref it
|
|
//char *ref = m_queryLogBuf.getBuf();
|
|
//int32_t ref = m_queryLogBuf.length();
|
|
// add to our big buffer
|
|
m_queryLogBuf.safeMemcpy ( mr->ptr_queryLogBuf ,
|
|
mr->size_queryLogBuf );
|
|
// debug scores. should be length 0 if not debugging.
|
|
m_debugScoreInfoBuf.safeMemcpy ( mr->ptr_debugScoreInfoBuf ,
|
|
mr->size_debugScoreInfoBuf );
|
|
// original scores buf
|
|
m_origScoreInfoBuf.safeMemcpy ( mr->ptr_origScoreInfoBuf ,
|
|
mr->size_origScoreInfoBuf );
|
|
// scan the QueryChanges
|
|
QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
|
|
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
|
|
for ( int32_t j = 0 ; j < ncs ; j++ ) {
|
|
QueryChange *qc = &qcs[j] ;
|
|
// this is relative to ptr_queryLogBuf
|
|
qc->m_replyQueryOffset += ongoingOffset;
|
|
// if we have debug score info
|
|
if ( m_seoDebug >= 2 ) {
|
|
if ( qc->m_debugScoreInfoOffset < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
if ( qc->m_origScoreInfoOffset < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
qc->m_debugScoreInfoOffset +=
|
|
ongoingDebugOffset;
|
|
qc->m_origScoreInfoOffset +=
|
|
ongoingOrigOffset;
|
|
}
|
|
// that's relative to the msg95reply's ptr_queruStrBuf
|
|
//QueryLogEntry *qe;
|
|
//qe = (QueryLogEntry *)(mr->ptr_queryLogBuf + qoff);
|
|
//qe = (QueryLogEntry *)(ref + qoff);
|
|
// HACK that in. RELATIVE to m_queryLogBuf!!!
|
|
//qc->m_queryOffset3 = ref;//(int32_t)qe;
|
|
// add ptr to our global buffer
|
|
hugePtrBuf.pushPtr ( qc );
|
|
}
|
|
// sum it up
|
|
ongoingOffset += mr->size_queryLogBuf;
|
|
ongoingDebugOffset += mr->size_debugScoreInfoBuf;
|
|
ongoingOrigOffset += mr->size_origScoreInfoBuf;
|
|
}
|
|
// sanity. make sure doesn't grow since we reference it
|
|
if ( m_queryLogBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now sort the huge ptr buffer to QueryChanges first by:
|
|
// 1: QueryChange::m_termHash64
|
|
// 2: QueryChange::m_queryHash32
|
|
// 3: QueryChange::m_insertPos
|
|
char *hhh = hugePtrBuf.getBufStart();
|
|
int32_t size = hugePtrBuf.length();
|
|
// this should breathe with niceness!!
|
|
gbqsort ( hhh ,
|
|
size/sizeof(QueryChange *) ,
|
|
sizeof(QueryChange *),
|
|
queryChangeCmp ,
|
|
m_niceness ) ;
|
|
|
|
// now store those sorted query changes into m_queryChangeBuf
|
|
// so we can cache them in store*IntoCached() easily
|
|
int32_t nqc = (need / sizeof(QueryChange *)) ;
|
|
if ( ! m_queryChangeBuf.reserve ( nqc * sizeof(QueryChange),"qcbuf") )
|
|
return false;
|
|
// for sanity check
|
|
char *orig2 = m_queryChangeBuf.getBufStart();
|
|
|
|
// copy over sorted into m_queryChangeBuf so we can cache it in cachedb
|
|
char *p = hhh;
|
|
char *pend = hhh + size;
|
|
for ( ; p < pend ; p += sizeof(QueryChange *) ) {
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)p;
|
|
// save ptr to it
|
|
char *ref = m_queryChangeBuf.getBuf();
|
|
// save it
|
|
m_queryChangeBuf.safeMemcpy ( qc , sizeof(QueryChange) );
|
|
// now ref that instead
|
|
*(QueryChange **)p = (QueryChange *)ref;
|
|
}
|
|
// sanity test
|
|
if ( m_queryChangeBuf.getBufStart() != orig2 ) { char *xx=NULL;*xx=0;}
|
|
|
|
// now we can free the replies since we stored the replies into
|
|
// m_queryLogBuf and m_queryChangeBuf for store*IntoCachedb()
|
|
for ( int32_t i = 0;i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg95ReplyPtrs[i] ) continue;
|
|
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
|
|
m_msg95ReplyPtrs[i] = NULL;
|
|
}
|
|
|
|
|
|
// . now set QueryChange::m_next to make our linked list
|
|
// . if it is for a different query or termhash then end the linked
|
|
// list by setting m_next to NULL
|
|
QueryChange *lastqc = NULL;
|
|
for ( p = hhh ; p < pend ; p += 4 ) {
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)p;
|
|
// assume we are the last one in the linked list
|
|
qc->m_next = NULL;
|
|
// make linked list
|
|
if ( lastqc &&
|
|
// terms must match to be in same linked list
|
|
lastqc->m_termHash64 == qc->m_termHash64 )
|
|
// link them
|
|
lastqc->m_next = qc;
|
|
// set this for next qc
|
|
lastqc = qc;
|
|
}
|
|
|
|
// now set InsertableTerm::m_firstQueryChange to point to the head
|
|
// of the linked list for that term based on its m_termHash64.
|
|
// but the insertable terms are sorted by m_trafficSum.
|
|
// map a termHash64 to its corresponding first QueryChange.
|
|
HashTableX tit;
|
|
if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0; }
|
|
int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
|
|
if ( ! tit.set ( 8,4, ni*4,NULL,0,false,m_niceness,"tittbl") )
|
|
return false;
|
|
|
|
int64_t lastHash64 = 0LL;
|
|
// . store ptr to first querychange for each termhash64 into hash table
|
|
// . should be the head of the linked list for a termid
|
|
for ( p = hhh ; p < pend ; p += 4 ) {
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)p;
|
|
// skip if not a new term hash
|
|
if ( qc->m_termHash64 == lastHash64 ) continue;
|
|
// update it
|
|
lastHash64 = qc->m_termHash64;
|
|
// . map it in the hash table then
|
|
// . it should be pre-allocated!
|
|
if (!tit.addKey(&qc->m_termHash64,&qc)){char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// now scan the insertable terms and set their
|
|
// InsertableTerm::m_firstQueryChange ptr. points to the head
|
|
// of the QueryChange linked list for this insertable term
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
p = itBuf->getBufStart();
|
|
pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// assume none
|
|
it->m_firstQueryChange = NULL;
|
|
char *val = (char *)tit.getValue(&it->m_termHash64);
|
|
// i guess there is none
|
|
if ( ! val ) continue;
|
|
// cast it
|
|
QueryChange *qc = *(QueryChange **)val;
|
|
// and assign
|
|
it->m_firstQueryChange = qc;
|
|
}
|
|
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
|
|
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
|
|
|
|
// now set InsertableTerm::m_bestTrafficGain/m_bestInsertPos/
|
|
// m_bestQueryChange by scanning the linked list and scoring each
|
|
// QueryChange::m_insertPos to see which is the highest traffic gain.
|
|
// and in the case of ties prefer the lowest word position.
|
|
p = itBuf->getBufStart();
|
|
pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// . use this function now so seo.cpp can call it too!
|
|
// . sets WordPosInfo::m_trafficGain members
|
|
setWordPosInfosTrafficGain ( it );
|
|
// now find the insert position with the most traffic gain!
|
|
int32_t bestTrafficGain = -1;
|
|
int32_t bestInsertPos = -1;
|
|
for ( int32_t j = 0 ; j < nwpis ; j++ ) {
|
|
// skip if not the best scoring position
|
|
if ( wpis[j].m_trafficGain <= bestTrafficGain &&
|
|
// and if not first time!
|
|
bestInsertPos != -1 )
|
|
continue;
|
|
// we got a new winner
|
|
bestTrafficGain = wpis[j].m_trafficGain;
|
|
bestInsertPos = wpis[j].m_wordPos;//insertPos;
|
|
}
|
|
// set it
|
|
it->m_bestTrafficGain = bestTrafficGain;
|
|
it->m_bestInsertPos = bestInsertPos;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void XmlDoc::setWordPosInfosTrafficGain ( InsertableTerm *it ) {
|
|
|
|
// get the wordposinfobuf!
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
|
|
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
|
|
|
|
// . use the wordposinfo array to accumulate traffic gains
|
|
// for each word position, WordPosInfo::m_insertPos.
|
|
// . TODO: ignore tags like gblangid:
|
|
// . so reset the traffic gains first
|
|
for ( int32_t j = 0 ; j < nwpis ; j++ )
|
|
wpis[j].m_trafficGain = 0;
|
|
|
|
|
|
if ( ! it ) return;
|
|
|
|
// head of the linked list of QueryChanges for this InsertableTerm
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
// skip if no list. leave traffic gains set to 0 for all
|
|
if ( ! qc ) return;
|
|
|
|
|
|
// accumulate traffic gains
|
|
int32_t k = 0;
|
|
int32_t lastQueryHash32 = 0;
|
|
//bool firstQueryChangeForQuery;
|
|
QueryChange *lastqc = NULL;
|
|
// . scan the linked list of query changes
|
|
// . this is sorted by query first then m_insertPos
|
|
for ( ; qc ; qc = qc->m_next ) {
|
|
// assume NOT the first QueryChange for this query
|
|
//firstQueryChangeForQuery = false;
|
|
// . reset stuff for each different query
|
|
// . QueryChanges are sorted by m_queryHash32 secondly
|
|
// and by m_insertPos thirdly now...
|
|
if ( qc->m_queryHash32 != lastQueryHash32 ) {
|
|
// reset our WordPosInfo cursor
|
|
k = 0;
|
|
// for detecting the next set of QueryChanges
|
|
// for a different query
|
|
lastQueryHash32 = qc->m_queryHash32;
|
|
//firstQueryChangeForQuery = true;
|
|
lastqc = NULL;
|
|
}
|
|
// sanity
|
|
if ( lastqc && lastqc->m_insertPos > qc->m_insertPos ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// compute the traffic in advance from the rank changes
|
|
int32_t trafficGain = getTrafficGain( qc );
|
|
// checkpoint
|
|
/*
|
|
if ( trafficGain > 0 )
|
|
log("got some traffic gain qh=%"UINT32" "
|
|
"pos=%"INT32" term=%s gain=%"INT32"",
|
|
qc->m_queryHash32,
|
|
qc->m_insertPos,
|
|
it->m_termStr,
|
|
trafficGain);
|
|
*/
|
|
// get next query change
|
|
QueryChange *nqc = qc->m_next;
|
|
// make it NULL if for a different query
|
|
if ( nqc && nqc->m_queryHash32 != qc->m_queryHash32 )
|
|
nqc = NULL;
|
|
// . we use a compression where we only store a
|
|
// QueryChange if different than the last QueryChange
|
|
// . so advance the WordPosInfos cursor "k" until
|
|
// we catch up to the qc->m_insertPos.
|
|
for ( ; k < nwpis ; k++ ) {
|
|
// stop if we are caught up
|
|
if ( wpis[k].m_wordPos >= qc->m_insertPos )
|
|
break;
|
|
}
|
|
// now this position and up to next qc "nqc" gets the traffic
|
|
for ( ; k < nwpis ; k++ ) {
|
|
// stop if we are caught up
|
|
if ( nqc && wpis[k].m_wordPos >= nqc->m_insertPos )
|
|
break;
|
|
wpis[k].m_trafficGain += trafficGain;
|
|
}
|
|
}
|
|
|
|
/*
|
|
// print out positives - debug
|
|
for ( int32_t k = 0 ; k < nwpis ; k++ ) {
|
|
// stop if we are caught up
|
|
if ( ! wpis[k].m_trafficGain ) continue;
|
|
if ( wpis[k].m_trafficGain <= 0 ) continue;
|
|
// note it
|
|
log("seo: gain pos=%"INT32" gain=%"INT32"",
|
|
wpis[k].m_wordPos,
|
|
wpis[k].m_trafficGain);
|
|
}
|
|
*/
|
|
}
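// A small worked example of the accumulation in
// setWordPosInfosTrafficGain() above (a sketch, not from the original
// source): suppose one query's linked list has QueryChanges at
// m_insertPos 5 (gain +100) and 12 (gain +40), and the WordPosInfo array
// holds word positions 2, 5, 8, 12 and 16. Since a QueryChange is only
// stored when the score differs from the previous one, the +100 gain is
// credited to every insert position from 5 up to (but not including) 12,
// so positions 5 and 8 each accumulate +100, while 12 and 16 accumulate
// +40. Position 2 precedes the first QueryChange and gets nothing.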
|
|
|
|
double getTrafficPercent ( int32_t rank ) {
|
|
// from aol's query logs from that same searchenginewatch.com url
|
|
static double s_posClicks[1000] = {
|
|
.4230, // #1
|
|
.1192,
|
|
.0844,
|
|
.0603,
|
|
.0486,
|
|
.0399,
|
|
.0337,
|
|
.0298,
|
|
.0283,
|
|
.0270 // #10 (was .0297 but for our purposes, make it <)
|
|
};
|
|
|
|
//static float s_pageClicks[5];
|
|
|
|
// set total of clicks each page gets
|
|
static bool s_init = false;
|
|
if ( ! s_init ) {
|
|
s_init = true;
|
|
//float sum = 0.0;
|
|
//for ( int32_t i = 0 ; i < 10 ; i++ )
|
|
// sum += s_posClicks[i];
|
|
// this is about .11 or so
|
|
//float pageFactor = 1.0 - sum;
|
|
// HACK! make it pass the sanity check below!
|
|
//pageFactor *= .50;
|
|
// sanity. do not allow top result on 2nd page
|
|
// to rank higher!!
|
|
//if ( pageFactor * s_posClicks[0] > s_posClicks[9] ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// will be like .11 for second page, .01 for 3rd, etc.
|
|
//float pageMult = 1.0;
|
|
// fill in the rest
|
|
for ( int32_t i = 10 ; i < 1000 ; i++ ) {
|
|
// just make it linear since there is too much
|
|
// chaos as to our diffs with google. so this is
|
|
// a good way to estimate...
|
|
s_posClicks[i] = .0270 - .0007 * i;
|
|
if ( s_posClicks[i] < 0 )
|
|
s_posClicks[i] = 0.0;
|
|
}
|
|
// sanity to make sure all in order
|
|
for ( int32_t i = 1 ; i < 1000 ; i++ ) {
|
|
if ( s_posClicks[i-1] < s_posClicks[i] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
if ( s_posClicks[i] < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
}
|
|
}
|
|
|
|
if ( rank >= 1000 ) rank = 999;
|
|
if ( rank < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
return s_posClicks[rank];
|
|
}
|
|
// . based on difference between m_oldRank and m_newRank
|
|
// . m_*Rank starts at 0 and goes to 9 for first page of results
|
|
int32_t XmlDoc::getTrafficGain ( QueryChange *qc ) {
|
|
|
|
// no rank change? both ranks can be -1 if it is a missing
// term, i guess, and we're not inserting it.
|
|
if ( qc->m_oldRank == qc->m_newRank ) return 0;
|
|
|
|
// get old clicks
|
|
int32_t oldRank = qc->m_oldRank;
|
|
double oldp;
|
|
// if not ranked before because this was inserting a brand new
|
|
// missing term, this will be -1
|
|
if ( oldRank == -1 ) oldp = 0.0;
|
|
else oldp = getTrafficPercent ( oldRank );
|
|
//if ( oldRank < 50 ) oldp = s_posClicks[oldRank];
|
|
|
|
// get new clicks
|
|
int32_t newRank = qc->m_newRank;
|
|
float newp = getTrafficPercent ( newRank );
|
|
//if ( newRank < 50 ) newp = s_posClicks[newRank];
|
|
|
|
// HACK
|
|
// we stored the entire querylogreply buf in here
|
|
char *ref = m_queryLogBuf.getBufStart();
|
|
// so we can use the replyqueryoffset then...
|
|
QueryLogEntry *qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
|
|
int32_t traffic = qe->m_gigablastTraffic;
|
|
traffic *= GB_TRAFFIC_MODIFIER;
|
|
|
|
int32_t trafficChange = (int32_t)((newp - oldp) * traffic);
|
|
|
|
// sanity.
|
|
if ( qc->m_oldRank > qc->m_newRank && trafficChange < 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// ignore this sanity check if not ranked before. i.e. inserting
|
|
// a new missing term...
|
|
if ( qc->m_oldRank != -1 &&
|
|
qc->m_oldRank < qc->m_newRank && trafficChange > 0 ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// return the change. it might be negative!
|
|
return trafficChange;
|
|
}
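// Worked example of the gain computation above (a sketch; the traffic
// number is made up and GB_TRAFFIC_MODIFIER is treated as 1.0): if a
// query's QueryLogEntry reports m_gigablastTraffic of 10,000 and an
// insertion moves us from m_oldRank 4 (getTrafficPercent(4) = .0486) to
// m_newRank 0 (getTrafficPercent(0) = .4230), then
// trafficChange = (int32_t)((.4230 - .0486) * 10000) = 3744 extra clicks.
// A brand new term has m_oldRank == -1, so oldp is treated as 0.0 and the
// full newp * traffic counts as gain.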
|
|
|
|
|
|
// 4. then we just dump out all the InsertedTerms into xml so they can be
|
|
// displayed on the front end.
|
|
|
|
// dump the list of InsertedTerms into "sbuf" as xml
|
|
bool XmlDoc::printScoredInsertableTerms ( SafeBuf *sbuf ) {
|
|
// print the header
|
|
sbuf->safePrintf("\t<insertableTerms>\n");
|
|
// scan each term
|
|
SafeBuf *itBuf = getInsertableTerms();
|
|
// has to be there
|
|
if ( ! itBuf || itBuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
SafeBuf *wpib = getWordPosInfoBuf();
|
|
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
|
|
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
|
|
|
|
// cast it
|
|
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
|
|
// how many terms do we have?
|
|
//int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
|
|
// dedup queries used in query changes
|
|
HashTableX qdups;
|
|
if ( ! qdups.set(4,0,32,NULL,0,false,m_niceness,"qddd") ) return false;
|
|
|
|
//
|
|
// . print query map
|
|
// . print all query ids we use and their strings
|
|
//
|
|
bool firstTime = true;
|
|
|
|
char *p = itBuf->getBufStart();
|
|
char *pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// scan its query changes
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
for ( ; qc ; qc = qc->m_next ) {
|
|
// skip if already printed
|
|
if ( qdups.isInTable(&qc->m_queryHash32) ) continue;
|
|
if ( firstTime ) {
|
|
sbuf->safePrintf("\t\t<queryMap>\n");
|
|
sbuf->safePrintf("\t\t\t<desc>"
|
|
"<![CDATA["
|
|
"32bitSignedQueryHash,"
|
|
"queryString"
|
|
"]]></desc>\n"
|
|
);
|
|
}
|
|
firstTime = false;
|
|
// HACK
|
|
char *ref = m_queryLogBuf.getBufStart();
|
|
QueryLogEntry *qe;
|
|
qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
|
|
// new query, print it. map the hash to the string
|
|
// so we can just show the hash when printing
|
|
// out all the QueryChanges below to save space
|
|
sbuf->safePrintf("\t\t\t<queryPoint>"
|
|
"<![CDATA[%"INT32",%s]]>"
|
|
"</queryPoint>\n"
|
|
, qc->m_queryHash32
|
|
// hack...
|
|
, qe->getQueryStr()
|
|
);
|
|
// do not re-print
|
|
if ( ! qdups.addKey(&qc->m_queryHash32) )return false;
|
|
}
|
|
}
|
|
if ( ! firstTime )
|
|
sbuf->safePrintf("\t\t</queryMap>\n");
|
|
|
|
// . now the word position map
|
|
// . we only provided querychange if it has a different score than
|
|
// the previously stored querychange. this is a kind of compression
|
|
// . so you need to know all the possible word positions we tried
|
|
// for each insertion we did
|
|
sbuf->safePrintf("\t\t<wordInsertionMap>\n");
|
|
sbuf->safePrintf("\t\t\t<desc>"
|
|
"<![CDATA["
|
|
"Describes all positions we attempt to insert each "
|
|
"insertable term into. The terms at that position "
|
|
"and up are pushed forward by the insertion. "
|
|
"<sent> is the sentence number."
|
|
"]]></desc>\n"
|
|
);
|
|
for ( int32_t i = 0 ; i < nwpis ; i++ ) {
|
|
WordPosInfo *wpi = &wpis[i];
|
|
sbuf->safePrintf("\t\t\t<word>\n"
|
|
"\t\t\t\t<pos>%"INT32"</pos>\n"
|
|
"\t\t\t\t<sent>%"INT32"</sent>\n"
|
|
"\t\t\t\t<hashGroup>%s</hashGroup>\n"
|
|
"\t\t\t\t<densityRank>%"INT32"</densityRank>\n"
|
|
"\t\t\t\t<spamRank>%"INT32"</spamRank>\n"
|
|
"\t\t\t</word>\n"
|
|
,wpi->m_wordPos
|
|
,wpi->m_sentNum
|
|
,getHashGroupString(wpi->m_hashGroup)
|
|
,(int32_t)wpi->m_densityRank
|
|
,(int32_t)wpi->m_wordSpamRank
|
|
);
|
|
|
|
}
|
|
sbuf->safePrintf("\t\t</wordInsertionMap>\n");
|
|
|
|
|
|
// scan all the insertable terms
|
|
p = itBuf->getBufStart();
|
|
pend = itBuf->getBuf();
|
|
for ( ; p < pend ; ) {
|
|
QUICKPOLL(m_niceness);
|
|
// cast it
|
|
InsertableTerm *it = (InsertableTerm *)p;
|
|
p += it->getSize();
|
|
// print the term
|
|
sbuf->safePrintf("\t\t<term>\n");
|
|
// the string
|
|
sbuf->safePrintf("\t\t\t<string><![CDATA[%s]]></string>\n",
|
|
it->getTerm());
|
|
// sum of traffic of all queries containing this term
|
|
sbuf->safePrintf("\t\t\t<importance>%"INT32"</importance>\n",
|
|
it->m_trafficSum);
|
|
// is it contained in the doc/linktext or is it "related"
|
|
sbuf->safePrintf("\t\t\t<isRelatedTerm>%"INT32"</isRelatedTerm>\n",
|
|
(int32_t)it->m_isRelatedTerm);
|
|
// get the first query change if any
|
|
QueryChange *qc = it->m_firstQueryChange;
|
|
// limit to fix firefox crash
|
|
//int32_t queryChangeLimit = 30;
|
|
// skip if no list
|
|
if ( ! qc ) goto skip;
|
|
// print the insert position that gives us the most traffic
|
|
sbuf->safePrintf("\t\t\t<bestInsertPosition>%"INT32""
|
|
"</bestInsertPosition>\n",
|
|
it->m_bestInsertPos);
|
|
sbuf->safePrintf("\t\t\t<bestTrafficGain>%"INT32""
|
|
"</bestTrafficGain>\n",
|
|
it->m_bestTrafficGain);
|
|
// print query changes
|
|
if ( it->m_firstQueryChange )
|
|
sbuf->safePrintf("\t\t\t<queryChanges><![CDATA["
|
|
);
|
|
// print out query changes for this term
|
|
for ( qc = it->m_firstQueryChange ; qc ; qc = qc->m_next ) {
|
|
// fix firefox crash for now
|
|
//if ( --queryChangeLimit <= 0 ) break;
|
|
// now store in binary
|
|
sbuf->pushLong(qc->m_insertPos);
|
|
sbuf->pushLong(qc->m_queryHash32);
|
|
sbuf->pushChar(qc->m_oldRank);
|
|
sbuf->pushChar(qc->m_newRank);
|
|
/*
|
|
// . TODO: make sure to remove QueryChanges that have
|
|
// the same old and new rank
|
|
// . print it
|
|
sbuf->safePrintf("\t\t\t<queryChange>\n");
|
|
sbuf->safePrintf("\t\t\t\t<insertPos>%"INT32""
|
|
"</insertPos>\n", qc->m_insertPos);
|
|
sbuf->safePrintf("\t\t\t\t<oldRank>%"INT32""
|
|
"</oldRank>\n",(int32_t)qc->m_oldRank);
|
|
sbuf->safePrintf("\t\t\t\t<newRank>%"INT32""
|
|
"</newRank>\n",(int32_t)qc->m_newRank);
|
|
sbuf->safePrintf("\t\t\t\t<queryId>%"INT32""
|
|
"</queryId>\n",
|
|
qc->m_queryHash32 );
|
|
sbuf->safePrintf("\t\t\t</queryChange>\n");
|
|
*/
|
|
}
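// Note on the binary encoding above (a summary, assuming
// SafeBuf::pushLong writes 4 bytes and pushChar writes 1): each
// QueryChange is packed into the CDATA payload as a fixed 10-byte record,
//   [m_insertPos (4)][m_queryHash32 (4)][m_oldRank (1)][m_newRank (1)]
// so the front end can walk the payload in 10-byte strides.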
|
|
if ( it->m_firstQueryChange )
|
|
sbuf->safePrintf("]]></queryChanges>\n");
|
|
|
|
skip:
|
|
// print the term end
|
|
sbuf->safePrintf("\t\t</term>\n");
|
|
}
|
|
sbuf->safePrintf("\t</insertableTerms>\n");
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
static int wordPosInfoCmp ( const void *a, const void *b ) {
|
|
WordPosInfo *wa = (WordPosInfo *)a;
|
|
WordPosInfo *wb = (WordPosInfo *)b;
|
|
// smallest word position should be at the head of the list
|
|
if ( wa->m_wordPos < wb->m_wordPos ) return -1;
|
|
if ( wa->m_wordPos > wb->m_wordPos ) return 1;
|
|
return 0;
|
|
}
|
|
*/
|
|
|
|
static int wpPosdbKeyCmp ( const void *a, const void *b ) {
|
|
int32_t wpa = g_posdb.getWordPos((char *)a);
|
|
int32_t wpb = g_posdb.getWordPos((char *)b);
|
|
return wpa - wpb;
|
|
}
|
|
|
|
SafeBuf *XmlDoc::getWordPosSortedPosdbListBuf ( ) {
|
|
|
|
if ( m_wpSortedPosdbListBufValid )
|
|
return &m_wpSortedPosdbListBuf;
|
|
|
|
// get the lists. forDelete = false.
|
|
char *metaList = getMetaList ( false );
|
|
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
|
|
|
|
// sanity
|
|
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// make a tmp buf to hold posdb keys
|
|
//SafeBuf tmp;
|
|
if ( ! m_wpSortedPosdbListBuf.reserve ( m_metaListSize,"wpsbuf" ) )
|
|
return NULL;
|
|
// point into it
|
|
char *dst = m_wpSortedPosdbListBuf.getBufStart();
|
|
|
|
// scan the meta list for posdb keys
|
|
char *p = metaList;
|
|
char *pend = p + m_metaListSize;
|
|
// stole this loop from getMetaList()
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save it with the flag
|
|
char byte = *p;
|
|
// get rdbId
|
|
char rdbId = byte & 0x7f;
|
|
// skip that
|
|
p++;
|
|
// key size
|
|
int32_t ks = getKeySizeFromRdbId(rdbId);
|
|
// get key
|
|
char *key = p;
|
|
// skip that
|
|
p += ks;
|
|
// get datasize
|
|
int32_t ds = getDataSizeFromRdbId(rdbId);
|
|
// assume we do not store the datasize
|
|
//bool neg = false;
|
|
// . if key is negative, no data is present
|
|
// . the doledb key is negative for us here
|
|
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
|
|
// if datasize variable, read it in
|
|
if ( ds == -1 ) {
|
|
// get data size
|
|
ds = *(int32_t *)p;
|
|
// skip data size int32_t
|
|
p += 4;
|
|
}
|
|
// point to data
|
|
//char *data = p;
|
|
// skip data if not zero
|
|
p += ds;
|
|
// if not posdb skip rec
|
|
if ( rdbId != RDB_POSDB ) continue;
|
|
// skip negative keys
|
|
if ( (key[0] & 0x01) == 0x00 ) continue;
|
|
// add to new buf now
|
|
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
|
|
// advance
|
|
dst += sizeof(POSDBKEY);
|
|
}
|
|
char *start = m_wpSortedPosdbListBuf.getBufStart();
|
|
// update tmp
|
|
m_wpSortedPosdbListBuf.incrementLength ( dst - start );
|
|
// sanity
|
|
if ( m_wpSortedPosdbListBuf.length() > m_metaListSize ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// point
|
|
char *pbuf = m_wpSortedPosdbListBuf.getBufStart();
|
|
int32_t numKeys = m_wpSortedPosdbListBuf.length()/sizeof(POSDBKEY);
|
|
// sort keys by word position
|
|
gbqsort ( pbuf ,
|
|
numKeys,
|
|
sizeof(POSDBKEY),
|
|
wpPosdbKeyCmp ,
|
|
m_niceness );
|
|
|
|
m_wpSortedPosdbListBufValid = true;
|
|
return &m_wpSortedPosdbListBuf;
|
|
}
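// Metalist record layout parsed by the scan above (a summary of the loop,
// not new behavior): each entry is
//   [1 byte: rdbId in the low 7 bits plus a flag bit]
//   [key: getKeySizeFromRdbId(rdbId) bytes]
//   [dataSize: 4 bytes, present only for variable-size rdbs with a
//    positive key]
//   [data: dataSize bytes]
// Only positive RDB_POSDB keys are copied into m_wpSortedPosdbListBuf,
// which is then sorted by word position via wpPosdbKeyCmp().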
|
|
|
|
// now pass this into Msg95Request so we only try to insert right before
|
|
// or after m_wordPos values in this WordPosInfo vector.
|
|
SafeBuf *XmlDoc::getWordPosInfoBuf ( ) {
|
|
|
|
// if it is valid and we have not yet added to cachedb...
|
|
if ( m_wordPosInfoBufValid && ! m_triedToAddWordPosInfoToCachedb ) {
|
|
// only do this once
|
|
m_triedToAddWordPosInfoToCachedb = true;
|
|
// store the m_wordPosInfoBuf into cachedb
|
|
if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
|
|
return (SafeBuf *)-1;
|
|
}
|
|
|
|
|
|
if ( m_wordPosInfoBufValid )
|
|
return &m_wordPosInfoBuf;
|
|
|
|
// it should be valid now from our logic in hashWords3() if
|
|
// m_doingSEO is set to true
|
|
char *xx=NULL; *xx=0;
|
|
|
|
// these are FULL 18-byte keys, no compression, sorted by word pos
|
|
SafeBuf *posdbBuf = getWordPosSortedPosdbListBuf ();
|
|
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
|
|
|
|
// scan posdb keys
|
|
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
|
|
|
|
// . reserve mem for new buf
|
|
int32_t need = numKeys * sizeof(WordPosInfo);
|
|
if ( ! m_wordPosInfoBuf.reserve ( need ,"wpibuf" ) )
|
|
return NULL;
|
|
|
|
int32_t sentNum = 0;
|
|
int32_t lastWordPos = -1;
|
|
//int32_t lastwp = -1;
|
|
int32_t lastSentNum = -1;
|
|
|
|
// scan all the sorted posdb keys and build posdb termlists and
|
|
// store the termlists into "m_termListBuf"
|
|
char *p = posdbBuf->getBufStart();
|
|
char *pend = p + posdbBuf->length();
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the key
|
|
char *key = p;
|
|
// sanity
|
|
if ( g_posdb.getKeySize(p) != 18 ) { char *xx=NULL;*xx=0; }
|
|
// skip del keys
|
|
if ( (p[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
|
|
// skip it
|
|
p += sizeof(POSDBKEY);
|
|
// get key termid
|
|
//int64_t termId = g_posdb.getTermId ( key );
|
|
// sanity
|
|
//int64_t docId = g_posdb.getDocId ( key );
|
|
//if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
|
|
// log it for debug
|
|
//if ( docId == 192304365235LL )
|
|
// log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
|
|
// docId,
|
|
// termId,
|
|
// g_posdb.getWordPos(key));
|
|
WordPosInfo wpi;
|
|
int32_t wp = g_posdb.getWordPos(key);
|
|
// set "m_sentNum"
|
|
if ( wp >= lastWordPos + 50 ) sentNum++;
|
|
wpi.m_wordPos = wp;
|
|
wpi.m_sentNum = sentNum;
|
|
wpi.m_hashGroup = g_posdb.getHashGroup (key);
|
|
wpi.m_densityRank = g_posdb.getDensityRank (key);
|
|
wpi.m_wordSpamRank = g_posdb.getWordSpamRank (key);
|
|
wpi.m_trafficGain = 0;
|
|
// log it
|
|
/*
|
|
log("seopipe: term=%"INT64" pos=%"INT32" sent=%"INT32" hg=%s dr=%"INT32"",
|
|
g_posdb.getTermId(key),
|
|
(int32_t)wp,
|
|
sentNum,
|
|
getHashGroupString(wpi.m_hashGroup),
|
|
(int32_t)wpi.m_densityRank);
|
|
*/
|
|
// bigrams share the same word position as the single term.
|
|
// so ignore them. we only want unique insertion positions.
|
|
if ( wp == lastWordPos ) continue;
|
|
// . i thought sorted by word position??
|
|
// . word position 0 is used by generic terms, like tags
|
|
if ( wp < lastWordPos ) { char *xx=NULL;*xx=0; }
|
|
// additional position at the end of a sentence?
|
|
//if ( lastwp != wp && lastSentNum == sentNum )
|
|
// // store it
|
|
// m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
|
|
// to the right as well! so it can be in the same sentence, if this
// word is at the end of the sentence.
|
|
//wpi.m_wordPos = wp;// + 2;
|
|
// add it
|
|
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
|
|
|
|
int32_t nextSent = -1;
|
|
if ( p < pend ) {
|
|
// assume same as current sentence
|
|
nextSent = sentNum;
|
|
// get word position of next term
|
|
int32_t nextwp = g_posdb.getWordPos(p);
|
|
// same as us? then it is a bigram, so try the
|
|
// word after that!
|
|
if ( nextwp == wp && p+18<pend )
|
|
nextwp = g_posdb.getWordPos(p+18);
|
|
// if the following word position is in a new sentence
// it will be separated by at least SENT_UNITS positions! that is
// our basis for detecting the sentence skip.
|
|
if ( nextwp >= wp + SENT_UNITS )
|
|
nextSent = sentNum+1;
|
|
}
|
|
|
|
// HACK. if the next word starts a new sentence, add a WordPosInfo
// here so we can insert a term at the end of THIS sentence.
// otherwise the insertion goes BEFORE the term whose position
// we use.
|
|
if ( nextSent != sentNum ) {
|
|
wpi.m_wordPos += 2;
|
|
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
|
|
}
|
|
|
|
// set these
|
|
lastWordPos = wp;
|
|
//lastwp = wp;// + 2;
|
|
lastSentNum = sentNum;
|
|
}
|
|
|
|
/*
|
|
// point to raw buf
|
|
char *raw = m_wordPosInfoBuf.getBufStart();
|
|
int32_t size = m_wordPosInfoBuf.length();
|
|
// this shit is sorted by termid then pos, so sort just by pos
|
|
// this should breath with niceness!!
|
|
gbqsort ( raw ,
|
|
size / sizeof(WordPosInfo),
|
|
sizeof(WordPosInfo) ,
|
|
wordPosInfoCmp ,
|
|
m_niceness ) ;
|
|
*/
|
|
|
|
m_wordPosInfoBufValid = true;
|
|
|
|
return &m_wordPosInfoBuf;
|
|
}
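// Example of the layout produced above (a sketch): for posdb keys at word
// positions 2, 4, 4 (a bigram shares its term's position and is skipped),
// 56 and 58, the buffer gets WordPosInfos at 2 and 4 in sentence 0, an
// extra end-of-sentence slot at 4 + 2 = 6 (because the next position, 56,
// is at least SENT_UNITS away), then 56 and 58 in sentence 1, and a final
// end-of-sentence slot at 58 + 2 = 60 once the scan runs out of keys.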
|
|
|
|
// . i made this easy to serialize by using offsets and not ptrs
|
|
// . so we can add to cachedb easily
|
|
// . and so it's immune to reallocs() on the m_linkSourceBuf SafeBuf
|
|
class LinkSource {
|
|
public:
|
|
|
|
int32_t m_linkSiteRank;
|
|
|
|
// the actual url of the link, references into m_buf
|
|
int32_t m_linkUrlOffset;
|
|
// the title of the link, references into m_buf
|
|
int32_t m_linkTitleOffset;
|
|
|
|
// . we store the offsets of the RelatedDocIds in m_relatedDocIdBuf
|
|
// . these are the related docids that are linked to by this link src
|
|
int32_t m_offsetOfRelatedDocIdOffsets;
|
|
int32_t m_numRelatedDocIds;
|
|
|
|
char m_buf[0];
|
|
|
|
char *getLinkUrl ( SafeBuf *linkSourceBuf ) {
|
|
char *buf = linkSourceBuf->getBufStart();
|
|
buf += m_linkUrlOffset;
|
|
return buf;
|
|
};
|
|
|
|
char *getLinkTitle ( SafeBuf *linkSourceBuf ) {
|
|
char *buf = linkSourceBuf->getBufStart();
|
|
buf += m_linkTitleOffset;
|
|
return buf;
|
|
};
|
|
|
|
// crap, do we store RelatedDocIds into cachedb? we should
|
|
// make it use offsets and not ptrs too...
|
|
int32_t *getRelatedDocIdOffsets ( SafeBuf *linkSourceBuf ) {
|
|
// how can this be?
|
|
//if ( m_numRelatedDocIds == 0 ) return NULL;
|
|
char *buf = linkSourceBuf->getBufStart();
|
|
buf += m_offsetOfRelatedDocIdOffsets;
|
|
return (int32_t *)buf;
|
|
};
|
|
|
|
};
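// Minimal usage sketch for the offset-based layout above (an
// illustration, not part of the original code): because LinkSource stores
// offsets into the enclosing SafeBuf rather than raw pointers, a record
// stays valid even if that SafeBuf gets realloc'd. Reading one back only
// needs the buffer itself:
//
//   LinkSource *ls  = (LinkSource *)linkSourceBuf.getBufStart();
//   char *url       = ls->getLinkUrl   ( &linkSourceBuf );
//   char *title     = ls->getLinkTitle ( &linkSourceBuf );
//   int32_t *rdOffs = ls->getRelatedDocIdOffsets ( &linkSourceBuf );
//   // each rdOffs[i] is an offset into m_relatedDocIdBuf
//   for ( int32_t i = 0 ; i < ls->m_numRelatedDocIds ; i++ )
//           log("seo: link %s (%s) -> related docid offset %"INT32"",
//               url, title, rdOffs[i]);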
|
|
|
|
|
|
/*
|
|
static void gotLinkInfoReplyWrapper ( void *state ) {
|
|
//XmlDoc *newxd = (XmlDoc *)state;
|
|
Msg25 *msg25 = (Msg25 *)state;
|
|
XmlDoc *xd = msg25->m_xd;
|
|
// count it as returned
|
|
xd->m_numLinkRequestsIn++;
|
|
// this will nuke the msg25 as well after copying its linkinfo
|
|
xd->processLinkInfoMsg20Reply ( msg25 );
|
|
// try to send out more requests or intersect them if done
|
|
xd->m_masterLoop ( xd->m_masterState );
|
|
}
|
|
|
|
// . before we were just looking at the LinkInfo the msg25 makes from
|
|
// all the Msg20Replies it gets, but let's keep the msg20 replies
|
|
// intact because they have the titles we need!
|
|
// . return false on error, true otherwise
|
|
bool XmlDoc::processLinkInfoMsg20Reply ( Msg25 *msg25 ) {
|
|
// shortcut
|
|
//LinkInfo *info = msg25->getLinkInfo ();
|
|
// store into our buffer
|
|
//bool status ;
|
|
// i guess info can be NULL on error
|
|
//if ( info )
|
|
// status = m_linkInfoReplyBuf.safeMemcpy (info, info->getSize());
|
|
|
|
// give front-end the progress bar info
|
|
if ( m_seoSocket && m_progressBar ) {
|
|
// tmp buf
|
|
char tmp[16];
|
|
float percent = (float)m_rdCursor;
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
|
|
percent /= (float)numRelated;
|
|
// 80% of the pipeline was doing the full queries
|
|
percent *= .20;
|
|
percent += .80;
|
|
percent *= 100.0;
|
|
int32_t percentLong = (int32_t)percent;
|
|
if ( percentLong >= 100 ) percentLong = 99;
|
|
int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong);
|
|
if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen);
|
|
// try a send on non-blocking socket
|
|
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
|
|
if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n);
|
|
// forget error
|
|
errno = 0;
|
|
}
|
|
|
|
// store this
|
|
int32_t nr = msg25->m_numReplyPtrs;
|
|
// reserve space
|
|
if ( ! m_msg20ReplyPtrBuf.reserve ( 8 + nr * 4 * 2 ) ) {
|
|
m_hadLinkInfoError = g_errno;
|
|
nr = 0;
|
|
}
|
|
// first store related docid ptr into m_relatedDocIdBuf safebuf
|
|
RelatedDocId *rd = (RelatedDocId *)msg25->m_hackrd;
|
|
m_msg20ReplyPtrBuf.pushLong((int32_t)rd);
|
|
// then store the # of msg20 replies
|
|
m_msg20ReplyPtrBuf.pushLong(nr);
|
|
// . scan each msg20reply it got, each msg20reply is an inlink
|
|
// for this docid
|
|
// . seems like they are only freed in Msg25::reset()
|
|
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
|
// get one
|
|
Msg20Reply *r = msg25->m_replyPtrs[i];
|
|
int32_t size = msg25->m_replySizes[i];
|
|
// steal it, we will free them ourselves below
|
|
m_msg20ReplyPtrBuf.pushLong((int32_t)r);
|
|
// we need this since we need to free it when done
|
|
m_msg20ReplyPtrBuf.pushLong(size);
|
|
}
|
|
// . do not allow Msg25 to free it, we will free it below
|
|
// . on OOM error above we set nr to 0 on error, so allow msg25
|
|
// to free the replies in that case
|
|
if ( nr ) msg25->m_numReplyPtrs = 0;
|
|
// nuke it
|
|
mdelete ( msg25 , sizeof(Msg25), "m25li" );
|
|
delete ( msg25 );
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
static int riCmp ( const void *a, const void *b ) {
|
|
RecommendedLink *wa = *(RecommendedLink **)a;
|
|
RecommendedLink *wb = *(RecommendedLink **)b;
|
|
int32_t diff = wb->m_votes - wa->m_votes;
|
|
if ( diff ) return diff;
|
|
if ( wb->m_totalRecommendedScore > wa->m_totalRecommendedScore )
|
|
return 1;
|
|
if ( wb->m_totalRecommendedScore < wa->m_totalRecommendedScore )
|
|
return -1;
|
|
// docid to break all ties
|
|
if ( wb->m_rl_docId > wa->m_rl_docId )
|
|
return 1;
|
|
if ( wb->m_rl_docId < wa->m_rl_docId )
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void gotLinkdbListWrapper ( void *state ) {
|
|
Msg0 *msg0 = (Msg0 *)state;
|
|
XmlDoc *xd = msg0->m_hackxd;
|
|
// free its memory here lest we have a leak
|
|
//msg0->reset();
|
|
xd->m_numLinkRequestsIn++;
|
|
xd->m_masterLoop ( xd->m_masterState );
|
|
}
|
|
|
|
|
|
#define MAX_RECOMMENDED_LINKS 300
|
|
|
|
// . returns safebuf of RecommendedLinks
|
|
// . use RecommendedLink::getSize() to skip over element in array/safebuf
|
|
// . these are the recommended link sources
|
|
// . these are the links that your relateddocids (i.e. competing pages)
//   have most in common
|
|
// . TODO: store the returned safebuf in cachedb as well!
|
|
SafeBuf *XmlDoc::getRecommendedLinksBuf ( ) {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_recommendedLinksBufValid )
|
|
return &m_recommendedLinksBuf;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// what docids share our matching queries?
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
// cast then
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
// how many related docids do we have?
|
|
int32_t numRelatedDocIds = rdbuf->length() / sizeof(RelatedDocId);
|
|
|
|
if ( m_numLinkRequestsOut == 0 ) {
|
|
// reset these on first call
|
|
m_rdCursor = 0;
|
|
m_numLinkRequestsIn = 0;
|
|
m_hadLinkInfoError = 0;
|
|
m_numMsg20sIn = 0;
|
|
m_numMsg20sOut = 0;
|
|
m_numValidMsg20s = 0;
|
|
m_titleCursor = 0;
|
|
m_msg20Phase = 0;
|
|
m_recommendedLinkError = 0;
|
|
}
|
|
|
|
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if we are looking up the title/url of each docid in
|
|
// the m_recommendedLinksBuf now, go back there
|
|
if ( m_msg20Phase )
|
|
return lookupTitles();
|
|
|
|
for ( ; m_rdCursor < numRelatedDocIds ; m_rdCursor++ ) {
|
|
// wait if too many out. only allow 60 out at a time. otherwise
// each one can send out like 500 msg20s
|
|
if ( m_numLinkRequestsOut - m_numLinkRequestsIn > 60 )
|
|
// wait for 1 to come back
|
|
return (SafeBuf *)-1;
|
|
// skip the rest on error
|
|
if ( m_hadLinkInfoError ) continue;
|
|
// cast it
|
|
RelatedDocId *rd = &rds[m_rdCursor];
|
|
// bogus? a not found, EDOCBANNED/EDOCFILTERED or it
|
|
// linked to our domain
|
|
if ( rd->rd_url_off < 0 )
|
|
continue;
|
|
// bogus?
|
|
if ( ! rd->getUrl( &m_relatedTitleBuf ) ) {
|
|
log("seo: skipping null url");
|
|
continue;
|
|
}
|
|
if ( ! rd->getSite( &m_relatedTitleBuf ) ) {
|
|
log("seo: skipping null site");
|
|
continue;
|
|
}
|
|
|
|
// allocate msg0 array into m_tmpMsg0Buf safebuf
|
|
if ( ! m_tmpMsg0Buf.length() ) {
|
|
// fill tmpmsg0 buf
|
|
int32_t need = sizeof(Msg0) * numRelatedDocIds;
|
|
if ( ! m_tmpMsg0Buf.reserve ( need , "tmp20s" ) )
|
|
return NULL;
|
|
// do not re-call!
|
|
m_tmpMsg0Buf.setLength(need);
|
|
char *p = m_tmpMsg0Buf.getBufStart();
|
|
char *pend = p + need;
|
|
for ( ; p < pend ; p += sizeof(Msg0) ) {
|
|
Msg0 *msg0 = (Msg0 *)p;
|
|
msg0->constructor();
|
|
}
|
|
}
|
|
|
|
// debug it
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: getting inlinks to related docid=%"INT64" "
|
|
"weight=%f "
|
|
"url=%s",
|
|
rd->m_docId,
|
|
rd->m_relatedWeight,
|
|
rd->getUrl(&m_relatedTitleBuf));
|
|
|
|
// just get his linkdb list!
|
|
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
|
|
Msg0 *msg0 = &array[m_rdCursor];
|
|
key224_t startKey;
|
|
key224_t endKey;
|
|
char *rdurl = rd->getUrl(&m_relatedTitleBuf);
|
|
// by default, just hash of hostname, unless overridden
|
|
// with "site" tag in tagdb, or has a path like /~mwells
|
|
int32_t siteHash32 = rd->m_rd_siteHash32;
|
|
int64_t linkHash64 = hash64n(rdurl);
|
|
startKey = g_linkdb.makeStartKey_uk (siteHash32,linkHash64 );
|
|
endKey = g_linkdb.makeEndKey_uk (siteHash32,linkHash64 );
|
|
|
|
// hack that thing
|
|
msg0->m_hackxd = this;
|
|
|
|
// consider it outstanding
|
|
m_numLinkRequestsOut++;
|
|
|
|
// shortcut, piggyback on the msg0
|
|
RdbList *list = &msg0->m_handyList;
|
|
//RdbList list2;
|
|
|
|
if ( ! msg0->getList ( -1 , // hostId, -1 if none
|
|
0 , // hostId ip
|
|
0 , // hostId port
|
|
0 , // max cache age -secs
|
|
false , // addToCache?
|
|
RDB_LINKDB ,
|
|
cr->m_collnum ,
|
|
list , // linkdb list to fill
|
|
(char*)&startKey,
|
|
(char*)&endKey ,
|
|
1000000 , // 1MB minrecsizes
|
|
msg0 ,
|
|
gotLinkdbListWrapper ,
|
|
m_niceness ,
|
|
true , // error correct?
|
|
true , // includeTree
|
|
true , // do merge
|
|
-1,//hostId
|
|
0 , // startFileNum
|
|
-1 , // numFiles
|
|
60*60*24*365 )){//timeout of one year
|
|
// blocked? keep chugging
|
|
continue;
|
|
}
|
|
|
|
// . maybe it was cached or something, or we had an error!
|
|
// . this will nuke the msg25
|
|
// . returns false and sets g_errno on error
|
|
//processLinkInfoMsg20Reply ( msg25 );
|
|
m_numLinkRequestsIn++;
|
|
// save g_errno
|
|
int32_t saved = g_errno;
|
|
// free its memory here lest we have a leak
|
|
//msg0->reset();
|
|
// error? it will not have blocked then
|
|
if ( ! saved ) continue;
|
|
// save error, and stop launching any more requests
|
|
m_hadLinkInfoError = saved;
|
|
log("xmldoc: linksrc error3 = %s",mstrerror(saved));
|
|
}
|
|
|
|
// return -1 if waiting for more requests to come in
|
|
if ( m_numLinkRequestsOut > m_numLinkRequestsIn )
|
|
return (SafeBuf *)-1;
|
|
|
|
// vote table to allow inlink voting
|
|
HashTableX riTable;
|
|
// do not return on error setting this table because we'll leave
|
|
// the msg20 replies unfreed!
|
|
if ( ! riTable.set ( 8,4,1024,NULL,0,false,m_niceness,"ritbl") )
|
|
m_hadLinkInfoError = g_errno;
|
|
|
|
RecommendedLink *ri;
|
|
|
|
HashTableX dedupVotesTable;
|
|
if ( ! dedupVotesTable.set(8,0,1024,NULL,0,false,m_niceness,"dvtt") )
|
|
return NULL;
|
|
|
|
// need this for computing rdOff
|
|
char *rdStart = m_relatedDocIdBuf.getBufStart();
|
|
|
|
// store recommended links bufs here temporarily
|
|
SafeBuf tmpBuf;
|
|
if ( ! tmpBuf.reserve ( 10000000 ,"tt5buf" ) ) return NULL;
|
|
|
|
// all done. scan linkdb lists and intersect. there is one list
|
|
// per related docid.
|
|
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
|
|
// get related docid that had the following msg20replies
|
|
RelatedDocId *rd = &rds[i];
|
|
// his offset in his buf
|
|
int32_t rdOff = (char *)rd - rdStart;
|
|
// get linkdb list loaded from msg0 call above
|
|
Msg0 *msg0 = &((Msg0 *)m_tmpMsg0Buf.getBufStart())[i];
|
|
RdbList *list = &msg0->m_handyList;
|
|
list->resetListPtr();
|
|
// scan the docids in list
|
|
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
|
|
// get the current key if list has more left
|
|
key224_t key;
|
|
list->getCurrentKey( &key );
|
|
//int32_t itop = g_linkdb.getLinkerIp24_uk ( &key );
|
|
int32_t ip32 = g_linkdb.getLinkerIp_uk ( &key );
|
|
//bool isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
|
|
int64_t docId = g_linkdb.getLinkerDocId_uk ( &key );
|
|
//int32_t discovered = g_linkdb.getDiscoveryDate_uk(&key);
|
|
|
|
// skip if no longer there on page, we keep these
|
|
// only to graph lost links over time
|
|
int32_t lostDate = g_linkdb.getLostDate_uk ( &key );
|
|
if ( lostDate )
|
|
continue;
|
|
|
|
// if the inlink is from the same c-block IP as the
|
|
// related docid it links to, then do not consider.
|
|
// the ip used in linkdb is the current ip not the
|
|
// first ip actually.
|
|
if ( ipdom(ip32)==ipdom(rd->m_relatedCurrentIp))
|
|
continue;
|
|
if ( ipdom(ip32)==ipdom(rd->m_relatedFirstIp))
|
|
continue;
|
|
// if the linking document links to the same related
|
|
// docid multiple times/ we need to dedup so m_votes
|
|
// is not incremented multiple times!
|
|
// actually key on the c-block, not the docid, so a linking page
// does not get two m_votes just for linking to two competitors
// that happen to be on the same c-block... kinda strange.
|
|
int64_t dkey = docId ^ ipdom(rd->m_relatedFirstIp);
|
|
if ( dedupVotesTable.isInTable(&dkey) )
|
|
continue;
|
|
if ( ! dedupVotesTable.addKey(&dkey) ) return NULL;
|
|
|
|
// now we associate a new class with each unique linker
|
|
int32_t *poff = (int32_t *)riTable.getValue ( &docId );
|
|
// if there, it will be an offset into the links buf
|
|
if ( poff ) {
|
|
char *ptr = tmpBuf.getBufStart();
|
|
ptr += *poff;
|
|
RecommendedLink *rip = (RecommendedLink *)ptr;
|
|
rip->m_totalRecommendedScore +=
|
|
rd->m_relatedWeight;
|
|
rip->m_votes++;
|
|
// add to array of rd offs
|
|
int32_t k; for ( k = 0 ; k < 10 ; k++ ) {
|
|
if ( rip->m_relatedDocIdOff[k]==-1)
|
|
break;
|
|
}
|
|
if ( k < 10 )
|
|
rip->m_relatedDocIdOff[k] = rdOff;
|
|
continue;
|
|
}
|
|
|
|
// reserve space
|
|
int32_t need = sizeof(RecommendedLink);
|
|
// reserve
|
|
if ( ! tmpBuf.reserve ( need , "tt5buf" ) ) {
|
|
m_hadLinkInfoError = g_errno;
|
|
continue;
|
|
}
|
|
|
|
// save this
|
|
int32_t firstOff = tmpBuf.length();
|
|
|
|
// ref it
|
|
char *buf = tmpBuf.getBuf();
|
|
ri = (RecommendedLink *)buf;
|
|
// advance over that
|
|
int32_t over = sizeof(RecommendedLink);
|
|
// increase buf length
|
|
tmpBuf.incrementLength(over);
|
|
|
|
// this is how similar the relatedDocId is to the
|
|
// main url. these dotproducts are all relative
|
|
// with the other relatedDocIds for this url.
|
|
// the dotproduct was basically a dotproduct
|
|
// of the score vector of "rd" with that of
|
|
// the main url for the same queries. and that
|
|
// was normalized by the score of the top result
|
|
// for each query they have in common. see the
// algo above for the "m_dotProduct" computation.
|
|
ri->m_totalRecommendedScore = rd->m_relatedWeight;
|
|
ri->m_votes = 1;
|
|
|
|
ri->m_rl_docId = docId;
|
|
|
|
// we do not know these things until we call msg20
|
|
// on the docid:
|
|
ri->m_rl_siteRank = -1;//reply->m_siteRank;
|
|
ri->m_rl_firstIp = 0;//reply->m_firstIp;
|
|
|
|
// each recommended link links to one or more
|
|
// related docids. so record them!
|
|
ri->m_relatedDocIdOff[0] = rdOff;
|
|
ri->m_relatedDocIdOff[1] = -1;
|
|
ri->m_relatedDocIdOff[2] = -1;
|
|
ri->m_relatedDocIdOff[3] = -1;
|
|
ri->m_relatedDocIdOff[4] = -1;
|
|
ri->m_relatedDocIdOff[5] = -1;
|
|
ri->m_relatedDocIdOff[6] = -1;
|
|
ri->m_relatedDocIdOff[7] = -1;
|
|
ri->m_relatedDocIdOff[8] = -1;
|
|
ri->m_relatedDocIdOff[9] = -1;
|
|
|
|
ri->m_urlSize = 0;
|
|
ri->m_titleSize = 0;
|
|
|
|
// store it in table then, pointing into the new buf
|
|
if ( ! riTable.addKey ( &docId, &firstOff ) )
|
|
m_hadLinkInfoError = g_errno;
|
|
}
|
|
// free that list now to save mem
|
|
list->freeList();
|
|
}
|
|
|
|
// free the msg0s now, including Msg0::m_handyList, what we used
|
|
// to hold the linkdb list
|
|
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
|
|
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
|
|
Msg0 *msg0 = &array[i];
|
|
// free the mem and the handylist now that we've processed them
|
|
msg0->reset();
|
|
}
|
|
// no longer need the msg0s and linkdb lists (Msg0::m_handyLists)
|
|
m_tmpMsg0Buf.purge();
|
|
|
|
|
|
//
|
|
// now sort RecommendedLinks in tmpBuf by their scores
|
|
//
|
|
// get the top 300 recommended links so we can save mem and
|
|
// store this beastie in cachedb
|
|
SafeBuf ptrBuf;
|
|
int32_t maxNumPtrs = tmpBuf.length() / sizeof(RecommendedLink);
|
|
if ( ! ptrBuf.reserve(maxNumPtrs *sizeof(RecommendedLink *),"ptrbuf"))
|
|
return NULL;
|
|
char *p = tmpBuf.getBufStart();
|
|
char *pend = tmpBuf.getBuf();
|
|
int32_t numPtrs = 0;
|
|
for ( ; p < pend ; ) {
|
|
RecommendedLink *ri = (RecommendedLink *)p;
|
|
ptrBuf.pushPtr ( ri );
|
|
p += sizeof(RecommendedLink);
|
|
// we have no title or url at this point...
|
|
if ( ri->getSize() != sizeof(RecommendedLink) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
numPtrs++;
|
|
}
|
|
// now sort!
|
|
RecommendedLink **ptrs = (RecommendedLink **)ptrBuf.getBufStart();
|
|
gbqsort ( ptrs ,
|
|
numPtrs ,
|
|
sizeof(RecommendedLink *),
|
|
riCmp,
|
|
m_niceness );
|
|
// copy over the top recommended links into permanent buffer in order
|
|
// of score
|
|
int32_t need2 = tmpBuf.length();
|
|
// increase for storing titles/urls into here
|
|
need2 = numPtrs * sizeof(RecommendedLink);
|
|
// allocate that now
|
|
if ( ! m_recommendedLinksBuf.reserve ( need2 ,"rlkbuf") ) return NULL;
|
|
// and copy over from tmpBuf, sorted by the score
|
|
for ( int32_t i = 0 ; i < numPtrs ; i++ )
|
|
m_recommendedLinksBuf.safeMemcpy(ptrs[i],
|
|
sizeof(RecommendedLink));
|
|
// this can be really huge! > 30MB
|
|
tmpBuf.purge();
|
|
// free the ptrs too!
|
|
ptrBuf.purge();
|
|
|
|
|
|
//
|
|
// now m_recommendedLinksBuf is a bunch of RecommendedLinks sorted
|
|
// by score. now use msg20 to lookup the top 300 or so that
|
|
// do not link to our main doc
|
|
//
|
|
m_msg20Phase = true;
|
|
return lookupTitles ();
|
|
}
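// A small example of the voting scheme above (a sketch): if linker docid
// D shows up in the linkdb lists of three related docids whose
// m_relatedWeight values are 0.8, 0.5 and 0.2 (and D is not on any of
// their c-blocks), D's RecommendedLink ends up with m_votes = 3 and
// m_totalRecommendedScore = 1.5, with up to 10 of those related docid
// offsets recorded in m_relatedDocIdOff[]. riCmp() then sorts linkers by
// votes first and by total score to break ties.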
|
|
|
|
//static void gotLinkerTitleWrapper ( void *state ) {
|
|
// Msg20 *msg20 = (Msg20 *)state;
|
|
// XmlDoc *THIS = (XmlDoc *)msg20->m_state2;
|
|
// THIS->gotLinkerTitle ( msg20 );
|
|
// THIS->m_masterLoop ( THIS->m_masterState );
|
|
//}
|
|
|
|
SafeBuf *XmlDoc::lookupTitles ( ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
// none have a title/url following them in here yet
|
|
int32_t numLinkers = m_recommendedLinksBuf.length();
|
|
numLinkers /= sizeof(RecommendedLink);
|
|
|
|
if ( ! m_msg20Array.length() ) {
|
|
int32_t need = numLinkers * sizeof(Msg20);
|
|
if ( ! m_msg20Array.reserve ( need,"m20arr" ) )
|
|
return (SafeBuf *)-1;
|
|
// do not re-call!
|
|
m_msg20Array.setLength(need);
|
|
char *p = m_msg20Array.getBufStart();
|
|
char *pend = p + need;
|
|
for ( ; p < pend ; p += sizeof(Msg20) )
|
|
((Msg20 *)p)->constructor();
|
|
}
|
|
|
|
Msg20 *msg20s = (Msg20 *)m_msg20Array.getBufStart();
|
|
// one per linker
|
|
int32_t numMsg20s = numLinkers;
|
|
|
|
// we can use the array model because each element is fixed size
|
|
// because they do not have the url/title string following them
|
|
// yet...
|
|
char *ppp = m_recommendedLinksBuf.getBufStart();
|
|
RecommendedLink *ptr = (RecommendedLink *)ppp;
|
|
|
|
// scan the msg20s we allocated to see if any got a reply
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// shortcut
|
|
Msg20 *msg20 = &msg20s[i];
|
|
// skip if never launched
|
|
if ( ! msg20->m_launched ) continue;
|
|
// skip if it is in progress, awaiting its reply
|
|
if ( msg20->m_inProgress ) continue;
|
|
// ok, it has a reply. could be NULL if g_errno was set.
|
|
if ( ! gotLinkerTitle ( msg20 ) )
|
|
m_recommendedLinkError = g_errno;
|
|
// reset it for later use... or not...
|
|
msg20->reset();
|
|
}
|
|
|
|
//
|
|
// call a msg20 on each recommendedlink to get url/title and
|
|
// see if it links to any url on our main url's site/domain
|
|
//
|
|
for ( ; m_titleCursor < numLinkers ; m_titleCursor++ ) {
|
|
// bail?
|
|
if ( m_numMsg20sOut - m_numMsg20sIn > 60 )
|
|
break;
|
|
// stop launching if got enough
|
|
if ( m_numValidMsg20s >= MAX_RECOMMENDED_LINKS )
|
|
break;
|
|
// cast it
|
|
RecommendedLink *rl = &ptr[m_titleCursor];
|
|
|
|
// get avail msg20
|
|
int32_t i; for ( i = 0 ; i < numMsg20s && i < 100 ; i++ ) {
|
|
if ( msg20s[i].m_inProgress ) continue;
|
|
break;
|
|
}
|
|
// sanity!
|
|
if ( i >= numMsg20s || i >= 100 ) { char *xx=NULL;*xx=0; }
|
|
// look it up
|
|
Msg20 *msg20 = &msg20s[i];
|
|
// make request
|
|
Msg20Request req;
|
|
req.m_docId = rl->m_rl_docId;
|
|
//req.m_state = msg20;
|
|
req.m_state = m_masterState;//this;
|
|
req.m_callback2 = m_masterLoop;//gotLinkerTitleWrapper;
|
|
//req.ptr_coll = cr->m_coll;
|
|
//req.size_coll = gbstrlen(cr->m_coll)+1;
|
|
req.m_collnum = cr->m_collnum;
|
|
req.m_expected = true;
|
|
req.m_niceness = m_niceness;
|
|
// do not get summary stuff. too slow.
|
|
req.m_numSummaryLines = 0;
|
|
// if it has an outlink to our site/domain set
|
|
// Msg20Reply::m_hasLinkToOurDomOrHost
|
|
req.m_ourHostHash32 = getHostHash32a();
|
|
req.m_ourDomHash32 = getDomHash32();
|
|
|
|
// store cursor in msg20 itself so we know what rd it's using
|
|
msg20->m_hack2 = m_titleCursor;
|
|
|
|
// assume outstanding
|
|
m_numMsg20sOut++;
|
|
// debug
|
|
//log("seo: DEBUG: launching msg20 d=%"INT64"",req.m_docId);
|
|
// get it. continue if blocked
|
|
if ( ! msg20->getSummary ( &req ) ) continue;
|
|
// error?
|
|
if ( ! gotLinkerTitle ( msg20 ) )
|
|
m_recommendedLinkError = g_errno;
|
|
// save mem
|
|
msg20->reset();
|
|
}
|
|
|
|
// wait for all to return?
|
|
if ( m_numMsg20sOut > m_numMsg20sIn )
|
|
return (SafeBuf *)-1;
|
|
|
|
|
|
// we called gotLinkerTitle() on all msg20s, so destroy them
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// shortcut
|
|
Msg20 *msg20 = &msg20s[i];
|
|
// free
|
|
msg20->destructor();
|
|
}
|
|
// and free the lot of them
|
|
m_msg20Array.purge();
|
|
|
|
|
|
// now revert back
|
|
m_recommendedLinksBuf.stealBuf ( &m_newLinkerBuf );
|
|
|
|
// . this is an array of Inlinks
|
|
// . shit, but we need to add a count of how many related docids
|
|
// had the inlink, and what the weight or score of it was
|
|
// . it should be based on the weights/scores of the related docids
|
|
// . maybe just hijack "Inlink::m_numUniqueIPs" or something
|
|
// . crap, we also need to store the RelatedDocIds, i guess we
|
|
// could store a list of offsets to them in m_relatedDocIdBuf
|
|
m_recommendedLinksBufValid = true;
|
|
|
|
// store in cachedb. if it blocks return -1. bufvalid is set to
|
|
// true so when this function is re-entered it should return
|
|
// the safebuf ptr right away.
|
|
if ( ! storeRecommendedLinksBuf () )
|
|
return (SafeBuf *)-1;
|
|
|
|
return &m_recommendedLinksBuf;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::gotLinkerTitle ( Msg20 *msg20 ) {
|
|
// count it as returned
|
|
m_numMsg20sIn++;
|
|
|
|
// debug
|
|
//log("seo: DEBUG: got msg20 reply");
|
|
|
|
// get the recommendedlink for this (titleCursor)
|
|
char *vvv = m_recommendedLinksBuf.getBufStart();
|
|
RecommendedLink *rptrs = (RecommendedLink *)vvv;
|
|
int32_t titleCursor = msg20->m_hack2;
|
|
RecommendedLink *rl = &rptrs[titleCursor];
|
|
// sanity
|
|
if ( titleCursor < 0 ) {char *xx=NULL;*xx=0;}
|
|
|
|
// not found?
|
|
if ( g_errno ) {
|
|
log("seo: lookuptitles: %s",mstrerror(g_errno));
|
|
// ignore
|
|
g_errno = 0;
|
|
return true;
|
|
}
|
|
// get reply
|
|
Msg20Reply *reply = msg20->getReply();
|
|
// skip if linked to our site!
|
|
if ( reply->m_hasLinkToOurDomOrHost ) {
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: inlinker %s links to our "
|
|
"domain. ignoring.",
|
|
reply->ptr_ubuf);
|
|
return true;
|
|
}
|
|
// or if banned/filtered.. then skip
|
|
if ( reply->m_errno ) {
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: inlinker %s had error: %s",
|
|
reply->ptr_ubuf,
|
|
mstrerror(reply->m_errno));
|
|
return true;
|
|
}
|
|
// wtf?
|
|
if ( reply->size_ubuf <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
// set basic info
|
|
rl->m_rl_siteRank = reply->m_siteRank;
|
|
rl->m_rl_firstIp = reply->m_firstIp;
|
|
|
|
// sanity
|
|
if ( rl->m_rl_docId != reply->m_docId ) { char *xx=NULL;*xx=0; }
|
|
|
|
char *title = reply->ptr_tbuf;
|
|
int32_t titleSize = reply->size_tbuf;
|
|
if ( titleSize == 0 ) {
|
|
title = "\0";
|
|
titleSize = 1;
|
|
}
|
|
|
|
// debug
|
|
//log("seo: DEBUG: got VALID msg20 reply #%"INT32"",m_numValidMsg20s);
|
|
|
|
// count as valid
|
|
m_numValidMsg20s++;
|
|
|
|
rl->m_urlSize = reply->size_ubuf;
|
|
rl->m_titleSize = titleSize;
|
|
|
|
|
|
if ( ! m_newLinkerBuf.safeMemcpy ( rl , sizeof(RecommendedLink) ) )
|
|
return false;
|
|
if ( ! m_newLinkerBuf.safeMemcpy ( reply->ptr_ubuf,reply->size_ubuf))
|
|
return false;
|
|
if ( ! m_newLinkerBuf.safeMemcpy ( title , titleSize ) )
|
|
return false;
|
|
|
|
|
|
// i guess we are done then
|
|
return true;
|
|
}
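// Layout note for m_newLinkerBuf (a summary of the copies above): each
// accepted linker is appended as a variable-length record,
//   [RecommendedLink struct][url bytes (size_ubuf)][title bytes (titleSize)]
// with m_urlSize and m_titleSize set first so RecommendedLink::getSize()
// can hop from one record to the next when the buffer is walked later.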
|
|
|
|
/*
|
|
// returns false if blocked, true otherwise. sets g_errno on error
|
|
bool XmlDoc::printRecommendedLinksBuf ( SafeBuf *sb ) {
|
|
|
|
SafeBuf *recBuf = getRecommendedLinksBuf();
|
|
if ( ! recBuf ) return true;
|
|
if ( recBuf == (void *)-1 ) return false;
|
|
|
|
int32_t count = 1;
|
|
char *p = recBuf->getBufStart();
|
|
char *pend = recBuf->getBuf ();
|
|
for ( ; p < pend ; ) {
|
|
// cast it
|
|
RecommendedLink *ri = (RecommendedLink *)p;
|
|
// skip it
|
|
p += ri->getSize();
|
|
// print it out
|
|
sb->safePrintf("%"INT32") %.04f %s | %s<br>"
|
|
,count++
|
|
,ri->m_totalRecommendedScore
|
|
,ri->getUrl(recBuf)
|
|
,ri->getTitle(recBuf)
|
|
);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
|
|
// . use Msg25::m_numReplyPtrs and Msg25::m_replyPtrs[i] to access the
|
|
// Msg20s of the inlinks
|
|
// . NOT the same as getLinkInfo() because this does not filter out the
|
|
// "bad" inlinks, it gets everything and keeps the full Msg20Replies!!
|
|
Msg25 *XmlDoc::getAllInlinks ( bool forSite ) {
|
|
|
|
// if valid, return it now
|
|
if ( forSite && m_tempMsg25SiteValid )
|
|
return m_tempMsg25Site;
|
|
|
|
if ( ! forSite && m_tempMsg25PageValid )
|
|
return m_tempMsg25Page;
|
|
|
|
Msg25 *myMsg25 ;
|
|
if ( forSite ) myMsg25 = m_tempMsg25Site;
|
|
else myMsg25 = m_tempMsg25Page;
|
|
|
|
int32_t *ipp = getIp();
|
|
if ( ! ipp || ipp == (void *)-1 ) return (Msg25 *)ipp;
|
|
int64_t *d = getDocId();
|
|
if ( ! d || d == (int64_t *)-1 ) return (Msg25 *)d;
|
|
char *site = getSite ();
|
|
if ( ! site || site == (char *)-1 ) return (Msg25 *)site;
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
Url *fu = getFirstUrl();
|
|
|
|
// make a new one
|
|
if ( ! myMsg25 ) {
|
|
Msg25 *msg25 = NULL;
|
|
try { msg25 = new ( Msg25 ); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("xmldoc: linksrc error2 = %s",mstrerror(g_errno));
|
|
m_hadLinkInfoError = g_errno;
|
|
}
|
|
mnew ( msg25, sizeof(Msg25),"m25li");
|
|
// record it for freeing/deleting later
|
|
if ( forSite ) m_tempMsg25Site = msg25;
|
|
else m_tempMsg25Page = msg25;
|
|
// reference it
|
|
myMsg25 = msg25;
|
|
}
|
|
|
|
int32_t type ;
|
|
if ( forSite ) type = cr_Msg25SiteInfo;
|
|
else type = cr_Msg25PageInfo;
|
|
|
|
// get list
|
|
RdbList *myList;
|
|
if ( forSite ) myList = &m_siteReplyList;
|
|
else myList = &m_pageReplyList;
|
|
|
|
int32_t uh32 =(uint32_t)((uint64_t)getFirstUrlHash64());
|
|
|
|
// first check cachedb!
|
|
bool checkIt = false;
|
|
if ( forSite && ! m_checkedCachedbForSite ) checkIt = true;
|
|
if ( ! forSite && ! m_checkedCachedbForPage ) checkIt = true;
|
|
if ( checkIt ) {
|
|
// do not repeat
|
|
if ( forSite ) m_checkedCachedbForSite = true;
|
|
else m_checkedCachedbForPage = true;
|
|
// use 0 for content hash since the link info is independent
|
|
// of your page's or site's content
|
|
key_t sk = g_cachedb.makeStartKey2 ( uh32 , 0 , type );
|
|
key_t ek = g_cachedb.makeEndKey2 ( uh32 , 0 , type );
|
|
// . get it from the appropriate host
|
|
// . get cachedb rec for all types of safebufs for this
|
|
// url/content
|
|
// . then we will set safebufs based on what recs we find
|
|
// in the returned list
|
|
if ( ! m_msg0.getList ( -1, // hostid
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxcacheage
|
|
false, // addtocache?
|
|
RDB_CACHEDB,
|
|
cr->m_collnum ,
|
|
myList, // &m_cacheList,
|
|
(char *)&sk ,
|
|
(char *)&ek ,
|
|
30000000, // minrecsizes 30MB
|
|
m_masterState,
|
|
m_masterLoop,
|
|
m_niceness ) )
|
|
// blocked?
|
|
return (Msg25 *)-1;
|
|
}
|
|
|
|
Msg20Reply *reply;
|
|
|
|
// even if it had 0 msg20replies, list should be non-zero length
|
|
if ( ! myList->isEmpty() ) {
|
|
// get # replies
|
|
char *p = myList->getList();
|
|
// first is key
|
|
p += 12;
|
|
// then datasize
|
|
p += 4;
|
|
// then # msg20 replies
|
|
int32_t numReplies = *(int32_t *)p;
|
|
p += 4;
|
|
myMsg25->m_numReplyPtrs = numReplies;
|
|
// do not free any replies, they reference into m_pageList
|
|
myMsg25->m_ownReplies = false;
|
|
// loop over replies
|
|
for ( int32_t i = 0 ; i < numReplies ; i++ ) {
|
|
// get reply size
|
|
int32_t replySize = *(int32_t *)p;
|
|
p += 4;
|
|
// reply itself
|
|
reply = (Msg20Reply *)p;
|
|
// reconstruct ptrs from the offsets relative
|
|
// to start of "reply"
|
|
int32_t used = reply->deserialize();
|
|
if ( used < 0 ) {
|
|
log("xmldoc: reply deserialize error");
|
|
g_errno = ECORRUPTDATA;
|
|
return NULL;
|
|
}
|
|
// skip reply
|
|
p += replySize;
|
|
// store it
|
|
myMsg25->m_replyPtrs[i] = reply;
|
|
}
|
|
// validate!
|
|
if ( forSite ) m_tempMsg25SiteValid = true;
|
|
else m_tempMsg25PageValid = true;
|
|
// all done!
|
|
return myMsg25;
|
|
}
|
|
|
|
bool *calledItPtr ;
|
|
if ( forSite ) calledItPtr = &m_calledMsg25ForSite;
|
|
else calledItPtr = &m_calledMsg25ForPage;
|
|
|
|
|
|
// ok, get it the hard way
|
|
// send out the request now
|
|
if ( ! *calledItPtr ) {
|
|
// do not re-call!
|
|
*calledItPtr = true;
|
|
// call it now
|
|
if ( ! myMsg25->getLinkInfo2( site,
|
|
fu->getUrl() , // url
|
|
false , // isSiteLinkInfo?
|
|
*ipp,
|
|
*d, // docid
|
|
m_collnum,//cr->m_coll,
|
|
NULL, // qbuf
|
|
0, // qbufSize
|
|
m_masterState, // state
|
|
m_masterLoop, // callback
|
|
false, // isInjecting?
|
|
false, // pbuf (for printing)
|
|
//this, // xd holder (Msg25::m_xd
|
|
false, // printInXml
|
|
// this is irrelevant since we
|
|
// are getting all inlinks:
|
|
0, // siteNumInlinks, irrelevant
|
|
NULL, // oldlinkinfo
|
|
m_niceness,
|
|
true, // doLinkSpamCheck?
|
|
true, // onevoteperip. unused?
|
|
false,// can be cancelled?
|
|
0, // lastupdatetime
|
|
// !!!!!!!!!!
|
|
// we want all!!!!!!!!!!!!!!!!!!!
|
|
// !!!!!!!!!!
|
|
false ,//onlyneedgoodinlinks?
|
|
false,//getlinkertitles?
|
|
0, // ourhosthash32 (special)
|
|
0, // ourdomhash32 (special)
|
|
&m_myTempLinkInfoBuf ) )
|
|
// blocked?
|
|
return (Msg25 *)-1;
|
|
}
|
|
|
|
// validate it so when msg1 below returns and calls this function
|
|
// again at the top we return the ptr right away
|
|
if ( forSite ) m_tempMsg25SiteValid = true;
|
|
else m_tempMsg25PageValid = true;
|
|
|
|
// serialize the msg20 reply ptrs into a buf for list
|
|
SafeBuf listBuf;
|
|
// compute datasize
|
|
int32_t dataSize = 0;
|
|
// # of replies
|
|
dataSize += 4;
|
|
// each reply
|
|
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
|
|
// reply size
|
|
dataSize += 4;
|
|
// reply data
|
|
//dataSize += myMsg25->m_replySizes[i];
|
|
// we can't use replySizes[i] because Linkdb.cpp will
|
|
// MODIFY the msg20 replies to add ptr_note/size_note
|
|
reply = myMsg25->m_replyPtrs[i];
|
|
// so we have to calculate the new serialized size
|
|
dataSize += reply->getStoredSize();
|
|
}
|
|
// how much to reserve?
|
|
int32_t need = sizeof(key_t) + 4 + dataSize;
|
|
// reserve that space!
|
|
if ( ! listBuf.reserve ( need ,"listbuf" ) ) {
|
|
// just ignore error
|
|
g_errno = 0;
|
|
// and return
|
|
if ( forSite ) return m_tempMsg25Site;
|
|
else return m_tempMsg25Page;
|
|
}
|
|
// make key for it, contenthash is 0, since it is irrelevant
|
|
key_t kk = g_cachedb.makeKey ( uh32 , 0 , type );
|
|
// store key
|
|
listBuf.safeMemcpy ( &kk , sizeof(key_t) );
|
|
// store datasize
|
|
listBuf.pushLong ( dataSize );
|
|
// # of replies
|
|
listBuf.pushLong ( myMsg25->m_numReplyPtrs );
|
|
// store each reply then
|
|
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
|
|
// get reply
|
|
reply = myMsg25->m_replyPtrs[i];
|
|
// . how many bytes to store the MODIFIED msg20reply?
|
|
// . Linkdb.cpp adds the ptr_note AFTER it receives all replies
|
|
// so we can't just use Msg25::m_replySizes[i]
|
|
int32_t replySize = reply->getStoredSize();
|
|
listBuf.pushLong ( replySize );
|
|
// store that
|
|
int32_t stored = reply->serialize ( listBuf.getBuf() ,
|
|
listBuf.getAvail() );
|
|
// skip that
|
|
listBuf.incrementLength ( stored );
|
|
// sanity
|
|
if ( stored != replySize ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// sanity
|
|
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// make the list to add to cachedb
|
|
RdbList storeList;
|
|
key_t startKey = g_cachedb.makeStartKey2 ( uh32, 0 , type );
|
|
key_t endKey = g_cachedb.makeEndKey2 ( uh32, 0 , type );
|
|
m_storeList.set ( listBuf.getBufStart() ,
|
|
listBuf.length() ,
|
|
listBuf.getBufStart() , // alloc
|
|
listBuf.getCapacity(), // allocsize
|
|
startKey,
|
|
endKey,
|
|
-1, // fixeddatasize
|
|
true, // owndata?
|
|
false ); // use half keys?
|
|
|
|
// disconnect it from safebuf so it doesn't get freed
|
|
listBuf.detachBuf();
|
|
|
|
//m_storeList.printList();
|
|
|
|
QUICKPOLL(m_niceness);
|
|
|
|
char *tt ;
|
|
if ( forSite ) tt = "site";
|
|
else tt = "page";
|
|
log("xmldoc: adding msg20%slinkreplies list of %"INT32" bytes to cachedb",
|
|
tt,m_storeList.m_listSize);
|
|
|
|
// returns false if it blocks, true otherwise
|
|
if ( ! m_msg1.addList ( &m_storeList,
|
|
RDB_CACHEDB ,
|
|
cr->m_collnum,
|
|
m_masterState,
|
|
m_masterLoop,
|
|
false, // forcelocal?
|
|
m_niceness ) )
|
|
// blocked?
|
|
return (Msg25 *)-1;
|
|
|
|
if ( forSite ) return m_tempMsg25Site;
|
|
else return m_tempMsg25Page;
|
|
}
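// Illustrative sketch, not called by anything: the cachedb record assembled
// above is laid out as
//   key_t | int32 dataSize | int32 numReplies | { int32 replySize , reply }*
// which is exactly what the read path at the top of this function walks
// (it skips 12 bytes for the key). A minimal walker over a well-formed
// record buffer would be:
static int32_t sketchCountCachedLinkReplies ( char *buf ) {
	char *p = buf;
	p += sizeof(key_t);                    // the cachedb key
	p += 4;                                // dataSize
	int32_t numReplies = *(int32_t *)p;    // # of serialized Msg20Replies
	p += 4;
	for ( int32_t i = 0 ; i < numReplies ; i++ ) {
		int32_t replySize = *(int32_t *)p;
		p += 4;                        // size field
		p += replySize;                // the serialized reply itself
	}
	return numReplies;
}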
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . sets RelatedDocId::m_relatedWeight
|
|
// . when printing the competitor pages, we sort by this, highest first
|
|
// 1. then scan the list of queries for each related docid
|
|
// 2. determine each of those matching queries weights
|
|
// 3. add up the weights and set RelatedDocId::m_relatedWeight to that
|
|
bool XmlDoc::setRelatedDocIdWeightAndRank ( RelatedDocId *rd ) {
|
|
|
|
// get our site hash
|
|
int32_t *shp = getSiteHash32();
|
|
if ( ! shp ) return false;
|
|
if ( shp == (int32_t *)-1 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
|
|
int32_t mainUrlSiteRank = getSiteRank();
|
|
|
|
// max queries
|
|
int32_t nc = rd->m_numCommonQueries;
|
|
int32_t unit = 0;
|
|
unit += sizeof(float);
|
|
//unit += sizeof(Msg99Reply *);
|
|
unit += sizeof(Query);
|
|
unit += sizeof(HashTableX);
|
|
unit += sizeof(QueryNumLinkedNode *);
|
|
int32_t need = nc * unit;
|
|
char *mem = (char *)mmalloc ( need , "qrybuf" );
|
|
if ( ! mem ) {
|
|
log("seo: failed to set related docid weight: %s",
|
|
mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
char *p = mem;
|
|
|
|
float *queryWeights = (float *)p;
|
|
p += nc * sizeof(float);
|
|
|
|
//Msg99Reply **replyPtrs = (Msg99Reply **)p;
|
|
//p += nc * sizeof(Msg99Reply *);
|
|
|
|
Query *queries = (Query *)p;
|
|
p += nc * sizeof(Query);
|
|
|
|
QueryNumLinkedNode **qnPtrs = (QueryNumLinkedNode **)p;
|
|
p += nc * sizeof(QueryNumLinkedNode *);
|
|
|
|
HashTableX *htables = (HashTableX *)p;
|
|
p += nc * sizeof(HashTableX);
|
|
|
|
// sanity
|
|
if ( p != mem + need ) { char *xx=NULL;*xx=0; }
|
|
// initialize the mem
|
|
for ( int32_t i = 0 ; i < nc ; i++ ) {
|
|
queryWeights[i] = 1.0;
|
|
qnPtrs[i] = NULL;
|
|
queries[i].constructor();
|
|
htables[i].constructor();
|
|
}
|
|
|
|
// total pages indexed!
|
|
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
|
|
|
|
|
|
float totalWeight;
|
|
|
|
// get matching queries
|
|
//SafeBuf *qpbuf = getMatchingQueriesScored();
|
|
//if ( ! qpbuf || qpbuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
// cast it
|
|
//Msg99Reply **qptrs=(Msg99Reply **)qpbuf->getBufStart();
|
|
|
|
SafeBuf *mq = getMatchingQueryBuf();
|
|
if ( mq == NULL || mq == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
|
int32_t nks = mq->length() / sizeof(QueryLink);
|
|
QueryLink *qks = (QueryLink *)mq->getBufStart();
|
|
|
|
// print the queries in common!
|
|
int32_t firstOff = rd->m_firstCommonQueryNumOff;
|
|
int32_t offset = firstOff;
|
|
int32_t qc = 0;
|
|
|
|
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
|
|
// this is fixed at the time we set QueryLogEntry::m_numResultsInSlice
|
|
int64_t numPagesIndexed = 1114000000;
|
|
int64_t point0 = numPagesIndexed / 119LL;
|
|
int64_t point1 = numPagesIndexed / 15LL;
|
|
|
|
// loop over the query/score pairs this related docid matched
|
|
for ( ; offset >= 0 ; qc++ ) {
|
|
// get that node
|
|
char *buf = m_commonQueryNumBuf.getBufStart();
|
|
// and offset
|
|
buf += offset;
|
|
// then cast
|
|
QueryNumLinkedNode *qn;
|
|
qn = (QueryNumLinkedNode *)buf;
|
|
// advance. will be -1 when done
|
|
if ( qn ) offset = qn->m_nextOff;
|
|
else offset = -1;
|
|
// get #qn into there
|
|
//Msg99Reply *rp = qptrs[qn->m_queryNum];
|
|
if ( qn->m_queryNum < 0 || qn->m_queryNum >= nks ) {
|
|
char *xx=NULL;*xx=0; }
|
|
QueryLink *qk = &qks[qn->m_queryNum];
|
|
QueryLogEntry *qe ;
|
|
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
|
|
char *qstr = qe->getQueryString();
|
|
|
|
qnPtrs[qc] = qn;
|
|
|
|
// save ptrs too
|
|
//replyPtrs[qc] = rp;
|
|
|
|
// get main url score for query
|
|
//float mainUrlScore = rp->m_myScore;
|
|
int32_t mainUrlSiteHash26 = m_siteHash32;
|
|
// seems like clusterdb masks them a bit in
|
|
// Clusterdb::getSiteHash()
|
|
mainUrlSiteHash26 &= 0x03ffffff;
|
|
|
|
int32_t mainUrlRank = -1;
|
|
int32_t rdRank = -1;
|
|
//float mainUrlSerpScore = -1.0;
|
|
|
|
// . the relateddocidnumhack
|
|
// . this is used as the topdocidnum # in the case of
|
|
// m_matchingQueryBuf (doMatchingQueries)
|
|
int32_t tdnum = qk->m_relatedDocIdNum;
|
|
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
|
|
int32_t maxnum = m_topDocIdsBuf.length()/sizeof(TopDocIds);
|
|
if ( tdnum < 0 || tdnum >= maxnum ) { char *xx=NULL;*xx=0; }
|
|
TopDocIds *td = &tds[tdnum];
|
|
|
|
// assume none
|
|
//float rdScore = 0.0;
|
|
// find docid for this related docid
|
|
//TopDocIds *td = rp->getTopDocIds(&m_topDocIdsBuf);
|
|
|
|
int32_t nd = td->m_numDocIds;
|
|
for ( int32_t y = 0 ; y < nd ; y++ ) {
|
|
// if we first encounter a result from the same
|
|
// site as the main url then stop! you don't get
|
|
// the 10x bonus then!
|
|
if ( td->m_topSiteHashes26[y] == mainUrlSiteHash26 &&
|
|
mainUrlRank == -1 ) {
|
|
//mainUrlSerpScore = td->m_topScores[y];
|
|
mainUrlRank = y;
|
|
}
|
|
// set our score?
|
|
if ( td->m_topDocIds[y] == rd->m_docId ) {
|
|
//rdScore = td->m_topScores[y];
|
|
rdRank = y;
|
|
}
|
|
}
|
|
// these should always be set! even if not ranked in the
|
|
// top 300 because of our new logic using msg4f in
|
|
// getRelatedDocIdsScored()
|
|
float rdScore = qn->m_relatedDocIdSerpScore;
|
|
float mainUrlSerpScore = qk->m_serpScore;
|
|
|
|
bool better = false;
|
|
// give it a weight of 10 if higher-scoring!
|
|
//if ( rdRank < mainUrlRank ) better = true;
|
|
if ( rdScore >= mainUrlSerpScore ) better = true;
|
|
// if your site not in top 300 or so, and he is, he's better
|
|
//if ( mainUrlRank == -1 && rdRank >= 0 ) better = true;
|
|
|
|
// this is the specific url, not the SITE, like
|
|
// mainUrlRank is, for the entire site
|
|
//if ( rdScore > mainUrlScore ) better = true;
|
|
|
|
// how many search results does this query have total?
|
|
int64_t numResults = qe->m_numTotalResultsInSlice;
|
|
// fix it to be global
|
|
numResults *= (int64_t)g_hostdb.getNumShards();
|
|
// big indexes did the "slice logic" restricting docid
|
|
// range to MAX_DOCID * .10 when setting this!
|
|
if ( numPagesIndexed > 10000000 ) numResults *= 10;
|
|
|
|
////////////////////
//
// Scoring is what we do when the number of combinations
// is too high to effectively compute. - matt
//
////////////////////


// lower from 10 so google still won't dominate generic queries?
// crap, at 2.0 gigablast.com had bad competitors because
// they all match queries with gigablast in them.
// i put it down from 30.0 to 5.0 to fix chessusa.com
// who was getting bad competitor pages that had just
// 'ccc' matching non-generic queries having them come up too
// high of a score.
//if ( better )
//	queryWeights[qc] = 1.0;//30.0;//100.0; // 10.0;
|
|
|
|
//
|
|
// do not give related docid query that has YOUR brand in it
|
|
// much weight. we do not want it talking about you, because
|
|
// it is a competitor.
|
|
//
|
|
// PROBLEM: "cheatcodes.com"'s brand is descriptive!
|
|
//
|
|
// . if not generic and it beats YOU, give more!
|
|
// . try to fix ibm.com gigablast.com seomoz.org ahrefs.com
|
|
// that suffer because of matching their brand. actually
|
|
// maybe only do this if seomoz.org matches this query
|
|
// with their link text only...??? thus, pages that contain
|
|
// "seo moz" will match the "seo moz" query but will gain
|
|
// RELATIVELY little because they can't be seomoz.org on it.
|
|
// . crap though this will hurt chessusa.com right?? try again
|
|
// since algo changed a lot since then
|
|
bool isBrand = true;
|
|
// if other guy ranks better than you, probably not
|
|
// your brand, or if it is, it could be his brand too?
|
|
if ( better ) // && numResults < point0 )
|
|
isBrand = false;
|
|
// or if you are not in the top 100 it is probably not
|
|
// your brand name either!
|
|
if ( mainUrlRank == -1 )
|
|
isBrand = false;
|
|
// fix chessusa.com for 'chess' by lowering from 100 to 20...
|
|
if ( mainUrlRank >= 20 )
|
|
isBrand = false;
|
|
// fix 'corporation' for ibm.com. it is too generic to
|
|
// be a brand. on our 1.1B page index, point0 is like 9.3M.
|
|
// 'ibm' is 5.5M, 'corporation' is 25M,...
|
|
if ( numResults >= point0 )
|
|
isBrand = false;
|
|
// or for ibm.com ... or other pages with high siteranks,
|
|
// your brand queries should be in the top 10!! otherwise,
|
|
// ibm has so many other matching queries in the top 100 that
|
|
// are not brands for it because its siterank is so high.
|
|
if ( mainUrlSiteRank >= 10 && mainUrlRank >= 10 )
|
|
isBrand = false;
|
|
// top 5 for brands in siterank 11 sites
|
|
if ( mainUrlSiteRank >= 11 && mainUrlRank >= 5 )
|
|
isBrand = false;
|
|
|
|
// . good competitors will be in top 30 for a query
|
|
// . let's keep in mind though that we use these competitors
|
|
// to find backlinks AND to generate related terms, so
|
|
// it's not so important that they dominate a query, but
|
|
// rather that they match your content...
|
|
/*
|
|
if ( better &&
|
|
numResults < point0 &&
|
|
rdRank >= 0 &&
|
|
rdRank < 20 )
|
|
queryWeights[qc] *= 1.2;//50.0;
|
|
|
|
// top ten???
|
|
if ( better &&
|
|
numResults < point0 &&
|
|
rdRank >= 0 &&
|
|
rdRank < 10 )
|
|
queryWeights[qc] *= 1.3;//51.0;
|
|
|
|
// top 5?
|
|
if ( better &&
|
|
numResults < point0 &&
|
|
rdRank >= 0 &&
|
|
rdRank < 5 )
|
|
queryWeights[qc] *= 1.4;//52.0;
|
|
*/
|
|
|
|
// weight it by how relevant the query it matches is to us
|
|
//if ( better && numResults < point0 )
|
|
// queryWeights[qc] = (qk->m_serpScore / 1000000.0);
|
|
|
|
//
|
|
// generic query?
|
|
//
|
|
float weight = 1.0;
|
|
if ( numResults < point0 ) weight = 100.0;
|
|
else if ( numResults < point1 ) weight = 10.0;
|
|
queryWeights[qc] *= weight;
|
|
|
|
//
|
|
// weight by related docid's serp score
|
|
//
|
|
float ss = qk->m_serpScore;
|
|
float w2 = 1.0;
|
|
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
|
|
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
|
|
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
|
|
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
|
|
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
|
|
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
|
|
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
|
|
else if ( ss > 100.0 ) w2 = 3.0; // > 100
|
|
else if ( ss > 10.0 ) w2 = 2.0; // > 10
|
|
queryWeights[qc] *= w2;
|
|
|
|
|
|
//
|
|
// weight by main url's serp score as well!
|
|
//
|
|
ss = mainUrlSerpScore;//qk->m_serpScore;
|
|
w2 = 1.0;
|
|
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
|
|
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
|
|
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
|
|
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
|
|
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
|
|
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
|
|
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
|
|
else if ( ss > 100.0 ) w2 = 3.0; // > 100
|
|
else if ( ss > 10.0 ) w2 = 2.0; // > 10
|
|
queryWeights[qc] *= w2;
|
|
|
|
|
|
// punish query weight if it is your brand most likely
|
|
//if ( isBrand )
|
|
// queryWeights[qc] = 0.01;
|
|
|
|
// . store related docid rank and your rank
|
|
// . then we do not need cache m_topDocIdsBuf and seo.cpp
|
|
// has this info readily available.
|
|
qn->m_relatedDocIdRank = rdRank;
|
|
qn->m_mainUrlRank = mainUrlRank;
|
|
//qn->m_mainUrlSerpScore = mainUrlSerpScore;
|
|
|
|
/*
|
|
int64_t numResults = qe->m_numTotalResultsInSlice;
|
|
// fix it to be global
|
|
numResults *= (int64_t)g_hostdb.getNumGroups();
|
|
// big indexes did the "slice logic" restricting docid
|
|
// range to MAX_DOCID * .10 when setting this!
|
|
if ( numPagesIndexed > 10000000 ) numResults *= 10;
|
|
// fix divide by zero and make all rare queries similar weight
|
|
//if ( numResults < 1000 ) numResults = 1000;
|
|
// divide by # results query has so more generic stuff
|
|
// is down weighted
|
|
//queryWeights[qc] /= (float)numResults;
|
|
if ( numResults < 1000 )
|
|
queryWeights[qc] /= 1;
|
|
else if ( numResults < 10000 )
|
|
queryWeights[qc] /= 2;
|
|
else if ( numResults < 100000 )
|
|
queryWeights[qc] /= 4;
|
|
else if ( numResults < 1000000 ) // 1M
|
|
queryWeights[qc] /= 8;
|
|
else if ( numResults < 10000000 ) // 10M
|
|
queryWeights[qc] /= 16;
|
|
else if ( numResults < 10000000 ) // 100M
|
|
queryWeights[qc] /= 32;
|
|
else
|
|
queryWeights[qc] /= 64;
|
|
*/
|
|
|
|
//int32_t qlen = gbstrlen(qstr);
|
|
// shortcut
|
|
Query *qp = &queries[qc];
|
|
HashTableX *ht = &htables[qc];
|
|
// this is currently an int64_t bit vector
|
|
int32_t vs = sizeof(qvec_t);
|
|
if ( ! ht->set ( 8,vs,128,NULL,0,false,m_niceness,"wbvbuf") )
|
|
// hopefully g_errno is preserved
|
|
goto done;
|
|
// if unknown use english so pandora's -> pandora,pandoras?
|
|
// because 'pandora's tower' was not matching
|
|
// 'pandoras tower' because both words could have been
|
|
// english or german, thus the queries were thought to be
|
|
// independent! giving rise to high-scoring competitive pages
|
|
// that matched only those two queries.
|
|
uint8_t qlangId = qe->m_langId;
|
|
if ( ! qlangId ) qlangId = langEnglish;
|
|
qp->set2 ( qstr , qlangId , true );
|
|
// hash it up
|
|
for ( int32_t i = 0 ; i < qp->m_numTerms ; i++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qp->m_qterms[i];
|
|
// bigrams imply 2 explicit bits, one from each term
|
|
// in the bigram. synonym terms should share the same
|
|
// bit as the term they are a synonym of
|
|
int64_t bits = qt->m_implicitBits;
|
|
// . add bit vec. use rawTermId?
|
|
// . hash to wordbit vector of query words contained
|
|
if ( ! ht->addKey ( &qt->m_termId , &bits ) )
|
|
goto done;
|
|
}
|
|
}
|
|
|
|
// . set the dup flags!
|
|
// . scan queries related docid matches
|
|
for ( int32_t i = 0 ; i < qc ; i++ ) {
|
|
// get it
|
|
Query *qpi = &queries[i];
|
|
HashTableX *hti = &htables[i];
|
|
// scan all queries above
|
|
for ( int32_t j = i+1 ; j < qc ; j++ ) {
|
|
// reset
|
|
bool jIsSubQueryOfi = false;
|
|
bool iIsSubQueryOfj = false;
|
|
// skip ourselves
|
|
//if ( j == i ) continue;
|
|
// get it
|
|
Query *qpj = &queries[j];
|
|
HashTableX *htj = &htables[j];
|
|
// scan every query term in query #j and map each
|
|
// termid to the term bit vector that indicates what
|
|
// terms query #j has in query #i.
|
|
qvec_t totalVec = 0LL;
|
|
// is it a dup?
|
|
for ( int32_t k = 0 ; k < qpj->m_numTerms ; k++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qpj->m_qterms[k];
|
|
// see if in there
|
|
char *val ;
|
|
val = (char *)hti->getValue(&qt->m_termId);
|
|
if ( ! val ) continue;
|
|
// get implied term bits
|
|
qvec_t vec = *(qvec_t *)val;
|
|
// this is the termbit vector for query #i.
|
|
// it tells us what terms query #j shares.
|
|
totalVec |= vec;
|
|
}
|
|
// we only care about "required" terms. i.e. bigrams
|
|
// are essentially ignored if not in quotes.
|
|
totalVec &= qpi->m_requiredBits;
|
|
// how many words do we match?
|
|
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
|
|
int32_t numSharedWithQueryi = getNumBitsOn64(totalVec);
|
|
// how many required bits does it have?
|
|
int32_t needi = getNumBitsOn64(qpi->m_requiredBits);
|
|
// if all terms in query #i are in query #j then subset
|
|
if ( numSharedWithQueryi == needi )
|
|
iIsSubQueryOfj = true;
|
|
|
|
//
|
|
// now go the other way
|
|
//
|
|
totalVec = 0LL;
|
|
// is it a dup?
|
|
for ( int32_t k = 0 ; k < qpi->m_numTerms ; k++ ) {
|
|
// shortcut
|
|
QueryTerm *qt = &qpi->m_qterms[k];
|
|
// see if in there
|
|
char *val;
|
|
val = (char *)htj->getValue(&qt->m_termId);
|
|
if ( ! val ) continue;
|
|
// get implied term bits
|
|
qvec_t vec = *(qvec_t *)val;
|
|
// this is the termbit vector for query #j.
|
|
// it tells us what terms query #i shares.
|
|
totalVec |= vec;
|
|
}
|
|
// we only care about "required" terms. i.e. bigrams
|
|
// are essentially ignored if not in quotes.
|
|
totalVec &= qpj->m_requiredBits;
|
|
// how many words do we match?
|
|
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
|
|
int32_t numSharedWithQueryj = getNumBitsOn64(totalVec);
|
|
// how many required bits does it have?
|
|
int32_t needj = getNumBitsOn64(qpj->m_requiredBits);
|
|
// if all terms in query #i are in query #j then subset
|
|
if ( numSharedWithQueryj == needj )
|
|
jIsSubQueryOfi = true;
|
|
|
|
|
|
// now set dup bit if query #i is same as query #j
|
|
// taking into account "missing spaces" so that we
|
|
// have two terms in one query , and their bigram
|
|
// in the other query. OR we have synonyms. OR we
|
|
// have differences of "ignored" words.
|
|
// "leg" = "legs"
|
|
// "cheat code" = "cheatcodes"
|
|
// "the tigers" = "tigers"
|
|
if(jIsSubQueryOfi&&
|
|
iIsSubQueryOfj&&
|
|
queryWeights[j]>.02){
|
|
// debug?
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: %s ISDUPOF %s",
|
|
qpj->m_orig,
|
|
qpi->m_orig);
|
|
// the dup weight is .02
|
|
queryWeights[j] *= .1; // = .02
|
|
}
|
|
|
|
// proper subquery examples:
|
|
// "leg" is subquery of "nice legs"
|
|
else if ( jIsSubQueryOfi &&
|
|
! iIsSubQueryOfj &&
|
|
queryWeights[j] > .05 ) {
|
|
// debug?
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: %s SUBQUERYOF %s",
|
|
qpj->m_orig,
|
|
qpi->m_orig);
|
|
// the subquery weight is .05
|
|
queryWeights[j] *= 0.1; // = 5.0;//.05;
|
|
}
|
|
|
|
// is query #i a PROPER subquery of query #j
|
|
else if ( iIsSubQueryOfj &&
|
|
! jIsSubQueryOfi &&
|
|
queryWeights[i] > .05 ) {
|
|
// debug?
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: %s SUBQUERYOF %s",
|
|
qpi->m_orig,
|
|
qpj->m_orig);
|
|
// the subquery weight is .05
|
|
// increase to 5.0 to try to drown out the
|
|
// anomaly queries promoting poker sites
|
|
// for cheatcodes.com competitors
|
|
queryWeights[i] *= 0.1; // = 5.0;//.05;
|
|
}
|
|
|
|
else {
|
|
// debug?
|
|
//if ( debug )
|
|
//log("seo: %s UNRELATEDTO %s",
|
|
// qpi->m_orig,
|
|
// qpj->m_orig);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
// scan the queries again and add up their weights this time!
|
|
totalWeight = 0.0;
|
|
for ( int32_t i = 0 ; i < qc ; i++ ) {
|
|
totalWeight += queryWeights[i];
|
|
qnPtrs[i]->m_queryScoreWeight = queryWeights[i];
|
|
//Msg99Reply *ptr = replyPtrs[i];
|
|
Query *qp = &queries[i];
|
|
char *qstr = qp->m_orig;//ptr->m_queryStr;
|
|
// log it
|
|
if ( m_seoDebug >= 2 )
|
|
log("seo: docid=%"INT64" weight=%f qry=%s",
|
|
rd->m_docId,
|
|
queryWeights[i],
|
|
qstr);
|
|
}
|
|
|
|
|
|
// that is the docid related weight now
|
|
rd->m_relatedWeight = totalWeight;
|
|
|
|
done:
|
|
for ( int32_t i = 0 ; i < nc ; i++ ) {
|
|
queries[i].destructor();
|
|
htables[i].destructor();
|
|
}
|
|
mfree ( mem , need , "qrybuf" );
|
|
return true;
|
|
}
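// Illustrative sketch, not called by anything: the two serp-score weighting
// ladders in setRelatedDocIdWeightAndRank() above are just a log10-style
// bucketing of a score into a 1.0 .. 10.0 multiplier. The same mapping,
// written compactly:
static float sketchSerpScoreWeight ( float ss ) {
	// thresholds mirror the if/else ladder above: >10 gives 2.0,
	// >100 gives 3.0, ... , >1B gives 10.0
	static const float th[] = { 10.0, 100.0, 1000.0, 10000.0, 100000.0,
	                            1000000.0, 10000000.0, 100000000.0,
	                            1000000000.0 };
	float w = 1.0;
	for ( int32_t i = 0 ; i < 9 ; i++ )
		if ( ss > th[i] ) w += 1.0;
	return w;
}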
|
|
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::addRelatedDocIdInfo ( int64_t docId ,
|
|
int32_t queryNum ,
|
|
float score ,
|
|
int32_t rank ,
|
|
int32_t siteHash26 ) {
|
|
|
|
// do not add if does not match the query
|
|
if ( score <= 0.0 ) return true;
|
|
|
|
// alloc space if first time calling
|
|
if ( ! m_rdtab.m_numSlots ) {
|
|
if ( ! m_rdtab.set(8,sizeof(RelatedDocId),1024,NULL,0,
|
|
false,0,"rdtab"))
|
|
return false;
|
|
}
|
|
|
|
// get the related docid as it exists in m_relatedDocIdBuf
|
|
RelatedDocId *rd = NULL;
|
|
|
|
// now we also store these for intersecting
|
|
// in phase 2 to see what urls are most
|
|
// similar to us
|
|
int32_t slot = m_rdtab.getSlot(&docId);
|
|
// if not there, add it
|
|
if ( slot < 0 ) {
|
|
// make one
|
|
RelatedDocId rdx;
|
|
// the most important thing is the docid!
|
|
rdx.m_docId = docId;
|
|
// and now the 32-bit site hash
|
|
rdx.m_siteHash26 = siteHash26;
|
|
// how many search results we are in
|
|
rdx.m_numCommonQueries = 0;
|
|
// the queryImportance should be our score
|
|
// for this query divided by m_minTop50Score
|
|
// to normalize it.
|
|
//float qimp=qp->m_queryInfo.m_queryImportance;
|
|
// just add up the query importance for
|
|
// each query we share in common with main url
|
|
//rd.m_similarityScore = qip;
|
|
// now we do a dot product of this related
|
|
// docids score vector with the main url's
|
|
// score vector. both vector's are normalized
|
|
// using the score of the 1st result!
|
|
//rd.m_dotProduct = score;
|
|
// reset this
|
|
rdx.m_rd_siteRank = -1;
|
|
rdx.m_rd_langId = 255;
|
|
rdx.rd_title_off = -1;
|
|
rdx.rd_url_off = -1;
|
|
rdx.rd_site_off = -1;
|
|
// point to beginning of linked list of qrynums
|
|
rdx.m_firstCommonQueryNumOff = -1;//off;
|
|
//rdx.m_lastCommonQueryNumOff = -1;//off;
|
|
// remember offset
|
|
int32_t rdOff = m_relatedDocIdBuf.length();
|
|
// store it
|
|
m_relatedDocIdBuf.safeMemcpy ( &rdx , sizeof(RelatedDocId) );
|
|
// add OFFSET to table. data is 12 bytes
|
|
if(! m_rdtab.addKey(&docId,&rdOff)) return false;
|
|
// all done then
|
|
//continue;
|
|
// set this for adding to the linked list
|
|
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
|
|
// cast it
|
|
rd = (RelatedDocId *)p;
|
|
}
|
|
else {
|
|
// get the data
|
|
int32_t rdOff = *(int32_t *)m_rdtab.getValueFromSlot(slot);
|
|
// point to it
|
|
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
|
|
// cast it
|
|
rd = (RelatedDocId *)p;
|
|
}
|
|
|
|
// before we add the querynumlinkednode make sure not a dup!
|
|
char *qnbuf = m_commonQueryNumBuf.getBufStart();
|
|
// . offset of first node for this related docid
|
|
// . this is the start of his linked list of query/score nodes
|
|
int32_t firstOff = rd->m_firstCommonQueryNumOff;
|
|
|
|
// sanity
|
|
if ( firstOff == -1 && rd->m_numCommonQueries ) { char *xx=NULL;*xx=0;}
|
|
|
|
// assume no linked list
|
|
QueryNumLinkedNode *node = NULL;
|
|
// only a linked list if firstOff is not -1
|
|
if ( firstOff >= 0 ) node = (QueryNumLinkedNode *)(qnbuf + firstOff);
|
|
// scan the nodes (query/score pairs) we got for this related docid
|
|
for ( ; node ; ) {
|
|
// if this query is already in the linked list, stop! we
|
|
// do not want to add dup QueryNumLinkedNode nodes.
|
|
if ( node->m_queryNum == queryNum ) return true;
|
|
// end of linked list?
|
|
if ( node->m_nextOff == -1 ) break;
|
|
// advance to next node in linked list
|
|
node = (QueryNumLinkedNode *)(qnbuf+node->m_nextOff);
|
|
}
|
|
|
|
// store query num element in a linked list so
|
|
// we can print the actual queryNums a related
|
|
// docid has in common with the main url
|
|
int32_t nodeOff = m_commonQueryNumBuf.length();
|
|
// we can record our rank and your rank in this!
|
|
QueryNumLinkedNode qn;
|
|
qn.m_queryNum = queryNum; // qp->m_queryNum;
|
|
qn.m_nextOff = -1;
|
|
qn.m_relatedDocIdRank = rank;
|
|
qn.m_relatedDocIdSerpScore = score;
|
|
qn.m_mainUrlRank = -1;
|
|
//qn.m_mainUrlSerpScore = -1.0;
|
|
int32_t sq = sizeof(QueryNumLinkedNode);
|
|
// point to it
|
|
if ( ! m_commonQueryNumBuf.safeMemcpy(&qn,sq) )
|
|
return false;
|
|
// point to node we stored in the buf so we can adjust it below
|
|
QueryNumLinkedNode *stored ;
|
|
stored = (QueryNumLinkedNode *)(m_commonQueryNumBuf.getBuf() - sq);
|
|
|
|
|
|
// increment the count. the # of nodes in his linked list.
|
|
rd->m_numCommonQueries++;
|
|
|
|
// continue the linked list
|
|
qnbuf = m_commonQueryNumBuf.getBufStart();
|
|
|
|
// the first node?
|
|
if ( firstOff == -1 ) {
|
|
rd->m_firstCommonQueryNumOff = nodeOff;
|
|
//rd->m_lastCommonQueryNumOff = nodeOff;
|
|
return true;
|
|
}
|
|
|
|
// get the current first
|
|
int32_t oldFirstOff = rd->m_firstCommonQueryNumOff;
|
|
//char *vv = qnbuf + rd->m_firstCommonQueryNumOff;
|
|
//QueryNumLinkedNode *first = (QueryNumLinkedNode *)vv;
|
|
// we are the new first
|
|
rd->m_firstCommonQueryNumOff = nodeOff;
|
|
// we point to old first as our next
|
|
stored->m_nextOff = oldFirstOff;
|
|
// and update that node's next link
|
|
//last->m_nextOff = nodeOff;
|
|
// and our new tail
|
|
//rd->m_lastCommonQueryNumOff = nodeOff;
|
|
|
|
return true;
|
|
}
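// Illustrative sketch, not used by the build: addRelatedDocIdInfo() above
// chains QueryNumLinkedNode records through m_commonQueryNumBuf using byte
// OFFSETS instead of pointers (the SafeBuf can be reallocated, which would
// invalidate raw pointers), and always prepends the new node at the head of
// the list. The same idea reduced to a hypothetical plain struct:
struct SketchOffsetNode {
	int32_t m_value;     // payload
	int32_t m_nextOff;   // offset of the next node in the buffer, or -1
};
// prepend the node stored at nodeOff onto the list whose head offset is
// *headOff; the old head becomes the new node's next
static void sketchPrependNode ( char *buf, int32_t nodeOff, int32_t *headOff ) {
	SketchOffsetNode *n = (SketchOffsetNode *)(buf + nodeOff);
	n->m_nextOff = *headOff;
	*headOff     = nodeOff;
}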
|
|
|
|
// . safebuf returned is a buffer of QueryLinks
|
|
// . use m_matchingQueryBuf/m_matchingQueryStringBuf
|
|
SafeBuf *XmlDoc::getMatchingQueryBuf ( ) {
|
|
|
|
setStatus ( "getmatchingqueries" );
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_matchingQueryBufValid )
|
|
return &m_matchingQueryBuf;
|
|
|
|
|
|
if ( ! m_beginTimeAllMatch )
|
|
m_beginTimeAllMatch = gettimeofdayInMilliseconds();
|
|
|
|
if ( m_docIdListBuf.length() == 0 )
|
|
m_docIdListBuf.pushLongLong(m_docId);
|
|
|
|
// true = doMatchingQueries?
|
|
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , true );
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
|
|
m_matchingQueryBuf .stealBuf ( qkbuf );
|
|
m_matchingQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
|
|
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginTimeAllMatch;
|
|
log("seopipe: time: getMatchingQueries took %"INT64" ms",took);
|
|
|
|
m_matchingQueryBufValid = true;
|
|
|
|
// if getRelatedQueryBuf calls getQueryLinkBuf() it should
|
|
// do a recompute, so set this to false
|
|
m_queryLinkBufValid = false;
|
|
|
|
m_docIdListBuf.purge();
|
|
|
|
// store it
|
|
if ( ! storeMatchingQueriesIntoCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
return &m_matchingQueryBuf;
|
|
}
|
|
|
|
// . returns safebuf of QueryLinks, representing the intersected matching
|
|
// queries of all the related docids
|
|
SafeBuf *XmlDoc::getRelatedQueryBuf () {
|
|
|
|
// try to set from cachedb record
|
|
if ( ! checkCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
if ( m_relatedQueryBufValid )
|
|
return &m_relatedQueryBuf;
|
|
|
|
// we need these
|
|
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
|
|
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
|
|
|
|
|
|
if ( ! m_beginRelatedQueries )
|
|
m_beginRelatedQueries = gettimeofdayInMilliseconds();
|
|
|
|
if ( m_docIdListBuf.length() == 0 ) {
|
|
int32_t numRelatedDocIds = rdbuf->length()/sizeof(RelatedDocId);
|
|
// just use the top 50 for related queries for speed!
|
|
if ( numRelatedDocIds > 50 ) numRelatedDocIds = 50;
|
|
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
|
|
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
|
|
RelatedDocId *rd = &rds[i];
|
|
m_docIdListBuf.pushLongLong(rd->m_docId);
|
|
}
|
|
}
|
|
|
|
// false = doMatchingQueries?
|
|
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , false );
|
|
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
|
|
|
|
m_relatedQueryBuf .stealBuf ( qkbuf );
|
|
m_relatedQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
|
|
|
|
m_relatedQueryBufValid = true;
|
|
m_queryLinkBufValid = false;
|
|
|
|
m_docIdListBuf.purge();
|
|
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginRelatedQueries;
|
|
log("seopipe: time: getRelatedQueries took %"INT64" ms",took);
|
|
|
|
// store it
|
|
if ( ! storeRelatedQueriesIntoCachedb() )
|
|
return (SafeBuf *)-1;
|
|
|
|
|
|
return &m_relatedQueryBuf;
|
|
}
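// Caller sketch (an assumption about usage, not new API): like the other
// get*() members in this file, getMatchingQueryBuf()/getRelatedQueryBuf()
// return (SafeBuf *)-1 when they block and will re-enter via m_masterLoop,
// NULL with g_errno set on error, and a valid pointer once the buffer of
// QueryLinks is ready. From inside XmlDoc a caller would look like:
//
//	SafeBuf *qb = getRelatedQueryBuf();
//	if ( qb == (SafeBuf *)-1 ) return false;  // blocked, wait for callback
//	if ( ! qb ) return true;                  // error, g_errno is set
//	// ... scan the QueryLinks in qb ...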
|
|
|
|
|
|
static void gotMsg8eReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;

int32_t hostId = slot->m_hostId;
THIS->m_msg8eReply [hostId] = slot->m_readBuf;
THIS->m_msg8eReplySize[hostId] = slot->m_readBufSize;
// do not let udpserver.cpp free it, we will later
slot->m_readBuf = NULL;

log("seo: got msg8e reply #%"INT32" of %"INT32" from host #%"INT32"",
(int32_t)THIS->m_numMsg8eReplies,
(int32_t)THIS->m_numMsg8eRequests,
(int32_t)hostId);

THIS->m_numMsg8eReplies++;
// do not free send buf until last reply!
if ( THIS->m_numMsg8eReplies < THIS->m_numMsg8eRequests ) {
slot->m_sendBufAlloc = NULL;
return;
}
// ok, sendBuf will auto free in UdpServer.cpp when we return from this
THIS->m_masterLoop ( THIS->m_masterState );
}
|
|
|
|
|
|
//static void gotMsg20ReplyWrapper ( void *state ) {
|
|
// XmlDoc *THIS = (XmlDoc *)state;
|
|
// THIS->m_numMsg20Replies++;
|
|
// if ( THIS->m_numMsg20Replies < THIS->m_numMsg20Requests )
|
|
// return;
|
|
// THIS->m_masterLoop ( THIS->m_masterState );
|
|
//}
|
|
|
|
|
|
// . returned safebuf is array of QueryLinks
|
|
// . gets all matching queries from all related docids and store them
|
|
// compactly as QueryLinks, otherwise we'd run out of memory because
|
|
// each docid has like 50,000 matching queries on avg.
|
|
// . we now get matching queries in modulus parts to avoid OOM, because
|
|
// with my new changes i made we are getting like a few hundred thousand
|
|
// matching queries per related docid.
|
|
// . we do not store the query string, etc, for the QueryLink,
|
|
// just the query offset and the hostid that has the query in its
|
|
// memory (g_qbuf). after we intersect the QueryLinks we will get the
|
|
// query strings, etc. there will be a lot fewer in the intersection.
|
|
SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
|
|
|
|
if ( m_queryLinkBufValid )
|
|
return &m_queryLinkBuf;
|
|
|
|
bool doRelatedQueries = true;
|
|
if ( doMatchingQueries ) doRelatedQueries = false;
|
|
|
|
// get the 32-bit terms the main doc matches, so we may determine
|
|
// what terms in a related query are novel to this document.
|
|
SafeBuf *mainUrlTwidBuf32 = NULL;
|
|
if ( doRelatedQueries ) {
|
|
mainUrlTwidBuf32 = getTermId32Buf() ;//InfoBuf();
|
|
if ( ! mainUrlTwidBuf32 || mainUrlTwidBuf32 == (void *)-1 )
|
|
return mainUrlTwidBuf32;
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return NULL;
|
|
|
|
//
|
|
// SHIT! we can't use the keys in the termlistbuf for dual purpose
|
|
// role as terms the doc contains, because they do not have the
|
|
// synonym forms!!! So we have to get this terminfobuf as well
|
|
// as the termlistbuf for each docid!!!!
|
|
//
|
|
// so we might as well not sort by the lower 32 bit hack as well
|
|
//
|
|
|
|
|
|
//
|
|
//
|
|
// 1. get termlistbuf for each docid possibly using msg20s
|
|
//
|
|
// we need this for getting the QueryLink::m_serpScores in
|
|
// handleRequest8e
|
|
//
|
|
//
|
|
//int32_t numDocIds = docIdList->length() / 8;
|
|
//int64_t *docIds = (int64_t *)docIdList->getBufStart();
|
|
|
|
|
|
//SafeBuf *tlistBuf = NULL;
|
|
//SafeBuf *twidBuf32 = NULL;
|
|
|
|
// . we just want the termlistbuf of each related docid
|
|
// . hack: it should be sorted by the LOWER 32 bits of termid
|
|
// so handlerequest8e does not need to sort its termid32/twid32 buf
|
|
//if ( doMatchingQueries ) {
|
|
// tlistBuf = getTermListBuf();
|
|
// if ( ! tlistBuf || tlistBuf == (void *)-1 ) return tlistBuf;
|
|
// twidBuf32 = getTermId32Buf();
|
|
// if ( ! twidBuf32 || twidBuf32 == (void *)-1 ) return twidBuf32;
|
|
//}
|
|
|
|
/*
|
|
if ( doRelatedQueries && ! m_launchedAll ) {
|
|
int32_t need = sizeof(Msg20) * numDocIds;
|
|
// we also use this same buf in getRelatedDocIdsWithTitles
|
|
if ( ! m_msg20Buf.reserve ( need,"m20buf3" ) ) return NULL;
|
|
// mark it all in use
|
|
m_msg20Buf.setLength(need);
|
|
// init them
|
|
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
|
|
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
|
|
// reset cursor to start with first related docid
|
|
m_numMsg20Replies = 0;
|
|
m_numMsg20Requests = 0;
|
|
// launch all!
|
|
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
|
|
// shortcut
|
|
Msg20 *msg20 = &mp[i];
|
|
// get current related docid
|
|
//RelatedDocId *rd = &rds[i];
|
|
// make the request
|
|
Msg20Request req;
|
|
req.ptr_coll = cr->m_coll;
|
|
req.size_coll = gbstrlen(cr->m_coll)+1;
|
|
req.m_docId = docIds[i];
|
|
req.m_expected = true;
|
|
req.m_niceness = m_niceness;
|
|
//req.m_state = m_masterState;
|
|
//req.m_callback2 = m_masterLoop;
|
|
req.m_state = this;
|
|
req.m_callback2 = gotMsg20ReplyWrapper;
|
|
// do not get summary stuff. too slow.
|
|
req.m_numSummaryLines = 0;
|
|
// get this
|
|
req.m_getTermListBuf = true;
|
|
// count these!
|
|
m_numMsg20Requests++;
|
|
// store cursor in msg20 itself so we know the rd
|
|
//msg20->m_hack2 = i;
|
|
// launch it
|
|
if ( ! msg20->getSummary ( &req ) ) continue;
|
|
// error?
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// note it
|
|
log("seo: error getting termlistbuf docid=%"INT64"",
|
|
docIds[i]);
|
|
// reset it
|
|
//msg20->reset();
|
|
// count reply as back now
|
|
m_numMsg20Replies++;
|
|
}
|
|
m_launchedAll = true;
|
|
}
|
|
// wait for one reply per related docid
|
|
if ( doRelatedQueries && m_numMsg20Replies < m_numMsg20Requests )
|
|
return (SafeBuf *)-1;
|
|
*/
|
|
|
|
//
|
|
//
|
|
// 2. send one msg8e request to each host with those termlistbufs
|
|
//
|
|
// it has one termlistbuf per relateddocid, enough info
|
|
// for handlerequest8e to return the list of matching QueryLinks
|
|
// intersected for all related docids.
|
|
//
|
|
if ( m_numMsg8eRequests == 0 ) {
|
|
SafeBuf request;
|
|
// how big is the request?
|
|
int32_t need = 0;
|
|
need += 1; // for the byte flag
|
|
int32_t collLen = gbstrlen(cr->m_coll);
|
|
need += collLen + 1;
|
|
// list of docids (just one for matching queries)
|
|
need += 4;
|
|
need += docIdList->length();
|
|
|
|
// twidtable alloc
|
|
if ( doRelatedQueries ) {
|
|
need += 4;
|
|
need += mainUrlTwidBuf32->length();
|
|
}
|
|
|
|
//if ( doMatchingQueries ) {
|
|
// // just our main url's termlistbuf
|
|
// need += 4;
|
|
// need += tlistBuf->length();
|
|
// need += 4;
|
|
// need += twidBuf32->length();
|
|
//}
|
|
|
|
//
|
|
// make the 8e request
|
|
//
|
|
if ( ! request.reserve ( need ,"rep8ebuf" ) )
|
|
return NULL;
|
|
// first store flag to indicate if getting matching or
|
|
// related queries
|
|
if ( doMatchingQueries ) request.pushChar(1);
|
|
else request.pushChar(0);
|
|
// then coll\0
|
|
request.safeMemcpy ( cr->m_coll, collLen );
|
|
request.pushChar ( 0 );
|
|
// then docids after the collection name
|
|
request.pushLong ( docIdList->length() );
|
|
request.safeMemcpy ( docIdList );
|
|
|
|
// then if doing related queries we need to store our
|
|
// 32-bit twids of the main url for setting m_uniqueRound
|
|
if ( doRelatedQueries ) {
|
|
request.pushLong(mainUrlTwidBuf32->length());
|
|
request.safeMemcpy(mainUrlTwidBuf32->getBufStart(),
|
|
mainUrlTwidBuf32->length() );
|
|
}
|
|
/*
|
|
// then store each termlistbuf from each msg20
|
|
for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ) {
|
|
// shortcut
|
|
Msg20 *mp = &mps[i];
|
|
Msg20Reply *rep = mp->getReply();
|
|
if ( rep ) {
|
|
request.pushLong ( rep->size_tlistBuf );
|
|
request.safeMemcpy ( rep->ptr_tlistBuf ,
|
|
rep->size_tlistBuf );
|
|
// then the 32-bit termid buf with synonyms
|
|
// that the above posdblist termlists don't
|
|
// have so we can match queries
|
|
request.pushLong ( rep->size_tiBuf );
|
|
request.safeMemcpy ( rep->ptr_tiBuf,
|
|
rep->size_tiBuf );
|
|
}
|
|
// make them empty i guess
|
|
else {
|
|
request.pushLong ( 0 );
|
|
request.pushLong ( 0 );
|
|
}
|
|
}
|
|
*/
|
|
/*
|
|
// just our main url's termlistbuf
|
|
if ( doMatchingQueries ) {
|
|
request.pushLong (tlistBuf->length());
|
|
request.safeMemcpy (tlistBuf);
|
|
// then the 32-bit termid buf with synonyms that
|
|
// the above posdblist termlists don't have so
|
|
// we can match queries
|
|
request.pushLong (twidBuf32->length());
|
|
request.safeMemcpy (twidBuf32);
|
|
|
|
}
|
|
*/
|
|
// sanity
|
|
if ( request.length() != need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// do not free it here, let udpserver free it
|
|
char *req = request.getBufStart();
|
|
int32_t reqSize = request.length();
|
|
request.detachBuf();
|
|
|
|
// we've formulated the 8e request, no need for msg20s anymore
|
|
//for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ){
|
|
// // shortcut
|
|
// Msg20 *mp = &mps[i];
|
|
// mp->destructor();
|
|
//}
|
|
// free the mem as well
|
|
//m_msg20Buf.purge();
|
|
|
|
// must be host #0 for this next algo to work
|
|
if ( g_hostdb.m_hostId != 0 ) { char *xx=NULL;*xx=0; }
|
|
//
|
|
// send msg8e request to each host. skip if dead.
|
|
//
|
|
for ( int32_t k = 1; k <= g_hostdb.m_numHosts ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// send to ourselves last so we can do all in parallel
|
|
int32_t hosti = k;
|
|
if ( k == g_hostdb.m_numHosts ) hosti = 0;
|
|
// get ptr to the host
|
|
Host *host = g_hostdb.getHost(hosti);
|
|
// get hostid of host #i
|
|
int32_t hostId = host->m_hostId;
|
|
if ( hostId != hosti ) { char *xx=NULL;*xx=0; }
|
|
// count it
|
|
m_numMsg8eRequests++;
|
|
// skip if dead. i guess no queries from that guy. we
|
|
// can't send to a twin because the twin does not have
|
|
// the same queries in its in-memory query log.
|
|
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive) {
|
|
log("seo: skipping msg8e to dead host %"INT32"",
|
|
hostId);
|
|
m_msg8eReply [hostId] = NULL;
|
|
m_msg8eReplySize[hostId] = 0;
|
|
m_numMsg8eReplies++;
|
|
continue;
|
|
}
|
|
// . send request to him
|
|
// . reply is the query strings
|
|
// . when reply comes in we store it in the query
|
|
// string buf and make the QueryLinks reference it
|
|
// with their QueryLink::m_queryStringOffset
|
|
if ( ! g_udpServer.sendRequest ( req ,
|
|
reqSize ,
|
|
0x8e , // msgtype
|
|
host->m_ip , // ip
|
|
host->m_port , // port
|
|
hostId,
|
|
NULL, // retslot
|
|
this,
|
|
gotMsg8eReplyWrapper,
|
|
999999, // timeout
|
|
-1 , // backoff
|
|
-1 , // maxwait
|
|
NULL, // replybuf
|
|
0, // replybufmaxsize
|
|
m_niceness // niceness
|
|
)) {
|
|
// let admin know about error
|
|
log("seopipe: sendRequest 8e had error: %s",
|
|
mstrerror(g_errno));
|
|
// count it as replied then
|
|
m_numMsg8eReplies++;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// this should never happen now with our new wrapper
|
|
if ( m_numMsg8eReplies < m_numMsg8eRequests )
|
|
return (SafeBuf *)-1;
|
|
|
|
//
|
|
//
|
|
// 3. MERGE the msg8e replies from all hosts
|
|
//
|
|
//
|
|
|
|
// gotMsg8eReplyWrapper() should have recorded each one into
// m_msg8eReply[i], the msg8e reply buffer. set up for merging.
|
|
char *bestPtr[MAX_HOSTS];
|
|
char *bufEnd [MAX_HOSTS];
|
|
for ( int32_t i = 0; i < g_hostdb.m_numHosts ; i++ ) {
|
|
char *reply = m_msg8eReply [i];
|
|
// this happens if host is dead...
|
|
if ( ! reply ) {
|
|
bestPtr[i] = NULL;
|
|
bufEnd [i] = NULL;
|
|
continue;
|
|
}
|
|
//int32_t replySize = m_msg8eReplySize [i];
|
|
// it should be a list of QueryLinks
|
|
char *p = reply;
|
|
int32_t queryLinkBufSize = *(int32_t *)p;
|
|
p += 4;
|
|
bestPtr[i] = p;
|
|
// bufEnd[i] also marks the start of the querystringbuf
|
|
bufEnd [i] = p + queryLinkBufSize;
|
|
}
|
|
int32_t count = 0;
|
|
int32_t maxQueryLinks = MAX_RELATED_QUERIES;
|
|
if ( doMatchingQueries ) maxQueryLinks = MAX_MATCHING_QUERIES;
|
|
|
|
// now merge the top "max" highest scoring
|
|
// QueryLinks and their corresponding QueryLogEntries into
|
|
// m_queryLinkBuf/m_queryLinkStringBuf
|
|
storeMore:
|
|
// get the max scoring QueryLink from the 8e replies
|
|
int32_t maxi = -1;
|
|
float maxScore = -1.0;
|
|
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
|
// skip if exhausted
|
|
if ( bestPtr[i] >= bufEnd[i] ) continue;
|
|
// cast it
|
|
QueryLink *qk = (QueryLink *)bestPtr[i];
|
|
// sanity, core if not a list head
|
|
if ( ! qk->m_isFirst ) { char *xx=NULL;*xx=0; }
|
|
// skip if score is not the current maximum
|
|
if ( qk->m_totalQueryImportance < maxScore ) continue;
|
|
// we got a new max!
|
|
maxScore = qk->m_totalQueryImportance;
|
|
maxi = i;
|
|
}
|
|
|
|
// store max into m_queryLinkBuf and m_queryLinkStringBuf
|
|
if ( maxi >= 0 ) {
|
|
// shortcut
|
|
QueryLink *best = (QueryLink *)bestPtr[maxi];
|
|
// get # to copy
|
|
int32_t toCopy = sizeof(QueryLink);
|
|
if ( doRelatedQueries )
|
|
// how many querylinks in this list? i.e. those
|
|
// that all share the same query, but different
|
|
// relateddocid?
|
|
toCopy = best->m_numInList * sizeof(QueryLink);
|
|
// copy the querylink
|
|
if ( ! m_queryLinkBuf.reserve ( toCopy ) ) return NULL;
|
|
// point to it
|
|
QueryLink *qk = (QueryLink *)m_queryLinkBuf.getBuf();
|
|
// THEN store it
|
|
m_queryLinkBuf.safeMemcpy( best , toCopy );
|
|
// point to its querylogentry buf, it occurs right
|
|
// after the list of QueryLinks!
|
|
char *p = bufEnd[maxi];
|
|
// and the query it is for
|
|
p += qk->m_queryStringOffset;
|
|
// cast that
|
|
QueryLogEntry *qe = (QueryLogEntry *)p;
|
|
// ensure enough space
|
|
if ( ! m_queryLinkStringBuf.reserve(qe->getSize(),"rqbb" ) )
|
|
return NULL;
|
|
// we are moving it into the final buf
|
|
qk->m_queryStringOffset = m_queryLinkStringBuf.length();
|
|
// store query log entry here now
|
|
m_queryLinkStringBuf.safeMemcpy ( qe, qe->getSize() );
|
|
// advance
|
|
bestPtr[maxi] += toCopy;
|
|
}
|
|
// limit
|
|
if ( ++count < maxQueryLinks ) goto storeMore;
|
|
// liberate those msg8e reply buffers
|
|
for ( int32_t i = 0; i < g_hostdb.m_numHosts;i++) {
|
|
if ( ! m_msg8eReply[i] ) continue;
|
|
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
|
|
m_msg8eReply[i] = NULL;
|
|
}
|
|
// reset our parms if we are re-called for related queries
|
|
m_numMsg8eReplies = 0;
|
|
m_numMsg8eRequests = 0;
|
|
m_queryLinkBufValid = true;
|
|
// show time
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t took = now - m_beginRelatedQueries;
|
|
log("seopipe: getrelatedquerybuftook %"INT64" ms",took);
|
|
m_beginRelatedQueries = 0LL;
|
|
|
|
// validate
|
|
m_queryLinkBufValid = true;
|
|
|
|
/*
|
|
// log for debug
|
|
qks = (QueryLink *)m_queryLinkBuf->getBufStart();
|
|
nks = m_queryLinkBuf->length() / sizeof(QueryLink);
|
|
for ( int32_t k = 0 ; k < nks ; k++ ) {
|
|
// now we use offsets into m_relatedQueryBuf.m_buf[]
|
|
QueryRel *qk = &qks[k];
|
|
// skip if not a head
|
|
if ( ! qk->m_isFirst ) continue;
|
|
char *qstr = qk->getQueryString(&m_queryLinkStringBuf);
|
|
log("seopipe: relquery=\"%s\" imp=%f votes=%"INT32"",
|
|
qstr,
|
|
qk->m_rq_totalScore,
|
|
qk->m_docIdVotes);
|
|
}
|
|
*/
|
|
|
|
return &m_queryLinkBuf;
|
|
}
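// Illustrative sketch, not called by anything: the storeMore: loop above is
// a k-way merge that repeatedly takes the highest-scoring list head across
// the per-host msg8e reply buffers. Stripped of the QueryLink/QueryLogEntry
// bookkeeping, the selection step is just:
static int32_t sketchPickMaxHead ( float **heads, float **ends, int32_t n ) {
	int32_t maxi     = -1;
	float   maxScore = -1.0;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		if ( heads[i] >= ends[i] ) continue;   // this buffer exhausted
		if ( *heads[i] <  maxScore ) continue; // not the current max
		maxScore = *heads[i];
		maxi     = i;
	}
	return maxi; // -1 once every buffer is exhausted
}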
|
|
|
|
// scan matches like XmlDoc::getSummary() does and get all sentences
|
|
// containing a query term...
|
|
//void XmlDoc::getGigabitExcerpts ( ) {
|
|
//}
|
|
|
|
|
|
// this is still used by Title.cpp to get the title: field quickly
|
|
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {
|
|
|
|
if ( ! json ) return NULL;
|
|
|
|
// get length
|
|
int32_t fieldLen = gbstrlen(field);
|
|
// keep track of in a quote or not
|
|
bool inQuotes = false;
|
|
char *stringStart = NULL;
|
|
char *p = json;
|
|
bool gotOne = false;
|
|
int32_t depth = 0;
|
|
// scan
|
|
for ( ; *p ; p++ ) {
|
|
// escaping a quote? ignore quote then.
|
|
if ( *p == '\\' && p[1] == '\"' ) {
|
|
// skip two bytes then..
|
|
p++;
|
|
continue;
|
|
}
|
|
// count {} depth
|
|
if ( ! inQuotes ) {
|
|
if ( *p == '{' ) depth++;
|
|
if ( *p == '}' ) depth--;
|
|
}
|
|
// a quote?
|
|
if ( *p == '\"' ) {
|
|
inQuotes = ! inQuotes;
|
|
// set start of the string if quote is beginning
|
|
if ( inQuotes ) stringStart = p + 1;
|
|
// if quote is ending and a colon follows then
|
|
// it was a json field name. so if it matches the
|
|
// field we want return the following field for it.
|
|
else if ( ! inQuotes &&
|
|
! gotOne &&
|
|
p[1] == ':' &&
|
|
// {"title":"whatever",...}
|
|
// could be product:{title:... depth=2
|
|
(depth == 1 ||depth==2) &&
|
|
stringStart &&
|
|
(p - stringStart) == fieldLen &&
|
|
strncmp(field,stringStart,fieldLen)==0 ) {
|
|
// now, the next time we set stringStart
|
|
// it will be set to the VALUE of this field
|
|
// assuming the field is a STRING!!!!
|
|
gotOne = true;
|
|
// return after the quote
|
|
//return p + 2;
|
|
}
|
|
// ok, we got the string after the field string...
|
|
else if ( ! inQuotes && gotOne ) {
|
|
if ( valueLen ) *valueLen = p - stringStart;
|
|
return stringStart;
|
|
}
|
|
// keep chugging
|
|
continue;
|
|
}
|
|
}
|
|
// done, not found
|
|
return NULL;
|
|
}
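// Usage sketch for getJSONFieldValue() above (hypothetical caller, values
// made up): pull the "title" string out of a raw, NUL-terminated JSON buffer
// without building a full Json parse:
//
//	int32_t tlen = 0;
//	char *t = getJSONFieldValue ( jsonBuf , "title" , &tlen );
//	if ( t ) log("build: json title = %.*s", tlen, t);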
|
|
|
|
|
|
Json *XmlDoc::getParsedJson ( ) {
|
|
|
|
if ( m_jpValid ) return &m_jp;
|
|
|
|
// core if not a json object
|
|
if ( m_contentTypeValid && m_contentType != CT_JSON &&
|
|
// spider status docs are now really json
|
|
m_contentType != CT_STATUS ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// \0 terminated
|
|
char **pp = getUtf8Content();
|
|
if ( ! pp || pp == (void *)-1 ) return (Json *)pp;
|
|
|
|
// point to the json
|
|
char *p = *pp;
|
|
|
|
// empty? all done then.
|
|
//if ( ! p ) return (char *)pp;
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . if p is NULL i guess this should still be ok and be empty
|
|
if ( ! m_jp.parseJsonStringIntoJsonItems ( p , m_niceness ) ) {
|
|
g_errno = EBADJSONPARSER;
|
|
return NULL;
|
|
}
|
|
|
|
m_jpValid = true;
|
|
return &m_jp;
|
|
}
|
|
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . hash each json VALUE (not FIELD) ... AND ... hash each json
|
|
// VALUE with its FIELD like "title:cool" or "description:whatever"
|
|
// . example:
|
|
// [{"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":1378322570280,"matched":64,"status":"Stopped","start":1378322184332,"token":"poo","parameterMap":{"token":"poo","seed":"www.alleyinsider.com","api":"article"},"crawled":64},{"id":"830e0584-7f69-4bdd-
|
|
|
|
#include "Json.h"
|
|
|
|
char *XmlDoc::hashJSONFields ( HashTableX *table ) {
|
|
|
|
setStatus ( "hashing json fields" );
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = table;
|
|
hi.m_desc = "json object";
|
|
|
|
// use new json parser
|
|
Json *jp = getParsedJson();
|
|
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
|
|
|
|
return hashJSONFields2 ( table , &hi , jp , true );
|
|
}
|
|
|
|
|
|
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
|
|
HashInfo *hi , Json *jp ,
|
|
bool hashWithoutFieldNames ) {
|
|
|
|
JsonItem *ji = jp->getFirstItem();
|
|
|
|
char nb[1024];
|
|
SafeBuf nameBuf(nb,1024);
|
|
|
|
//int32_t totalHash32 = 0;
|
|
|
|
for ( ; ji ; ji = ji->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not number or string
|
|
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
|
continue;
|
|
// reset, but don't free mem etc. just set m_length to 0
|
|
nameBuf.reset();
|
|
|
|
// get its full compound name like "meta.twitter.title"
|
|
JsonItem *p = ji;
|
|
char *lastName = NULL;
|
|
char *nameArray[20];
|
|
int32_t numNames = 0;
|
|
for ( ; p ; p = p->m_parent ) {
|
|
// empty name?
|
|
if ( ! p->m_name ) continue;
|
|
if ( ! p->m_name[0] ) continue;
|
|
// dup? can happen with arrays. parent of string
|
|
// in object, has same name as his parent, the
|
|
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
|
|
if ( p->m_name == lastName ) continue;
|
|
// update
|
|
lastName = p->m_name;
|
|
// add it up
|
|
nameArray[numNames++] = p->m_name;
|
|
// breach?
|
|
if ( numNames < 15 ) continue;
|
|
log("build: too many names in json tag");
|
|
break;
|
|
}
|
|
|
|
// if we are the diffbot reply "html" field do not hash this
|
|
// because it is redundant and it hashes html tags etc.!
|
|
// plus it slows us down a lot and bloats the index.
|
|
if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"html")==0)
|
|
continue;
|
|
|
|
// assemble the names in reverse order which is correct order
|
|
for ( int32_t i = 1 ; i <= numNames ; i++ ) {
|
|
// copy into our safebuf
|
|
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
|
|
return NULL;
|
|
// separate names with periods
|
|
if ( ! nameBuf.pushChar('.') ) return NULL;
|
|
}
|
|
// remove last period
|
|
nameBuf.removeLastChar('.');
|
|
// and null terminate
|
|
if ( ! nameBuf.nullTerm() ) return NULL;
|
|
// change all :'s in names to .'s since : is reserved!
|
|
char *px = nameBuf.getBufStart();
|
|
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
|
|
//for ( px = nameBuf.getBufStart(); *px ; px++ ) if ( *px == '-' ) *px = '_';
|
|
//
|
|
// DIFFBOT special field hacks
|
|
//
|
|
char *name = nameBuf.getBufStart();
|
|
hi->m_hashGroup = HASHGROUP_BODY;
|
|
if ( strstr(name,"title") )
|
|
hi->m_hashGroup = HASHGROUP_TITLE;
|
|
if ( strstr(name,"url") )
|
|
hi->m_hashGroup = HASHGROUP_INURL;
|
|
if ( strstr(name,"resolved_url") )
|
|
hi->m_hashGroup = HASHGROUP_INURL;
|
|
if ( strstr(name,"tags") )
|
|
hi->m_hashGroup = HASHGROUP_INTAG;
|
|
if ( strstr(name,"meta") )
|
|
hi->m_hashGroup = HASHGROUP_INMETATAG;
|
|
//
|
|
// now Json.cpp decodes and stores the value into
|
|
// a buffer, so ji->getValue() should be decoded completely
|
|
//
|
|
|
|
// . get the value of the json field
|
|
// . if it's a number or bool it converts into a string
|
|
int32_t vlen;
|
|
char *val = ji->getValueAsString( &vlen );
|
|
char tbuf[32];
|
|
|
|
// if the value is clearly a date, just hash it as
|
|
// a number, so use a temporary value that holds the
|
|
// time_t and hash with that... this will hash
|
|
// diffbot's article date field as a number so we can
|
|
// sortby and constrain by it in the search results
|
|
if ( name && (strcasecmp(name,"date") == 0 || strcasecmp(name,"estimatedDate") == 0)) {
|
|
// this is in HttpMime.cpp
|
|
int64_t tt = atotime1 ( val );
|
|
// we can't store 64-bit dates... so truncate to -2147483648
|
|
// which is Dec 13 1901. so we don't quite get the 1898 date
|
|
// for the new york times dbpedia entry. maybe if we added
|
|
// an extra termlist for more precision to indicate century or
|
|
// something.
|
|
if ( tt && tt < (int32_t)0x80000000 )
|
|
tt = (int32_t)0x80000000;
|
|
// likewise, we can't be too big, passed 2038
|
|
if ( tt && tt > 0x7fffffff )
|
|
tt = (int32_t)0x7fffffff;
|
|
if ( tt ) {
|
|
// print out the time_t in ascii
|
|
vlen = sprintf(tbuf,"%"INT32"",(int32_t)tt);
|
|
// and point to it for hashing/indexing
|
|
val = tbuf;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// for deduping search results we set m_contentHash32 here for
|
|
// diffbot json objects.
|
|
// we can't do this here anymore, we have to set the
|
|
// contenthash in ::getContentHash32() because we need it to
|
|
// set EDOCUNCHANGED in ::getIndexCode() above.
|
|
//
|
|
/*
|
|
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
|
|
// make the content hash so we can set m_contentHash32
|
|
// for deduping
|
|
int32_t nh32 = hash32n ( name );
|
|
// do an exact hash for now...
|
|
int32_t vh32 = hash32 ( val , vlen , m_niceness );
|
|
// accumulate, order independently
|
|
totalHash32 ^= nh32;
|
|
totalHash32 ^= vh32;
|
|
}
|
|
*/
|
|
|
|
// index like "title:whatever"
|
|
hi->m_prefix = name;
|
|
hashString ( val , vlen , hi );
|
|
|
|
//log("hashing json var as %s %s %d", name, val, vlen);
|
|
|
|
// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
|
|
if ( name )
|
|
hashFieldMatchTerm ( val , (int32_t)vlen , hi );
|
|
|
|
if ( ! hashWithoutFieldNames )
|
|
continue;
|
|
|
|
// hash without the field name as well
|
|
hi->m_prefix = NULL;
|
|
hashString ( val , vlen , hi );
|
|
|
|
/*
|
|
// a number? hash special then as well
|
|
if ( ji->m_type != JT_NUMBER ) continue;
|
|
|
|
// use prefix for this though
|
|
hi->m_prefix = name;
|
|
|
|
// hash as a number so we can sort search results by
|
|
// this number and do range constraints
|
|
float f = ji->m_valueDouble;
|
|
if ( ! hashNumber2 ( f , hi ) )
|
|
return NULL;
|
|
*/
|
|
}
|
|
|
|
//m_contentHash32 = totalHash32;
|
|
//m_contentHash32Valid = true;
|
|
|
|
return (char *)0x01;
|
|
}
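// Illustrative sketch, not used by the build: hashJSONFields2() above walks
// JsonItem parents into nameArray[] (leaf first) and then emits them in
// reverse to form the compound field name, e.g. "meta.twitter.title". The
// same join on plain C strings:
static void sketchJoinNamesReversed ( char **names, int32_t numNames,
				      char *out, int32_t outSize ) {
	if ( outSize <= 0 ) return;
	int32_t off = 0;
	for ( int32_t i = numNames - 1 ; i >= 0 ; i-- ) {
		for ( char *s = names[i] ; *s ; s++ ) {
			if ( off + 2 >= outSize ) break; // leave room for \0
			out[off++] = *s;
		}
		// separate names with periods, like the loop above does
		if ( i > 0 && off + 1 < outSize ) out[off++] = '.';
	}
	out[off] = '\0';
}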
|
|
|
|
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
|
|
|
|
setStatus ( "hashing xml fields" );
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = table;
|
|
hi.m_desc = "xml object";
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
|
|
Xml *xml = getXml();
|
|
int32_t n = xml->getNumNodes();
|
|
XmlNode *nodes = xml->getNodes ();
|
|
|
|
SafeBuf nameBuf;
|
|
|
|
// scan the xml nodes
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . if it's a tag and not a text node, skip it
|
|
// . we just want the "text" nodes
|
|
if ( nodes[i].isTag() ) continue;
|
|
|
|
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
|
|
// log("hey:hy");
|
|
|
|
// assemble the full parent name
|
|
// like "tag1.tag2.tag3"
|
|
nameBuf.reset();
|
|
xml->getCompoundName ( i , &nameBuf );
|
|
|
|
// this is \0 terminated
|
|
char *tagName = nameBuf.getBufStart();
|
|
|
|
// get the utf8 text
|
|
char *val = nodes[i].m_node;
|
|
int32_t vlen = nodes[i].m_nodeLen;
|
|
|
|
// index like "title:whatever"
|
|
if ( tagName && tagName[0] ) {
|
|
hi.m_prefix = tagName;
|
|
hashString ( val , vlen , &hi );
|
|
}
|
|
|
|
// hash without the field name as well
|
|
hi.m_prefix = NULL;
|
|
hashString ( val , vlen , &hi );
|
|
}
|
|
|
|
return (char *)0x01;
|
|
}
|
|
|
|
// if our url is that of a subdoc, then get the url of the parent doc
|
|
// from which we were a subsection
|
|
char *XmlDoc::getDiffbotParentUrl( char *myUrl ) {
|
|
// remove -diffbotxyz
|
|
if ( ! m_kbuf.safeStrcpy( myUrl ) ) return NULL;
|
|
char *p = m_kbuf.getBufStart();
|
|
char *s = strstr(p,"-diffbotxyz");
|
|
if ( s ) { *s = '\0'; return p; }
|
|
// temporarily until we inject "diffbotreply" uncomment this
|
|
/*
|
|
// otherwise i guess we got dan's format of -article|%"INT32"|%"INT32"
|
|
char *e = m_kbuf.getBuf() - 1;
|
|
for ( ; *e && is_digit(*e) ; e-- );
|
|
if ( *e != '|' ) return NULL;
|
|
e--;
|
|
for ( ; *e && is_digit(*e) ; e-- );
|
|
if ( *e != '|' ) return NULL;
|
|
e--;
|
|
// now to hyphen
|
|
char *estart = m_kbuf.getBufStart();
|
|
for ( ; e>estart && *e !='-' ; e-- );
|
|
if ( *e != '-' ) return NULL;
|
|
*e = '\0';
|
|
return p;
|
|
*/
|
|
return NULL;
|
|
}
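// Usage sketch (hypothetical input): for a diffbot subdoc url like
// "http://example.com/page-diffbotxyz123" this returns
// "http://example.com/page" (the "-diffbotxyz..." suffix is chopped off in
// m_kbuf); for a url without that marker it returns NULL.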
|
|
|
|
bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
|
|
|
|
// sanity
|
|
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
storeFacetValuesSite ( qs, sb, fvh );
|
|
|
|
if ( m_hasMetadata) {
|
|
Json jpMetadata;
|
|
if (jpMetadata.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
|
|
storeFacetValuesJSON ( qs, sb, fvh, &jpMetadata );
|
|
}
|
|
}
|
|
|
|
// if "qa" is a gbxpathsitehash123456 type of beastie then we
|
|
// gotta scan the sections
|
|
if ( strncasecmp(qs,"gbxpathsitehash",15) == 0 )
|
|
return storeFacetValuesSections ( qs , sb , fvh );
|
|
|
|
// if a json doc, get json field
|
|
// spider status docs are really json now
|
|
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
|
|
return storeFacetValuesJSON ( qs , sb , fvh, getParsedJson());
|
|
|
|
|
|
if ( m_contentType == CT_HTML )
|
|
return storeFacetValuesHtml ( qs , sb , fvh );
|
|
|
|
if ( m_contentType == CT_XML )
|
|
return storeFacetValuesXml ( qs , sb , fvh );
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|

// Store facet for site
bool XmlDoc::storeFacetValuesSite ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {

	char *val = getSite();
	// defensive: bail if the site is not set for some reason
	if ( ! val ) return false;
	int32_t vlen = gbstrlen(val);
	FacetValHash_t val32 = hash32 ( val , vlen );

	// skip if not for us
	if ( fvh && val32 != fvh ) return false;
	if ( strcmp("gbtagsite",qs) ) return false;

	// otherwise add facet FIELD to our buf
	if ( ! sb->safeStrcpy(qs) ) return false;
	if ( ! sb->pushChar('\0') ) return false;

	// then add facet VALUE
	if ( !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false;
	if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
	if ( ! sb->pushChar('\0') ) return false;

	return true;
}
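
// Editor's illustration (assumed example, not from the original source):
// "gbtagsite" is the pseudo-field this helper answers for. For a doc whose
// site is www.example.com the stored pair would look like
//
//   "gbtagsite\0" "<hash32 of site>,www.example.com\0"
//
// so a facet lookup that only has the hash can recover the readable site name.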

bool XmlDoc::storeFacetValuesSections ( char *qs , SafeBuf *sb ,
					FacetValHash_t fvh ) {

	// scan all sections
	Sections *ss = getSections();
	if ( ! ss ) return false;
	if ( ss == (void *)-1 ) { char *xx=NULL;*xx=0; }

	Words *ww = getWords();
	if ( ! ww ) return false;
	if ( ww == (void *)-1 ) { char *xx=NULL;*xx=0; }

	int32_t siteHash32 = *getSiteHash32();

	// qs is like gbxpathsitehash1234567
	// so get the digit part
	char *p = qs;
	for ( ; *p && ! is_digit(*p); p++ );
	uint64_t xsh = (uint64_t)atoll(p);

	// if qs came from a "gbfacetstr:..."-style query term it is presumed
	// to be preceded by "str:" in the caller's buffer
	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	Section *si = ss->m_rootSection;
	//sec_t mflags = SEC_SENTENCE | SEC_MENU;
	for ( ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(m_niceness);
		// is it a match?
		uint64_t mod;
		mod  = (uint32_t)si->m_turkTagHash32;
		mod ^= (uint32_t)siteHash32;
		if ( mod != xsh ) continue;
		// . then add facet VALUE
		// . hash of the innerhtml of sentence
		// . get hash of sentences this tag contains indirectly
		uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
		if ( ! val32 ) continue;
		// if a facetvalhash was provided we must match
		if ( fvh && val32 != fvh ) continue;
		// we need a following section to print its sentence text below
		if ( ! si->m_next ) continue;
		// got one, print the facet field
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;
		if ( isString && ! sb->safePrintf("%"UINT32",",val32) )
			return false;
		// but ALSO print (a truncated version of) the string itself
		char *a = m_words.m_words   [si->m_next->m_a];
		char *b = m_words.m_words   [si->m_next->m_b-1];
		b      += m_words.m_wordLens[si->m_next->m_b-1];
		if ( ! sb->safeTruncateEllipsis (a,b-a,160) ) return false;
		if ( ! sb->pushChar('\0') ) return false;
		// if we wanted a specific string, we are done
		if ( fvh ) return true;
	}
	return true;
}
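
// Editor's sketch (hedged, inferred from storeFacetValuesSections() above):
// a facet field like "gbxpathsitehash1234567" identifies one xpath on one
// site. Roughly:
//
//   uint64_t xsh = atoll("1234567");                 // digits taken from qs
//   uint64_t key = (uint32_t)si->m_turkTagHash32
//                ^ (uint32_t)siteHash32;             // per-section key
//
// A section matches when key == xsh; its facet value hash is
// (uint32_t)si->m_indirectSentHash64 and the human-readable value stored is
// the (truncated) sentence text of the section that follows it.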

bool XmlDoc::storeFacetValuesHtml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {

	Xml *xml = getXml();

	int32_t qsLen = gbstrlen(qs);

	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	// check for gblang:en etc.
	// if ( isString && strncmp(qs,"gblang",6)==0 ) {
	//	if (!sb->safeStrcpy(qs) ) return false;
	//	if (!sb->pushChar('\0') ) return false;
	//	// find the lang that has that hash!
	//	if (!sb->safePrintf("%"UINT32",",(uint32_t)val32))return false;
	//	if (!sb->safeMemcpy(content,contentLen) ) return false;
	//	if (!sb->pushChar('\0') ) return false;
	//}

	char *content;
	int32_t contentLen;
	int32_t nameLen;
	char *s;
	int32_t i = 0;

	bool uniqueField = false;

	// a title tag can count now too
	if ( strcmp(qs,"title") == 0 ) {
		// skip leading spaces = false
		content = xml->getString ("title",&contentLen,false);
		uniqueField = true;
		// jump into the loop body below to store it
		goto skip;
	}

	// scan the meta nodes for ones whose name matches qs
	for ( i = 0 ; i < xml->m_numNodes ; i++ ) {

		// continue if not a meta tag
		if ( xml->m_nodes[i].m_nodeId != TAG_META ) continue;
		// . does it have a name field matching qs?
		// . <meta name=summary content="...">
		// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
		s = xml->getString ( i , "name", &nameLen );
		// "s" can be "summary","description","keywords",...
		if ( nameLen != qsLen ) continue;
		if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
		// point to the content itself
		content = xml->getString ( i , "content" , &contentLen );
		if ( ! content || contentLen <= 0 ) continue;

	skip:
		// hash it to match it if caller specified a particular hash
		// because they are coming from Msg40::lookUpFacets() function
		// to convert the hashes to strings, like for rendering in
		// the facets box to the left of the search results
		FacetValHash_t val32 = hash32 ( content, contentLen);
		if ( fvh && fvh != val32 ) continue;

		// otherwise add facet FIELD to our buf
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// then add facet VALUE
		if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
			return false;
		if ( !sb->safeMemcpy(content,contentLen) ) return false;
		if ( !sb->pushChar('\0') ) return false;

		// if only one specified, we are done
		if ( fvh ) return true;

		if ( uniqueField ) return true;
	}

	return true;
}
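
// Editor's illustration (assumed example, not from the original source): for
// qs = "description" and a page containing
//
//   <meta name="description" content="A page about cats">
//
// storeFacetValuesHtml() appends "description\0" followed by the content
// string (prefixed with its hash32 and a comma when the facet is a string
// facet). The special case qs = "title" pulls the <title> text instead and
// stores at most one value.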

bool XmlDoc::storeFacetValuesXml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {

	Xml *xml = getXml();

	int32_t qsLen = gbstrlen(qs);

	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	int32_t i = 0;

	bool uniqueField = false;

	SafeBuf nameBuf;

	// scan the xml nodes for tags whose compound name matches qs
	for ( i = 0 ; i < xml->m_numNodes ; i++ ) {

		// skip text nodes
		if ( xml->m_nodes[i].m_nodeId == 0 ) continue;

		// assemble the full parent name
		// like "tag1.tag2.tag3"
		nameBuf.reset();
		xml->getCompoundName ( i , &nameBuf );
		int32_t nameLen = nameBuf.length();
		char *s = nameBuf.getBufStart();

		// . does it have a type field that's "summary"
		// . <meta name=summary content="...">
		// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
		//s = xml->getString ( i , "name", &nameLen );

		// "s" can be "summary","description","keywords",...
		if ( nameLen != qsLen ) continue;
		if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;

		// got it...

		// wtf?
		if ( i + 1 >= xml->m_numNodes ) continue;

		// the value should be in the following node

		// skip if not a text node, we don't return tag nodes i guess
		if ( xml->m_nodes[i+1].m_nodeId ) continue;

		char *content = xml->m_nodes[i+1].m_node;
		int32_t contentLen = xml->m_nodes[i+1].m_nodeLen;

		// skip if empty
		if ( ! content || contentLen <= 0 ) continue;

		// skip common cases too, like lone whitespace
		if ( contentLen == 1 && is_wspace_a(content[0]) ) continue;

		// hash it to match it if caller specified a particular hash
		// because they are coming from Msg40::lookUpFacets() function
		// to convert the hashes to strings, like for rendering in
		// the facets box to the left of the search results
		FacetValHash_t val32 = hash32 ( content, contentLen);
		if ( fvh && fvh != val32 ) continue;

		// otherwise add facet FIELD to our buf
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// then add facet VALUE
		if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
			return false;
		if ( !sb->safeMemcpy(content,contentLen) ) return false;
		if ( !sb->pushChar('\0') ) return false;

		// if only one specified, we are done
		if ( fvh ) return true;

		if ( uniqueField ) return true;
	}

	return true;
}

bool XmlDoc::storeFacetValuesJSON ( char *qs ,
				    SafeBuf *sb ,
				    FacetValHash_t fvh ,
				    Json *jp ) {

	JsonItem *ji = jp->getFirstItem();

	char nb[1024];
	SafeBuf nameBuf(nb,1024);

	bool isString = false;
	if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;

	for ( ; ji ; ji = ji->m_next ) {

		QUICKPOLL(m_niceness);

		// skip if not number or string
		if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
			continue;

		// reset, but don't free mem etc. just set m_length to 0
		nameBuf.reset();

		// get its full compound name like "meta.twitter.title"
		ji->getCompoundName ( nameBuf );

		// skip if not for us
		if ( strcmp(nameBuf.getBufStart(),qs) ) continue;

		//
		// now Json.cpp decodes and stores the value into
		// a buffer, so ji->getValue() should be decoded completely
		//
		int32_t vlen;
		char *val = ji->getValueAsString( &vlen );

		// hash it to match it if caller specified a particular hash
		// because they are coming from Msg40::lookUpFacets() function
		// to convert the hashes to strings, like for rendering in
		// the facets box to the left of the search results
		FacetValHash_t val32 = hash32 ( val , vlen );
		if ( fvh && val32 != fvh )
			continue;

		// otherwise add facet FIELD to our buf
		if ( ! sb->safeStrcpy(qs) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// then add facet VALUE
		if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
			return false;

		if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
		if ( ! sb->pushChar('\0') ) return false;

		// if we wanted a specific string, we are done
		if ( fvh ) return true;
	}

	return true;
}
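
// Editor's illustration (assumed example, not from the original source): for
// qs = "product.offerPrice" and a diffbot-style JSON doc like
//
//   { "product" : { "title" : "Widget" , "offerPrice" : "$1.50" } }
//
// the item with compound name "product.offerPrice" matches, so
// "product.offerPrice\0" and "<hash32>,$1.50\0" are appended to "sb".
// Only number and string items are considered.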