//-*- coding: utf-8 -*-
#include "gb-include.h"
#include "hash.h"
#include "XmlDoc.h"
#include "Indexdb.h" // for TERMID_MASK definition and g_indexdb.getTermId()
#include "Conf.h"
#include "Query.h" // getFieldCode()
#include "Clusterdb.h" // g_clusterdb
#include "Categories.h" // g_categories
#include "iana_charset.h"
//#include "Checksumdb.h"
//#include "Msg24.h"
#include "Stats.h"
#include "Sanity.h"
#include "Speller.h"
#include "CountryCode.h"
//#include "SiteBonus.h"
#include "linkspam.h"
#include "Tagdb.h"
//#include "Dates.h"
#include "Repair.h"
//#include "Links.h"
#include "HashTableX.h"
#include "LanguageIdentifier.h" // g_langId
#include "CountryCode.h" // g_countryCode
#include "sort.h"
#include "Wiki.h"
#include "Speller.h"
#include "SiteGetter.h"
#include "Placedb.h"
#include "Test.h"
#include "Synonyms.h"
//#include "Revdb.h"
#include "Timedb.h"
#ifdef _USETURKS_
//#include "PageTurk.h"
#endif
#include "PageInject.h"
#include "HttpServer.h"
#include "Facebook.h"
#include "Posdb.h"
#include "Highlight.h"
#include "Wiktionary.h"
#include "seo.h" // Msg99Request etc.
//#include <regex.h>
#include "PingServer.h"
#include "Parms.h"
extern int g_inMemcpy;
//#define MAXDOCLEN (1024*1024 * 5)
//#define MAXDOCLEN (1024*1024)
HashTableX *g_ct = NULL;
XmlDoc *g_doc = NULL;
char *g_ptr = NULL;
int32_t *g_int32_t = NULL;
#define SENT_UNITS 30
static int32_t getIsContacty ( Url *url ,
LinkInfo *info1 ,
int32_t hops ,
uint8_t ct ,
bool isRoot ,
int32_t niceness );
static int32_t getTopGigabits ( HashTableX *ht ,
GigabitInfo **top ,
int32_t max ,
int32_t minDocCount ) ;
static void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
int64_t wid1 ,
int64_t pid2 ,
int64_t wid2 , // post word
float *ww ,
HashTableX *tt1 ,
int32_t titleRecVersion ) ;
static bool addGigabit ( HashTableX *ht ,
char *s ,
int32_t slen ,
int64_t docId ,
Section *sp ,
bool singleWord ,
uint8_t langId ,
// starts with word #i
int32_t i ,
int32_t ptsArg = -1 ) ;
static bool getWordPosVec ( Words *words ,
Sections *sections,
//int32_t wordStart,
//int32_t wordEnd,
int32_t startDist,
char *fragVec,
int32_t niceness ,
SafeBuf *wpos ) ;
static void getMetaListWrapper ( void *state ) ;
char *getFirstJSONObject ( char *p ,
int32_t niceness ,
bool *isProduct ,
bool *isImage ) ;
char *getJSONObjectEnd ( char *p , int32_t niceness ) ;
void doneReadingArchiveFileWrapper ( int fd, void *state );
XmlDoc::XmlDoc() {
m_readThreadOut = false;
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) m_msg7s[i] = NULL;
m_esbuf.setLabel("exputfbuf");
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
m_freed = false;
m_contentInjected = false;
m_wasContentInjected = false;
// warc parsing stuff
m_msg7 = NULL;
m_warcError = 0;
m_arcError = 0;
m_doneInjectingWarc = false;
m_numInjectionsOut = 0;
m_fptr = NULL;
m_fptrEnd = NULL;
m_fileBuf = NULL;
m_warcContentPtr = NULL;
m_calledWgetThread = false;
//m_coll = NULL;
m_ubuf = NULL;
m_pbuf = NULL;
//m_contactDoc = NULL;
m_rootDoc = NULL;
m_oldDoc = NULL;
m_dx = NULL;
m_printedMenu = false;
// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;
memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p
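// note: this relies on every m_*Valid flag being declared contiguously
// between the m_VALIDSTART and m_VALIDEND marker members in XmlDoc.h,
// so a single memset() clears them all at once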
m_msg22Request.m_inUse = 0;
m_msg4Waiting = false;
m_msg4Launched = false;
//m_sectiondbData = NULL;
//m_placedbData = NULL;
m_dupTrPtr = NULL;
m_oldTitleRec = NULL;
m_filteredContent = NULL;
m_filteredContentAllocSize = 0;
m_metaList = NULL;
m_metaListSize = 0;
m_metaListAllocSize = 0;
//m_titleRec = NULL;
//m_freeTitleRec = true;
m_rootTitleRec = NULL;
m_outlinkHopCountVector = NULL;
//m_gsbuf = NULL;
m_extraDoc = NULL;
m_ahrefsDoc = NULL;
m_wikiqbuf = NULL;
//m_cr = NULL;
//m_msg3aArray = NULL;
m_msg3a = NULL;
m_query3a = NULL;
//m_numMsg99Replies = 0;
m_numMsg95Replies = 0;
m_seoSocket = NULL;
m_hackSocket = NULL;
m_doingSEO = false;
//m_newxd = NULL;
//m_newxd2 = NULL;
//m_newMsg20 = NULL;
m_registeredSocketCallback = false;
//m_numMsg98Requests = 0;
//m_numMsg98Replies = 0;
m_numMsg8eReplies = 0;
m_numMsg8eRequests = 0;
m_tempMsg25Page = NULL;
m_tempMsg25Site = NULL;
m_numLinkRequestsOut = 0;
m_numLinkRequestsIn = 0;
m_numMsg3fReplies = 0;
m_numMsg3fRequests = 0;
m_numMsg4fRequests = 0;
m_numMsg4fReplies = 0;
m_sentMsg4fRequests = false;
//m_notifyBlocked = 0;
//m_mcasts = NULL;
//for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ )
// m_currentBinPtrs[i] = NULL;
m_registeredWgetReadCallback = false;
m_pipe = NULL;
reset();
};
XmlDoc::~XmlDoc() {
setStatus("freeing this xmldoc");
reset();
m_freed = true;
};
static int64_t s_lastTimeStart = 0LL;
// for debugging
class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
m_zeroedOut = false;
m_oldDocExistedButHadError = false;
m_addedStatusDocId = 0;
if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
mfree ( m_diffbotProxyReply , sizeof(ProxyReply) , "dprox" );
m_diffbotProxyReply = NULL;
}
if ( m_readThreadOut )
log("build: deleting xmldoc class that has a read thread out "
"on a warc file");
if ( m_fileValid ) {
m_file.close();
m_file.unlink();
}
if ( m_fileBuf )
mfree ( m_fileBuf , m_fileBufAllocSize , "fbdd");
for ( int i = 0 ; i < MAXMSG7S ; i++ ) {
Msg7 *msg7 = m_msg7s[i];
if ( ! msg7 ) continue;
if(msg7->m_inUse) {
log("build: archive: resetting xmldoc when msg7s are outstanding");
}
mdelete ( msg7 , sizeof(Msg7) , "xdmsg7" );
delete ( msg7 );
m_msg7s[i] = NULL;
}
if ( m_msg7 ) {
mdelete ( m_msg7 , sizeof(Msg7) , "xdmsg7" );
delete ( m_msg7 );
m_msg7 = NULL;
}
m_warcContentPtr = NULL;
m_arcContentPtr = NULL;
m_anyContentPtr = NULL;
m_savedChar = '\0';
m_contentDelim = NULL;
if(m_registeredWgetReadCallback && m_pipe) {
log("build: unregistering wget read callback on reset");
g_loop.unregisterReadCallback( fileno(m_pipe), this,doneReadingArchiveFileWrapper);
m_registeredWgetReadCallback = false;
}
if(m_pipe) {
int32_t retCode = fclose(m_pipe);
if ( retCode != 0 )
log("build: error closing warc pipe on reset: %s",
mstrerror(errno));
m_pipe = NULL;
}
m_redirUrl.reset();
m_updatedMetaData = false;
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
m_isImporting = false;
m_printedMenu = false;
// for hashing CT_STATUS docs consistently, this might be invalid
// so call it 0
m_pubDate = 0;
m_tmpBuf2.purge();
m_gotFacets = false;
m_bodyStartPos = 0;
m_mcastArray = NULL;
m_skipIframeExpansion = false;
m_indexedTime = 0;
m_didDelete = false;
m_metaList2.purge();
m_zbuf.purge();
m_kbuf.purge();
m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();
m_myTempLinkInfoBuf.purge();
// reset count for nukeJSONObjects() function
m_joc = 0;
// notifications pending?
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
m_sentToDiffbot = 0;
m_gotDiffbotSuccessfulReply = 0;
// we need to reset this to false
m_useTimeAxis = false;
m_sentToDiffbotThisTime = false;
m_loaded = false;
m_msg4Launched = false;
m_diffbotReplyError = 0;
m_diffbotJSONCount = 0;
//m_downloadAttempted = false;
m_incrementedAttemptsCount = false;
m_incrementedDownloadCount = false;
if ( m_dx ) {
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
delete ( m_dx );
m_dx = NULL;
//log("diffbot: deleting m_dx2");
}
m_isDiffbotJSONObject = false;
m_dmozBuf.purge();
m_fakeIpBuf.purge();
m_fakeTagRecPtrBuf.purge();
m_tlbufTimer = 0LL;
m_gsbuf.reset();
//m_launchedAll = false;
m_qstringTable.reset();
//m_setForReplyPtrs = false;
//m_setForLinkPtrs = false;
// must be none outstanding
if ( m_numMsg3fReplies != m_numMsg3fRequests ) { char *xx=NULL;*xx=0;}
if ( m_numMsg4fReplies != m_numMsg4fRequests ) { char *xx=NULL;*xx=0;}
m_numMsg4fRequests = 0;
m_numMsg4fReplies = 0;
m_sentMsg4fRequests = false;
// free table's mem if used
//m_tmpDupTable.reset();
//m_newxd2Blocked = false;
m_lastPrintedDocId = 0LL;
m_loggedMsg3 = false;
m_progressBar = 0;
m_triedToAddWordPosInfoToCachedb = false;
if ( m_numLinkRequestsOut > m_numLinkRequestsIn ){char *xx=NULL;*xx=0;}
m_doConsistencyTesting = g_conf.m_doConsistencyTesting;
m_computedMetaListCheckSum = false;
m_msg3aErrno = 0;
m_hadMatchError = 0;
m_clientClosed = false;
m_lastCheckTime = 0;
m_calledMsg25ForSite = false;
m_calledMsg25ForPage = false;
m_checkedCachedbForSite = false;
m_checkedCachedbForPage = false;
m_allHashed = false;
// nuke it
if ( m_tempMsg25Page ) {
mdelete ( m_tempMsg25Page , sizeof(Msg25), "m25li" );
delete ( m_tempMsg25Page );
m_tempMsg25Page = NULL;
}
if ( m_tempMsg25Site ) {
mdelete ( m_tempMsg25Site , sizeof(Msg25), "m25li" );
delete ( m_tempMsg25Site );
m_tempMsg25Site = NULL;
}
m_numLinkRequestsOut = 0;
m_seoDebug = 0;
//m_seoInfoSetFromCache = false;
m_checkedCachedb = false;
m_processedCachedbReply = false;
m_cacheList.freeList();
for ( int32_t i = 0; m_numMsg8eReplies && i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg8eReply[i] ) continue;
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
m_msg8eReply[i] = NULL;
}
m_numMsg8eRequests = 0;
m_numMsg8eReplies = 0;
for ( int32_t i = 0; m_numMsg95Replies && i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg95ReplyPtrs[i] ) continue;
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
m_msg95ReplyPtrs[i] = NULL;
}
m_numMsg95Replies = 0;
m_numMsg3fRequests = 0;
m_numMsg3fReplies = 0;
m_qcursor = 0;
//m_binError = 0;
//m_msg98ReplyError = 0;
//m_binErrorForReplyPtrs = 0;
//m_binErrorForLinkPtrs = 0;
//m_msg17.reset();
//m_triedCache = false;
//m_cacheRec = NULL;
//m_cacheRecSize = 0;
// reset this crap
m_beginTimeAllMatch = 0LL;
m_beginTimeMatchUrl = 0LL;
m_beginTimeFullQueries = 0LL;
m_beginTimeLinks = 0LL;
//m_beginMsg98s = 0LL;
m_beginRelatedQueries = 0LL;
m_doledbKey.n0 = 0LL;
m_doledbKey.n1 = 0;
// sanity check, any outstanding?
//if( m_numMsg98Requests != m_numMsg98Replies ) { char *xx=NULL;*xx=0;}
// reset them now
//m_numMsg98Requests = 0;
//m_numMsg98Replies = 0;
//if ( m_newxd ) {
// mdelete ( m_newxd , sizeof(XmlDoc),"newxd");
// delete ( m_newxd );
// m_newxd = NULL;
//}
//if ( m_newxd2 ) {
// mdelete ( m_newxd2 , sizeof(XmlDoc),"newxd2");
// delete ( m_newxd2 );
// m_newxd2 = NULL;
//}
/*
if ( m_newMsg20 ) {
mdelete ( m_newMsg20 , sizeof(Msg20),"newmsg20");
delete ( m_newMsg20 );
m_newMsg20 = NULL;
}*/
/*
NO! we use this for clientClosedConnection() function now
if ( m_seoSocket ) {
TcpServer *tcp = m_seoSocket->m_this;
// gotta set this so it can be destroyed and closed
m_seoSocket->m_waitingOnHandler = false;
tcp->destroySocket ( m_seoSocket );
m_seoSocket = NULL;
}
*/
if ( m_registeredSocketCallback ) { char *xx=NULL; *xx=0; }
//for ( int32_t i = 0 ; i < m_numMsg99Replies ; i++ ) {
// if ( ! m_msg99ReplyPtrs[i] ) continue;
// mfree ( m_msg99ReplyPtrs [i] ,
// m_msg99ReplyAlloc[i] ,
// "m99reply" );
//}
//m_numMsg99Replies = 0;
//m_sentMsg99Requests = false;
if ( m_msg3a ) {
mdelete ( m_msg3a , sizeof(Msg3a) , "xdmsg3a" );
delete ( m_msg3a );
m_msg3a = NULL;
}
if ( m_query3a ) {
mdelete ( m_query3a , sizeof(Query),"xdqry3a");
delete ( m_query3a );
m_query3a = NULL;
}
m_surroundingTextBuf.purge();
m_rssItemBuf.purge();
//m_twbuf.purge();
m_topMatchingQueryBuf.purge();
//m_queryPtrs.purge();
m_queryOffsets.purge();
m_extraQueryBuf.purge();
//m_socketWriteBuf.purge();
m_relatedDocIdBuf.purge();
m_relatedTitleBuf.purge();
m_commonQueryNumBuf.purge();
m_queryLinkBuf.purge();
//m_relatedQueryLinksIntersected.purge();
m_queryLinkStringBuf.purge();
//m_queryRelBuf.purge();
//m_relPtrs.purge();
m_sortedPosdbListBuf.purge();
m_wpSortedPosdbListBuf.purge();
m_termListBuf.purge();
m_insertableTermsBuf.purge();
//m_iwfiBuf.purge();
m_wordPosInfoBuf.purge();
//m_msg20ReplyPtrBuf.purge();
m_recommendedLinksBuf.purge();
m_tmpMsg0Buf.purge();
m_msg20Array.purge();
m_newLinkerBuf.purge();
//m_msg99ReplyBuf.purge();
m_matchingQueryBuf.purge();
m_relatedQueryBuf.purge();
m_queryLinkBuf.purge();
m_matchingQueryStringBuf.purge();
m_relatedQueryStringBuf.purge();
m_queryLinkStringBuf.purge();
m_docIdListBuf.purge();
m_queryChangeBuf.purge();
m_queryLogBuf.purge();
//m_itStrBuf.purge();
m_debugScoreInfoBuf.purge();
m_origScoreInfoBuf.purge();
m_msg20Buf.purge();
m_topDocIdsBuf.purge();
m_missingTermBuf.purge();
m_termInfoBuf.purge();
m_newTermInfoBuf.purge();
m_matchingTermBuf.purge();
m_termId32Buf.purge();
m_storeList.freeList();
//m_queryHashTable.reset();
m_tidTable32.reset();
m_queryOffsetTable.reset();
m_tmpTable.reset();
m_fullQueryDedup.reset();
//m_dupVotes.reset();
m_wordSpamBuf.purge();
m_fragBuf.purge();
m_downloadLevel = 0;
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) {
if ( ! m_xmlDocs[i] ) continue;
mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
delete ( m_xmlDocs[i] );
m_xmlDocs[i] = NULL;
}
s_lastTimeStart = 0LL;
m_req = NULL;
m_doneWithAhrefs = false;
m_useAhrefs = false;
m_linkDedupTablePtr = NULL;
m_domDedupTablePtr = NULL;
m_storeTermListInfo = false;
m_gotDupStats = false;
//m_nextSection = (Section *)-1;
m_si = (Section *)-1;
// for limiting # of iframe tag expansions
m_numExpansions = 0;
// . are not allowed to exit if waiting for msg4 to complete
// . yes we are, it should be saved as addsinprogress.dat
if ( m_msg4Waiting ) {
log("doc: resetting xmldoc with outstanding msg4. should "
"be saved in addsinprogress.dat. docid=%"UINT64"",m_docId);
//char *xx=NULL;*xx=0; }
}
m_ei = 0;
m_lastLaunch = -1;
m_pbuf = NULL;
m_wts = NULL;
m_deleteFromIndex = false;
//if ( m_contactDocValid ) nukeDoc ( m_contactDoc );
if ( m_rootDocValid ) nukeDoc ( m_rootDoc );
if ( m_oldDocValid ) nukeDoc ( m_oldDoc );
if ( m_extraDocValid ) nukeDoc ( m_extraDoc );
if ( m_ahrefsDocValid ) nukeDoc ( m_ahrefsDoc );
if ( m_linkInfo1Valid && ptr_linkInfo1 && m_freeLinkInfo1 ) {
// it now points into m_myPageLinkInfoBuf !
//mfree ( ptr_linkInfo1 , size_linkInfo1, "LinkInfo1");
ptr_linkInfo1 = NULL;
m_linkInfo1Valid = false;
}
if ( m_linkInfo2Valid && ptr_linkInfo2 && m_freeLinkInfo2 ) {
// should point into a safebuf as well
//mfree ( ptr_linkInfo2 , size_linkInfo2, "LinkInfo2");
ptr_linkInfo2 = NULL;
m_linkInfo2Valid = false;
}
if ( m_rawUtf8ContentValid && m_rawUtf8Content && !m_setFromTitleRec
// was content supplied by pageInject.cpp?
//! m_contentInjected ) {
) {
mfree ( m_rawUtf8Content, m_rawUtf8ContentAllocSize,"Xml3");
}
// reset this
m_contentInjected = false;
m_rawUtf8ContentValid = false;
m_wasContentInjected = false;
m_rootDoc = NULL;
// if this is true, then only index if new
m_newOnly = 0;
//if ( m_sectiondbData ) {
// mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" );
// m_sectiondbData = NULL;
//}
//if ( m_placedbData ) {
// mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" );
// m_placedbData = NULL;
//}
if ( m_httpReplyValid && m_httpReply ) {
mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
m_httpReply = NULL;
m_httpReplyValid = false;
}
if ( m_filteredContentAllocSize ) {
mfree (m_filteredContent,m_filteredContentAllocSize,"xdfc");
m_filteredContent = NULL;
m_filteredContentAllocSize = 0;
}
//if ( m_utf8ContentValid && ! m_setFromTitleRec && ptr_utf8Content )
// mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3");
if ( m_metaList ) { // m_metaListValid && m_metaList ) {
mfree ( m_metaList , m_metaListAllocSize , "metalist");
m_metaList = NULL;
m_metaListSize = 0;
m_metaListAllocSize = 0;
}
if ( m_ubuf ) {
// log("xmldoc: delete m_ubuf=%"PTRFMT" this=%"PTRFMT
// , (PTRTYPE) m_ubuf
// , (PTRTYPE) this
// );
mfree ( m_ubuf , m_ubufAlloc , "ubuf");
m_ubuf = NULL;
}
//if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) {
// mfree ( m_titleRec , m_titleRecAllocSize , "trec" );
//}
//m_titleRec = NULL;
m_titleRecBuf.purge();
if ( m_dupTrPtr ) {
mfree ( m_dupTrPtr , m_dupTrSize , "trecd" );
m_dupTrPtr = NULL;
}
if ( m_oldTitleRecValid && m_oldTitleRec ) {
mfree ( m_oldTitleRec , m_oldTitleRecSize , "treca" );
m_oldTitleRec = NULL;
m_oldTitleRecValid = false;
}
if ( m_rootTitleRecValid && m_rootTitleRec ) {
mfree ( m_rootTitleRec , m_rootTitleRecSize , "treca" );
m_rootTitleRec = NULL;
m_rootTitleRecValid = false;
}
if ( m_outlinkHopCountVectorValid && m_outlinkHopCountVector ) {
int32_t sz = m_outlinkHopCountVectorSize;
mfree ( m_outlinkHopCountVector,sz,"ohv");
}
m_outlinkHopCountVector = NULL;
//if ( m_gsbufValid && m_gsbuf ) {
// mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" );
//}
//m_gsbuf = NULL;
m_gsbuf.reset();
// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;
memset ( p , 0 , (char *)pend - (char *)p );
m_hashedMetas = false;
m_mcastBuf.purge();
m_serpBuf.purge();
// Doc.cpp:
m_mime.reset();
m_words.reset();
m_phrases.reset();
m_bits.reset();
m_sections.reset();
//m_weights.reset();
m_countTable.reset();
m_dates.reset();
m_addresses.reset();
// other crap
m_xml.reset();
m_links.reset();
m_bits2.reset();
m_pos.reset();
//m_synonyms.reset();
m_synBuf.reset();
//m_nsvt.reset();
//m_osvt.reset();
m_turkVotingTable.reset();
m_turkBitsTable.reset();
m_vtr.reset();
m_rdtab.reset();
m_vctab.reset();
m_vcduptab.reset();
m_images.reset();
m_countTable.reset();
m_mime.reset();
m_tagRec.reset();
m_newTagBuf.reset();
m_catRec.reset();
//m_clockCandidatesTable.reset();
//m_cctbuf.reset();
m_dupList.reset();
//m_oldMetaList.reset();
m_msg8a.reset();
//m_siteLinkInfo.reset();
//m_msg25.reset();
//m_msgc.reset();
m_msg13.reset();
m_tmpsb1.reset();
m_tmpsb2.reset();
m_turkBuf.reset();
m_msg0b.reset();
//m_siteGetter.reset();
m_msge0.reset();
m_msge1.reset();
m_reply.reset();
// more stuff skipped
m_wtsTable.reset();
m_wbuf.reset();
m_pageLinkBuf.reset();
m_siteLinkBuf.reset();
m_esbuf.reset();
m_xbuf.reset();
m_tagRecBuf.reset();
//m_titleRec = NULL;
//m_titleRecSize = 0;
// origin of this XmlDoc
m_setFromTitleRec = false;
m_setFromUrl = false;
m_setFromDocId = false;
m_setFromSpiderRec = false;
m_freeLinkInfo1 = false;
m_freeLinkInfo2 = false;
m_checkedUrlFilters = false;
m_indexCode = 0;
m_masterLoop = NULL;
m_masterState = NULL;
//m_isAddUrl = false;
m_isInjecting = false;
m_useFakeMime = false;
m_useSiteLinkBuf = false;
m_usePageLinkBuf = false;
m_printInXml = false;
m_check1 = false;
m_check2 = false;
m_prepared = false;
// keep track of updates to the rdbs we have done, so we do not re-do
m_listAdded = false;
m_listFlushed = false;
m_updatedCounts = false;
m_updatedCounts2 = false;
//m_updatedTagdb1 = false;
//m_updatedTagdb2 = false;
//m_updatedTagdb3 = false;
//m_updatedTagdb4 = false;
//m_updatedTagdb5 = false;
m_copied1 = false;
m_updatingSiteLinkInfoTags = false;
m_addressSetCalled = false;
m_hashedTitle = false;
m_registeredSleepCallback = false;
m_addedNegativeDoledbRec = false;
m_numRedirects = 0;
m_numOutlinksAdded = 0;
// . use sameDomain and sameIp waits?
// . these may be bypassed in getContactDoc()
//m_throttleDownload = true;
m_spamCheckDisabled = false;
m_useRobotsTxt = true;
m_redirectFlag = false;
// Scraper.cpp sets this to true
//m_isScraping = false;
m_allowSimplifiedRedirs = false;
//m_calledMsg22a = false;
//m_calledMsg22b = false;
//m_calledMsg22c = false;
m_didDelay = false;
m_didDelayUnregister = false;
m_calledMsg22d = 0LL;
m_calledMsg22e = false;
m_calledMsg22f = false;
m_calledMsg25 = false;
m_calledMsg25b = false;
m_calledMsg40 = false;
m_calledSections = false;
m_calledThread = false;
m_alreadyRegistered = false;
m_loaded = false;
m_firstEntry = true;
m_firstEntry2 = true;
m_launchedSpecialMsg8a = false;
m_launchedMsg8a2 = false;
m_numSectiondbReads = 0;
m_numSectiondbNeeds = 0;
m_sectiondbRecall = 0;
//m_triedVoteCache = false;
//m_storedVoteCache = false;
m_setTr = false;
//m_checkedRobots = false;
m_triedTagRec = false;
m_didGatewayPage = false;
m_didQuickDupCheck = false;
m_calledMsg8b = false;
m_recycleContent = false;
//m_loadFromOldTitleRec = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;
// used for getHasContactInfo()
m_processed0 = false;
m_hasContactInfo = false;
m_hasContactInfo2 = false;
//m_checkForRedir = true;
m_processedLang = false;
m_doingConsistencyCheck = false;
// used for getting contact info
//m_triedRoot = false;
//m_winner = -2;
// tell Msg13 to just call HttpServer::getDoc() and not to forward
// the download request to another host. although this does not
// exclude forwarding it to a compression proxy if
// g_conf.m_useCompressionProxy is set
m_forwardDownloadRequest = false;
m_isChildDoc = false;
m_parentDocPtr = NULL;
// for utf8 content functions
m_savedp = NULL;
m_oldp = NULL;
m_didExpansion = false;
// Repair.cpp now explicitly sets these to false if it needs to
m_usePosdb = true;
//m_useDatedb = true;
m_useClusterdb = true;
m_useLinkdb = true;
m_useSpiderdb = true;
m_useTitledb = true;
m_useTagdb = true;
m_usePlacedb = true;
//m_useTimedb = true;
// only use for custom crawls for now to save disk space
m_useSectiondb = false;
//m_useRevdb = true;
m_useSecondaryRdbs = false;
//m_useIpsTxtFile = true;
// used by Msg13.cpp only. kinda a hack.
m_isSpiderProxy = false;
// do not cache the http reply in msg13 etc.
m_maxCacheAge = 0;
// reset these ptrs too!
void *px = &ptr_firstUrl;
void *pxend = &size_firstUrl;
memset ( px , 0 , (char *)pxend - (char *)px );
m_hasMetadata = false;
ptr_metadata = NULL;
size_metadata = 0;
}
// . set the url with the intention of adding it or deleting it from the index
// . Msg7 and Repair.cpp can also set other members of XmlDoc rather than just
// m_firstUrl. they can provide the ip, the http reply, content, filtered
// content, the forced next spider time, the forced first indexed date,
// and the hop count
// . they might also want to skip deduping, or any algo deemed unnecessary
// by setting, for instance, m_isDupValid = true, or something
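// . example usage (an illustrative sketch only; the url, collection name
//   and niceness value below are made up):
//
//     XmlDoc xd;
//     if ( ! xd.set1 ( "http://www.example.com/" , "main" , NULL , 1 ) )
//             log("build: set1 failed: %s",mstrerror(g_errno));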
bool XmlDoc::set1 ( char *url ,
char *coll ,
SafeBuf *pbuf ,
int32_t niceness ) {
reset();
// this is true
m_setFromUrl = true;
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
m_version = TITLEREC_CURRENT_VERSION;
m_versionValid = true;
// sanity check
if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
// copy this in case collection gets deleted i guess...
//m_forceDelete = forceDelete;
// did we get this url from PageAddUrl?
//m_isAddUrl = isAddUrl;
// set m_indexCode so that XmlDoc::indexDoc() will delete it
//if ( forceDelete ) m_indexCode = EDOCFORCEDELETE;
// set this important member var
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
//if ( ! cr ) return false;
if ( ! setCollNum ( coll ) ) return false;
setFirstUrl ( url , false );
//setSpideredTime();
return true;
}
char *XmlDoc::getTestDir ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// return NULL if we are not the "qatest123" collection
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
//if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
// return "qa";//"test-spider";
// ... default to "test-parser"
//return "test-parser";
return "qa";
/*
if ( getIsPageParser() )
return "test-page-parser";
//if ( m_sreqValid && m_sreq.m_isInjecting )
// return "test-page-inject";
else if ( g_conf.m_testParserEnabled )
return "test-parser";
else if ( g_conf.m_testSpiderEnabled )
return "test-spider";
// default to being from PageInject
return "test-page-inject";
*/
//else { char *xx=NULL;*xx=0; }
//return NULL;
}
int32_t XmlDoc::getSpideredTime ( ) {
// stop if already set
if ( m_spideredTimeValid ) return m_spideredTime;
// tmp var
int32_t date = 0;
CollectionRec *cr = getCollRec();
if ( ! cr ) return 0;
// if not test collection keep it simple
if ( strcmp(cr->m_coll,"qatest123") || cr->m_useTimeAxis) {
// . set spider time to current time
// . this might already be valid if we set it in
// getTestSpideredDate()
m_spideredTime = getTimeGlobal();
m_spideredTimeValid = true;
return m_spideredTime;
}
char *testDir = getTestDir();
// get url
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) { char *xx=NULL;*xx=0; }
// this returns false if not in there, in which case, add it
if ( ! getTestSpideredDate(cu,&date,testDir) ) {
m_spideredTime = getTimeGlobal();
m_spideredTimeValid = true;
addTestSpideredDate ( cu , m_spideredTime , testDir );
return m_spideredTime;
}
// if we are injecting into the test coll for the 2nd+ time
// we need to use the spidered date from the first time we
// injected the doc in order to ensure things are parsed
// exactly the same way since some things depend on the
// spideredTime, like Dates (for setting "in future"
// flags)
m_spideredTimeValid = true;
m_spideredTime = date;
// hack for test coll which has fake vals for these because
// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
//m_minPubDate = m_spideredTime - 48*3600;
//m_maxPubDate = m_spideredTime - 24*3600;
return m_spideredTime;
}
// . we need this so PageGet.cpp can get the cached web page
// . but not for Msg20::getSummary(), that uses XmlDoc::set(Msg20Request*)
// . returns false and sets g_errno on error
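// . example usage (an illustrative sketch; the docid and collection name
//   are made-up values):
//
//     XmlDoc xd;
//     if ( ! xd.set3 ( 123456789LL , "main" , 1 ) )
//             log("db: set3 failed: %s",mstrerror(g_errno));
//     // a caller can then call loadFromOldTitleRec() to load the doc
//     // from its titledb record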
bool XmlDoc::set3 ( int64_t docId ,
char *coll ,
int32_t niceness ) {
reset();
// this is true
m_setFromDocId = true;
m_docId = docId;
m_docIdValid = true;
//m_coll = coll;
m_niceness = niceness;
// . sanity check
// . why can't we allow this??? MDW
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
// set this important member var
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
//if ( ! cr ) { m_errno = ENOCOLLREC; return false; }
if ( ! setCollNum ( coll ) ) return false;
// solidify some parms
//m_eliminateMenus = cr->m_eliminateMenus;
//m_eliminateMenusValid = true;
return true;
}
void loadFromOldTitleRecWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure it has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "loading from old title rec wrapper" );
// return if it blocked
if ( ! THIS->loadFromOldTitleRec ( ) ) return;
char *coll = "";
CollectionRec *cr = THIS->getCollRec();
if ( cr ) coll = cr->m_coll;
// error?
if ( g_errno ) log("doc: loadfromtitlerec coll=%s: %s",
coll,
mstrerror(g_errno));
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// returns false if blocked, returns true and sets g_errno on error otherwise
bool XmlDoc::loadFromOldTitleRec ( ) {
// . we are an entry point.
// . if anything blocks, this will be called when it comes back
if ( ! m_masterLoop ) {
m_masterLoop = loadFromOldTitleRecWrapper;
m_masterState = this;
}
// if we already loaded!
if ( m_loaded ) return true;
// if set from a docid, use msg22 for this!
char **otr = getOldTitleRec ( );
// error?
if ( ! otr ) return true;
// blocked?
if ( otr == (void *)-1 ) return false;
// this is a not found
if ( ! *otr ) {
// so we do not retry
m_loaded = true;
// make it an error
g_errno = ENOTFOUND;
return true;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// use that. decompress it! this will also set
// m_setFromTitleRec to true
if ( ! set2 ( m_oldTitleRec ,
m_oldTitleRecSize , // maxSize
cr->m_coll ,
NULL , // pbuf
m_niceness )) {
// we are now loaded, do not re-call
m_loaded = true;
// return true with g_errno set on error uncompressing
return true;
}
// we are now loaded, do not re-call
m_loaded = true;
// sanity check
if ( ! m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
// good to go
return true;
}
bool XmlDoc::setCollNum ( char *coll ) {
CollectionRec *cr;
cr = g_collectiondb.getRec ( coll , gbstrlen(coll) );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return log("build: collrec not found for %s",coll);
}
// we can store this safely:
m_collnum = cr->m_collnum;
m_collnumValid = true;
// if user "resets" the collection we need to know
m_lastCollRecResetCount = cr->m_lastResetCount;
return true;
}
CollectionRec *XmlDoc::getCollRec ( ) {
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
if ( ! cr ) {
log("build: got NULL collection rec for collnum=%"INT32".",
(int32_t)m_collnum);
g_errno = ENOCOLLREC;
return NULL;
}
// was it reset since we started spidering this url?
// we don't do it this way; when resetting a coll we delete it and
// re-add it under a different collnum to avoid getting msg4 adds to it.
//if ( cr->m_lastResetCount != m_lastCollRecResetCount ) {
// log("build: collection rec was reset. returning null.");
// g_errno = ENOCOLLREC;
// return NULL;
//}
return cr;
}
// returns false and sets g_errno on error
bool XmlDoc::set4 ( SpiderRequest *sreq ,
key_t *doledbKey ,
char *coll ,
SafeBuf *pbuf ,
int32_t niceness ,
char *utf8ContentArg ,
bool deleteFromIndex ,
int32_t forcedIp ,
uint8_t contentType ,
uint32_t spideredTime ,
bool contentHasMimeArg ,
char *contentDelim,
char *metadata ,
uint32_t metadataLen,
int32_t payloadLen
) {
// sanity check
if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; }
reset();
if ( g_conf.m_logDebugSpider )
log("xmldoc: set4 uh48=%"UINT64" parentdocid=%"UINT64"",
sreq->getUrlHash48(),sreq->getParentDocId());
// used by PageSpiderdb.cpp
m_startTime = gettimeofdayInMilliseconds();
m_startTimeValid = true;
// this is true
m_setFromSpiderRec = true;
// did page inject (pageinject) request to delete it?
m_deleteFromIndex = deleteFromIndex;
// PageReindex.cpp will set this in the spider request
if ( sreq->m_forceDelete )
m_deleteFromIndex = true;
// if we are a container doc then we need the content delimiter,
// unless we are a warc or arc, since we already know how those
// delimit their records.
m_contentDelim = contentDelim;
m_contentDelimValid = true;
bool contentHasMime = contentHasMimeArg;
// but if we are a container doc then this parm applies to each subdoc
// not to us, so turn it off for this part.
if ( isContainerDoc() ) {
contentHasMime = false;
m_subDocsHaveMime = contentHasMimeArg;
}
char *utf8Content = utf8ContentArg;
if ( contentHasMime && utf8Content ) {
// get length of it all
int32_t clen = gbstrlen(utf8Content);
// return false on error with g_errno set
if ( ! m_mime.set ( utf8ContentArg , clen , NULL ) ) {
if ( ! g_errno ) g_errno = EBADMIME;
log("xmldoc: could not set mime: %s",
mstrerror(g_errno));
return false;
}
// it's valid
m_mimeValid = true;
// advance
utf8Content = m_mime.getContent();
if(payloadLen != -1) {
payloadLen -= m_mime.getContent() - utf8ContentArg;
}
}
// use this to avoid ip lookup if it is not zero
if ( forcedIp ) {
m_ip = forcedIp;
m_ipValid = true;
}
// sometimes they supply the content they want! like when zaks'
// injects pages from PageInject.cpp
if ( utf8Content ) {
// . this is the most basic content from the http reply
// . only set this since sometimes it is facebook xml and
// contains encoded html which needs to be decoded.
// like <name>Ben &amp; Jerry's</name> otherwise our
// sentence formation stops at the ';' in the "&amp;" and
// we also index "amp" which is bad.
m_content = utf8Content;
if(payloadLen != -1) {
m_contentLen = payloadLen;
}
else if ( m_mimeValid && m_mime.m_contentLen > 0) {
m_contentLen = m_mime.m_contentLen;
} else {
m_contentLen = gbstrlen(utf8Content);
}
m_contentValid = true;
//m_rawUtf8Content = utf8Content;
//m_expandedUtf8Content = utf8Content;
//ptr_utf8Content = utf8Content;
//size_utf8Content = slen+1;
//m_rawUtf8ContentValid = true;
//m_expandedUtf8ContentValid = true;
//m_utf8ContentValid = true;
m_contentInjected = true;
m_wasContentInjected = true;
m_contentType = contentType;
m_contentTypeValid = true;
// use this ip as well for now to avoid ip lookup
//m_ip = atoip("127.0.0.1");
//m_ipValid = true;
// do not need robots.txt then
m_isAllowed = true;
m_isAllowedValid = true;
// nor mime
m_httpStatus = 200;
m_httpStatusValid = true;
// this too
m_downloadStatus = 0;
m_downloadStatusValid = true;
// assume this is the download time since the content
// was pushed/provided to us
if ( spideredTime )
m_downloadEndTime = spideredTime;
else
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
// either way, validate it
m_downloadEndTimeValid = true;
// and need a legit mime
if ( ! m_mimeValid ) {
m_mime.m_bufLen = 1;
m_mimeValid = true;
m_mime.m_contentType = contentType;
}
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// no redir
ptr_redirUrl = NULL;
size_redirUrl = 0;
m_redirUrl.reset();
m_redirUrlPtr = NULL;//&m_redirUrl;
m_redirUrlValid = true;
m_redirErrorValid = true;
m_redirError = 0;
m_crawlDelay = -1;
m_crawlDelayValid = true;
}
// override content type based on mime for application/json
if ( m_mimeValid ) {
m_contentType = m_mime.m_contentType;
m_contentTypeValid = true;
}
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
m_version = TITLEREC_CURRENT_VERSION;
m_versionValid = true;
/*
// set min/max pub dates right away
m_minPubDate = -1;
m_maxPubDate = -1;
// parentPrevSpiderTime is 0 if that was the first time that the
// parent was spidered, in which case isNewOutlink will always be set
// for every outlink it had!
if ( sreq->m_isNewOutlink && sreq->m_parentPrevSpiderTime ) {
// sanity check
if ( ! sreq->m_parentPrevSpiderTime ) {char *xx=NULL;*xx=0;}
// pub date is somewhere between these two times
m_minPubDate = sreq->m_parentPrevSpiderTime;
m_maxPubDate = sreq->m_addedTime;
}
*/
// this is used to remove the rec from doledb after we spider it
m_doledbKey.setMin();
if ( doledbKey ) m_doledbKey = *doledbKey;
// . sanity check
// . we really don't want the parser holding up the query pipeline
// even if this page is being turked!
//if ( m_niceness == 0 &&
// // spider proxy uses xmldoc class to expand iframe tags and
// // sometimes the initiating msg13 class was re-niced to 0
// // in the niceness conversion logic.
// ! g_hostdb.m_myHost->m_isProxy ) {
// char *xx=NULL; *xx=0; }
if ( sreq->isCorrupt(m_collnum) ) {
// set g_errno so callers know why we failed
g_errno = ECORRUPTDATA;
return log("XmlDoc: set4() spider request is corrupt in coll "
"%s u=%s",coll,sreq->m_url);
}
m_sreqValid = true;
// store the whole rec, key+dataSize+data, in case it disappears.
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
// set m_collnum etc.
if ( ! setCollNum ( coll ) )
return log("XmlDoc: set4() coll %s invalid",coll);
// it should be valid since we just set it
CollectionRec *cr = getCollRec();
m_useRobotsTxt = cr->m_useRobotsTxt;
// solidify some parms
//m_eliminateMenus = cr->m_eliminateMenus;
//m_eliminateMenusValid = true;
// validate these here too
/*
m_titleWeight = cr->m_titleWeight;
m_headerWeight = cr->m_headerWeight;
m_urlPathWeight = cr->m_urlPathWeight;
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
m_conceptWeight = cr->m_conceptWeight;
m_titleWeightValid = true;
m_headerWeightValid = true;
m_urlPathWeightValid = true;
m_externalLinkTextWeightValid = true;
m_internalLinkTextWeightValid = true;
m_conceptWeightValid = true;
*/
// fix some corruption i've seen
if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
m_sreq.m_urlIsDocId = 0;
}
// if url is a docid... we are from pagereindex.cpp
//if ( sreq->m_isPageReindex ) {
// now we can have url-based page reindex requests because
// if we have a diffbot json object fake url reindex request
// we add a spider request of the PARENT url for it as page reindex
//if ( is_digit ( sreq->m_url[0] ) ) {
// watch out for 0.r.msn.com!!
if ( m_sreq.m_urlIsDocId ) {
m_docId = atoll(m_sreq.m_url);
// assume it's good
m_docIdValid = true;
// similar to set3() above
m_setFromDocId = true;
// use content and ip from old title rec to save time
// . crap this is making the query reindex not actually
// re-download the content.
// . we already check the m_deleteFromIndex flag below
// in getUtf8Content() and use the old content in that case
// so i'm not sure why we are recycling here, so take
// this out. MDW 9/25/2014.
//m_recycleContent = true;
// sanity
if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; }
}
else {
// add www is now REQUIRED for all!
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
// www.tmblr.co has no IP
setFirstUrl ( m_sreq.m_url , false );//true ); // false );
// you can't call this from a docid based url until you
// know the uh48
//setSpideredTime();
}
// now query reindex can specify a recycle content option so it
// can replace the rebuild tool. try to recycle on global index.
if ( m_sreqValid )
m_recycleContent = m_sreq.m_recycleContent;
m_hasMetadata = (bool)metadata;
ptr_metadata = metadata;
size_metadata = metadataLen;
return true;
}
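// . quick recap of the set*() entry points:
// .   set1() - from a url string (above)
// .   set2() - from a titledb record (below)
// .   set3() - from a docid, so PageGet.cpp can fetch the cached page
// .   set4() - from a SpiderRequest, the spider/inject path (above)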
// . set our stuff from the TitleRec (from titledb)
// . returns false and sets g_errno on error
bool XmlDoc::set2 ( char *titleRec ,
int32_t maxSize ,
char *coll ,
SafeBuf *pbuf ,
int32_t niceness ,
SpiderRequest *sreq ) {
// NO! can't do this. see below
//reset();
setStatus ( "setting xml doc from title rec");
// . it resets us, so save this
// . we only save these for set2() not the other sets()!
//void (*cb1)(void *state) = m_callback1;
//bool (*cb2)(void *state) = m_callback2;
//void *state = m_state;
// . clear it all out
// . no! this is clearing our msg20/msg22 reply...
// . ok, but repair.cpp needs it so do it there then
//reset();
// restore callbacks
//m_callback1 = cb1;
//m_callback2 = cb2;
//m_state = state;
// sanity check - since we do not reset
if ( m_contentValid ) { char *xx=NULL;*xx=0; }
// this is true
m_setFromTitleRec = true;
// this is valid i guess. includes key, etc.
//m_titleRec = titleRec;
//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
//m_titleRecValid = true;
// . should we free m_cbuf on our reset/destruction?
// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
// that should not be freed, besides the alloc size is not known!
//m_freeTitleRec = false;
int32_t titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
// . should we free m_cbuf on our reset/destruction?
// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
// that should not be freed, besides the alloc size is not known!
m_titleRecBuf.setBuf ( titleRec ,
titleRecSize , // bufmax
titleRecSize , // bytes in use
false, // ownData?
csUTF8); // encoding
m_titleRecBufValid = true;
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
// . sanity check
// . NO! could be from XmlDoc::getMsg20Reply()!
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
// it must be there!
if ( !titleRec||titleRecSize==0 ) {g_errno=ENOTFOUND; return false;}
// set our collection number
if ( ! setCollNum ( coll ) ) return false;
// store the whole rec, key+dataSize+data, in case it disappears.
if ( sreq ) {
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
m_sreqValid = true;
}
m_hashedTitle = false;
m_hashedMetas = false;
// save the compressed buffer in case we should free it when done
//m_titleRec = titleRec;
// should we free m_cbuf on our reset/destruction?
//m_freeTitleRec = true;
// our record may not occupy all of m_cbuf, careful
//m_titleRecAllocSize = maxSize;
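// . on-disk titledb record layout, as parsed below:
// .   key_t   key      - the docId is embedded in this key
// .   int32_t dataSize - number of bytes that follow this field
// .   int32_t ubufSize - uncompressed size of the compressed blob
// .   char    cdata[dataSize-4] - compressed header + ptr_*/size_* data
//     (decompressed below with gbuncompress())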
// get a parse ptr
char *p = titleRec ;
// . this is just like a serialized RdbList key/dataSize/data of 1 rec
// . first thing is the key
// . key should have docId embedded in it
m_titleRecKey = *(key_t *) p ;
//m_titleRecKeyValid = true;
p += sizeof(key_t);
// bail on error
if ( (m_titleRecKey.n0 & 0x01) == 0x00 ) {
g_errno = EBADTITLEREC;
log("db: Titledb record is a negative key.");
char *xx=NULL; *xx=0;
return false;
}
// set m_docId from key
m_docId = g_titledb.getDocIdFromKey ( m_titleRecKey );
// validate that
m_docIdValid = true;
// then the size of the data that follows this
int32_t dataSize = *(int32_t *) p ;
p += 4;
// bail on error
if ( dataSize < 4 ) {
g_errno = EBADTITLEREC;
return log("db: Titledb record has size of %"INT32" which "
"is less than 4. Probable disk corruption in a "
"titledb file.",
dataSize);
}
// what is the size of cbuf/titleRec in bytes?
int32_t cbufSize = dataSize + 4 + sizeof(key_t);
// . the actual data follows "dataSize"
// . this is the uncompressed size of the compressed data below here
m_ubufSize = *(int32_t *) p ; p += 4;
// . because of disk/network data corruption this may be wrong!
// . we can now have absolutely huge titlerecs...
if ( m_ubufSize <= 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
g_errno = EBADTITLEREC;
return log("db: TitleRec::set: uncompress uncompressed "
"size=%"INT32".",m_ubufSize );
}
// trying to uncompress corrupt titlerecs sometimes results in
// a seg fault... watch out
if ( m_ubufSize > 100*1024*1024 ) {
g_errno = EBADTITLEREC;
return log("db: TitleRec::set: uncompress uncompressed "
"size=%"INT32" > 100MB. unacceptable, probable "
"corruption.",m_ubufSize );
}
// make buf space for holding the uncompressed stuff
m_ubufAlloc = m_ubufSize;
m_ubuf = (char *) mmalloc ( m_ubufAlloc ,"TitleRecu1");
// log("xmldoc: m_ubuf=%"PTRFMT" this=%"PTRFMT
// , (PTRTYPE) m_ubuf
// , (PTRTYPE) this
// );
if ( ! m_ubuf ) {
// we had bad ubufsizes on gb6, like > 1GB. print out the key
// so we can manually make a titledb.dat file to delete these
// bad keys
log("build: alloc failed ubufsize=%"INT32" key.n1=%"UINT32" "
"n0=%"UINT64,
m_ubufAlloc,m_titleRecKey.n1,m_titleRecKey.n0);
return false;
}
// we need to loop since uncompress is weird, sometimes it needs more
// space than it should. see how much it actually took.
int32_t realSize = m_ubufSize;
// time it
int64_t startTime = gettimeofdayInMilliseconds();
// debug msg
setStatus( "Uncompressing title rec." );
// . uncompress the data into m_ubuf
// . m_ubufSize should remain unchanged since we stored it
int err = gbuncompress ( (unsigned char *) m_ubuf ,
(uint32_t *) &realSize ,
(unsigned char *) p ,
(uint32_t ) (dataSize - 4) );
// hmmmm...
if ( err == Z_BUF_ERROR ) {
log("db: Buffer is too small to hold uncompressed "
"document. Probable disk corruption in a titledb file.");
g_errno = EUNCOMPRESSERROR;
return false;
}
// set g_errno and return false on error
if ( err != Z_OK ) {
g_errno = EUNCOMPRESSERROR;
return log("db: Uncompress of document failed. ZG_ERRNO=%i. "
"cbufSize=%"INT32" ubufsize=%"INT32" realSize=%"INT32"",
err , cbufSize , m_ubufSize , realSize );
}
if ( realSize != m_ubufSize ) {
g_errno = EBADENGINEER;
return log("db: Uncompressed document size is not what we "
"recorded it to be. Probable disk corruption in "
"a titledb file.");
}
// . add the stat
// . use white for the stat
g_stats.addStat_r ( 0 ,
startTime ,
gettimeofdayInMilliseconds(),
0x00ffffff );
// first 2 bytes in m_ubuf is the header size
int32_t headerSize = *(uint16_t *)m_ubuf;
int32_t shouldbe = (char *)&ptr_firstUrl - (char *)&m_headerSize;
if ( headerSize != shouldbe ) {
g_errno = ECORRUPTDATA;
return log("doc: bad header size in title rec");
}
// set our easy stuff
gbmemcpy ( (void *)this , m_ubuf , headerSize );
// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
// like in Msg.cpp and Msg20Reply.cpp
if ( m_pbuf ) {
int32_t crc = hash32(m_ubuf,headerSize);
m_pbuf->safePrintf("crchdr=0x%"XINT32" sizehdr=%"INT32", ",
crc,headerSize);
}
// point to the string data
char *up = m_ubuf + headerSize;
// end of the rec
char *upend = m_ubuf + m_ubufSize;
// how many XmlDoc::ptr_* members do we have? set "np" to that
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
np /= sizeof(char *);
// point to the first ptr
char **pd = (char **)&ptr_firstUrl;
// point to the first size
int32_t *ps = (int32_t *)&size_firstUrl;
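// . each ptr_*/size_* pair was serialized as [int32_t size][data bytes],
//   but only if its bit in m_internalFlags1 is set; pairs whose bit is
//   clear were empty and just get zeroed out here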
// loop over them
for ( int32_t i = 0 ; i < np ; i++ , pd++ , ps++ ) {
// zero out the ith ptr_ and size_ member
*pd = 0;
*ps = 0;
// make the mask
uint32_t mask = 1 << i ;
// do we have this member? skip if not.
if ( ! (m_internalFlags1 & mask) ) continue;
// watch out for corruption
if ( up > upend ) {
g_errno = ECORRUPTDATA;
return log("doc: corrupt titlerec.");
}
// get the size
*ps = *(int32_t *)up;
// this should never be 0, otherwise, why was its flag set?
if ( *ps <= 0 ) { char *xx=NULL;*xx=0; }
// skip over to point to data
up += 4;
// point to the data. could be 64-bit ptr.
*pd = up;//(int32_t)up;
// debug
if ( m_pbuf ) {
int32_t crc = hash32(up,*ps);
m_pbuf->safePrintf("crc%"INT32"=0x%"XINT32" size%"INT32"=%"INT32", ",
i,crc,i,*ps);
}
// skip over data
up += *ps;
// watch out for corruption
if ( up > upend ) {
g_errno = ECORRUPTDATA;
return log("doc: corrupt titlerec.");
}
}
// cap it
char *pend = m_ubuf + m_ubufSize;
// sanity check. must match exactly.
if ( up != pend ) { char *xx=NULL;*xx=0; }
// set the urls i guess
m_firstUrl.set ( ptr_firstUrl );
if ( ptr_redirUrl ) {
m_redirUrl.set ( ptr_redirUrl );
m_currentUrl.set ( ptr_redirUrl );
m_currentUrlValid = true;
m_redirUrlPtr = &m_redirUrl;
}
else {
m_currentUrl.set ( ptr_firstUrl );
m_currentUrlValid = true;
m_redirUrlPtr = NULL;
}
m_firstUrlValid = true;
m_redirUrlValid = true;
// convert 8 bit to a 32 bit
//m_numBannedOutlinks = score8to32 ( m_numBannedOutlinks8 );
// validate *shadow* members since bit flags cannot be returned
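// (they are packed bit fields in the serialized header, and C++ does not
// allow taking the address of or a reference to a bit field, hence the
// full-width *2 shadow copies below)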
m_isRSS2 = m_isRSS;
m_isPermalink2 = m_isPermalink;
m_isAdult2 = m_isAdult;
m_spiderLinks2 = m_spiderLinks;
m_isContentTruncated2 = m_isContentTruncated;
m_isLinkSpam2 = m_isLinkSpam;
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_hasContactInfo2 = m_hasContactInfo;
//m_skipIndexingByte = m_skipIndexing;
m_isSiteRoot2 = m_isSiteRoot;
// these members are automatically validated
m_ipValid = true;
m_spideredTimeValid = true;
m_indexedTimeValid = true;
m_pubDateValid = true;
m_firstIndexedValid = true;
m_outlinksAddedDateValid = true;
m_charsetValid = true;
m_countryIdValid = true;
/*
m_titleWeightValid = true;
m_headerWeightValid = true;
m_urlPathWeightValid = true;
m_externalLinkTextWeightValid = true;
m_internalLinkTextWeightValid = true;
m_conceptWeightValid = true;
*/
// new stuff
m_siteNumInlinksValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
//m_sitePopValid = true;
m_rootLangIdValid = true;
m_hasContactInfoValid = true;
m_metaListCheckSum8Valid = true;
m_hopCountValid = true;
//m_numBannedOutlinksValid = true;
m_langIdValid = true;
m_contentTypeValid = true;
m_isRSSValid = true;
m_isPermalinkValid = true;
m_isAdultValid = true;
//m_eliminateMenusValid = true;
m_spiderLinksValid = true;
m_isContentTruncatedValid = true;
m_isLinkSpamValid = true;
m_hasAddressValid = true;
m_tagRecDataValid = true;
m_gigabitHashesValid = true;
m_contentHash32Valid = true;
//m_tagHash32Valid = true;
m_tagPairHash32Valid = true;
m_adVectorValid = true;
m_wikiDocIdsValid = true;
m_imageDataValid = true;
m_catIdsValid = true;
m_indCatIdsValid = true;
// ptr_dmozTitles/Summs/Anchors valid:
m_dmozInfoValid = true;
m_utf8ContentValid = true;
//m_sectionsReplyValid = true;
//m_sectionsVotesValid = true;
//m_addressReplyValid = true;
m_siteValid = true;
m_linkInfo1Valid = true;
m_linkInfo2Valid = true;
m_versionValid = true;
m_httpStatusValid = true;
m_crawlDelayValid = true;
//m_sectiondbDataValid = true;
//m_placedbDataValid = true;
//m_clockCandidatesDataValid = true;
//m_skipIndexingValid = true;
m_isSiteRootValid = true;
// ptr_linkInfo2 is valid. so getDiffbotTitleHashes() works.
m_diffbotTitleHashBufValid = true;
// set "m_oldTagRec" from ptr_tagRecData
//gbmemcpy ( &m_oldTagRec , ptr_tagRecData , size_tagRecData );
//m_oldTagRecValid = true;
// there was no issue indexing it...
m_indexCode = 0;
m_indexCodeValid = true;
m_redirError = 0;
m_redirErrorValid = true;
// prevent a core dump when importing and calling getNewSpiderReply()
m_downloadEndTime = m_spideredTime;
m_downloadEndTimeValid = true;
// make a copy for new tag rec too, this one we modify
//gbmemcpy ( &m_newTagRec , ptr_tagRecData , size_tagRecData );
// set "m_siteNumInlinks" from m_oldTagRec
//Tag *tag = m_oldTagRec.getTag("sitenuminlinks");
// must always be there!
//if ( ! tag ) { char *xx=NULL;*xx=0; }
// must be null terminated
//if ( tag->getTagData()[tag->getTagData()Size-1] != 0 ) {
// char *xx=NULL;*xx=0; }
// grab that
//m_siteNumInlinks = atol(tag->getTagData());
//m_siteNumInlinksValid = true;
// must not be negative
if ( m_siteNumInlinks < 0 ) { char *xx=NULL;*xx=0; }
// set m_hasContactInfo in case someone calls ::getHasContactInfo()
// which will do a bunch of parsing!!
//tag = m_oldTagRec.getTag ("hascontactinfo");
//if ( tag ) m_hasContactInfo = true;
//else m_hasContactInfo = false;
//m_hasContactInfoValid = true;
// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) {
log("set2: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
g_errno = ECORRUPTDATA;
return false;
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
//if ( m_req ) m_tagRecDataValid = false;
// debug thing
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
// set m_sections.m_nsvt from data. ptr_sectiondbData is the m_osvt
// serialized, which is from our read of sectiondb at the time we
// indexed it. but now that we may have nulled out our content to
// save space in titledb because m_skipIndexing is true, then we have
// to save our votes as well, BUT, only if we skipped indexing.
// and not allowed to serialize UNLESS we skipped because
// that would waste space as well
//if (! m_skipIndexing && size_sectionsVotes ) { char *xx=NULL;*xx=0; }
// success, return true then
return true;
}
bool XmlDoc::setFirstUrl ( char *u , bool addWWW , Url *baseUrl ) {
m_firstUrl.reset();
m_currentUrl.reset();
m_firstUrlValid = true;
// sanity check. "u" must be normalized
//if ( strncmp(u,"http",4 ) != 0 ) { char *xx=NULL;*xx=0; }
// assume url is not correct format
ptr_firstUrl = NULL;
size_firstUrl = 0;
if ( ! u || ! u[0] ) {
//if ( ! m_indexCode ) m_indexCode = EBADURL;
return true;
}
//if ( gbstrlen (u) + 1 > MAX_URL_LEN )
// m_indexCode = EURLTOOLONG;
m_firstUrl.set ( baseUrl , u , gbstrlen(u) , addWWW ) ;
// it is the active url
m_currentUrl.set ( &m_firstUrl , false );
m_currentUrlValid = true;
// set this to the normalized url
ptr_firstUrl = m_firstUrl.getUrl();
size_firstUrl = m_firstUrl.getUrlLen() + 1;
// is it a link loop?
//if ( m_firstUrl.isLinkLoop() ) {
// if ( ! m_indexCode ) m_indexCode = ELINKLOOP;
// return true;
//}
// is it illegal?
//if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
// if ( ! m_indexCode ) m_indexCode = EBADURL;
// return true;
//}
// check if url has porn words in it
//if ( cr->m_doUrlSpamCheck && m_firstUrl.isSpam() ) {
// if ( ! m_indexCode ) m_indexCode = EDOCURLSPAM;
// return true;
//}
return true;
}
//CollectionRec *XmlDoc::getCollRec ( ) {
// return g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
//}
//bool XmlDoc::setRedirUrl ( char *u , bool addWWW ) {
// m_redirUrl.set ( u , gbstrlen(u) , addWWW );
// ptr_redirUrl = m_redirUrl.getUrl();
// size_redirUrl = m_redirUrl.getUrlLen()+1;
// return true;
//}
void XmlDoc::setStatus ( char *s ) {
m_statusMsg = s;
m_statusMsgValid = true;
static char *s_last = NULL;
if ( s == s_last ) return;
bool timeIt = false;
// if ( m_sreqValid &&
// m_sreq.m_isInjecting &&
// m_sreq.m_isPageInject )
// timeIt = true;
if ( g_conf.m_logDebugBuildTime )
timeIt = true;
// log times to detect slowness
if ( timeIt ) {
int64_t now = gettimeofdayInMillisecondsLocal();
if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now;
int32_t took = now - s_lastTimeStart;
//if ( took > 100 )
log("xmldoc: %s (xd=0x%"PTRFMT" "
"u=%s) took %"INT32"ms",
s_last,
(PTRTYPE)this,
m_firstUrl.m_url,
took);
s_lastTimeStart = now;
}
s_last = s;
bool logIt = g_conf.m_logDebugBuild;
// CollectionRec *cr = NULL;
// if ( m_collnumValid )
// cr = g_collectiondb.m_recs[m_collnum];
// if ( cr &&
// cr->m_coll &&
// cr->m_coll[0] == 'c' &&
// cr->m_coll[1] == 'r' &&
// strncmp(cr->m_coll,"crawlbottesting-",16) == 0 )
// logIt = true;
if ( ! logIt ) return;
//return;
if ( m_firstUrlValid )
logf(LOG_DEBUG,"build: status = %s for %s (this=0x%"PTRFMT")",
s,m_firstUrl.m_url,(PTRTYPE)this);
else
logf(LOG_DEBUG,"build: status = %s for docId %"INT64" "
"(this=0x%"PTRFMT")",
s,m_docId, (PTRTYPE)this);
}
// caller must now call XmlDoc::setCallback()
void XmlDoc::setCallback ( void *state, void (* callback) (void *state) ) {
m_state = state;
m_callback1 = callback;
// add this additional state==this constraint to prevent core when
// doing a page parser
if ( state == this &&
// i don't remember why i added this sanity check...
callback == getMetaListWrapper ) { char *xx=NULL;*xx=0; }
}
void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) {
m_state = state;
m_callback2 = callback;
}
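// Example (an illustrative sketch): register a completion callback before
// calling a blocking entry point, mirroring what injectDoc() does below.
// "st" and "doneWrapper" here are hypothetical caller-side names:
//
//   xd->setCallback ( st , doneWrapper );
//   if ( ! xd->indexDoc ( ) ) return false; // doneWrapper called when done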
// . similar to XmlDoc::indexDoc() but just adds m_firstUrl to spiderdb
// . used by PageAddUrl.cpp
/*
bool XmlDoc::addToSpiderdb ( ) {
// set a flag
m_isAddUrl = true;
// url must be valid
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
// do not add if something wrong with url
if ( m_indexCode ) return true;
// this should just add to spiderdb because m_isAddUrl is true
return indexDoc(false,false,false,false,true,false);
}
*/
void indexDocWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure it has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in index doc wrapper" );
// return if it blocked
if ( ! THIS->indexDoc( ) ) return;
// otherwise, all done, call the caller callback
// g_statsdb.addStat ( MAX_NICENESS,
// "docs_indexed",
// 20,
// 21,
// );
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// for registerSleepCallback
void indexDocWrapper2 ( int fd , void *state ) {
indexDocWrapper ( state );
}
// . inject from http request
// . replace more of Msg7.cpp logic with this?
//bool XmlDoc::injectDoc ( HttpRequest *hr ) {
//}
// . the highest level function in here
// . user is requesting to inject this url
// . returns false if blocked and your callback will be called when done
// . returns true and sets g_errno on error
bool XmlDoc::injectDoc ( char *url ,
CollectionRec *cr ,
char *content ,
char *diffbotReply, // usually null
bool contentHasMimeArg ,
int32_t hopCount,
int32_t charset,
bool deleteUrl,
char *contentTypeStr, // text/html application/json
bool spiderLinks ,
char newOnly, // index iff new
void *state,
void (*callback)(void *state) ,
uint32_t firstIndexed,
uint32_t lastSpidered ,
int32_t injectDocIp ,
char *contentDelim,
char *metadata,
uint32_t metadataLen,
int32_t payloadLen
) {
// wait until we are synced with host #0
if ( ! isClockInSync() ) {
log("xmldoc: got injection request but clock not yet "
"synced with host #0");
g_errno = ETRYAGAIN;//CLOCKNOTSYNCED;
return true;
}
// normalize url
Url uu;
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
// which has no www.tmblr.co IP!
uu.set(url,gbstrlen(url),false);//true);
// if (!strncmp(url , "http://www.focusinfo.com/products/mxprodv" ,40))
// log("hey");
// remove >'s i guess and store in st1->m_url[] buffer
char cleanUrl[MAX_URL_LEN+1];
cleanInput ( cleanUrl,
MAX_URL_LEN,
uu.getUrl(),
uu.getUrlLen() );
int32_t contentType = CT_UNKNOWN;
if ( contentTypeStr && contentTypeStr[0] )
contentType = getContentTypeFromStr(contentTypeStr);
// use CT_HTML if contentTypeStr is empty or blank. default
if ( ! contentTypeStr || ! contentTypeStr[0] )
contentType = CT_HTML;
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.setFromInject ( cleanUrl );
if ( lastSpidered )
sreq.m_addedTime = lastSpidered;
if ( deleteUrl )
sreq.m_forceDelete = 1;
//static char s_dummy[3];
// sometimes the content is indeed NULL...
//if ( newOnly && ! content ) {
// // don't let it be NULL because then xmldoc will
// // try to download the page!
// s_dummy[0] = '\0';
// content = s_dummy;
// //char *xx=NULL;*xx=0; }
//}
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( ! set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// from PageInject.cpp:
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
1, // niceness, // 1 ,
// inject this content
content ,
deleteUrl, // false, // deleteFromIndex ,
injectDocIp, // 0,//forcedIp ,
contentType ,
lastSpidered,//lastSpidered override
contentHasMimeArg ,
contentDelim,
metadata,
metadataLen,
payloadLen
)) {
// g_errno should be set if that returned false
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// a diffbot reply? should be in json
if ( diffbotReply ) {
if ( ! m_diffbotReply.safeStrcpy(diffbotReply) )
return true;
// it was injected so assume no error
m_diffbotReplyError = 0;
m_diffbotReplyValid = true;
}
//m_doConsistencyTesting = doConsistencyTesting;
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
//if ( recycleContent ) m_recycleContent = true;
// other crap. used for importing from the titledb of another coll/cluster.
if ( firstIndexed ) {
m_firstIndexedDate = firstIndexed;
m_firstIndexedDateValid = true;
}
if ( lastSpidered ) {
m_spideredTime = lastSpidered;
m_spideredTimeValid = true;
}
if ( hopCount != -1 ) {
m_hopCount = hopCount;
m_hopCountValid = true;
}
// PageInject calls memset on gigablastrequest so add '!= 0' here
if ( charset != -1 && charset != csUnknown && charset != 0 ) {
m_charset = charset;
m_charsetValid = true;
}
// avoid looking up ip of each outlink to add "firstip" tag to tagdb
// because that can be slow!!!!!!!
m_spiderLinks = spiderLinks;
m_spiderLinks2 = spiderLinks;
m_spiderLinksValid = true;
// . newOnly is true --> do not inject if document is already indexed!
// . maybe just set indexCode
m_newOnly = newOnly;
// do not re-lookup the robots.txt
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelay = -1; // unknown
m_crawlDelayValid = true;
m_isInjecting = true;
m_isInjectingValid = true;
// set this now
//g_inPageInject = true;
// log it now
//log("inject: indexing injected doc %s",cleanUrl);
// make this our callback in case something blocks
setCallback ( state , callback );
// . now tell it to index
// . this returns false if blocked
// . eventually it will call "callback" when done if it blocks
bool status = indexDoc ( );
if ( ! status ) return false;
// log it here only when indexDoc() did not block; when it blocks,
// xmldoc.cpp::indexDoc() calls logIt() itself later on.
logIt();
// undo it
//g_inPageInject = false;
return true;
}
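// . illustrative only: a minimal, commented-out sketch of how a caller
//   might drive injectDoc() above using the async callback convention.
//   injectOne(), injectDoneWrapper() and the cleanup step are assumptions
//   for illustration, not code that exists elsewhere in this file.
/*
static void injectDoneWrapper ( void *state ) {
	XmlDoc *xd = (XmlDoc *)state;
	if ( g_errno ) log("inject: %s",mstrerror(g_errno));
	// free or recycle xd here
}
static bool injectOne ( CollectionRec *cr , char *url , char *html ) {
	XmlDoc *xd;
	try { xd = new ( XmlDoc ); } catch ( ... ) { return true; }
	mnew ( xd , sizeof(XmlDoc) , "sketchxd" );
	// returns false if it blocked; injectDoneWrapper is called later
	if ( ! xd->injectDoc ( url , cr , html ,
			       NULL ,        // diffbotReply
			       false ,       // contentHasMime
			       -1 ,          // hopCount unknown
			       -1 ,          // charset unknown
			       false ,       // deleteUrl
			       "text/html" , // contentTypeStr
			       false ,       // spiderLinks
			       0 ,           // newOnly
			       xd , injectDoneWrapper ,
			       0 , 0 ,       // firstIndexed, lastSpidered
			       0 ,           // injectDocIp
			       NULL , NULL , 0 , 0 ) )
		return false;
	// did not block; g_errno may already be set
	injectDoneWrapper ( xd );
	return true;
}
*/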
// XmlDoc::injectDoc uses a fake spider request so we have to add
// a real spider request into spiderdb so that the injected doc can
// be spidered again in the future by the spidering process, otherwise,
// injected docs can never be re-spidered. they would end up having
// a SpiderReply in spiderdb but no matching SpiderRequest.
void XmlDoc::getRevisedSpiderRequest ( SpiderRequest *revisedReq ) {
if ( ! m_sreqValid ) { char *xx=NULL; *xx=0; }
// we are doing this because it has a fake first ip
if ( ! m_sreq.m_fakeFirstIp ) { char *xx=NULL;*xx=0; }
// copy it over from our current spiderrequest
gbmemcpy ( revisedReq , &m_sreq , m_sreq.getRecSize() );
// this must be valid for us of course
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// wtf? it might be invalid!!! parent caller will handle it...
//if ( m_firstIp == 0 || m_firstIp == -1 ) { char *xx=NULL;*xx=0; }
// store the real ip in there now
revisedReq->m_firstIp = m_firstIp;
// but turn off this flag! the whole point of all this...
revisedReq->m_fakeFirstIp = 0;
// re-make the key since it contains m_firstIp
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = m_sreq.getParentDocId();
// set the key properly to reflect the new "first ip" since
// we shard spiderdb by that.
revisedReq->m_key = g_spiderdb.makeKey ( m_firstIp,
uh48,
true, // is request?
parentDocId ,
false );// isDel );
revisedReq->setDataSize();
}
void XmlDoc::getRebuiltSpiderRequest ( SpiderRequest *sreq ) {
// memset 0
sreq->reset();
// assume not valid
sreq->m_siteNumInlinks = -1;
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// how many site inlinks?
sreq->m_siteNumInlinks = m_siteNumInlinks;
sreq->m_siteNumInlinksValid = true;
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set other fields besides key
sreq->m_firstIp = m_firstIp;
sreq->m_hostHash32 = m_hostHash32a;
//sreq->m_domHash32 = m_domHash32;
//sreq->m_siteNumInlinks = m_siteNumInlinks;
//sreq->m_pageNumInlinks = m_pageNumInlinks;
sreq->m_hopCount = m_hopCount;
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
Url *fu = getFirstUrl();
sreq->m_isNewOutlink = 0;
sreq->m_isAddUrl = 0;//m_isAddUrl;
sreq->m_isPingServer = fu->isPingServer();
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
// transcribe from old spider rec, stuff should be the same
sreq->m_addedTime = m_firstIndexedDate;
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
// validate the stuff so getUrlFilterNum() acks it
sreq->m_hopCountValid = 1;
// we need this now for ucp ucr upp upr new url filters that do
// substring matching on the url
if ( m_firstUrlValid )
strcpy(sreq->m_url,m_firstUrl.m_url);
// re-make the key since it contains m_firstIp
int64_t uh48 = fu->getUrlHash48();
// set the key properly to reflect the new "first ip"
// since we shard spiderdb by that.
sreq->m_key = g_spiderdb.makeKey ( m_firstIp,//ip,
uh48,
true,//is req?
0LL, // parentDocId ,
false );//isDel
sreq->setDataSize();
}
////////////////////////////////////////////////////////////////////
// THIS IS THE HEART OF HOW THE PARSER ADDS TO THE RDBS
////////////////////////////////////////////////////////////////////
// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
// . this is now a WRAPPER for indexDoc2() and it will deal with
// g_errnos by adding an error spider reply so we offload the
// logic to the url filters table
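// . illustrative note on the "meta list" format used by this function and
//   by getMetaList(): it is a flat buffer of records, each prefixed with a
//   one-byte rdbId that says which rdb the record belongs to. a minimal
//   sketch, assuming a SpiderReply "rep" has already been filled in
//   ("metaList" is just a stand-in for m_metaList2 used below):
/*
	SafeBuf metaList;
	metaList.pushChar ( (char)RDB_SPIDERDB );                // rdbId byte
	metaList.safeMemcpy ( (char *)&rep , rep.getRecSize() ); // the record
	// msg4 then routes each record to the right rdb and shard:
	// m_msg4.addMetaList ( metaList.getBufStart() , metaList.length() ,
	//                      m_collnum , state , callback , niceness );
*/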
bool XmlDoc::indexDoc ( ) {
// return from the msg4.addMetaList() below?
if ( m_msg4Launched ) {
// must have been waiting
if ( ! m_msg4Waiting ) { char *xx=NULL;*xx=0; }
return true;
}
// return true with g_errno set on error
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
if ( ! m_masterLoop ) {
m_masterLoop = indexDocWrapper;
m_masterState = this;
}
// do not index if already indexed and we are importing
// from the code in PageInject.cpp from a foreign titledb file
if ( m_isImporting && m_isImportingValid ) {
char *isIndexed = getIsIndexed();
if ( ! isIndexed ) {
log("import: import had error: %s",mstrerror(g_errno));
return true;
}
if ( isIndexed == (char *)-1)
return false;
if ( *isIndexed ) {
log("import: skipping import for %s. already indexed.",
m_firstUrl.getUrl());
return true;
}
}
// . even if not using diffbot, keep track of these counts
// . even if we had something like EFAKEFIRSTIP, OOM, or whatever
// it was an attempt we made to crawl this url
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
// do not repeat
m_incrementedAttemptsCount = true;
// log debug
//log("build: attempted %s count=%"INT64"",m_firstUrl.getUrl(),
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
// avoid counting if it is a fake first ip
bool countIt = true;
// pagereindex.cpp sets this as does any add url (bulk job)
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
countIt = false;
if ( countIt ) {
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
}
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
//int64_t now = gettimeofdayInMillisecondsGlobal();
//cr->m_diffbotCrawlEndTime = now;
}
bool status = true;
if ( ! g_errno ) status = indexDoc2 ( );
// blocked?
if ( ! status ) return false;
// done with no error?
bool success = true;
if ( g_errno ) success = false;
// if we were trying to spider a fakefirstip request then
// pass through because we lookup the real firstip below and
// add a new request as well as a reply for this one
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) success = false;
if ( success ) return true;
// . ignore failed child docs like diffbot pages
// . they are getting EMALFORMEDSECTIONS
if ( m_isChildDoc ) {
log("build: done indexing child doc. error=%s. not adding "
"spider reply for %s",
mstrerror(g_errno),
m_firstUrl.m_url);
return true;
}
///
// otherwise, an internal error. we must add a SpiderReply
// to spiderdb to release the lock.
///
logErr:
if ( m_firstUrlValid && g_errno )
log("build: %s had internal error = %s. adding spider "
"error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
else if ( g_errno )
log("build: docid=%"INT64" had internal error = %s. "
"adding spider error reply.",
m_docId,mstrerror(g_errno));
// seems like this was causing a core somehow...
if ( g_errno == ENOMEM )
return true;
// and do not add spider reply if shutting down the server
if ( g_errno == ESHUTTINGDOWN )
return true;
// i saw this on shard 9, how is it happening
if ( g_errno == EBADRDBID )
return true;
// if docid not found when trying to do a query reindex...
// this really shouldn't happen but i think we were adding
// additional SpiderRequests since we were using a fake first ip.
// but i have since fixed that code. so if the titlerec was not
// found when trying to do a force delete... it's not a temporary
// error and should not be retried. if we set indexCode to
// EINTERNALERROR it seems to be retried.
if ( g_errno == ENOTFOUND ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// this should not be retried either. i am seeing it excessively
// retried from a
// "TitleRec::set: uncompress uncompressed size=-2119348471"
// error condition. it also said
// "Error spidering for doc http://www.... : Bad cached document"
if ( g_errno == EBADTITLEREC ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// i've seen Multicast got error in reply from hostId 19 (msgType=0x22
// transId=496026 nice=1 net=default): Buf too small.
// so fix that with this
if ( g_errno == EBUFTOOSMALL ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
if ( g_errno == EBADURL ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
if ( g_errno == ENOTITLEREC ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// default to internal error which will be retried forever otherwise
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;
m_indexCodeValid = true;
}
// if our spiderrequest had a fake "firstip" so that it could be
// injected quickly into spiderdb, then do the firstip lookup here
// and re-add the new spider request with that, and add the reply
// to the fake firstip request below.
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) {
// at least get this if possible
int32_t *fip = getFirstIp();
if ( fip == (void *) -1 ) return false;
// error? g_errno will be changed if this is NULL
if ( ! fip ) {
log("build: error getting real firstip: %s",
mstrerror(g_errno));
m_indexCode = EINTERNALERROR;
m_indexCodeValid = true;
goto logErr;
}
// sanity log
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added. false=del
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of "
"%"INT32" for "
"%s. Not adding new spider req. "
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
m_addedStatusDocSize);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
getRevisedSpiderRequest ( &revisedReq );
// and store that new request for adding
if ( ! m_metaList2.safeMemcpy (&revisedReq,
revisedReq.getRecSize()))
return true;
// make sure to log the size of the spider request
m_addedSpiderRequestSize = revisedReq.getRecSize();
m_addedSpiderRequestSizeValid = true;
}
skipNewAdd1:
SpiderReply *nsr = NULL;
// if only rebuilding posdb do not rebuild spiderdb
if ( m_useSpiderdb && ! m_addedSpiderReplySizeValid ) {
////
//
// make these fake so getNewSpiderReply() below does not block
//
////
nsr = getFakeSpiderReply ( );
// this can be NULL and g_errno set to ENOCOLLREC or something
if ( ! nsr )
return true;
//SafeBuf metaList;
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
if ( ! m_metaList2.pushChar( rd ) )
return true;
if ( ! m_metaList2.safeMemcpy ( (char *)nsr,nsr->getRecSize()))
return true;
m_addedSpiderReplySize = nsr->getRecSize();
m_addedSpiderReplySizeValid = true;
}
// for other errors like EBADTITLEREC we are not adding spider
// status docs, so add them here
/*
if ( ! m_addedStatusDocSizeValid ) {
SafeBuf *ssDocMetaList = NULL;
// if calling getSpiderStatusDocMetaList blocks then
// call addErrorStuffWrapper() to call msg4
//m_masterLoop = addErrorStuffWrapper();
//m_state = this;
// this uses m_indexCode to set it
// if this blocks it ends up calling m_masterLoop and
// re-entering this function with g_errno clear possibly
// so do we make it back here????? MDW
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
}
*/
m_msg4Launched = true;
// display the url that had the error
logIt();
// log this for debug now
if ( nsr ) {
SafeBuf tmp;
nsr->print(&tmp);
log("xmldoc: added reply %s",tmp.getBufStart());
}
// clear g_errno
g_errno = 0;
// "cr" might have been deleted by calling indexDoc() above i think
// so use collnum here, not "cr"
if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() ,
m_metaList2.length() ,
m_collnum,//cr->m_coll ,
m_masterState , // state
m_masterLoop ,
m_niceness ) ) {
// spider hang bug
//if ( g_conf.m_testSpiderEnabled )
// logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
// "msg4=0x%"XINT32"" ,(int32_t)&m_msg4);
m_msg4Waiting = true;
return false;
}
//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );
m_msg4Launched = false;
// all done
return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
bool XmlDoc::indexDoc2 ( ) {
if ( g_isYippy ) return true;
// if anything blocks, this will be called when it comes back
if ( ! m_masterLoop ) {
m_masterLoop = indexDocWrapper;
m_masterState = this;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// do this before we increment pageDownloadAttempts below so that
// john's smoke tests, which use those counts, are not affected
if ( m_sreqValid &&
m_sreq.m_fakeFirstIp &&
// only do for add url, not for injects. injects expect
// the doc to be indexed while the browser waits. add url
// is really just adding the spider request and returning
// to the browser without delay.
! m_sreq.m_isInjecting &&
// not for page reindexes either!
! m_sreq.m_isPageReindex &&
// just add url
m_sreq.m_isAddUrl &&
// diffbot requests are ok though!
! strstr(m_sreq.m_url,"-diffbotxyz") ) {
m_indexCodeValid = true;
m_indexCode = EFAKEFIRSTIP;
return true;
}
// ensure that CollectionRec::m_globalCrawlInfo (spider stats)
// is at least 1 minute in sync with counts of
// all hosts in network. this returns false if it sent out requests
// to update the counts from all the hosts in the network, and
// when it updates CollectionRec::m_crawlInfoGlobal with all the
// requests from each hosts in the network it will call the
// specified callback, m_masterLoop with m_masterState. this code
// is all in Spider.cpp.
// this is now in a sleep wrapper in spider.cpp.
//setStatus ( "updating crawl info" );
//if ( ! g_errno &&
// ! updateCrawlInfo ( cr , m_masterState , m_masterLoop ) )
// return false;
// MDW: we do this in indexDoc() above why do we need it here?
/*
// even if not using diffbot, keep track of these counts
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
// do not repeat
m_incrementedAttemptsCount = true;
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
int64_t now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
}
*/
/*
// if we are being called from Spider.cpp and we met our max
// to crawl requirement, then bail out on this. this might
// become true when we are in the middle of processing this url...
if ( ! m_isDiffbotJSONObject &&
// this is just for this collection, from all hosts in network
cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >=
cr->m_diffbotMaxToCrawl ) {
// set the code to badness
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
m_indexCodeValid = true;
log("diffbot: abandoning url because we hit crawl limit "
"of %"INT64". downloaded %"INT64". Disabling spiders."
,cr->m_diffbotMaxToCrawl
,cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
g_errno = m_indexCode;
// if spiders already off..
if ( ! cr->m_spideringEnabled ) return true;
// do not repeat call sendNotification()
cr->m_spideringEnabled = false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_collnum = m_collnum;
// note it
setStatus("sending notification");
// this returns false if it would block, so we return false
if ( ! sendNotification ( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
}
// likewise if we hit the max processing limit...
if ( ! m_isDiffbotJSONObject &&
cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >=
cr->m_diffbotMaxToProcess ) {
// set the code to badness
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
m_indexCodeValid = true;
log("diffbot: abandoning url because we hit process limit "
"of %"INT64". processed %"INT64". Disabling spiders."
, cr->m_diffbotMaxToProcess
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
);
g_errno = m_indexCode;
// if spiders already off...
if ( ! cr->m_spideringEnabled ) return true;
// turn them off and send notification (email or url)
cr->m_spideringEnabled = false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_collnum = m_collnum;
// note it
setStatus("sending notification");
// . this returns false if it would block, so we return false
// . this is now in PingServer.cpp
if ( ! sendNotification( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
}
*/
setStatus("indexing doc");
// maybe a callback had g_errno set?
if ( g_errno ) return true;
// before indexing this doc, index its inlinks it has according
// to ahrefs?
if ( m_downloadLevel == 1 && m_useAhrefs && ! m_doneWithAhrefs ) {
// do not repeat this call!
m_doneWithAhrefs = true;
// call it
if ( ! injectAhrefsLinks () ) return false;
}
if ( m_firstUrlValid && (m_firstUrl.isArc() || m_firstUrl.isWarc())) {
// this returns false if it would block and callback will be
// called
if ( ! indexWarcOrArc ( ) )
return false;
logIt();
// all done! no need to add the parent doc.
return true;
}
if ( isContainerDoc() ) {
// m_contentDelim should be set!
if ( ! indexContainerDoc () )
return false;
logIt();
// all done! no need to add the parent doc.
return true;
}
// . now get the meta list from it to add
// . returns NULL and sets g_errno on error
char *metaList = getMetaList ( );
// error?
if ( ! metaList ) {
// sanity check. g_errno must be set
if ( ! g_errno ) {
log("build: Error UNKNOWN error spidering. setting "
"to bad engineer.");
g_errno = EBADENGINEER;
//char *xx=NULL;*xx=0; }
}
log("build: Error spidering for doc %s: %s",
m_firstUrl.m_url,mstrerror(g_errno));
return true;
}
// did it block? return false if so, we will be recalled since
// we set m_masterLoop to indexDoc
if ( metaList == (char *) -1 ) return false;
// before we add the meta list let's updateTagdb()
//char *ret = updateTagdb();
// it returns NULL on error
//if ( ret == NULL ) return true;
// return false if it blocked
//if ( ret == (char *)-1 ) return false;
// . let's update tagdb's venue address default too
// . no. that is in getTitleRecBuf()
// must be valid
int32_t *indexCode = getIndexCode();
// NULL means error; return true with g_errno set
if ( ! indexCode ) return true;
// -1 means it blocked; we will be re-called via m_masterLoop
if ( indexCode == (void *)-1 ) return false;
// . check to make sure the parser is consistent so we can cleanly
// delete the various rdb records if we need to in the future solely
// based on the titleRec.
// . force = false
// . unless we force it, the test is only done at random intervals
// for performance reasons
if ( ! *indexCode ) doConsistencyTest ( false );
// ignore errors from that
g_errno = 0;
// unregister any sleep callback
if ( m_registeredSleepCallback ) {
g_loop.unregisterSleepCallback(m_masterState,indexDocWrapper2);
m_registeredSleepCallback = false;
}
//////////
// . add the doledb negative key quickly to our tree to avoid a
// respider because the msg4 doledb negative key is buffered by msg4
// . make it negative
// . well it should not be respidered because the lock is on it!!
// -- so let's comment this out
/////////
/*
key_t negative = m_doledbKey;
// make it negative
negative.n0 &= 0xfffffffffffffffeLL;
// . store it in our tree if we can
// . returns false and sets g_errno on error
// . i.e. g_errno == ETRYAGAIN
if ( ! m_addedNegativeDoledbRec &&
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
NULL,0,m_niceness)){
log("build: error trying to add to doledb: %s",
mstrerror(g_errno));
// set sleep wrapper
g_loop.registerSleepCallback(1000,m_masterState,
indexDocWrapper2,m_niceness);
// note it
m_registeredSleepCallback = true;
// sleep and retry
return false;
}
*/
// we did that
m_addedNegativeDoledbRec = true;
// now add it
if ( ! m_listAdded && m_metaListSize ) {
// only call this once
m_listAdded = true;
// show it for now
//printMetaList(m_metaList , m_metaList + m_metaListSize,NULL);
// test it
verifyMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
false );
// do it
if ( ! m_msg4.addMetaList ( m_metaList ,
m_metaListSize ,
m_collnum,//cr->m_coll ,
m_masterState , // state
m_masterLoop ,
m_niceness ) ) {
// spider hang bug
if ( g_conf.m_testSpiderEnabled )
logf(LOG_DEBUG,"build: msg4 meta add blocked"
"msg4=0x%"PTRFMT"" ,(PTRTYPE)&m_msg4);
m_msg4Waiting = true;
return false;
}
// error with msg4? bail
if ( g_errno ) return logIt();
}
// make sure our msg4 is no longer in the linked list!
if (m_msg4Waiting && isInMsg4LinkedList(&m_msg4)){char *xx=NULL;*xx=0;}
if ( m_msg4Waiting && g_conf.m_testSpiderEnabled )
logf(LOG_DEBUG,"build: msg4=0x%"PTRFMT" returned"
,(PTRTYPE)&m_msg4);
// we are not waiting for the msg4 to return
m_msg4Waiting = false;
bool flush = false;
// no longer flush injections.
// TODO: pass in a flush flag with injection and flush in that
// case, but for now disable to make things faster. profiler
// indicates too much msg4 activity.
//if ( m_contentInjected ) flush = true;
//if ( m_sreqValid && m_sreq.m_isPageInject ) flush = true;
// to keep our qa runs consistent
if ( strcmp(cr->m_coll,"qatest123") == 0 ) flush = true;
if ( ! m_listAdded ) flush = false;
if ( m_listFlushed ) flush = false;
// HACK: flush it if we are injecting it in case the next thing we
// spider is dependent on this one
if ( flush ) {
// note it
setStatus ( "flushing msg4" );
// only do it once
m_listFlushed = true;
// do it
if ( ! flushMsg4Buffers ( m_masterState , m_masterLoop ) )
return false;
}
// . all done with that. core if we block i guess.
// . but what if we were not the function that set this to begin w/?
//m_masterLoop = NULL;
return logIt();
/*
// if not doing exact quotas, we're done
if ( ! cr->m_exactQuotas ) return logIt();
char *isIndexed = getIsIndexed();
// this means it blocked
if ( isIndexed == (char *)-1) { char *xx=NULL; *xx=0; }
// returns NULL with g_errno set
if ( isIndexed ) return logIt();
// otherwise, tell Msg36 to update our quota count for this site
// so we don't have to keep merging site: termlists
m_incCount = false;
m_decCount = false;
if ( m_indexCode ) m_decCount = true;
//if ( m_forceDelete ) m_decCount = true;
// fix for the exact quota bug found on eurekster collection. bug 229
// if we're not a new doc, then don't increment the count because
// we have been already counted as the old doc. MDW: i added the
// condition that if decCount is true we need to update the count!
if ( *isIndexed && ! m_decCount ) return logIt();
// if it is new and we are not adding it to the index then no need
// to update any quota count...
if ( ! *isIndexed && m_decCount ) return logIt();
// if not decrementing the count, must be incrementing it then!
if ( ! m_decCount ) m_incCount = true;
*/
// i am not using quotas, so disable this for now
/*
log(LOG_DEBUG,"build: inc'ing quota to REMOTE table "
"for termIdHost %"UINT64" termIdDom %"UINT64" for %s.",
m_msg16.m_termIdHost,m_msg16.m_termIdDom,m_url.getUrl());
setStatus ( "updating quota cache" );
// sanity checks
if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; }
if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; }
// . Msg36 gets the correct count from disk and puts it in cache. It
// doesn't try to increment or decrement the quotas in cache, because
// then it would have to be done on all twins, and also the correct
// split will have to be found.
// . Actually, we should only use the cache on one host to hold the
// sum of all splits. This will be the authority cache.
if ( ! m_updatedCounts ) {
// only call this once
m_updatedCounts = true;
// do it
if ( ! m_msg36.getTermFreq ( m_coll ,
0 , // maxAge
m_msg16.m_termIdHost ,
this ,
m_masterLoop ,
m_niceness ,
m_exactQuotas ,
m_incCount ,
m_decCount ,
false ))
// we blocked
return false;
// error?
if ( g_errno ) return logIt();
}
// add the second entry for domain
if ( ! m_updatedCounts2 ) {
// only call this once
m_updateCounts2 = true;
// do it
if ( ! m_msg36.getTermFreq ( m_coll ,
0 , // maxAge
m_msg16.m_termIdDom ,
this ,
doneAddingMsg36Entry2,
m_niceness ,
m_exactQuotas ,
m_incCount ,
m_decCount ,
false ))
// we blocked
return false;
// error?
if ( g_errno ) return logIt();
}
// that is it!
return logIt();
*/
}
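// . illustrative restatement of the blocking convention used throughout
//   indexDoc()/indexDoc2() above: any getX() helper that returns -1 has
//   registered m_masterLoop/m_masterState so the whole function can be
//   safely re-entered from the top when the data arrives. a minimal
//   sketch of that pattern (not new functionality):
/*
	if ( ! m_masterLoop ) {
		m_masterLoop  = indexDocWrapper; // re-entry point
		m_masterState = this;
	}
	char *metaList = getMetaList();
	if ( ! metaList )             return true;  // error, g_errno set
	if ( metaList == (char *)-1 ) return false; // blocked; the wrapper
	                                            // re-calls us later
*/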
bool isRobotsTxtFile ( char *u , int32_t ulen ) {
if ( ulen > 12 && ! strncmp ( u + ulen - 11 , "/robots.txt" , 11 ) )
return true;
return false;
}
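// . usage sketch for isRobotsTxtFile() above (urls are illustrative only):
/*
	char *u = "http://example.com/robots.txt";
	isRobotsTxtFile ( u , gbstrlen(u) );   // returns true
	char *v = "http://example.com/robots.txt.html";
	isRobotsTxtFile ( v , gbstrlen(v) );   // returns false
*/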
// does this doc consist of a sequence of smaller sub-docs?
// if so we'll index the subdocs and not the container doc itself.
bool XmlDoc::isContainerDoc ( ) {
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentDelim ) return true;
if ( m_contentDelimValid && m_contentDelim ) return true;
return false;
}
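// . illustrative layout of a delimiter-separated container doc as handled
//   by indexContainerDoc() below. the delimiter string ("======" here) is
//   whatever was supplied with the inject request; when the subdocs have
//   mimes, a url line may follow each delimiter to name the subdoc:
/*
	======
	http://example.com/page1
	HTTP/1.0 200 OK
	Content-Type: text/html

	<html>first subdoc</html>
	======
	http://example.com/page2
	HTTP/1.0 200 OK
	...
*/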
// returns false if would block, true otherwise. returns true and sets g_errno on err
bool XmlDoc::indexContainerDoc ( ) {
if ( ! m_contentDelim ) {
log("build: can not index container doc. no delimeter.");
g_errno = EBADENGINEER;
return true;
}
// int8_t *hc = getHopCount();
// if ( ! hc ) return true; // error?
// if ( hc == (void *)-1 ) return false;
// first download
// in the case of a list of delimited http server replies let's
// not convert into utf8 here but just use as-is
char **cpp = getContent();//getUtf8Content();
// return true with g_errno set on error
if ( ! cpp ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// would block? return false then
if ( cpp == (void *)-1 )
return false;
// need this. it is almost 1MB in size, so alloc it
if ( ! m_msg7 ) {
try { m_msg7 = new ( Msg7 ); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
}
// inject input parms:
InjectionRequest *ir = &m_msg7->m_injectionRequest;
// the cursor for scanning the subdocs
if ( ! m_anyContentPtr ) {
// init the content cursor to point to the first subdoc
m_anyContentPtr = *cpp;
// but skip over an initial separator if present. that is a
// faux pas
int32_t dlen = gbstrlen(m_contentDelim);
if ( strncmp(m_anyContentPtr,m_contentDelim,dlen) == 0 )
m_anyContentPtr += dlen;
// init the input parms
memset ( ir , 0 , sizeof(InjectionRequest) );
// reset it
ir->m_spiderLinks = false;
ir->m_injectLinks = false;
ir->m_hopCount = 0;//*hc + 1;
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
ir->m_collnum = m_collnum;
// will this work on a content-delimited doc?
ir->m_deleteUrl = m_deleteFromIndex;
// subdocs have a mime only if the container says they do
ir->m_hasMime = m_subDocsHaveMime;//true;
}
subdocLoop:
QUICKPOLL ( m_niceness );
// EOF?
if ( m_anyContentPtr == (char *)-1 ) {
m_indexCode = 0;//m_warcError;
m_indexCodeValid = true;
return true;
}
// we had \0 terminated the end of the previous record, so put back
if ( m_savedChar && ! *m_anyContentPtr ) {
*m_anyContentPtr = m_savedChar;
m_anyContentPtr += gbstrlen(m_contentDelim);
}
// index this subdoc
ir->ptr_content = m_anyContentPtr;
// . should have the url as well.
// . the url, ip etc. are on a single \n terminated line for an arc!
char *separator = strstr(m_anyContentPtr,m_contentDelim);
if ( separator ) {
m_savedChar = *separator;
m_anyContentPtr = separator;
*m_anyContentPtr = '\0';
//ir->size_content = separator - ir->ptr_content;
}
// if no separator found, this is our last injection
if ( ! separator ) {
m_anyContentPtr = (char *)-1;
}
// these are not defined. will be autoset in set4() i guess.
ir->m_firstIndexed = 0;
ir->m_lastSpidered = 0;
bool setUrl = false;
// HOWEVER, if hasMime is true and an http:// follows
// the delimiter then use that as the url...
// this way we can specify our own urls.
if ( ir->m_hasMime ) {
char *du = ir->ptr_content;
//du += gbstrlen(delim);
if ( du && is_wspace_a ( *du ) ) du++;
if ( du && is_wspace_a ( *du ) ) du++;
if ( du && is_wspace_a ( *du ) ) du++;
if ( ir->m_hasMime &&
(strncasecmp( du,"http://",7) == 0 ||
strncasecmp( du,"https://",8) == 0 ) ) {
// flag it
setUrl = true;
// find end of it
char *uend = du + 7;
for ( ; *uend && ! is_wspace_a(*uend) ; uend++ );
// inject that then
m_injectUrlBuf.reset();
m_injectUrlBuf.safeMemcpy ( du , uend - du );
m_injectUrlBuf.nullTerm();
// and point to the actual http mime then,
// skipping the space right after the url
ir->ptr_content = uend + 1;
ir->ptr_url = m_injectUrlBuf.getBufStart();
ir->size_url = m_injectUrlBuf.length()+1; // include \0
// if (!strncmp(ir->ptr_url,"http://www.focusinfo.com/"
// "products/mxprodv" ,40) )
// log("hey");
}
}
QUICKPOLL ( m_niceness );
// make the url from parent url
// use hash of the content
int64_t ch64 = hash64n ( ir->ptr_content , 0LL );
// need this for an injection
ir->size_content = gbstrlen(ir->ptr_content) + 1;// improve this?
QUICKPOLL ( m_niceness );
if ( ! setUrl ) {
// reset it
m_injectUrlBuf.reset();
// by default append a -<ch64> to the provided url
m_injectUrlBuf.safePrintf("%s-%"UINT64"",
m_firstUrl.getUrl(),ch64);
ir->ptr_url = m_injectUrlBuf.getBufStart();
ir->size_url = m_injectUrlBuf.length()+1; // include \0
}
bool status = m_msg7->sendInjectionRequestToHost ( ir ,
m_masterState ,
m_masterLoop ) ;
// it would block, callback will be called later
if ( status )
return false;
QUICKPOLL ( m_niceness );
// error?
if ( g_errno ) {
log("build: index flatfile error %s",mstrerror(g_errno));
// returns true and sets g_errno on error
return true;
}
else
log("build: index flatfile did not block");
// loop it up
goto subdocLoop;
}
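// . illustrative sketch of the in-place splitting done above: the content
//   buffer is never copied per subdoc. instead the first byte of the next
//   delimiter is saved, overwritten with '\0' so ptr_content is a plain
//   c-string, and restored on the next pass. buf, delim and handleSubdoc()
//   below are hypothetical names for illustration only:
/*
	char *cursor = buf;   // start of the current subdoc
	char  saved  = 0;
	while ( cursor ) {
		// restore the byte we clobbered last pass and skip the delim
		if ( saved ) { *cursor = saved; cursor += gbstrlen(delim); }
		char *sep = strstr ( cursor , delim );
		if ( sep ) { saved = *sep; *sep = '\0'; }
		handleSubdoc ( cursor ); // cursor is now \0-terminated
		cursor = sep;            // NULL when no more subdocs remain
	}
*/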
void doneInjectingArchiveRec ( void *state ) {
Msg7 *THIS = (Msg7 *)state;
THIS->m_inUse = false;
XmlDoc *xd = THIS->m_stashxd;
xd->m_numInjectionsOut--;
log("build: archive: injection thread returned. %"INT32" out now.",
xd->m_numInjectionsOut);
// reset g_errno so it doesn't error out in ::indexDoc() when
// we are injecting a ton of these msg7s and then xmldoc ends up
// getting reset and when a msg7 reply comes back in, we core
g_errno = 0;
xd->m_masterLoop ( xd );
}
void doneReadingArchiveFileWrapper ( int fd, void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// . go back to the main entry function
// . make sure g_errno is clear from a msg3a g_errno before calling
// this lest it abandon the loop
THIS->m_masterLoop ( THIS->m_masterState );
}
#define MAXWARCRECSIZE 5000000
bool XmlDoc::readMoreWarc() {
// We read everything we can off the pipe in a sleep timer.
// When we have enough to start processing, we call the
// processing function.
// If reading gets too far ahead of the processing and we can
// no longer buffer the read, then we save the offset of what
// we processed, free the readbuffer and restart the pipe and
// skip until the offset we last processed
if(!m_calledWgetThread) {
m_pipe = getUtf8ContentInFile();
}
// return true with g_errno set on error
if ( ! m_pipe ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
log("We don't have the warc pipe.");
return true;
}
int64_t leftOver = 0;
int64_t skipAhead = 0;
// How much is unprocessed
if(m_fptr != m_fptrEnd) {
leftOver = m_fptrEnd - m_fptr;
}
if(leftOver < 0) {
// Happens when we skip a record which is too big
skipAhead = - leftOver;
leftOver = 0;
m_fptr = m_fileBuf;
m_fptrEnd = m_fileBuf;
}
// We don't want to be memmoving the buffer up for every single
// document we process so only do it when we need it.
if(leftOver > MAXWARCRECSIZE) return false;
int64_t bytesRemaining = m_fileBufAllocSize - (m_fptrEnd - m_fileBuf) - 1;
// Scoot up everything we haven't processed
if(bytesRemaining < MAXWARCRECSIZE) {
//log("scooting up by left over %"INT64, leftOver);
// count everything we've processed
m_bytesStreamed += m_fptr - m_fileBuf;
memmove(m_fileBuf, m_fptr, leftOver);
m_fptr = m_fileBuf;
m_fptrEnd = m_fileBuf + leftOver;
*m_fptrEnd = '\0';
bytesRemaining += leftOver;
}
int64_t toRead = m_fileBufAllocSize - leftOver - 1;
if(toRead > bytesRemaining) toRead = bytesRemaining;
if(toRead == 0) {
//log("build: not enough room to read, lets process the buffer" );
return false;
}
g_loop.disableTimer();
errno = 0;
int bytesRead = fread(m_fptrEnd, 1, toRead, m_pipe);
g_loop.enableTimer();
// if(bytesRead > 0) {
// log("build: warc pipe read %"INT32" more bytes of the pipe. errno = %s, buf space = %"INT64 " processed = %"INT64 " skipAhead=%"INT64,
// bytesRead, mstrerror(errno),toRead, m_bytesStreamed, skipAhead);
// }
if(bytesRead <= 0 && errno != EAGAIN) {
// if(errno == EAGAIN){
// log("build: fd is not ready, lets process the buffer" );
// return false;
// } else {
if(m_registeredWgetReadCallback) {
//log("build:came back from read callback");
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
m_registeredWgetReadCallback = false;
}
if(m_pipe) {
int32_t retCode = fclose(m_pipe);
if(retCode) {
log("we closed the pipe with error %s", mstrerror(retCode));
}
m_pipe = NULL;
}
//log("build: warc problem pipe terminated %s", mstrerror(errno));
m_hasMoreToRead = false;
return false;
// }
}
//m_fptr = m_fileBuf;
m_fptrEnd = m_fptrEnd + bytesRead;
*m_fptrEnd = '\0';
m_fptr += skipAhead;
return false;
}
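// . illustrative diagram of the streaming buffer readMoreWarc() manages.
//   m_fileBuf is a fixed (5*MAXWARCRECSIZE)+1 byte window over the wget
//   pipe; m_fptr is the parse cursor and m_fptrEnd marks the end of the
//   valid bytes:
/*
	m_fileBuf             m_fptr               m_fptrEnd
	| already parsed      | unparsed bytes     | free space        |'\0'
	+------------------------------------------------------------------+
	- when free space drops below MAXWARCRECSIZE the unparsed tail is
	  memmove()'d back to m_fileBuf and m_bytesStreamed is bumped by
	  the number of parsed bytes that were dropped
	- when a record is bigger than MAXWARCRECSIZE it is skipped by
	  letting m_fptr run past m_fptrEnd (the negative "leftOver" /
	  skipAhead case above)
*/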
// . returns false if would block, true otherwise.
// . returns true and sets g_errno on err
// . injectwarc
bool XmlDoc::indexWarcOrArc ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
if ( ! cr->m_indexWarcs ) {
g_errno = EDOCWARC;
return true;
}
// This can be a busy loop if we have max injections out but we
// are getting a read ready callback. Should we unregister
// when max injections are out and then reregister when we have room?
int32_t max = g_hostdb.m_numHosts * 2;
if ( max > MAXMSG7S ) max = MAXMSG7S;
if ( m_numInjectionsOut >= max ) return false;
char ctype;
if ( m_firstUrl.isWarc() ) {
ctype = CT_WARC;
} else {
ctype = CT_ARC;
}
int8_t *hc = getHopCount();
if ( ! hc ) return true; // error?
if ( hc == (void *)-1 ) return false;
if ( ! m_fileBuf ) {
// Do this exactly once.
m_fileBufAllocSize = (5 * MAXWARCRECSIZE) + 1;
m_fileBuf=(char *)mmalloc(m_fileBufAllocSize ,"sibuf");
m_fptr = m_fileBuf;
m_fptrEnd = m_fileBuf;
m_bytesStreamed = 0;
m_hasMoreToRead = true;
}
if ( ! m_fileBuf ) {
log("build: failed to alloc buf to read archive file %s",m_firstUrl.getUrl());
return true;
}
if(m_hasMoreToRead) readMoreWarc();
setStatus ("injecting archive records");
QUICKPOLL ( m_niceness );
// did an inject return?
if ( m_doneInjectingWarc ) {
warcDone:
// log("build: done parsing %"INT64" bytes of archive file %s. left over =%"INT32 "done injecting %"INT32 " hasmoretoread %"INT32,
// m_bytesStreamed + m_fptrEnd - m_fileBuf,
// m_firstUrl.getUrl(),
// (int32_t)(m_fptrEnd - m_fptr),
// (int32_t)m_doneInjectingWarc,
// (int32_t)m_hasMoreToRead);
m_doneInjectingWarc = true;
// return if all injects have returned.
if ( m_numInjectionsOut == 0) { // && !m_hasMoreToRead
g_errno = m_warcError;
m_indexCode = m_warcError;
m_indexCodeValid = true;
return true;
}
log("build: waiting for injection threads to return.");
// we would block
return false;
}
// Dup strings into here so we don't write nulls into our buffer, sometimes we have
// to rewind over a rec and we want the buf to be the same every time.
char scratchSpace[1024*10];
SafeBuf scratch(scratchSpace, 1024*10);
loop:
scratch.reset();
QUICKPOLL ( m_niceness );
if ( max > MAXMSG7S ) max = MAXMSG7S;
// wait for one to come back before launching another msg7
if ( m_numInjectionsOut >= max ) {
// Don't need to read anymore so don't call us
if(m_registeredWgetReadCallback && m_pipe && m_fptr < m_fptrEnd) {
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
m_registeredWgetReadCallback = false;
}
return false;
}
char *realStart = m_fptr;
// need at least say 100k for warc header
if ( m_fptr + 100000 > m_fptrEnd && m_hasMoreToRead ) {
//log("build need more of the record to process so sleeping.");
if(!m_registeredWgetReadCallback) {
if(!g_loop.registerReadCallback ( fileno(m_pipe),
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return true;
}
log("build: reregistered the read callback. need more");
m_registeredWgetReadCallback = true;
}
return false;
}
int64_t recTime = 0;
char *recIp = NULL;
char *recUrl = NULL;
char *recContent = NULL;
int64_t recContentLen = 0;
// what we skip over
uint64_t recSize = 0;
//
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
//
//log("buf size is %"INT64 " four chars %c%c%c%c%c%c",
//m_fptrEnd-m_fptr, m_fptr[0], m_fptr[1], m_fptr[2], m_fptr[3],m_fptr[4],m_fptr[5]);
if ( ctype == CT_WARC ) {
// find "WARC/1.0" or whatever
char *whp = m_fptr;
if( ! whp ) {
// FIXME: shouldn't get here with a NULL
log("build: No buffer for file=%s", m_firstUrl.getUrl());
goto warcDone;
}
// we do terminate last warc rec with \0 so be aware of that...
int32_t maxCount = 10;
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++);
// none?
if ( ! *whp ) {
log("build: could not find WARC/1 header start for "
"file=%s", m_firstUrl.getUrl());
// we don't really need this and since we force the
// http reply to end in \0 before calling inject2() on
// it it gets messed up
goto warcDone;
}
char *warcHeader = whp;
// find end of warc mime HEADER not the content
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
if ( ! warcHeaderEnd ) {
log("build: could not find end of WARC header for "
"file=%s.",
m_firstUrl.getUrl());
goto warcDone;
}
// \0 term for strstrs below
char tmp = *warcHeaderEnd;
*warcHeaderEnd = '\0';
char *warcLen = strstr(warcHeader,"Content-Length:");
char *warcUrl = strstr(warcHeader,"WARC-Target-URI:");
char *warcType = strstr(warcHeader,"WARC-Type:");
char *warcDate = strstr(warcHeader,"WARC-Date:");
char *warcIp = strstr(warcHeader,"WARC-IP-Address:");
char *warcCon = strstr(warcHeader,"Content-Type:");
// advance
if ( warcLen ) warcLen += 15;
if ( warcUrl ) warcUrl += 16;
if ( warcType ) warcType += 10;
if ( warcIp ) warcIp += 17;
if ( warcCon ) warcCon += 13;
if ( warcDate ) warcDate += 10;
// skip initial spaces
for ( ; warcUrl && is_wspace_a(*warcUrl ) ; warcUrl ++ );
for ( ; warcLen && is_wspace_a(*warcLen ) ; warcLen ++ );
for ( ; warcType && is_wspace_a(*warcType) ; warcType++ );
for ( ; warcDate && is_wspace_a(*warcDate) ; warcDate++ );
for ( ; warcIp && is_wspace_a(*warcIp ) ; warcIp ++ );
for ( ; warcCon && is_wspace_a(*warcCon ) ; warcCon ++ );
// get Content-Length: of WARC header for its content
if ( ! warcLen ) {
// this is a critical stop.
log("build: warc problem: could not find WARC Content-Length:");
goto warcDone;
}
//
// advance m_fptr to point to the next warc record in case we
// end up calling 'goto loop' below
//
recContent = warcHeaderEnd + 4;
recContentLen = atoll(warcLen);
//log("build content len was %"INT64, recContentLen);
char *warcContentEnd = recContent + recContentLen;
recSize = (warcContentEnd - realStart);
recUrl = warcUrl;
// point to the next warc record
m_fptr += recSize;
*warcHeaderEnd = tmp;
//log("skipping %"UINT64, recSize);
// advance the file offset to the next record as well
// get WARC-Type:
// revisit (if url was already done before)
// request (making a GET or DNS request)
// response (response to a GET or dns request)
// warcinfo (crawling parameters, robots: obey, etc)
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
if ( ! warcType ) {
log("build: could not find WARC-Type:");
goto loop;
}
//http://www.mpaa.org/Resources/5bec4ac9-a95e-443b-987b-bff6fb5455a9.pdf
// get Content-Type:
// application/warc-fields (fetch time, hops from seed)
// application/http; msgtype=request (the GET request)
// application/http; msgtype=response (the GET reply)
if ( ! warcCon ) {
log("build: could not find Content-Type:");
goto loop;
}
if ( ! warcUrl ) {
// no URI?
log("build: could not find url");
goto loop;
}
// if WARC-Type: is not response, skip it. so if it
// is a revisit then skip it i guess.
if ( strncmp ( warcType,"response", 8 ) != 0) {
//log("build: was not type response %s *****%s*****", warcUrl, warcType);
// read another warc record
goto loop;
}
// warcConType needs to be
// application/http; msgtype=response
if ( !(strncmp(warcCon,"application/http; msgtype=response",34) == 0 ||
strncmp(warcCon,"application/http;msgtype=response",33) == 0)) {
// read another warc record
//log("build: wrong content type %s ---%s---", warcUrl, warcCon);
goto loop;
}
recTime = 0;
if ( warcDate ) recTime = atotime ( warcDate );
recIp = warcIp;
}
// END WARC SPECIFIC PARSING
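	// . illustrative example of the kind of WARC record header parsed
	//   above (field values are made up):
	/*
		WARC/1.0
		WARC-Type: response
		WARC-Target-URI: http://example.com/page.html
		WARC-Date: 2014-03-27T05:10:27Z
		WARC-IP-Address: 93.184.216.34
		Content-Type: application/http; msgtype=response
		Content-Length: 25034
		<CRLF><CRLF>
		HTTP/1.1 200 OK   <- recContent starts here and runs
		...                  recContentLen bytes, followed by the
		                     next "WARC/1.0" record
	*/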
//
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
//
if ( ctype == CT_ARC ) {
// find \n\nhttp://
char *whp = m_fptr;
for ( ; *whp ; whp++ ) {
if ( whp[0] != '\n' ) continue;
if ( strncmp(whp+1,"http://",7) == 0) break;
if ( strncmp(whp+1,"https://",8) == 0) break;
}
// none?
if ( ! *whp ) {
log("build: arc: could not find next \\nhttp:// in "
"arc file %s",m_firstUrl.getUrl());
goto warcDone;
}
char *arcHeader = whp;
// find end of arc header not the content
char *arcHeaderEnd = strstr(arcHeader+1,"\n");
if ( ! arcHeaderEnd ) {
log("build: warc problem: could not find end of ARC header. file=%s",
m_firstUrl.getUrl());
goto warcDone;
}
// \0 term for strstrs below
char tmp = *arcHeaderEnd;
*arcHeaderEnd = '\0';
char *arcContent = arcHeaderEnd + 1;
// parse arc header line
char *url = arcHeader + 1;
char *hp = url;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 1.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
url = scratch.pushStr(url, hp-url);
hp++;
char *ipStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 2.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
ipStr = scratch.pushStr(ipStr, hp - ipStr);
hp++;
char *timeStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 3.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
timeStr = scratch.pushStr(timeStr, hp - timeStr);
hp++;
char *arcConType = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 4.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
arcConType = scratch.pushStr(arcConType, hp - arcConType);
hp++;
char *arcContentLenStr = hp;
// get arc content len
int64_t arcContentLen = atoll(arcContentLenStr);
char *arcContentEnd = arcContent + arcContentLen;
//uint64_t oldOff = s_off;
recSize = (arcContentEnd - realStart);
// point to the next arc record
m_fptr += recSize;
*arcHeaderEnd = tmp;
// advance the file offset to the next record as well
// arcConType needs to be indexable
int32_t ct = getContentTypeFromStr ( arcConType );
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_PDF &&
ct != CT_XLS &&
ct != CT_PPT &&
ct != CT_PS &&
ct != CT_DOC &&
ct != CT_JSON ) {
// read another arc record
log("build: was not indexable response %s", arcConType);
goto loop;
}
// convert to timestamp
// this time structure, once filled, will help yield a time_t
struct tm t;
// DAY OF MONTH
t.tm_mday = atol2 ( timeStr + 6 , 2 );
// MONTH - tm_mon is 0-11 but the arc timestamp month is 01-12
t.tm_mon = atol2 ( timeStr + 4 , 2 ) - 1;
// YEAR - # of years since 1900
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
// TIME
t.tm_hour = atol2 ( timeStr + 8 , 2 );
t.tm_min = atol2 ( timeStr + 10 , 2 );
t.tm_sec = atol2 ( timeStr + 12 , 2 );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate to a time_t (utc) using timegm
recTime = timegm ( &t );
// set content as well
recContent = arcContent;
recContentLen = arcContentLen;
recUrl = url;
recIp = ipStr;
}
// END ARC SPECIFIC PARSING
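	// . illustrative example of the one-line ARC record header parsed
	//   above: space-separated url, ip, YYYYMMDDhhmmss fetch time,
	//   content type and content length (the values shown are made up):
	/*
	http://example.com/index.html 93.184.216.34 20061208123456 text/html 15327
	HTTP/1.1 200 OK   <- arcContent starts on the next line and runs
	...                  for arcContentLen bytes
	*/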
// must be http not dns:
// url must start with http:// or https://
// it's probably like WARC-Target-URI: dns:www.xyz.com
// so it is a dns response
if ( strncmp(recUrl,"http://" ,7) != 0 &&
strncmp(recUrl,"https://",8) != 0 )
goto loop;
// get length of it, null term it
char *recUrlEnd = recUrl;
for ( ; *recUrlEnd && ! is_wspace_a(*recUrlEnd) ; recUrlEnd++ );
int32_t recUrlLen = recUrlEnd - recUrl;
//*recUrlEnd = '\0';
// skip if robots.txt
if ( isRobotsTxtFile( recUrl , recUrlLen ) )
goto loop;
// how can there be no more to read?
if ( m_fptr > m_fptrEnd && ! m_hasMoreToRead ) {
log("build: warc problem: archive file %s exceeded file length.",
m_firstUrl.getUrl());
goto warcDone;
}
// if we fall outside of the current read buf, read next rec if too big
if ( m_fptr > m_fptrEnd && recSize > MAXWARCRECSIZE ) {
log("build: skipping archive file of %"INT64" "
"bytes which is too big",recSize);
if(!m_registeredWgetReadCallback) {
if(!g_loop.registerReadCallback ( fileno(m_pipe),
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return true;
}
log("build: reregistered the read callback. skip bigrec");
m_registeredWgetReadCallback = true;
}
return false;
}
// don't read the next record, read THIS one again, we can fit it
if ( m_fptr > m_fptrEnd ) {
//log("build: record end is past the end of what we read by %"INT64 " %"UINT64, m_fptrEnd - m_fptr, recSize);
m_fptr -= recSize;
if(!m_registeredWgetReadCallback) {
if(!g_loop.registerReadCallback ( fileno(m_pipe),
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return true;
}
log("build: reregistered the read callback. reread this record");
m_registeredWgetReadCallback = true;
}
return false;
}
char *httpReply = recContent;
int64_t httpReplySize = recContentLen;
// should be a mime that starts with GET or POST
HttpMime m;
if ( ! m.set ( httpReply , httpReplySize , NULL ) ) {
log("build: archive: failed to set http mime at in "
"file");
goto loop;
}
// check content type
int ct2 = m.getContentType();
if ( ct2 != CT_HTML &&
ct2 != CT_TEXT &&
ct2 != CT_XML &&
ct2 != CT_PDF &&
ct2 != CT_XLS &&
ct2 != CT_PPT &&
ct2 != CT_PS &&
ct2 != CT_DOC &&
ct2 != CT_JSON ) {
//log("build:got wrong type %"INT32, (int32_t)ct2);
goto loop;
}
// grab an available msg7
Msg7 *msg7 = NULL;
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) {
msg7 = m_msg7s[i];
// if we got an available one stop
if ( msg7 ) {
if( msg7->m_inUse ) continue;
break; // reuse this one.
}
// ok, create one, 1MB each about
try { msg7 = new ( Msg7 ); }
catch ( ... ) {g_errno=ENOMEM;m_warcError=g_errno;return true;}
mnew ( msg7 , sizeof(Msg7),"xdmsgs7");
// store it for re-use
m_msg7s[i] = msg7;
break;
}
if(!msg7 || msg7->m_inUse) {
// shouldn't happen, but it does... why?
log("build: archive: Ran out of msg7s to inject doc.");
return false;
}
// inject input parms:
InjectionRequest *ir = &msg7->m_injectionRequest;
// reset it
ir->m_hopCount = *hc + 1;
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
ir->m_collnum = m_collnum;
// will this work on a content-delimited doc?
ir->m_deleteUrl = m_deleteFromIndex;
// each subdoc will have a mime since it is a warc
ir->m_hasMime = true;
// it has a mime so we shouldn't need to set this
ir->ptr_contentTypeStr = NULL;
// we are injecting a single page, not a container file
ir->ptr_contentDelim = NULL;
// miscellaneous. faster than memsetting the whole gr class (32k)
ir->m_getSections = 0;
ir->m_gotSections = 0;
ir->m_spiderLinks = false;
ir->m_injectLinks = false;
ir->m_shortReply = false;
ir->m_newOnly = false;
ir->m_recycle = false;
ir->m_dedup = true;
ir->m_doConsistencyTesting = false;
ir->m_charset = 0;
ir->ptr_queryToScrape = NULL;
ir->ptr_contentFile = NULL;
ir->ptr_diffbotReply = NULL;
// Stick the capture date in the metadata
StackBuf(newKey);
newKey.safePrintf("\"gbcapturedate\":%"INT64, recTime);
SafeBuf newMetadata(newKey.length() * 2 + size_metadata, "ModifiedMetadata");
newMetadata.safeMemcpy(ptr_metadata, size_metadata);
Json::prependKey(newMetadata, newKey.getBufStart());
ir->ptr_metadata = newMetadata.getBufStart();
ir->size_metadata = newMetadata.length();
newMetadata.nullTerm();
// set 'timestamp' for injection
ir->m_firstIndexed = recTime;
ir->m_lastSpidered = recTime;
// set 'ip' for injection
ir->m_injectDocIp = 0;
// get the record IP address from the warc header if there
if ( recIp ) {
// get end of ip
char *ipEnd = recIp;
// skip digits and periods
while ( *ipEnd && ! is_wspace_a(*ipEnd) ) ipEnd++;
// we now have the ip address for doing ip: searches
// this func is in ip.h
ir->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
}
// we end up repopulating m_fileBuf to read the next warc sometimes
// so do not destroy the content we are injecting from the original
// m_fileBuf. so we have to copy it.
msg7->m_contentBuf.reset();
msg7->m_contentBuf.reserve ( httpReplySize + 5 );
msg7->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
msg7->m_contentBuf.nullTerm();
// set 'content' for injection
ir->ptr_content = msg7->m_contentBuf.getBufStart();
ir->size_content = msg7->m_contentBuf.getLength() + 1;
// set the rest of the injection parms
ir->m_hopCount = -1;
ir->m_newOnly = 0;
// all warc records have the http mime
ir->m_hasMime = true;
ir->ptr_url = recUrl;
ir->size_url = recUrlLen+1;
// stash this
msg7->m_stashxd = this;
QUICKPOLL ( m_niceness );
// log it
*recUrlEnd = '\0';
log("build: archive: injecting archive url %s",recUrl);
QUICKPOLL ( m_niceness );
if (msg7->sendInjectionRequestToHost(ir,msg7,doneInjectingArchiveRec)){
m_numInjectionsOut++;
msg7->m_inUse = true;
goto loop;
}
log("build: index archive: msg7 inject: %s",
mstrerror(g_errno));
goto loop;
}
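// . illustrative summary of the indexWarcOrArc() state machine above,
//   since the function is re-entered many times through m_masterLoop:
/*
	indexWarcOrArc():
	  readMoreWarc()          - top up m_fileBuf from the wget pipe
	  if done injecting and m_numInjectionsOut == 0 -> return true
	loop:
	  parse the next WARC/ARC header at m_fptr
	    - not a "response" record, robots.txt, bad content type
	      -> goto loop
	    - record runs past m_fptrEnd -> register the read callback and
	      return false (re-entered when more bytes arrive)
	  copy the http reply into msg7->m_contentBuf and call
	  sendInjectionRequestToHost() -> m_numInjectionsOut++, goto loop

	  doneInjectingArchiveRec() decrements m_numInjectionsOut and
	  re-enters this function through m_masterLoop
*/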
void getTitleRecBufWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in get title rec wrapper" );
// return if it blocked
if ( THIS->getTitleRecBuf() == (void *)-1 ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
key_t *XmlDoc::getTitleRecKey() {
if ( m_titleRecBufValid ) return &m_titleRecKey;
SafeBuf *tr = getTitleRecBuf();
if ( ! tr || tr == (void *)-1 ) return (key_t *)tr;
return &m_titleRecKey;
}
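// . illustrative note on the tri-state return convention used by nearly
//   every getX() accessor in this class (getTitleRecBuf, getIndexCode,
//   getIp, getCharset, ...): NULL means error with g_errno set, -1 means
//   the call blocked and m_masterLoop will be invoked later, anything
//   else is the valid, cached value. a minimal caller sketch:
/*
	int32_t *ip = getIp();
	if ( ! ip                ) return NULL;       // error, g_errno set
	if ( ip == (int32_t *)-1 ) return (void *)-1; // blocked; we will be
	                                              // re-entered later
	// *ip is valid here and m_ipValid is true
*/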
int32_t *XmlDoc::getIndexCode ( ) {
int32_t *indexCode = getIndexCode2();
if ( ! indexCode || indexCode == (void *)-1 ) return indexCode;
// if zero good!
if ( *indexCode == 0 ) return indexCode;
//
// should we neutralize it?
//
// in the case of indexing dmoz urls output from
// 'dmozparse urldump -s', the dump includes a meta tag
// (<meta name=ignorelinksexternalerrors content=1>) that
// indicates to index the links even in the case of some errors,
// so that we are assured of having exactly the same urls dmoz
// has in our index. so when we do a gbcatid:xxx query we get the
// same urls in the search results that dmoz has for that category id.
if ( ! m_sreqValid || ! m_sreq.m_ignoreExternalErrors )
return indexCode;
// only neutralize certain errors
if ( * indexCode != EDNSTIMEDOUT
&& *indexCode != ETCPTIMEDOUT
&& *indexCode != EUDPTIMEDOUT
// from m_redirError
&& *indexCode != EDOCSIMPLIFIEDREDIR
&& *indexCode != EDOCNONCANONICAL
&& *indexCode != EDNSDEAD
&& *indexCode != ENETUNREACH
&& *indexCode != EHOSTUNREACH
&& *indexCode != EDOCFILTERED
&& *indexCode != EDOCREPEATSPAMMER
&& *indexCode != EDOCDUP
&& *indexCode != EDOCISERRPG
&& *indexCode != EDOCHIJACKED
&& *indexCode != EDOCBADHTTPSTATUS
&& *indexCode != EDOCDISALLOWED
&& *indexCode != EBADCHARSET
&& *indexCode != EDOCDUPWWW
&& *indexCode != EBADIP
&& *indexCode != EDOCEVILREDIRECT // fix video.google.com dmoz
&& *indexCode != EBADMIME
// index.t and .exe files are in dmoz but those
// extensions are "bad" according to Url::isBadExtension()
&& *indexCode != EDOCBADCONTENTTYPE
// repeat url path components are ok:
&& *indexCode != ELINKLOOP
&& *indexCode != ECONNREFUSED
// malformed sections:
&& *indexCode != EDOCBADSECTIONS
&& *indexCode != ECORRUPTHTTPGZIP
)
return indexCode;
// ok, neutralize it
*indexCode = 0;
// if we could not get an ip we need to make a fake one
if ( ! m_ipValid || m_ip == 0 || m_ip == -1 ) {
log("build: ip unattainable. forcing ip address of %s "
"to 10.5.123.45",m_firstUrl.m_url);
m_ip = atoip("10.5.123.45");
m_ipValid = true;
}
// make certain things valid to avoid core in getNewSpiderReply()
if ( ! m_crawlDelayValid ) {
m_crawlDelayValid = true;
m_crawlDelay = -1;
}
return indexCode;
}
// . return NULL and sets g_errno on error
// . returns -1 if blocked
int32_t *XmlDoc::getIndexCode2 ( ) {
// return it now if we got it already
if ( m_indexCodeValid ) return &m_indexCode;
setStatus ( "getting index code");
// page inject can set deletefromindex to true
if ( m_deleteFromIndex ) {
m_indexCode = EDOCFORCEDELETE;
m_indexCodeValid = true;
return &m_indexCode;
}
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block and return -1, we will be re-called from the top
//if ( ! m_masterLoop ) {
// m_masterLoop = getTitleRecWrapper;
// m_masterState = this;
//}
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( m_firstUrl.m_ulen <= 5 ) {
m_indexCode = EBADURL;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( m_firstUrl.m_ulen + 1 >= MAX_URL_LEN ) {
m_indexCode = EURLTOOLONG;
m_indexCodeValid = true;
return &m_indexCode;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// "url is repeating path components" error?
if ( ! m_check1 ) {
m_check1 = true;
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
m_indexCode = ELINKLOOP;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// fix for "http://.xyz.com/...."
if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
m_indexCode = EBADURL;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
m_check2 = true;
if ( m_firstUrl.isSpam() ) {
m_indexCode = EDOCURLSPAM;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// . don't spider robots.txt urls for indexing!
// . quickly see if we are a robots.txt url originally
int32_t fulen = getFirstUrl()->getUrlLen();
char *fu = getFirstUrl()->getUrl();
char *fp = fu + fulen - 11;
if ( fulen > 12 &&
fp[1] == 'r' &&
! strncmp ( fu + fulen - 11 , "/robots.txt" , 11 )) {
m_indexCode = EBADURL;
m_indexCodeValid = true;
return &m_indexCode;
}
// if this is an injection and "newonly" is not zero then we
// only want to do the injection if the url is "new", meaning not
// already indexed. "m_wasContentInjected" will be true if this is
// an injection. "m_newOnly" will be true if the injector only
// wants to proceed with the injection if this url is not already
// indexed.
if ( m_wasContentInjected && m_newOnly ) {
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
XmlDoc *od = *pod;
// if the old doc does exist and WAS NOT INJECTED itself
// then abandon this injection. it was spidered the old
// fashioned way and we want to preserve it and NOT overwrite
// it with this injection.
if ( od && ! od->m_wasContentInjected ) {
m_indexCode = EABANDONED;
m_indexCodeValid = true;
return &m_indexCode;
}
// if it was injected itself, only abandon this injection
// in the special case that m_newOnly is "1". otherwise
// if m_newOnly is 2 then we will overwrite any existing
// titlerecs that were not injected themselves.
if ( od && od->m_wasContentInjected && m_newOnly == 1 ) {
m_indexCode = EABANDONED;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// need tagrec to see if banned
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// this is an automatic ban!
if ( gr->getLong("manualban",0) ) {
m_indexCode = EDOCBANNED;
m_indexCodeValid = true;
return &m_indexCode;
}
// get the ip of the current url
int32_t *ip = getIp ( );
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
if ( *ip == 0 ) {
m_indexCode = EBADIP;
m_indexCodeValid = true;
return &m_indexCode;
}
// . check robots.txt
	// . uses the current url
// . if we end in /robots.txt then this quickly returns true
// . no, we still might want to index if we got link text, so just
// check this again below
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1) return (int32_t *)isAllowed;
/*
if ( ! *isAllowed ) {
m_indexCode = EDOCDISALLOWED;
m_indexCodeValid = true;
return &m_indexCode;
}
*/
// . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc.
// . this will be the reply from diffbot.com if using diffbot
int32_t *dstatus = getDownloadStatus();
if ( ! dstatus || dstatus == (void *)-1 ) return (int32_t *)dstatus;
if ( *dstatus ) {
m_indexCode = *dstatus;
m_indexCodeValid = true;
return &m_indexCode;
}
// check the mime
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (int32_t *)mime;
// no, now the smart compression will nuke a reply if it has
// no good date or for other reasons...
// if empty, bad mime
//if ( mime->getMimeLen() <= 0 && ! m_recycleContent ) {
// m_indexCode = EBADMIME;
// m_indexCodeValid = true;
// return &m_indexCode;
//}
// check redir url
Url **redirp = getRedirUrl();
if ( ! redirp || redirp == (void *)-1 ) return (int32_t *)redirp;
// this must be valid now
if ( ! m_redirErrorValid ) { char *xx=NULL;*xx=0; }
if ( m_redirError ) {
m_indexCode = m_redirError;
m_indexCodeValid = true;
return &m_indexCode;
}
int64_t *d = getDocId();
if ( ! d || d == (void *)-1 ) return (int32_t *)d;
if ( *d == 0LL ) {
m_indexCode = ENODOCID;
m_indexCodeValid = true;
return &m_indexCode;
}
// . is the same url but with a www. present already in titledb?
// . example: if we are xyz.com and www.xyz.com is already in titledb
// then nuke ourselves by setting m_indexCode to EDOCDUPWWW
char *isWWWDup = getIsWWWDup ();
if ( ! isWWWDup || isWWWDup == (char *)-1) return (int32_t *)isWWWDup;
if ( *isWWWDup ) {
m_indexCode = EDOCDUPWWW;
m_indexCodeValid = true;
return &m_indexCode;
}
uint16_t *charset = getCharset();
if ( ! charset && g_errno == EBADCHARSET ) {
g_errno = 0;
m_indexCode = EBADCHARSET;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( ! charset || charset == (void *)-1) return (int32_t *)charset;
// we had a 2024 for charset come back and that had a NULL
// get_charset_str() but it was not supported
if ( ! supportedCharset(*charset) ) { //&&get_charset_str(*charset) ) {
m_indexCode = EBADCHARSET;
m_indexCodeValid = true;
return &m_indexCode;
}
// get local link info
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int32_t *)info1;
// get remote link info
LinkInfo **pinfo2 = getLinkInfo2();
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (int32_t *)pinfo2;
LinkInfo *info2 = *pinfo2;
// if robots.txt said no, and if we had no link text, then give up
bool disallowed = true;
if ( *isAllowed ) disallowed = false;
if ( info1 && info1->hasLinkText() ) disallowed = false;
if ( info2 && info2->hasLinkText() ) disallowed = false;
// if we generated a new sitenuminlinks to store in tagdb, we might
// want to add this for that only reason... consider!
if ( disallowed ) {
m_indexCode = EDOCDISALLOWED;
m_indexCodeValid = true;
return &m_indexCode;
}
// check for bad url extension, like .jpg
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (int32_t *)cu;
// take this check out because it is hurting
// http://community.spiceworks.com/profile/show/Mr.T
// because 't' was in the list of bad extensions.
// now we use the url filters table to exclude the extensions we want.
// and we use the 'ismedia' directive to exclude common media
// extensions. having this check here is no longer needed and confusing
	// BUT on the other hand stuff like .exe .rpm .deb is good to avoid!
// so i'll just edit the list to remove more ambiguous extensions
// like .f and .t
bool badExt = cu->isBadExtension ( m_version );
if ( badExt && ! info1->hasLinkText() &&
( ! info2 || ! info2->hasLinkText() ) ) {
m_indexCode = EDOCBADCONTENTTYPE;
m_indexCodeValid = true;
return &m_indexCode;
}
int16_t *hstatus = getHttpStatus();
if ( ! hstatus || hstatus == (void *)-1 ) return (int32_t *)hstatus;
if ( *hstatus != 200 ) {
m_indexCode = EDOCBADHTTPSTATUS;
m_indexCodeValid = true;
return &m_indexCode;
}
// debug point
//if ( cr->m_localCrawlInfo.m_pageDownloadAttempts >= 2 ) {
// m_indexCode = ETCPTIMEDOUT;
// m_indexCodeValid = true;
// return &m_indexCode;
//}
// if this page is hijacked, toss it!
char *hj = getIsHijacked();
if ( ! hj || hj == (char *)-1 ) return (int32_t *)hj;
// if not allowed m_indexCode will be set
if ( *hj ) {
m_indexCode = EDOCHIJACKED;
m_indexCodeValid = true;
return &m_indexCode;
}
// check for EDOCISERRPG (custom error pages)
char *isErrorPage = getIsErrorPage();
if ( !isErrorPage||isErrorPage==(void *)-1) return (int32_t *)isErrorPage;
if ( *isErrorPage ) {
m_indexCode = EDOCISERRPG;
m_indexCodeValid = true;
return &m_indexCode;
}
// . i moved this up to perhaps fix problems of two dup pages being
// downloaded at about the same time
// . are we a dup of another doc from any other site already indexed?
char *isDup = getIsDup();
if ( ! isDup || isDup == (char *)-1 ) return (int32_t *)isDup;
if ( *isDup ) {
m_indexCode = EDOCDUP;
m_indexCodeValid = true;
return &m_indexCode;
}
	// . is a non-canonical page that has <link href=xxx rel=canonical>
	// . also sets m_canonicalUrl.m_url to it if we are not
// . returns NULL if we are the canonical url
// . do not do this check if the page was injected
bool checkCanonical = true;
if ( m_wasContentInjected ) checkCanonical = false;
if ( m_isInjecting && m_isInjectingValid ) checkCanonical = false;
// do not do canonical deletion if recycling content either i guess
if ( m_sreqValid && m_sreq.m_recycleContent ) checkCanonical = false;
// do not delete from being canonical if doing a query reindex
if ( m_sreqValid && m_sreq.m_isPageReindex ) checkCanonical = false;
if ( checkCanonical ) {
Url **canon = getCanonicalRedirUrl();
if ( ! canon || canon == (void *)-1 ) return (int32_t *)canon;
		// if there is one then we are its leaf, it is the primary
// page so we should not index ourselves
if ( *canon ) {
m_indexCode = EDOCNONCANONICAL;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// was page unchanged since last time we downloaded it?
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
XmlDoc *od = NULL;
if ( *pod ) od = *pod;
// if recycling content is true you gotta have an old title rec.
if ( ! od && m_recycleContent ) {
m_indexCode = ENOTITLEREC;
m_indexCodeValid = true;
return &m_indexCode;
}
bool check = true;
if ( ! od ) check = false;
// do not do this logic for diffbot because it might want to get
// the diffbot reply even if page content is the same, because it
// might have an ajax call that updates the product price.
// onlyProcessIfNewUrl defaults to true, so typically even diffbot
// crawls will do this check.
if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl &&
// but allow urls like *-diffbotxyz2445187448 to be deduped,
// that is the whole point of this line
! m_isDiffbotJSONObject )
check = false;
if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError )
check = false;
// or if recycling content turn this off as well! otherwise
// it will always be 100% the same
if ( m_recycleContent )
check = false;
// never check for a bulk job
if ( cr->m_isCustomCrawl == 2 )
check = false;
if ( check ) {
// check inlinks now too!
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 )
return (int32_t *)info1;
LinkInfo *info2 = od->getLinkInfo1 ();
if ( ! info2 || info2 == (LinkInfo *)-1 )
return (int32_t *)info2;
Inlink *k1 = NULL;
Inlink *k2 = NULL;
char *s1, *s2;
int32_t len1,len2;
if ( info1->getNumGoodInlinks() !=
info2->getNumGoodInlinks() )
goto changed;
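		// walk the old and new inlink lists in lockstep; a change in
		// any inlink's siteNumInlinks or link text means the doc's
		// link info changed, so treat the whole doc as changed.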
for ( ; k1=info1->getNextInlink(k1) ,
k2=info2->getNextInlink(k2); ) {
if ( ! k1 )
break;
if ( ! k2 )
break;
if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks )
goto changed;
s1 = k1->getLinkText();
len1 = k1->size_linkText - 1; // exclude \0
s2 = k2->getLinkText();
len2 = k2->size_linkText - 1; // exclude \0
if ( len1 != len2 )
goto changed;
if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
goto changed;
}
// no change in link text, look for change in page content now
int32_t *ch32 = getContentHash32();
if ( ! ch32 || ch32 == (void *)-1 ) return (int32_t *)ch32;
if ( *ch32 == od->m_contentHash32 ) {
m_indexCode = EDOCUNCHANGED;
m_indexCodeValid = true;
// hack these values on or off.
// really should be function calls.
// but it never gets set when it should if the
// doc is unchanged.
m_sentToDiffbot = od->m_sentToDiffbot;
return &m_indexCode;
}
}
changed:
// words
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (int32_t *)words;
// we set the D_IS_IN_DATE flag for these bits
Bits *bits = getBits(); if ( ! bits ) return NULL;
// . check for date buffer overflow before setting sections
// . returns false and sets g_errno on error
/*
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits )) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: parseDates: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
if ( g_errno != EBUFOVERFLOW ) return NULL;
g_errno = 0;
m_indexCode = EDOCBADDATES;
m_indexCodeValid = true;
return &m_indexCode;
}
*/
// bad sections? fixes http://www.beerexpedition.com/northamerica.shtml
// being continuously respidered when its lock expires every
// MAX_LOCK_AGE seconds
Sections *sections = getSections();
// on EBUFOVERFLOW we will NEVER be able to parse this url
// correctly so do not retry!
if ( ! sections && g_errno == EBUFOVERFLOW ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
if (!sections||sections==(Sections *)-1) return (int32_t *)sections;
if ( sections->m_numSections == 0 && words->m_numWords > 0 ) {
m_indexCode = EDOCBADSECTIONS;
m_indexCodeValid = true;
return &m_indexCode;
}
// i think an oom error is not being caught by Sections.cpp properly
if ( g_errno ) { char *xx=NULL;*xx=0; }
Dates *dp = getDates();
if ( ! dp && g_errno == EBUFOVERFLOW ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
// make sure address buffers did not overflow
Addresses *aa = getAddresses ();
if ( (! aa && g_errno == EBUFOVERFLOW) ||
// it sets m_breached now if there's a problem
(aa && aa->m_breached) ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( ! aa || aa == (void *)-1 ) return (int32_t *)aa;
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (int32_t *)isRoot;
// get the tag rec
//TagRec *gr = getTagRec ();
//if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
bool spamCheck = true;
// if we are a root, allow repeat spam
if ( *isRoot ) spamCheck = false;
// if we are being spidered deep, allow repeat spam
if ( gr->getLong("deep",0) ) spamCheck = false;
// not for crawlbot
if ( cr->m_isCustomCrawl ) spamCheck = false;
// only html for now
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
// turn this off for now
spamCheck = false;
// otherwise, check the weights
if ( spamCheck ) {
char *ws = getWordSpamVec();
if ( ! ws || ws == (void *)-1 ) return (int32_t *)ws;
if ( m_isRepeatSpammer ) {
m_indexCode = EDOCREPEATSPAMMER;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// validate this here so getSpiderPriority(), which calls
// getUrlFilterNum(), which calls getNewSpiderReply(), which calls
// us, getIndexCode() does not repeat all this junk
//m_indexCodeValid = true;
//m_indexCode = 0;
// fix query reindex on global-index from coring because
// the spider request is null
if ( m_isDiffbotJSONObject ) {
m_indexCode = 0;
m_indexCodeValid = true;
return &m_indexCode;
}
// this needs to be last!
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1) {
// allow this though
if ( g_errno == EBUFOVERFLOW ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
// but if it blocked, then un-validate it
m_indexCodeValid = false;
// and return to be called again i hope
return (int32_t *)priority;
}
if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) {
m_indexCode = EDOCFILTERED;
m_indexCodeValid = true;
return &m_indexCode;
}
// if ( *priority == SPIDER_PRIORITY_BANNED ) {
// m_indexCode = EDOCBANNED;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// . if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
// . NO, don't do this anymore, however, if there is a diffbot
// reply error then record it in the spider reply BUT only if it is
// a diffbot reply error that warrants a retry. for instance,
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
// error trying to download the page so it probably should not
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
// SafeBuf *dbr = getDiffbotReply();
// if ( ! dbr || dbr == (void *)-1 ) return (int32_t *)dbr;
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
// m_indexCode= m_diffbotReplyError;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// no error otherwise
m_indexCode = 0;
m_indexCodeValid = true;
return &m_indexCode;
}
char *XmlDoc::prepareToMakeTitleRec ( ) {
// do not re-call this for speed
if ( m_prepared ) return (char *)1;
int32_t *indexCode = getIndexCode();
if (! indexCode || indexCode == (void *)-1) return (char *)indexCode;
if ( *indexCode ) { m_prepared = true; return (char *)1; }
//
// do all the sets here
//
// . this gets our old doc from titledb, if we got it
// . TODO: make sure this is cached in the event of a backoff, we
// will redo this again!!! IMPORTANT!!!
char *isIndexed = getIsIndexed();
if ( ! isIndexed || isIndexed == (char *)-1) return (char *)isIndexed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are injecting into the "qatest123" coll, then we need to have
// m_spideredTimeValid be true before calling getIsSpam() which calls
// getSiteNumInlinks() which adds tags to tagdb using that date, but
// only for the "qatest123" coll!
// that keeps our parser output consistent across runs!
char **content = NULL;
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
content = getContent ( );
if ( ! content || content == (void *)-1 )
return (char *)content;
}
// get our site root
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (char *)mysite;
// if we are a root page, update tagdb with the root lang id
//bool *status1 = updateRootLangId();
//if ( ! status1 || status1 == (void *)-1 ) return (char *)status1;
// if we are a root page, update tagdb with the root lang id
//bool *status2 = updateSiteTitleBuf();
//if ( ! status2 || status2 == (void *)-1 ) return (char *)status2;
// if we found some default venue addresses on page, add to tagdb
//bool *status3 = updateVenueAddresses();
//if ( ! status3 || status3 == (void *)-1 ) return (char *)status3;
// add "firstip" to tag rec if we need to
//bool *status4 = updateFirstIp();
//if ( ! status4 || status4 == (void *)-1 ) return (char *)status4;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
int32_t *datedbDate = getPubDate();
if ( ! datedbDate || datedbDate == (int32_t *)-1 )
return (char *)datedbDate;
getHostHash32a();
getContentHash32();
//Images *images = getImages();
//if ( ! images || images == (Images *)-1 ) return (char *)images;
char **id = getThumbnailData();
if ( ! id || id == (void *)-1 ) return (char *)id;
int8_t *hopCount = getHopCount();
if ( ! hopCount || hopCount == (void *)-1 ) return (char *)hopCount;
char *spiderLinks = getSpiderLinks();
if ( ! spiderLinks || spiderLinks == (char *)-1 )
return (char *)spiderLinks;
//int32_t *nextSpiderTime = getNextSpiderTime();
//if ( ! nextSpiderTime || nextSpiderTime == (int32_t *)-1 )
// return (char *)nextSpiderTime;
//int8_t *nextSpiderPriority = getNextSpiderPriority();
//if ( ! nextSpiderPriority || nextSpiderPriority == (void *)-1 )
// return (char *)nextSpiderPriority;
int32_t *firstIndexedDate = getFirstIndexedDate();
if ( ! firstIndexedDate || firstIndexedDate == (int32_t *)-1 )
return (char *)firstIndexedDate;
int32_t *outlinksAddedDate = getOutlinksAddedDate();
if ( ! outlinksAddedDate || outlinksAddedDate == (int32_t *)-1 )
return (char *)outlinksAddedDate;
uint16_t *countryId = getCountryId();
if ( ! countryId||countryId==(uint16_t *)-1) return (char *)countryId;
char *trunc = getIsContentTruncated();
if ( ! trunc || trunc == (char *)-1 ) return (char *)trunc;
char *pl = getIsPermalink();
if ( ! pl || pl == (char *)-1 ) return (char *)pl;
//int32_t *numBannedOutlinks = getNumBannedOutlinks();
// set this
//m_numBannedOutlinks8 = score32to8 ( *numBannedOutlinks );
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (char *)dp;
// . before storing this into title Rec, make sure all tags
// are valid and tagRec is up to date
// . like we might need to update the contact info, siteNumInlinks,
// or other tags because, for instance, contact info might not
// be in there because isSpam() never required it.
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
char *ict = getIsContentTruncated();
if ( ! ict || ict == (char *)-1 ) return (char *)ict;
int64_t **wd = getWikiDocIds();
if ( ! wd || wd == (void *)-1 ) return (char *)wd;
int64_t **avp = getAdVector();
if ( ! avp || avp == (void *)-1 ) return (char *)avp;
char *at = getIsAdult();
if ( ! at || at == (void *)-1 ) return (char *)at;
char *ls = getIsLinkSpam();
if ( ! ls || ls == (void *)-1 ) return (char *)ls;
uint32_t *tph = getTagPairHash32();
if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
// sets the ptr_sectionsReply, that is all we need it to do
//char **sd = getSectionsReply ( ) ;
//if ( ! sd || sd == (void *)-1 ) return (char *)sd;
// sets the ptr_addressReply, that is all we need it to do
//char **ad = getAddressReply ( ) ;
//if ( ! ad || ad == (void *)-1 ) return (char *)ad;
uint8_t *rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (char *)rl;
int32_t **pcids = getCatIds();
if ( ! pcids || pcids == (void *)-1) return (char *)pcids;
// get dmoz ptr_dmozTitles, ptr_dmozSumms, ptr_dmozAnchors
if ( ! setDmozInfo() ) return (char *)-1;
m_prepared = true;
return (char *)1;
}
#define MAX_DMOZ_TITLES 10
int32_t *XmlDoc::getNumDmozEntries() {
// MDW: wth is this?
//int32_t **getDmozCatIds();
int32_t nc = size_catIds / 4;
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
m_numDmozEntries = nc;
return &m_numDmozEntries;
}
// list of \0 terminated titles, etc. use getNumDmozTitles() to get #
char **XmlDoc::getDmozTitles ( ) {
// returns false if blocked
if ( ! setDmozInfo() ) return (char **)-1;
if ( g_errno ) return NULL;
return &ptr_dmozTitles;
}
char **XmlDoc::getDmozSummaries ( ) {
// returns false if blocked
if ( ! setDmozInfo() ) return (char **)-1;
if ( g_errno ) return NULL;
return &ptr_dmozSumms;
}
char **XmlDoc::getDmozAnchors ( ) {
// returns false if blocked
if ( ! setDmozInfo() ) return (char **)-1;
if ( g_errno ) return NULL;
return &ptr_dmozAnchors;
}
// returns false if blocked, true otherwise. sets g_errno on error & rets true
bool XmlDoc::setDmozInfo () {
if ( m_dmozInfoValid ) return true;
g_errno = 0;
// return true and set g_errno on error
if ( ! m_dmozBuf.reserve(12000) ) {
log("xmldoc: error getting dmoz info: %s",mstrerror(g_errno));
// ensure log statement does not clear g_errno
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// start here
char *dmozBuf = m_dmozBuf.getBufStart();
char *titles = dmozBuf;
char *summs = dmozBuf+5000;
char *anchors = dmozBuf+10000;
// the end of it
char *dtend = dmozBuf + 5000;
char *dsend = dmozBuf + 10000;
char *daend = dmozBuf + 12000;
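	// layout of the 12000 bytes reserved in m_dmozBuf above:
	//   [    0, 5000) titles
	//   [ 5000,10000) summaries
	//   [10000,12000) anchors
	// each region is a run of \0-terminated strings, one per dmoz
	// category id, capped at MAX_DMOZ_TITLES entries.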
// point into those bufs
char *dt = titles;
char *ds = summs;
char *da = anchors;
// MDW: i limit this to 10 to save stack space!
int32_t nc = size_catIds / 4;
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
for (int32_t i = 0; i < nc ; i++) {
// breathe
QUICKPOLL ( m_niceness );
// temp stuff
int32_t dtlen = 0;
int32_t dslen = 0;
unsigned char dalen = 0;
// . store all dmoz info separated by \0's into titles[] buffer
// . crap, this does a disk read and blocks on that
//
// . TODO: make it non-blocking!!!!
//
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
m_firstUrl.getUrlLen(),
ptr_catIds[i],
dt,//&titles[titlesLen],
&dtlen,//&titleLens[i],
dtend-dt,
ds,//&summs[summsLen],
&dslen,//&summLens[i],
dsend-ds,
da,//&anchors[anchorsLen],
&dalen,//&anchorLens[i],
daend-da,
m_niceness);
// advance ptrs
dt += dtlen;
ds += dslen;
da += dalen;
// null terminate
*dt++ = 0;
*ds++ = 0;
		*da++ = 0;
}
// if empty, make it a \0 to keep in sync with the rest
if ( dt == titles ) *dt++ = '\0';
if ( ds == summs ) *ds++ = '\0';
if ( da == anchors ) *da++ = '\0';
// set these
ptr_dmozTitles = titles;
ptr_dmozSumms = summs;
ptr_dmozAnchors = anchors;
size_dmozTitles = dt - titles;
size_dmozSumms = ds - summs;
size_dmozAnchors = da - anchors;
m_dmozInfoValid = true;
return true;
}
// . create and store the titlerec into "buf".
// . it is basically the header part of all the member vars in this XmlDoc.
// . it has a key,dataSize,compressedData so it can be a record in an Rdb
// . return true on success, false on failure
bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
//setStatus ( "making title rec");
// assume could not make one because we were banned or something
tbuf->purge(); // m_titleRec = NULL;
	// start setting members in THIS's header before compression
m_version = TITLEREC_CURRENT_VERSION;
// tag rec must have "sitenuminlinks" in it
//if (! m_newTagRec.getTag("sitenuminlinks") ) { char *xx=NULL;*xx=0; }
// we often update m_oldTagRec above by calling updateRootLangId(), etc
	// so update the size of our tag rec here
//size_tagRecData = m_oldTagRec.getSize();
// and sanity check this
//if( ptr_tagRecData != (char *)&m_oldTagRec ) { char *xx=NULL;*xx=0; }
// lookup dmoz title and summary for this site
//int32_t titleLens [10];
//int32_t summLens [10];
//unsigned char anchorLens [10];
//int32_t titlesLen = 0;
//int32_t summsLen = 0;
//int32_t anchorsLen = 0;
//char titles [10*1024];
//char summs [10*4096];
//char anchors [10* 256];
/*
MDW oct 12 2013 -
why is this here? we should store this info at spider time?
char *titles = m_dmozBuf;
char *summs = m_dmozBuf+5000;
char *anchors = m_dmozBuf+10000;
// the end of it
char *dtend = m_dmozBuf + 5000;
char *dsend = m_dmozBuf + 10000;
char *daend = m_dmozBuf + 12000;
// point into those bufs
char *dt = titles;
char *ds = summs;
char *da = anchors;
// MDW: i limit this to 10 to save stack space!
int32_t nc = size_catIds / 4;
if ( nc > 10 ) nc = 10;
for (int32_t i = 0; i < nc ; i++) {
// breathe
QUICKPOLL ( m_niceness );
// temp stuff
int32_t dtlen = 0;
int32_t dslen = 0;
unsigned char dalen = 0;
// . store all dmoz info separated by \0's into titles[] buffer
// . crap, this does a disk read and blocks on that
//
// . TODO: make it non-blocking!!!!
//
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
m_firstUrl.getUrlLen(),
ptr_catIds[i],
dt,//&titles[titlesLen],
&dtlen,//&titleLens[i],
dtend-dt,
ds,//&summs[summsLen],
&dslen,//&summLens[i],
dsend-ds,
da,//&anchors[anchorsLen],
&dalen,//&anchorLens[i],
daend-da,
m_niceness);
// advance ptrs
dt += dtlen;
ds += dslen;
da += dalen;
// null terminate
if ( dtlen>0 && dt[dtlen-1]!='\0' ) { *dt++=0; dtlen++; }
if ( dslen>0 && ds[dslen-1]!='\0' ) { *ds++=0; dslen++; }
if ( dalen>0 && da[dalen-1]!='\0' ) { *da++=0; dalen++; }
// must always be something!
if ( dtlen==0 ) {*dt++=0; dtlen++;}
if ( dslen==0 ) {*ds++=0; dslen++;}
if ( dalen==0 ) {*da++=0; dalen++;}
}
// set these
ptr_dmozTitles = titles;
ptr_dmozSumms = summs;
ptr_dmozAnchors = anchors;
size_dmozTitles = dt - titles;
size_dmozSumms = ds - summs;
size_dmozAnchors = da - anchors;
*/
// set our crap that is not necessarily set
//ptr_firstUrl = m_firstUrl.getUrl();
//ptr_redirUrl = m_redirUrl.getUrl();
//ptr_tagRecData = (char *)&m_oldTagRec;
// this must be valid now
//if ( ! m_skipIndexingValid ) { char *xx=NULL;*xx=0; }
// CT_STATUS docs do not have a valid XmlDoc really, it is
// just the first 2048 bytes, so there is no m_collnum member
// in the first 2048 bytes that is valid or even in legit memory.
// see 'char xdhead[2048];' below.
CollectionRec *cr = NULL;
if ( m_contentType != CT_STATUS ) {
cr = getCollRec();
if ( ! cr ) return false;
}
// zero out the content to save disk space if it is a custom crawl
// and the page was not processed (i.e. sent to diffbot).
// this will cause some undeletable data in the index, like for
// indexing meta tags perhaps, but in general we do not index
// most of the html document in custom crawls because we set
// 'indexBody/indexDoc' to false. but don't do this if we have
// ever sent this url to diffbot for processing before at any time.
// this may screw up content hash deduping, because the original
// hash will always be indexed, even if the doc changes or is
// deleted.
bool zeroOut = false;
if ( cr && cr->m_isCustomCrawl && ! m_sentToDiffbot ) zeroOut = true;
if ( zeroOut && m_isDiffbotJSONObject ) zeroOut = false;
if ( zeroOut && ! m_exactContentHash64Valid ) zeroOut = false;
// don't zero out spider status documents
if ( zeroOut && m_contentType == CT_STATUS ) zeroOut = false;
// disable for now. probably most disk space is from the spider status
// docs.
//zeroOut = false;
char *savedPtr = ptr_utf8Content;
int32_t savedSize = size_utf8Content;
if ( zeroOut ) {
// record the 64 bit content hash here and make
// getExactContentHash64() return it as a 64-bit binary number.
// that way we can preserve it.
sprintf(m_tmp9,"gbzeroedout:%"UINT64"",m_exactContentHash64);
ptr_utf8Content = m_tmp9;
size_utf8Content = gbstrlen(ptr_utf8Content) + 1;
m_zeroedOut = true;
}
// set this
m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
// add in variable length data
int32_t *ps = (int32_t *)&size_firstUrl;
// data ptr, consider a NULL to mean empty too!
char **pd = (char **)&ptr_firstUrl;
// how many XmlDoc::ptr_* members do we have? set "np" to that
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
np /= sizeof(char *);
// count up total we need to alloc
int32_t need1 = m_headerSize;
// clear these
m_internalFlags1 = 0;
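	// rough layout of the uncompressed title rec we are sizing here:
	//   [ fixed header: &m_headerSize up to &ptr_firstUrl )
	//   then, for each non-empty ptr_* member, in declaration order:
	//     [ 4-byte size ][ that many bytes of data ]
	// bit i of m_internalFlags1 records whether the i-th ptr_* member
	// was stored.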
// loop over em
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
// skip if empty
if ( *ps <= 0 ) continue;
// or empty string ptr
if ( ! *pd ) continue;
// skip utf8content if we should -- no events or addresses
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
// 4 bytes for the size
need1 += 4;
// add it up
need1 += *ps;
// make the mask
uint32_t mask = 1 << i ;
// add it in
m_internalFlags1 |= mask;
}
// alloc the buffer
char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
// return NULL with g_errno set on error
if ( ! ubuf ) {
// restore if we were zeroed out
ptr_utf8Content = savedPtr;
size_utf8Content = savedSize;
return false;
}
// serialize into it
char *p = ubuf;
// copy our crap into there
gbmemcpy ( p , &m_headerSize , m_headerSize );
// skip it
p += m_headerSize;
// reset data ptrs
pd = (char **)&ptr_firstUrl;
// reset data sizes
ps = (int32_t *)&size_firstUrl;
// then variable length data
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
// skip if empty, do not serialize
if ( ! *ps ) continue;
// or empty string ptr
if ( ! *pd ) continue;
// skip utf8content if we should -- no events or addresses
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
// store size first
*(int32_t *)p = *ps;
p += 4;
// then the data
gbmemcpy ( p , *pd , *ps );
// skip *ps bytes we wrote. should include a \0
p += *ps;
}
// sanity check
if ( p != ubuf + need1 ) { char *xx=NULL; *xx=0; }
// restore in case zeroOut was true
ptr_utf8Content = savedPtr;
size_utf8Content = savedSize;
// now restore it for other functions to use
//size_content = saved;
// . now compress our "title rec" data into a titleRec
// . cbuf should not be set
//if ( cbuf ) {
// log(LOG_LOGIC,"db: titlerec: compress: cbuf is set.");
// char *p = NULL; *p = 0; exit(-1);
//}
// should we free cbuf on our reset/destruction?
//m_owncbuf = ownCompressedData;
// . make a buf big enough to hold compressed, we'll realloc afterwards
// . according to zlib.h line 613 compress buffer must be .1% larger
// than source plus 12 bytes. (i add one for round off error)
// . now i added another extra 12 bytes cuz compress seemed to want it
int32_t need2 = ((int64_t)need1 * 1001LL) / 1000LL + 13 + 12;
// we also need to store a key then regular dataSize then
// the uncompressed size in cbuf before the compression of m_ubuf
int32_t hdrSize = sizeof(key_t) + 4 + 4;
// . now i add 12 bytes more so Msg14.cpp can also squeeze in a
// negative key to delete the old titleRec, cuz we use this cbuf
// to set our list that we add to our twins with
// . we now store the negative rec before the positive rec in Msg14.cpp
//hdrSize += sizeof(key_t) + 4;
need2 += hdrSize;
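	// final record layout once compression succeeds (written below):
	//   [ key_t key ][ int32 dataSize ][ int32 uncompressed size ]
	//   [ dataSize - 4 bytes of compressed ubuf ]
	// dataSize excludes the key and itself, so it covers just the
	// uncompressed-size field plus the compressed bytes.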
// alloc what we need
//char *cbuf = (char *) mmalloc ( need2 ,"TitleRecc");
//if ( ! cbuf ) return false;
// return false on error
if ( ! tbuf->reserve ( need2 ,"titbuf" ) ) return false;
	// shortcut
char *cbuf = tbuf->getBufStart();
	// set cbuf sizes; the SafeBuf length is set below to fit exactly the used buf
//int32_t cbufMaxSize = need2;
// . how big is the buf we're passing to ::compress()?
// . don't include the last 12 byte, save for del key in Msg14.cpp
int32_t size = need2 - hdrSize ;
	// . compress the data from ubuf into "cbuf + hdrSize"
	// . on success "size" is reset to the number of compressed bytes
	//   we wrote into "cbuf + hdrSize"
int err = gbcompress ( (unsigned char *)cbuf + hdrSize,
(uint32_t *)&size,
(unsigned char *)ubuf ,
(uint32_t )need1 );
// note it
//log("test: compressed %s from %"INT32" to %"INT32" bytes",
// m_firstUrl.m_url,need2-hdrSize,size);
// free the buf we were trying to compress now
mfree ( ubuf , need1 , "trub" );
// we should check ourselves
if ( err == Z_OK && size > (need2 - hdrSize ) ) {
//mfree ( cbuf , need2 ,"TitleRecc" );
tbuf->purge();
g_errno = ECOMPRESSFAILED;
log("db: Failed to compress document of %"INT32" bytes. "
"Provided buffer of %"INT32" bytes.",
size, (need2 - hdrSize ) );
return false;
}
// check for error
if ( err != Z_OK ) {
//mfree ( cbuf , need2 ,"TitleRecc" );
tbuf->purge();
g_errno = ECOMPRESSFAILED;
log("db: Failed to compress document.");
return false;
}
// calc cbufSize, the uncompressed header + compressed stuff
//cbufSize = hdrSize + size ;
//int64_t uh48 = getFirstUrlHash48();
// . make the key from docId
// . false = delkey?
//m_titleRecKey = g_titledb.makeKey (*getDocId(),uh48,false);//delkey?
key_t tkey = g_titledb.makeKey (docId,uh48,false);//delkey?
// validate it
//m_titleRecKeyValid = true;
// get a ptr to the Rdb record at start of the header
p = cbuf;
// skip over the negative rec reserved space for Msg14.cpp
//p += 12 + 4;
// . store key in header of cbuf
// . store in our host byte ordering so we can be a rec in an RdbList
*(key_t *) p = tkey;
p += sizeof(key_t);
// store total dataSize in header (excluding itself and key only)
int32_t dataSize = size + 4;
*(int32_t *) p = dataSize ;
p += 4;
// store uncompressed size in header
*(int32_t *) p = need1 ; p += 4;
// sanity check
if ( p != cbuf + hdrSize ) { char *xx = NULL; *xx = 0; }
// sanity check
if ( need1 <= 0 ) { char *xx = NULL; *xx = 0; }
// advance over data
p += size;
// update safebuf::m_length so it is correct
tbuf->setLength ( p - cbuf );
return true;
}
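// . a minimal sketch (comment only, not compiled) of how a reader would
//   undo setTitleRecBuf(), assuming gbuncompress() mirrors zlib's
//   uncompress() signature the way gbcompress() mirrors compress():
//
//   char *p = rec;
//   key_t tkey = *(key_t *)p; p += sizeof(key_t);
//   int32_t dataSize = *(int32_t *)p; p += 4;
//   int32_t ubufSize = *(int32_t *)p; p += 4;
//   SafeBuf ubuf;
//   if ( ! ubuf.reserve ( ubufSize ) ) return false;
//   uint32_t got = ubufSize;
//   gbuncompress ( (unsigned char *)ubuf.getBufStart() , &got ,
//                  (unsigned char *)p , dataSize - 4 );
//   // ubuf now holds the fixed header followed by each stored
//   // [size][data] ptr_* member in declaration order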
// . return NULL and sets g_errno on error
// . returns -1 if blocked
SafeBuf *XmlDoc::getTitleRecBuf ( ) {
// return it now if we got it already
if ( m_titleRecBufValid ) return &m_titleRecBuf;
setStatus ( "making title rec");
// did one of our many blocking function calls have an error?
if ( g_errno ) return NULL;
// . HACK so that TitleRec::isEmpty() return true
// . faster than calling m_titleRec.reset()
//m_titleRec.m_url.m_ulen = 0;
int32_t *indexCode = getIndexCode();
// not allowed to block here
if ( indexCode == (void *)-1) { char *xx=NULL;*xx=0; }
// return on errors with g_errno set
if ( ! indexCode ) return NULL;
// force delete? EDOCFORCEDELETE
if ( *indexCode ) { m_titleRecBufValid = true; return &m_titleRecBuf; }
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block and return -1, we will be re-called from the top
if ( ! m_masterLoop ) {
m_masterLoop = getTitleRecBufWrapper;
m_masterState = this;
}
/*
// parsing knobs
if ( ! m_titleWeightValid ) {
// TODO: watchout for overruns!! these are 16-bits only!
//m_eliminateMenus = cr->m_eliminateMenus;
m_titleWeight = cr->m_titleWeight;
m_headerWeight = cr->m_headerWeight;
m_urlPathWeight = cr->m_urlPathWeight;
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
m_conceptWeight = cr->m_conceptWeight;
//int32_t siteNumInlinksBoost = cr->m_siteNumInlinksBoost;
// validate these
//m_eliminateMenusValid = true;
m_titleWeightValid = true;
m_headerWeightValid = true;
m_urlPathWeightValid = true;
m_externalLinkTextWeightValid = true;
m_internalLinkTextWeightValid = true;
m_conceptWeightValid = true;
}
*/
/////////
//
// IF ANY of these validation sanity checks fail then update
// prepareToMakeTitleRec() so it makes them valid!!!
//
/////////
// verify key parts
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// verify record parts
//if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_firstIndexedDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_outlinksAddedDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_charsetValid ) { char *xx=NULL;*xx=0; }
if ( ! m_countryIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_httpStatusValid ) { char *xx=NULL;*xx=0; }
/*
if ( ! m_titleWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_headerWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_urlPathWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_externalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_internalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_conceptWeightValid ) { char *xx=NULL;*xx=0; }
*/
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
if ( ! m_metaListCheckSum8Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_numBannedOutlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_eliminateMenusValid ) { char *xx=NULL;*xx=0; }
if ( ! m_spiderLinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isContentTruncatedValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isLinkSpamValid ) { char *xx=NULL;*xx=0; }
// buffers
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_metaRedirUrlValid ) { char *xx=NULL;*xx=0; }
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
if ( ! m_adVectorValid ) { char *xx=NULL;*xx=0; }
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_imageDataValid ) { char *xx=NULL;*xx=0; }
if ( ! m_catIdsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_indCatIdsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_dmozInfoValid ) { char *xx=NULL;*xx=0; }
// if m_recycleContent is true, these are not valid
if ( ! m_recycleContent ) {
if ( ! m_rawUtf8ContentValid ) { char *xx=NULL;*xx=0; }
if ( ! m_expandedUtf8ContentValid ) { char *xx=NULL;*xx=0; }
}
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
if ( ! m_datesValid ) { char *xx=NULL;*xx=0; }
	// why do we need valid sections for a titlerec? we no longer use
// ptr_sectiondbData...
//if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sectionsReplyValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_addressReplyValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sectiondbDataValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_placedbDataValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_clockCandidatesDataValid ) { char *xx=NULL;*xx=0; }
// do we need these?
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
if ( ! m_contentHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_tagHash32Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_tagPairHash32Valid ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( ! m_addressesValid ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL( m_niceness );
setStatus ( "compressing into final title rec");
int64_t uh48 = getFirstUrlHash48();
int64_t *docId = getDocId();
// time it
int64_t startTime = gettimeofdayInMilliseconds();
//////
//
// fill in m_titleRecBuf
//
//////
// we need docid and uh48 for making the key of the titleRec
if ( ! setTitleRecBuf ( &m_titleRecBuf , *docId , uh48 ) )
return NULL;
// set this member down here because we can't set it in "xd"
	// because it is too short of an xmldoc stub
m_versionValid = true;
// breathe
QUICKPOLL( m_niceness );
// . add the stat
// . use white for the stat
g_stats.addStat_r ( 0 ,
startTime ,
gettimeofdayInMilliseconds(),
0x00ffffff );
QUICKPOLL( m_niceness );
char *cbuf = m_titleRecBuf.getBufStart();
m_titleRecKey = *(key_t *)cbuf;
m_titleRecKeyValid = true;
// we are legit
//m_freeTitleRec = true;
//m_titleRec = cbuf;
// key + dataSize + ubufSize + compressedData
//m_titleRecSize = sizeof(key_t)+ 4 + 4 + size;
//m_titleRecAllocSize = need2;
// now valid. congratulations!
m_titleRecBufValid = true;
return &m_titleRecBuf;
}
// . an "id" of 2 means very indicative of a dirty doc
// . an "id" of 1 means it must be joined with another dirty word to indicate
// . taken mostly from Url.cpp
// . see matches2.h for Needle class definition
static Needle s_dirtyWords [] = {
{"upskirt" ,0,2,0,0,NULL,0,NULL},
{"downblouse" ,0,2,0,0,NULL,0,NULL},
{"shemale" ,0,1,0,0,NULL,0,NULL},
{"spank" ,0,1,0,0,NULL,0,NULL},
{"dildo" ,0,2,0,0,NULL,0,NULL},
{"bdsm" ,0,2,0,0,NULL,0,NULL},
{"voyeur" ,0,2,0,0,NULL,0,NULL},
{"fisting" ,0,2,0,0,NULL,0,NULL},
{"vibrator" ,0,2,0,0,NULL,0,NULL},
{"ejaculat" ,0,2,0,0,NULL,0,NULL},
{"rgasm" ,0,2,0,0,NULL,0,NULL},
{"orgy" ,0,2,0,0,NULL,0,NULL},
{"orgies" ,0,2,0,0,NULL,0,NULL},
{"stripper" ,0,1,0,0,NULL,0,NULL},
{"softcore" ,0,2,0,0,NULL,0,NULL},
{"whore" ,0,2,0,0,NULL,0,NULL},
// gary slutkin on ted.com. make this just 1 point.
{"slut" ,0,1,0,0,NULL,0,NULL},
{"smut" ,0,2,0,0,NULL,0,NULL},
{"tits" ,0,2,0,0,NULL,0,NULL},
{"lesbian" ,0,2,0,0,NULL,0,NULL},
{"swinger" ,0,2,0,0,NULL,0,NULL},
{"fetish" ,0,2,0,0,NULL,0,NULL},
{"nude" ,0,1,0,0,NULL,0,NULL},
{"centerfold" ,0,2,0,0,NULL,0,NULL},
{"incest" ,0,2,0,0,NULL,0,NULL},
{"pedophil" ,0,2,0,0,NULL,0,NULL},
{"pedofil" ,0,2,0,0,NULL,0,NULL},
{"horny" ,0,2,0,0,NULL,0,NULL}, // horny toad
{"pussy" ,0,2,0,0,NULL,0,NULL}, // pussy willow pussy cat
{"pussies" ,0,2,0,0,NULL,0,NULL},
{"penis" ,0,2,0,0,NULL,0,NULL},
{"vagina" ,0,2,0,0,NULL,0,NULL},
{"phuck" ,0,2,0,0,NULL,0,NULL},
{"blowjob" ,0,2,0,0,NULL,0,NULL},
{"blow job" ,0,2,0,0,NULL,0,NULL},
{"gangbang" ,0,2,0,0,NULL,0,NULL},
{"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
{"porn" ,0,2,0,0,NULL,0,NULL},
{"felch" ,0,2,0,0,NULL,0,NULL},
{"cunt" ,0,2,0,0,NULL,0,NULL},
{"bestial" ,0,2,0,0,NULL,0,NULL},
{"beastial" ,0,2,0,0,NULL,0,NULL},
{"kink" ,0,2,0,0,NULL,0,NULL},
// . "sex" is often substring in tagids.
// . too many false positives, make "1" not "2"
{"sex" ,0,1,0,0,NULL,0,NULL},
{"anal" ,0,2,0,0,NULL,0,NULL},
{"cum" ,0,2,0,0,NULL,0,NULL}, // often used for cumulative
{"clit" ,0,2,0,0,NULL,0,NULL},
{"fuck" ,0,2,0,0,NULL,0,NULL},
{"boob" ,0,1,0,0,NULL,0,NULL},
{"wank" ,0,2,0,0,NULL,0,NULL},
{"fick" ,0,2,0,0,NULL,0,NULL},
{"eroti" ,0,2,0,0,NULL,0,NULL},
{"gay" ,0,1,0,0,NULL,0,NULL}, // make 1 pt. 'marvin gay'
// new stuff not in Url.cpp
{"thong" ,0,1,0,0,NULL,0,NULL},
{"masturbat" ,0,2,0,0,NULL,0,NULL},
{"bitch" ,0,1,0,0,NULL,0,NULL},
{"hell" ,0,1,0,0,NULL,0,NULL},
{"damn" ,0,1,0,0,NULL,0,NULL},
{"rimjob" ,0,2,0,0,NULL,0,NULL},
{"cunnilingu" ,0,2,0,0,NULL,0,NULL},
{"felatio" ,0,2,0,0,NULL,0,NULL},
{"fellatio" ,0,2,0,0,NULL,0,NULL},
{"dick" ,0,1,0,0,NULL,0,NULL},
{"cock" ,0,1,0,0,NULL,0,NULL},
{"rape" ,0,2,0,0,NULL,0,NULL},
{"raping" ,0,2,0,0,NULL,0,NULL},
{"bukake" ,0,2,0,0,NULL,0,NULL},
{"shit" ,0,2,0,0,NULL,0,NULL},
{"naked" ,0,1,0,0,NULL,0,NULL},
{"nympho" ,0,2,0,0,NULL,0,NULL},
{"hardcore" ,0,1,0,0,NULL,0,NULL}, // hardcore gamer, count as 1
{"sodom" ,0,2,0,0,NULL,0,NULL},
{"titties" ,0,2,0,0,NULL,0,NULL}, // re-do
{"twat" ,0,2,0,0,NULL,0,NULL},
{"bastard" ,0,1,0,0,NULL,0,NULL},
{"erotik" ,0,2,0,0,NULL,0,NULL},
// EXCEPTIONS
// smut
{"transmut" ,0,-2,0,0,NULL,0,NULL},
{"bismuth" ,0,-2,0,0,NULL,0,NULL},
// sex
{"middlesex" ,0,-1,0,0,NULL,0,NULL},
{"sussex" ,0,-1,0,0,NULL,0,NULL},
{"essex" ,0,-1,0,0,NULL,0,NULL},
{"deusex" ,0,-1,0,0,NULL,0,NULL},
{"sexchange" ,0,-1,0,0,NULL,0,NULL},
{"sexpress" ,0,-1,0,0,NULL,0,NULL},
{"sexpert" ,0,-1,0,0,NULL,0,NULL},
{"sexcel" ,0,-1,0,0,NULL,0,NULL},
{"sexist" ,0,-1,0,0,NULL,0,NULL},
{"sexile" ,0,-1,0,0,NULL,0,NULL},
{"sexperi" ,0,-1,0,0,NULL,0,NULL},
{"sexual" ,0,-1,0,0,NULL,0,NULL},
{"sexpose" ,0,-1,0,0,NULL,0,NULL},
{"sexclu" ,0,-1,0,0,NULL,0,NULL},
{"sexo" ,0,-1,0,0,NULL,0,NULL},
{"sexism" ,0,-1,0,0,NULL,0,NULL},
{"sexpan" ,0,-1,0,0,NULL,0,NULL}, // buttonsexpanion
{"same-sex" ,0,-1,0,0,NULL,0,NULL},
{"opposite sex",0,-1,0,0,NULL,0,NULL},
// anal
{"analog" ,0,-2,0,0,NULL,0,NULL},
{"analy" ,0,-2,0,0,NULL,0,NULL},
{"canal" ,0,-2,0,0,NULL,0,NULL},
{"kanal" ,0,-2,0,0,NULL,0,NULL},
{"banal" ,0,-2,0,0,NULL,0,NULL},
{"ianalbert" ,0,-2,0,0,NULL,0,NULL}, // ian albert
// cum
{"circum" ,0,-2,0,0,NULL,0,NULL},
{"cum laude" ,0,-2,0,0,NULL,0,NULL},
{"succum" ,0,-2,0,0,NULL,0,NULL},
{"cumber" ,0,-2,0,0,NULL,0,NULL},
{"docum" ,0,-2,0,0,NULL,0,NULL},
{"cumul" ,0,-2,0,0,NULL,0,NULL},
{"acumen" ,0,-2,0,0,NULL,0,NULL},
{"incum" ,0,-2,0,0,NULL,0,NULL},
{"capsicum" ,0,-2,0,0,NULL,0,NULL},
{"modicum" ,0,-2,0,0,NULL,0,NULL},
{"locum" ,0,-2,0,0,NULL,0,NULL},
{"scum" ,0,-2,0,0,NULL,0,NULL},
{"accum" ,0,-2,0,0,NULL,0,NULL},
{"cumbre" ,0,-2,0,0,NULL,0,NULL},
{"swank" ,0,-2,0,0,NULL,0,NULL},
{"fickle" ,0,-2,0,0,NULL,0,NULL},
{"traffick" ,0,-2,0,0,NULL,0,NULL},
{"scleroti" ,0,-2,0,0,NULL,0,NULL},
{"gaylor" ,0,-2,0,0,NULL,0,NULL},
{"gaynor" ,0,-2,0,0,NULL,0,NULL},
{"gayner" ,0,-2,0,0,NULL,0,NULL},
{"gayton" ,0,-2,0,0,NULL,0,NULL},
{"dipthong" ,0,-1,0,0,NULL,0,NULL},
// hell
{"hellen" ,0,-1,0,0,NULL,0,NULL},
{"hellman" ,0,-1,0,0,NULL,0,NULL},
{"shell" ,0,-1,0,0,NULL,0,NULL},
{"mitchell" ,0,-1,0,0,NULL,0,NULL},
{"chelle" ,0,-1,0,0,NULL,0,NULL}, // me/michelle
{"hello" ,0,-1,0,0,NULL,0,NULL},
{"moschella" ,0,-1,0,0,NULL,0,NULL},
{"othello" ,0,-1,0,0,NULL,0,NULL},
{"schelling" ,0,-1,0,0,NULL,0,NULL},
{"seychelles" ,0,-1,0,0,NULL,0,NULL},
{"wheller" ,0,-1,0,0,NULL,0,NULL},
{"winchell" ,0,-1,0,0,NULL,0,NULL},
// dick
{"dicker" ,0,-1,0,0,NULL,0,NULL},
{"dickins" ,0,-1,0,0,NULL,0,NULL},
{"dickies" ,0,-1,0,0,NULL,0,NULL},
{"dickran" ,0,-1,0,0,NULL,0,NULL},
// cock
{"babcock" ,0,-1,0,0,NULL,0,NULL},
{"cocked" ,0,-1,0,0,NULL,0,NULL},
{"cocking" ,0,-1,0,0,NULL,0,NULL},
{"cockpit" ,0,-1,0,0,NULL,0,NULL},
{"cockroach" ,0,-1,0,0,NULL,0,NULL},
{"cocktail" ,0,-1,0,0,NULL,0,NULL},
{"cocky" ,0,-1,0,0,NULL,0,NULL},
{"hancock" ,0,-1,0,0,NULL,0,NULL},
{"hitchcock" ,0,-1,0,0,NULL,0,NULL},
{"peacock" ,0,-1,0,0,NULL,0,NULL},
{"shuttlecock" ,0,-1,0,0,NULL,0,NULL},
{"stopcock" ,0,-1,0,0,NULL,0,NULL},
{"weathercock" ,0,-1,0,0,NULL,0,NULL},
{"woodcock" ,0,-1,0,0,NULL,0,NULL},
{"cockburn" ,0,-1,0,0,NULL,0,NULL},
// kink
{"kinko" ,0,-2,0,0,NULL,0,NULL},
{"ukink" ,0,-2,0,0,NULL,0,NULL}, // ink shop in uk
// naked
{"snaked" ,0,-1,0,0,NULL,0,NULL},
// rape
{"drape" ,0,-2,0,0,NULL,0,NULL},
{"grape" ,0,-2,0,0,NULL,0,NULL},
{"scrape" ,0,-2,0,0,NULL,0,NULL},
{"therape" ,0,-2,0,0,NULL,0,NULL},
{"trapez" ,0,-2,0,0,NULL,0,NULL},
{"parapet" ,0,-2,0,0,NULL,0,NULL},
{"scraping" ,0,-2,0,0,NULL,0,NULL},
{"draping" ,0,-2,0,0,NULL,0,NULL},
// twat
{"twatch" ,0,-2,0,0,NULL,0,NULL}, // courtwatch -- cspan.org
// clit
{"heraclitus" ,0,-2,0,0,NULL,0,NULL},
// boob
{"booboo" ,0,-1,0,0,NULL,0,NULL},
// shit
{"shitak" ,0,-2,0,0,NULL,0,NULL}
};
////
//// New stuff from sex.com adult word list
////
////
//// make it a 2nd part because of performance limits on matches2.cpp algo
////
static Needle s_dirtyWordsPart2 [] = {
{"amateurfoto" ,0,2,0,0,NULL,0,NULL},
{"amateurhardcore" ,0,2,0,0,NULL,0,NULL},
{"amateurindex" ,0,2,0,0,NULL,0,NULL},
{"amateurnaked" ,0,2,0,0,NULL,0,NULL},
{"amatuerhardcore" ,0,2,0,0,NULL,0,NULL},
{"ampland" ,0,2,0,0,NULL,0,NULL},
//{"animehentai" ,0,2,0,0,NULL,0,NULL}, dup
{"anitablonde" ,0,2,0,0,NULL,0,NULL},
{"asiacarrera" ,0,2,0,0,NULL,0,NULL},
{"asshole" ,0,2,0,0,NULL,0,NULL},
{"asslick" ,0,2,0,0,NULL,0,NULL},
{"asspic" ,0,2,0,0,NULL,0,NULL},
{"assworship" ,0,2,0,0,NULL,0,NULL},
//{"badgirl" ,0,2,0,0,NULL,0,NULL}, not necessarily bad
{"bareceleb" ,0,2,0,0,NULL,0,NULL},
{"barenaked" ,0,2,0,0,NULL,0,NULL},
{"beaverboy" ,0,2,0,0,NULL,0,NULL},
{"beavershot" ,0,2,0,0,NULL,0,NULL}, // was beavershots
//{"bigball" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
{"bigbreast" ,0,2,0,0,NULL,0,NULL},
//{"bigbutt" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
{"bigcock" ,0,2,0,0,NULL,0,NULL},
{"bigdick" ,0,2,0,0,NULL,0,NULL},
{"biggestdick" ,0,2,0,0,NULL,0,NULL},
{"biggesttit" ,0,2,0,0,NULL,0,NULL},
{"bighairyball" ,0,2,0,0,NULL,0,NULL},
{"bighooter" ,0,2,0,0,NULL,0,NULL},
{"bignipple" ,0,2,0,0,NULL,0,NULL},
{"bigtit" ,0,2,0,0,NULL,0,NULL},
{"blackbooty" ,0,2,0,0,NULL,0,NULL},
{"blackbutt" ,0,2,0,0,NULL,0,NULL},
{"blackcock" ,0,2,0,0,NULL,0,NULL},
{"blackdick" ,0,2,0,0,NULL,0,NULL},
{"blackhardcore" ,0,2,0,0,NULL,0,NULL},
{"blackonblonde" ,0,2,0,0,NULL,0,NULL},
{"blacksonblonde" ,0,2,0,0,NULL,0,NULL},
{"blacktit" ,0,2,0,0,NULL,0,NULL},
{"blacktwat" ,0,2,0,0,NULL,0,NULL},
{"boner" ,0,1,0,0,NULL,0,NULL}, // softcore, someone's lastname?
{"bordello" ,0,2,0,0,NULL,0,NULL},
{"braless" ,0,2,0,0,NULL,0,NULL},
{"brothel" ,0,2,0,0,NULL,0,NULL},
{"bukake" ,0,2,0,0,NULL,0,NULL},
{"bukkake" ,0,2,0,0,NULL,0,NULL},
{"bustyblonde" ,0,2,0,0,NULL,0,NULL},
{"bustyceleb" ,0,2,0,0,NULL,0,NULL},
{"butthole" ,0,2,0,0,NULL,0,NULL},
{"buttman" ,0,2,0,0,NULL,0,NULL},
{"buttpic" ,0,2,0,0,NULL,0,NULL},
{"buttplug" ,0,2,0,0,NULL,0,NULL},
{"buttthumbnails" ,0,2,0,0,NULL,0,NULL},
{"callgirl" ,0,2,0,0,NULL,0,NULL},
{"celebritiesnaked" ,0,2,0,0,NULL,0,NULL},
{"celebritybush" ,0,2,0,0,NULL,0,NULL},
{"celebritybutt" ,0,2,0,0,NULL,0,NULL},
{"chaseylain" ,0,2,0,0,NULL,0,NULL},
{"chickswithdick" ,0,2,0,0,NULL,0,NULL},
{"christycanyon" ,0,2,0,0,NULL,0,NULL},
{"cicciolina" ,0,2,0,0,NULL,0,NULL},
//{"cunilingus" ,0,2,0,0,NULL,0,NULL},
{"cunniling" ,0,2,0,0,NULL,0,NULL}, // abbreviate
{"cyberlust" ,0,2,0,0,NULL,0,NULL},
{"danniashe" ,0,2,0,0,NULL,0,NULL},
{"dicksuck" ,0,2,0,0,NULL,0,NULL},
{"dirtymind" ,0,2,0,0,NULL,0,NULL},
{"dirtypicture" ,0,2,0,0,NULL,0,NULL},
{"doggiestyle" ,0,2,0,0,NULL,0,NULL},
{"doggystyle" ,0,2,0,0,NULL,0,NULL},
{"domatrix" ,0,2,0,0,NULL,0,NULL},
{"dominatrix" ,0,2,0,0,NULL,0,NULL},
//{"dyke" ,0,2,0,0,NULL,0,NULL}, // dick van dyke!
{"ejaculation" ,0,2,0,0,NULL,0,NULL},
{"erosvillage" ,0,2,0,0,NULL,0,NULL},
{"facesit" ,0,2,0,0,NULL,0,NULL},
{"fatass" ,0,2,0,0,NULL,0,NULL},
{"feetfetish" ,0,2,0,0,NULL,0,NULL},
{"felatio" ,0,2,0,0,NULL,0,NULL},
{"fellatio" ,0,2,0,0,NULL,0,NULL},
{"femdom" ,0,2,0,0,NULL,0,NULL},
{"fetishwear" ,0,2,0,0,NULL,0,NULL},
{"fettegirl" ,0,2,0,0,NULL,0,NULL},
{"fingerbang" ,0,2,0,0,NULL,0,NULL},
{"fingering" ,0,1,0,0,NULL,0,NULL}, // fingering the keyboard? use 1
{"flesh4free" ,0,2,0,0,NULL,0,NULL},
{"footfetish" ,0,2,0,0,NULL,0,NULL},
{"footjob" ,0,2,0,0,NULL,0,NULL},
{"footlicking" ,0,2,0,0,NULL,0,NULL},
{"footworship" ,0,2,0,0,NULL,0,NULL},
{"fornication" ,0,2,0,0,NULL,0,NULL},
{"freeass" ,0,2,0,0,NULL,0,NULL},
{"freebigtit" ,0,2,0,0,NULL,0,NULL},
{"freedick" ,0,2,0,0,NULL,0,NULL},
{"freehardcore" ,0,2,0,0,NULL,0,NULL},
//{"freehentai" ,0,2,0,0,NULL,0,NULL}, dup
{"freehooter" ,0,2,0,0,NULL,0,NULL},
{"freelargehooter" ,0,2,0,0,NULL,0,NULL},
{"freenakedpic" ,0,2,0,0,NULL,0,NULL},
{"freenakedwomen" ,0,2,0,0,NULL,0,NULL},
{"freetit" ,0,2,0,0,NULL,0,NULL},
{"freevoyeur" ,0,2,0,0,NULL,0,NULL},
{"gratishardcoregalerie" ,0,2,0,0,NULL,0,NULL},
{"hardcorecelebs" ,0,2,0,0,NULL,0,NULL},
{"hardcorefree" ,0,2,0,0,NULL,0,NULL},
{"hardcorehooter" ,0,2,0,0,NULL,0,NULL},
{"hardcorejunkie" ,0,2,0,0,NULL,0,NULL},
{"hardcorejunky" ,0,2,0,0,NULL,0,NULL},
{"hardcoremovie" ,0,2,0,0,NULL,0,NULL},
{"hardcorepic" ,0,2,0,0,NULL,0,NULL},
{"hardcorepix" ,0,2,0,0,NULL,0,NULL},
{"hardcoresample" ,0,2,0,0,NULL,0,NULL},
{"hardcorestories" ,0,2,0,0,NULL,0,NULL},
{"hardcorethumb" ,0,2,0,0,NULL,0,NULL},
{"hardcorevideo" ,0,2,0,0,NULL,0,NULL},
{"harddick" ,0,2,0,0,NULL,0,NULL},
{"hardnipple" ,0,2,0,0,NULL,0,NULL},
{"hardon" ,0,2,0,0,NULL,0,NULL},
{"hentai" ,0,2,0,0,NULL,0,NULL},
{"interacialhardcore" ,0,2,0,0,NULL,0,NULL},
{"intercourseposition" ,0,2,0,0,NULL,0,NULL},
{"interracialhardcore" ,0,2,0,0,NULL,0,NULL},
{"ittybittytitty" ,0,2,0,0,NULL,0,NULL},
{"jackoff" ,0,2,0,0,NULL,0,NULL},
{"jennajameson" ,0,2,0,0,NULL,0,NULL},
{"jennicam" ,0,2,0,0,NULL,0,NULL},
{"jerkoff" ,0,2,0,0,NULL,0,NULL},
{"jism" ,0,2,0,0,NULL,0,NULL},
{"jiz" ,0,2,0,0,NULL,0,NULL},
{"justhardcore" ,0,2,0,0,NULL,0,NULL},
{"karasamateurs" ,0,2,0,0,NULL,0,NULL},
{"kascha" ,0,2,0,0,NULL,0,NULL},
{"kaylakleevage" ,0,2,0,0,NULL,0,NULL},
{"kobetai" ,0,2,0,0,NULL,0,NULL},
{"lapdance" ,0,2,0,0,NULL,0,NULL},
{"largedick" ,0,2,0,0,NULL,0,NULL},
{"largehooter" ,0,2,0,0,NULL,0,NULL},
{"largestbreast" ,0,2,0,0,NULL,0,NULL},
{"largetit" ,0,2,0,0,NULL,0,NULL},
{"lesben" ,0,2,0,0,NULL,0,NULL},
{"lesbo" ,0,2,0,0,NULL,0,NULL},
{"lickadick" ,0,2,0,0,NULL,0,NULL},
{"lindalovelace" ,0,2,0,0,NULL,0,NULL},
{"longdick" ,0,2,0,0,NULL,0,NULL},
{"lovedoll" ,0,2,0,0,NULL,0,NULL},
{"makinglove" ,0,2,0,0,NULL,0,NULL},
{"mangax" ,0,2,0,0,NULL,0,NULL},
{"manpic" ,0,2,0,0,NULL,0,NULL},
{"marilynchambers" ,0,2,0,0,NULL,0,NULL},
{"massivecock" ,0,2,0,0,NULL,0,NULL},
{"masterbating" ,0,2,0,0,NULL,0,NULL},
{"mensdick" ,0,2,0,0,NULL,0,NULL},
{"milf" ,0,2,0,0,NULL,0,NULL},
{"minka" ,0,2,0,0,NULL,0,NULL},
{"monstercock" ,0,2,0,0,NULL,0,NULL},
{"monsterdick" ,0,2,0,0,NULL,0,NULL},
{"muffdiving" ,0,2,0,0,NULL,0,NULL},
{"nacktfoto" ,0,2,0,0,NULL,0,NULL},
{"nakedblackwomen" ,0,2,0,0,NULL,0,NULL},
{"nakedceleb" ,0,2,0,0,NULL,0,NULL},
{"nakedcelebrity" ,0,2,0,0,NULL,0,NULL},
{"nakedcheerleader" ,0,2,0,0,NULL,0,NULL},
{"nakedchick" ,0,2,0,0,NULL,0,NULL},
{"nakedgirl" ,0,2,0,0,NULL,0,NULL},
{"nakedguy" ,0,2,0,0,NULL,0,NULL},
{"nakedladies" ,0,2,0,0,NULL,0,NULL},
{"nakedlady" ,0,2,0,0,NULL,0,NULL},
{"nakedman" ,0,2,0,0,NULL,0,NULL},
{"nakedmen" ,0,2,0,0,NULL,0,NULL},
{"nakedness" ,0,2,0,0,NULL,0,NULL},
{"nakedphoto" ,0,2,0,0,NULL,0,NULL},
{"nakedpic" ,0,2,0,0,NULL,0,NULL},
{"nakedstar" ,0,2,0,0,NULL,0,NULL},
{"nakedwife" ,0,2,0,0,NULL,0,NULL},
{"nakedwoman" ,0,2,0,0,NULL,0,NULL},
{"nakedwomen" ,0,2,0,0,NULL,0,NULL},
{"nastychat" ,0,2,0,0,NULL,0,NULL},
{"nastythumb" ,0,2,0,0,NULL,0,NULL},
{"naughtylink" ,0,2,0,0,NULL,0,NULL},
{"naughtylinx" ,0,2,0,0,NULL,0,NULL},
{"naughtylynx" ,0,2,0,0,NULL,0,NULL},
{"naughtynurse" ,0,2,0,0,NULL,0,NULL},
{"niceass" ,0,2,0,0,NULL,0,NULL},
{"nikkinova" ,0,2,0,0,NULL,0,NULL},
{"nikkityler" ,0,2,0,0,NULL,0,NULL},
{"nylonfetish" ,0,2,0,0,NULL,0,NULL},
{"nympho" ,0,2,0,0,NULL,0,NULL},
{"openleg" ,0,2,0,0,NULL,0,NULL},
{"oral4free" ,0,2,0,0,NULL,0,NULL},
{"pantyhosefetish" ,0,2,0,0,NULL,0,NULL},
{"peepcam" ,0,2,0,0,NULL,0,NULL},
{"persiankitty" ,0,2,0,0,NULL,0,NULL},
{"perverted" ,0,2,0,0,NULL,0,NULL},
{"pimpserver" ,0,2,0,0,NULL,0,NULL},
{"pissing" ,0,2,0,0,NULL,0,NULL},
{"poontang" ,0,2,0,0,NULL,0,NULL},
{"privatex" ,0,2,0,0,NULL,0,NULL},
{"prono" ,0,2,0,0,NULL,0,NULL},
{"publicnudity" ,0,2,0,0,NULL,0,NULL},
{"puffynipple" ,0,2,0,0,NULL,0,NULL},
{"racqueldarrian" ,0,2,0,0,NULL,0,NULL},
//{"rape" ,0,2,0,0,NULL,0,NULL}, // dup!
{"rawlink" ,0,2,0,0,NULL,0,NULL},
{"realhardcore" ,0,2,0,0,NULL,0,NULL},
{"rubberfetish" ,0,2,0,0,NULL,0,NULL},
{"seka" ,0,2,0,0,NULL,0,NULL},
{"sheboy" ,0,2,0,0,NULL,0,NULL},
{"showcam" ,0,2,0,0,NULL,0,NULL},
{"showercam" ,0,2,0,0,NULL,0,NULL},
{"smallbreast" ,0,2,0,0,NULL,0,NULL},
{"smalldick" ,0,2,0,0,NULL,0,NULL},
{"spycamadult" ,0,2,0,0,NULL,0,NULL},
{"strapon" ,0,2,0,0,NULL,0,NULL},
{"stripclub" ,0,2,0,0,NULL,0,NULL},
{"stripshow" ,0,2,0,0,NULL,0,NULL},
{"striptease" ,0,2,0,0,NULL,0,NULL},
{"strokeit" ,0,2,0,0,NULL,0,NULL},
{"strokeme" ,0,2,0,0,NULL,0,NULL},
{"suckdick" ,0,2,0,0,NULL,0,NULL},
{"sylviasaint" ,0,2,0,0,NULL,0,NULL},
{"teenhardcore" ,0,2,0,0,NULL,0,NULL},
{"teenie" ,0,2,0,0,NULL,0,NULL},
{"teenpic" ,0,2,0,0,NULL,0,NULL},
{"teensuck" ,0,2,0,0,NULL,0,NULL},
{"tgp" ,0,2,0,0,NULL,0,NULL},
{"threesome" ,0,2,0,0,NULL,0,NULL},
{"thumblord" ,0,2,0,0,NULL,0,NULL},
{"thumbzilla" ,0,2,0,0,NULL,0,NULL},
{"tiffanytowers" ,0,2,0,0,NULL,0,NULL},
{"tinytitties" ,0,2,0,0,NULL,0,NULL},
//{"tities" ,0,2,0,0,NULL,0,NULL}, // entities
{"titman" ,0,2,0,0,NULL,0,NULL},
{"titsandass" ,0,2,0,0,NULL,0,NULL},
{"titties" ,0,2,0,0,NULL,0,NULL},
{"titts" ,0,2,0,0,NULL,0,NULL},
{"titty" ,0,2,0,0,NULL,0,NULL},
{"tokyotopless" ,0,2,0,0,NULL,0,NULL},
{"tommysbookmark" ,0,2,0,0,NULL,0,NULL},
{"toplesswomen" ,0,2,0,0,NULL,0,NULL},
{"trannies" ,0,2,0,0,NULL,0,NULL},
{"twinks" ,0,2,0,0,NULL,0,NULL},
{"ultradonkey" ,0,2,0,0,NULL,0,NULL},
{"ultrahardcore" ,0,2,0,0,NULL,0,NULL},
{"uncutcock" ,0,2,0,0,NULL,0,NULL},
{"vividtv" ,0,2,0,0,NULL,0,NULL},
{"wendywhoppers" ,0,2,0,0,NULL,0,NULL},
{"wetdick" ,0,2,0,0,NULL,0,NULL},
{"wetpanties" ,0,2,0,0,NULL,0,NULL},
{"wifesharing" ,0,2,0,0,NULL,0,NULL},
{"wifeswapping" ,0,2,0,0,NULL,0,NULL},
{"xrated" ,0,2,0,0,NULL,0,NULL}
};
// . store this in clusterdb rec so family filter works!
// . check content for adult words
char *XmlDoc::getIsAdult ( ) {
if ( m_isAdultValid ) return &m_isAdult2;
// call that
setStatus ("getting is adult bit");
int32_t **pici = getIndCatIds();
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
// check categories
for ( int32_t i = 0 ; i < size_indCatIds / 4 ; i++ ) {
int32_t ic = ptr_indCatIds[i];
// skip if not an adult category
if ( ! g_categories->isIdAdult ( ic ) ) continue;
// got it
m_isAdult = true;
m_isAdult2 = true;
m_isAdultValid = true;
return &m_isAdult2;
}
// . if any of the wiki docids we are in are adult.... then we are
// . we set the top bit of wiki docids to indicate if adult
//for ( int32_t i = 0 ; i < size_wikiDocIds / 8 ; i++ ) {
// int64_t d = ptr_wikiDocIds[i];
// if ( ! ( d & 0x8000000000000000 ) ) continue;
// // got it
// m_isAdult = true;
// m_isAdultValid = true;
// return &m_isAdult;
//}
// need the content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1) return (char *)u8;
// time it
int64_t start = gettimeofdayInMilliseconds();
// score that up
int32_t total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
m_niceness , m_firstUrl.m_url );
// then the url
//char *u = getFirstUrl()->getUrl();
//total += getDirtyPoints ( u , gbstrlen(u) );
// and redir url
//char *r = getRedirUrl()->getUrl();
//total += getDirtyPoints ( r , gbstrlen(r) );
// debug msg
int64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 10 )
logf(LOG_DEBUG,
"build: Took %"INT64" ms to check doc of %"INT32" bytes for "
"dirty words.",took,size_utf8Content-1);
m_isAdult = false;
// adult?
if ( total >= 2 ) m_isAdult = true;
// set shadow member
m_isAdult2 = (bool)m_isAdult;
// validate
m_isAdultValid = true;
// note it
if ( m_isAdult2 && g_conf.m_logDebugDirty )
log("dirty: %s points = %"INT32"",m_firstUrl.m_url,total);
// no dirty words found
return &m_isAdult2;
}
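// . scans "s" for the needles in s_dirtyWords (and s_dirtyWordsPart2 when
//   enabled) using getMatches2() and sums each matched needle's m_id:
//   +2/-2 for really dirty/clean words, +1/-1 for borderline ones
// . example: a single +2 match is enough to trip the adult threshold in
//   getIsAdult() unless a negative (clean) needle cancels it out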
int32_t getDirtyPoints ( char *s , int32_t slen , int32_t niceness , char *url ) {
// . use the matches function to get all the matches
// . then check each match to see if it is actually a legit word
// . actually match the dirty words, then match the clean words
// then we can subtract counts.
int32_t numDirty = sizeof(s_dirtyWords) / sizeof(Needle);
getMatches2 ( s_dirtyWords ,
numDirty ,
s ,
slen ,
NULL , // linkPos
NULL , // needleNum
false , // stopAtFirstMatch?
NULL , // hadPreMatch ptr
true , // saveQuickTables?
niceness );
int32_t points = 0;
// each needle has an associated score
for ( int32_t i = 0 ; i < numDirty ; i++ ) {
// skip if no match
if ( s_dirtyWords[i].m_count <= 0 ) continue;
// . the "id", is positive for dirty words, - for clean
// . uses +2/-2 for really dirty words
// . uses +1/-1 for borderline dirty words
points += s_dirtyWords[i].m_id;
// log debug
if ( ! g_conf.m_logDebugDirty ) continue;
// show it in the log
log("dirty: %s %"INT32" %s"
,s_dirtyWords[i].m_string
,(int32_t)s_dirtyWords[i].m_id
,url
);
}
////
//
// repeat for part2
//
// we have to do two separate parts otherwise the algo in
// matches2.cpp gets really slow. it was not meant to match
// so many needles in one haystack.
//
///
int32_t numDirty2 = sizeof(s_dirtyWordsPart2) / sizeof(Needle);
// . disable this for now. most of these are phrases and they
// will not be detected.
// . TODO: hash the dirty words and phrases and just lookup
// words in that table like we do for isStopWord(), but use
// isDirtyWord(). Then replace the code in Speller.cpp
// with isDirtyUrl() which will split the string into words
// and call isDirtyWord() on each one. also use bi and tri grams
// in the hash table.
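// . a rough sketch of that idea (NOT wired in anywhere): hash each dirty
//   word/phrase into a HashTableX once at startup, make isDirtyWord(wid)
//   a simple table lookup, and have isDirtyUrl() split the url into
//   words and test each word hash plus its bi/tri-gram hashes against
//   that same table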
numDirty2 = 0;
getMatches2 ( s_dirtyWordsPart2 ,
numDirty2 ,
s ,
slen ,
NULL , // linkPos
NULL , // needleNum
false , // stopAtFirstMatch?
NULL , // hadPreMatch ptr
true , // saveQuickTables?
niceness );
// each needle has an associated score
for ( int32_t i = 0 ; i < numDirty2 ; i++ ) {
// skip if no match
if ( s_dirtyWordsPart2[i].m_count <= 0 ) continue;
// . the "id", is positive for dirty words, - for clean
// . uses +2/-2 for really dirty words
// . uses +1/-1 for borderline dirty words
points += s_dirtyWordsPart2[i].m_id;
// log debug
if ( ! g_conf.m_logDebugDirty ) continue;
// show it in the log
log("dirty: %s %"INT32" %s"
,s_dirtyWordsPart2[i].m_string
,(int32_t)s_dirtyWordsPart2[i].m_id
,url
);
}
return points;
}
int32_t **XmlDoc::getIndCatIds ( ) {
// if XmlDoc was set from a titleRec it should validate this
if ( m_indCatIdsValid ) return &ptr_indCatIds;
// otherwise, we must compute them!
CatRec *cat = getCatRec ();
// blocked or error?
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
// set this
ptr_indCatIds = cat->m_indCatids;
size_indCatIds = cat->m_numIndCatids * 4;
m_indCatIdsValid = true;
// parse that up
return &ptr_indCatIds;
}
int32_t **XmlDoc::getCatIds ( ) {
// if XmlDoc was set from a titleRec it should validate this
if ( m_catIdsValid ) return &ptr_catIds;
// otherwise, we must compute them!
CatRec *cat = getCatRec ();
// blocked or error?
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
// set this
ptr_catIds = cat->m_catids;
size_catIds = cat->m_numCatids * 4;
m_catIdsValid = true;
// parse that up
return &ptr_catIds;
}
CatRec *XmlDoc::getCatRec ( ) {
// return what we got
if ( m_catRecValid ) return &m_catRec;
// call that
setStatus ("getting dmoz cat rec");
// callback?
if ( m_calledMsg8b ) {
// return NULL on error
if ( g_errno ) return NULL;
// otherwise, success
m_catRecValid = true;
return &m_catRec;
}
// consider it called
m_calledMsg8b = true;
// assume empty and skip the call for now
m_catRec.reset();
m_catRecValid = true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// let's bring dmoz back
//return &m_catRec;
// compute it otherwise
if ( ! m_msg8b.getCatRec ( &m_firstUrl ,
cr->m_coll ,
gbstrlen(cr->m_coll) ,
true , // use canonical name?
m_niceness ,
&m_catRec , // store here
m_masterState , // state
m_masterLoop )) // callback
// return -1 if we blocked
return (CatRec *)-1;
// error?
if ( g_errno ) return NULL;
// we got it somehow without blocking... local cached lookup?
m_catRecValid = true;
return &m_catRec;
}
void gotWikiResultsWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotWikiResults ( slot );
THIS->m_masterLoop ( THIS->m_masterState );
}
// . get the wiki pages that this page matches
// . use the docids of the wiki pages to represent them
// . use an independent 32-node cluster to index all of wikipedia so it is all
// in ram. do not need datedb, etc.
// . get the gigabits for this page, up to 50 of them, and use that as a rat=0
// query on the wiki cluster
// . score each wiki docid too, based on match
// . normalize scores so they range from 10% to 100%, based on # of gigabits
// that the wiki page matches
// . index these as gbwiki:<wikipagedocid> with the score given (8-bit) mapped
// to 32 bits using score8to32() so the score itself is preserved
// . WE CAN ALSO call this at QUERY TIME, using the actual query of the
// searcher instead of the string of gigabits
// . BUT i will probably just look at the wiki topics of the search results,
// that will be faster and maybe more accurate...
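// . implementation note: we build a raw "GET /search?..." request from the
//   gigabit query and send it via msg type 0xfd to the wiki cluster's
//   proxy; gotWikiResults() then parses the <docId>/<absScore> pairs out
//   of the xml reply into m_wikiDocIds/m_wikiScores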
int64_t **XmlDoc::getWikiDocIds ( ) {
if ( m_wikiDocIdsValid ) return (int64_t **)&ptr_wikiDocIds;
setStatus ( "getting wiki docids" );
// . get our gigabit vector
// . consists of array of 32-bit hashes
// . followed by 1-1 array of 16-bit scores
// . TODO: restrict gigabits to capitalized words and phrases, and
// also to 2+ word wiki titles
char *gq = getGigabitQuery ( );
if ( ! gq || gq == (char *)-1 ) return (int64_t **)gq;
// empty? then no wiki match i guess
//logf(LOG_DEBUG,"FIX ME FIX ME - getWikiDocIds");
// MDW: for now bail here too!
if ( ! gq[0] || 1 == 1 ) {
ptr_wikiDocIds = m_wikiDocIds;
ptr_wikiScores = m_wikiScores;
size_wikiDocIds = 0;
size_wikiScores = 0;
m_wikiDocIdsValid = true;
return (int64_t **)&ptr_wikiDocIds;
}
// set our query to these gigabits
// re-enable this later
//if ( ! m_calledMsg40 ) m_wq.set ( gq );
int32_t need = 200 + gbstrlen(gq);
// make buf
m_wikiqbuf = (char *)mmalloc ( need , "wikiqbuf");
// error?
if ( ! m_wikiqbuf ) return NULL;
// save size
m_wikiqbufSize = need;
// use large single tier for speed
char *p = m_wikiqbuf;
p += sprintf ( p ,
"GET /search?raw=9&n=%"INT32"&sc=0&dr=0&"//dio=1&"
"t0=1000000&rat=0&"
"c=wiki&q=%s", (int32_t)MAX_WIKI_DOCIDS, gq );
// terminate it
*p++ = '\0';
// then put in the ip
*(int32_t *)p = g_hostdb.m_myHost->m_ip;
// skip over ip
p += 4;
// sanity check
if ( p - m_wikiqbuf > need ) { char *xx=NULL;*xx=0; }
int32_t ip = g_conf.m_wikiProxyIp;
// if not given, make it gf1 for now
if ( ! ip ) ip = atoip ( "10.5.62.11" , 10 );
int32_t port = g_conf.m_wikiProxyPort;
// port default too to gf1
if ( ! port ) port = 9002;
// send it using msg 0xfd to the wiki cluster's proxy
if ( ! g_udpServer.sendRequest ( m_wikiqbuf ,
p - m_wikiqbuf ,
0xfd ,
ip ,
port ,
-1 , // hostId
NULL , // retSlot
this , // state
gotWikiResultsWrapper ,
1000 ) )
// we had an error, g_errno should be set
return NULL;
// got without blocking? no way!
return (int64_t **)-1;
}
void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
setStatus ( "got wiki docids" );
// do not free our request in slot
slot->m_sendBufAlloc = NULL;
// free request buf
mfree ( m_wikiqbuf , m_wikiqbufSize , "wikiqbuf" );
// error getting the wiki results?
if ( g_errno ) return;
// TODO: normalize all scores with each other somehow. i think
// they are fairly absolute, but not sure with a lot of rat=0 terms!
logf(LOG_DEBUG,"wiki: fix my scoring stuff. have a min score... "
" and somehow normalize scores to be in [0,1.0]");
// . force this reply to be NULL terminated
// . i can't fix in the code now because the reply is coming from
// a different cluster running an older version of gb
char *s = slot->m_readBuf;
char *end = s + slot->m_readBufSize - 1;
// overwrite the last '>', who cares!
*end = '\0';
// make our xml
Xml xml;
if ( ! xml.set ( s ,
end - s ,
false , // ownData?
0 ,
false ,
TITLEREC_CURRENT_VERSION ,
false , // setParents?
m_niceness ,
CT_HTML ))
// return if g_errno got set
return;
// grab docids
int32_t nd = 0;
int32_t nn = xml.getNumNodes();
XmlNode *nodes = xml.getNodes();
float score = 0.0;
int64_t docId = 0LL;
for ( int32_t i = 0 ; i + 1 < nn ; i++ ) {
if ( nodes[i].m_nodeId != 1 ) continue;
// tagname is <docid>?
if ( nodes[i].m_tagNameLen == 5 &&
nodes[i].m_tagName[0] == 'd' &&
! strncmp(nodes[i].m_tagName,"docId",5) )
docId = atoll ( nodes[i].m_tagName );
// is <score>? (after docid tag)
if ( nodes[i].m_tagNameLen == 8 &&
nodes[i].m_tagName[0] == 'a' &&
! strncmp(nodes[i].m_tagName,"absScore",8) ) {
score = atof ( nodes[i].m_tagName );
// add it
m_wikiDocIds [ nd ] = docId;
m_wikiScores [ nd ] = score;
nd++;
// do not overflow
if ( nd >= MAX_WIKI_DOCIDS ) break;
}
}
// point to them
ptr_wikiDocIds = m_wikiDocIds;
ptr_wikiScores = m_wikiScores;
size_wikiDocIds = nd * 8;
size_wikiScores = nd * sizeof(rscore_t);
log ( LOG_DEBUG , "build: got %"INT32" wiki docids",nd);
m_wikiDocIdsValid = true;
}
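// . sets m_pubDate from Dates::getPubDate(); (uint32_t)-1 means unknown
// . also sets m_ageInDays = (spidered time - pub date) converted to days,
//   clamped to be non-negative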
int32_t *XmlDoc::getPubDate ( ) {
if ( m_pubDateValid ) return (int32_t *)&m_pubDate;
// get date parse
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
// got it
m_pubDateValid = true;
m_pubDate = dp->getPubDate();
// print it once for page parser. we now do this in XmlDoc::print()
//if ( m_pbuf ) m_dates.printPubDates ( m_pbuf );
// set m_ageInDays
if ( m_pubDate == (uint32_t)-1 ) return (int32_t *)&m_pubDate;
// for parsing date
//int32_t currentTime = getTimeGlobal();
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int32_t spideredTime = getSpideredTime();
// get doc age
//float age = currentTime - m_pubDate;
float age = spideredTime - m_pubDate;
// convert to days (could be negative if in the future)
m_ageInDays = age / (3600*24.0);
// fix it if negative
if ( m_ageInDays < 0.0 ) m_ageInDays = 0.0;
return (int32_t *)&m_pubDate;
}
Dates *XmlDoc::getDates ( ) {
if ( m_datesValid ) return &m_dates;
// skip for now
m_datesValid = true;
return &m_dates;
// set status. we can time status changes with this routine!
setStatus ( "getting dates");
Dates *dd = getSimpleDates();
// bail on error
if ( ! dd ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
// need addresses
Addresses *aa = getAddresses ();
if ( ! aa || aa == (void *)-1 ) return (Dates *)aa;
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (Dates *)isRoot;
// . get root doc, from titlerec is ok ( TODO: make sure from titlerec)
// . TODO: make sure to save in titledb too???
// . we need this now too
// . now set DF_IN_ROOTDOC on dates that were in the same section but
// in the root doc.
// . if we are not the root, we use the root title rec to see if
// the website repeats the store hours on every page. in that case
// . TODO: a special cache just for holding "svt" for root pages.
// should be highly efficient!!!
//XmlDoc *rd = NULL;
// setPart2() needs the implied sections set, so set them
Sections *sections = getSections();
if ( !sections ||sections==(Sections *)-1) return(Dates *)sections;
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) return (Dates *)osvt;
// table should be empty if we are the root!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Dates *)rvt;
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (void *)-1 ) return (Dates *)isRSS;
uint8_t *ctype = getContentType();
if ( ! ctype || ctype == (void *)-1 ) return (Dates *)ctype;
bool isXml = false;
if ( *isRSS ) isXml = true;
if ( *ctype == CT_XML ) isXml = true;
int32_t minPubDate = -1;
int32_t maxPubDate = -1;
// parentPrevSpiderTime is 0 if that was the first time that the
// parent was spidered, in which case isNewOutlink will always be set
// for every outlink it had!
if ( m_sreqValid &&
m_sreq.m_isNewOutlink &&
m_sreq.m_parentPrevSpiderTime ) {
// pub date is somewhere between these two times
minPubDate = m_sreq.m_parentPrevSpiderTime;
//maxPubDate = m_sreq.m_addedTime;
maxPubDate = m_sreq.m_discoveryTime;
}
// now set part2 , returns false and sets g_errno on error
if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
isXml , *isRoot )) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: dates2: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
// on all other errors, return NULL
if ( g_errno ) return NULL;
}
// debug EBADENGINEER error
if ( g_errno ) { char *xx=NULL;*xx=0; }
// overflow? does not set g_errno. at least clear all so we do not
// get a messed up partial representation.
//if ( m_dates.m_overflowed ) {
// log("doc: date overflow for %s",m_firstUrl.m_url);
// m_dates.reset();
//}
// only call it once
m_datesValid = true;
// return it
return &m_dates;
}
Dates *XmlDoc::getSimpleDates ( ) {
if ( m_simpleDatesValid ) return &m_dates;
// note that
setStatus("get dates part 1");
// try the current url
Url *u = getCurrentUrl();
// and ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (Dates *)ip;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Dates *)d;
// the site hash
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Dates *)sh32;
// words
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (Dates *)words;
// we set the D_IS_IN_DATE flag for these bits
Bits *bits = getBits(); if ( ! bits ) return NULL;
// sections. is it ok that these do not include implied sections?
Sections *sections = getExplicitSections();
if (!sections||sections==(Sections *)-1) return (Dates *)sections;
// link info (this is what we had the problem with)
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Dates *)info1;
//int32_t *sv = getPageSampleVector();
//if ( ! sv || sv == (int32_t *)-1 ) return (Dates *)sv;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Dates *)xml;
// this must be valid, cuz Dates.cpp uses it!
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0;}
// . get the xml doc of the previously stored title rec
// . Dates will compare the two docs to check for clocks, etc.
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (Dates *)pod;
Url **redir = getRedirUrl();
if ( ! redir || redir == (Url **)-1 ) return (Dates *)redir;
//char *ru = NULL;
//if ( *redir ) ru = (*redir)->getUrl();
// this should deserialize from its title rec data
//Dates *odp = NULL;
//if ( *pod ) odp = (*pod)->getDates ();
// the key in this table is the date tagHash and occNum, and the
// value is the timestamp of the date. this is used by the clock
// detection algorithm to compare a date in the previous version
// of this web page to see if it changed and is therefore a clock then.
// HashTableX *cct = NULL;
// if ( *pod ) cct = (*pod)->getClockCandidatesTable();
// this should be valid
uint8_t ctype = *getContentType();
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// this now returns false and sets g_errno on error, true on success
if ( ! m_dates.setPart1 ( u , //->getUrl(),
*redir, // ru ,
ctype ,
*ip ,
*d ,
*sh32 ,
xml ,
words ,
// set D_IS_IN_DATE flag so Address.cpp
// can avoid such words in addresses!
bits ,
sections ,
info1 ,
//sv ,
//odp , // old dates
NULL , // cct ,
this , // us
*pod , // old XmlDoc
cr->m_coll ,
m_niceness )) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: dates1: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
// on all other errors, return NULL
if ( g_errno ) return NULL;
}
// only call it once
m_simpleDatesValid = true;
// return it
return &m_dates;
}
/*
// returns NULL and sets g_errno on error, returns -1 if blocked
HashTableX *XmlDoc::getClockCandidatesTable ( ) {
// return if valid
if ( m_clockCandidatesTableValid ) return &m_clockCandidatesTable;
// otherwise, deserialize?
if ( m_clockCandidatesDataValid ) {
// and table is now valid
m_clockCandidatesTableValid = true;
// return empty table if ptr is NULL. take this out then.
if(!ptr_clockCandidatesData ) return &m_clockCandidatesTable;
// otherwise, deserialize
m_clockCandidatesTable.deserialize(ptr_clockCandidatesData ,
size_clockCandidatesData,
m_niceness );
// and return that
return &m_clockCandidatesTable;
}
// no longer using this since we got ptr_metadata
return &m_clockCandidatesTable;
// otherwise, get our dates
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (HashTableX *)dp;
// reset table just in case
m_clockCandidatesTable.reset();
// if no dates, bail
if ( dp->m_numDatePtrs == 0 ) {
m_clockCandidatesTableValid = true;
m_clockCandidatesDataValid = true;
// ptr_clockCandidatesData = NULL;
// size_clockCandidatesData = 0;
return &m_clockCandidatesTable;
}
// and set size to 32 buckets to start
if ( ! m_clockCandidatesTable.set (8,4,32,NULL,0,false,m_niceness,
"clockcands") )
return NULL;
// now stock the table
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get date
Date *di = dp->m_datePtrs[i];
// skip if got nuked
if ( ! di ) continue;
// make the key
int64_t key ;
// lower 32 bits is taghash
key = di->m_tagHash;
// upper 32 bits is occNum
key |= ((int64_t)(di->m_occNum)) << 32;
// timestamp is the val
int32_t val = di->m_timestamp;
// then store it
if ( ! m_clockCandidatesTable.addKey ( &key , &val ) )
return NULL;
}
// that is now valid
m_clockCandidatesTableValid = true;
// how many bytes to serialize?
int32_t need = m_clockCandidatesTable.getStoredSize();
// now make the ptr valid
if ( ! m_cctbuf.reserve ( need ) ) return NULL;
// store it in there
m_clockCandidatesTable.serialize ( &m_cctbuf );
// point to it
// ptr_clockCandidatesData = m_cctbuf.getBufStart();
// size_clockCandidatesData = need;
// that is valid now
m_clockCandidatesDataValid = true;
return &m_clockCandidatesTable;
}
*/
// a date of -1 means not found or unknown
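// . tries the first url and then the last redirect url (if different),
//   using parseDateFromUrl() to extract a date embedded in the url itself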
int32_t XmlDoc::getUrlPubDate ( ) {
if ( m_urlPubDateValid ) return m_urlPubDate;
// need a first url. caller should have called setFirstUrl()
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
// use Dates
//Dates dp;
// -1 means unknown
m_urlPubDate = -1;
//m_urlAge = -1;
// try the FIRST url
Url *u = getFirstUrl();
// get last url we redirected to
Url **redir = getRedirUrl();
if ( ! redir || redir == (Url **)-1 ) {char *xx=NULL;*xx=0;}
subloop:
// . try to get the date just from the url
// . this will be zero if none found
m_urlPubDate = parseDateFromUrl ( u->getUrl() );
// we are kosher
m_urlPubDateValid = true;
// if we are unknown try last/redir url, if any
if ( m_urlPubDate == 0 && *redir && u != *redir ) {
u = *redir;
goto subloop;
}
// bail if we did not get a valid pub date from the url
if ( m_urlPubDate == 0 ) return m_urlPubDate;
// note it
log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"",
(uint32_t)m_urlPubDate );
// set the age
//m_urlAge = getTimeGlobal() - m_urlPubDate;
//if ( m_urlAge < 0 ) m_urlAge = 0;
return m_urlPubDate;
}
// . use Dates to extract pub date from the url itself if pub date exists
// . an age of "-1" means unknown
/*
int32_t XmlDoc::getOutlinkAge ( int32_t outlinkNum ) {
// use Dates
Dates dp;
// sanity
if ( outlinkNum < 0 ) { char *xx=NULL;*xx=0; }
// get it
char *us = m_links.getLinkPtr(outlinkNum);
// for now set this, until we mod Dates to use normalized
// string urls
Url u;
u.set ( us );
// try to get the date just from the url
if ( ! dp.set ( &u ,
0 , // ip
0LL , // m_newDocId
0 , // siteHash
NULL , // Xml
NULL , // Words
NULL , // Bits
NULL , // Sections
NULL , // LinkInfo
NULL , // pageSampleVec
NULL , // old date parse2
NULL , // m_newDoc
NULL , // m_oldDoc
m_coll ,
0 , // defaultTimeZone
m_niceness )){
// should never block!
char *xx=NULL; *xx= 0; }
// this will be -1 if no date was found in the url
int32_t urlPubDate = dp.getPubDate();
// if we got a valid pub date from the url, set "m_urlAge"
if ( urlPubDate == -1 ) return -1;
// note it
//log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"", m_urlDate );
// set the age
int32_t age = getTimeGlobal() - urlPubDate;
// keep positive
if ( age < 0 ) age = 0;
// return it
return age;
}
*/
// . sets g_errno on error and returns NULL
// . now returns a ptr to it so we can return NULL to signify error, that way
// all accessors have equivalent return values
// . an accessor function returns (char *)-1 if it blocked!
char *XmlDoc::getIsPermalink ( ) {
if ( m_isPermalinkValid ) return &m_isPermalink2;
Url *url = getCurrentUrl();
if ( ! url ) return NULL;
char *isRSS = getIsRSS();
// return NULL with g_errno set, -1 if blocked
if ( ! isRSS || isRSS == (char *)-1 ) return isRSS;
Links *links = getLinks();
// return NULL with g_errno set, -1 if blocked
if ( ! links || links == (Links *)-1 ) return (char *)links;
uint8_t *ct = getContentType();
// return NULL with g_errno set, -1 if blocked
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
// GUESS if it is a permalink by the format of the url
int32_t p = ::isPermalink ( links , // Links ptr
url ,
*ct , // CT_HTML default?
NULL , // LinkInfo ptr
*isRSS );// isRSS?
m_isPermalink = p;
m_isPermalink2 = p;
m_isPermalinkValid = true;
return &m_isPermalink2;
}
// guess based on the format of the url if this is a permalink
char *XmlDoc::getIsUrlPermalinkFormat ( ) {
if ( m_isUrlPermalinkFormatValid ) return &m_isUrlPermalinkFormat;
setStatus ( "getting is url permalink format" );
Url *url = getCurrentUrl();
if ( ! url ) return NULL;
// just guess if we are rss here since we most likely do not have
// access to the url's content...
bool isRSS = false;
char *ext = url->getExtension();
if ( ext && strcasecmp(ext,"rss") == 0 ) isRSS = true;
// GUESS if it is a permalink by the format of the url
int32_t p = ::isPermalink ( NULL , // Links ptr
url ,
CT_HTML ,
NULL , // LinkInfo ptr
isRSS );// we guess this...
m_isUrlPermalinkFormat = p;
m_isUrlPermalinkFormatValid = true;
return &m_isUrlPermalinkFormat;
}
char *XmlDoc::getIsRSS ( ) {
if ( m_isRSSValid ) return &m_isRSS2;
// the xml tells us for sure
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
m_isRSS = xml->isRSSFeed();
m_isRSS2 = (bool)m_isRSS;
m_isRSSValid = true;
return &m_isRSS2;
}
char *XmlDoc::getIsSiteMap ( ) {
if ( m_isSiteMapValid ) return &m_isSiteMap;
uint8_t *ct = getContentType();
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
char *uf = m_firstUrl.getFilename();
int32_t ulen = m_firstUrl.getFilenameLen();
// sitemap.xml
m_isSiteMap = false;
// must be xml to be a sitemap
if ( *ct == CT_XML &&
ulen == 11 &&
strncmp(uf,"sitemap.xml",11) == 0 )
m_isSiteMap = true;
m_isSiteMapValid = true;
return &m_isSiteMap;
}
// . this function should really be called getTagTokens() because it mostly
// works on HTML documents, not XML, and just sets an array of ptrs to
// the tags in the document, including ptrs to the text in between
// tags.
Xml *XmlDoc::getXml ( ) {
// return it if it is set
if ( m_xmlValid ) return &m_xml;
// note it
setStatus ( "parsing html");
// get the filtered content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
int32_t u8len = size_utf8Content - 1;
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
// set it
if ( ! m_xml.set ( *u8 ,
u8len ,
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
*ct ) )
// return NULL on error with g_errno set
return NULL;
// set just once
m_xmlValid = true;
// all done
return &m_xml;
}
// Language support static stuff
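// each METHOD_* below indexes one language-guessing signal (meta tag,
// dmoz, url, outlinks, inlinks, word freq, default, ip, root page) used
// by the weighted-vote code that is now commented out in computeLangId()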
enum {
METHOD_TAG = 0,
METHOD_DMOZ,
METHOD_URL,
METHOD_OUTLINKS,
METHOD_INLINKS,
METHOD_FREQ,
METHOD_DEFAULT,
METHOD_IP,
METHOD_ROOT,
METHOD_CAP
};
bool setLangVec ( Words *words ,
SafeBuf *langBuf ,
Sections *ss ,
int32_t niceness ) {
int64_t *wids = words->getWordIds ();
char **wptrs = words->m_words;
int32_t nw = words->getNumWords ();
// allocate
if ( ! langBuf->reserve ( nw ) ) return false;
uint8_t *langVector = (uint8_t *)langBuf->getBufStart();
// now set the langid
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( niceness );
// default
langVector[i] = langUnknown;
// add the word
if ( wids[i] == 0LL ) continue;
// skip if number
if ( is_digit(wptrs[i][0]) ) {
langVector[i] = langTranslingual;
continue;
}
// get the lang bits. does not include langTranslingual
// or langUnknown
int64_t bits = g_speller.getLangBits64 ( &wids[i] );
// skip if not unique
char count = getNumBitsOn64 ( bits ) ;
// if we only got one lang we could be, assume that
if ( count == 1 ) {
// get it. bit #0 is english, so add 1
char langId = getBitPosLL((uint8_t *)&bits) + 1;
//langVector[i] = g_wiktionary.getLangId(&wids[i]);
langVector[i] = langId;
continue;
}
// ambiguous? set it to unknown then
if ( count >= 2 ) {
langVector[i] = langUnknown;
continue;
}
// try setting based on script. greek. russian. etc.
// if the word was not in the wiktionary.
// this will be langUnknown if not definitive.
langVector[i] = getCharacterLanguage(wptrs[i]);
}
// . now go sentence by sentence
// . get the 64 bit vector for each word in the sentence
// . then intersect them all
// . if the result is a unique langid, assign that langid to
// all words in the sentence
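// . example: in a short spanish sentence each word's wiktionary mask may
//   be ambiguous on its own, but ANDing the masks together will usually
//   leave only the spanish bit set, so every word in that sentence gets
//   langSpanish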
// get first sentence in doc
Section *si = NULL;
if ( ss ) si = ss->m_firstSent;
// scan the sentence sections and or in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL ( niceness );
// reset vec
int64_t bits = LANG_BIT_MASK;
// get lang 64 bit vec for each wid in sentence
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
// breathe
QUICKPOLL ( niceness );
// skip if not alnum word
if ( ! wids[j] ) continue;
// skip if starts with digit
if ( is_digit(wptrs[j][0]) ) continue;
// get 64 bit lang vec. does not include
// langUnknown or langTranslingual bits
bits &= g_speller.getLangBits64 ( &wids[j] );
}
// bail if none
if ( ! bits ) continue;
// skip if more than one language in intersection
if ( getNumBitsOn64(bits) != 1 ) continue;
// get it. bit #0 is english, so add 1
char langId = getBitPosLL((uint8_t *)&bits) + 1;
// ok, must be this language i guess
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
// breathe
QUICKPOLL ( niceness );
// skip if not alnum word
if ( ! wids[j] ) continue;
// skip if starts with digit
if ( is_digit(wptrs[j][0]) ) continue;
// set it
langVector[j] = langId;
}
}
// try the same thing but do not use sentences. use windows of
// 5 words. this will pick up pages that have an english menu
// where each menu item is an individual sentence and only
// one word.
// http://www.topicexchange.com/
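// note we wait until at least 3 dictionary words have been seen (the
// "++total <= 2" check below) before intersecting the window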
int64_t window[5];
int32_t wpos[5];
memset ( window , 0 , 8*5 );
int32_t wp = 0;
int32_t total = 0;
// now set the langid
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( niceness );
// must be alnum
if ( ! wids[i] ) continue;
// skip if starts with digit
if ( is_digit(wptrs[i][0]) ) continue;
// skip if lang already set to a language
//if ( langVector[i] != langUnknown &&
// langVector[i] != langTranslingual )
// continue;
// get last 5
window[wp] = g_speller.getLangBits64 ( &wids[i] );
// skip if not in dictionary!
if ( window[wp] == 0 ) continue;
// otherwise, store it
wpos [wp] = i;
if ( ++wp >= 5 ) wp = 0;
// need at least 3 samples
if ( ++total <= 2 ) continue;
// intersect them all together
int64_t bits = LANG_BIT_MASK;
for ( int32_t j = 0 ; j < 5 ; j++ ) {
// skip if uninitialized, like if we only have 3 or 4
// samples so far
if ( ! window[j] ) continue;
// otherwise, toss it in the intersection
bits &= window[j];
}
// skip if intersection empty
if ( ! bits ) continue;
// skip if more than one language in intersection
if ( getNumBitsOn64(bits) != 1 ) continue;
// get it. bit #0 is english, so add 1
char langId = getBitPosLL((uint8_t *)&bits) + 1;
// set all in window to this language
for ( int32_t j = 0 ; j < 5 ; j++ ) {
// skip if uninitialized
if ( ! window[j] ) continue;
// otherwise, set it
langVector[wpos[j]] = langId;
}
}
return true;
}
// 1-1 with the words!
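// . returns NULL with g_errno set on error, -1 if blocked, and
//   (uint8_t *)0x01 if the vector buffer is empty, since NULL means error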
uint8_t *XmlDoc::getLangVector ( ) {
if ( m_langVectorValid ) {
// can't return NULL, that means error!
uint8_t *v = (uint8_t *)m_langVec.getBufStart();
if ( ! v ) return (uint8_t *)0x01;
return v;
}
// words
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (uint8_t *)ss;
if ( ! setLangVec ( words , &m_langVec , ss , m_niceness) )
return NULL;
m_langVectorValid = true;
// can't return NULL, that means error!
uint8_t *v = (uint8_t *)m_langVec.getBufStart();
if ( ! v ) return (uint8_t *)0x01;
return v;
}
// returns -1 and sets g_errno on error
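// . tries the body words first, then falls back to the meta description
//   and then the meta keywords if the language is still unknown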
uint8_t *XmlDoc::getLangId ( ) {
if ( m_langIdValid ) return &m_langId;
setStatus ( "getting lang id");
// debug hack
//m_langId = langRussian;
//m_langIdValid = true;
//return &m_langId;
// get the stuff we need
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (uint8_t *)ip;
// . if we got no ip, we can't get the page...
// . also getLinks() will call getSiteNumInlinks() which will
// call getSiteLinkInfo() and will core if ip is 0 or -1
if ( *ip == 0 || *ip == -1 ) {
m_langId = langUnknown;
m_langIdValid = true;
return &m_langId;
}
//Xml *xml = getXml ();
//if ( ! xml || xml == (Xml *)-1 ) return (uint8_t *)xml;
Words *words = getWords ();
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
// do not get regular sections, getSections() which will call
// getImpliedSections(), because then that will need to set addresses
// and dates, etc. the addresses could return NULL with EBUFOVERFLOW
// from a static buffer overflow causing us some problems here and
// since that g_errno is only really handled well in getIndexCode()
// it will log that CRITICAL CRITICAL message. and we really only
// need the sections to avoid looking at script tag sections, etc.
// when calling Words::getLanguage()
Sections *sections = getExplicitSections();
// did it block?
if ( sections==(Sections *)-1) return(uint8_t *)sections;
// well, it still calls Dates::parseDates which can return g_errno
// set to EBUFOVERFLOW...
if ( ! sections && g_errno != EBUFOVERFLOW ) return NULL;
// if sections is still NULL - try lang id without sections then,
// reset g_errno
g_errno = 0;
//Links *links = getLinks();
//if ( ! links || links == (Links *)-1 ) return (uint8_t *)links;
//LinkInfo *info1 = getLinkInfo1();
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (uint8_t *)info1;
//CatRec *cat = getCatRec ();
//if ( ! cat || cat == (CatRec *)-1) return (uint8_t *)cat;
uint8_t *lv = getLangVector();
if ( ! lv || lv == (void *)-1 ) return (uint8_t *)lv;
setStatus ( "getting lang id");
// compute langid from vector
m_langId = computeLangId ( sections , words, (char *)lv );
if ( m_langId != langUnknown ) {
m_langIdValid = true;
return &m_langId;
}
// . try the meta description i guess
// . 99% of the time we don't need this because the above code
// captures the language
int32_t mdlen;
char *md = getMetaDescription( &mdlen );
Words mdw;
mdw.setx ( md , mdlen , m_niceness );
SafeBuf langBuf;
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
char *tmpLangVec = langBuf.getBufStart();
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
if ( m_langId != langUnknown ) {
m_langIdValid = true;
return &m_langId;
}
// try meta keywords
md = getMetaKeywords( &mdlen );
mdw.setx ( md , mdlen , m_niceness );
langBuf.purge();
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
tmpLangVec = langBuf.getBufStart();
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
m_langIdValid = true;
return &m_langId;
}
// lv = langVec
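// . tallies the per-word langids in lv[], skipping words inside script or
//   style sections and words that look like part of a url, then returns
//   the most frequent language (ignoring unknown and translingual)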
char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// this means null too
if ( sections && sections->m_numSections == 0 ) sp = NULL;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE;//|SEC_SELECT;
int32_t counts [ MAX_LANGUAGES ];
memset ( counts , 0 , MAX_LANGUAGES * 4);
int32_t nw = words->getNumWords ();
char **wptrs = words->m_words;
int32_t *wlens = words->m_wordLens;
// now set the langid
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if in script or style section
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
//
// skip if in a url
//
// blah/
if ( wptrs[i][wlens[i]] == '/' ) continue;
// blah.blah or blah?blah
if ( (wptrs[i][wlens[i]] == '.' ||
wptrs[i][wlens[i]] == '?' ) &&
is_alnum_a(wptrs[i][wlens[i]+1]) )
continue;
// /blah or ?blah
if ( (i>0 && wptrs[i][-1] == '/') ||
(i>0 && wptrs[i][-1] == '?') )
continue;
// add it up
counts[(unsigned char)lv[i]]++;
}
// get the majority count
int32_t max = 0;
int32_t maxi = 0;
// skip langUnknown by starting at 1, langEnglish
for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
// skip translingual
if ( i == langTranslingual ) continue;
if ( counts[i] <= max ) continue;
max = counts[i];
maxi = i;
}
return maxi;
//m_langId = maxi;
//m_langIdValid = true;
//return &m_langId;
/*
int32_t freqScore = 0;
int32_t lang;
if ( ! m_processedLang ) {
// do not repeat this call for this document
m_processedLang = true;
lang = words->getLanguage( sections ,
1000 , // sampleSize ,
m_niceness,
&freqScore);
// return NULL on error with g_errno set
if ( lang == -1 ) return NULL;
// we got it from words, return
if ( lang != 0 ) {
m_langId = lang;
m_langIdValid = true;
return &m_langId;
}
}
m_langId = 0;
// try from charset
uint16_t *charset = getCharset ( );
if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset;
// do based on charset
if ( *charset == csGB18030 ) m_langId = langChineseTrad;
if ( *charset == csGBK ) m_langId = langChineseSimp;
if ( m_langId ) {
m_langIdValid = true;
return &m_langId;
}
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
// this lookup here might be unnecessary
uint8_t *rl = NULL;
if ( ! *isRoot ) {
rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
}
//Url *u = getCurrentUrl();
Url *u = getFirstUrl();
uint8_t gs[METHOD_CAP];
// reset language method vector
memset( gs , 0, sizeof(uint8_t) * METHOD_CAP );
// Let the site tell us what language it's in
gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml );
// Guess from the FIRST URL (unredirected url)
gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() );
// Guess from the outlinks
gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links );
// Guess from the inlinks
gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip);
// root page's language, if there was one
if ( ! *isRoot ) gs [METHOD_ROOT] = *rl;
int32_t scores[MAX_LANGUAGES];
memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES );
// weights for the 9 methods
char cw[] = { 8,9,4,7,6,7,8,1,2};
// add up weighted scores
for(int i = 0; i < METHOD_CAP; i++ )
scores[gs[i]] += cw[i];
// reset the "lang" to langUnknown which is 0
lang = langUnknown ;
int max, oldmax;
max = oldmax = 0;
// find best language
for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) {
if ( scores[i] < max) continue;
oldmax = max;
max = scores[i];
lang = i;
}
// give up if not too conclusive
if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) {
//log(LOG_DEBUG, "build: Language: Threshold, score "
// "(%"INT32" - %"INT32") %"INT32" vs. %"INT32".\n",
// (int32_t)max,
// (int32_t)oldmax,
// (int32_t)max - oldmax,
// (int32_t)3);//(int32_t)cr->m_languageThreshold);
lang = langUnknown;
}
// Make sure we're over the bailout value, this
// keeps low scoring methods like TLD from being
// the decider if it was the only successful method.
if ( max < 5 ) { // cr->m_languageBailout ) {
//log(LOG_DEBUG, "build: Language: Bailout, "
// "score %"INT32" vs. %"INT32".",
// (int32_t)max, (int32_t)5);//cr->m_languageBailout);
lang = langUnknown;
}
// If the language is still not known,
// use the language detected from the frames.
//if(lang == langUnknown) lang = frameFoundLang;
// . try dmoz if still unknown
// . limit to 10 of them
// all done, do not repeat
m_langIdValid = true;
m_langId = lang;
m_langIdScore = max;
return &m_langId;
*/
}
Words *XmlDoc::getWords ( ) {
// return it if it is set
if ( m_wordsValid ) return &m_words;
// this will set it if necessary
Xml *xml = getXml();
// returns NULL on error, -1 if blocked
if ( ! xml || xml == (Xml *)-1 ) return (Words *)xml;
// note it
setStatus ( "getting words");
// now set what we need
if ( ! m_words.set ( xml ,
true , // computeWordIds?
m_niceness ))
return NULL;
// we got it
m_wordsValid = true;
return &m_words;
}
Bits *XmlDoc::getBits ( ) {
// return it if it is set
if ( m_bitsValid ) return &m_bits;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Bits *)words;
// now set what we need
if ( ! m_bits.set ( words , m_version , m_niceness ) )
return NULL;
// we got it
m_bitsValid = true;
return &m_bits;
}
Bits *XmlDoc::getBitsForSummary ( ) {
// return it if it is set
if ( m_bits2Valid ) return &m_bits2;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Bits *)words;
// now set what we need
if ( ! m_bits2.setForSummary ( words ) ) return NULL;
// we got it
m_bits2Valid = true;
return &m_bits2;
}
Pos *XmlDoc::getPos ( ) {
// return it if it is set
if ( m_posValid ) return &m_pos;
// this will set it if necessary
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Pos *)ww;
//Sections *sections = getSections();
//if ( !sections ||sections==(Sections *)-1) return(Pos *)sections;
// now set what we need
//if ( ! m_pos.set ( ww , sections ) ) return NULL;
if ( ! m_pos.set ( ww , NULL ) ) return NULL;
// we got it
m_posValid = true;
return &m_pos;
}
Phrases *XmlDoc::getPhrases ( ) {
// return it if it is set
if ( m_phrasesValid ) return &m_phrases;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Phrases *)words;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// now set what we need
if ( ! m_phrases.set ( words ,
bits ,
true , // use stop words
false , // use stems
m_version ,
m_niceness ) )
return NULL;
// we got it
m_phrasesValid = true;
return &m_phrases;
}
/*
Synonyms *XmlDoc::getSynonyms ( ) {
// return if already set
if ( m_synonymsValid ) return &m_synonyms;
// this will set it if necessary
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (Synonyms *)words;
Phrases *phrases = getPhrases ();
if ( ! phrases || phrases == (void *)-1 ) return (Synonyms *)phrases;
uint8_t *lv = getLangVector();
if ( ! lv || lv == (void *)-1 ) return (Synonyms *)lv;
// primary language of the document
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (Synonyms *) langId;
// . now set what we need
// . provide a buf for which synonyms can be stored if we need to
SafeBuf *synBuf = NULL;
if ( m_pbuf || m_storeTermListInfo ) synBuf = &m_synBuf;
// force on for printing out the synonyms in the loop below
//synBuf = &m_synBuf;
if ( ! m_synonyms.set ( words,
(char *)lv,
(char)*langId,phrases,
m_niceness,synBuf) )
return NULL;
// we got it
m_synonymsValid = true;
return &m_synonyms;
}
*/
Sections *XmlDoc::getExplicitSections ( ) {
// these sections might or might not have the implied sections in them
if ( m_explicitSectionsValid ) return &m_sections;
// if json forget this it is only html
//uint8_t *ct = getContentType();
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
// m_sectionsValid = true;
// return &m_sections;
//}
setStatus ( "getting explicit sections" );
// use the old title rec to make sure we parse consistently!
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;
// shortcut
//XmlDoc *od = *pod;
// if the serialized section is valid, use that
//char *sd = NULL;
//bool valid = false;
//if ( od && od->m_sectionsReplyValid ) valid = true;
//if ( valid ) sd = od->ptr_sectionsReply;
// shouldn't we use the section data in ptr_sections for this???
//bool valid = m_sectionsReplyValid ;
//char *sd = NULL;
//if ( valid ) sd = ptr_sectionsReply;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
// need these too now
Phrases *phrases = getPhrases();
if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// the site hash
int64_t *sh64 = getSiteHash64();
// sanity check
if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; }
if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus ( "getting sections");
//char *sv = NULL;
//if ( m_setFromTitleRec ) sv = ptr_sectionsVotes;
// time this so the log message below can flag slow urls
int64_t start = gettimeofdayInMillisecondsLocal();
// this uses the sectionsReply to see which sections are "text", etc.
// rather than compute it expensively
if ( ! m_calledSections &&
// we get malformed sections error for some diffbot replies
//*ct != CT_JSON &&
! m_sections.set ( &m_words ,
&m_phrases ,
bits ,
getFirstUrl() ,
*d ,
*sh64 , // 64 bits
cr->m_coll ,
m_niceness ,
m_masterState , // state
m_masterLoop , // callback
*ct ,
&m_dates ,
NULL , // sd // sections data
true , // sections data valid?
NULL , // sv // for m_nsvt
//*tph ,
NULL , // buf
0 )) { // bufSize
m_calledSections = true;
// sanity check, this should not block, we are setting
// exclusively from the titleRec
//if ( sd ) { char *xx=NULL;*xx=0; }
// it blocked, return -1
return (Sections *) -1;
}
int64_t end = gettimeofdayInMillisecondsLocal();
if ( end - start > 1000 )
log("build: %s section set took %"INT64" ms",
m_firstUrl.m_url,end -start);
// error? ETAGBREACH for example... or maybe ENOMEM
if ( g_errno ) return NULL;
// set inlink bits
m_bits.setInLinkBits ( &m_sections );
// we got it
m_explicitSectionsValid = true;
return &m_sections;
}
Sections *XmlDoc::getImpliedSections ( ) {
if ( m_impliedSectionsValid ) return &m_sections;
// get the sections without implied sections
Sections *sections = getExplicitSections();
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
// just use that for now if not doing events to save time! because
// adding implied sections really sucks the resources.
m_impliedSectionsValid = true;
return &m_sections;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
// now we need basic date types to add implied sections that
// have a dow/dom header and tod brother sections
// THIS WAS in getExplicitSections() but now m_wids is NULL.
// m_wids is set in setPart1() called by XmlDoc::getSimpleDates(),
// which calls getExplicitSections().
// . This was called for the benefit of Sections::addImpliedSections()
// but now getAddresses() which we call below ends up calling
// getSimpleDates() which calls m_dates.setPart1() which calls
// m_dates.parseDates() so this is no longer needed i guess.
/*
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits,
sections, m_niceness , &m_firstUrl ,
*ct )) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: dates3: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
// on all other errors, return NULL
if ( g_errno ) return NULL;
}
*/
// if we got no sections it was bad html. so don't go any further
// lest we core in other code..
// it might have also just been an empty doc.
// either way we'll core in getAddresses cuz it calls getSimpleDates
// which will core in Dates::setPart1() trying to use m_sectionPtrs
if ( sections->m_numSections == 0 ) {
m_impliedSectionsValid = true;
// hack to avoid core for empty docs like www.mini-polis.com
sections->m_addedImpliedSections = true;
return &m_sections;
}
// . now set addresses so we can use those to add implied sections
// . this calls getSimpleDates() which calls m_dates.setPart1()
// which calls parseDates again
Addresses *aa = getAddresses ();
if ( ! aa || aa == (void *)-1 ) return (Sections *)aa;
// . now add implied sections
// . return NULL with g_errno set on error
if ( ! m_sections.addImpliedSections ( aa ) ) return NULL;
// we got it
m_impliedSectionsValid = true;
return &m_sections;
}
// add in Section::m_sentFlags bits having to do with our voting tables
Sections *XmlDoc::getSections ( ) {
setStatus("getting sections");
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
// hash the turk votes (each vote maps a contenthash or taghash to
// a value) and use these to set sections sentence flags, etc.
//HashTableX *tvt = getTurkVotingTable ();
//if ( ! tvt || tvt == (void *)-1 ) return (Sections *)tvt;
// returns NULL if our url is root!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
SectionVotingTable *osvt = getOldSectionVotingTable();
if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt;
uint32_t *tph = getTagPairHash32();
if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph;
// need a getUseSectiondb() function...
if ( ! m_useSectiondb ) {
m_sectionsValid = true;
return &m_sections;
}
// start here
Section *si;
/*
// get first sentence in doc
si = ss->m_firstSent;
// do not bother scanning if no votes
if ( osvt->getNumVotes() <= 0 ) si = NULL;
// scan the sentence sections and or in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL(m_niceness);
// combine section tagHash with contentHashAll to get
// the "modified tagHash"
int32_t modified = si->m_tagHash ^ si->m_contentHash;
// save this
float dups = osvt->getNumSampled (modified,SV_TAGCONTENTHASH);
// . getNumSampled() combines both m_nsvt and m_osvt so it
// includes ourselves... NO!... let's change this!
// the osvt should not include votes from us!
// it strips those out in SectionVotingTable::addListOfVotes()
// . if it is a print-friendly version of the same page then
// one of the two should have been deduped and not indexed,
// so be strict with adhering to no more than 1!
if ( dups > 0 ) si->m_flags |= SEC_DUP;
// . content hash must be unique!
// . can detect texty bios repeated throughout the site
// . this is the hash of the words directly in the section
// . HACK: the contentHash is the "tagHash" for this call
// . SectionVote::m_numSampled is how many sections over all
// docs we indexed from this site have this m_contentHash
// . note that it is not restricted to pages with the same
// tagPairHash as us (i.e. pages with similar layouts)
// therefore it is very flexible!!! it is only restricted
// to pages with our same site hash.
// . getNumSampled() combines both m_nsvt and m_osvt so it
// includes ourselves
// . if it is a print-friendly version of the same page then
// one of the two should have been deduped and not indexed,
// so be strict with adhering to no more than 1!
if ( dups > 0 ) continue;
// . must be in a unique section
// . if the section has siblings, skip it!
if ( si->m_numOccurences > 1 ) continue;
// . eliminate dynamic menus
// . like "related posts" menus
// . therefore require that we must be "texty" ...
// . i.e. be like 80% plain text and no more than 20% link text
// . vote on this since in some cases article may be mostly
// just all in anchor text on a few article pages, but on
// other pages it is well-behaved
if ( osvt->getScore ( si->m_tagHash, SV_TEXTY) < .80 )
continue;
// . check for comment sections
// . these are text and the content is unique
// . BUT the section tagHash is typically repeated at least
// once on some other pages (HOPEFULLY!!!!)
// . if we only require there be X other pages from this site
// with the same layout, we might get unlucky in that each
// page has 1 or less comments!!! how to fix???
// . anyway, we ask for the max # sampled from all of the votes
// here because if just one page has 2+ copies of this
// section enum tag hash, that is enough to be a comment
// section
// . SV_TEXTY_MAX_SAMPLED is a statistic compiled from the
// voters and does not actually exist in sectiondb per se.
// we add this statistic transparently in addVote() below
// . it just gets the num sampled from the voter that had the
// maximum m_numSampled value, because we don't want an
// average in this case
if ( osvt->getNumSampled(si->m_tagHash,SV_TEXTY_MAX_SAMPLED)>0)
continue;
// set it
si->m_flags |= SEC_ARTICLE;
// tally it up
//m_numAlnumWordsInArticle += si->m_exclusive;
// and another flag
//m_hadArticle = true;
}
*/
//
// . how many other pages from this site have our tagpairhash?
// . that is all the unique adjacent tag pair hashes xor'd together
// . kind of represents the template of the webpage, ideally
//
//int32_t numSimLayouts = osvt->getNumSampled ( *tph , SV_TAGPAIRHASH );
///////////////////////////////////////
//
// set m_dupVotes and m_notDupVotes for each section
//
// answers the question... out of all the pages with this taghash,
// from this site, how often is this content repeated?
//
// trumba.com often repeats an event on its various feeds, but
// not on EVERY page. so we should adjust the event title penalties
// based on the ratio of repeated to not-repeated from the various
// pages on the site that have the same *taghash*
//
///////////////////////////////////////
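// in other words, for each sentence: m_votesForDup = how many pages from
// this site had the same tag hash AND the same sentence content, and
// m_votesForNotDup = pages that had the tag hash but different content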
// get first sentence in doc
si = ss->m_firstSent;
// do not bother scanning if no votes
if ( osvt->getNumVotes() <= 0 ) si = NULL;
// assume no dups
m_maxVotesForDup = 0;
// scan the sentence sections and or in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL ( m_niceness );
// sanity check
if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
// how many pages from this site have this taghash for
// a sentence
float nt;
nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH);
// skip if nobody! (except us)
if ( nt <= 0.0 ) continue;
// . get out tag content hash
// . for some reason m_contentHash is 0 for like menu-y sectns
int32_t modified =si->m_turkTagHash32^si->m_sentenceContentHash64;
// . now how many pages also had same content in that tag?
// . TODO: make sure numsampled only counts a docid once!
// and this is not each time it occurs on that page.
float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH);
// cast it to a int32_t
int32_t votes1 = (int32_t)nsam;
// by default, complement
int32_t votes2 = (int32_t)nt - votes1;
// store votes
si->m_votesForDup = votes1;
si->m_votesForNotDup = votes2;
// what's the most dup votes we had...
if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1;
// set it
//if ( si->m_votesForDup > 2 * si->m_votesForNotDup &&
// si->m_votesForDup >= 1 &&
// ! (si->m_flags & SEC_HAS_NONFUZZYDATE) )
// si->m_sentFlags |= SENT_DUP_SECTION;
}
m_sectionsValid = true;
return &m_sections;
}
SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) {
if ( m_nsvtValid ) return &m_nsvt;
// need sections
Sections *ss = getSections();
if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
// and dates
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (SectionVotingTable *)dp;
// hash of all adjacent tag pairs
uint32_t *tph = getTagPairHash32 ( ) ;
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
// are we a site root url?
//char *isRoot = getIsSiteRoot();
//if ( ! isRoot || isRoot == (char *)-1 )
// return (SectionVotingTable *)isRoot;
// init table
if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL;
// . tally the section votes from the sections class
// . only add the date votes, not the taghash/contenthash keys
// from the root, since we add those from the root voting table
// into m_osvt directly!
// . we no longer have root voting table!
// . this adds keys of the hash of each tag xpath
// . and it adds keys of the hash of each tag path PLUS its innerhtml
if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL;
// tally the section votes from the dates
if ( ! dp->addVotes ( &m_nsvt ) ) return NULL;
// our new section voting table is now valid, and ready to be added
// to sectiondb by calling SectionVotingTable::hash()
m_nsvtValid = true;
return &m_nsvt;
}
// . scan every section and look up its tag and content hashes in
// sectiondb to find out how many pages and sites have the same hash
// . use the secondary sectiondb key, key2
// . then store the stats in the Sections::m_stats class
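// . note on control flow: m_si is the resume pointer, so when a
//   getSectionStats() lookup blocks we return -1 and pick up where we
//   left off on re-entry, keeping at most ~10 multicast lookups in
//   flight (UdpServer.cpp limits outstanding 0x39 requests)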
Sections *XmlDoc::getSectionsWithDupStats ( ) {
Sections *ss = getSections();
if ( !ss ||ss==(Sections *)-1) return(Sections *)ss;
if ( m_gotDupStats ) return ss;
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32;
uint32_t siteHash32 = (uint32_t)*sh32;
//int64_t *shp64 = getSiteHash64();
//if ( ! shp64 || shp64 == (void *)-1 ) return (Sections *)shp64;
//int64_t siteHash48 = *shp64 & 0x0000ffffffffffffLL;
// first time called? then init m_nextSection.
//Section *si = m_si;
// if this is -1, we are called for the first time
if ( m_si == (void *)-1 ) {
m_si = ss->m_rootSection;
m_mcastRequestsIn = 0;
m_mcastRequestsOut = 0;
m_secStatsErrno = 0;
}
//sec_t menuFlags = SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ;
for ( ; m_si ; m_si = m_si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index.
if ( ! ( m_si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( m_si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64;
if ( ! val32 )
continue;
// skip if menu!
//if ( m_si->m_flags & menuFlags ) continue;
// get section xpath hash combined with sitehash
uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32;
// convert this to 32 bits
uint32_t innerHash32 ;
//sentHash32 = (uint32_t)m_si->m_sentenceContentHash64;
innerHash32 = (uint32_t)m_si->m_indirectSentHash64;
// save in case we need to read more than 5MB
//m_lastSection = si;
// . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32
// . we hack the "sentContentHash32" into each posdb key
// as the "value" so we can do a facet-like histogram
// over all the possible values this xpath has for this site
SectionStats *stats = getSectionStats ( secHash32,
innerHash32,
false ); // cache only?
// it returns -1 if would block
if ( stats == (void *)-1 ) {
// count it as outstanding
//m_mcastRequestsOut++;
// launch more if we have room
// UdpServer.cpp has a limit of 10 on 0x39 requests
if ( m_mcastRequestsOut - m_mcastRequestsIn < 10)
continue;
// advance m_si so we do not repeat
m_si = m_si->m_next;
// otherwise, return -1 to indicate blocked
return (Sections *)-1;
}
// NULL means g_errno
if ( ! stats ) {
// ensure g_errno is set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// save it
m_secStatsErrno = g_errno;
// clear it
g_errno = 0;
// if still waiting though return -1
if ( m_mcastRequestsOut > m_mcastRequestsIn )
return (Sections *)-1;
// otherwise, all done i guess
return NULL;
}
// if already in the table, skip it!
}
// waiting for more replies to come back?
if ( m_mcastRequestsOut > m_mcastRequestsIn )
return (Sections *) -1;
// now scan the sections and copy the stats from the table
// into Section::m_stats of each sentence section.
// use the key hash as the hash of the tag/xpath and the innerhtml
// and the val instead of being site hash will be hash of the
// content. then we can get the histogram of our content hash
// for this xpath on our site.
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// skip if no content to hash
//if ( ! si->m_sentenceContentHash64 ) continue;
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 )
continue;
// skip if menu!
//if ( si->m_flags & menuFlags ) continue;
// get section xpath hash combined with sitehash
uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32;
// convert this to 32 bits
uint32_t innerHash32 ;
innerHash32 = (uint32_t)si->m_indirectSentHash64;
// the "stats" class should be in the table from
// the lookups above!!
SectionStats *stats = getSectionStats ( secHash32,
innerHash32,
true ); // cache only?
// sanity
//if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;}
// must have had a network error or something
if ( ! stats ) continue;
// copy
gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) );
}
//
// now if a section has no stats but has the same
// m_indirectSentHash64 as a kid, take his stats
//
Section *sx = ss->m_rootSection;
for ( ; sx ; sx = sx->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( sx->m_flags & SEC_HASHXPATH ) )
continue;
// scan up parents and set their stats to ours as long as
// they have the same indirect sent hash64
Section *p = sx->m_parent;
for ( ; p ; p = p->m_parent ) {
// if parent is like an img tag, skip it
if ( p->m_tagId == TAG_IMG )
continue;
if ( p ->m_indirectSentHash64 !=
sx->m_indirectSentHash64 )
break;
// copy it to parent with the same inner html hash
gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats));
}
}
// now free the table's mem
m_sectionStatsTable.reset();
m_gotDupStats = true;
return ss;
}
static void gotReplyWrapper39 ( void *state1 , void *state2 ) {
//XmlDoc *THIS = (XmlDoc *)state;
XmlDoc *THIS = (XmlDoc *)state1;
Multicast *mcast = (Multicast *)state2;
THIS->gotSectionFacets ( mcast );
// this will end up calling getSectionsWithDupStats() again
// which will call getSectionStats() some more on new sections
// until m_gotDupStats is set to true.
THIS->m_masterLoop ( THIS->m_masterState );
}
// . launch a single msg3a::getDocIds() for a section hash, secHash32
SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 ,
uint32_t innerHash32 ,
bool cacheOnly ) {
// init cache?
if ( m_sectionStatsTable.m_numSlots == 0 &&
! m_sectionStatsTable.set(4,
sizeof(SectionStats),
32,
NULL,
0,
false,
m_niceness,
"secstatsch"))
return NULL;
// check in cache...
SectionStats *stats ;
stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 );
// if there, return it
if ( stats ) return stats;
// if cache only do not launch
if ( cacheOnly ) return NULL;
//
// TODO: shard gbxpathsitehashxxxxx by termid
// and make sure msg3a only sends to that single shard and sends
// the stats back. should make us much faster to sectionize
// a web page. but for now try without it...
//
//int32_t *sh32 = getSiteHash32();
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32;
int32_t maxOut = 32;
// . need to make new msg39Request and a new Multicast arrays
// . only need multicast since these gbfacetstr:gbxpathsitehash123456
// terms are sharded by termid, otherwise we'd have to use msg3a
if ( ! m_mcastArray ) {
// how much mem to alloc?
int32_t need = 0;
need += sizeof(Multicast);
need += sizeof(Msg39Request);
// query buf str
need += 100;
need *= maxOut;
// a single query now to be shared
//need += sizeof(Query);
// just in case we are being re-used
m_mcastBuf.reset();
// alloc space
if ( ! m_mcastBuf.reserve(need) ) return NULL;
// point to buf
char *p = m_mcastBuf.getBufStart();
// set them up
m_mcastArray = (Multicast *)p;
p += sizeof(Multicast) * maxOut;
m_msg39RequestArray = (Msg39Request *)p;
p += sizeof(Msg39Request) * maxOut;
//m_queryArray = (Query *)p;
//p += sizeof(Query) * maxOut;
//m_sharedQuery = (Query *)p;
//p += sizeof(Query);
// for holding the query string
// assume query will not exceed 100 bytes including \0
m_queryBuf = p;
p += 100 * maxOut;
// initialize all!
for ( int32_t i = 0 ; i < maxOut ; i++ ) {
m_mcastArray [i].constructor();
m_msg39RequestArray[i].reset();//constructor();
//m_queryArray [i].constructor();
m_queryBuf[100*i] = '\0';
//m_inUse[i] = 0;
}
}
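// Illustrative sketch of the m_mcastBuf layout set up above (assuming
// maxOut is 32 as initialized above):
//
//   [Multicast x 32][Msg39Request x 32][100-byte query strings x 32]
//   ^m_mcastArray   ^m_msg39RequestArray                 ^m_queryBuf
//
// slot i of each array belongs to one outstanding facet lookup, and its
// query string lives at m_queryBuf + 100 * i.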
// get first available
int32_t i;
for ( i = 0 ; i < maxOut ; i++ )
if ( ! m_mcastArray[i].m_inUse ) break;
// wtf?
if ( i >= maxOut ) { char *xx=NULL;*xx=0; }
// and our vehicle
Multicast *mcast = &m_mcastArray[i];
// mark as in use up here in case we quickpoll into this same code?!
// yeah, i guess set2() calls quickpoll?
//mcast->m_inUse = 1;
// save this for reply
//mcast->m_hack = this;
char *qbuf = m_queryBuf + 100 * i;
// . hash this special term (was gbsectionhash)
// . the wordbits etc will be a number though, the hash of the content
// of the xpath, the inner html hash
// . preceding this term with gbfacet: will make gigablast return
// the statistics for all the values in the posdb keys of this
// termlist, which happen to be innerHTML hashes for all pages
// with this same xpath and on this same site.
sprintf(qbuf,"gbfacetstr:gbxpathsitehash%"UINT32"",
(uint32_t)secHash32);
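// For example, if secHash32 were 305419896 (a hypothetical value) the
// query built above would be:
//
//   gbfacetstr:gbxpathsitehash305419896
//
// i.e. one facet term whose posdb values are the innerHTML hashes seen
// for this xpath on this site.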
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// set the msg39 request
Msg39Request *r = &m_msg39RequestArray[i];
// reset all to defaults
r->reset();
//r-> ptr_coll = cr->m_coll;
//r->size_coll = gbstrlen(cr->m_coll)+1;
r->m_collnum = cr->m_collnum;
r->m_maxAge = 60; // cache timeout?
r->m_addToCache = true;
r->m_docsToGet = 0; // just calc stats
r->m_niceness = m_niceness;
r->m_debug = 0;
r->m_doSiteClustering = false;
//r->m_doIpClustering = false;
r->m_doDupContentRemoval = false;
r->m_boolFlag = 2;
r->m_familyFilter = 0;
r->m_language = 0;
r->ptr_query = qbuf;//m_sectionHashQueryBuf;
r->size_query = gbstrlen(r->ptr_query)+1;
r->m_timeout = 3600; //-1;// auto-determine based on #terms
r->m_maxQueryTerms = 10;
// how much of each termlist to read in bytes
int32_t readList = 10000;
r-> ptr_readSizes = (char *)&readList;
r->size_readSizes = 4;
// term freqs
float tfw = 1.0;
r-> ptr_termFreqWeights = (char *)&tfw;
r->size_termFreqWeights = 4;
// speed it up some with this flag
r->m_forSectionStats = true;
// only do a single read of docids... do not split up
r->m_numDocIdSplits = 1;
// 1 query term
r->m_nqt = 1;
///////////////////////
//
// this tells msg3a/msg39/posdbtable it's a hack! no need to do this
// because it's implied by the query.
// BUT REALLY let's eliminate this and just make our queries like
// gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of
// the section's xpath with the site. the values of that term in
// the posdb key will be 32-bit hashes of the innerHtml for such
// sections from all pages with the same xpath on the same site.
// so no need for this now, comment out.
//
//r->m_getFacetStats = true;
//
/////////////////////////
// we need to know what site is the base site so the section stats
// can set m_onSiteDocIds and m_offSiteDocIds correctly
//r->m_siteHash32 = *sh32;
// . now we use the hash of the innerHtml of the xpath
// . this is our value for the facet field of gbxpathsitehash12345678
// which is the hash of the innerHTML for that xpath on this site.
// 12345678 is the hash of the xpath and the site.
//r->m_myFacetVal32 = sentHash32;
//Query *qq = &m_queryArray[i];
// set query for msg3a. queryExpansion=false
//qq->set2 ( r->ptr_query , langUnknown , false );
Query qq;
qq.set2 ( r->ptr_query , langUnknown , false );
// TODO: ensure this just hits the one host since it is sharded
// by termid...
// what shard owns this termlist. we shard these
// gbfacetstr:gbxpathsitehash123456 terms by termid.
int64_t termId = qq.getTermId(0);
int32_t shardNum = getShardNumFromTermId ( termId );
// hack in our inner html content hash for this xpath
mcast->m_hack32 = innerHash32;
mcast->m_hack64 = secHash32;
// malloc and store the request. mcast will free it when done.
int32_t reqSize;
char *req = serializeMsg ( sizeof(Msg39Request),
&r->size_readSizes,
&r->size_whiteList,
&r->ptr_readSizes,
r,
&reqSize,
NULL,
0,
false);
// . send out a msg39 request to each shard
// . multicasts to a host in group "groupId"
// . we always block waiting for the reply with a multicast
// . returns false and sets g_errno on error
// . sends the request to fastest host in group "groupId"
// . if that host takes more than about 5 secs then sends to
// next host
// . key should be largest termId in group we're sending to
bool status;
status = mcast->send ( req , // m_rbufPtr ,
reqSize , // request size
0x39 , // msgType 0x39
true , // mcast owns m_request?
shardNum , // group to send to
false , // send to whole group?
0,//(int32_t)qh , // 0 // startKey.n1
this , // state1 data
mcast , // state2 data
gotReplyWrapper39 ,
30 , //timeout in secs
m_niceness,//m_r->m_niceness ,
false , // realtime?
-1, // firstHostId, // -1// bestHandlingHostId ,
NULL , // m_replyBuf ,
0 , // MSG39REPLYSIZE,
// this is true if multicast should free the
// reply, otherwise caller is responsible
// for freeing it after calling
// getBestReply().
// actually, this should always be false,
// there is a bug in Multicast.cpp.
// no, if we error out and never steal
// the buffers then they will go unfreed
// so they are freed by multicast by default
// then we steal control explicitly
true );
m_mcastRequestsOut++;
// if successfully launched, wait...
if ( status ) return (SectionStats *) -1;
// error?
if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; }
// sets &m_sectionStats and adds to the table
gotSectionFacets ( mcast );
// i guess did not block...
//return &msg3a->m_sectionStats;
return &m_sectionStats;
}
// . come here when msg39 got the ptr_faceHashList for our single
// gbfacet:gbxpathsitehash
// . returns false and sets g_errno on error
bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
//SectionStats *stats = &msg39->m_sectionStats;
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;}
// count it as returned
m_mcastRequestsIn++;
// mark it as available now
int32_t num = mcast - m_mcastArray;
// sanity
//if ( ! msg39->m_inUse ) { char *xx=NULL;*xx=0; }
// grab the xpath/site hash
uint32_t secHash32 = mcast->m_hack64;
// and our inner html for that xpath
int32_t myFacetVal32 = mcast->m_hack32;
// sanity. should only be a gbfacet:gbxpathsitehash12345567 term.
//if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; }
// reset all counts to 0
m_sectionStats.reset();
//////
//
// compile m_sectionStats
//
///////
// set m_sectionStats from the list of facet values for this
// gbfacet:xpathsitehash term...
// Query::m_queryTerm.m_facetHashTable has the facets merged
// from all the shards. so now compute the stats from them.
// set the section stats.
//QueryTerm *qt = &msg3a->m_q->m_qterms[0];
//HashTableX *ft = &qt->m_facetHashTable;
// . get the list of facet field/value pairs.
// . see how Msg3a.cpp merges these to see how they are stored
Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply();
// this is NULL with g_errno set on error
if ( ! mr ) {
log("xmldoc: got error from sec stats mcast: %s",
mstrerror(g_errno));
return false;
}
deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf );
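// A rough sketch of the ptr_facetHashList layout the loop below expects
// (sizes in bytes, one termid's worth of facets):
//
//   [termId (8)][numUniqueVals nh (4)][val32 count32][val32 count32]...
//
// each (val32,count32) pair is a distinct innerHTML hash for this
// xpath/site and how many docids had it. if val32 equals our own
// innerHTML hash we credit those docids as "matches".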
char *p = (char *)(mr->ptr_facetHashList);
//char *pfinal = p + mr->size_facetHashList;
//
// should only be one termid of facets in here, so no need to re-loop
//
int32_t nh = 0;
// "matches" is how many docids with this facet field had our facet val
int32_t matches = 0;
// "totalDocIds" is how many docids had this facet field
int32_t totalFields = 0;
if ( p ) {
// first is the termid
//int64_t termId = *(int64_t *)p;
// skip that
p += 8;
// the # of unique 32-bit facet values
nh = *(int32_t *)p;
p += 4;
// the end point
char *pend = p + (8 * nh);
// now compile the facet hash list into there
for ( ; p < pend ; ) {
// does this facet value match ours?
// (i.e. same inner html?)
if ( *(int32_t *)p == myFacetVal32 )
matches += *(int32_t *)(p+4);
p += 4;
// now how many docids had this facet value?
totalFields += *(int32_t *)p;
p += 4;
}
}
// how many unique inner html content hashes for this xpath/site
// hash were there?
m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed;
// how many xpaths existed over all docs. doc can have multiple.
m_sectionStats.m_totalEntries = totalFields;
// total # unique docids that had this facet
m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits;
// how many had the same inner html content hash for
// this xpath/site as we did?
m_sectionStats.m_totalMatches = matches;
////////
//
// store m_sectionStats in cache
//
////////
// cache them. this does a copy of m_sectionStats
if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) )
log("xmldoc: failed to add sections stats: %s",
mstrerror(g_errno));
// reset that msg39 to free its data
//msg39->reset();
if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; }
// . make it available again
// . do this after all in case we were in quickpoll interrupting
// the getSectionStats() function below
//mcast->m_inUse = 0;
// free query Query::m_qwords array etc. to stop mem leaks
m_mcastArray [num].reset();
m_msg39RequestArray[num].reset();
//m_queryArray [num].reset();
// now when the master loop calls getSectionsWithDupStats() it
// should find the stats class in the cache!
return true;
}
// . for all urls from this subdomain...
// . EXCEPT root url since we use msg17 to cache that, etc.
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
if ( m_osvtValid ) return &m_osvt;
// do not consult sectiondb if we are set from the title rec,
// that way we avoid parsing inconsistencies since sectiondb changes!
if ( m_setFromTitleRec ) {
char *p = ptr_sectiondbData;
m_osvtValid = true;
m_osvt.m_totalSiteVoters = 0;
if ( size_sectiondbData <= 4 ) return &m_osvt;
m_osvt.m_totalSiteVoters = *(int32_t *)p;
p += 4;
int32_t remaining = size_sectiondbData - 4;
m_osvt.m_svt.deserialize(p,remaining,m_niceness);
return &m_osvt;
}
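// For reference, ptr_sectiondbData (deserialized above and rebuilt at
// the bottom of this function) is laid out roughly as:
//
//   [int32_t m_totalSiteVoters][serialized m_osvt.m_svt hash table]
//
// so a size of 4 or less means no votes were recorded.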
// returns empty table if WE are the site root url!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
// need sections
//Sections *ss = getSections();
//if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
// hash of all adjacent tag pairs
uint32_t *tph = getTagPairHash32 ( ) ;
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
int64_t *siteHash64 = getSiteHash64();
if ( ! siteHash64 || siteHash64 == (void *)-1 )
return (SectionVotingTable *)siteHash64;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . for us, dates are really containers of the flags and tag hash
// . init this up here, it is re-set if we re-call getSectiondbList()
// because there were too many records in it to handle in one read
if ( m_numSectiondbReads == 0 ) {
// init table
if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL;
// use site hash as the main thing
int64_t termId = *siteHash64 & TERMID_MASK;
// . start key for reading list from sectiondb
// . read all the section votes for this site
m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff);
// how many reads we have to do...
m_numSectiondbNeeds = 1;
}
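// A minimal sketch of the read range set up here: TERMID_MASK keeps
// just the termid portion (effectively the low 48 bits) of the site
// hash, e.g.
//
//   int64_t termId = *siteHash64 & TERMID_MASK;
//
// and the start/end keys bracket every sectiondb record for that
// termid, so one (possibly re-called) read sweeps all of this site's
// section votes.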
//bool skipRecall = false;
// always read 5MB at a time from sectiondb
int32_t minRecSizes = 5000000;
// crap! host #28 is being totally slammed!!!!!
// why?????? in the meantime do this
//minRecSizes = 100000;
//skipRecall = true;
// is it facebook?
bool limitSectiondb = false;
// limit now to speed up repair rebuild
// limit now to speed up injection!
limitSectiondb = true;
// facebook lists often clog the tree, and when we read 2MB worth of
// it, it takes 100ms, so reduce to 50k to so it takes 2.5ms...
// because facebook is a well structured xml feed so why read any
// really!
if ( limitSectiondb ) minRecSizes = 50000;
key128_t *lastKey = NULL;
// if msg0 blocked and came back with g_errno set, like
// in preparing to merge it got an OOM
if ( g_errno ) {
log("build: sectiondb read2: %s",mstrerror(g_errno));
return NULL;
}
readLoop:
// before looking up TitleRecs using Msg20, let's first consult
// datedb to see if we got adequate data as to what sections
// are the article sections
// only get the list once
if ( m_numSectiondbReads < m_numSectiondbNeeds ) {
// only do this once
m_numSectiondbReads++;
// make the termid
uint64_t termId = *siteHash64 & TERMID_MASK;
// end key is always the same
key128_t end = g_datedb.makeEndKey ( termId , 0 );
// shortcut
Msg0 *m = &m_msg0;
// get the group this list is in (split = false)
uint32_t shardNum;
shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey);
// we need a group # from the groupId
//int32_t split = g_hostdb.getGroupNum ( gid );
// note it
//logf(LOG_DEBUG,"sections: "
// "reading list from sectiondb: "
// "sk.n1=0x%"XINT64" sk.n0=0x%"XINT64" "
// "ek.n1=0x%"XINT64" ek.n0=0x%"XINT64" "
// ,m_sectiondbStartKey.n1
// ,m_sectiondbStartKey.n0
// ,end.n1
// ,end.n0
// );
// . get the list
// . gets all votes for one particular site
if ( ! m->getList ( -1 , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
RDB_SECTIONDB , // was RDB_DATEDB
cr->m_collnum ,
&m_secdbList ,
(char *)&m_sectiondbStartKey ,
(char *)&end ,
minRecSizes ,
m_masterState ,
m_masterLoop ,
m_niceness , // MAX_NICENESS
// default parms follow
true , // doErrorCorrection?
true , // includeTree?
true , // doMerge?
-1 , // firstHostId
0 , // startFileNum
-1 , // numFiles
999995 , // timeout
-1 , // syncPoint
-1 , // preferLocalReads
NULL , // msg5
NULL , // msg5b
false , // isrealmerge?
true , // allowpagecache?
false , // forceLocalIndexdb?
false , // doIndexdbSplit?
shardNum ) )//split ))
// return -1 if blocks
return (SectionVotingTable *)-1;
// error?
if ( g_errno ) {
log("build: sectiondb read: %s",mstrerror(g_errno));
return NULL;
}
}
// it also returns the lastKey in the list so we can use that to
// set the startKey for a re-call if we read >= 5MB
lastKey = NULL;
//logf(LOG_DEBUG,"sections: read list of %"INT32" bytes",
// m_secdbList.m_listSize);
bool recall = true;
if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false;
// . unless it had special byte set in Msg0.cpp HACK
// . we send back a compressed list and tack on an extra 0 byte at
// the end so that we know we had a full list!
if ( (m_secdbList.m_listSize % 2) == 1 ) {
m_secdbList.m_listSize--;
m_secdbList.m_listEnd --;
recall = true;
}
// no longer bother re-calling, because facebook is way slow...
if ( limitSectiondb ) recall = false;
// . returns false and sets g_errno on error
// . compile the votes from sectiondb for this site into a hashtable
// . m_osvt is a SectionVotingTable and each entry in the hashtable
// is a SectionVote class.
// . the taghash is the key of the vote and is a hash of all the
// nested tags the section is in.
// . another vote uses the tag hash hashed with the hash of the
// content contained by the section
// . using these two vote counts we set Section::m_votesForDup
// or Section::m_votesForNotDup counts which let us know how the
// section is repeated or not repeated on the site
// . SectionVote::m_score is always 1.0 from what i can tell
// cuz it seems like addVote*() always uses a score of 1.0
// . SectionVote::m_numSampled is how many times that tagHash
// occurs in the document.
if ( ! m_osvt.addListOfVotes(&m_secdbList,
&lastKey,
*tph,
*d , // docid
m_niceness))
return NULL;
// why is this always zero it seems?
if ( g_conf.m_logDebugBuild )
log("xmldoc: added sectiondblist size=%"INT32" recall=%"INT32"",
m_secdbList.m_listSize,(int32_t)recall);
// . recall? yes if we had to truncate our list...
// . we need to be able to scan all votes for the website... that is
// why we recall here
// . limit votes by a special sectiondb key then that is a vote...
if ( recall ) {
// another debug
//logf(LOG_DEBUG,"sections: recallling read");
// just note it for now
//if ( m_sectiondbRecall > 5 )
if ( m_numSectiondbNeeds > 5 )
logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%"INT32"",
m_sectiondbRecall++);
// we should really limit voting per site! we do now!
//if ( m_recall > 5 ) { char *xx=NULL;*xx=0; }
// update our start key
if ( lastKey ) m_sectiondbStartKey = *lastKey;
// inc by 2 since we already had this key
m_sectiondbStartKey += 2;
// unflag
m_numSectiondbNeeds++;
// and repeat
goto readLoop;
}
//
// set ptr_sectiondbData so this can be set from a title rec without
// having to lookup in sectiondb again which might have changed!
//
m_sectiondbData.purge();
// alloc
int32_t need = m_osvt.m_svt.getStoredSize() + 4;
if ( ! m_sectiondbData.reserve(need) )
// oom error?
return NULL;
// serialize this number
m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters);
// serialize the hashtablex
m_osvt.m_svt.serialize ( &m_sectiondbData );
// reference it for title rec serialization
ptr_sectiondbData = m_sectiondbData.getBufStart();
size_sectiondbData = m_sectiondbData.length();
m_osvtValid = true;
return &m_osvt;
}
int32_t *XmlDoc::getLinkSiteHashes ( ) {
if ( m_linkSiteHashesValid )
return (int32_t *)m_linkSiteHashBuf.getBufStart();
// get the outlinks
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
// . get the outlink tag rec vector
// . each link's tagrec may have a "site" tag that is basically
// the cached SiteGetter::getSite() computation
TagRec ***grv = NULL;
if ( ! m_setFromTitleRec ) {
grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (int32_t *)grv;
}
// how many outlinks do we have on this page?
int32_t n = links->getNumLinks();
// reserve space
m_linkSiteHashBuf.purge();
if ( ! m_linkSiteHashBuf.reserve ( n * 4 ) ) return NULL;
if ( n == 0 ) {
	ptr_linkdbData = NULL;
	size_linkdbData = 0;
	// no outlinks. return a non-NULL dummy ptr so callers do not
	// mistake this for an error.
	return (int32_t *)0x1234;
}
// if set from titlerec then assume each site is the full hostname
// of the link, unless its specified explicitly in the hashtablex
// serialized in ptr_linkdbData
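// A small illustrative example of that serialized form: if outlinks
// #2 and #7 were the only ones whose site differed from their
// hostname, ptr_linkdbData would hold four int32_ts:
//
//   [ 2 ][ siteHash32 of link #2 ][ 7 ][ siteHash32 of link #7 ]
//
// every other link's site hash is just the hash of its hostname.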
if ( m_setFromTitleRec ) {
// this holds the sites that are not just the hostname
int32_t *p = (int32_t *)ptr_linkdbData;
int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData);
// loop over links
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the link
char *u = links->getLinkPtr(i);
// assume site is just the host
int32_t hostLen = 0;
char *host = ::getHost ( u , &hostLen );
int32_t siteHash32 = hash32 ( host , hostLen , 0 );
// unless given otherwise
if ( p < pend && *p == i ) {
p++;
siteHash32 = *p;
p++;
}
// store that then. should not fail since we allocated
// right above
if ( ! m_linkSiteHashBuf.pushLong(siteHash32) ) {
char *xx=NULL;*xx=0; }
}
// return ptr of array, which is a safebuf
return (int32_t *)m_linkSiteHashBuf.getBufStart();
}
// ptr_linkdbData will point into this buf
m_linkdbDataBuf.purge();
// loop through them
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the link
char *u = links->getLinkPtr(i);
// get full host from link
int32_t hostLen = 0;
char *host = ::getHost ( u , &hostLen );
int32_t hostHash32 = hash32 ( host , hostLen , 0 );
// get the site
TagRec *gr = (*grv)[i];
char *site = NULL;
int32_t siteLen = 0;
if ( gr ) {
int32_t dataSize = 0;
site = gr->getString("site",NULL,&dataSize);
if ( dataSize ) siteLen = dataSize - 1;
}
// otherwise, make it the host or make it cut off at
// a "/user/" or "/~xxxx" or whatever path component
if ( ! site ) {
// GUESS link site... like /~xxx
site = host;
siteLen = hostLen;
}
int32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
// only store if different from the host itself
if ( linkeeSiteHash32 != hostHash32 ) {
if ( ! m_linkdbDataBuf.pushLong(i) )
return NULL;
if ( ! m_linkdbDataBuf.pushLong(linkeeSiteHash32) )
return NULL;
}
// store it always in this buf
if ( ! m_linkSiteHashBuf.pushLong(linkeeSiteHash32) ) {
// space should have been reserved above!
char *xx=NULL;*xx=0; }
}
// set ptr_linkdbData
ptr_linkdbData = m_linkdbDataBuf.getBufStart();
size_linkdbData = m_linkdbDataBuf.length();
m_linkSiteHashesValid = true;
return (int32_t *)m_linkSiteHashBuf.getBufStart();
}
Links *XmlDoc::getLinks ( bool doQuickSet ) {
if ( m_linksValid ) return &m_links;
// set status
setStatus ( "getting outlinks");
// . add links from diffbot reply
// . get the reply of json objects from diffbot
// . this will be empty if we are a json object!
// . will also be empty if not meant to be sent to diffbot
// . the TOKENIZED reply consists of \0 separated json objects that
// we create from the original diffbot reply
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (Links *)dbr;
// this will set it if necessary
Xml *xml = getXml();
// bail on error
if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml;
// can't call getIsPermalink() here without entering a dependency loop
char *pp = getIsUrlPermalinkFormat();
if ( !pp || pp == (char *)-1 ) return (Links *)pp;
// use the old xml doc
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (Links *)od;
// get Links class of the old title rec
Links *oldLinks = NULL;
// if we were set from a title rec, do not do this
if ( *od ) {
oldLinks = (*od)->getLinks();
if (!oldLinks||oldLinks==(Links *)-1) return (Links *)oldLinks;
}
Url *baseUrl = getBaseUrl();
if ( ! baseUrl || baseUrl==(Url *)-1) return (Links *)baseUrl;
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (Links *)ip;
// this ensures m_contentLen is set
//char **content = getContent();
//if ( ! content || content == (char **)-1 ) return (Links *)content;
// this will set ptr_indCatIds and size_indCatIds
int32_t **pici = getIndCatIds();
if ( ! pici || pici == (void *)-1 ) return (Links *)pici;
char *ict = getIsContentTruncated();
if ( ! ict || ict == (char *)-1 ) return (Links *)ict;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (Links *)sni;
// get the latest url we are on
Url *u = getCurrentUrl();
//
// if we had a EDOCSIMPLIFIEDREDIR error, pretend it is a link
// so addOutlinkSpiderRecsToMetaList() will add it to spiderdb
//
if ( m_indexCodeValid && m_indexCode == EDOCSIMPLIFIEDREDIR ) {
m_links.set ( m_redirUrl.getUrl(),m_redirUrl.getUrlLen() );
m_linksValid = true;
return &m_links;
}
if ( m_indexCodeValid && m_indexCode == EDOCNONCANONICAL ) {
m_links.set(m_canonicalRedirUrl.getUrl(),
m_canonicalRedirUrl.getUrlLen());
m_linksValid = true;
return &m_links;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
bool useRelNoFollow = true;
if ( ! cr->m_obeyRelNoFollowLinks ) useRelNoFollow = false;
// to keep things simple, for diffbot custom crawls, if robots.txt
// is not used then do not use rel no follow
if ( ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
useRelNoFollow = false;
// . set it
// . if parent is a permalink we can avoid its suburl outlinks
// containing "comment" from being classified as permalinks
if ( ! m_links.set ( useRelNoFollow ,
xml ,
u ,
true , // setLinkHashes?
baseUrl ,
m_version ,
m_niceness ,
*pp , // parent url in permalink format?
oldLinks ,// oldLinks, might be NULL!
doQuickSet ,
dbr ) )
return NULL;
m_linksValid = true;
// do not bother setting that bit if we are being called for link
// text because that bit was already in the linkdb key, and it
// was set to zero! so if getting msg20 reply.... bail now
if ( m_req ) return &m_links;
// . apply link spam settings
// . set the "spam bits" in the Links class
setLinkSpam ( *ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
u , // linker url
*sni ,
xml ,
&m_links ,
*ict ,
m_niceness );
// we got it
return &m_links;
}
HashTableX *XmlDoc::getCountTable ( ) {
// return it if we got it
if ( m_countTableValid ) return &m_countTable;
setStatus ("getting count table");
// get the stuff we need
Xml *xml = getXml ();
if ( ! xml || xml == (Xml *)-1 ) return (HashTableX *)xml;
Words *words = getWords ();
if ( ! words || words == (Words *)-1 ) return (HashTableX *)words;
Phrases *phrases = getPhrases ();
if ( ! phrases || phrases==(Phrases *)-1) return (HashTableX *)phrases;
Bits *bits = getBits ();
if ( ! bits || bits == (Bits *)-1 ) return (HashTableX *)bits;
Sections *sections = getSections();
if ( !sections||sections==(Sections *)-1) return(HashTableX *)sections;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (HashTableX *)info1;
// . reduce score of words in badly repeated fragments to 0 so we do
// not count them here!
// . ff[i] will have score of 0 if in repeated frag
// . make sure this is stored for whole doc... since we only use it
// for the body
char *fv = getFragVec();
if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv;
//LinkInfo *info2 = getLinkInfo2();
//if ( ! info2 || info2 == (LinkInfo *)-1 ) return (HashTableX *)info2;
// init our count table otherwise
//if(! m_countTable.set( 8,4,1024,NULL,0,false,m_niceness,"xmlcnttbl"))
// return NULL;
// breathe
QUICKPOLL ( m_niceness );
//
// this was in Weights.cpp, but now it is here...
//
// shortcut
HashTableX *ct = &m_countTable;
// reset the counts, just in case set() below does not
//ct->reset();
// ez var
int64_t *wids = words->getWordIds ();
nodeid_t *tids = words->getTagIds ();
int32_t nw = words->getNumWords ();
char **wptrs = words->m_words;
int32_t *wlens = words->m_wordLens;
int64_t *pids = phrases->getPhraseIds2();
// add 5000 slots for inlink text in hashString_ct() calls below
int32_t numSlots = nw * 3 + 5000;
// only alloc for this one if not provided
if (!ct->set(8,4,numSlots,NULL,0,false,m_niceness,"xmlct"))
return (HashTableX *)NULL;
//char *ff = getFragVec ( ) ;
//if ( ! ff ) return false;
// . now hash all the phrase ids we have in order to see if the phrase
// is unique or not. if phrase is repeated a lot we punish the scores
// of the individual words in the phrase and boost the score of the
// phrase itself. We check for uniqueness down below.
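// Illustrative example (hypothetical text): for the fragment
// "red apple pie" the loop below tallies the word ids for "red",
// "apple" and "pie" plus the phrase ids for "red apple" and
// "apple pie", each with a count of 1 per occurrence.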
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// add the word
if ( wids[i] == 0LL ) continue;
//if ( wids[i] == 708411945052722517LL )
// log("hey4 got new pid=%"INT64" i=%"INT32"",pids[i],i);
// . skip if in repeated fragment
// . unfortunately we truncate the frag vec to like
// the first 80,000 words for performance reasons
if ( i < MAXFRAGWORDS && fv[i] == 0 ) continue;
// accumulate the wid with a score of 1 each time it occurs
if ( ! ct->addTerm ( &wids[i] ) ) return (HashTableX *)NULL;
// skip if word #i does not start a phrase
if ( ! pids [i] ) continue;
// if phrase score is less than 100% do not consider as a
// phrase so that we do not phrase "albuquerque, NM" and stuff
// like that... in fact, we can only have a space here...
// bounds-guard like hashString_ct() below
if ( i+1 < nw ) {
	if ( wptrs[i+1][0] == ',' ) continue;
	if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
	if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
}
// put it in, accumulate, max score is 0x7fffffff
if ( ! ct->addTerm ( &pids[i] ) ) return (HashTableX *)NULL;
}
// now add each meta tag to the pot
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if not a meta tag
if ( tids[i] != 68 ) continue;
// find the "content=" word
char *w = wptrs[i];
int32_t wlen = wlens[i];
char *wend = w + wlen;
char *p ;
p = strncasestr (w,wlen,"content=");
// skip if we did not have any content in this meta tag
if ( ! p ) continue;
// skip the "content="
p += 8;
// skip if empty meta content
if ( wend - p <= 0 ) continue;
// now hash the meta content ourselves
if ( ! hashString_ct ( ct , p , wend - p ) )
return (HashTableX *)NULL;
}
// add each incoming link text
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL ( m_niceness );
// shortcuts
char *p;
int32_t plen;
// hash link text (was hashPwids())
p = k-> getLinkText();
plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("xmldoc: bad link text 3 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
if ( ! hashString_ct ( ct , p , plen ) )
return (HashTableX *)NULL;
// hash this stuff (was hashPwids())
p = k->getSurroundingText();
plen = k->size_surroundingText - 1;
if ( ! hashString_ct ( ct , p , plen ) )
return (HashTableX *)NULL;
}
// we got it
m_countTableValid = true;
return &m_countTable;
}
// . a special function used by XmlDoc::getCountTable() above
// . kinda similar to XmlDoc::hashString()
bool XmlDoc::hashString_ct ( HashTableX *ct , char *s , int32_t slen ) {
Words words;
Bits bits;
Phrases phrases;
if ( ! words.set ( s , slen , m_version , true , m_niceness ) )
return false;
if ( ! bits.set ( &words , m_version , m_niceness ) )
return false;
if ( ! phrases.set(&words,&bits,true,false,m_version,m_niceness))
return false;
int32_t nw = words.getNumWords();
int64_t *wids = words.getWordIds();
int64_t *pids = phrases.m_phraseIds2;
char **wptrs = words.m_words;
int32_t *wlens = words.m_wordLens;
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// add the word
if ( wids[i] == 0LL ) continue;
// skip if in repeated fragment
// . NO, we do not use this for these short strings
//if ( ww[i] == 0 ) continue;
// accumulate the wid with a score of 1 each time it occurs
if ( ! ct->addTerm ( &wids[i] ) ) return false;
// skip if word #i does not start a phrase
if ( ! pids [i] ) continue;
// if phrase score is less than 100% do not consider as a
// phrase so that we do not phrase "albuquerque, NM" and stuff
// like that... in fact, we can only have a space here...
if ( i+1<nw ) {
if ( wptrs[i+1][0] == ',' ) continue;
if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
}
// put it in, accumulate, max score is 0x7fffffff
if ( ! ct->addTerm ( &pids[i] ) ) return false;
}
return true;
}
uint8_t *XmlDoc::getSummaryLangId ( ) {
// return if we got it already
if ( m_summaryLangIdValid ) return &m_summaryLangId;
Summary *s = getSummary();
if ( ! s || s == (void *)-1 ) return (uint8_t *)s;
char *sum = s->getSummary();
// now set the words class
Words ww;
if ( ! ww.set9 ( sum , m_niceness ) ) return NULL;
// check it out. 0 means langUnknown. -1 means error.
int32_t ret = ww.getLanguage ( NULL , 100 , m_niceness , NULL );
// -1 means error! g_errno should be set
if ( ret < 0 ) return NULL;
// set it
m_summaryLangId = (uint8_t)ret;
// assume valid
m_summaryLangIdValid = true;
// return it
return &m_summaryLangId;
}
int cmp ( const void *h1 , const void *h2 ) ;
// vector components are 32-bit hashes
int32_t *XmlDoc::getTagPairHashVector ( ) {
if ( m_tagPairHashVecValid ) return m_tagPairHashVec;
Xml *xml = getXml ();
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
// store the hashes here
uint32_t hashes [ 2000 ];
int32_t nh = 0;
// go through each node
XmlNode *nodes = xml->getNodes ();
int32_t n = xml->getNumNodes ();
// start with the ith node
int32_t i = 0;
uint32_t saved = 0;
uint32_t lastHash = 0;
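// Illustrative example (hypothetical tag sequence): for nodes
// <html><body><p> with per-tag hashes h1,h2,h3 the loop below stores
// h2^h1 and h3^h2, i.e. one hash per adjacent tag pair, which is what
// makes the vector template-like.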
// loop over the nodes
for ( ; i < n ; i++ ) {
// breathe a little
QUICKPOLL ( m_niceness );
// skip NON tags
if ( ! nodes[i].isTag() ) continue;
// use the tag id as the hash, it's unique
uint32_t h = hash32h ( nodes[i].getNodeId() , 0 );
// ensure hash is not 0, that has special meaning
if ( h == 0 ) h = 1;
// store in case we have only one hash
saved = h;
// if we are the first, set this
if ( ! lastHash ) {
lastHash = h;
continue;
}
// if they were the same do not xor, they will zero out
if ( h == lastHash ) hashes[nh++] = h;
// incorporate it into the last hash
else hashes[nh++] = h ^ lastHash;
// we are the new last hash
lastHash = h;
// bust out if no room
if ( nh >= 2000 ) break;
}
// if only had one tag after, use that
if ( nh == 0 && saved ) hashes[nh++] = saved;
// breathe
QUICKPOLL ( m_niceness ) ;
// . TODO: remove the link text hashes here?
// . because will probably be identical..
// . now sort hashes to get the top MAX_PAIR_HASHES
gbsort ( hashes , nh , 4 , cmp );
// breathe
QUICKPOLL ( m_niceness ) ;
// uniquify them
int32_t d = 0;
for ( int32_t j = 1 ; j < nh ; j++ ) {
if ( hashes[j] == hashes[d] ) continue;
hashes[++d] = hashes[j];
}
// breathe
QUICKPOLL ( m_niceness ) ;
// how many do we got? (d indexes the last unique hash kept above)
if ( nh > 0 ) nh = d + 1;
// truncate to MAX_PAIR_HASHES MINUS 1 so we can put a 0 at the end
if ( nh > MAX_TAG_PAIR_HASHES-1 ) nh = MAX_TAG_PAIR_HASHES-1;
// store the top MAX_PAIR_HASHES
gbmemcpy ( m_tagPairHashVec , hashes , nh * 4 );
// null term it. all vectors need this so computeSimilarity() works
m_tagPairHashVec [ nh++ ] = 0;
m_tagPairHashVecValid = true;
m_tagPairHashVecSize = nh * 4;
return m_tagPairHashVec;
}
// sort in descending order
int cmp ( const void *h1 , const void *h2 ) {
	// compare explicitly rather than subtracting, since subtracting
	// two uint32_t hashes can overflow an int and flip the sign
	uint32_t a = *(uint32_t *)h1;
	uint32_t b = *(uint32_t *)h2;
	if ( b > a ) return  1;
	if ( b < a ) return -1;
	return 0;
}
// . m_tagVector.setTagPairHashes(&m_xml, niceness);
// . Sections.cpp and getIsDup() both use this hash
// . returns NULL and sets g_errno on error
// . xors all the unique adjacent tag hashes together
// . kind of represents the template the web pages uses
// . we add this to sectiondb as a vote in Sections::addVotes()
uint32_t *XmlDoc::getTagPairHash32 ( ) {
// only compute once
if ( m_tagPairHash32Valid ) return &m_tagPairHash32;
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (uint32_t *)words;
// shortcuts
//int64_t *wids = words->getWordIds ();
nodeid_t *tids = words->getTagIds ();
int32_t nw = words->getNumWords ();
int32_t nt = words->m_numTags;
// . get the hash of all the tag pair hashes!
// . we then combine that with our site hash to get our site specific
// html template termid
// . put all tag pairs into a hash table
// . similar to Vector::setTagPairHashes() but we do not compute a
// vector, just a single scalar/hash of 32 bits, m_termId
HashTableX tp; // T<int64_t,char> tp;
if ( ! tp.set ( 4 , 1 , nt * 4 , NULL , 0 , true,m_niceness,"xmltp"))
return NULL;
uint32_t lastTid = 0;
char val = 1;
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not tag
if ( tids[i] == 0LL ) continue;
// skip if back tag
if ( tids[i] & BACKBIT ) continue;
// get last tid
uint32_t h = hash32h ( tids[i] , lastTid );
//logf(LOG_DEBUG,"build: tph %"INT32" h=%"UINT64"",i,(int64_t)h);
// . add to table (skip if 0, means empty bucket)
// . return NULL and set g_errno on error
if ( h && ! tp.addKey ( &h , &val ) ) return NULL;
// update this
lastTid = h;
}
// linear scan on hash table to get all the hash, XOR together
uint32_t hx = 0;
int32_t nb = tp.getNumSlots();
char *flags = tp.m_flags;
// get keys
uint32_t *keys = (uint32_t *)tp.m_keys;
for ( int32_t i = 0 ; i < nb ; i++ ) {
// skip if empty
if ( flags[i] == 0 ) continue;
// skip if empty
//if ( keys[i] == 0LL ) continue;
// incorporate
hx ^= keys[i];
}
// never return 0, make it 1. 0 means an error
if ( hx == 0 ) hx = 1;
// set the hash
m_tagPairHash32 = hx ;
// it is now valid
m_tagPairHash32Valid = true;
return &m_tagPairHash32;
}
// . used for deduping search results
// . also uses the title
int32_t *XmlDoc::getSummaryVector ( ) {
if ( m_summaryVecValid ) return (int32_t *)m_summaryVec;
Summary *s = getSummary();
if ( ! s || s == (Summary *)-1 ) return (int32_t *)s;
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (int32_t *)ti;
// store title and summary into "buf" so we can call words.set()
//char buf[5000];
SafeBuf sb;
//char *p = buf;
//int32_t avail = 5000;
//int32_t len;
// put title into there
int32_t tlen = ti->m_titleBytes - 1;
//if ( len > avail ) len = avail - 10;
if ( tlen < 0 ) tlen = 0;
// put summary into there
int32_t slen = s->m_summaryLen;
// allocate space
int32_t need = tlen + 1 + slen + 1;
if ( ! sb.reserve ( need ) ) return NULL;
//gbmemcpy ( p , ti->m_title , len );
//p += len;
sb.safeMemcpy ( ti->m_title , tlen );
// space separating the title from summary
if ( tlen > 0 ) sb.pushChar(' ');
//if ( len > avail ) len = avail - 10;
//gbmemcpy ( p , s->m_summary , len );
//p += len;
sb.safeMemcpy ( s->m_summary , slen );
// null terminate it
//*p = '\0';
sb.nullTerm();
// word-ify it
Words words;
if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL;
// . now set the dedup vector from big summary and title
// . store sample vector in here
// . returns size in bytes including null terminating int32_t
m_summaryVecSize = computeVector ( NULL , &words ,
(uint32_t *)m_summaryVec );
m_summaryVecValid = true;
return m_summaryVec;
}
bool getWordVector ( char *s ,
HashTableX *ht ,
uint32_t *d ,
int32_t *nd ,
int32_t ndmax ) {
// utf8 char size
char size;
// grab each word and hash it
for ( ; *s ; s += size ) {
// get size
size = getUtf8CharSize(s);
// skip if tag
if ( *s == '<' ) {
while ( *s && *s!='>' )
s += getUtf8CharSize(s);
continue;
}
// skip if other type of punct
if ( ! is_alnum_utf8(s) ) continue;
// ok, we got a word then
char *start = s;
// see how long the word is
for ( ; *s && is_alnum_utf8(s);s+=getUtf8CharSize(s));
// get wordid, a simple hash, just like Words.cpp does
uint64_t h = hash64Lower_utf8(start,s - start);
// do not inc this time
size = 0;
// breathe
//QUICKPOLL ( m_niceness );
// make 32 bit
uint32_t wid32 = (uint32_t)h;
//
// TODO: ignore if it is a day name or month name or
// number because those are like dates
//
if ( ht ) {
// do not add if we already got it
if ( ht->getSlot ( &wid32 ) >= 0 ) continue;
// add to hash table. return NULL and set g_errno onerr
if ( ! ht->addKey (&wid32 )) return false;
}
// add it to our vector
d[*nd] = (uint32_t)wid32;
// inc it
*nd = *nd + 1;
// stop after 3000 for sure
if ( *nd >= ndmax ) return true;
}
return true;
}
// used by getIsDup() and Dates.cpp for detecting dups and for
// seeing if the content changed respectively
int32_t *XmlDoc::getPageSampleVector ( ) {
if ( m_pageSampleVecValid ) return m_pageSampleVec;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
Sections *ss = NULL;
//if ( m_eliminateMenus ) {
//ss = getSections();
//if ( ! ss || ss == (Sections *)-1) return (int32_t *)ss;
//}
m_pageSampleVecSize = computeVector ( ss, ww,
(uint32_t *)m_pageSampleVec );
m_pageSampleVecValid = true;
return m_pageSampleVec;
}
// . this is the vector of the words right after the hypertext for the link
// we are voting on.
// . it is used to dedup voters in Msg25.cpp
int32_t *XmlDoc::getPostLinkTextVector ( int32_t linkNode ) {
if ( m_postVecValid ) return m_postVec;
// assume none
m_postVecSize = 0;
// set up
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
// sanity check
if ( linkNode < 0 ) { char *xx=NULL;*xx=0; }
// linkNode starts pointing to a <a> tag so skip over that!
linkNode++;
// limit
int32_t nn = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// and advance i to the next anchor tag thereafter, we do not
// want to include link text in this vector because it is usually
// repeated and will skew our "similarities"
for ( ; linkNode < nn ; linkNode++ ) {
// stop if we hit </a> or <a>
if ( (nodes[linkNode].m_nodeId & BACKBITCOMP) != 2 ) continue;
// advance over the </a> or <a>
linkNode++;
// then stop, we will start gathering link text here
break;
}
// if we hit end of the doc, we got not vector then
if ( linkNode >= nn ) return m_postVec;
// now convert the linkNode # to a word #, "start"
int32_t nw = ww->getNumWords ();
int64_t *wids = ww->getWordIds ();
nodeid_t *tids = ww->getTagIds ();
int32_t *wn = ww->m_nodes;
int32_t i = 0;
for ( ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// stop when we got the first word in this node #
if ( wn[i] == linkNode ) break;
}
// if none, bail now, size is 0
if ( i >= nw ) return m_postVec;
// save that
int32_t start = i;
// likewise, set the end of it
int32_t end = nw;
// count alnum words
int32_t count = 0;
// limit it
for ( i = start ; i < nw && count < 35 ; i++ ) {
// get tag id
nodeid_t tid = tids[i] & BACKBITCOMP;
// stop if certain ones
if ( tid == TAG_TABLE ) break;
if ( tid == TAG_UL ) break;
// <a>, </a> is ok
if ( tids[i] == TAG_A ) break;
// only up to 35 words allowed in the hash
if ( wids[i] ) count++;
}
// set the end of the words to hash
end = i;
// specify starting node # now
m_postVecSize = computeVector(NULL,ww,(uint32_t *)m_postVec,start,end);
// return what we got
return m_postVec;
}
// . was kinda like "m_tagVector.setTagPairHashes(&m_xml, niceness);"
// . this is used by getIsDup() (below)
// . this is used by Dates.cpp to see how much a doc has changed
// . this is also now used for getting the title/summary vector for deduping
// search results
// . if we couldn't extract a good pub date for the doc, and it has changed
// since last spidered, use the bisection method to come up with our own
// "last modified date" which we use as the pub date.
// . this replaces the clusterdb.getSimilarity() logic in Msg14.cpp used
// to do the same thing. but we call Vector::setForDates() from
// Dates.cpp. that way the logic is more contained in Dates!
// . doesn't Msg14 already do that?
// . yes, but it uses two TermTables and calls Clusterdb::getSimilarity()
// . returns false and sets g_errno on error
// . these words classes should have been set by a call to Words::set(Xml *...)
// so that we have "tids1" and "tids2"
// . returns NULL and sets g_errno on error
// . TODO: if our title rec is non-empty consider getting it from that
// . we use this vector to compare two docs to see how similar they are
int32_t XmlDoc::computeVector ( Sections *sections, Words *words, uint32_t *vec ,
int32_t start , int32_t end ) {
// assume empty vector
vec[0] = 0;
// skip if no article section. then we have no vector.
if ( sections && ! sections->m_hadArticle ) return 0;
// shortcuts
int32_t nw = words->getNumWords();
//int32_t nt = words->m_numTags;
int64_t *wids = words->getWordIds();
// set the end to the real end if it was specified as less than zero
if ( end < 0 ) end = nw;
// # of alnum words, about... minus the tags, then the punct words
// are half of what remains...
int32_t count = words->m_numAlnumWords;
// if we got sections, how many good words?
if ( sections ) count = sections->m_numAlnumWordsInArticle;
// google seems to index SEC_MARQUEE so i took that out
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// these Section ptrs are 1-1 with the words
Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs;
// . Get sample vector from content section only.
// . This helps remove duplicate menu/ad from vector
// 4 bytes per hash, save the last one for a NULL terminator, 0 hash
int32_t maxTerms = SAMPLE_VECTOR_SIZE / 4 - 1;
// what portion of them do we want to mask out from the rest?
int32_t ratio = count / maxTerms ;
// a mask of 0 means to get them all
unsigned char mask = 0x00;
// if we got twice as many terms as we need, then set mask to 0x01
// to filter out half of them! but actually, let's aim for twice
// as many as we need to ensure we really get as many as we need.
// so if we got 4 or more than we need then cut in half...
while ( ratio >= 4 ) {
// shift the mask down, ensure hi bit is set
mask >>= 1;
mask |= 0x80;
ratio >>= 1; // /2
}
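// Worked example (assuming maxTerms is 31): with count = 4000 alnum
// words the initial ratio is 129, so the loop above sets one high bit
// per halving: 0x80,0xC0,0xE0,0xF0,0xF8,0xFC and stops at ratio = 2.
// A 6-bit mask passes ~1/64 of the word ids, i.e. about 62 of the
// 4000, roughly twice maxTerms as intended.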
// store vector into "d" for now. will sort below
uint32_t d [ 3000 ];
// dedup our vector using this hashtable, "ht"
char hbuf[3000*6*2];
HashTableX ht;
if ( ! ht.set(4,0,3000,hbuf,3000*6*2,false,m_niceness,"xmlvecdedup")){
char*xx=NULL;*xx=0;}
again:
// a buffer to hold the top termIds
int32_t nd = 0;
// count how many we mask out
int32_t mo = 0;
// . buffer should have at least "maxTerms" in it
// . these should all be 12 byte keys
for ( int32_t i = start ; i < end ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if not alnum word
if ( wids[i] == 0 ) continue;
// skip if mask filters it
if ( ((wids[i]>>(NUMTERMIDBITS-8)) & mask)!=0) {mo++;continue;}
// skip if in select, style, script or marquee tag section
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
// make 32 bit
uint32_t wid32 = (uint32_t)wids[i];
// do not add if we already got it
if ( ht.getSlot ( &wid32 ) >= 0 ) continue;
// add to hash table. return NULL and set g_errno on error
if ( ! ht.addKey (&wid32 )){char*xx=NULL;*xx=0; }
// add it to our vector
d[nd] = (uint32_t)wids[i];
// stop after 3000 for sure
if ( ++nd < 3000 ) continue;
// bitch and break out on error
log(LOG_INFO,"build: Sample vector overflow. Slight "
"performance hit.");
break;
}
// . if nd was too small, don't use a mask to save time
// . well just make the mask less restrictive
if ( nd < maxTerms && mask && mo ) {
// shift the mask UP, allow more termIds to pass through
mask <<= 1;
// reset hash table since we are starting over
ht.clear();
goto again;
}
// bubble sort them
bool flag = true;
while ( flag ) {
// breathe
QUICKPOLL ( m_niceness );
flag = false;
for ( int32_t i = 1 ; i < nd ; i++ ) {
if ( d[i-1] <= d[i] ) continue;
uint32_t tmp = d[i-1];
d[i-1] = d[i];
d[i] = tmp;
flag = true;
}
}
// truncate
if ( nd > maxTerms ) nd = maxTerms;
// null terminate
d [ nd++ ] = 0;
// store in our sample vector
gbmemcpy ( vec , d , nd * 4 );
// return size in bytes
return nd * 4;
}
float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
int32_t *tv1 = getTagPairHashVector();
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
int32_t *tv2 = xd2->getTagPairHashVector();
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
m_niceness );
// this means error, g_errno should be set
if ( m_tagSimilarity == -1.0 ) return NULL;
return &m_tagSimilarity;
}
float *XmlDoc::getGigabitSimilarity ( XmlDoc *xd2 ) {
int32_t **gv1 = getGigabitHashes();
if ( ! gv1 || gv1 == (int32_t **)-1 ) return (float *)gv1;
int32_t **gv2 = xd2->getGigabitHashes();
if ( ! gv2 || gv2 == (int32_t **)-1 ) return (float *)gv2;
// *gv1 could be NULL if vec was empty in titlerec's ptr_gigabitHashes
m_gigabitSimilarity = computeSimilarity ( *gv1, *gv2, NULL, NULL, NULL,
m_niceness );
// this means error, g_errno should be set
if ( m_gigabitSimilarity == -1.0 ) return NULL;
return &m_gigabitSimilarity;
}
float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
int32_t *sv1 = getPageSampleVector();
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
int32_t *sv2 = xd2->getPageSampleVector();
if ( ! sv2 || sv2 == (int32_t *)-1 ) return (float *)sv2;
m_pageSimilarity = computeSimilarity ( sv1, sv2, NULL, NULL, NULL,
m_niceness );
// this means error, g_errno should be set
if ( m_pageSimilarity == -1.0 ) return NULL;
return &m_pageSimilarity;
}
// . compare old page vector with new
// . returns ptr to a float from 0.0 to 100.0
float *XmlDoc::getPercentChanged ( ) {
// if we got it
if ( m_percentChangedValid ) return &m_percentChanged;
// get the old doc
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (float *)od;
// if empty, assume 0% changed
if ( ! *od ) {
m_percentChanged = 0;
m_percentChangedValid = true;
return &m_percentChanged;
}
// get its page c
float *ps = getPageSimilarity ( *od );
if ( ! ps || ps == (float *)-1 ) return (float *)ps;
// got it
m_percentChanged = *ps;
m_percentChangedValid = true;
// just return it
return &m_percentChanged;
}
// . Address.cpp converts a place name into a vector for comparing via a
// call to computeSimilarity() below
// . returns -1 and set g_errno on error
// . "vbufSize" is in BYTES!
// . returns length of word vector in int32_ts (# components stored)
int32_t makeSimpleWordVector (char *s,int32_t *vbuf,int32_t vbufSize,int32_t niceness ) {
// nonsense?
if ( vbufSize < 4 ) { char *xx=NULL;*xx=0; }
// empty it
*vbuf = 0;
// no words, no vector
if ( ! s ) return 0;
// set them
Words w;
// return -1 with g_errno set on error
if ( ! w.set9 ( s , niceness ) ) return -1;
// skip if no words
if ( w.m_numWords == 0 ) return 0;
// shortcut
int64_t *wids = w.m_wordIds;
int64_t pid = 0LL;
// count insertions
int32_t count = 0;
// ptr
int32_t *vbufPtr = vbuf;
int32_t *vbufEnd = vbuf + vbufSize/4;
// put words into a vector
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// if no room stop. need room for NULL terminator
if ( vbufPtr + 2 >= vbufEnd ) return count;
// put it in
//*vbufPtr = (int32_t)wids[i];
// . use the synonym instead if it had one
// . maps "theatre" to "theater", "4th" to "fourth", etc.
// . false = is street name?
int64_t *p = getSynonymWord ( &wids[i] , &pid , false );
// set this
pid = wids[i];
//int64_t *p = (int64_t *)synTable->getValue64( wids[i] );
// 0 means to ignore it
if ( *p == 0LL ) continue;
// otherwise add into our vector
*vbufPtr = *p;
// advance
vbufPtr++;
// NULL termination
*vbufPtr = 0;
// count it
count++;
}
// all done
return count;
}
// . compare two vectors
// . components in vectors are int32_ts
// . last component is a zero, to mark EOV = end of vector
// . discount any termIds that are in the query vector, qvec, which may be NULL
// . returns -1 and sets g_errno on error
// . vector components are 32-bit hashes of the words (hash32())???
// i would say they should be the lower 32 bits of the 64-bit hashes!
// . replaces:
// g_clusterdb.getGigabitSimilarity()
// m_tagVec->getLinkBrotherProbability()
// g_clusterdb.getSampleSimilarity()
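// . a tiny worked example (all scores 1, no query): vec0={A,B,C} and
//   vec1={B,C,D} give totalScore = 6; the two matches B and C each add
//   1 for the vec1 side plus the 1 stored for the vec0 side, so
//   matchScore = 4 and the returned similarity is 100*4/6 = ~66.7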
float computeSimilarity ( int32_t *vec0 ,
int32_t *vec1 ,
int32_t *s0 , // corresponding scores vector
int32_t *s1 , // corresponding scores vector
Query *q ,
int32_t niceness ,
bool dedupVectors ) {
static int32_t s_tmp = 0;
if ( ! vec0 ) vec0 = &s_tmp;
if ( ! vec1 ) vec1 = &s_tmp;
// if both empty, assume not similar at all
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
// if either is empty, return 0 to be on the safe side
if ( *vec0 == 0 ) return 0;
if ( *vec1 == 0 ) return 0;
// flag if from query vector
HashTableX qt;
char qbuf[5000];
if ( q ) {
// init hash table
if ( ! qt.set ( 4,0,512,qbuf,5000,false,niceness,"xmlqvtbl") )
return -1;
// . stock the query term hash table
// . use the lower 32 bits of the termids to make compatible
// with the other vectors we use
//int64_t *qtids = q->getTermIds ();
int32_t nt = q->getNumTerms();
for ( int32_t i = 0 ; i < nt ; i++ ) {
// get query term
QueryTerm *QT = &q->m_qterms[i];
// get the termid
int64_t termId = QT->m_termId;
// get it
uint32_t h = (uint32_t)(termId & 0xffffffff);
// hash it
if ( ! qt.addKey ( &h ) ) return -1;
}
}
// if we ignore cardinality then it only matters if both vectors
// have a particular value, and not how many times they each have it.
// so we essentially dedup each vector if dedupVectors is true.
// but we do total up the score and put it behind the one unique
// occurrence though. we do this only for
// Sections::addDateBasedImpliedSections() right now
bool allowDups = true;
if ( dedupVectors ) allowDups = false;
HashTableX ht;
char hbuf[10000];
if ( ! ht.set ( 4,4,-1,hbuf,10000,allowDups,niceness,"xmlqvtbl2"))
return -1;
bool useScores = (bool)s0;
int32_t matches = 0;
int32_t total = 0;
int32_t matchScore = 0;
int32_t totalScore = 0;
// hash first vector. accumulating score total and total count
for ( int32_t *p = vec0; *p ; p++ , s0++ ) {
// breathe
QUICKPOLL(niceness);
// skip if matches a query term
if ( q && qt.getSlot ( p ) ) continue;
// count it
total++;
// get it
int32_t score = 1;
// get the score if valid
if ( useScores ) score = *s0;
// total it up
totalScore += score;
// add it
if ( dedupVectors ) {
// accumulate all the scores into this one bucket
// in the case of p being a dup
if ( ! ht.addTerm32 ( p , score ) ) return -1;
}
else {
// otherwise, add each into its own bucket since
// ht.m_allowDups should be true
if ( ! ht.addKey ( p , &score ) ) return -1;
}
}
int32_t zero = 0;
// see what components of this vector match
for ( int32_t *p = vec1; *p ; p++ , s1++ ) {
// breathe
QUICKPOLL(niceness);
// skip if matches a query term
if ( q && qt.getSlot ( p ) ) continue;
// count it
total++;
// get it
int32_t score = 1;
// get the score if valid
if ( useScores ) score = *s1;
// and total scores
totalScore += score;
// is it in there?
int32_t slot = ht.getSlot ( p );
// skip if unmatched
if ( slot < 0 ) continue;
// otherwise, it is a match!
matches++;
// and scores
matchScore += score;
// and score of what we matched
uint32_t *val = (uint32_t *)ht.getValueFromSlot ( slot );
// he is hit too
matchScore += *val;
// remove it as we match it to deal with dups
if ( allowDups ) {
// once we match it once, do not match again, score was
// already accumulated
ht.setValue ( slot , &zero );
}
else {
// otherwise, remove this dup and try to match any
// remaining dups in the table
ht.removeSlot ( slot );
}
}
// if after subtracting query terms we got no hits, return 0
if ( useScores && totalScore == 0 ) return 0;
if ( total == 0 ) return 0;
// . what is the max possible score we coulda had?
// . subtract the vector components that matched a query term
float percent = 100 * (float)matchScore / (float)totalScore;
//if ( useScores)percent = 100 * (float)matchScore / (float)totalScore;
//else percent = 100 * (float)matches / (float)total;
// sanity
//if ( percent > 100 ) percent = 100;
if ( percent > 100 ) { char *xx=NULL;*xx=0; }
return percent;
}
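// Worked example (illustration only, hypothetical values): with
// dedupVectors=false and no query, let vec0 = {A,B,C} carry scores {3,1,2}
// and vec1 = {B,C,D} carry scores {2,2,1}. The two loops above give
// totalScore = (3+1+2) + (2+2+1) = 11. B and C match, and each match adds
// its own score plus the score stored for the other vector's copy, so
// matchScore = (2+1) + (2+2) = 7. The returned similarity is then
// 100 * 7 / 11 = ~63.6%.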
// this returns true if the two vecs are "percentSimilar" or more similar
bool isSimilar_sorted ( int32_t *vec0 ,
int32_t *vec1 ,
int32_t nv0 , // how many int32_ts in vec?
int32_t nv1 , // how many int32_ts in vec?
// they must be this similar or more to return true
int32_t percentSimilar,
int32_t niceness ) {
// if both empty, assume not similar at all
if ( *vec0 == 0 && *vec1 == 0 ) return false;
// if either is empty, return false to be on the safe side
if ( *vec0 == 0 ) return false;
if ( *vec1 == 0 ) return false;
// do not include last 0
nv0--;
nv1--;
int32_t total = nv0 + nv1;
// so if the "noMatched" count ever EXCEEDS (not equals) this
// "brink" we can bail early because there's no chance of getting
// the similarity "percentSimilar" provided. should save some time.
int32_t brink = ((100-percentSimilar) * total) / 100;
// scan each like doing a merge
int32_t *p0 = vec0;
int32_t *p1 = vec1;
int32_t yesMatched = 0;
int32_t noMatched = 0;
mergeLoop:
// stop if both exhausted. we didn't bail on brink, so it's a match
if ( *p0 == 0 && *p1 == 0 )
return true;
if ( *p0 < *p1 || *p1 == 0 ) {
p0++;
if ( ++noMatched > brink ) return false;
goto mergeLoop;
}
if ( *p1 < *p0 || *p0 == 0 ) {
p1++;
if ( ++noMatched > brink ) return false;
goto mergeLoop;
}
yesMatched += 2;
p1++;
p0++;
goto mergeLoop;
}
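// Worked example (illustration only): with percentSimilar=80 and two
// vectors of 10 components each (after dropping the terminating zeros),
// total = 20 and brink = ((100-80) * 20) / 100 = 4. The merge above
// returns false as soon as a 5th unmatched component is seen, since at
// that point the vectors can no longer be 80% similar.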
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
if ( m_dupHashValid ) return &m_dupHash;
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;
uint32_t *h2 = getGigabitVectorScorelessHash ( ) ;
if ( ! h2 || h2 == (uint32_t *)-1 ) return (uint64_t *)h2;
//uint64_t h2b = (uint64_t)*h2;
m_dupHash = hash64 ( (uint64_t)*h1 , (uint64_t)*h2 );
m_dupHashValid = true;
return &m_dupHash;
}
int64_t *XmlDoc::getExactContentHash64 ( ) {
if ( m_exactContentHash64Valid )
return &m_exactContentHash64;
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1) return (int64_t *)u8;
// if (m_docId==88581116800LL)
// log("got article1 diffbot");
// if (m_docId==201689682865LL)
// log("got article11 diffbot");
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are diffbot, then do not quite do an exact content hash.
// there is a "url:" field in the json that changes. so we have
// to exclude that field. otherwise getDupList() spider time dedup
// detection will fail the TestDuplicateContent.testDuplicate smoketest
if ( cr->m_isCustomCrawl == 1 && m_isDiffbotJSONObject ) {
int32_t *ch32 = getContentHashJson32();
if ( ! ch32 || ch32 == (void *)-1 ) return (int64_t *)ch32;
m_exactContentHash64Valid = true;
m_exactContentHash64 = (uint64_t)(uint32_t)*ch32;
return &m_exactContentHash64;
}
unsigned char *p = (unsigned char *)*u8;
int32_t plen = size_utf8Content;
if ( plen > 0 ) plen--;
// if we zeroed out this doc to save disk space, then we only
// record the exact 64-bit hash, so extract it here so that
// we can delete the gbcontenthash: term from the index if we are
// deleting this doc or updating it with a fresh copy.
if ( plen < 100 && p && plen > 12 &&
strncmp((char *)p,"gbzeroedout:",12) == 0 ) {
sscanf((char *)p+12,"%"UINT64,&m_exactContentHash64);
m_exactContentHash64Valid = true;
return &m_exactContentHash64;
}
// sanity
//if ( ! p ) return 0LL;
//if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
unsigned char *pend = (unsigned char *)p + plen;
uint64_t h64 = 0LL;
unsigned char pos = 0;
bool lastWasSpace = true;
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// treat sequences of white space as a single ' ' (space)
if ( is_wspace_a(*p) ) {
if ( lastWasSpace ) continue;
lastWasSpace = true;
// treat all white space as a space
h64 ^= g_hashtab[pos][(unsigned char)' '];
pos++;
continue;
}
lastWasSpace = false;
// xor this in right
h64 ^= g_hashtab[pos][p[0]];
pos++;
}
m_exactContentHash64Valid = true;
m_exactContentHash64 = h64;
return &m_exactContentHash64;
}
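// Illustrative note (not part of the original code): since runs of white
// space are hashed as a single ' ' above, a body of "foo   bar\n" and a
// body of "foo bar " produce the same 64-bit hash, so copies differing
// only in whitespace formatting dedup against each other at spider time.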
RdbList *XmlDoc::getDupList ( ) {
if ( m_dupListValid ) return &m_dupList;
// until we start using posdb and not indexdb, just return an
// empty list.
// TODO: MDW fix the deduping.
//m_dupList.reset();
//m_dupListValid = true;
//return &m_dupList;
//
// end temp hack
//
//uint64_t *dh = getDupHash ( );
//if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
int64_t *ph64 = getExactContentHash64();
//int64_t *ph64 = getLooseContentHash64();
if ( ! ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64;
// must match term in XmlDoc::hashVectors()
char qbuf[256];
snprintf(qbuf, 256, "%"UINT64"",*ph64);
int64_t pre = hash64b ( "gbcontenthash" , 0LL );
int64_t rawHash = hash64b ( qbuf , 0LL );
int64_t termId = hash64 ( rawHash , pre );
// get the startkey, endkey for termlist
key144_t sk ;
key144_t ek ;
g_posdb.makeStartKey ( &sk,termId ,0);
g_posdb.makeEndKey ( &ek,termId ,MAX_DOCID);
// note it
log(LOG_DEBUG,"build: check termid=%"UINT64" for docid %"UINT64""
,(uint64_t)(termId&TERMID_MASK)
,m_docId);
// assume valid now
m_dupListValid = true;
// this is a no-split lookup by default now
if ( ! m_msg0.getList ( -1 , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // add to cache?
RDB_POSDB, // INDEXDB ,
cr->m_collnum,
&m_dupList ,
(char *)&sk ,
(char *)&ek ,
606006 , // minRecSizes in bytes
m_masterState , // state
m_masterLoop ,
m_niceness ,
true , // error correction?
true , // include tree?
true , // domerge?
-1 , // firsthosti
0 , // startfilenum
-1, // # files
// never timeout when spidering in case
// a host is down.
9999977 , // timeout
-1 , // syncpoint
-1 , // preferlocal reads
NULL, // msg5
NULL, // msg5b
false , // isRealMerge
true , // allow page cache
false , // forcelocalindexdb
true ) ) // shardByTermId? THIS IS DIFFERENT!!!
// return -1 if this blocks
return (RdbList *)-1;
// assume valid!
m_dupListValid = true;
return &m_dupList;
}
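// Illustrative note (not part of the original code): the termId built
// above hashes the decimal content hash against the hashed field prefix
// "gbcontenthash", i.e. for a content hash of 123456789 it is presumably
// the same termId a query term like gbcontenthash:123456789 would map to,
// which is why it must match the term hashed in XmlDoc::hashVectors().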
// moved DupDetector.cpp into here...
char *XmlDoc::getIsDup ( ) {
if ( m_isDupValid ) return &m_isDup;
// assume we are not a dup
m_isDup = false;
// get it
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// skip if we should
if ( ! cr->m_dedupingEnabled ||
// bulk jobs never dedup
cr->m_isCustomCrawl == 2 ) {
m_isDupValid = true;
return &m_isDup;
}
// if &links was given in the diffbot api url then do not do
// spider time deduping because the pages are likely rendered using
// javascript, so they'd all seem to be dups of one another.
if ( cr->m_isCustomCrawl ) {
SafeBuf *au = getDiffbotApiUrl();
if ( ! au || au == (void *)-1 ) return (char *)au;
char *linksParm = NULL;
if ( au->length() > 0 )
linksParm = strstr ( au->getBufStart() , "&links");
if ( ! linksParm && au->length() > 0 )
linksParm = strstr ( au->getBufStart() , "?links");
if ( linksParm && linksParm[6] && linksParm[6] != '&' )
linksParm = NULL;
if ( linksParm ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}
}
// do not dedup seeds
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( cr->m_isCustomCrawl && isSeed ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}
setStatus ( "checking for dups" );
// BUT if we are already indexed and a crawlbot/bulk diffbot job
// then do not kick us out just because another indexed doc is
// a dup of us, because that messes up the TestOnlyProcessIfNew smoketests.
// in the 2nd round we would end up deleting article1.html after
// indexing it in the first round, then adding article11.html's
// diffbot reply in the 2nd round because article1.html and its
// diffbot reply were deleted, thereby giving it a new timestamp and
// making the smoketest fail.
if ( cr->m_isCustomCrawl ) {
char *isIndexed = getIsIndexed();
if ( ! isIndexed || isIndexed == (char *)-1)
return (char *)isIndexed;
if ( *isIndexed ) {
m_isDupValid = true;
return &m_isDup;
}
}
//we need both vectors to be non-empty
//uint64_t *tv = getTagPairHash();
//if ( ! tv || tv == (uint64_t *)-1) return (char *)tv;
// get our docid
int64_t *mydocid = getDocId();
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
// get the duplist!
RdbList *list = getDupList();
if ( ! list || list == (RdbList *)-1 ) return (char *)list;
// sanity. must be posdb list.
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
// so getSiteRank() does not core
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// . see if there are any pages that seem like they are dups of us
// . they must also have a HIGHER score than us, for us to be
// considered the dup
//if ( ! m_didQuickDupCheck ) {
// // do not repeat
// m_didQuickDupCheck = true;
int32_t myRank = getSiteRank ( );
// init
//uint8_t maxScore = 0;
//uint8_t myScore = 0;
//char maxSiteRank = -1;
//int64_t maxDocId = -1LL;
// assume not a dup
m_isDup = false;
// get the docid that we are a dup of
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// breathe
QUICKPOLL(m_niceness);
//int64_t d = list->getCurrentDocId();
char *rec = list->getCurrentRec();
// get the docid
int64_t d = g_posdb.getDocId ( rec );
// get the score
//uint8_t score = list->getCurrentScore();
// just let the best site rank win i guess?
// even though one page may have more inlinks???
char sr = (char )g_posdb.getSiteRank ( rec );
// skip if us!
//if ( d == *getDocId() ) {
// // record our score
// //myScore = score;
// mySiteRank = sr;
// continue;
//}
// skip if us
if ( d == m_docId ) continue;
// for debug
//if ( d != m_docId )
//log("build: doc %s is dup of docid %"INT64"",
// m_firstUrl.m_url,d);
// if his site rank is >= ours then he was here first and wins,
// so we are the dup i guess...
if ( sr >= myRank ) {
log("build: doc %s is dup of docid %"INT64"",
m_firstUrl.m_url,d);
m_isDup = true;
m_isDupValid = true;
m_docIdWeAreADupOf = d;
return &m_isDup;
}
// get the winner
//if ( score > maxScore ) maxScore = score;
//if ( sr > maxSiteRank || maxSiteRank == -1 ) {
// maxSiteRank = sr;
// maxDocId = d;
// continue;
//}
//if ( sr < maxSiteRank ) continue;
// fallback to docid?
// do it first come first served otherwise i guess
// this will prevent dups from existing in the index at least
// if they have the same siterank...
//if ( d < maxDocId ) {
// maxDocId = d;
// continue;
//}
}
// are we the highest scoring doc with this template?
// corollary: if all dups have equal scores they will be
// removed until there is only one doc that matches the pattern
//if ( myScore >= maxScore ) {
//if ( maxDocId >= 0 && maxDocId != *mydocid && out) {
// m_isDup = true;
// m_isDupValid = true;
// return &m_isDup;
//}
m_isDup = false;
m_isDupValid = true;
return &m_isDup;
/*
we now temporarily at least, do exact dup checking...
later we will bring in the fuzzy code...
// reset its ptr for stuff below
list->resetListPtr();
loop:
// . get a title rec for the current docid
// . but if exhausted, we are not a dup!
if ( list->isExhausted() ) { m_isDupValid = true; return &m_isDup; }
// get the docid
int64_t d = list->getCurrentDocId();
// continue if us!
if ( d == *mydocid ) { list->skipCurrentRecord(); goto loop; }
// is this a dup of us?
char *dup = isDupOfUs ( d );
if ( ! dup || dup == (char *)-1 ) return (char *)dup;
// if dup of us, bail out
if ( *dup ) { m_isDup = true; m_isDupValid = true; return &m_isDup; }
// prepare for next
list->skipCurrentRecord();
// loop up
goto loop;
*/
}
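// Illustrative example (hypothetical values): if our doc has site rank 4
// and the gbcontenthash termlist above contains another docid whose posdb
// key carries site rank 4 or higher, that doc wins and we flag ourselves
// as the dup; if every competing docid has a lower site rank we stay in
// the index.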
char *XmlDoc::isDupOfUs ( int64_t d ) {
// sanity check
if ( d <= 0 ) { char *xx=NULL;*xx=0; }
// get our current title rec
SafeBuf *tr = getTitleRecBuf();
if ( ! tr || tr == (void *)-1 ) return (char *)tr;
// we should not be here if we know we are a dup of another doc
if ( m_isDup ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get the title rec for this docid if we haven't yet done so
if ( m_calledMsg22d != d ) { // .m_docId != d ) {
bool s;
// note it
setStatus ( "getting possible dup title rec" );
// do not re-call
m_calledMsg22d = d;
// get the guy that might be a dup of us
s = m_msg22d.getTitleRec ( &m_msg22Request ,
NULL ,
d ,
cr->m_coll ,
&m_dupTrPtr ,
&m_dupTrSize ,
false , // just check tfndb?
false , // getAvailDocIdOnly
m_masterState, // state
m_masterLoop , // callback
m_niceness ,
false , // add to cache
60*60*24 , // maxcacheage
999999 );// timeout
// we blocked
if ( ! s ) return (char *)-1;
// error?
if ( g_errno ) return NULL;
}
// if not there do not count as an error
if ( ! m_dupTrPtr ) { g_errno = 0; return &m_isDup; }
// ignore any errors too i guess...
if ( m_msg22d.m_errno ) {
log(LOG_WARN, "build: Dup Detection error with "
"titlerec fetch: %s",mstrerror(m_msg22d.m_errno));
g_errno = 0;
return &m_isDup;
}
// we need to parse this potential dup doc
XmlDoc dd;
// . parse the possible dup title rec into another XmlDoc class
// . it returns false and sets g_errno on error
if ( ! dd.set2 ( m_dupTrPtr ,
m_dupTrSize ,
cr->m_coll ,
NULL , // m_pbuf ,
m_niceness ) )
return NULL;
LinkInfo *info1a = dd.getLinkInfo1();
LinkInfo *info1b = getLinkInfo1();
float pageNumInlinksA = info1a->m_numGoodInlinks;//getNumInlinksExtrapolated();
float pageNumInlinksB = info1b->m_numGoodInlinks;//getNumInlinksExtrapolated();
// . if the old dup doc is of lower quality than the new doc that
// we are checking, then that one should be removed, not us!
// if they are equal, we keep the shorter url of the two
// . dd was set from title rec so these numInlinks should be taken
// from the TagRec in ptr_tagRecData, and therefore NOT BLOCK!
if ( *dd.getSiteNumInlinks() < *getSiteNumInlinks() )
return &m_isDup;
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
pageNumInlinksA < pageNumInlinksB )
return &m_isDup;
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
pageNumInlinksA == pageNumInlinksB &&
dd.getFirstUrl()->getUrlLen() > getFirstUrl()->getUrlLen())
return &m_isDup;
float *ts = getTagSimilarity ( &dd );
if ( ! ts || ts == (float *)-1 ) return (char *)ts;
float *gs = getGigabitSimilarity ( &dd );
if ( ! gs || gs == (float *)-1 ) return (char *)gs;
float *ps = getPageSimilarity ( &dd );
if ( ! ps || ps == (float *)-1 ) return (char *)ps;
int32_t gigabitVecSimilarity = (int32_t)*gs;
int32_t tagVecSimilarity = (int32_t)*ts;
int32_t sampleVecSimilarity = (int32_t)*ps;
int32_t notSimilarCount = 0;
if ( gigabitVecSimilarity < 80 ) {
notSimilarCount++;
if ( gigabitVecSimilarity < 50 ) return &m_isDup;
}
if ( tagVecSimilarity < 80 ) {
notSimilarCount++;
if ( tagVecSimilarity < 50 ) return &m_isDup;
}
if ( sampleVecSimilarity < 80 ) {
notSimilarCount++;
if ( sampleVecSimilarity < 50 ) return &m_isDup;
}
// if it is similar enough, we got a dup!
if ( notSimilarCount <= 0 ) { m_isDupValid = true; m_isDup = true; }
return &m_isDup;
}
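// Illustrative example (hypothetical values): if the gigabit, tag and
// sample similarities come back as 90, 85 and 95, notSimilarCount stays 0
// and we are flagged as a dup of docid d. Any single score below 50
// returns "not a dup" immediately, and any score in the 50-79 range bumps
// notSimilarCount above 0, which also leaves us marked as not a dup.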
// hash a gigabit hash vector without its scores, also order independent
uint32_t *XmlDoc::getGigabitVectorScorelessHash ( ) {
if ( m_gigabitVectorHashValid ) return &m_gigabitVectorHash;
int32_t **gbvec = getGigabitHashes();
if ( ! gbvec || gbvec == (int32_t **)-1 ) return (uint32_t *)gbvec;
uint32_t h = 0;
// this bad boy is NULL terminated
uint32_t *gbv = (uint32_t *)*gbvec;
// i guess zak likes the simple XOR'ing thing...
for ( int32_t i = 0; gbv && gbv[i] ; i++) h ^= gbv[i];
m_gigabitVectorHashValid = true;
m_gigabitVectorHash = h;
return &m_gigabitVectorHash;
}
// . the original vector used for deduping similar search results is just from
// random sample of indexed terms, but gigabit vector is
// formed using the hashes of the top-scoring gigabits of the document, and
// therefore uses the words class
// . sets g_errno and returns NULL on error
// . ptr_gigabitHashes can be NULL...
int32_t **XmlDoc::getGigabitHashes ( ) {
// if it was already set, treat this as an accessor
if ( m_gigabitHashesValid ) return &ptr_gigabitHashes;
// this also sets the vector
char *gq = getGigabitQuery();
if ( ! gq || gq == (char *)-1) return (int32_t **)gq;
// it should be valid now!
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
return &ptr_gigabitHashes;
}
// . the new function to get gigabits
// . sets and validates m_gigabitQuery[] and m_gigabitHashes[] among others
// . candidates = capitalized word, capitalized sequence of words,
// uncapitalized 2+ word wikipedia phrase.
// . candidates exclude uncapitalized query stop words.
// . calls addGigabits() which is called by each doc in search results
// when we use this at query time.
// . separates gigabits with a comma (delimiter) in m_gigabitQuery[]
// . quotes multiple word gigabits
char *XmlDoc::getGigabitQuery ( ) {
if ( m_gigabitQueryValid ) return m_gigabitQuery;
setStatus ( "getting gigabit query" );
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
Sections *ss = getSections();
if ( ! ss || ss == (Sections *)-1 ) return (char *)ss;
//Weights *we = getWeights();
//if ( ! we || we == (Weights *)-1 ) return (char *)we;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
LinkInfo **pinfo2 = getLinkInfo2();
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
HashTableX ht;
char buf [ 200000 ];
// pass in niceness in case it has to grow really big and re-hash all!!
ht.set ( 8 , 4 , -1 , buf , 200000 , false, m_niceness,"xmlgbtbl");
// . add gigabits from our body words
// . includes title and header tags so pts can work well!
if ( ! addGigabits ( ww , *d , ss , *langId ) ) return NULL;
// add gigabits from link info
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// sanity check
char *txt = k->getLinkText();
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 0 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
// add those in
if (!addGigabits(txt, *d, *langId ) ) return NULL;
// add in neighborhoods
if(!addGigabits(k->getSurroundingText(),*d,*langId))
return NULL;
}
// add in gigabits for meta keywords
int32_t mdlen;
char *md = getMetaDescription( &mdlen );
if ( ! addGigabits2 ( md , mdlen, *d , *langId ) ) return NULL;
// add in gigabits for meta description
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
if ( ! addGigabits2 ( mk , mklen , *d , *langId ) ) return NULL;
// set m_gigabitQuery and m_gigabitScores
//GigabitInfo *top[100];
// fill in "top" in order of score
m_numTop = getTopGigabits ( &ht , m_top , 100 , 0 );
// error? then g_errno should be set
if ( m_numTop == -1 ) return NULL;
char *p = m_gigabitQuery;
char *pend = m_gigabitQuery + XD_GQ_MAX_SIZE - 1;
// reset count of vector components for setting gigabit vector
int32_t ng = 0;
// total score
//int32_t total = 0;
// . now set the gigabit query!
// . start with the highest scoring node first, the last node since
// nodes are ranked by lowest to highest key
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
// get the info
GigabitInfo *gi = m_top[i];
// stop if too big
if ( p + gi->m_len + 10 >= pend ) continue;
// get 32 bit hash
uint32_t h = gi->m_hash & 0xffffffff;
// never allow 0
if ( h == 0 ) h = 1;
// add to vector
if ( ng + 1 < XD_MAX_GIGABIT_HASHES ) {
// the term hash
m_gigabitHashes[ng] = (int32_t)h ;
// and the score
m_gigabitScores[ng] = gi->m_pts;
// point into it, where we will copy it to
m_gigabitPtrs [ng] = p + 1;
// advance
ng++;
}
// quote it
*p++ = '\"';
// write into buffer
gbmemcpy ( p , gi->m_ptr , gi->m_len );
// finish quote
*p++ = '\"';
// separate terms just in case
//gbmemcpy ( p , " , ", 4 );
//p += 4;
*p++ = ',';
}
// done
*p++ = '\0';
// NULL terminate the vector to make it a legit vector
m_gigabitHashes [ ng ] = 0;
m_gigabitScores [ ng ] = 0;
// include the terminating 0
ng++;
// validate both the query and vector
m_gigabitQueryValid = true;
m_gigabitHashesValid = true;
// set this too
ptr_gigabitHashes = m_gigabitHashes;
ptr_gigabitScores = m_gigabitScores;
size_gigabitHashes = ng * 4 ; // 4 bytes each component
size_gigabitScores = ng * 4 ; // 4 bytes each score
return m_gigabitQuery;
}
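// Illustrative example (hypothetical terms): for a page about football the
// loop above might produce a gigabit query like
//   "denver broncos","super bowl","mile high stadium",
// i.e. each gigabit quoted and comma separated, with the parallel
// m_gigabitHashes/m_gigabitScores arrays NULL terminated by a zero entry.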
// . fill in "top" in order of score
// . returns -1 and sets g_errno on error
int32_t getTopGigabits ( HashTableX *ht ,
GigabitInfo **top ,
int32_t max ,
int32_t minDocCount ) {
// store top 100 into this tree
RdbTree tree;
if ( ! tree.set ( 4 , // fixedDataSize
max+2 , // maxNumNodes
true , // balance?
-1 , // maxMem
true , // own data?
"tree-topgbits" ))
return -1;
int32_t ns = ht->getNumSlots();
key_t minKey;
bool minKeyValid = false;
for ( int32_t i = 0 ; i < ns ; i++ ) {
// skip if empty
if ( ht->isEmpty(i) ) continue;
// get his info
GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i);
// must be valid
if ( gi->m_count <= 0 ) { char *xx=NULL;*xx=0; }
// must be in this many docs minimum
if ( gi->m_numDocs < minDocCount ) continue;
// make the key
key_t key;
key.n1 = gi->m_pts;
key.n0 = gi->m_hash;
// should we add it?
if ( minKeyValid && key <= minKey ) continue;
// we should add it. use points as the key. use PTR as data
int32_t node = tree.addNode(0,key,(char *)&gi,4);
// error? g_errno should be set
if ( node < 0 ) return -1;
// if not full continue
if ( tree.getNumUsedNodes() < 100 ) continue;
// get the smallest node
int32_t tn = tree.getLowestNode ( ) ;
// sanity check
if ( tn < 0 ) { char *xx=NULL;*xx=0; }
// kick out smallest
tree.deleteNode3 ( tn , false );
// get new smallest
tn = tree.getLowestNode();
// set the new minkey
minKey = *(key_t *)tree.getKey ( tn );
// validate it
minKeyValid = true;
}
int32_t count = 0;
// . now set the array
// . start with the highest scoring node first, the last node since
// nodes are ranked by lowest to highest key
for ( int32_t nn=tree.getLastNode() ; nn>=0 ; nn=tree.getPrevNode(nn) ){
// get the info
GigabitInfo *gi = (GigabitInfo *)tree.getData(nn);
// store it
top[count++] = gi;
// stop if we are full
if ( count >= max ) break;
}
return count;
}
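// A minimal sketch of the same bounded top-N selection using the STL,
// shown only as an illustration of the technique (the build keeps the
// RdbTree version above):
//
//   #include <queue>
//   #include <vector>
//   #include <functional>
//   // min-heap keyed on points; evict the smallest once we exceed "max"
//   typedef std::pair<int32_t,GigabitInfo *> Entry;
//   std::priority_queue<Entry,std::vector<Entry>,std::greater<Entry> > pq;
//   for ( int32_t i = 0 ; i < ht->getNumSlots() ; i++ ) {
//           if ( ht->isEmpty(i) ) continue;
//           GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i);
//           pq.push ( Entry ( gi->m_pts , gi ) );
//           if ( (int32_t)pq.size() > max ) pq.pop();
//   }
//   // popping the survivors now yields the top "max" gigabits, lowest
//   // score first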
char *XmlDoc::getMetaDescription( int32_t *mdlen ) {
if ( m_metaDescValid ) {
*mdlen = m_metaDescLen;
return m_metaDesc;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
//xml->getMetaContent ( m_metaDesc, 1024, "description", 11 );
// we need to point to it in the html source so our WordPosInfo
// algo works right.
m_metaDesc = xml->getMetaContentPointer("description",
11,
"name",
&m_metaDescLen);
*mdlen = m_metaDescLen;
m_metaDescValid = true;
return m_metaDesc;
}
char *XmlDoc::getMetaSummary ( int32_t *mslen ) {
if ( m_metaSummaryValid ) {
*mslen = m_metaSummaryLen;
return m_metaSummary;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
m_metaSummary = xml->getMetaContentPointer("summary",
7,
"name",
&m_metaSummaryLen);
*mslen = m_metaSummaryLen;
m_metaSummaryValid = true;
return m_metaSummary;
}
char *XmlDoc::getMetaKeywords( int32_t *mklen ) {
if ( m_metaKeywordsValid ) {
*mklen = m_metaKeywordsLen;
return m_metaKeywords;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
//xml->getMetaContent ( m_metaKeywords, 1024, "keywords", 8 );
// we need to point to it in the html source so our WordPosInfo
// algo works right.
m_metaKeywords=xml->getMetaContentPointer("keywords",
8,
"name",
&m_metaKeywordsLen);
*mklen = m_metaKeywordsLen;
m_metaKeywordsValid = true;
return m_metaKeywords;
}
bool XmlDoc::addGigabits ( char *s ,
int64_t docId ,
uint8_t langId ) {
Words tmp;
// skip if none
if ( ! s ) return true;
// returns NULL with g_errno set on error
if ( ! tmp.set9 ( s , m_niceness ) ) return false;
// and weights!
//Weights we;
//if ( ! we.set ( &tmp , )
// and so does this
return addGigabits ( &tmp , docId , NULL , langId );
}
bool XmlDoc::addGigabits2 ( char *s ,
int32_t slen,
int64_t docId ,
uint8_t langId ) {
Words tmp;
// skip if none
if ( ! s ) return true;
// returns NULL with g_errno set on error
if ( ! tmp.setx ( s , slen , m_niceness ) ) return false;
// and weights!
//Weights we;
//if ( ! we.set ( &tmp , )
// and so does this
return addGigabits ( &tmp , docId , NULL , langId );
}
bool XmlDoc::addGigabits(Words *ww,int64_t docId,Sections *sections,
uint8_t langId ) {
// skip sections marked as these:
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
// get this
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// not if we don't have any identified sections
if ( sections && sections->m_numSections <= 0 ) sp = NULL;
// shortcuts
int64_t *wids = ww->m_wordIds;
char **wptrs = ww->m_words;
int32_t *wlens = ww->m_wordLens;
nodeid_t *tids = ww->m_tagIds;
int32_t nw = ww->getNumWords();
//int32_t flags;
// initial # of slots
int32_t is = 0;
if ( m_wordsValid ) is = ww->m_numAlnumWords;
// put gigabits into this hash table
HashTableX ht;
if ( ! ht.set ( 8 , sizeof(GigabitInfo),is,NULL,0,false,m_niceness,
"gigabits") )
return false;
// scan through the words
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe if being called by spider
QUICKPOLL ( m_niceness );
// skip if not alnum word
if ( ! wids[i] ) continue;
// get section
Section *sx = NULL;
// get flags
if ( sp ) sx = sp[i];//flags = sp[i]->m_flags;
//else flags = 0;
// skip if ignored. i.e. in the menu or not in the article text
//if ( flags & badFlags ) continue;
// are we capitalized?
bool cap = ww->isCapitalized(i);
// ignore lower case query stop words
if (!cap&&isQueryStopWord(wptrs[i],wlens[i],wids[i],langId))
continue;
// hash of word then the phrase
//uint32_t h = wids[i] & 0xffffffff;
//uint64_t h = wids[i];
// add the word itself. return NULL with g_errno set on error
if ( ! addGigabit (&ht,wptrs[i],wlens[i],docId,
sx,true,langId,i)) return false;
// save position
int32_t j = i + 1 ;
// check this far out
int32_t maxj = i + 12; if ( maxj > nw ) maxj = nw;
// do we got a cap phrase?
bool capPhrase = false;
// if capitalized look for sequence
for ( ; cap && j < maxj ; j++ ) {
// . stop on tags
// . tids is NULL if being set from meta tag...
if ( tids && tids[j] ) break;
// skip if not alnum
if ( ! wids[j] ) {
// make sure it is like a single space or
// something we can "phrase across"
// TODO: can be like "capt. "
if ( wlens[j] == 1 ) continue;
// otherwise it stops the phrase
break;
}
// if not capitalized stop
if ( ! ww->isCapitalized(j) ) break;
// got one!
capPhrase = true;
// . hash it into the ongoing hash
// . Speller::getPopularity() should use this same
// method so we can get popularities of the gigabits!
//h = hash32Fast ( wids[j] & 0xffffffff , h );
//h = hash64Fast ( wids[j] , h );
}
// if we added something... skip whole phrase, if any
if ( capPhrase ) {
// get length of it
int32_t len = wptrs[j-1] + wlens[j-1] - wptrs[i];
// add that entire sequence, [i,j)
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,
false,langId,i)) return false;
// advance to end of phrase
i = j - 1;
continue;
}
// reset
j = i + 1;
// this must be true
// . ok, look for a wiki phrase then!
// . we can speed this up if too slow... using a crazy hash tbl
int32_t wikij = -1;
// init the hash for wiki lookup
uint32_t h = 0;
// loop over successive terms
for ( ; j < maxj ; j++ ) {
// . stop on tags
// . tids is NULL if being set from meta tag
if ( tids && tids[j] ) break;
// skip if not alnum
if ( ! wids[j] ) {
// make sure it is like a single space or
// something we can "phrase across"
// TODO: can be like "capt. "
if ( wlens[j] == 1 ) continue;
// otherwise it stops the phrase
break;
}
// init it
if ( ! h ) h = hash32Fast ( wids[i] & 0xffffffff , 0 );
// hash it into the ongoing hash
h = hash32Fast ( wids[j] & 0xffffffff , h );
// is this in the wiki?
if ( ! g_wiki.isInWiki ( h ) ) continue;
// it is, mark it
wikij = j + 1;
}
// must be a 2+ word phrase in the wiki to be a gigabit
if ( wikij == -1 ) continue;
// bail if breach
if ( wikij >= nw ) continue;
// get len
int32_t len = wptrs[wikij] + wlens[wikij] - wptrs[i];
// add what we got
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,false,
langId,i) ) return false;
// advance to end of phrase
i = wikij - 1;
}
return true;
}
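// Illustrative example (hypothetical text): scanning "the Denver Broncos
// won the super bowl", the lowercase query stop word "the" is skipped,
// "Denver" is added as a single-word gigabit, the capitalized run
// "Denver Broncos" is added as a phrase and the scan resumes after it,
// "won", "super" and "bowl" are added as single words, and "super bowl"
// is added as a phrase only if that lowercase 2+ word phrase is found in
// the wikipedia phrase table via g_wiki.isInWiki().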
char* XmlDoc::getMetadata(int32_t* retlen) {
if(!m_hasMetadata) {
*retlen = 0;
return NULL;
}
*retlen = size_metadata;
return ptr_metadata;
}
// . this is called by Msg40.cpp to intersect gigabits from multiple docs
// . returns -1 and sets g_errno on error
// . returns # of GigabitInfos stored into "top"
/*
int32_t intersectGigabits ( Msg20 **mp , // search results
int32_t n , // # of em
uint8_t langId , // searcher's langId
int32_t maxTop ,
int32_t docsToScan ,
int32_t minDocCount , // must be in this # docs
GigabitInfo *top ,
int32_t niceness ) {
// put gigabits into this hash table
HashTableX ht;
ht.set ( 8 , sizeof(GigabitInfo),0,NULL,0,false,niceness,"ginttbl");
for ( int32_t i = 0 ; i < n && i < docsToScan ; i++ ) {
// get the reply/searchResult
Msg20Reply *mr = mp[i]->m_r;
// sanity check
if ( ! mr && ! mp[i]->m_errno ) { char *xx=NULL;*xx=0; }
// this is NULL on error
if ( ! mr ) continue;
// count them
int32_t count = 0;
// add each gigabit for it
for ( char *p = mr->ptr_gigabitQuery ; p && *p ; count++ ) {
// skip the comma
p++;
// point to next
char *end = strchr ( p , ',' );
// do not allow NULLs
if ( ! end ) end = p + gbstrlen(p);
// get the score. aka GigabitInfo::m_pts
int32_t ptsArg = mr->ptr_gigabitScores[count];
// sanity check for bad scores
if ( ptsArg <= 0 ) { char *xx=NULL;*xx=0; }
// add it in
if ( ! addGigabit ( &ht ,
p ,
end - p , // length
mr->m_docId ,
NULL ,// section ptr
false , // singleWrd? unused
langId ,
-1 , // word #i not used
ptsArg ) )
return -1;
// advance p
p = end;
// if not comma, all done
if ( *p != ',' ) break;
// skip comma
p++;
}
}
// . get up to the top 50 gigabits
GigabitInfo *array [ 50 ];
int32_t numTop = getTopGigabits ( &ht , array , 50 , minDocCount );
// error? g_errno should be set
if ( numTop == -1 ) return -1;
// sanity check
if ( numTop > maxTop ) { char *xx=NULL;*xx=0; }
// now copy into our array
for ( int32_t i = 0 ; i < numTop ; i++ ) {
// get it
GigabitInfo *gi = array[i];
// copy it
gbmemcpy ( &top[i] , gi , sizeof(GigabitInfo) );
}
// return how many we copied
return numTop;
}
*/
// . "docId" is the document Id that "h" came from
// . if being called at query time we often get called on each search result!
// . if being called at parse/index time we are being called on a single docId
// . returns false and sets g_errno on error
bool addGigabit ( HashTableX *ht ,
char *s ,
int32_t slen ,
int64_t docId ,
Section *sp ,
bool singleWord ,
uint8_t langId ,
// starts with word #i
int32_t i ,
int32_t ptsArg ) {
// get its hash
uint64_t h = hash64d ( s , slen );
// get the slot where its at
int32_t slot = ht->getSlot ( &h );
// info for this hash/gigabit in the doc
GigabitInfo *gi ;
// otherwise, init a new slot. set the key to h
if ( slot < 0 ) {
// . add key to a new slot, set "gi" to the value ptr
// . use NULL for the GigabitInfo ptr temporarily so it should
// not gbmemcpy into the slot
if ( ! ht->addKey ( &h , NULL , &slot ) ) return false;
// get data ptr to the bogus data
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
// . set all the stuff now. this way avoids a gbmemcpy...
// . every wiki title should have a popularity i guess...
// . "pop" is # of docs out of 10,000 that have this phrase?
int32_t pop = g_speller.getPhrasePopularity(s,h,true,langId);
gi->m_pop = pop;
gi->m_pts = 0;
gi->m_count = 0;
gi->m_numDocs = 0;
gi->m_lastDocId = 0LL;
gi->m_currentDocCount = 0; // a char
gi->m_ptr = s;
gi->m_len = slen;
gi->m_hash = h;
// sanity test
GigabitInfo *tt = (GigabitInfo *)ht->getValue ( &h );
if ( tt->m_pop != pop ) { char *xx=NULL;*xx=0; }
}
else {
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
// only allow up to 5 votes per document!
if ( gi->m_currentDocCount >= 5 ) return true;
}
// inc the count, we got one more occurrence
gi->m_count++;
// doc count. how many docs have this gigabit? count it.
if ( docId != gi->m_lastDocId ) {
gi->m_numDocs++;
gi->m_lastDocId = docId;
gi->m_currentDocCount = 1;
}
else
gi->m_currentDocCount++;
// given?
if ( ptsArg != -1 ) {
gi->m_pts += ptsArg;
return true;
}
// base points on popularity
float pts = 1.0;
if ( gi->m_pop < 1 ) pts = 1000;
else if ( gi->m_pop < 2 ) pts = 500;
else if ( gi->m_pop < 3 ) pts = 250;
else if ( gi->m_pop < 4 ) pts = 200;
else if ( gi->m_pop < 5 ) pts = 150;
else if ( gi->m_pop < 6 ) pts = 100;
else if ( gi->m_pop < 7 ) pts = 20;
else if ( gi->m_pop < 8 ) pts = 10;
else if ( gi->m_pop < 10 ) pts = 5;
else if ( gi->m_pop < 15 ) pts = 3;
else if ( gi->m_pop < 20 ) pts = 2;
// . special boost if in title, header or anchor tag
// . the weights class ONLY boosts the first 20 or so words in
// header tags... how can we fix that??????????????????
// . TODO: FIX THAT!!!
//if ( flags & SEC_TITLE ) pts = pts * 6.0/(float)we->m_titleWeight;
//if ( flags & SEC_HEADER) pts = pts * 4.0/(float)we->m_headerWeight;
//if ( flags & SEC_A ) pts = pts * 4.0/(float)we->m_linkTextWeight;
if ( sp ) {
if ( sp->m_flags & SEC_IN_TITLE ) pts = pts * 6.0;
if ( sp->m_flags & SEC_IN_HEADER ) pts = pts * 4.0;
if ( sp->m_tagId == TAG_A ) pts = pts * 4.0;
}
// if for the query 'recreation' you get the phrase "park bench"
// 100 times and the word "bench" 100 times. the word weight
// for "bench" should be very low! Weights.cpp also demotes repreated
// sentence fragments, etc. it is generally a really handy thing!
// and i think it already boosts scores for being in the title, etc.
// IF BEING called from meta tag, weights are NULL!
// TODO: we need to use the diversity vector here then...
//if ( we ) {
// if ( singleWord ) pts *= we->m_ww[i];
// else pts *= we->m_pw[i];
//}
// add them in
gi->m_pts += (int32_t)pts;
// good to go
return true;
}
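// Worked example (hypothetical values): a gigabit whose popularity comes
// back as 3 gets a base of 200 points per occurrence; if its section is
// in the title (SEC_IN_TITLE) that is boosted 6x to 1200, and inside an
// anchor tag (TAG_A) it would instead be boosted 4x to 800. The points
// accumulate into gi->m_pts, with at most 5 occurrences counted per
// document.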
/*
-- this will be a url filter var like "numindexed"
int32_t *XmlDoc::getSiteSpiderQuota ( ) {
if ( m_siteSpiderQuotaValid ) return &m_siteSpiderQuota;
int32_t *siteNumInlinks = getSiteNumInlinks();
if ( ! siteNumInlinks ) return NULL;
if ( siteNumInlinks == (int32_t *)-1 ) return (int32_t *)-1;
// get this fresh each time
int32_t *rn = getRegExpNum ( -1 );
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
// bail early? this happens if we match a banned/filtered rule in
// the url filters table
if ( m_indexCode ) return NULL;
// valid at this point
m_siteSpiderQuotaValid = true;
// if no match, or filtered or banned, assume no quota
if ( *rn == -1 ) m_siteSpiderQuota = -1;
else m_siteSpiderQuota = cr->m_spiderQuotas[*rn];
// get the quota, -1 means no limit
return &m_siteSpiderQuota;
}
*/
Url *XmlDoc::getCurrentUrl ( ) {
if ( m_currentUrlValid ) return &m_currentUrl;
// otherwise, get first url
Url *fu = getFirstUrl();
if ( ! fu || fu == (void *)-1 ) return (Url *)fu;
// make that current url
m_currentUrl.set ( &m_firstUrl , false );
m_currentUrlValid = true;
return &m_currentUrl;
/*
// need a valid url
Url *u = getFirstUrl();
if ( ! u ) return NULL;
// but use redir if we got that
Url *r = getRedirUrl();
if ( r && m_redirUrlValid ) return r;
return u;
*/
}
Url *XmlDoc::getFirstUrl() {
if ( m_firstUrlValid ) return &m_firstUrl;
// we might have a title rec
if ( m_setFromTitleRec ) {
setFirstUrl ( ptr_firstUrl , false );
m_firstUrlValid = true;
return &m_firstUrl;
}
// must be this otherwise
if ( ! m_setFromDocId ) { char *xx=NULL;*xx=0; }
// this must be valid
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (Url *)pod;
// shortcut
XmlDoc *od = *pod;
// now set it
setFirstUrl ( od->ptr_firstUrl , false );
m_firstUrlValid = true;
return &m_firstUrl;
}
int64_t XmlDoc::getFirstUrlHash48() {
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
// this must work
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( getUseTimeAxis() ) {
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
return m_firstUrlHash48;
}
m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
return m_firstUrlHash48;
}
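// Illustrative note (not part of the original code): the mask
// 0x0000ffffffffffffLL keeps only the low 48 bits of the 64-bit url hash,
// e.g. 0x1234567890abcdefLL becomes 0x0000567890abcdefLL, giving the
// 48-bit url hash used elsewhere by the spider code.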
int64_t XmlDoc::getFirstUrlHash64() {
if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
// this must work
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( getUseTimeAxis() ) {
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
}
m_firstUrlHash64 = hash64b ( m_firstUrl.m_url );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
}
Url **XmlDoc::getLastRedirUrl() {
Url **ru = getRedirUrl();
if ( ! ru || ru == (void *)-1 ) return ru;
// m_redirUrlPtr will be NULL once there is no further redirect to
// follow, however, the last redir url we actually got will be set in
// m_redirUrl.m_url so return that.
m_lastRedirUrlPtr = &m_redirUrl;
return &m_lastRedirUrlPtr;
}
// . operates on the latest m_httpReply
Url **XmlDoc::getRedirUrl() {
if ( m_redirUrlValid ) return &m_redirUrlPtr;
setStatus ( "getting redir url" );
// assume no redirect
m_redirUrlPtr = NULL;
//ptr_redirUrl = NULL;
//size_redirUrl = 0;
// bail on this
//if ( ! m_checkForRedir ) {
// m_redirError = 0;
// m_redirErrorValid = true;
// return &m_redirUrlPtr;
//}
// we might have a title rec
if ( m_setFromTitleRec ) { char *xx=NULL;*xx=0; }
// or recycling content from old title rec
if ( m_recycleContent ) {
m_redirError = 0;
m_redirErrorValid = true;
m_redirUrlValid = true;
return &m_redirUrlPtr;
}
// get the current http reply, not the final http reply necessarily
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// set a mime on the stack
HttpMime mime;
// shortcut
int32_t LEN = m_httpReplySize - 1;
// sanity check
if ( LEN > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
// empty reply, no redir
if ( LEN == 0 ) {
// bad mime, but i guess valid empty redir url
m_redirUrlValid = true;
// no error
m_redirError = 0;
m_redirErrorValid = true;
// return a fake thing. content length is 0.
return &m_redirUrlPtr;
}
// set it. if 'connection refused' then LEN is -1.
if ( LEN<0 || ! mime.set ( m_httpReply, LEN, getCurrentUrl() ) ) {
// set this on mime error
//if ( ! m_indexCode ) m_indexCode = EBADMIME;
// bad mime, but i guess valid empty redir url
m_redirUrlValid = true;
// return nothing, no redirect url was there
m_redirUrlPtr = NULL;
// no error
m_redirError = 0;
m_redirErrorValid = true;
// return a fake thing. content length is 0.
return &m_redirUrlPtr;
}
int32_t httpStatus = mime.getHttpStatus() ;
Url *loc = NULL;
// quickly see if we are a robots.txt url originally
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
//
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
// if httpStatus is not a redirect
//
if ( httpStatus < 300 || httpStatus > 399 ) {
// ok, crap, i was getting the xml here to get the meta
// http-equiv refresh tag, but that added an element of
// recursion that is just too confusing to deal with. so
// let's just parse out the meta tag by hand
bool checkMeta = true;
if ( isRobotsTxt ) checkMeta = false;
// if we are a doc that consists of a sequence of sub-docs that
// we are indexing/injecting then don't do this check.
if ( isContainerDoc() ) checkMeta = false;
if ( checkMeta ) {
Url **mrup = getMetaRedirUrl();
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
// set it. might be NULL if not there.
loc = *mrup;
}
}
else
// get Location: url (the redirect url) from the http mime
loc = mime.getLocationUrl();
// get current url
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Url **)cu;
// this call set size_catIds
int32_t **pcids = getCatIds();
if ( ! pcids || pcids == (void *)-1) return (Url **)pcids;
// get local link info
LinkInfo *info1 = getLinkInfo1();
// error or blocked
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
// get remote link info
LinkInfo **pinfo2 = getLinkInfo2();
// error or blocked
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (Url **)pinfo2;
// convenience
LinkInfo *info2 = *pinfo2;
// breathe
QUICKPOLL(m_niceness);
// did we send a cookie with our last request?
bool sentCookieLastTime = false;
if ( m_redirCookieBuf.length() )
sentCookieLastTime = true;
// get cookie for redirect to fix nyt.com/nytimes.com
// for gap.com it uses multiple Set-Cookie:\r\n lines so we have
// to accumulate all of them into a buffer now
m_redirCookieBuf.reset();
mime.addCookiesIntoBuffer ( &m_redirCookieBuf );
m_redirCookieBufValid = true;
/*
char *cookie = mime.getCookie();
// find end of cookie at the semicolon
char *s = cookie;
for ( ; s && *s && *s != ';' ; s++ );
if ( s && *s == ';' ) {
// do not include ;
int32_t clen = s - cookie;
m_redirCookieBuf.reset();
m_redirCookieBuf.safeMemcpy ( cookie , clen );
m_redirCookieBuf.nullTerm();
m_redirCookieBufValid = true;
}
*/
// mdw23
//log("http: reply=%s",m_httpReply);
// a hack for removing session ids already in there. for
// brilliantshopper's bs4 collection and gk0 cluster
//bool forceRedirect = false;
if ( size_catIds == 0 &&
// must not have an actual redirect url in there
! loc &&
// must be a valid http status
httpStatus == 200 &&
(gb_strcasestr( cu->getUrl(), "sessionid") ||
gb_strcasestr( cu->getUrl(), "oscsid") ) ) {
Url *tt = &m_redirUrl;
tt->set ( cu->getUrl() ,
cu->getUrlLen() ,
true , // addwww?
true ); // strip sessid?
// if it no longer has the session id, force redirect it
if ( ! gb_strcasestr( tt->getUrl(), "sessionid") &&
! gb_strcasestr( tt->getUrl(), "oscsid") ) {
m_redirUrlValid = true;
m_redirUrlPtr = &m_redirUrl;
// TODO: log redir url in spider log output
//logf(LOG_INFO,"build: %s force redirected to %s",
// cu->getUrl(),m_redirUrl.getUrl());
m_redirUrlValid = true;
ptr_redirUrl = m_redirUrl.m_url;
size_redirUrl = m_redirUrl.m_ulen+1;
// no error
m_redirError = 0;
m_redirErrorValid = true;
return &m_redirUrlPtr;
}
}
// breathe
QUICKPOLL(m_niceness);
// if no location url, then there is no redirect; return a NULL redir url
if ( ! loc || loc->m_url[0] == '\0' ) {
// validate it
m_redirUrlValid = true;
// no error
m_redirError = 0;
m_redirErrorValid = true;
// and return an empty one
return &m_redirUrlPtr;
}
// breathe
QUICKPOLL(m_niceness);
// this is handy
//Url tmp;
// TODO: make sure we got this logic elsewhere
// if robots.txt said no, and if we had no link text, then give up
//if(! *isAllowed && !info1->hasLinkText() && !info2->hasLinkText() ) {
// m_indexCode = EDOCDISALLOWED;
// set our redir url from the mime's Location: field. addWWW=false
//if ( loc != &tmp ) tmp.set ( loc , false );
bool keep = false;
if ( size_catIds > 0 ) keep = true;
if ( info1->hasLinkText() ) keep = true;
if ( info2 && info2->hasLinkText() ) keep = true;
// at this point we do not block anywhere
m_redirUrlValid = true;
// store the redir error
m_redirError = 0;
m_redirErrorValid = true;
// i've seen a "Location: 2010..." bogus url as well, so make sure
// we got a legit url
if ( ! loc->getDomain() || loc->getDomainLen() <= 0 ) {
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
//bool injected = false;
// get from spider request if there
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// . if redirect url is nothing new, then bail (infinite loop)
// . www.xbox.com/SiteRequirements.htm redirects to itself
// until you send a cookie!!
// . www.twomileborris.com does the cookie thing, too
if ( strcmp ( cu->getUrl(), loc->getUrl() ) == 0 ) {
// try sending the cookie if we got one now and didn't have
// one for this last request
if ( ! sentCookieLastTime && m_redirCookieBuf.length() ) {
m_redirUrl.set ( loc->getUrl() );
m_redirUrlPtr = &m_redirUrl;
return &m_redirUrlPtr;
}
if ( ! keep ) m_redirError = EDOCREDIRECTSTOSELF;
return &m_redirUrlPtr;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . don't allow redirects when injecting!
// . otherwise, we would mfree(m_buf) which would free our
// injected reply... yet m_injectedReplyLen would still be
// positive! can you say 'seg fault'?
// . hmmm... seems to have worked though
if ( cr->m_recycleContent || m_recycleContent ) { // || injected
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here
if ( ++m_numRedirects >= 10 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// sometimes idiots don't supply us with a Location: mime
if ( loc->getUrlLen() == 0 ) {
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
// . protocol of url must be http or https
// . we had one url redirect to an ihttp:// protocol and caused
// spider to core dump when it saw that SpiderRequest record
char *proto = loc->getScheme();
if ( strncmp(proto,"http://" ,7) && strncmp(proto,"https://",8) ) {
m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
// do not allow redirects to evil-G or bing
//if ( strstr(loc->getUrl(),".google.com/") ||
// strstr(loc->getUrl(),".bing.com/") ) {
// m_redirError = EDOCEVILREDIRECT;
// return &m_redirUrlPtr;
//}
// log a msg
if ( g_conf.m_logSpideredUrls )
logf(LOG_INFO,"build: %s redirected to %s",
cu->getUrl(),loc->getUrl());
// if not same Domain, it is not a simplified redirect
bool sameDom = true;
int32_t dlen = loc->getDomainLen();
if ( cu->getDomainLen() != dlen ) sameDom=false;
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
if ( ! sameDom ) {
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
size_redirUrl = m_redirUrl.m_ulen+1;
return &m_redirUrlPtr;
}
// if redirecting to the same domain, then do not add "www.".
// this way we can take care of slashdot.org, etc.
//bool addwww = false;
// but never modify if in dmoz, keep it pure
//if ( size_catIds > 0 ) addwww = false;
// debug msg
//if ( strcmp(m_redirUrl.getUrl(),url->getUrl())== 0 )
// log("Redirect error: same url");
//bool stripSessId = (size_catIds == 0);
// . reset m_redirUrl now (do not addWWW for slashdot.org, etc)
// . we now add "www." UNLESS it's a redirect from the same
// domain or firstUrl is in catdb
//tmp.set( loc->getUrl(),loc->getUrlLen(),addwww,stripSessId);
/*
// get this
bool sameHostLinks = false;
if ( *pi >= 0 ) sameHostLinks =cr->m_pq_spiderSameHostnameLinks[*pi];
// get first url ever
Url *f = getFirstUrl();
// . for same host links, addwww for comparing
// . so if we are doing google.com and it redirects to
// www.google.com then we will allow that... and vice versa
if ( sameHostLinks ) {
Url u1;
Url u2;
u1.set ( loc->getUrl () , loc->getUrlLen(), true ); // addwww?
u2.set ( f->getUrl() , f->getUrlLen () , true ); // addwww?
// host must match if we are restricted to a particular host
if ( u1.getHostLen() != u2.getHostLen() ||
strncmp ( u1.getHost() , u2.getHost() ,
u1.getHostLen () ) != 0 ) {
m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
}
*/
// get first url ever
Url *f = getFirstUrl();
// breathe
QUICKPOLL(m_niceness);
// set this to true if the redirected urls is much preferred
bool simplifiedRedir = false;
// . if it redirected to a simpler url then stop spidering now
// and add the simpler url to the spider queue
// . by simpler, i mean one w/ fewer path components
// . or one with a www for hostname
// . or could be same as firstUrl but with a / appended
char *r = loc->getUrl();
char *u = f->getUrl();
int32_t rlen = loc->getUrlLen();
int32_t ulen = f->getUrlLen();
// simpler if new path depth is shorter
if ( loc->getPathDepth (true) < f->getPathDepth (true) )
simplifiedRedir = true;
// simpler if old has cgi and new does not
if ( f->isCgi() && ! loc->isCgi() )
simplifiedRedir = true;
// if we're a dmoz page, don't do this, unless just a / case,no
if ( size_catIds > 0 )
simplifiedRedir = false;
// simpler if new one is same as old but has a '/' at the end
if ( rlen == ulen+1 && r[rlen-1]=='/' && strncmp(r,u,ulen)==0)
simplifiedRedir = true;
// . if new url does not have semicolon but old one does
// . http://news.yahoo.com/i/738;_ylt=AoL4eFRYKEdXbfDh6W2cF
// redirected to http://news.yahoo.com/i/738
if ( strchr (u,';') && ! strchr (r,';') )
simplifiedRedir = true;
// simpler if new host is www and old is not
if ( loc->isHostWWW() && ! f->isHostWWW() )
simplifiedRedir = true;
// if redirect is to different domain, set simplified
// this helps locks from bunching on one domain
if ( loc->getDomainLen()!=f->getDomainLen() ||
strncasecmp ( loc->getDomain(),
f->getDomain(),
loc->getDomainLen() ) != 0 )
// crap, but www.hotmail.com redirects to live.msn.com
// login page ... so add this check here
if ( ! f->isRoot() )
simplifiedRedir = true;
bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
// follow redirects if injecting so we do not return
// EDOCSIMPLIFIEDREDIR
if ( getIsInjecting ( ) )
allowSimplifiedRedirs = true;
// or if disabled then follow the redirect
if ( ! cr->m_useSimplifiedRedirects )
allowSimplifiedRedirs = true;
// . if the redir url is simpler, but has no hostname we
// prepend a "www." to it
// . this should prevent www.russ.ru and russ.ru from being
// in the index at the same time and causing url: collisions
/*
if ( size_catIds == 0 &&
simplifiedRedir &&
loc->getDomainLen() == loc->getHostLen () )
loc->set (loc->getUrl(),
loc->getUrlLen(),
true, //false, addwww?
stripSessId );
*/
// if not allow, do not do them... except for the two below
//if ( ! m_useSimplifiedRedirects || m_isDirColl )
// simplifiedRedir = false;
// special hack for nytimes.com. do not consider simplified redirs
// because it uses a cookie along with redirs to get to the final
// page.
char *dom2 = m_firstUrl.getDomain();
int32_t dlen2 = m_firstUrl.getDomainLen();
if ( dlen2 == 11 && strncmp(dom2,"nytimes.com",dlen2)==0 )
allowSimplifiedRedirs = true;
// same for bananarepublic.gap.com ?
// if ( dlen2 == 7 && strncmp(dom2,"gap.com",dlen2)==0 )
// allowSimplifiedRedirs = true;
// if redirect is setting cookies we have to follow the redirect
// all the way through so we can stop now.
if ( m_redirCookieBufValid && m_redirCookieBuf.getLength() )
allowSimplifiedRedirs = true;
// . don't bother indexing this url if the redir is better
// . 301 means moved PERMANENTLY...
// . many people use 301 on their root pages though, so treat
// it like a temporary redirect, like exclusivelyequine.com
if ( simplifiedRedir && ! allowSimplifiedRedirs &&
// for custom BULK clients don't like this i guess
// AND for custom crawl it was messing up the processing
// url format for a nytimes blog subsite which was redirecting
// to the proper nytimes.com site...
// ! cr->m_isCustomCrawl ) {
// no, we need this for custom crawls because otherwise we
// get too many dups in the index. so for nyt we need something
// else
cr->m_isCustomCrawl != 2 ) {
// returns false if blocked, true otherwise
//return addSimplifiedRedirect();
m_redirError = EDOCSIMPLIFIEDREDIR;
// set this because getLinks() treats this redirUrl
// as a link now, it will add a SpiderRequest for it:
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
// mdw: let this path through so contactXmlDoc gets a proper
// redirect that we can follow. for the base xml doc at
// least the m_indexCode will be set
return &m_redirUrlPtr;
}
// good to go
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
size_redirUrl = m_redirUrl.m_ulen+1;
return &m_redirUrlPtr;
}
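// Illustrative example (hypothetical urls): if http://example.com/a/b?x=1
// redirects to http://www.example.com/a/, the new url has a shorter path,
// drops the cgi parameters and gains a "www." hostname, so simplifiedRedir
// is set. Unless simplified redirects are allowed (injection, cookies
// being set, nytimes.com, etc.) the doc gets EDOCSIMPLIFIEDREDIR and the
// redirect url is left for getLinks() to add as its own SpiderRequest.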
int32_t *XmlDoc::getFirstIndexedDate ( ) {
if ( m_firstIndexedDateValid ) return (int32_t *)&m_firstIndexedDate;
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
// valid
m_firstIndexedDateValid = true;
// must be downloaded
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// assume now is the first time
m_firstIndexedDate = getSpideredTime();//m_spideredTime;
// inherit from our old title rec
if ( *od ) m_firstIndexedDate = (*od)->m_firstIndexedDate;
// return it
return (int32_t *)&m_firstIndexedDate;
}
int32_t *XmlDoc::getOutlinksAddedDate ( ) {
if ( m_outlinksAddedDateValid ) return (int32_t *)&m_outlinksAddedDate;
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
// valid
m_outlinksAddedDateValid = true;
// must be downloaded
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// assume we are doing it now
m_outlinksAddedDate = getSpideredTime();//m_spideredTime;
// get that
if ( *od ) m_outlinksAddedDate = (*od)->m_outlinksAddedDate;
// return it
return (int32_t *)&m_outlinksAddedDate;
}
/*
int32_t *XmlDoc::getNumBannedOutlinks ( ) {
if ( m_numBannedOutlinksValid ) return &m_numBannedOutlinks;
setStatus ( "getting num banned outlinks" );
// get the outlinks
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
// count em
int32_t n = links->getNumLinks();
// reset
m_numBannedOutlinks = 0;
// one vote per domain hash table
char buf[20000];
HashTableX ht; ht.set ( 4 , 0 , -1 , buf , 20000 ,false,m_niceness);
// loop through them
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the link
char *u = links->getLinkPtr(i);
// get domain of the link
int32_t dlen; char *dom = getDomFast ( u , &dlen , false );
// skip if bad domain
if ( ! dom || dlen <= 0 ) continue;
// get domHash
int32_t h = hash32 ( dom , dlen );
// one check per domain
if ( ht.getSlot ( &h ) >= 0 ) continue;
// add it, return NULL on error, g_errno should be set
if ( ! ht.addKey ( &h ) ) return NULL;
// . loop over all regular expression in the url filters table
// . stop at first regular expression it matches
int32_t *rn = getRegExpNum2 ( i );
// need to wait for a callback at this point
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
// skip if no match in url filters table
if ( *rn == -1 ) continue;
// get spider priority
int32_t pr = cr->m_spiderPriorities[*rn];
// skip if not banned
if ( pr != -2 ) continue;
// count it
m_numBannedOutlinks++;
}
// all done
m_numBannedOutlinksValid = true;
// convert this too!
//m_numBannedOutlinks8 = score32to8 ( m_numBannedOutlinks );
// sanity check on score32to8()
//if(m_numBannedOutlinks8>0&&!m_numBannedOutlinks){char*xx=NULL;*xx=0;}
return &m_numBannedOutlinks;
}
*/
uint16_t *XmlDoc::getCountryId ( ) {
if ( m_countryIdValid ) return &m_countryId;
setStatus ( "getting country id" );
// get it
CatRec *cat = getCatRec ();
if ( ! cat || cat == (CatRec *)-1) return (uint16_t *)cat;
// MDW: i limit this to 10 to save stack space!
Url *u = getCurrentUrl();
if ( ! u || u == (void *)-1) return (uint16_t *)u;
// use the url's tld to guess the country
uint16_t country = g_langId.guessCountryTLD ( u->getUrl ( ) );
// . 0 means no country i guess. try dmoz next.
// . limit to 10 of them
int32_t nc = cat->m_numCatids;
for ( int32_t i = 0; ! country && i < nc && i < 10 ; i++) {
int32_t catid = cat->m_catids[i];
country = g_countryCode.getCountryFromDMOZ ( catid );
}
m_countryIdValid = true;
m_countryId = country;
return &m_countryId;
}
/*
XmlDoc *XmlDoc::getOldDoc ( ) {
if ( m_oldDocValid ) return &m_oldDoc;
// get current url
Url *u = getCurrentUrl();
// set its url otherwise
m_oldDoc.setFirstUrl ( u , false );
// get the old title rec
char *ret = getOldTitleRec();
if ( ! ret || ret == (char *)-1 ) return (XmlDoc *)ret;
// all done
m_oldDocValid = true;
// return it
return m_oldDoc;
}
*/
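// . get the language id of this url's site root page
// . if we ARE the root just use our own langId
// . otherwise check tagdb for a "rootlang" tag first; if that is missing,
//   get the root doc (possibly downloading it) and use its langId
// . if the root doc is unavailable (e.g. when injecting, to avoid the
//   download) assume langUnknown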
uint8_t *XmlDoc::getRootLangId ( ) {
// return it if we got it
if ( m_rootLangIdValid ) return &m_rootLangId;
// note it
setStatus ( "getting root lang id from tagdb");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
	// if this doc IS the site root then the root lang is just our own
	// lang id
if ( *isRoot ) {
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 )
return (uint8_t *) langId;
m_rootLangId = *langId;
m_rootLangIdValid = true;
return &m_rootLangId;
//char *xx=NULL;*xx=0; }
}
// get the tag rec
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
// just use one. there may be multiple ones!
Tag *tag = gr->getTag("rootlang");
// if there use that
if ( ! tag ) {
// . get the root doc
// . allow for a one hour cache of the titleRec
XmlDoc **prd = getRootXmlDoc( 3600 );
if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
		// shortcut
XmlDoc *rd = *prd;
// . if no root doc, then assume language unknown
// . this happens if we are injecting because we do not want
// to download the root page for speed purposes
if ( ! rd ) {
m_rootLangId = langUnknown;
m_rootLangIdValid = true;
return &m_rootLangId;
}
// . update tagdb rec
// . on root download error use language "xx" (unknown) to
// avoid hammering the root page
//bool *status = rd->updateRootLangId ();
//if (! status || status==(void *)-1) return (uint8_t *)status;
// update our tag rec now
//Tag *tt = rd->m_newTagRec.getTag("rootlang");
// must be there
//if ( ! tt ) { char *xx=NULL;*xx=0; }
// add it for us
//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
// get it
uint8_t *rl = rd->getLangId();
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
// must be legit now!
if ( ! rd->m_langIdValid ) { char *xx=NULL;*xx=0;}
// now validate our stuff
m_rootLangIdValid = true;
//m_rootLangIdScore = rd->m_langIdScore;
m_rootLangId = rd->m_langId;
return &m_rootLangId;
}
	// sanity check ( must be like "en,50\0" or could be
	// "en_US,50\0" or "zh_cn,50" )
if ( tag->getTagDataSize() > 6 ) { char *xx=NULL;*xx=0; }
// point to 2 character language abbreviation
char *abbr = tag->getTagData();
/*
// find comma
char *comma = strchr(abbr,',' );
// sanity check
if ( ! comma ) { char *xx=NULL;*xx=0; }
// tmp NULL
*comma = '\0';
*/
// map it to an id
uint8_t langId = getLangIdFromAbbr( abbr );
/*
// put it back
*comma = ',';
// get score
int32_t score = atol(comma+1);
// sanity check
if ( score < 0 || score > 100 ) { char *xx=NULL;*xx=0; }
*/
// set that up
m_rootLangId = langId;
//m_rootLangIdScore = score;
m_rootLangIdValid = true;
return &m_rootLangId;
}
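// . load the previously-indexed version of this url from titledb, if any
// . a NULL *m_oldDoc with m_oldDocValid set means there was no old version
// . if the titlerec found has our docid but a different url hash48 then we
//   hit a docid collision and bail with EDOCIDCOLLISION
// . a corrupted old titlerec is treated as "not found" so the url can
//   still be indexed instead of erroring out forever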
XmlDoc **XmlDoc::getOldXmlDoc ( ) {
if ( m_oldDocValid ) return &m_oldDoc;
// note it
setStatus ( "getting old xml doc");
// if we are set from a title rec, we are the old doc
if ( m_setFromTitleRec ) {
m_oldDocValid = true;
m_oldDoc = NULL;//this;
return &m_oldDoc;
}
// . cache age is 0... super fresh
// . returns NULL w/ g_errno if not found unless isIndexed is false
// and valid, and it is not valid for pagereindexes.
char **otr = getOldTitleRec ( );
if ( ! otr || otr == (char **)-1 ) return (XmlDoc **)otr;
// if no title rec, return ptr to a null
m_oldDoc = NULL;
if ( ! *otr ) { m_oldDocValid = true; return &m_oldDoc; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if provided title rec matches our docid but not uh48 then there
// was a docid collision and we should null out our title rec
// and return with an error and no index this puppy!
// crap, we can't call getFirstUrl() because it might not be
// valid if we are a docid based doc and THIS function was called
// from getFirstUrl() -- we end up in a recursive loop.
if ( ! m_setFromDocId ) {
//int64_t uh48 = getFirstUrl()->getUrlHash48();
int64_t uh48 = getFirstUrlHash48();
int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr );
if ( uh48 != tuh48 ) {
log("xmldoc: docid collision uh48 mismatch. cannot "
"index "
"%s",getFirstUrl()->getUrl() );
g_errno = EDOCIDCOLLISION;
return NULL;
}
}
// . if *otr is NULL that means not found
// . return a NULL old XmlDoc in that case as well?
// . make a new one
// . this will uncompress it and set ourselves!
try { m_oldDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_oldDoc , sizeof(XmlDoc),"xmldoc1");
// debug the mem leak
// log("xmldoc: xmldoc1=%"PTRFMT" u=%s"
// ,(PTRTYPE)m_oldDoc
// ,m_firstUrl.getUrl());
// if title rec is corrupted data uncompress will fail and this
// will return false!
if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
m_oldTitleRecSize , // maxSize
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
log("build: failed to set old doc for %s",m_firstUrl.m_url);
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
int32_t saved = g_errno;
// ok, fix the memleak here
mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
delete ( m_oldDoc );
m_oldDocExistedButHadError = true;
//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
m_oldDoc = NULL;
g_errno = saved;
// MDW: i removed this on 2/8/2016 again so the code below
// would execute.
//return NULL; //mdwmdwmdw
// if it is data corruption, just assume empty so
// we don't stop spidering a url because of this. so we'll
// think this is the first time indexing it. otherwise
// we get "Bad cached document" in the logs and the
// SpiderReply and it never gets re-spidered because it is
// not a 'temporary' error according to the url filters.
log("build: treating corrupted titlerec as not found");
g_errno = 0;
m_oldDoc = NULL;
m_oldDocValid = true;
return &m_oldDoc;
}
m_oldDocValid = true;
// share our masterloop and state!
m_oldDoc->m_masterLoop = m_masterLoop;
m_oldDoc->m_masterState = m_masterState;
return &m_oldDoc;
}
void XmlDoc::nukeDoc ( XmlDoc *nd ) {
// skip if empty
if ( ! nd ) return;
// debug the mem leak
// if ( nd == m_oldDoc )
// log("xmldoc: nuke xmldoc1=%"PTRFMT" u=%s this=%"PTRFMT""
// ,(PTRTYPE)m_oldDoc
// ,m_firstUrl.getUrl()
// ,(PTRTYPE)this
// );
// do not nuke yerself!
if ( nd == this ) return;
// or root doc!
//if ( nd == m_rootDoc ) return;
// nuke it
mdelete ( nd , sizeof(XmlDoc) , "xdnuke");
delete ( nd );
// invalidate
if ( nd == m_extraDoc ) {
m_extraDocValid = false;
m_extraDoc = NULL;
}
if ( nd == m_rootDoc ) {
m_rootDocValid = false;
m_rootDoc = NULL;
}
if ( nd == m_oldDoc ) {
m_oldDocValid = false;
m_oldDoc = NULL;
}
if ( nd == m_ahrefsDoc ) {
m_ahrefsDocValid = false;
m_ahrefsDoc = NULL;
}
}
static LinkInfo s_dummy;
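// . make a child XmlDoc for downloading some other url on behalf of this
//   doc. the caller supplies the url and a max cache age for msg13.
// . s_dummy above is an empty LinkInfo that the child's linkInfo1/2 point
//   at, so getRedirectUrl() skips the unnecessary link info and dmoz catid
//   lookups it would otherwise do
// . the child shares our masterLoop/masterState and our firstIp so msg13
//   forwards its download requests to the same host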
XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
if ( m_extraDocValid ) return &m_extraDoc;
// note that
setStatus ( "getting new doc" );
// we need a valid first ip first!
//int32_t *pfip = getFirstIp();
//if ( ! pfip || pfip == (void *)-1 ) return (XmlDoc **)pfip;
// must be NULL
if ( m_extraDoc ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ! u || ! u[0] ) { char *xx=NULL;*xx=0; }//return &m_extraDoc;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . if *otr is NULL that means not found
// . return a NULL old XmlDoc in that case as well?
// . make a new one
// . this will uncompress it and set ourselves!
try { m_extraDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_extraDoc , sizeof(XmlDoc),"xmldoc2");
// . if we did not have it in titledb then download it!
// . or if titleRec was too old!
// a spider rec for the extra doc to use
SpiderRequest sreq;
// clear it
sreq.reset();
// spider the url "u"
strcpy ( sreq.m_url , u );
// inherit page parser
sreq.m_isPageParser = getIsPageParser();
// set the data size right
sreq.setDataSize();
// . prepare to download it, set it up
// . returns false and sets g_errno on error
if ( ! m_extraDoc->set4 ( &sreq ,
NULL , // doledbkey ptr
cr->m_coll ,
NULL , // SafeBuf
m_niceness ))
return NULL;
// share our masterloop and state!
m_extraDoc->m_masterLoop = m_masterLoop;
m_extraDoc->m_masterState = m_masterState;
// carry this forward always!
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
// disable spam check because that is not necessary for this doc!
m_extraDoc->m_spamCheckDisabled = true;
	// tell msg13 to get this from its robots.txt cache if it can. it also
// keeps a separate html page cache for the root pages, etc. in case
m_extraDoc->m_maxCacheAge = maxCacheAge;
// a dummy thing
s_dummy.m_numStoredInlinks = 0;
s_dummy.m_numGoodInlinks = 0;
// we indirectly call m_extraDoc->getHttpReply() which calls
// m_extraDoc->getRedirectUrl(), which checks the linkInfo and
// dmoz catids of the original url to see if we should set m_indexCode
// to something bad or not. to avoid these unnecessary lookups we
// set these to NULL and validate them
m_extraDoc->ptr_catIds = NULL;
m_extraDoc->size_catIds = 0;
m_extraDoc->m_catIdsValid = true;
m_extraDoc->ptr_linkInfo1 = &s_dummy;
m_extraDoc->size_linkInfo1 = 0;
m_extraDoc->m_linkInfo1Valid = true;
m_extraDoc->ptr_linkInfo2 = &s_dummy;
m_extraDoc->size_linkInfo2 = 0;
m_extraDoc->m_linkInfo2Valid = true;
m_extraDoc->m_urlFilterNumValid = true;
m_extraDoc->m_urlFilterNum = 0;
// for redirects
m_extraDoc->m_allowSimplifiedRedirs = true;
	// always forward the http download request so that Msg13.cpp's
	// handleRequest13() can keep this same page from being downloaded
	// multiple times at once. also, if we are fetching robots.txt this
	// allows us to use the same cache since we select the host we
	// forward to based on ip address.
m_extraDoc->m_forwardDownloadRequest = true;
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
m_extraDoc->m_isChildDoc = true;
m_extraDoc->m_parentDocPtr = this;
// debug it
//g_doc = this;
// and inherit test dir so getTestDir() doesn't core on us
bool isPageParser = getIsPageParser();
m_extraDoc->m_isPageParser = isPageParser;
m_extraDoc->m_isPageParserValid = true;
// without this we send all the msg13 requests to host #3! because
// Msg13 uses it to determine what host to handle it
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
m_extraDoc->m_firstIp = m_firstIp;
m_extraDoc->m_firstIpValid = true;
// i guess we are valid now
m_extraDocValid = true;
return &m_extraDoc;
}
bool XmlDoc::getIsPageParser ( ) {
if ( m_isPageParserValid ) return m_isPageParser;
// assume not
m_isPageParser = false;
// and set otherwise
if ( m_sreqValid && m_sreq.m_isPageParser ) m_isPageParser = true;
// and validate
m_isPageParserValid = true;
return m_isPageParser;
}
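// . get an XmlDoc for this url's site root page
// . if we ARE the root, return ourselves
// . otherwise use the root titlerec from titledb if we have it; if not,
//   set up a child doc that downloads the root on demand, except for
//   injections, rebuilds and recycled content where we skip the root
//   entirely to keep things fast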
XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
if ( m_rootDocValid ) return &m_rootDoc;
// help avoid mem leaks
if ( m_rootDoc ) { char *xx=NULL;*xx=0; }
// note it
setStatus ( "getting root doc");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (XmlDoc **)isRoot;
// if we are root use us!!!!!
if ( *isRoot ) {
m_rootDoc = this;
m_rootDocValid = true;
return &m_rootDoc;
}
// get our site root
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (XmlDoc **)mysite;
// otherwise, we gotta get it!
char **rtr = getRootTitleRec ( );
if ( ! rtr || rtr == (char **)-1 ) return (XmlDoc **)rtr;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (XmlDoc **)cu;
// if no title rec, return ptr to a null
//m_rootDoc = NULL;
//if ( ! *rtr ) {
// // damn, not in titledb, i guess download it then
// m_rootDocValid = true; return &m_rootDoc; }
// note it
setStatus ( "getting root doc");
// to keep injections fast, do not download the root page!
if ( ! *rtr && m_contentInjected ) {
// assume none
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
}
// likewise, if doing a rebuild
if ( ! *rtr && m_useSecondaryRdbs ) {
// assume none
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
}
// or recycling content like for query reindex. keep it fast.
if ( ! *rtr && m_recycleContent ) {
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . if *otr is NULL that means not found
// . return a NULL root XmlDoc in that case as well?
// . make a new one
// . this will uncompress it and set ourselves!
try { m_rootDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
// if we had the title rec, set from that
if ( *rtr ) {
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
// it was corrupted... delete this
// possibly printed
// " uncompress uncompressed size=..." bad uncompress
log("build: rootdoc set2 failed");
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
delete ( m_rootDoc );
// call it empty for now, we don't want to return
// NULL with g_errno set because it could stop
// the whole indexing pipeline
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
//return NULL;
}
}
// . otherwise, set the url and download it on demand
// . this junk copied from the contactDoc->* stuff below
else {
// a spider rec for the contact doc
SpiderRequest sreq;
// clear it
sreq.reset();
// spider the url "u"
char *p = sreq.m_url;
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
else p += sprintf ( p , "http://" );
strcpy ( p , mysite );
// set this
if ( m_sreqValid ) {
// this will avoid it adding to tagdb!
sreq.m_isPageParser = m_sreq.m_isPageParser;
}
// reset the data size
sreq.setDataSize ();
// . prepare to download it, set it up
// . returns false and sets g_errno on error
if ( ! m_rootDoc->set4 ( &sreq ,
NULL , // doledbkey ptr
cr->m_coll ,
NULL , // SafeBuf
m_niceness )) {
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
delete ( m_rootDoc );
m_rootDoc = NULL;
return NULL;
}
// do not throttle it!
//m_rootDoc->m_throttleDownload = false;
// . do not do robots check for it
// . no we must to avoid triggering a bot trap & getting banned
//m_rootDoc->m_isAllowed = m_isAllowed;
//m_rootDoc->m_isAllowedValid = true;
}
// share our masterloop and state!
m_rootDoc->m_masterLoop = m_masterLoop;
m_rootDoc->m_masterState = m_masterState;
// msg13 caches the pages it downloads
m_rootDoc->m_maxCacheAge = maxCacheAge;
// like m_contactDoc we avoid unnecessary lookups in call to
// getRedirUrl() by validating these empty members
m_rootDoc->ptr_catIds = NULL;
m_rootDoc->size_catIds = 0;
m_rootDoc->m_catIdsValid = true;
m_rootDoc->ptr_linkInfo1 = &s_dummy;
m_rootDoc->size_linkInfo1 = 0;
m_rootDoc->m_linkInfo1Valid = true;
m_rootDoc->ptr_linkInfo2 = &s_dummy;
m_rootDoc->size_linkInfo2 = 0;
m_rootDoc->m_linkInfo2Valid = true;
m_rootDoc->m_urlFilterNumValid = true;
m_rootDoc->m_urlFilterNum = 0;
// for redirects
m_rootDoc->m_allowSimplifiedRedirs = true;
	// always forward the http download request so that Msg13.cpp's
	// handleRequest13() can keep the same root page or contact page
	// from being downloaded multiple times at once. also, if we are
	// fetching robots.txt this allows us to use the same cache since we
	// select the host we forward to based on ip address.
m_rootDoc->m_forwardDownloadRequest = true;
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
m_rootDoc->m_isChildDoc = true;
m_rootDoc->m_parentDocPtr = this;
// validate it
m_rootDocValid = true;
return &m_rootDoc;
}
/*
// no longer access Revdb to get the old metalist, now re-compute
RdbList *XmlDoc::getOldMetaList ( ) {
// if valid return that
if ( m_oldMetaListValid ) return &m_oldMetaList;
// update status msg
setStatus ( "getting old meta list");
// load the old title rec
XmlDoc **odp = getOldXmlDoc( );
if ( ! odp || odp == (XmlDoc **)-1 ) return (RdbList *)odp;
XmlDoc *od = *odp;
// empty old doc?
if ( ! od ) {
m_oldMetaList.reset();
m_oldMetaListValid = true;
return &m_oldMetaList;
}
// and use that. it has m_setFromTitleRec set to true.
char *old = od->getMetaList();
if ( ! old || old == (void *)-1 ) return (RdbList *)old;
// set it
m_oldMetaList.m_list = od->m_metaList; // old;
m_oldMetaList.m_listSize = od->m_metaListSize;
m_oldMetaList.m_ownData = false;
// assign it
m_oldMetaListValid = true;
return &m_oldMetaList;
}
*/
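// . when the time axis is enabled the same url can be indexed once per
//   unique piece of content, so we append the content hash to the url
// . e.g. (illustrative only) "http://example.com/a.html" with
//   m_contentHash32 = 12345 becomes "http://example.com/a.html.12345".
//   Msg22::getAvailDocId() then works off that extended string so the
//   different versions get different docids and don't collide in titledb.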
SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
if ( m_setFromDocId ) return &m_timeAxisUrl;
m_timeAxisUrlValid = true;
Url *fu = getFirstUrl();
m_timeAxisUrl.reset();
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
return &m_timeAxisUrl;
}
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getOldTitleRec ( ) {
// clear if we blocked
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// g_errno = EBADTITLEREC;
// return NULL;
// if valid return that
if ( m_oldTitleRecValid ) return &m_oldTitleRec;
// update status msg
setStatus ( "getting old title rec");
// if we are set from a title rec, we are the old doc
if ( m_setFromTitleRec ) {
m_oldTitleRecValid = true;
m_oldTitleRec = NULL;//m_titleRec;
return &m_oldTitleRec;
}
// sanity check
if ( m_oldTitleRecValid && m_msg22a.m_outstanding ) {
char *xx=NULL;*xx=0; }
// point to url
//char *u = getCurrentUrl()->getUrl();
//char *u = getFirstUrl()->getUrl();
// assume its valid
m_oldTitleRecValid = true;
// add it to the cache?
bool addToCache = false;
//if ( maxCacheAge > 0 ) addToCache = true;
// not if new! no we need to do this so XmlDoc::getDocId() works!
// this logic prevents us from setting g_errno to ENOTFOUND
	// when m_msg22a below calls indexDocWrapper(). however, a
	// query delete on a not-found docid will still succumb to
	// the g_errno because m_isIndexed is not valid, i think...
if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
m_oldTitleRec = NULL;
m_oldTitleRecValid = true;
return &m_oldTitleRec;
}
// sanity check. if we have no url or docid ...
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// use docid if first url not valid
int64_t docId = 0;
if ( ! m_firstUrlValid ) docId = m_docId;
// if url not valid, use NULL
char *u = NULL;
if ( docId == 0LL && ptr_firstUrl ) u = getFirstUrl()->getUrl();
// if both are not given that is a problem
if ( docId == 0LL && ! u ) {
log("doc: no url or docid provided to get old doc");
g_errno = EBADENGINEER;
return NULL;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
	// if using time axis then append the content hash to the end of
	// the url (see getTimeAxisUrl()). this way Msg22::getAvailDocId()
	// will return a docid based on that so we don't collide with other
	// instances of this same url.
if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
SafeBuf *tau = getTimeAxisUrl();
u = tau->getBufStart();
}
// the title must be local since we're spidering it
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
u ,
docId , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
&m_oldTitleRec ,
&m_oldTitleRecSize ,
false , // just chk tfndb?
false , // getAvailDocIdOnly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
addToCache , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false ))// load balancing?
// return -1 if we blocked
return (char **)-1;
// not really an error
if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) return NULL;
// got it
return &m_oldTitleRec;
}
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getRootTitleRec ( ) {
// if valid return that
if ( m_rootTitleRecValid ) return &m_rootTitleRec;
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
// if we are root use us!!!!! well, the old us...
if ( *isRoot ) {
char **otr = getOldTitleRec ( );
if ( ! otr || otr == (char **)-1 ) return (char **)otr;
m_rootTitleRec = m_oldTitleRec;
m_rootTitleRecSize = m_oldTitleRecSize;
return &m_rootTitleRec;
}
// get our site root
char *mysite = getSite();
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// make it a url. keep it on stack since msg22 copies it into its
// url request buffer anyway! (m_msg22Request.m_url[])
Url site; site.set ( mysite );
// assume its valid
m_rootTitleRecValid = true;
// add it to the cache?
bool addToCache = false;
//if ( maxCacheAge > 0 ) addToCache = true;
// update status msg
setStatus ( "getting root title rec");
// the title must be local since we're spidering it
if ( ! m_msg22b.getTitleRec ( &m_msg22Request ,
site.getUrl() ,
0 , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
&m_rootTitleRec ,
&m_rootTitleRecSize ,
false , // just chk tfndb?
false , // getAvailDocIdOnly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
addToCache , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false ))// load balancing?
// return -1 if we blocked
return (char **)-1;
// not really an error
if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) return NULL;
// got it
return &m_rootTitleRec;
}
/*
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getContactTitleRec ( char *u ) {
// clear if we blocked
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// if valid return that
if ( m_contactTitleRecValid ) return &m_contactTitleRec;
// fake
static char *s_fake = NULL;
// if no url, we got no contact title rec in titledb then!
if ( ! u || u[0] == '\0' ) return &s_fake;
// update status msg
setStatus ( "getting contact title rec");
// assume its valid
m_contactTitleRecValid = true;
// add it to the cache?
bool addToCache = false;
//if ( maxCacheAge > 0 ) addToCache = true;
// the title must be local since we're spidering it
if ( ! m_msg22c.getTitleRec ( &m_msg22Request ,
u ,
0 , // probable docid
m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
&m_contactTitleRec ,
&m_contactTitleRecSize ,
false , // just chk tfndb?
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
addToCache , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false ))// load balancing?
// return -1 if we blocked
return (char **)-1;
// not really an error
if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) return NULL;
// got it
return &m_contactTitleRec;
}
*/
// used for indexing spider replies. we need a unique docid because it
// is treated as a different document even though its url will be the same.
// and there is never an "older" version of it because each reply is treated
// as a brand new document.
int64_t *XmlDoc::getAvailDocIdOnly ( int64_t preferredDocId ) {
if ( m_availDocIdValid && g_errno ) {
log("xmldoc: error getting availdocid: %s",
mstrerror(g_errno));
return NULL;
}
if ( m_availDocIdValid )
// this is 0 or -1 if no avail docid was found
return &m_msg22c.m_availDocId;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// pre-validate it
m_availDocIdValid = true;
if ( ! m_msg22c.getAvailDocIdOnly ( &m_msg22Requestc ,
preferredDocId ,
cr->m_coll ,
m_masterState ,
m_masterLoop ,
m_niceness ) )
return (int64_t *)-1;
// error?
log("xmldoc: error getting availdocid2: %s",mstrerror(g_errno));
return NULL;
}
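// . decide the docid for this document
// . if an old titlerec exists we reuse its docid, otherwise we take the
//   avail docid that m_msg22a found for us
// . if the old titlerec was there but corrupt we salvage the docid
//   straight from its key
// . sanity check: without the time axis the docid must fall in the
//   probable-docid range derived from the url hash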
int64_t *XmlDoc::getDocId ( ) {
if ( m_docIdValid ) return &m_docId;
setStatus ("getting docid");
XmlDoc **od = getOldXmlDoc( );
if ( ! od || od == (XmlDoc **)-1 ) return (int64_t *)od;
setStatus ("getting docid");
// . set our docid
// . *od is NULL if no title rec found with that docid in titledb
if ( *od ) {
m_docId = *(*od)->getDocId();
m_docIdValid = true;
return &m_docId;
}
m_docId = m_msg22a.getAvailDocId();
	// if the titlerec was there but *od is NULL then it had an error
	// uncompressing, because of the corruption bug in RdbMem.cpp when
	// dumping to disk.
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
log("build: salvaged docid %"INT64" from corrupt title rec "
"for %s",m_docId,m_firstUrl.m_url);
}
if ( m_docId == 0 ) {
log("build: docid is 0 for %s",m_firstUrl.m_url);
g_errno = ENODOCID;
return NULL;
}
// ensure it is within probable range
if ( ! getUseTimeAxis () ) {
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
if ( m_docId < d1 || m_docId > d2 ) {
char *xx=NULL;*xx=0; }
}
	// if docid is zero, none is available!!!
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
m_docIdValid = true;
return &m_docId;
}
// . is our docid on disk? i.e. do we exist in the index already?
// . TODO: just check tfndb?
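// . if we already loaded the old doc, its presence answers this directly
// . otherwise this uses msg22e with "just check tfndb" set, so it only
//   tests whether a titlerec exists for this docid/url without loading it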
char *XmlDoc::getIsIndexed ( ) {
if ( m_isIndexedValid ) return &m_isIndexed;
setStatus ( "getting is indexed" );
// we must be old if this is true
//if ( m_setFromTitleRec ) {
// m_isNew = false;
// m_isNewValid = true;
// return &m_isNew;
//}
// get the url
//char *u = getFirstUrl()->getUrl();
if ( m_oldDocValid ) {
m_isIndexedValid = true;
if ( m_oldDoc ) m_isIndexed = true;
else m_isIndexed = false;
return &m_isIndexed;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// sanity check. if we have no url or docid ...
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// use docid if first url not valid
int64_t docId = 0;
char *url = NULL;
// use docid if its valid, otherwise use url
if ( m_docIdValid ) docId = m_docId;
else url = ptr_firstUrl;
// note it
if ( ! m_calledMsg22e )
setStatus ( "checking titledb for old title rec");
else
setStatus ( "back from msg22e call");
// . consult the title rec tree!
// . "justCheckTfndb" is set to true here!
if ( ! m_calledMsg22e &&
! m_msg22e.getTitleRec ( &m_msg22Request ,
url ,
docId , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
NULL , // tr ptr
NULL , // tr size ptr
true , // just chk tfndb?
false, // getavaildocidonly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
false , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false )){//load balancing?
// validate
m_calledMsg22e = true;
// return -1 if we blocked
return (char *)-1;
}
// got it
m_calledMsg22e = true;
// error?
if ( g_errno ) return NULL;
// get it
if ( m_msg22e.m_found ) m_isIndexed = true;
else m_isIndexed = false;
// validate
m_isIndexedValid = true;
return &m_isIndexed;
}
void gotTagRecWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// note it
THIS->setStatus ( "in got tag rec wrapper" );
// set these
if ( ! g_errno ) {
THIS->m_tagRec.serialize ( THIS->m_tagRecBuf );
THIS->ptr_tagRecData = THIS->m_tagRecBuf.getBufStart();
THIS->size_tagRecData = THIS->m_tagRecBuf.length();
// validate
THIS->m_tagRecValid = true;
}
// continue
THIS->m_masterLoop ( THIS->m_masterState );
}
// if tagrec changed enough so that it would affect what we would index
// since last time we indexed this doc, we need to know that!
/*
int32_t *XmlDoc::getTagHash32 ( ) {
// make it valid
if ( m_tagHash32Valid ) return &m_tagHash32;
// compute it
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// init it
m_tagHash32 = 0;
// hash the values of all tags
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get data
uint32_t h = hash32(tag->getTagData(),tag->getTagDataSize(),0);
// skip if 0
if ( ! h ) continue;
// xor it up
m_tagHash32 = hash32h ( h , m_tagHash32 );
}
// validate
m_tagHash32Valid = true;
return &m_tagHash32;
}
*/
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
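// . if we were set from a titlerec (version >= 118) we deserialize the
//   tag rec that was stored in it, to keep parsing consistent
// . otherwise we look it up fresh via msg8a, which may block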
TagRec *XmlDoc::getTagRec ( ) {
// if we got it give it
if ( m_tagRecValid ) return &m_tagRec;
// do we got a title rec?
if ( m_setFromTitleRec && m_version >= 118 &&
// lookup up fresh from tagdb when doing a rebuild so we get
// the latest sitenuminlinks! nah, we set m_tagRecValid and
// m_tagRecDataValid to false in Repair.cpp iff rebuilding
// titledb!! otherwise, we have to use what is in titlerec
// to avoid parsing inconsistencies that would result in
// undeletable posdb data.
//! m_useSecondaryRdbs &&
// lookup the tagdb rec fresh if setting for a summary. that way
// we can see if it is banned or not
m_tagRecDataValid ) {
// all done
m_tagRecValid = true;
// assume null if old version
//if ( m_version <= 115 ) return &m_tagRec;
// just return empty otherwise
m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData );
return &m_tagRec;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get our site, usually the hostname, but can be like
// "www.last.fm/user/breendaxx/"
// we can't call this because it CALLS getTagRec()!!!
//char *mysite = getSite();
//if ( ! mysite || mysite == (char *)-1 ) return (TagRec *)mysite;
// update status msg
setStatus ( "getting tagdb record" );
// get the final redirected url
//Url *u = getCurrentUrl();
// nah, try this
Url *u = getFirstUrl();
// if we are docid based url this might block!
//if ( ! u || u == (void *)-1 ) return (TagRec *)u;
// good to go
//m_oldTagRecValid = true;
// get it, user our collection for lookups, not m_tagdbColl[] yet!
if ( ! m_msg8a.getTagRec ( u ,
// we have to guess the site because
// we can't hit tagdb to get it at this
// point!!!
NULL, // guess it! // mysite ,
cr->m_collnum ,
false, // skip domain lookup? // true
m_niceness ,
this ,
gotTagRecWrapper ,
&m_tagRec ) )
// we blocked, return -1
return (TagRec *)-1;
// error? ENOCOLLREC?
if ( g_errno ) return NULL;
// assign it
m_tagRec.serialize ( m_tagRecBuf );
ptr_tagRecData = m_tagRecBuf.getBufStart();
size_tagRecData = m_tagRecBuf.length();
// validate
m_tagRecValid = true;
// our tag rec should be all valid now
return &m_tagRec;
}
// this is only for purposes of setting the site's TagRec
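// . a page "has contact info" if tagdb already says so ("hascontactinfo"
//   tag), or if it is a contacty page that yields at least one inlined
//   contact address or at least one official email address/contact form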
char *XmlDoc::getHasContactInfo ( ) {
if ( m_hasContactInfoValid ) return &m_hasContactInfo2;
setStatus ( "getting has contact info" );
// get it from the tag rec if we can
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
char *ic = getIsThisDocContacty ( );
if ( ! ic || ic == (void *)-1 ) return (char *)ic;
// the current top ip address
//int32_t *ip = getIp();
//if ( ! ip || ip == (int32_t *)-1) return (char *)ip;
//int32_t top = *ip & 0x00ffffff;
// and should have a contact page tag
Tag *tag = gr->getTag ("hascontactinfo");
if ( tag ) m_hasContactInfo = true;
else m_hasContactInfo = false;
m_hasContactInfo2 = m_hasContactInfo;
// are we a "contact" link? i.e. about us, etc. that would contain
// the physical address of the entity responsible for this website
//bool isContacty = getIsContacty( fu ,
// info1 ,
// hops ,
// *ct ,
// *isRoot ,
// m_niceness );
// bail early if not a candidate for contact info
if ( ! *ic ) { // check ) {
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
//
// TODO: did IP change?? invalidate it???
//
// set status. we can time status changes with this routine!
setStatus ( "getting contact info on just this page" );
int32_t *nca = getNumContactAddresses();
if ( ! nca || nca == (void *)-1 ) return (char *)nca;
// did we have a contact address?
if ( *nca ) {
m_hasContactInfo = true;
m_hasContactInfo2 = true;
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
// get the email addresses
int32_t *numOfficial = getNumOfficialEmails ( );
if ( ! numOfficial || numOfficial == (void *)-1)
return (char *)numOfficial;
// did we get some?
if ( *numOfficial > 0 ) {
m_hasContactInfo = true;
m_hasContactInfo2 = true;
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
// this should set m_hasContactInfo as well as m_contact*[] arrays
//TagRec *pcitr = getContactInfoTagRec ();
//if ( ! pcitr || pcitr == (void *)-1 ) return (char *)pcitr;
// do not re-peat the above now
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
// returns "type" of contact link, > 0
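// . scores how "contacty" a url is from its inlink anchor text and its
//   path
// . anchor text is only considered for hop counts < 3 (and only hop 1 for
//   javascript docs); phrases like "contact us", "about us", "privacy
//   policy", "terms of service", "site map", "faq", etc. each set a
//   distinct code
// . path substrings like "contact" or "/about" then add a constant and
//   multiply the running value, so the exact number just encodes which
//   heuristics fired; callers here only test it for non-zero
// . e.g. (illustrative) anchor text "contact us" on a hop-1 page sets
//   check to 11, and a path containing "contact" makes it (11+33)*90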
int32_t getIsContacty ( Url *url ,
LinkInfo *info1 ,
int32_t hops ,
uint8_t ct ,
bool isRoot ,
int32_t niceness ) {
static int64_t h_home ;
static int64_t h_site ;
static int64_t h_map ;
static int64_t h_sitemap ;
static int64_t h_contact ;
static int64_t h_about ;
static int64_t h_privacy ;
static int64_t h_policy ;
static int64_t h_statement ;
static int64_t h_terms ;
static int64_t h_of ;
static int64_t h_and ;
static int64_t h_service ;
static int64_t h_conditions ;
static int64_t h_use ;
static int64_t h_us ;
static int64_t h_help ;
static int64_t h_location ;
static int64_t h_faq ;
static int64_t h_faqs ;
static int64_t h_customer ;
static int64_t h_support ;
static int64_t h_advertise ;
static int64_t h_inquiry ;
static int64_t h_inquiries ;
static int64_t h_feedback ;
static int64_t h_company ;
static int64_t h_corporate ;
static bool s_inith = false;
if ( ! s_inith ) {
s_inith = true;
h_home = hash64n ("home");
h_site = hash64n ("site");
h_map = hash64n ("map");
h_sitemap = hash64n ("sitemap");
h_contact = hash64n ("contact");
h_about = hash64n ("about");
h_privacy = hash64n ("privacy");
h_policy = hash64n ("policy");
h_statement = hash64n ("statement");
h_terms = hash64n ("terms");
h_of = hash64n ("of");
h_and = hash64n ("and");
h_service = hash64n ("service");
h_conditions = hash64n ("conditions");
h_use = hash64n ("use");
h_us = hash64n ("us");
h_help = hash64n ("help");
h_location = hash64n ("location");
h_faq = hash64n ("faq");
h_faqs = hash64n ("faqs");
h_customer = hash64n ("customer");
h_support = hash64n ("support");
h_advertise = hash64n ("advertise");
h_inquiry = hash64n ("inquiry");
h_inquiries = hash64n ("inquiries");
h_feedback = hash64n ("feedback");
h_company = hash64n ("company");
h_corporate = hash64n ("corporate");
}
int32_t check = 0;
// loop over the link texts we got
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
// never do anything if hop count >= 3
if ( hops >= 3 ) break;
// javascript must be hopcount 1 only
if ( ct == CT_JS && hops != 1 ) break;
// is this inlinker internal?
//bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// skip if not local to site
//if ( ! internal ) continue;
// get the text
char *txt = k->getLinkText();
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// assume utf-8. so do a utf-8 sanity check so it doesn't
// break Words::countWords() by thinking a character is
// 2+ bytes and breaching the buffer
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 1 from url=%s for %s",
k->getUrl(),url->m_url);
continue;
}
// convert into words i guess
Words ww;
// . TODO: use alt text if only an image in the link!!!!!
// . return -1 if it fails with g_errno set
		if ( ! ww.setx ( txt , tlen , niceness) ) return -1;
		// shortcut
int32_t nw = ww.getNumWords();
// skip if too big
if ( nw >= 30 ) continue;
		// shortcut
int64_t *wids = ww.getWordIds();
// reset alnumcount
int32_t count = 0;
// loop over its words
for ( int32_t j = 0 ; j < nw && ! check ; j++ ) {
// skip if not alnum
if ( ! wids[j] ) continue;
// keep track of alnum word position
count++;
// "contact..." only good from root or root kid
if ( wids[j] == h_contact && hops >= 1 && count == 1 )
check = 1;
// "about..." only good from root or root kid
if ( wids[j] == h_about && hops >= 1 && count == 1 )
check = 2;
// "...privacy policy..."
if ( wids[j ] == h_privacy && j+2<nw &&
wids[j+2] == h_policy )
check = 3;
// "...privacy statement..."
if ( wids[j ] == h_privacy && j+2<nw &&
wids[j+2] == h_statement )
check = 4;
// "...terms of service..."
if ( wids[j ] == h_terms && j+4<nw &&
wids[j+2] == h_of &&
wids[j+4] == h_service )
check = 5;
// "...terms of use..."
if ( wids[j ] == h_terms && j+4<nw &&
wids[j+2] == h_of &&
wids[j+4] == h_use )
check = 6;
// "... terms & conditions ..."
if ( wids[j ] == h_terms && j+2<nw &&
wids[j+2] == h_conditions )
check = 7;
// "... terms and conditions ..."
if ( wids[j ] == h_terms && j+4<nw &&
wids[j+2] == h_and &&
wids[j+4] == h_conditions )
check = 8;
// "...site map ..."
if ( wids[j] == h_site && j+2<nw &&
wids[j+2] == h_map )
check = 9;
// "...about us..."
if ( wids[j] == h_about && j+2<nw &&
wids[j+2] == h_us )
check = 10;
// "...contact us..."
if ( wids[j] == h_contact && j+2<nw &&
wids[j+2] == h_us)
check = 11;
// "help..."
if ( wids[j] == h_help && count == 1 )
check = 12;
// "faq..."
if ( wids[j] == h_faq && count == 1 )
check = 13;
// "faqs..."
if ( wids[j] == h_faqs && count == 1 )
check = 14;
// "...customer support..."
if ( wids[j] == h_customer && j+2<nw &&
wids[j+2] == h_support )
check = 15;
// "advertise..."
if ( wids[j] == h_advertise && count == 1)
check = 16;
// "...inquiry..."
if ( wids[j] == h_inquiry )
check = 17;
// "...inquiries..."
if ( wids[j] == h_inquiries )
check = 18;
// one word only below here
if ( ww.getNumAlnumWords() != 1 ) continue;
if ( wids[j] == h_about ) check = 2;
if ( wids[j] == h_home ) check = 19;
if ( wids[j] == h_support ) check = 20;
if ( wids[j] == h_advertise ) check = 21;
if ( wids[j] == h_help ) check = 22;
if ( wids[j] == h_faq ) check = 23;
if ( wids[j] == h_faqs ) check = 24;
if ( wids[j] == h_contact ) check = 25;
if ( wids[j] == h_feedback ) check = 26;
if ( wids[j] == h_sitemap ) check = 27;
if ( wids[j] == h_company ) check = 28;
if ( wids[j] == h_corporate ) check = 29;
if ( wids[j] == h_privacy ) check = 30;
if ( wids[j] == h_terms ) check = 31;
// "location" fixes guildcinema.com
if ( wids[j] == h_location && isRoot ) check = 32;
}
}
// check for certain things in the url path that would indicate that
// this is a contact info page
//char *path = m_firstUrl.getPath();
char *path = url->getPath();
if ( gb_strcasestr(path,"contact" ) ) { check += 33; check *= 90; }
if ( gb_strcasestr(path,"/about" ) ) { check += 34; check *= 91; }
if ( gb_strcasestr(path,"/feedback") ) { check += 35; check *= 92; }
if ( gb_strcasestr(path,"/help" ) ) { check += 36; check *= 93; }
if ( gb_strcasestr(path,"/faq" ) ) { check += 37; check *= 94; }
if ( gb_strcasestr(path,"advertise") ) { check += 38; check *= 95; }
if ( gb_strcasestr(path,"inquir" ) ) { check += 39; check *= 96; }
return check;
}
char *XmlDoc::getIsThisDocContacty() {
if ( m_isContactyValid ) return &m_isContacty;
setStatus ( "getting is contacty" );
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
int8_t *hc = getHopCount();
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
// get the first url
Url *fu = getFirstUrl();
	// shortcut
int32_t hops = *hc;
// check it
m_isContacty = getIsContacty ( fu ,
info1 ,
hops ,
*ct ,
*isRoot ,
m_niceness );
m_isContactyValid = true;
return &m_isContacty;
}
int32_t *XmlDoc::getNumContactAddresses ( ) {
// process
Address **ca = getContactAddresses();
if ( ! ca || ca == (void *)-1 ) return (int32_t *)ca;
// now we are valid
return &m_numContactAddresses;
}
Address **XmlDoc::getContactAddresses ( ) {
// assume none
if ( m_contactAddressesValid ) return m_contactAddresses;
// need this of course
Addresses *aa = getAddresses ();
if ( ! aa || aa == (void *)-1 ) return (Address **)aa;
// assume none
m_contactAddressesValid = true;
m_numContactAddresses = 0;
// not if not contacty. we gotta be a url like ".../contact.asp"
char *ic = getIsThisDocContacty ( );
if ( ! ic || ic == (void *)-1 ) return (Address **)ic;
	// if not of a contact-url form, return none
if ( ! *ic )
return m_contactAddresses;
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (Address **)isRoot;
// do not do this for root if multiple addresses. this
// fixes http://obits.abqjournal.com/
if ( *isRoot && aa->m_uniqueStreetHashes > 1 )
return m_contactAddresses;
// reset count
int32_t nca = 0;
// number of addresses in this doc
int32_t na = aa->m_am.getNumPtrs();
// add all addresses then???
for ( int32_t i = 0 ; i < na ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// get it
Address *ai = (Address *)aa->m_am.getPtr(i);
// do not add this to tagdb if not inlined!
if ( ! ( ai->m_flags & AF_INLINED ) ) continue;
// store it
m_contactAddresses[nca++] = ai;
// stop before breach
if ( nca >= MAX_CONTACT_ADDRESSES ) break;
}
// update count
m_numContactAddresses = nca;
return m_contactAddresses;
}
int32_t *XmlDoc::getNumOfficialEmails ( ) {
char *eb = getEmailBuf();
if ( ! eb || eb == (void *)-1 ) return (int32_t *)eb;
return &m_numOfficialEmails;
}
// . add email addresses to tag rec
// . add up to 3 of same domain and different domain addresses
// . return # of *official* contact infos added to tag rec
// . this now includes submission forms!
// . returns -1 and sets g_errno on error
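// . emails are gathered in three passes: literal addresses (where the '@'
//   may be an "/at." image to foil bots), obfuscated "john at xyz dot com"
//   style text, and mailto: links scanned char by char in the raw xml
// . a text input labeled "email"/"e-mail" followed by a <textarea> counts
//   as a contact form and stores "hascontactform"
// . everything found is comma-separated into m_emailBuf and each pass
//   stops once about 3 entries have been collected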
char *XmlDoc::getEmailBuf ( ) {
if ( m_emailBufValid ) return m_emailBuf;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
// count # of official contacts we got
int32_t official = 0;
	// shortcuts
int64_t *wids = ww->m_wordIds;
char **wptrs = ww->m_words;
int32_t *wlens = ww->m_wordLens;
nodeid_t *tids = ww->m_tagIds;
int32_t nw = ww->getNumWords();
// get our url
Url *f = getFirstUrl();
// get its domain len
char *myDom = f->getMidDomain();
int32_t myDomLen = f->getMidDomainLen();
// point here
char *eptr = m_emailBuf;
char *emax = m_emailBuf + EMAILBUFSIZE;
m_emailBufValid = true;
// reset
*eptr = '\0';
//
// ADD EMAIL ADDRESSES
//
// count how many we find
int32_t ne = 0;
// loop over all the words
for ( int32_t i = 1 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// . email address? look for the '@'
// . might also have <img src="at.gif"> (bot proof)
if ( wptrs[i][0] != '@' && tids[i] != TAG_IMG ) continue;
// . make sure any image has an "/at." in it!
// . "mail<img src="/common/images/at.gif">pipl.com"
if(tids[i]==TAG_IMG&&!gb_strncasestr(wptrs[i],wlens[i],"/at."))
continue;
// must be a single char
if ( ! tids[i] && wlens[i] != 1 ) continue;
// if i was the last word, give up!
if ( i + 1 >= nw ) break;
// back up i until we hit a non-email char
int32_t a ;
for ( a = i ; a - 1 > 0 ; a-- ) {
if (wids [a-1] ) continue;
if (wptrs[a-1][0]=='.'&&wlens[a-1]==1)continue;
if (wptrs[a-1][0]=='-'&&wlens[a-1]==1)continue;
break;
}
// must not start with '.'
if ( wptrs[a][0]=='.' ) a++;
// now get the end of it
int32_t b;
int32_t periodCount = 0;
for ( b = i ; b+1 < nw ; b++ ) {
if (wids[b+1]) continue;
// only punct we allow is a single period
if ( wptrs[b+1][0]!='.' ) break;
if ( wlens[b+1] != 1 ) break;
periodCount++;
}
// must have at least one!
if ( ! periodCount ) continue;
// must not end on '.'
if ( wptrs[b][0]=='.') b--;
// hostname must have a valid tld
char *host = wptrs[i+1];
char *hend = wptrs[b]+wlens[b];
// temp null term
char c = *hend;
*hend = '\0';
int32_t tldLen ; char *tld = getTLDFast ( host, &tldLen , false );
// ignore the rest of this line for addresses even
// if tld is bogus
//ignoreLine = true;
// must have a legit tld!
if ( ! tld ) { *hend = c; continue; }
// if not from our same domain, use "emailaddressoffsite"
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
// use mid domain. subtract '.'
//int32_t midlen = tld - dom - 1;
// undo the temp NULL thing
*hend = c;
if ( ! dom ) continue;
// include last word
b++;
// normal buffer
char buf[100];
char *p = buf;
char *pend = buf + 100;
// normalize it
for ( int32_t j = a ; j < b ; j++ ) {
// include the at sign
if ( j == i ) {*p++ = '@'; continue;}
// skip tags
if ( tids[j] ) continue;
// skip punct
if ( ! wids[j] ) {*p++ ='.'; continue;}
// ensure minimal space
if ( p + wlens[j] + 1 >= pend ) break;
// write out wids
gbmemcpy ( p , wptrs[j] , wlens[j] );
p += wlens[j];
}
// NULL term it
*p = '\0';
// do we match domains?
//char *tn = "emailaddressoffsite";
// use this if we match domains
//if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
// tn = "emailaddressonsite";
// // this is an official contact method
// //official++;
//}
// we now count even offsite email addresses as official
// for addresses like @gmail.com etc. because we are now
// only checking "contact us" and "about us" and root pages,
// so they should never be email addresses of commenters.
// and often bloggers have external email addresses.
// http://www.christinesaari.com/html/about.php?psi=44
official++;
// store it
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
// return -1;
int32_t blen = gbstrlen(buf);
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// limit it
if ( ++ne >= 3 ) break;
}
//
// ADD BOT-PROOF EMAIL ADDRESSES (bot proof)
//
// super dot john at xyz dot com
//
int64_t h_at = hash64Lower_utf8("at");
int64_t h_dot = hash64Lower_utf8("dot");
// loop over all the words
for ( int32_t i = 1 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// email address? look for the " at "
if ( wids[i] != h_at ) continue;
// front name word count
int32_t nameCount = 0;
// back up i until we hit a non-email word
int32_t a ;
// do a loop
for ( a = i - 1 ; a > 0 ; ) {
			// need a space/punct word
if ( wids[a] ) break;
if ( tids[a] ) break;
// skip it
a--;
// then need the "john" part
if ( ! wids[a] ) break;
if ( tids[a] ) break;
if ( wids[a] == h_dot ) break; // "dot" is bad
// count account name part
nameCount++;
// go back if like "mike dot smith"
if ( a - 4 >= 0 &&
! tids[a-1] &&
wids [a-2] == h_dot &&
! tids[a-3] &&
wids [a-4] != h_dot &&
wids [a-4] != h_at )
a -= 4;
// that is good enough
break;
}
// need a name at least one
if ( nameCount <= 0 ) continue;
// skip over that space/punct word
//a--;
// now must be regular word before that
//if ( tids[a-1] ) continue;
//if ( ! wids[a-1] ) continue;
// we got it
//a--;
// now get the end of it
int32_t b ;
// count the dots
int32_t dotCount = 0;
// make sure last word is a legit tld
int32_t tldLen = 0; char *tld = NULL;
// do a loop
for ( b = i + 1 ; b + 3 < nw ; b++ ) {
			// need a space/punct word
if ( wids[b] ) break;
if ( tids[b] ) break;
// skip it
b++;
// then need the "xyz" part
if ( ! wids[b] ) break;
if ( tids[b] ) break;
if ( wids[b] == h_dot ) break; // "dot" is bad
// remember it for tld detection
tld = wptrs[b];
tldLen = wlens[b];
// skip it
b++;
// need another space/punct word
if ( wids[b] ) break;
if ( tids[b] ) break;
// skip it
b++;
// now we need a "dot"
if ( wids[b] != h_dot ) break;
// count the dots
dotCount++;
}
// need at least one "dot"
if ( dotCount < 1 ) continue;
// not too many!
if ( dotCount > 5 ) continue;
// must have legit tld
if ( tld && ! isTLD ( tld , tldLen ) ) continue;
// normal buffer
char buf[100];
char *p = buf;
char *pend = buf + 100;
// normalize it
for ( int32_t j = a ; j < b ; j++ ) {
// skip tags
if ( tids[j] ) continue;
// skip punct
if ( ! wids[j] ) continue;
// ensure minimal space
if ( p + wlens[j] + 1 >= pend ) break;
// write out wids
if ( wids[j] == h_at ) {*p++ = '@'; continue;}
if ( wids[j] == h_dot ) {*p++ = '.'; continue;}
gbmemcpy ( p , wptrs[j] , wlens[j] );
p += wlens[j];
}
// NULL term it
*p = '\0';
// get the host
char *host = buf ; // wptrs[i+1]; ?? is this right?
// if not from our same domain, use "emailaddressoffsite"
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
if ( ! dom ) continue;
// use mid domain
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
// limit domain by that. subtract '.'
int32_t midlen = tld3 - dom - 1;
// do we match domains?
char *tn = "emailaddressoffsite";
// use this if we match domains
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
tn = "emailaddressonsite";
// this is an official contact method
//official++;
}
// we now count even offsite email addresses as official
// for addresses like @gmail.com etc. because we are now
// only checking "contact us" and "about us" and root pages,
// so they should never be email addresses of commenters
// and often bloggers have external email addresses.
// http://www.christinesaari.com/html/about.php?psi=44
official++;
// store that
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
// return -1;
int32_t blen = gbstrlen(buf);
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// limit it
if ( ++ne >= 3 ) break;
}
//
// ADD EMAIL ADDRESSES IN MAILTO TAGS
//
// <a href=mailto:steve@xyz.com>
// <a href=mailto:"steve at xyz dot com">
// now we check char by char since a website had it in the javascript:
// http://www.botanique.com/bincgi/stateprov.CFM?state=NM
//
char *m = xml->m_xml;
char *mend = m + xml->m_xmlLen - 4;
// empty?
if ( ! m ) mend = m;
// scan
for ( ; ; m++ ) {
// breach?
if ( m >= mend ) break;
// breathe
QUICKPOLL ( m_niceness );
// skip if not possible mailto:
if ( *m != 'm' && *m !='M' ) continue;
// skip
m++;
// skip?
if ( *m != 'a' && *m !='A' ) continue;
// skip
m++;
// skip?
if ( *m != 'i' && *m !='I' ) continue;
// skip
m++;
// skip?
if ( *m != 'l' && *m !='L' ) continue;
// skip
m++;
// skip?
if ( *m != 't' && *m !='T' ) continue;
// skip
m++;
// skip?
if ( *m != 'o' && *m !='O' ) continue;
// skip
m++;
// skip?
if ( *m != ':' ) continue;
// skip
m++;
		// set a local end so we scan at most 100 chars for this address
char *mend = m + 100;
// skip over the mailto:
//m += 7;
// that is the start of the email address then
char *start = m;
// skip til '@'
for ( ; *m && m < mend && *m != '@' ; m++ ) {
// but give up if we hit a non-email name char
if ( is_alnum_a(*m) ) continue;
if ( *m == '.' ) continue;
if ( *m == '-' ) continue;
break;
}
// bad if no @
if ( *m != '@' ) continue;
// skip the @
m++;
// . skip until alnum
// . fix parsing of "dsquires@ unimelb.edu.au" for
// http://www.marcom1.unimelb.edu.au/public/contact.html
for (;*m && is_wspace_utf8(m); m+=getUtf8CharSize(m) );
// get the host
char *host = m;
// skip till end of hostname
for (;*m && m<mend && (is_alnum_a(*m)||*m=='.'||*m=='-');m++ );
// null term
char c = *m; *m = '\0';
// if not from our same domain, use "emailaddressoffsite"
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
// skip if no valid domain
if ( ! dom ) { *m = c; continue; }
// use mid domain
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
// limit domain by that. subtract '.'
int32_t midlen = tld3 - dom - 1;
// put it back
*m = c;
// point "end" to end of the email address
char *end = dom + dlen;
// do we match domains?
char *tn = "emailaddressoffsite";
// use this if we match domains
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
tn = "emailaddressonsite";
// this is an official contact method
//official++;
}
// we now count even offsite email addresses as official
// for addresses like @gmail.com etc. because we are now
// only checking "contact us" and "about us" and root pages,
// so they should never be email addresses of commenters
// and often bloggers have external email addresses.
// http://www.christinesaari.com/html/about.php?psi=44
official++;
// store that
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,start,end-start) )
// return -1;
// cast it
char *buf = start;
int32_t blen = end - start;
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// limit it
if ( ++ne >= 3 ) break;
}
//
// ADD CONTACT FORM
//
bool gotEmailBox = false;
bool storedForm = false;
int32_t emailPos = -1;
int32_t alnumCount = 0;
// quick compares
int64_t he1 = hash64Lower_utf8 ( "email");
int64_t he2 = hash64Lower_utf8 ( "mail");
// loop over all words again
for ( int32_t i = 1 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get tag id if any
int32_t tid = tids[i] & BACKBITCOMP;
// . do we have a submit form?
// . first, do we have a text box for the sender's email?
if ( tid == TAG_INPUT ) {
int32_t ttlen;
			// i is not a node #, it is a word #, so map it
int32_t nn = ww->m_nodes[i];
// must be valid
char *tt = xml->getString(nn,"type",&ttlen);
if ( ! tt || ttlen <= 0 ) continue;
// must be of type text
if ( strncasecmp(tt,"text",4) ) continue;
// might have "email" or "e-mail" in the value
int32_t vlen;
char *val = xml->getString(nn,"value",&vlen);
// check that
if ( val ) {
if ( gb_strncasestr(val,vlen,"email") ||
gb_strncasestr(val,vlen,"e-mail") )
// flag it good
gotEmailBox = true;
}
// must have the word "email" or "e-mail" within
// a few words right before it!
if ( emailPos == -1 ) continue;
//if ( i - emailPos >= 7 ) continue;
if ( alnumCount > 7 ) continue;
// flag it
gotEmailBox = true;
}
		// text area? must happen AFTER the email address box
if ( tid == TAG_TEXTAREA && gotEmailBox ) {
// must have had the form before us
// do not double store into tagdb rec
if ( storedForm ) continue;
// store this bad boy into the tagdb rec
//if ( ! gr->addTag("hascontactform",
// timestamp,
// "xmldoc",
// ip,
// "1" ,
// 1 ) )
// return -1;
// copy it
char *buf = "hascontactform";
int32_t blen = gbstrlen(buf);
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// do not double store
storedForm = true;
// this is an official contact method
official++;
// another contact method
ne++;
// that's enough!
break;
}
// alnum counter
if ( wids[i] ) alnumCount++;
// special counter
if ( wids[i] == he1 || wids[i] == he2 ) {
// mark it
emailPos = i;
// reset counter
alnumCount = 0;
}
}
// null term
*eptr = '\0';
m_numOfficialEmails = official;
// i guess that is it
return m_emailBuf;
}
// returns vector 1-1 with Words.m_words[] array
/*
Spam *XmlDoc::getSpam ( ) {
if ( m_spamValid ) return &m_spam;
// set it
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Spam *)ww;
Bits *bits = getBits ();
if ( ! bits || bits == (Bits *)-1 ) return (Spam *)bits;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (Spam *)sni;
// if more than X% ("thresh") of words are spammed to some degree,
// index all words with a minimum score
int32_t thresh = 6;
if ( *sni > 10 ) thresh = 8;
if ( *sni > 30 ) thresh = 10;
if ( *sni > 100 ) thresh = 20;
if ( *sni > 500 ) thresh = 30;
//int64_t x[] = {30,40,50,70,90};
//int64_t y[] = {6,8,10,20,30};
//int32_t spamThresh = getY ( m_docQuality , x , y , 5 );
if ( ! m_spam.set ( ww ,
bits ,
m_version ,
thresh ,
20 ,
m_niceness ))
return NULL;
m_spamValid = true;
return &m_spam;
}
*/
// returns true if any date in the doc has a time-of-day (tod) component
bool *XmlDoc::getHasTOD ( ) {
if ( m_hasTODValid ) return &m_hasTOD2;
// scan the dates
Dates *dp = getDates() ;
if ( ! dp || dp == (Dates *)-1 ) return (bool *)dp;
// assume not
m_hasTOD2 = false;
m_hasTOD = false;
// scan the dates
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get date
Date *di = dp->m_datePtrs[i];
// skip if got nuked
if ( ! di ) continue;
// tod?
if ( !(di->m_hasType & DT_TOD) ) continue;
// got one
m_hasTOD2 = true;
m_hasTOD = true;
}
// it is now valid
m_hasTODValid = true;
return &m_hasTOD2;
}
/*
bool *XmlDoc::getHasSiteVenue ( ) {
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
// get the tag rec
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (bool *)gr;
// get tag from it
Tag *sv = gr->getTag("venueaddress") ;
// from that
m_hasSiteVenue2 = (bool)sv;
m_hasSiteVenue = (bool)sv;
m_hasSiteVenueValid = true;
return &m_hasSiteVenue2;
}
*/
// do not include addresses that are always in the header/footer of every page!
bool *XmlDoc::getHasAddress ( ) {
if ( m_hasAddressValid ) return &m_hasAddress2;
// get the addresses
Addresses *aa = getAddresses();
if ( ! aa || aa == (void *)-1 ) return (bool *)aa;
// from that
m_hasAddress2 = (aa->getNumNonDupAddresses() > 0);
m_hasAddress = (aa->getNumNonDupAddresses() > 0);
m_hasAddressValid = true;
return &m_hasAddress2;
}
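// . street address extraction for this document
// . NOTE: this is currently short-circuited: we mark m_addresses valid and
//   return the empty Addresses object right away, so the Addresses::set()
//   call further down is unreachable and kept only for reference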
Addresses *XmlDoc::getAddresses ( ) {
if ( m_addressesValid ) {
// return error if buf was breached
//if ( m_addresses.m_breached ) {
// g_errno = EBUFOVERFLOW;
// return NULL;
//}
// otherwise, return it
return &m_addresses;
}
// skip for now
m_addressesValid = true;
return &m_addresses;
// note it
setStatus ( "getting addresses");
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Addresses *)ww;
// we make sure that D_IS_IN_DATE is set by doing this
//Dates *dp = getDates();
//if ( ! dp || dp == (Dates *)-1) return (Addresses *)dp;
// we set the D_IS_IN_DATE flag for these bits
Bits *bits = getBits(); if ( ! bits ) return NULL;
Sections *sections = getExplicitSections();
if ( !sections||sections==(Sections *)-1) return (Addresses *)sections;
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (Addresses *)gr;
// the site hash
//int32_t *sh32 = getSiteHash32();
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Addresses *)sh32;
int32_t dh = getDomHash32();
// hash of all adjacent tag pairs
//uint32_t *tph = getTagPairHash32 ( ) ;
//if ( ! tph || tph == (void *)-1 ) return (Addresses *)tph;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Addresses *)d;
// get our ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1) return (Addresses *)ip;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
//char **stb = getSiteTitleBuf();
//if ( ! stb || stb == (void *)-1 ) return (Addresses *)stb;
// sanity check
//if ( ! m_siteTitleBufValid ) { char *xx=NULL;*xx=0; }
char **fbuf = getFilteredRootTitleBuf();
if ( ! fbuf || fbuf == (void *)-1 ) return (Addresses *)fbuf;
// this will set D_IS_IN_DATE in the Bits::m_bits[] array which
// Addresses::set() uses to avoid having addresses that are really
// just dates!
Dates *dd = getSimpleDates();
// return NULL on error
if ( ! dd ) return (Addresses *)NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if the serialized section is valid, use that
//char *sd = NULL;
//bool valid = false;
//if ( od && od->m_sectionsReplyValid ) valid = true;
//if ( valid ) sd = od->ptr_sectionsReply;
// assume valid, really only when it returns in case it blocked...
//m_addressesValid = true;
// this should not be outstanding!
if ( m_addressSetCalled ) { char *xx=NULL;*xx=0; }
// assume valid, really only when it returns in case it blocked...
m_addressesValid = true;
// set it
m_addressSetCalled = true;
// make a copy of the tag rec here in case it gets mangled later
// because the m_addresses class may reference its buffer
//m_savedTagRec1.copy ( gr );
// . this returns false if blocked
// . it uses the "venueaddress" from the tagrec, "gr", BUT if this
// page is the one that sets the venue address, it won't be able
// to use it as a default city/state thingy until next time it is
// spidered, since that info is in the tagrec
// . PROBLEM: if the venue address is on this page, we can't take
// advantage of it by using its city/state as a default for the
// other addresses on this page
if ( ! m_addresses.set ( sections ,
ww ,
bits ,
&m_tagRec , // &m_savedTagRec1 , // gr
&m_firstUrl ,
*d ,
cr->m_collnum ,
dh , // *sh32
*ip ,
//(int32_t)*tph ,
m_niceness ,
m_pbuf ,
m_masterState ,
m_masterLoop ,
*ct ,
//ptr_addressReply ,
//size_addressReply ,
//m_addressReplyValid ,
m_filteredRootTitleBuf ,
m_filteredRootTitleBufSize ,
this ))
return (Addresses *)-1;
// sanity check
if ( m_addresses.m_msg2c &&
m_addresses.m_msg2c->m_requests !=
m_addresses.m_msg2c->m_replies) {
char *xx=NULL;*xx=0; }
// error?
if ( g_errno ) return NULL;
// return it if not breached
//if ( ! m_addresses.m_breached ) return &m_addresses;
// return that error otherwise
//g_errno = EBUFOVERFLOW;
//return NULL;
return &m_addresses;
}
/*
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
if ( m_siteNumInlinksUniqueIpValid )
return &m_siteNumInlinksUniqueIp;
// get our companion number
int32_t *ni = getSiteNumInlinks();
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
// sanity check
if ( ! m_siteNumInlinksUniqueIp ) { char *xx=NULL;*xx=0; }
// ok we must be valid
return &m_siteNumInlinksUniqueIp;
}
int32_t *XmlDoc::getSiteNumInlinksUniqueCBlock ( ) {
if ( m_siteNumInlinksUniqueCBlockValid )
return &m_siteNumInlinksUniqueCBlock;
// get our companion number
int32_t *ni = getSiteNumInlinks();
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
// sanity check
if ( ! m_siteNumInlinksUniqueCBlock ) { char *xx=NULL;*xx=0; }
// ok we must be valid
return &m_siteNumInlinksUniqueCBlock;
}
int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
if ( m_siteNumInlinksTotalValid )
return &m_siteNumInlinksTotal;
// get our companion number
int32_t *ni = getSiteNumInlinks();
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
// sanity check
if ( ! m_siteNumInlinksTotal ) { char *xx=NULL;*xx=0; }
// ok we must be valid
return &m_siteNumInlinksTotal;
}
*/
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
int32_t *XmlDoc::getFirstIp ( ) {
// return it if we got it
if ( m_firstIpValid ) return &m_firstIp;
// note it
setStatus ( "getting first ip");
// get tag rec
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// got it
Tag *tag = gr->getTag ( "firstip" );
// get from tag
m_firstIp = 0;
if ( tag ) m_firstIp = atoip(tag->getTagData());
// if no tag, or is bogus in tag... set from ip
if ( m_firstIp == 0 || m_firstIp == -1 ) {
// need ip then!
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
// set that
m_firstIp = *ip;
}
m_firstIpValid = true;
return &m_firstIp;
// used to have to be 4 bytes - now it is stored as a string
//if ( tag->getTagDataSize() != 4 ) { char *xx=NULL;*xx=0; }
}
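// Illustrative note for getFirstIp() above: a "firstip" tag whose data is a
// dotted-quad string like "1.2.3.4" (hypothetical value) is parsed with
// atoip(); if the tag is missing or holds a bogus value (0 or -1) we fall
// back to the freshly resolved ip from getIp().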
uint8_t *XmlDoc::getSiteNumInlinks8 () {
if ( m_siteNumInlinks8Valid ) return &m_siteNumInlinks8;
// get the full count
int32_t *si = getSiteNumInlinks();
if ( ! si || si == (int32_t *)-1 ) return (uint8_t *)si;
// convert to 8
m_siteNumInlinks8 = score32to8 ( *si );
// validate
m_siteNumInlinks8Valid = true;
return &m_siteNumInlinks8;
}
// this is the # of GOOD INLINKS to the site. so it is no more than
// 1 per c block, and it has to pass link spam detection. this is the
// highest-level count of inlinks to the site. use it a lot.
int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
// sanity check
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// hack for speed. computeSiteNumInlinks is true by default,
// but if the user turns it off then just use sitelinks.txt
if ( cr && ! cr->m_computeSiteNumInlinks ) {
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
// fix core by setting these
// m_siteNumInlinksUniqueIp = 0;
// m_siteNumInlinksUniqueCBlock = 0;
// m_siteNumInlinksTotal = 0;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
// and this
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
// if still not in sitelinks.txt, just use 0
if ( min < 0 ) {
return &m_siteNumInlinks;
}
m_siteNumInlinks = min;
return &m_siteNumInlinks;
}
setStatus ( "getting site num inlinks");
// get it from the tag rec if we can
TagRec *gr = getTagRec ();
if ( ! gr || gr == (void *)-1 ) return (int32_t *)gr;
// the current top ip address
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
//int32_t top = *ip & 0x00ffffff;
// this happens when its NXDOMAIN reply from dns so assume
// no site inlinks
if ( *ip == 0 ) {
m_siteNumInlinks = 0;
// m_siteNumInlinksUniqueIp = 0;
// m_siteNumInlinksUniqueCBlock = 0;
// m_siteNumInlinksTotal = 0;
m_siteNumInlinksValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
return &m_siteNumInlinks;
}
if ( *ip == -1 ) {
log("xmldoc: ip is %"INT32", can not get site inlinks",*ip);
g_errno = EBADIP;
return NULL;
}
// wait for clock to sync before calling getTimeGlobal
int32_t wfts = waitForTimeSync();
// 0 means error, i guess g_errno should be set, -1 means blocked
if ( ! wfts ) return NULL;
if ( wfts == -1 ) return (int32_t *)-1;
setStatus ( "getting site num inlinks");
// check the tag first
Tag *tag = gr->getTag ("sitenuminlinks");
// is it valid?
bool valid = true;
// current time
int32_t now = getTimeGlobal();
// use the spidered time for the test collection for consistency
if ( !strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
// get tag age in days
int32_t age = 0; if ( tag ) age = (now - tag->m_timestamp) ;
// add in some flutter to avoid having all hosts in the network
// calling msg25 for this site at the same time.
// a random jitter of up to 10,000 seconds (roughly 3 hours).
int32_t flutter = rand() % 10000;
// add it in
age += flutter;
// . if site changes ip then toss the contact info out the window,
// but give it a two week grace period
// . well now we use the "ownershipchanged" tag to indicate that
//if (tag && age>14*3600*24) valid=false;
// . we also expire it periodically to keep the info up to date
// . the higher quality the site, the longer the expiration date
int32_t ns = 0;
int32_t maxAge = 0;
int32_t sni = -1;
if ( tag ) {
// how many site inlinks?
ns = atol(tag->getTagData());
// for less popular sites use smaller maxAges
maxAge = 90;
if ( ns < 10 ) maxAge = 10;
else if ( ns < 30 ) maxAge = 15;
else if ( ns < 50 ) maxAge = 30;
else if ( ns < 100 ) maxAge = 60;
// if index size is tiny then maybe we are just starting to
// build something massive, so reduce the cached max age
int64_t nt = g_titledb.m_rdb.getCollNumTotalRecs(m_collnum);
if ( nt < 100000000 ) //100M
maxAge = 3;
if ( nt < 10000000 ) //10M
maxAge = 1;
// for every 100 urls you already got, add a day!
sni = atol(tag->getTagData());
// double if repairing
//if ( m_useSecondaryRdbs ) maxAge = (maxAge+1) * 2;
// fix bug for rebuild. rebuild any tag before now because
// the MAX_LINKERS_IN_TERMLIST was too small in Linkdb.cpp
// and i raised from 1M to 3M. it was hurting mahalo.com.
if ( m_useSecondaryRdbs && tag->m_timestamp < 1345819704 )
valid = false;
// force another rebuild of siterank because i fixed
// the 'beds' query a little to use firstip, so recompute
// siterank for those spammers.
if ( m_useSecondaryRdbs && tag->m_timestamp < 1348257346 &&
// leave really big guys intact
sni < 300 )
valid = false;
// convert into seconds
maxAge *= 3600*24;
// so youtube which has 2997 links will add an extra 29 days
maxAge += (sni / 100) * 86400;
// hack for global index. never affect siteinlinks i imported
if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0;
// invalidate for that as well
if ( age > maxAge ) valid = false;
}
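// Worked example for the maxAge logic above (numbers are illustrative):
// a site whose "sitenuminlinks" tag says ns=25 good inlinks starts with
// maxAge = 15 days; if titledb holds fewer than 10M recs that drops to
// 1 day; sni/100 adds 0 extra days for such a small site; so the tag is
// recomputed about once a day while the index is still small.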
// our companion tags, sitePop and fresh inlinks
// Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
// Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
// Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
// if we are missing either of those, invalidate as well
// if ( ! tag2 ) valid = false;
// if ( ! tag3 ) valid = false;
// if ( ! tag4 ) valid = false;
// if we have already been through this
if ( m_updatingSiteLinkInfoTags ) valid = false;
// if rebuilding linkdb assume we have no links to sample from!
if ( tag && m_useSecondaryRdbs && g_repair.m_rebuildLinkdb )
valid = true;
// debug log
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: valid=%"INT32" "
"age=%"INT32" ns=%"INT32" sni=%"INT32" "
"maxage=%"INT32" "
"tag=%"PTRFMT" "
// "tag2=%"PTRFMT" "
// "tag3=%"PTRFMT" "
"url=%s",
(int32_t)valid,age,ns,sni,
maxAge,
(PTRTYPE)tag,
// (PTRTYPE)tag2,
// (PTRTYPE)tag3,
m_firstUrl.m_url);
LinkInfo *sinfo = NULL;
char *mysite = NULL;
// if we are good return it
if ( tag && valid ) {
// set it
m_siteNumInlinks = atol(tag->getTagData());
m_siteNumInlinksValid = true;
// companion tags
// if ( tag2 ) {
// m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( tag3 ) {
// m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( tag4 ) {
// m_siteNumInlinksTotal =atol(tag4->getTagData());
// m_siteNumInlinksTotalValid = true;
// }
// . consult our sitelinks.txt file
// . returns -1 if not found
goto updateToMin;
}
// set status. we can time status changes with this routine!
//setStatus ( "getting site link info");
// if ip is bad we can't do this. we need to have a legit ip
// so we know if a linker is internal or not
/*
if ( *ip == 0 || *ip == -1 ) {
log("gb: bad ip so we can't get site num inlinks right");
m_siteNumInlinks = 0;
m_sitePop = 0;
m_siteNumInlinksFresh = 0;
m_siteNumInlinksValid = true;
m_siteNumInlinksFreshValid = true;
m_sitePopValid = true;
return &m_siteNumInlinks;
}
*/
// set this flag so when we are re-called, "valid" will be set to false
// so we can come down here and continue this. "flutter" might
// otherwise cause us to not make it down here.
m_updatingSiteLinkInfoTags = true;
// we need to re-get both if either is NULL
sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
//
// now update tagdb!
//
// ok, get the sites of the external outlinks and they must
// also be NEW outlinks, added to the page since the last time
// we spidered it...
//Links *links = getLinks ();
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
setStatus ( "adding site info tags to tagdb 1");
// why are we adding tag again! should already be in tagdb!!!
if ( m_doingConsistencyCheck ) {char*xx=NULL;*xx=0;}
// do not re-call at this point
//m_siteNumInlinks = sinfo->m_numInlinksExtrapolated;
m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks;
//m_siteNumInlinksFresh = sinfo->m_numInlinksFresh;
//m_sitePop = sinfo->m_pagePop;
// m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
// m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
// m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
m_siteNumInlinksValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
updateToMin:
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 ) {
if ( m_siteNumInlinks < min ||
! m_siteNumInlinksValid ) {
m_siteNumInlinks = min;
m_siteNumInlinksValid = true;
}
// if ( ! m_siteNumInlinksUniqueIpValid ||
// m_siteNumInlinksUniqueIp < min ) {
// m_siteNumInlinksUniqueIp = min;
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
// m_siteNumInlinksUniqueCBlock < min ) {
// m_siteNumInlinksUniqueCBlock = min;
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( ! m_siteNumInlinksTotalValid ||
// m_siteNumInlinksTotal < min ) {
// m_siteNumInlinksTotal = min;
// m_siteNumInlinksTotalValid = true;
// }
}
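// Illustrative example for the sitelinks.txt floor above (numbers are
// hypothetical): if the file says this host has at least 120 inlinks but
// the msg25 sample only found 80 good inlinks, m_siteNumInlinks is bumped
// up to 120.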
// deal with it
return &m_siteNumInlinks;
}
// . do a 'site:xyz.com | gbnuminlinks' query to get the top docs
// from a site and get the gigabits from that query!
// . then store the resulting gigabits into tagdb for efficiency
// . recompute once per month or so ... or if ip changes i guess
// . we need the root title as a source for city and adm1's for
// Addresses::set() function
//char **XmlDoc::getSiteGigabits ( ) {
//}
// TODO: can we have a NULL LinkInfo without having had an error?
LinkInfo *XmlDoc::getSiteLinkInfo() {
// lookup problem?
if ( g_errno ) {
log("build: error getting link info: %s",
mstrerror(g_errno));
return NULL;
}
setStatus ( "getting site link info" );
if ( m_siteLinkInfoValid )
//return msg25.m_linkInfo;
return (LinkInfo *)m_mySiteLinkInfoBuf.getBufStart();
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
int32_t *fip = getFirstIp();
if ( ! fip || fip == (int32_t *)-1) return (LinkInfo *)fip;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// assume valid when it returns
m_siteLinkInfoValid = true;
// use this buffer so XmlDoc::print() can display it where it wants
SafeBuf *sb = NULL;
if ( m_pbuf ) sb = &m_siteLinkBuf;
// only do this for showing them!!!
if ( m_useSiteLinkBuf ) sb = &m_siteLinkBuf;
//bool onlyGetGoodInlinks = true;
//if ( m_useSiteLinkBuf ) onlyGetGoodInlinks = false;
// get this
int32_t lastUpdateTime = getTimeGlobal();
// get from spider request if there
//bool injected = false;
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
bool onlyNeedGoodInlinks = true;
// so if steve wants to display all links then set this
// to false so we get titles of bad inlinks
// seems like pageparser.cpp just sets m_pbuf and not
// m_usePageLinkBuf any more
if ( sb ) onlyNeedGoodInlinks = false;
// shortcut
//Msg25 *m = &m_msg25;
if ( ! getLinkInfo ( &m_tmpBuf11,
&m_mcast11,
mysite , // site
mysite , // url
true , // isSiteLinkInfo?
*fip ,
0 , // docId
cr->m_collnum , //linkInfoColl
NULL , // qbuf
0 , // qbufSize
m_masterState ,
m_masterLoop ,
m_contentInjected ,// isInjecting?
sb ,
m_printInXml ,
0 , // sitenuminlinks -- dunno!
//0 , // sitePop
NULL , // oldLinkInfo1 ,
m_niceness ,
cr->m_doLinkSpamCheck ,
cr->m_oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
false,
0,
0,
// it will store the linkinfo into this safebuf
&m_mySiteLinkInfoBuf) )
// return -1 if it blocked
return (LinkInfo *)-1;
// sanity check
//if ( ! m_msg25.m_linkInfo ) {
// log("build: error making link info: %s",mstrerror(g_errno));
// return NULL;
//}
// we got it
//return m_msg25.m_linkInfo;
// getLinkInfo() now calls multicast so it returns true on errors only
log("build: error making link info: %s",mstrerror(g_errno));
return NULL;
}
static void gotIpWrapper ( void *state , int32_t ip ) ;
static void delayWrapper ( int fd , void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_masterLoop ( THIS->m_masterState );
}
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
int32_t *XmlDoc::getIp ( ) {
// return if we got it
if ( m_ipValid ) return &m_ip;
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = 0;
// assume the same in case we get it right away
m_ipEndTime = 0;
// if set from docid and recycling
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (int32_t *)pod;
// shortcut
XmlDoc *od = *pod;
// set it
if ( od ) {
m_ip = od->m_ip;
m_ipValid = true;
return &m_ip;
}
}
// fake it for now
//log("FAKING IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
//m_ip = atoip("74.201.80.152",13);
//m_ipValid = true;
//return &m_ip;
// get the best url
Url *u = getCurrentUrl();
if ( ! u || u == (void *)-1 ) return (int32_t *)u;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "qatest123" collection
if ( useTestCache ) { // && m_useIpsTxtFile ) {
// stolen from msgc.cpp:
// if url is already in a.b.c.d format return that
int32_t ip2 = 0;
char *host = u->getHost();
if ( host ) ip2 = atoip ( host,u->getHostLen() );
if ( ip2 != 0 ) {
m_ip = ip2;
m_ipValid = true;
return &m_ip;
}
// assume not found in our file
bool found = false;
// get test dir
char *testDir = getTestDir();
// get it from "./test/ips.txt"
getTestIp ( u->getUrl() , &m_ip , &found , m_niceness,testDir);
// if we found a match...
if ( found ) { // m_ip != 0 ) {
// we are valid now
return gotIp ( false );
//m_ipValid = true;
// return it
//return &m_ip;
}
}
// we need the ip before we download the page, but before we get
// the IP and download the page, wait for this many milliseconds.
// this basically slows the spider down.
int32_t delay = cr->m_spiderDelayInMilliseconds;
// ignore for testing
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
// injected?
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
// . don't do the delay when downloading extra doc, robots.txt etc.
// . this also reports a status msg of "getting new doc" when it
// really means "delaying spider"
if ( m_isChildDoc ) delay = 0;
if ( delay > 0 && ! m_didDelay ) {
// we did it
m_didDelay = true;
m_statusMsg = "delaying spider";
// random fuzz so we don't get everyone being unleashed at once
int32_t radius = (int32_t)(.20 * (double)delay);
int32_t fuzz = (rand() % (radius * 2)) - radius;
delay += fuzz;
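// e.g. (illustrative) a configured delay of 1000 ms gives radius=200,
// fuzz in [-200,199], so the actual sleep lands in [800,1199] ms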
// make a callback wrapper.
// this returns false and sets g_errno on error
if ( g_loop.registerSleepCallback ( delay ,
m_masterState ,
delayWrapper,//m_masterLoop
m_niceness ))
// wait for it, return -1 since we blocked
return (int32_t *)-1;
// if was not able to register, ignore delay
}
if ( m_didDelay && ! m_didDelayUnregister ) {
g_loop.unregisterSleepCallback(m_masterState,delayWrapper);
m_didDelayUnregister = true;
}
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = gettimeofdayInMillisecondsGlobal();
// assume valid! if reply handler gets g_errno set then m_masterLoop
// should see that and call the final callback
//m_ipValid = true;
// get it
if ( ! m_msgc.getIp ( u->getHost () ,
u->getHostLen() ,
&m_ip ,
this ,
gotIpWrapper ))
// we blocked
return (int32_t *)-1;
// wrap it up
return gotIp ( true );
}
void gotIpWrapper ( void *state , int32_t ip ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();
// wrap it up
THIS->gotIp ( true );
// . call the master callback
// . m_masterState usually equals THIS, unless THIS is the
// Xml::m_contactDoc or something...
THIS->m_masterLoop ( THIS->m_masterState );
}
int32_t *XmlDoc::gotIp ( bool save ) {
// return NULL on error
if ( g_errno ) return NULL;
// this is bad too
//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
setStatus ("got ip");
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// note it for crawlbot
if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
log("db: got ip %"INT32" for %s",
m_ip,getCurrentUrl()->getUrl());
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "qatest123" collection
if ( save && useTestCache ) {
// ip of 0 means NXDOMAIN i think (-1 means error)
//if ( m_ip == 0 ) {
// log("waiting for debug break");
// sleep(3600);
//}
// get the best url
Url *u = getCurrentUrl();
if ( !u || u == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . add it to "./test/ips.txt"
// . this function is in Msge1.cpp
addTestIp ( u->getHost() , u->getHostLen() , m_ip );
// get test dir
char *testDir = getTestDir();
// save it
saveTestBuf ( testDir );
}
// we got it
m_ipValid = true;
// give it to them
return &m_ip;
}
#include "Mime.h"
// taken from Robotdb.cpp
bool isAllowed2 ( Url *url ,
char *userAgent ,
char *file ,
int32_t fileLen ,
bool *userAgentFound ,
bool substringMatch ,
int32_t *crawlDelay ,
char **cacheStart ,
int32_t *cacheLen ,
bool *hadAllowOrDisallow ) {
// assume nothing to cache yet
*cacheLen = 0;
*cacheStart = file;
// assume user agent is not in the file
*userAgentFound = false;
*hadAllowOrDisallow = false;
// assume no crawl delay (-1)
// *crawlDelay = -1;
// if fileLen is 0 it is allowed
if ( fileLen <= 0 ) return true;
// get path from url, include cgi stuff
char *path = url->getPath();
int32_t pathLen = url->getPathLenWithCgi();
// set the Mime class to this Mime file
Mime mime;
mime.set ( file , fileLen );
// get a line of Mime
char *f , *v;
int32_t flen, vlen;
// user agent length
int32_t uaLen = gbstrlen (userAgent);
// ptr into "file"
char *p = file;
char flag;
bool allowed = true;
loop:
// if p is NULL now we're done
if ( ! p ) return allowed;
// get the next Mime line
p = mime.getLine ( p , &f , &flen , &v , &vlen );
// if this field is NOT "user-agent" skip it
if ( flen != 10 ) goto loop;
if ( strncasecmp ( f , "user-agent" , 10 ) != 0 ) goto loop;
gotAgent:
//some webmasters put comments at the end of their lines,
//because they think this is a shell script or something.
char* vv = v;
while(vv - v < vlen && *vv != '#') vv++;
vlen = vv - v;
// decrement vlen to hack off spaces after the user-agent so that vlen
// is really the length of the user agent
while ( vlen > 0 && is_wspace_a(v[vlen-1]) ) vlen--;
// now match the user agent
if ( ! substringMatch && vlen != uaLen ) goto loop;
// otherwise take the min of the lengths
if ( uaLen < vlen ) vlen = uaLen;
// is it the right user-agent?
if ( strncasecmp ( v , userAgent , vlen ) != 0 ) goto loop;
// we got it, if first instance start our cache here
if ( !*userAgentFound ) *cacheStart = f;
*userAgentFound = true;
flag = 0;
urlLoop:
// if p is NULL now there is no more lines
if ( ! p ) {
// set our cache stop to the end of the file
*cacheLen = (file + fileLen) - *cacheStart;
return allowed;
}
// now loop over lines until we hit another user-agent line
p = mime.getLine ( p , &f , &flen , &v , &vlen );
// if it's another user-agent line ... ignore it unless we already
// saw an allow/disallow/crawl-delay/sitemap line, in which case this
// starts another rule set, so go back and re-check the agent name
if ( flag && flen==10 && strncasecmp(f,"user-agent",10)==0) {
// set our cache stop here
*cacheLen = f - *cacheStart;
goto gotAgent;
}
// if a crawl delay, get the delay
if ( flen == 11 && strncasecmp ( f , "crawl-delay", 11 ) == 0 ) {
// set flag
flag = 1;
// skip if invalid. it could be ".5" seconds
if ( ! is_digit ( *v ) && *v != '.' ) goto urlLoop;
// get this. multiply crawl delay by x1000 to be in
// milliseconds/ms
int64_t vv = (int64_t)(atof(v) * 1000LL);
// truncate to 0x7fffffff
if ( vv > 0x7fffffff ) *crawlDelay = 0x7fffffff;
else if ( vv < 0 ) *crawlDelay = -1;
else *crawlDelay = (int32_t)vv;
// get the delay
//*crawlDelay = atol(v) * 1000;
goto urlLoop;
}
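// e.g. "Crawl-delay: .5" yields *crawlDelay = 500 (ms),
// "Crawl-delay: 86400" yields 86,400,000 ms, and anything whose
// millisecond value overflows 32 bits is clamped to 0x7fffffff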
// if already disallowed, just goto the next line
if ( !allowed ) goto urlLoop;
// if we have an allow line or sitemap: line, then set flag to 1
// so we can go to another user-agent line.
// fixes romwebermarketplace.com/robots.txt
// (doc.156447320458030317.txt)
if ( flen==5 && strncasecmp(f,"allow" ,5)==0 ) {
*hadAllowOrDisallow = true;
flag = 1;
}
if ( flen==7 && strncasecmp(f,"sitemap",7)==0 ) {
flag = 1;
}
// if not disallow go to loop at top
if ( flen != 8 ) goto urlLoop;
if ( strncasecmp ( f , "disallow" , 8 ) != 0 ) {
goto urlLoop;
}
// we had a disallow
*hadAllowOrDisallow = true;
// set flag
flag = 1;
// . take off trailing chars from the banned path name
// . this is now done below
//while ( vlen > 0 && is_space(v[vlen-1]) ) vlen--;
// . skip leading spaces
// . this should be done in mime class
// while ( vlen > 0 && is_space(v[0]) ) { v++; vlen--; }
// now stop at first space after url or end of line
char *s = v;
char *send = v + vlen;
// skip all non-space chars
while ( s < send && ! is_wspace_a(*s) ) s++;
// stop there
vlen = s - v;
// check for match
char *tmpPath = path;
int32_t tmpPathLen = pathLen;
// assume path begins with /
if ( vlen > 0 && v[0] != '/'){tmpPath++;tmpPathLen--;}
if ( vlen > tmpPathLen ) goto urlLoop;
if ( strncasecmp(tmpPath,v,vlen) != 0 ) goto urlLoop;
// an exact match
if ( vlen == tmpPathLen ) {
//return false;
allowed = false;
goto urlLoop;
}
// must be something
if ( vlen <= 0 ) goto urlLoop;
// "v" may or may not end in a /, it really should end in a / though
if ( v[vlen-1] == '/' && tmpPath[vlen-1] == '/' ) {
//return false;
allowed = false;
goto urlLoop;
}
if ( v[vlen-1] != '/' && tmpPath[vlen ] == '/' ) {
//return false;
allowed = false;
goto urlLoop;
}
// let's be stronger. just do the substring match. if the webmaster
// does not want us splitting path or file names then they should end
// all of their robots.txt entries in a '/'. this also fixes the
// problem of the "Disallow: index.htm?" line.
//return false;
allowed = false;
// get another url path
goto urlLoop;
}
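// Illustrative example for isAllowed2() above (agent name and paths are
// hypothetical). Given this robots.txt body:
//
//   User-agent: Gigabot
//   Crawl-delay: 2
//   Disallow: /private
//
// and assuming the configured spider user agent begins with "Gigabot",
// the left-anchored substring match finds that block, sets
// *crawlDelay = 2000 ms, and isAllowed2() returns false for any path
// that starts with "/private" (e.g. /private/page.html) but true for
// /public/x.html, since disallow lines are treated as left-anchored
// prefix matches on the path.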
// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
int32_t *XmlDoc::getFinalCrawlDelay() {
if ( m_finalCrawlDelayValid )
return &m_finalCrawlDelay;
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1 ) return (int32_t *)isAllowed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_finalCrawlDelayValid = true;
// getIsAllowed already sets m_crawlDelayValid to true
if ( ! cr->m_isCustomCrawl ) {
m_finalCrawlDelay = m_crawlDelay;
// default to 250ms i guess if none specified in robots
// just to be somewhat nice by default
if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
return &m_finalCrawlDelay;
}
// get manually specified crawl delay in seconds. convert to ms.
int32_t manual = (int32_t)(cr->m_collectiveCrawlDelay * 1000.0);
// negative means -1 means unknown or not specified
if ( manual < 0 ) manual = -1;
// if both are unknown...
if ( m_crawlDelay == -1 && manual == -1 ) {
m_finalCrawlDelay = -1;
return &m_finalCrawlDelay;
}
// if not in robots.txt use manual
if ( m_crawlDelay == -1 ) {
m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
// if manually provided crawldelay is -1, use robots.txt then
if ( manual == -1 ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// let robots.txt dictate if both are >= 0
if ( m_useRobotsTxt ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// if not using robots.txt, pick the smallest
if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
else m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
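// Summary of the custom-crawl branch above (non-custom crawls just take the
// robots.txt value, defaulting to 250 ms if none was specified):
//   robots.txt delay | manual delay | final delay
//   -----------------+--------------+------------------------------------
//        -1          |     -1       | -1 (unknown)
//        -1          |    >= 0      | manual
//       >= 0         |     -1       | robots.txt
//       >= 0         |    >= 0      | robots.txt if m_useRobotsTxt is set,
//                    |              | else the smaller of the two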
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
if ( m_isRobotsTxtUrlValid )
return m_isRobotsTxtUrl;
Url *fu = getFirstUrl();
m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
m_isRobotsTxtUrlValid = true;
return m_isRobotsTxtUrl;
}
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
// . getting a robots.txt is not trivial since we need to follow redirects,
// so we make use of the powerful XmlDoc class for this
bool *XmlDoc::getIsAllowed ( ) {
// return if we got it
if ( m_isAllowedValid ) return &m_isAllowed;
// could be turned off for everyone
if ( ! m_useRobotsTxt ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
m_crawlDelay = -1;
//log("xmldoc: skipping robots.txt lookup for %s",
// m_firstUrl.m_url);
return &m_isAllowed;
}
// . if setting from a title rec, assume allowed
// . this avoids doConsistencyCheck() from blocking and coring
if ( m_setFromTitleRec ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
if ( m_recycleContent ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
// HACK: so we can spider archive.org warcs and arcs internally
if ( m_firstUrlValid &&
m_firstUrl.getDomainLen() == 11 &&
strncmp ( m_firstUrl.getDomain() , "archive.org" , 11 ) == 0 ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
// double get?
if ( m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
// bulk jobs don't need this
CollectionRec *cr = getCollRec();
if ( cr && cr->m_isCustomCrawl == 2 ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
// . if WE are robots.txt that is always allowed!!!
// . check the *first* url since these often redirect to weird things
if ( isFirstUrlRobotsTxt() ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
// make it super fast...
m_crawlDelay = 0;
return &m_isAllowed;
}
// or if using the "qatest123" collection, assume yes!
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
// m_isAllowed = true;
// m_isAllowedValid = true;
// return &m_isAllowed;
//}
// update status msg
setStatus ( "getting robots.txt" );
// sanity
int32_t *ip = getIp ();
// error? or blocked?
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
Url *fu = getFirstUrl();
// if ip does not exist on the dns, do not try to download robots.txt
// it is pointless... this can happen in the dir coll and we basically
// have "m_siteInCatdb" set to true
if ( *ip == 1 || *ip == 0 || *ip == -1 ) {
// note this
log("build: robots.txt ip is %s for url=%s. allowing for now.",
fu->getUrl(),iptoa(*ip));
// just core for now
//char *xx=NULL;*xx=0;
m_isAllowed = true;
m_isAllowedValid = true;
// since ENOMIME is no longer causing the indexCode
// to be set, we are getting a core because crawlDelay
// is invalid in getNewSpiderReply()
m_crawlDelayValid = true;
m_crawlDelay = -1;
return &m_isAllowed;
}
// we need this so getExtraDoc does not core
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (bool *)pfip;
// get the current url after redirects
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (bool *)cu;
// set m_extraUrl to the robots.txt url
char buf[MAX_URL_LEN+2];
char *p = buf;
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
else p += sprintf ( p , "http://" );
// sanity
if ( ! cu->getHost() ) { char *xx=NULL;*xx=0; }
gbmemcpy ( p , cu->getHost() , cu->getHostLen() );
p += cu->getHostLen();
int32_t port = cu->getPort();
// 80 is the default port
int32_t defPort = 80;
// is it https://?
if ( cu->m_url[4] == 's' ) defPort = 443;
if ( port != defPort ) p += sprintf ( p , ":%"INT32"",port );
p += sprintf ( p , "/robots.txt" );
m_extraUrl.set ( buf );
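// e.g. (hypothetical url) a current url of https://example.com:8443/a/b?c=1
// yields an m_extraUrl of https://example.com:8443/robots.txt, while
// http://example.com/a/b yields http://example.com/robots.txt since 80 is
// the default port and is omitted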
// . maxCacheAge = 3600 seconds = 1 hour for robots.txt
// . if this is non-zero then msg13 should store it as well!
// . for robots.txt it should only cache the portion of the doc
// relevant to our user agent!
// . getHttpReply() should use msg13 to get cached reply!
XmlDoc **ped = getExtraDoc ( m_extraUrl.getUrl() , 3600 );
if ( ! ped || ped == (void *)-1 ) return (bool *)ped;
// assign it
XmlDoc *ed = *ped;
// return NULL on error with g_errno set
if ( ! ed ) {
// sanity check, g_errno must be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// log it -- should be rare?
log("doc: had error getting robots.txt: %s",
mstrerror(g_errno));
return NULL;
}
// inherit this
//if ( ! m_useIpsTxtFile ) ed->m_useIpsTxtFile = false;
// . steal m_firstIp from us to avoid tag rec lookup
// . why was this commented out?
// . maybe because if we redirect, this is not the same!!!
//ed->m_firstIp = m_firstIp;
//ed->m_firstIpValid = m_firstIpValid;//true;
// also, steal our ip! neither is this!
//ed->m_ip = m_ip;
//ed->m_ipValid = m_ipValid;
// . now try the content
// . should call getHttpReply
char **pcontent = ed->getContent();
if ( ! pcontent || pcontent == (void *)-1 ) return (bool *)pcontent;
// get the mime
HttpMime *mime = ed->getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (bool *)mime;
// get this
int32_t contentLen = ed->m_contentLen;
// save this
m_robotsTxtLen = contentLen;
m_robotsTxtLenValid = true;
// get content
char *content = *pcontent;
// sanity check
if ( content && contentLen>0 && content[contentLen] != '\0'){
char*xx=NULL;*xx=0;}
// reset this. -1 means unknown or none found.
m_crawlDelay = -1;
m_crawlDelayValid = true;
// assume valid and ok to spider
m_isAllowed = true;
m_isAllowedValid = true;
// put in a crawldelay test for diffbot
/*
SafeBuf tmp;
if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) {
tmp.safePrintf("User-Agent: *\n"
"Crawl-Delay: 10.1\n"
);
content = tmp.getBufStart();
contentLen = tmp.getLength();
}
// if not success, assume no robots.txt
else*/
if ( mime->getHttpStatus() != 200 ) {
// nuke it to save mem
nukeDoc ( ed );
return &m_isAllowed;
}
// get the url we lookup
//Url *cu = getCurrentUrl();
// this is set to true if our userAgent was found explicitly
bool uaFound;
bool allowed;
char *cacheStart;
int32_t cacheLen;
bool hadAllowOrDisallow;
int32_t savedCrawlDelay = -1;
// now use left-anchored substring match so we can match Gigabot/1.0
allowed = isAllowed2 ( cu ,
g_conf.m_spiderUserAgent ,
content ,
contentLen ,
&uaFound ,
true , // substrmatch?
&m_crawlDelay ,
&cacheStart ,
&cacheLen ,
&hadAllowOrDisallow );
// save it
savedCrawlDelay = m_crawlDelay;
// . if didn't find our user agent so check for * as a user-agent
// . www.wikihow.com/robots.txt just has "Gigabot: crawl-delay:10\n"
// and then a "User-Agent: *" after that with the disallows, so
// i added the hadAllowDisallow parm
if ( ! uaFound || ! hadAllowOrDisallow )
allowed = isAllowed2 ( cu ,
"*" ,
content ,
contentLen ,
&uaFound ,
false , // substrmatch?
&m_crawlDelay ,
&cacheStart ,
&cacheLen ,
&hadAllowOrDisallow );
// restore the agent-specific crawl-delay if the "*" pass overwrote it
if ( savedCrawlDelay != -1 ) m_crawlDelay = savedCrawlDelay;
// nuke it to save mem
nukeDoc ( ed );
// we are legit
m_isAllowed = allowed;
m_isAllowedValid = true;
return &m_isAllowed;
}
// . lookup the title rec with the "www." if we do not have that in the url
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
char *XmlDoc::getIsWWWDup ( ) {
// this is not a real error really
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// return if we got it
if ( m_isWWWDupValid ) return &m_isWWWDup;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// could be turned off for everyone
if ( ! cr->m_dupCheckWWW ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// get the FIRST URL... (no longer current url after redirects)
Url *u = getFirstUrl(); // CurrentUrl();
// if we are NOT a DOMAIN-ONLY url, then no need to do this dup check
if ( u->getDomainLen() != u->getHostLen() ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// must NOT have a www
if ( ! u->isHostWWW() ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// watch out for idiot urls like www.gov.uk and www.gov.za
// treat them as though the TLD is uk/za and the domain
// is gov.uk and gov.za
if ( u->getDomain() &&
strncmp ( u->getDomain() , "www." , 4 ) == 0 ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// make it without the www
char withoutWWW[MAX_URL_LEN+1];
char *proto = "http";
if ( u->isHttps() ) proto = "https";
sprintf(withoutWWW,"%s://%s",proto,u->getDomain());
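// e.g. (hypothetical) a first url of http://www.example.com/ builds
// withoutWWW = "http://example.com"; if titledb already has a doc for that
// domain-only host then this www version is considered a dup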
// assume yes
m_isWWWDup = true;
if ( ! m_calledMsg22f )
setStatus ( "getting possible www dup title rec" );
// . does this title rec exist in titledb?
// . "justCheckTfndb" is set to true here!
if ( ! m_calledMsg22f &&
! m_msg22f.getTitleRec ( &m_msg22Request ,
withoutWWW ,
0 , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
NULL , // tr ptr
NULL , // tr size ptr
true , // just chk tfndb?
false, // getavaildocidonly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
false , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false )){//load balancing?
// validate
m_calledMsg22f = true;
// return -1 if we blocked
return (char *)-1;
}
// got it
m_calledMsg22f = true;
// valid now
m_isWWWDupValid = true;
// found?
if ( ! g_errno && m_msg22f.m_found ) {
// crap we are a dup
m_isWWWDup = true;
// set the index code
//m_indexCode = EDOCDUPWWW;
}
// return us
return &m_isWWWDup;
}
LinkInfo s_dummy2;
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( m_linkInfo1Valid && ptr_linkInfo1 )
return ptr_linkInfo1;
// do not generate in real-time from a msg20 request for a summary,
// because if this falls through then getFirstIp() below can return -1
// and we return -1, causing all kinds of bad things to happen for
// handling the msg20 request
if ( m_setFromTitleRec && m_req && ! ptr_linkInfo1 ) {
returnDummy:
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
}
// at least get our firstip so if cr->m_getLinkInfo is false
// then getRevisedSpiderReq() will not core because it is invalid
int32_t *ip = getFirstIp();
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;
// just return nothing if not doing link voting
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// to keep things fast we avoid getting link info for some collections
if ( ! m_linkInfo1Valid && ! cr->m_getLinkInfo ) {
ptr_linkInfo1 = NULL;
m_linkInfo1Valid = true;
}
// sometimes it is NULL in title rec when setting from title rec
if ( m_linkInfo1Valid && ! ptr_linkInfo1 ) {
goto returnDummy;
}
// return if we got it
if ( m_linkInfo1Valid )
return ptr_linkInfo1;
// change status
setStatus ( "getting local inlinkers" );
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo *)od;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni;
//int32_t *fip = getFirstIp();
//if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d;
// sanity check. error?
if ( *d == 0LL ) {
log("xmldoc: crap no g_errno");
g_errno = EBADENGINEER;
return NULL;
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
// no linkinfo for diffbot custom crawls to speed up
if ( cr->m_isCustomCrawl ) {
m_linkInfo1Valid = true;
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
}
// grab a ptr to the LinkInfo contained in our Doc class
LinkInfo *oldLinkInfo1 = NULL;
if ( *od ) oldLinkInfo1 = (*od)->getLinkInfo1();
// if ip does not exist, make it 0
if ( *ip == 0 || *ip == -1 ) {
m_linkInfo1Valid = true;
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
}
//link info generation requires an IP for internal/external computation
// UNLESS we are from getSpiderStatusDocMetaList2() ... so handle
// -1 above!
//if ( *ip == -1 || *ip == 0 ) { char *xx=NULL;*xx=0; }
// . error getting linkers?
// . on udp timeout we were coring below because msg25.m_linkInfo
// was NULL
if ( g_errno && m_calledMsg25 ) return NULL;
// prevent core as well
//if ( m_calledMsg25 && ! size_linkInfo1 ) { // m_msg25.m_linkInfo ) {
// log("xmldoc: msg25 had null link info");
// g_errno = EBADENGINEER;
// return NULL;
//}
// . now search for some link info for this url/doc
// . this queries the search engine to get linking docIds along
// with their termIds/scores from anchor text and then compiles
// it all into one IndexList
// . if we have no linkers to this url then we set siteHash, etc.
// for this linkInfo class
// . this is my google algorithm
// . let's use the first url (before redirects) for this
// . m_newDocId is used for classifying doc under predefined news topic
// . catSiteRec is used for classifying pages under a predefined
// newstopic. this is currently for news search only.
// . use the rootTitleRecPtr if there and we are doing our link info
// stuff in this collection, but if doing it in another collection
// the msg25 will look up the root in that collection...
if ( ! m_calledMsg25 ) {
// get this
int32_t lastUpdateTime = getTimeGlobal();
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
// do not redo it
m_calledMsg25 = true;
// shortcut
//Msg25 *m = &m_msg25;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// use this buffer so XmlDoc::print() can display wherever
SafeBuf *sb = NULL;
if ( m_pbuf ) sb = &m_pageLinkBuf;
// only do this for showing them!!!
if ( m_usePageLinkBuf ) sb = &m_pageLinkBuf;
// get from spider request if there
//bool injected = false;
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// we do not want to waste time computing the page title
// of bad inlinks if we only want the good inlinks, because
// as of oct 25, 2012 we only store the "good" inlinks
// in the titlerec
bool onlyNeedGoodInlinks = true;
// so if steve wants to display all links then set this
// to false so we get titles of bad inlinks
if ( m_usePageLinkBuf ) onlyNeedGoodInlinks = false;
// seems like pageparser.cpp just sets m_pbuf and not
// m_usePageLinkBuf any more
if ( m_pbuf ) onlyNeedGoodInlinks = false;
// status update
setStatus ( "calling msg25 for url" );
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// we want to get all inlinks if doing a custom crawlbot crawl
// because we need the anchor text to pass in to diffbot
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
// this seems to overdo it when we have a ton of linktext
// perhaps, so take this out...
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
// doLinkSpamCheck = false;
// oneVotePerIpDom = false;
// onlyNeedGoodInlinks = false;
//}
// call it. this is defined in Linkdb.cpp
char *url = getFirstUrl()->getUrl();
if ( ! getLinkInfo ( &m_tmpBuf12,
&m_mcast12,
mysite ,
url ,
false , // isSiteLinkInfo?
*ip ,
*d ,
cr->m_collnum , //linkInfoColl
NULL , // qbuf
0 , // qbufSize
m_masterState ,
m_masterLoop ,
m_contentInjected ,//m_injectedReply ,
sb ,
m_printInXml ,
*sni ,
//m_sitePop ,
oldLinkInfo1 ,
m_niceness ,
doLinkSpamCheck ,
oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
false, // getlinkertitles
0, // ourhosthash32 (special)
0, // ourdomhash32 (special)
&m_myPageLinkInfoBuf
) )
// blocked
return (LinkInfo *)-1;
// error?
if ( g_errno ) return NULL;
// panic! what the fuck? why did it return true and then
// call our callback???
//if ( g_conf.m_logDebugBuild ) {
log("build: xmldoc call to msg25 did not block");
// must now block since it uses multicast now to
// send the request onto the network
char *xx=NULL;*xx=0;
//}
}
// at this point assume its valid
m_linkInfo1Valid = true;
// . get the link info we got set
// . this ptr references into m_myPageLinkInfoBuf safebuf
//ptr_linkInfo1 = m_msg25.m_linkInfo;
//size_linkInfo1 = m_msg25.m_linkInfo->getSize();
ptr_linkInfo1 = (LinkInfo *)m_myPageLinkInfoBuf.getBufStart();
size_linkInfo1 = m_myPageLinkInfoBuf.length();
// we should free it
m_freeLinkInfo1 = true;
// this can not be NULL!
if ( ! ptr_linkInfo1 || size_linkInfo1 <= 0 ) {
log("build: error getting linkinfo1: %s",mstrerror(g_errno));
char *xx=NULL;*xx=0;
return NULL;
}
// take it from msg25 permanently
//m_msg25.m_linkInfo = NULL;
// set flag
m_linkInfo1Valid = true;
// . validate the hop count thing too
// . i took hopcount out of linkdb to put in lower ip byte for steve
//m_minInlinkerHopCount = -1;//m_msg25.getMinInlinkerHopCount();
// return it
return ptr_linkInfo1;
}
static void *s_null = NULL;
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
LinkInfo **XmlDoc::getLinkInfo2 ( ) {
// this can now be title hashes for XmlDoc::m_diffbotTitleHashes
// but otherwise, we don't use it for link info from another cluster
// any more.
m_linkInfo2Valid = true;
return (LinkInfo **)&s_null;
// return if we got it
if ( m_linkInfo2Valid ) return &ptr_linkInfo2;
m_linkInfo2Valid = true;
ptr_linkInfo2 = NULL;
return &ptr_linkInfo2;
/*
if ( ! cr->m_importFromHosts2Conf ) {
m_linkInfo2Valid = true;
ptr_linkInfo2 = NULL;
return &ptr_linkInfo2;
}
// change status
setStatus ( "getting remote hosts2.conf inlinkers" );
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo **)od;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo **)sni;
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo **)ip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo **)d;
// grab a ptr to the LinkInfo contained in our Doc class
LinkInfo *oldLinkInfo2 = NULL;
if ( *od ) oldLinkInfo2 = *(*od)->getLinkInfo2();
// . now search for some link info for this url/doc
// . this queries the search engine to get linking docIds along
// with their termIds/scores from anchor text and then compiles
// it all into one IndexList
// . if we have no linkers to this url then we set siteHash, etc.
// for this linkInfo class
// . this is my google algorithm
// . let's use the first url (before redirects) for this
// . m_newDocId is used for classifying doc under predefined news topic
// . catSiteRec is used for classifying pages under a predefined
// newstopic. this is currently for news search only.
// . use the rootTitleRecPtr if there and we are doing our link info
// stuff in this collection, but if doing it in another collection
// the msg25 will look up the root in that collection...
if ( ! m_calledMsg25b ) {
// do not redo it
m_calledMsg25b = true;
// shortcut
Msg25 *m = &m_msg25;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// use this buffer so XmlDoc::print() can display wherever
//SafeBuf *sb = NULL;
//if ( m_pbuf ) sb = &m_pageLinkBuf2;
// call it
if ( ! m->getPageLinkInfo2 ( getFirstUrl() ,
m_coll ,
cr->m_externalColl ,
m_masterState ,
m_masterLoop ,
cr->m_doLinkSpamCheck ,
cr->m_oneVotePerIpDom ,
canBeCancelled ) )
// blocked
return (LinkInfo **)-1;
// error?
if ( g_errno ) return NULL;
}
// at this point assume its valid
m_linkInfo2Valid = true;
// get the link info we got set
ptr_linkInfo2 = m_msg25.m_linkInfo;
// we should free it
m_freeLinkInfo2 = true;
// take it from msg25 permanently
m_msg25.m_linkInfo = NULL;
// set flag
m_linkInfo2Valid = true;
// validate the hop count thing too
//m_minInlinkerHopCount = m_msg25.getMinInlinkerHopCount();
// return it
return &ptr_linkInfo2;
*/
}
static void gotSiteWrapper ( void *state ) ;
// . we should store the site in the title rec because site getter might
// change what it thinks the site is!
char *XmlDoc::getSite ( ) {
// was there a problem getting site?
if ( m_siteValid && m_siteGetter.m_errno ) {
g_errno = m_siteGetter.m_errno;
return NULL;
}
// ok, return it
if ( m_siteValid ) return ptr_site;//m_siteGetter.m_site;
// note it
setStatus ( "getting site");
// need this
TagRec *gr = getTagRec();
// sanity check
if ( ! gr && ! g_errno ) { char *xx=NULL;*xx=0; }
// blocked or error?
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get url
Url *f = getFirstUrl();
// bogus first url? prevent core in getIsSiteRoot().
if ( f->getUrlLen() <= 1 ) {
log("xmldoc: getSite: got bogus first url.");
g_errno = EBADURL;
return NULL;
}
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int32_t timestamp = getSpideredTime();//m_spideredTime;
// add tags to tagdb?
//bool addTags = true;
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
//if ( getIsPageParser() ) addTags = false;
// do it
if ( ! m_siteGetter.getSite ( f->getUrl() ,
gr ,
timestamp ,
cr->m_collnum ,
m_niceness ,
//addTags ,
this , // state
gotSiteWrapper ))
// return -1 if we blocked
return (char *)-1;
// error?
if ( g_errno ) return NULL;
// set these then
gotSite();
return ptr_site;//m_siteGetter.m_site;
}
// set it
void gotSiteWrapper ( void *state ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotSite ();
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
void XmlDoc::gotSite ( ) {
// sanity check
if ( ! m_siteGetter.m_allDone && ! g_errno ) { char *xx=NULL;*xx=0; }
// this sets g_errno on error
ptr_site = m_siteGetter.m_site;
size_site = m_siteGetter.m_siteLen+1; // include \0
// sanity check -- must have a site
if ( ! g_errno && size_site <= 1 ) { char *xx=NULL;*xx=0; }
// sitegetter.m_errno might be set!
m_siteValid = true;
// must be valid
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
// add the sitepathdepth tag to our tagrec
//Tag *a = m_siteGetter.m_addedTag.getFirstTag();
//if ( a ) m_newTagRec.addTag ( a );
}
int64_t *XmlDoc::getSiteHash64 ( ) {
if ( m_siteHash64Valid ) return &m_siteHash64;
char *site = getSite();
// sanity check
if ( ! site && ! g_errno ) { char *xx=NULL;*xx=0; }
if ( ! site || site == (void *)-1) return (int64_t *)site;
m_siteHash64 = hash64 ( site , gbstrlen(site) );
m_siteHash64Valid = true;
return &m_siteHash64;
}
int32_t *XmlDoc::getSiteHash32 ( ) {
if ( m_siteHash32Valid ) return &m_siteHash32;
char *site = getSite();
if ( ! site || site == (void *)-1) return (int32_t *)site;
m_siteHash32 = hash32 ( site , gbstrlen(site) );
m_siteHash32Valid = true;
return &m_siteHash32;
}
void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
XmlDoc *THIS = (XmlDoc *)state;
bool hadError = false;
THIS->setStatus("got diffbot reply");
// wha?
if ( g_errno ) {
log("diffbot: http error2 %s",mstrerror(g_errno));
THIS->m_diffbotReplyError = g_errno;
hadError = true;
}
// just retry if connection got reset by peer!
if ( g_errno == ECONNRESET ||
g_errno == ETIMEDOUT ) {
retry:
// reset error in case was set below before our retry.
// getDiffbotReply() will retry because we never set
// m_diffbotReplyValid to true, below.
THIS->m_diffbotReplyError = 0;
log("buld: retrying diffbot reply");
THIS->m_diffbotReplyRetries++;
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
return;
}
THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();
//char *buf = s->m_readBuf;
// do not allow TcpServer.cpp to free it since m_diffbotReply
// is now responsible for that
//s->m_readBuf = NULL;
// set the mime
HttpMime mime;
if ( ! hadError && s && s->m_readOffset>0 &&
// set location url to "null"
! mime.set ( s->m_readBuf , s->m_readOffset , NULL ) ) {
// g_errno should be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("build: error setting diffbot mime");
THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR;
hadError = true;
}
bool retryUrl = false;
// check the status
if ( ! hadError && mime.getHttpStatus() != 200 ) {
THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS;
log("xmldoc: diffbot reply mime was %"INT32"",
mime.getHttpStatus());
hadError = true;
// gateway timed out? then retry.
if ( mime.getHttpStatus() == 504 )
retryUrl = true;
}
if ( hadError )
log("build: diffbot error for url %s",
THIS->m_diffbotUrl.getBufStart());
CollectionRec *cr = THIS->getCollRec();
if ( cr && strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
log("build: diffbot reply for url %s = %s",
THIS->m_diffbotUrl.getBufStart(),
s->m_readBuf);
}
if ( retryUrl )
goto retry;
// get page content
char *page = NULL;
int32_t pageLen = 0;
if ( ! hadError && mime.getMimeLen() >= 0 ) {
page = s->m_readBuf + mime.getMimeLen();
char *end = s->m_readBuf + s->m_readOffset;
pageLen = end - page;
}
// "-1" means diffbot had an error
if ( page &&
page[0] == '-' &&
page[1] == '1' ) {
log("xmldoc: diffbot reply was -1");
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
}
// . verify that it contains legit json and has the last field
// b/c we saw a case where the diffbot reply was truncated
// somehow
// . check to make sure it has the "url": field as all diffbot
// json replies must
if ( ! THIS->m_diffbotReplyError ) {
char *ttt = strstr ( page , "\"url\":\"");
if ( ! ttt ) ttt = strstr ( page , "\"pageUrl\":\"");
if ( ! ttt ) {
log("xmldoc: diffbot reply for %s using %s is missing "
"the url: field in the json reply. reply=%s",
THIS->m_firstUrl.m_url,
THIS->m_diffbotUrl.getBufStart(),
page
);
// try to get the right error code
char *err = strstr(page,"\"error\":\"");
if ( err ) err += 9;
int32_t code = EDIFFBOTUNKNOWNERROR;
if ( ! err &&
page[0]=='{' &&
page[1]=='}' )
code = EDIFFBOTCURLYREPLY;
if ( err && !strncmp(err,"Unable to apply rules",21))
code = EDIFFBOTUNABLETOAPPLYRULES;
// like .pdf pages get this error
if ( err && !strncmp(err,"Could not parse page",20))
code = EDIFFBOTCOULDNOTPARSE;
// if it is 404... 502, etc. any http status code
if ( err && !strncmp(err,"Could not download page",23))
code = EDIFFBOTCOULDNOTDOWNLOAD;
// custom api does not apply to the url
if ( err && !strncmp(err,"Invalid API",11))
code = EDIFFBOTINVALIDAPI;
if ( err && !strncmp(err,"Version required",16))
code = EDIFFBOTVERSIONREQ;
if ( err && !strncmp(err,"Empty content",13))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"The selected pages contains too many TextNodes",46))
code = EDIFFBOTTOOMANYTEXTNODES;
if ( err && !strncmp(err,"No content received",19))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"Request timed",13))
code = EDIFFBOTREQUESTTIMEDOUT;
if ( err &&!strncmp(err,"Request of third-party c",24))
code = EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY;
// error processing url
if ( err && !strncmp(err,"Error processing",16))
code = EDIFFBOTURLPROCESSERROR;
if ( err && !strncmp(err,"Your token has exp",18))
code = EDIFFBOTTOKENEXPIRED;
if ( err && !strncmp(err,"Not authorized API tok",22))
code = EDIFFBOTTOKENUNAUTHORIZED;
if ( err && !strncmp(err,"Error.",6) )
code = EDIFFBOTPLAINERROR;
THIS->m_diffbotReplyError = code;
}
// a hack for detecting if token is expired
		if ( cr && THIS->m_diffbotReplyError == EDIFFBOTTOKENEXPIRED ) {
// note it
log("xmldoc: pausing crawl %s (%"INT32") because "
"token is expired",cr->m_coll,
(int32_t)cr->m_collnum);
// pause the crawl
SafeBuf parmList;
// spidering enabled is the "cse" cgi parm in Parms.cpp
g_parms.addNewParmToList1 ( &parmList ,
cr->m_collnum,
"0", // val
-1 ,
"cse");
// this uses msg4 so parm ordering is guaranteed
g_parms.broadcastParmList ( &parmList , NULL , NULL );
}
}
// reply is now valid but might be empty
THIS->m_diffbotReplyValid = true;
// if json reply was truncated, that is an error as well.
// likewise we have to check if such bad json is in the serps
// when doing an icc=1 and print 'bad json' in json instead.
if ( ! THIS->m_diffbotReplyError && s->m_readOffset > 1 &&
// json must end with '}' (ignores trailing whitespace)
! endsInCurly ( s->m_readBuf , s->m_readOffset ) ) {
// hopefully this can be re-tried later.
THIS->m_diffbotReplyError = EJSONMISSINGLASTCURLY;
// make a note of it
log("build: got diffbot reply missing curly for %s",
THIS->m_firstUrl.m_url);
}
//if ( ! cr ) return;
bool countIt = true;
if ( ! cr ) countIt = false;
if ( THIS->m_diffbotReplyError ) countIt = false;
/*
// solution for bug #2092 but probably not really needed so
// commented out.
	// if doing /vxxx/analyze?mode=xxxx then ensure matches
bool isAnalyze = false;
if ( countIt &&
THIS->m_diffbotApiUrlValid &&
strstr ( THIS->m_diffbotApiUrl.getBufStart(), "/analyze?") )
isAnalyze = true;
char *mode = NULL;
if ( isAnalyze ) {
mode = strstr (THIS->m_diffbotApiUrl.getBufStart(), "mode=");
if ( mode ) mode += 5;
// find end of it
}
char *pageType = NULL;
int32_t pageTypeLen;
if ( mode &&
THIS->m_diffbotReplyValid &&
THIS->m_diffbotReply.length() > 5 ) {
char *reply = THIS->m_diffbotReply.getBufStart();
pageType = strstr ( reply , "\"type\":\"" );
if ( pageType ) pageType += 8;
char *e = pageType;
for ( ; *e && *e != '\"' ; e++ );
pageTypeLen = e - pageType;
}
// if it does not match, do not count it
if ( mode && pageType && strncmp ( mode , pageType , pageTypeLen ) )
countIt = false;
*/
// increment this counter on a successful reply from diffbot
if ( countIt ) { // ! THIS->m_diffbotReplyError && cr ) {
// mark this flag
THIS->m_gotDiffbotSuccessfulReply = 1;
// count it for stats
cr->m_localCrawlInfo.m_pageProcessSuccesses++;
cr->m_globalCrawlInfo.m_pageProcessSuccesses++;
// per round as well
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound++;
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound++;
// log it
log(LOG_INFO,
"build: processed page %s (pageLen=%"INT32")",
THIS->m_firstUrl.m_url,
pageLen);
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
// sanity!
// crap, this can happen if we try to get the metalist
// of an old page for purposes of incremental indexing or
// deletion. we do not re-download it, but it seems we try
// to re-process it...
//if ( cr->m_localCrawlInfo.m_pageProcessAttempts >
// cr->m_localCrawlInfo.m_pageDownloadAttempts ) {
// char *xx=NULL;*xx=0; }
// need to save collection rec now during auto save
cr->m_needsSave = true;
// the diffbot api url we used
//SafeBuf *au = THIS->getDiffbotApiUrl();
//if ( ! au || au == (void *)-1 ) {char *xx=NULL;*xx=0;}
// set the reply properly
int32_t need = pageLen + 1;// + au->length() + 1;
if ( ! THIS->m_diffbotReply.reserve ( need ) )
goto skip;
// first store the url we used on first line
//THIS->m_diffbotReply.safeMemcpy ( au->getBufStart(),
// au->length() );
//THIS->m_diffbotReply.pushChar('\n');
// convert the \u1f23 to utf8 (\n and \r as well)
// crap, this decodes \\\\\" to \\" which is causing
// the json parser to believe it is an encoded \ then
// a REAL quote... but quote is contained...
//THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen ,
// THIS->m_niceness );
		// do not do that anymore; jsonparse can call it
		// on a per-string basis
THIS->m_diffbotReply.safeMemcpy ( page , pageLen );
// convert embedded \0 to space
//char *p = THIS->m_diffbotReply.getBufStart();
//char *pend = p + THIS->m_diffbotReply.getLength();
// tack on a \0 but don't increment m_length
THIS->m_diffbotReply.nullTerm();
// any embedded \0's in the utf8?
int32_t testLen1 = THIS->m_diffbotReply.length();
int32_t testLen2 = gbstrlen(THIS->m_diffbotReply.getBufStart());
if ( testLen1 != testLen2 ) { char *xx=NULL;*xx=0; }
// convert the \u1f23 to utf8 (\n and \r as well)
//THIS->m_diffbotReply.decodeJSONToUtf8 ( THIS->m_niceness );
//THIS->m_diffbotReply.nullTerm();
}
skip:
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
if ( m_diffbotApiUrlValid )
return &m_diffbotApiUrl;
// if we are a diffbot json object, do not re-send to diffbot!
if ( m_isDiffbotJSONObject ) {
//m_diffbotApiNum = DBA_NONE;
m_diffbotApiUrlValid = true;
return &m_diffbotApiUrl;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_diffbotApiUrl.safeMemcpy ( &cr->m_diffbotApiUrl );
m_diffbotApiUrl.nullTerm();
m_diffbotApiUrlValid = true;
// this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid
// in case the url filters table changes while spidering this!!!
// gotta be careful of that.
//int32_t *ufn = getUrlFilterNum();
//if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
// ensure it does set it!
//if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }
//m_diffbotApiNum = cr->m_spiderDiffbotApiNum[*ufn];
// sanity check
//if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
//m_diffbotApiNumValid = true;
return &m_diffbotApiUrl;
}
// if only processing NEW URLs is enabled, then do not get diffbot reply
// if we already got one before
bool *XmlDoc::getRecycleDiffbotReply ( ) {
if ( m_recycleDiffbotReplyValid )
return &m_recycleDiffbotReply;
// if from pageparser.cpp re-call diffbot for debugging
if ( getIsPageParser() ) {
m_recycleDiffbotReply = false;
m_recycleDiffbotReplyValid = true;
return &m_recycleDiffbotReply;
}
XmlDoc **odp = getOldXmlDoc( );
if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp;
XmlDoc *od = *odp;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if doc has been successfully processed in the past then
// ***RECYCLE*** the diffbot reply!
m_recycleDiffbotReply = false;
if ( cr->m_diffbotOnlyProcessIfNewUrl &&
od && od->m_gotDiffbotSuccessfulReply )
m_recycleDiffbotReply = true;
// to fight off corrupted title recs just assume that even though
// we could not uncompress the title rec that it had a successful reply
// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
// m_oldDocExistedButHadError )
// m_recycleDiffbotReply = true;
	// don't recycle if specifically asked to reindex though
if ( m_sreqValid && m_sreq.m_isPageReindex )
m_recycleDiffbotReply = false;
// unless the 'recycle content' checkbox was checked when doing
// the query (page) reindex...
if ( m_sreqValid && m_sreq.m_recycleContent )
m_recycleDiffbotReply = true;
m_recycleDiffbotReplyValid = true;
return &m_recycleDiffbotReply;
}
// get hashes of the json objects in the diffbotreply
int32_t *XmlDoc::getDiffbotTitleHashes ( int32_t *numHashes ) {
*numHashes = size_linkInfo2 / 4;
if ( ! ptr_linkInfo2 ) *numHashes = 0;
// hack: use linkdbdata2 field
if ( m_diffbotTitleHashBufValid ) {
// do not return NULL without g_errno set
if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
return (int32_t *)ptr_linkInfo2;
}
SafeBuf *tdbr = getTokenizedDiffbotReply();
if ( ! tdbr || tdbr == (void *)-1 ) return (int32_t *)tdbr;
HashTableX dedup;
if ( ! dedup.set ( 4,0,1024,NULL,0,false,m_niceness,"ddthbuf") )
return NULL;
// parse out the json items in the reply
char *p = tdbr->getBufStart();
char *pend = p + tdbr->length();
int32_t plen;
for ( ; p < pend ; p += plen + 1 ) {
// breathe some in case diffbot reply is 250MB
QUICKPOLL(m_niceness);
// set this
plen = gbstrlen(p);
// get title from it
int32_t valLen;
char *val = getJSONFieldValue ( p , "title", &valLen );
int32_t th32 = 0;
// hash the title
if ( val && valLen ) {
th32 = hash32 ( val , valLen );
// avoid 0
if ( th32 == 0 ) th32 = 1;
}
// if no title, use hash of body
if ( th32 == 0 ) {
th32 = hash32 ( p , plen );
// avoid 0
if ( th32 == 0 ) th32 = 2;
}
// if our hash is duplicated then increment until unique
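		// e.g. (hypothetical) two items both titled "Widget" hash
		// to the same value H; the second is stored as H+1 so each
		// json item keeps a distinct title hash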
while ( dedup.isInTable ( &th32 ) ) th32++;
// store it for deduping
dedup.addKey ( &th32 );
// store it
m_diffbotTitleHashBuf.pushLong(th32);
}
ptr_linkInfo2 = (LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
size_linkInfo2 = m_diffbotTitleHashBuf.length();
*numHashes = size_linkInfo2 / 4;
m_diffbotTitleHashBufValid = true;
// if no hashes return 0x01 because NULL means g_errno
if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
return (int32_t *)ptr_linkInfo2;
}
// . we now get the TOKENIZED diffbot reply.
// . that converts a single diffbot reply into multiple \0 separated
// json objects.
// . for instance, the diffbot product api returns an array like
// "products":[{...},{...}],"url":... that consists of multiple
// json product items, but the json elements that are not in
// this array are description of the page itself, like url and title.
// so we need to carry over these outer json objects to each
// inner json object we tokenize.
// . in this fashion we'll have separate objects that can each be indexed
// as a single page, which is what we want for searching.
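// . illustrative (hypothetical) example: a reply like
//     {"url":"http://example.com/","title":"T","products":[{"a":1},{"b":2}]}
//   is tokenized into two \0-separated objects that each keep the outer
//   fields:
//     {"url":"http://example.com/","title":"T","product":{"a":1}}\0
//     {"url":"http://example.com/","title":"T","product":{"b":2}}\0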
SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
if ( m_tokenizedDiffbotReplyValid )
return m_tokenizedDiffbotReplyPtr;
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return dbr;
// empty? that's easy. might be just "{}\n" i guess
if ( dbr->length() <= 3 ) return dbr;
char *text = dbr->getBufStart();
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems ( text , m_niceness ) ) {
g_errno = EBADJSONPARSER;
return NULL;
}
JsonItem *jsonItem = jp.getItem("objects");
char *array = NULL;
int32_t arrayLen = 0;
if ( jsonItem ) {
array = jsonItem->getArrayStart();
arrayLen = jsonItem->getArrayLen();
}
if ( array && arrayLen > 0 ) {
m_v3buf.safeMemcpy( array , arrayLen );
m_v3buf.nullTerm();
// trim off the enclosing []'s
char *p = m_v3buf.getBufStart();
for ( ; *p && is_wspace_a(*p) ; p++ );
if ( *p == '[') *p = ' ';
char *e = m_v3buf.getBuf()-1;
for ( ; e>p && is_wspace_a(*e) ;e--);
if ( *e ==']') *e=' ';
// replace top level commas with \0's
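		// e.g. (illustrative) an "objects" array body of
		//   {"a":1},{"b":2,"c":"x,y"},{"d":3}
		// becomes the \0-separated items
		//   {"a":1}\0{"b":2,"c":"x,y"}\0{"d":3}
		// (commas inside quotes or nested curlies are left alone)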
int32_t curlies = 0;
char *x = p;
bool inQuotes = false;
// scan now
for ( ; *x ; x++ ) {
// escaping a backslash?
if ( *x == '\\' && x[1] == '\\' ) {
// skip two bytes then..
x++;
continue;
}
// escaping a quote? ignore quote then.
if ( *x == '\\' && x[1] == '\"' ) {
// skip two bytes then..
x++;
continue;
}
if ( *x == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
if ( *x== '{' ) {
curlies++;
continue;
}
if ( *x == '}' ) {
curlies--;
continue;
}
if ( curlies != 0 ) continue;
if ( *x == ',' ) *x = '\0';
}
m_tokenizedDiffbotReplyPtr = &m_v3buf;
m_tokenizedDiffbotReplyValid = true;
return m_tokenizedDiffbotReplyPtr;
}
// it must have \"type\":\"product or \"type\":\"image
// in order for us to do the array separation logic below.
// we don't want to do this logic for articles because they
// contain an image array!!!
// this must be on the FIRST level of the json object, otherwise
// we get errors because we got type:article and it
// contains an images array!
int32_t valLen;
char *val = getJSONFieldValue ( text , "type", &valLen );
bool isProduct = false;
bool isImage = false;
if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 )
isProduct = true;
if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 )
isImage = true;
if ( ! isProduct && ! isImage ) {
m_tokenizedDiffbotReplyValid = true;
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
return m_tokenizedDiffbotReplyPtr;
}
char *needle;
char *newTerm;
if ( isProduct ) {
needle = ",\"products\":[";
newTerm = "product";
}
else {
needle = ",\"images\":[";
newTerm = "image";
}
char *parray = strstr ( text , needle );
// if not found, no need to do anything...
if ( ! parray ) {
m_tokenizedDiffbotReplyValid = true;
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
return m_tokenizedDiffbotReplyPtr;
}
// point to [
char *pstart = parray + gbstrlen(needle) - 1;
//
	// ok, now we have to do some json ju jitsu to fix it
//
// point to array. starting at the '['
char *p = pstart;
int32_t brackets = 0;
bool inQuotes = false;
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
if ( *p == '[' ) brackets++;
if ( *p != ']' ) continue;
brackets--;
// stop if array is done. p points to ']'
if ( brackets == 0 ) break;
}
	// now point to outer items to the left of the ",\"products\":[...
char *left1 = dbr->getBufStart();
char *left2 = parray;
// then to the right. skip over the ending ']'
char *right1 = p + 1;
char *right2 = dbr->getBuf(); // end of the buffer
SafeBuf *tbuf = &m_tokenizedDiffbotReply;
// now scan the json products or images in the array
char *x = pstart;
// skip over [
x++;
// each product item in array is enclosed in {}'s
if ( *x != '{' ) {
log("build: something is wrong with diffbot reply");
g_errno = EBADENGINEER;
return NULL;
}
// reset CURLY bracket count
int32_t curlies = 0;
char *xstart = NULL;
inQuotes = false;
// scan now
for ( ; x < right1 ; x++ ) {
// escaping a quote? ignore quote then.
if ( *x == '\\' && x[1] == '\"' ) {
// skip two bytes then..
x++;
continue;
}
if ( *x == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
if ( *x== '{' ) {
if ( curlies == 0 ) xstart = x;
curlies++;
continue;
}
if ( *x == '}' ) {
curlies--;
if ( curlies != 0 ) continue;
// unreciprocated '{'? wtf???
if ( ! xstart ) continue;
// skip empty curlies
if ( x[-1] == '{' ) continue;
//
// ok, we got an item!
//
// left top items
if ( ! tbuf->safeMemcpy ( left1 , left2-left1 ) )
return NULL;
// use "product":
if ( ! tbuf->safePrintf(",\"%s\":" , newTerm ) )
return NULL;
			// the item itself, include its curlies.
if ( ! tbuf->safeMemcpy ( xstart , x - xstart+1 ) )
return NULL;
// right top items
if ( ! tbuf->safeMemcpy ( right1 , right2-right1 ) )
return NULL;
// then a \0
if ( ! tbuf->pushChar('\0') )
return NULL;
// reset this!
xstart = NULL;
}
}
// now show the items. debug!
//p = tbuf->getBufStart();
//for ( ; p < tbuf->getBuf() ; p += gbstrlen(p) + 1 )
// fprintf(stderr,"ITEM\n%s\n\n",p);
m_tokenizedDiffbotReplyPtr = tbuf;
m_tokenizedDiffbotReplyValid = true;
return m_tokenizedDiffbotReplyPtr;
}
void gotDiffbotProxyReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_diffbotProxyReply = NULL;
// if a valid reply, then point to it
if ( slot->m_readBufSize == sizeof(ProxyReply) ) {
THIS->m_diffbotProxyReply = (ProxyReply *)slot->m_readBuf;
// steal it, we will free it in XmlDoc::reset()
slot->m_readBuf = NULL;
}
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
// . convert document into json representing multiple documents
// if it makes sense. sometimes a single url contains multiple
// subdocuments that each should have their own url, but do not,
// so we fix that here.
// . the diffbot reply will be a list of json objects we want to index
SafeBuf *XmlDoc::getDiffbotReply ( ) {
// got reply of malformed json missing final '}'
if ( m_diffbotReplyValid &&
m_diffbotReplyError == EJSONMISSINGLASTCURLY ) {
// hopefully spider will retry later
g_errno = m_diffbotReplyError;
return NULL;
}
if ( m_diffbotReplyValid )
return &m_diffbotReply;
// . check the url filters table to see if diffbot api is specified
// . just return "\0" if none, but NULL means error i guess
SafeBuf *au = getDiffbotApiUrl();
if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
// if no url, assume do not access diffbot
if ( au->length() <= 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if we are json do not send that to diffbot, like an injected
	// json diffbot object. should fix json injections into global index
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
if ( *ct == CT_JSON ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// we make a "fake" url for the diffbot reply when indexing it
// by appending -diffbotxyz%"UINT32". see "fakeUrl" below.
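	// e.g. a (hypothetical) indexed child url would look like
	//   http://example.com/page.html-diffbotxyz1234567890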
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
if ( m_firstUrlValid )
log("build: diffbot url would be too long for "
"%s", m_firstUrl.getUrl() );
else
log("build: diffbot url would be too long for "
"%"INT64"", m_docId );
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// getIndexCode() calls getDiffbotReply(), so avoid a loop!
//if ( *getIndexCode() )
// return &m_diffbotReply;
if ( m_indexCodeValid && m_indexCode )
return &m_diffbotReply;
if ( m_isDiffbotJSONObject ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if this is a robots.txt or a root page we are downloading
// separately to get the title for to compare to this page's title,
// or whatever, do not pass to diffbot
if ( m_isChildDoc ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get list of substring patterns
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ucp && ! ucp[0] ) ucp = NULL;
// do we match the url process pattern or regex?
// get the compiled regular expressions
//regex_t *ucr = &cr->m_ucr;
regex_t *upr = &cr->m_upr;
//if ( ! cr->m_hasucr ) ucr = NULL;
if ( ! cr->m_hasupr ) upr = NULL;
// get the url
Url *f = getFirstUrl();
char *url = f->getUrl();
// . "upp" is a ||-separated list of substrings
// . "upr" is a regex
// . regexec returns 0 for a match
if ( upr && regexec(upr,url,0,NULL,0) ) {
// return empty reply
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
if ( upp && !upr &&!doesStringContainPattern(url,upp)) {
// return empty reply
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if already processed and onlyprocessifnewurl is enabled then
// we recycle and do not bother with this, we also do not nuke
// the diffbot json objects we have already indexed by calling
// nukeJSONObjects()
bool *recycle = getRecycleDiffbotReply();
if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle;
if ( *recycle ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if set from title rec, do not do it. we are possibly an "old doc"
// and we should only call diffbot.com with new docs
if ( m_setFromTitleRec ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// "none" means none too! Parms.cpp doesn't like &dapi1=& because
	// it does not call setParm() on such things (even though it probably
	// should) since it doesn't like empty values, so i put "none" in there.
if ( strncasecmp(au->getBufStart(),"none",4) == 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
if ( strncasecmp(au->getBufStart(),"donotprocess",12) == 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// invalid url?
Url apiUrl; apiUrl.set ( au->getBufStart() );
if (apiUrl.getUrlLen() <= 0 ||
apiUrl.getHostLen() <= 0 ||
apiUrl.getDomainLen() <= 0 ) {
log("build: invalid diffbot api url of \"%s\".",
au->getBufStart() );
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// when respidering an "old" doc, never call this. we already
// have the diffbot replies xyz.com/-diffbot-0 and xyz.com/-diffbot-1
// etc.
//if ( m_setFromTitleRec ) { char *xx = NULL; *xx = 0; }
// sanity check. no! barfs on legit url with -diffbot- in it
//if ( strstr(m_firstUrl.m_url,"-diffbot-") ) {
// char *xx=NULL; *xx = 0; }
// we should not "process" (i.e. send to diffbot) urls that do
// not match the supplied CollectionRec::m_diffbotUrlProcessPattern
// let's just put a checkbox in the url filters box for this!
// i.e. Send to Diffbot? [X]
//if ( m_useDiffbot && ! doesUrlMatchDiffbotProcessPattern() ) {
// m_diffbotReplyValid = true;
// return &m_diffbotReply;
//}
// empty content, do not send to diffbot then
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
if ( ! *u8 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// do not send to diffbot if its binary!
char *ib = getIsBinary();
if ( ! ib || ib == (void *)-1 ) return (SafeBuf *)ib;
if ( *ib ) {
m_diffbotReplyValid = true;
log("diffbot: skipping binary page %s",m_firstUrl.m_url);
return &m_diffbotReply;
}
// or if original page content matches the page regex dont hit diffbot
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// now include referring link anchor text, etc.
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
setStatus("getting diffbot reply");
// set up dedup table for deduping on link text
HashTableX dedup;
char tmp[512];
if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") )
return NULL;
SafeBuf headers;
bool first = true;
// . make additional headers
// . add two headers for every "good" (non-dup) link
// . do NOT end headers in \r\n since HttpServer adds that!
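	// . an illustrative example of the extra headers built below
	//   (urls and text are hypothetical):
	//     X-referring-url: http://referrer.example.com/page.html
	//     X-anchor-text: example widgets
	//     X-surrounding-text: see our list of example widgets here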
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// sanity
if ( k->size_urlBuf <= 1 ) continue;
// skip if too long
if ( k->size_linkText > 1024 ) continue;
// or not enough! (size includes \0)
if ( k->size_linkText <= 1 ) continue;
// sanity check
char *txt = k->getLinkText();
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// this seems to happen sometimes..
if ( ! verifyUtf8 ( txt , tlen ) ) continue;
// if anchor text has \0 skip it
if ( gbstrlen(txt) != tlen ) continue;
// or if surrounding text has \0 skip as well
char *surStr = k->getSurroundingText();
int32_t surLen = k->size_surroundingText;
if ( surLen > 0 ) surLen--;
if ( surStr && gbstrlen(surStr) != surLen ) continue;
// dedup on that
int32_t h32 = hash32 ( txt , tlen );
if ( dedup.isInTable ( &h32 ) ) continue;
if ( ! dedup.addKey ( &h32 ) ) return NULL;
// separate with \r\n
if ( ! first && ! headers.safePrintf("\r\n" ) )
return NULL;
first = false;
// add to http header
if ( ! headers.safePrintf("X-referring-url: ") )
return NULL;
// do not include the terminating \0, so -1
if ( ! headers.safeMemcpy(k->getUrl() , k->size_urlBuf-1 ))
return NULL;
// and link text
if ( ! headers.safePrintf("\r\nX-anchor-text: ") )
return NULL;
// store the anchor text without any \r or \n chars
if ( ! headers.reserve ( tlen ) ) return NULL;
char *p = txt;
char *pend = txt + tlen;
for ( ; p < pend ; p++ ) {
if ( *p == '\r' ) continue;
if ( *p == '\n' ) continue;
headers.pushChar(*p);
}
// do not include it if more than 2000 chars big
if ( surLen > 0 && surLen < 2000 ) {
if ( ! headers.safePrintf("\r\nX-surrounding-text: ") )
return NULL;
// make room for copying the surrounding text
if ( ! headers.reserve ( surLen ) ) return NULL;
// copy minus any \r or \n so its mime header safe
p = surStr;
pend = surStr + surLen;
for ( ; p < pend ; p++ ) {
if ( *p == '\r' ) continue;
if ( *p == '\n' ) continue;
headers.pushChar(*p);
}
}
}
// make sure to null term the headers
if ( headers.length() && ! headers.nullTerm() ) return NULL;
//char *path = "api";
//if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 )
// path = "v2";
//
// DIFFBOT injection interface TODO
//
// if we are intercepting a direct injection diffbot request
// then we will probably take the exact same parms provided and
// just relay them to diffbot here. maybe Diffbot.cpp can set
// the original diffbot.com request url in this xmldoc class that
	// is being injected, using the url encoded in that request.
//
// url can be on the stack since httpserver.cpp makes an http mime
// from this url
//SafeBuf diffbotUrl;
// TODO: make sure "api" works as hostname for not just product...
//diffbotUrl.safePrintf("http://www.diffbot.com/");
// skip extra '/'?
//char *api = au->getBufStart();
//int32_t apiLen = au->length();
//if ( api && api[0] == '/' ) { api++; apiLen--; }
// append the custom url. i.e. /api/analyze?mode=auto&u=
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
// reset it in case we are a re-call from gotDiffbotReplyWrapper()
// if g_errno == ECONNRESET
m_diffbotUrl.reset();
// store the api url into here
m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() );
// . m_diffbotApi Is like "article" or "product" etc.
// . if classify is true we always return the classification
// of the page in the JSON. like "type":"frontpage" regardless
// of the "api" specified.
// . otherwise, if classify is false empty json will be returned
// if there is no json objects of the specified page type, "api"
// . BUT if api is "all" return all types of json objects
// . SHOULD we return "type" in the json output?
/*
if ( *an == DBA_ALL )
diffbotUrl.safePrintf("analyze?mode=auto&" );
else if ( *an == DBA_ARTICLE_FORCE )
diffbotUrl.safePrintf("article?");
else if ( *an == DBA_ARTICLE_AUTO )
diffbotUrl.safePrintf("analyze?mode=article&");
else if ( *an == DBA_PRODUCT_FORCE )
diffbotUrl.safePrintf("product?");
else if ( *an == DBA_PRODUCT_AUTO )
diffbotUrl.safePrintf("analyze?mode=product&");
else if ( *an == DBA_IMAGE_FORCE )
diffbotUrl.safePrintf("image?");
else if ( *an == DBA_IMAGE_AUTO )
diffbotUrl.safePrintf("analyze?mode=image&");
else if ( *an == DBA_FRONTPAGE_FORCE )
diffbotUrl.safePrintf("frontpage?");
else if ( *an == DBA_FRONTPAGE_AUTO )
diffbotUrl.safePrintf("analyze?mode=frontpage&");
else {
log("build: unknown diffbot api num = %"INT32". assuming all",*an );
diffbotUrl.safePrintf("analyze?mode=auto&" );
}
*/
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return NULL;
// add a '?' if none
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
m_diffbotUrl.pushChar('?');
else
m_diffbotUrl.pushChar('&');
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
// only print token if we have one, because if user provides their
// own diffbot url (apiUrl in Parms.cpp) then they might include
// the token in that for their non-custom crawl. m_customCrawl=0.
if ( cr->m_diffbotToken.length())
m_diffbotUrl.safePrintf("token=%s",
cr->m_diffbotToken.getBufStart());
bool useProxies = true;
// user can turn off proxy use with this switch
if ( ! g_conf.m_useProxyIps ) useProxies = false;
// did collection override?
if ( cr->m_forceUseFloaters ) useProxies = true;
// we gotta have some proxy ips that we can use
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
// until we fix https CONNECT support for https urls diffbot can't
// go through gb. we should fix that by downloading the whole page
// ourselves and sending it back, and tell diffbot's phantomjs not
// to do the certificate check.
//
// for now, allow http and NOT https urls through though.
// TODO: if the url redirects to an https url will this mess us up?
// if ( ! m_firstUrlValid )
// useProxies = false;
// if ( m_firstUrlValid && m_firstUrl.isHttps() )
// useProxies = false;
// turn off for now always
//useProxies = false;
if ( useProxies && ! m_diffbotProxyReplyValid && m_ipValid ) {
// a special opcode used in SpiderProxy.cpp
Msg13Request *r = &m_diffbotProxyRequest;
r->m_opCode = OP_GETPROXYFORDIFFBOT;
r->m_banProxyIp = 0;
r->m_urlIp = m_ip;
m_diffbotProxyReplyValid = true;
// get first alive host, usually host #0 but if he is dead then
// host #1 must take over! if all are dead, it returns host #0.
// so we are guaranteed "h will be non-null
Host *h = g_hostdb.getFirstAliveHost();
// now ask that host for the best spider proxy to send to
if ( ! g_udpServer.sendRequest ( (char *)r,
// just the top part of the
// Msg13Request is sent to
// handleRequest54() now
r->getProxyRequestSize() ,
0x54 , // msgType 0x54
h->m_ip ,
h->m_port ,
-1 , // h->m_hostId ,
NULL ,
this , // state data
gotDiffbotProxyReplyWrapper,
9999999 )){// 99999sectimeout
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// report it
log("spider: msg54 request3: %s %s",
mstrerror(g_errno),r->ptr_url);
return NULL;
}
// wait for reply
return (SafeBuf *)-1;
}
// if we used a proxy to download the doc, then diffbot should too
// BUT tell diffbot to go through host #0 so we can send it to the
// correct proxy using our load balancing & backoff algos.
	// guard against a missing proxy reply; gotDiffbotProxyReplyWrapper()
	// leaves m_diffbotProxyReply NULL if the reply was not valid
	if ( useProxies && m_diffbotProxyReply ) {
//Host *h0 = g_hostdb.getHost(0);
// use a random host now to avoid host #0 running
// out of sockets from diffbot trying to connect
// for downloading hundreds of urls from the same
// high crawl delay site.
// round robin over the hosts just to be more evenly
// distributed. it will likely get several http requests
// from diffbot.
// static int32_t s_lastHostId = -1;
// if ( s_lastHostId == -1 )
// s_lastHostId = g_hostdb.m_myHost->m_hostId;
// int32_t r = s_lastHostId;//rand() % g_hostdb.m_numHosts;
// if ( ++s_lastHostId >= g_hostdb.m_numHosts )
// s_lastHostId = 0;
// Host *h0 = g_hostdb.getHost(r);
// m_diffbotUrl.safePrintf("&proxy=%s:%"INT32"",
// iptoa(h0->m_ip),
// (int32_t)h0->m_httpPort);
ProxyReply *prep = m_diffbotProxyReply;
m_diffbotUrl.safePrintf("&proxy=%s:%"UINT32"",
iptoa(prep->m_proxyIp),
(uint32_t)prep->m_proxyPort);
m_diffbotUrl.safePrintf("&proxyAuth=");
m_diffbotUrl.urlEncode(prep->m_usernamePwd);
}
// char *p = g_conf.m_proxyAuth.getBufStart();
// if ( useProxies && p ) {
// char *p1 = p;
// for ( ; *p1 && is_wspace_a(*p1) ; p1++ );
// char *p2 = p1;
// for ( ; *p2 && ! is_wspace_a(*p2) ; p2++ );
// char c = *p2;
// *p2 = '\0';
// m_diffbotUrl.safePrintf("&proxyAuth=");
// m_diffbotUrl.urlEncode(p1);
// *p2 = c;
// }
// now so it works just give it a proxy directly, so it doesn't
// have to go through gb.
// if ( useProxies ) {
// // msg13 typically uses this to get an unbanned proxy
// getProxiesToUse();
// }
// if we use proxies then increase the timeout since proxies
// increase the crawl delay in hopes of backing off to discover
// the website's policy so we don't hit it too hard and get banned.
// so to avoid diffbot timing out tell it to wait up to a minute
// because the crawl delay can be as high as that, even higher
if ( useProxies )
m_diffbotUrl.safePrintf("&timeout=%"INT32"",
(int32_t)MAX_PROXYCRAWLDELAYMS+10000);
m_diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
// append this just in case the next thing doesn't have it.
//if ( cr->m_diffbotApiQueryString.length() &&
// cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
// diffbotUrl.pushChar('&');
// then user provided parms that are dependent on if it is an
// article, product, etc. like "&dontstripads=1" or whatever
//diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart());
// for analyze requests without mode=, make sure that diffbot expands all objects
// "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze
// null term it so that we can use strstr (shouldn't be necessary since safePrintf appears to do this already and is called above)
if (m_diffbotUrl.nullTerm()) {
char *u = m_diffbotUrl.getBufStart();
if (strstr(u, "/analyze") && !strstr(u, "mode=")) {
m_diffbotUrl.safePrintf("&expand");
}
}
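	// an illustrative (hypothetical) final request url built above, with
	// the api url, token, optional proxy parms and the encoded target:
	//   http://api.diffbot.com/v3/article?token=XXX&proxy=1.2.3.4:8080
	//     &proxyAuth=user%3Apass&timeout=...&url=http%3A%2F%2Fexample.com%2F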
// null term it
m_diffbotUrl.nullTerm();
// mark as tried
if ( m_srepValid ) { char *xx=NULL;*xx=0; }
m_sentToDiffbotThisTime = true;
// might have been a recall if gotDiffbotReplyWrapper() sensed
// g_errno == ECONNRESET and it will retry
if ( ! m_sentToDiffbot ) {
m_sentToDiffbot = 1;
// count it for stats
cr->m_localCrawlInfo.m_pageProcessAttempts++;
cr->m_globalCrawlInfo.m_pageProcessAttempts++;
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
cr->m_needsSave = true;
}
char *additionalHeaders = NULL;
if ( headers.length() > 0 )
additionalHeaders = headers.getBufStart();
// if did not get the web page first and we are crawling, not
// doing a bulk, then core. we need the webpage to harvest links
// and sometimes to check the pageprocesspattern to see if we should
// process.
if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
char *xx=NULL;*xx=0; }
log(LOG_INFO,
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
additionalHeaders);
m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
0 , // ip
0 , // offset
-1 , // size
0 , // ifmodifiedsince
this , // state
gotDiffbotReplyWrapper ,
// MDW: boost timeout from 180 to 18000
// seconds so we can figure out why
// diffbot times out, etc. what is
// going on.
// this is slowing things too much
// so make it 240 seconds
240*1000, // 240 sec timeout
0,//proxyip
0,//proxyport
// unlimited replies i guess
-1,//maxtextdoclen unlimited
-1,//maxotherdoclen unlimited
g_conf.m_spiderUserAgent ,
"HTTP/1.0",
false, // do post?
NULL, // cookie
additionalHeaders ) )
// return -1 if blocked
return (SafeBuf *)-1;
// error?
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// wha?
log("diffbot: http error %s",mstrerror(g_errno));
// had an error!
return NULL;
}
char **XmlDoc::getHttpReply ( ) {
// both must be valid now
if ( m_redirUrlValid && m_httpReplyValid ) {
// might have been a download error of ECORRUPTDATA
if ( m_downloadStatus == ECORRUPTDATA ) {
// set g_errno so caller knows
g_errno = m_downloadStatus;
// null means error
return NULL;
}
// otherwise, assume reply is valid
return &m_httpReply;
}
setStatus("getting http reply");
// come back up here if a redirect invalidates it
loop:
// sanity test -- only if not the test collection (NO, might be EBADIP)
//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
// get the http reply
char **replyPtr = getHttpReply2();
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
// . now if the reply was a redirect we should set m_redirUrl to it
// and re-do all this code
// . this often sets m_indexCode to stuff like ESIMPLIFIEDREDIR, etc.
Url **redirp = getRedirUrl();
// we often lookup the assocaited linkInfo on the original url to
// see if it is worth keeping and indexing just to take advantage of
// the incoming link text it has, so we may block on that!
// but in the case of a contactDoc, getContactDoc() sets these things
// to NULL to avoid unnecessary lookups.
if ( ! redirp || redirp == (void *)-1 ) return (char **)redirp;
// sanity check
if ( *redirp && ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
// if NULL, we are done
if ( ! *redirp ) return &m_httpReply;
// . also, hang it up if we got a simplified redir url now
// . we set m_redirUrl so that getLinks() can add a spiderRequest
// for it, but we do not want to actually redirect to it to get
// the content for THIS document
if ( m_redirError ) return &m_httpReply;
// and invalidate the redir url because we do not know if the
// current url will redirect or not (mdwmdw)
m_redirUrlValid = false;
m_metaRedirUrlValid = false;
// free it
mfree ( m_httpReply , m_httpReplyAllocSize, "freehr" );
// always nullify if we free so we do not re-use freed mem
m_httpReply = NULL;
// otherwise, we had a redirect, so invalidate what we had set
m_httpReplyValid = false;
// do not invalidate this any more, now it is when we STARTED spidering
// the document
//m_spideredTimeValid = false;
m_isContentTruncatedValid = false;
// do not redo robots.txt lookup if the redir url just changed from
// http to https or vice versa
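	// e.g. http://example.com/a redirecting to https://example.com/a :
	// the parts after the scheme compare equal below, so we keep the
	// cached robots.txt verdict instead of re-fetching it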
Url *ru = *redirp;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1) return (char **)cu;
if ( strcmp ( ru->getUrl() + ru->getSchemeLen() ,
cu->getUrl() + cu->getSchemeLen() ) ) {
// redo robots.txt lookup. might be cached.
m_isAllowedValid = false;
m_crawlDelayValid = false;
}
// keep the same ip if hostname is unchanged
if ( ru->getHostLen() != cu->getHostLen() ||
strncmp ( ru->getHost() , cu->getHost(), cu->getHostLen() ) )
// ip is supposed to be that of the current url, which changed
m_ipValid = false;
// we set our m_xml to the http reply to check for meta redirects
// in the html sometimes in getRedirUrl() so since we are redirecting,
// invalidate that xml
m_xmlValid = false;
m_wordsValid = false;
m_rawUtf8ContentValid = false;
m_expandedUtf8ContentValid= false;
m_utf8ContentValid = false;
m_filteredContentValid = false;
m_contentValid = false;
m_mimeValid = false;
// update our current url now to be the redirected url
m_currentUrl.set ( *redirp , false );
m_currentUrlValid = true;
// loop it
goto loop;
}
void gotHttpReplyWrapper ( void *state ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
// this sets g_errno on error
THIS->gotHttpReply ( );
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
// "NULL" can be a valid http reply (empty page) so we need to use "char **"
char **XmlDoc::getHttpReply2 ( ) {
if ( m_httpReplyValid ) return &m_httpReply;
setStatus("getting http reply2");
// if recycle is set then NEVER download if doing query reindex
// but if doing an injection then i guess we can download.
// do not even do ip lookup if no old titlerec, which is how we
// ended up here...
if ( m_recycleContent && m_sreqValid && m_sreq.m_isPageReindex ) {
g_errno = ENOTITLEREC;
return NULL;
}
// doing a query reindex on diffbot objects does not have a
// valid spider request, only sets m_recycleContent to true
// in reindexJSONObjects()/redoJSONObjects()
if ( m_recycleContent && m_isDiffbotJSONObject ) {
g_errno = ENOTITLEREC;
return NULL;
}
// get ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (char **)ip;
// reset
m_httpReplySize = 0;
m_httpReply = NULL;
// if ip is bogus, we are done
if ( *ip == 0 || *ip == -1 ) {
log("xmldoc: ip is bogus 0 or -1 for %s. skipping download",
m_firstUrl.getUrl());
m_httpReplyValid = true;
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// need this now too. but don't hurt a nonzero val if we have
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
return &m_httpReply;
//return gotHttpReply ( );
}
// get this. should operate on current url (i.e. redir url if there)
bool *isAllowed = getIsAllowed();
// error or blocked
if ( ! isAllowed || isAllowed == (void *)-1) return (char **)isAllowed;
// this must be valid, since we share m_msg13 with it
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
int32_t *cd = getFinalCrawlDelay();
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
// we might bail
if ( ! *isAllowed ) {
m_httpReplyValid = true;
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// need this now too. but don't hurt a nonzero val if we have
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
m_downloadStatusValid = true;
// forbidden? assume we downloaded it and it was empty
m_downloadStatus = 0; // EDOCDISALLOWED;//403;
return &m_httpReply;
//return gotHttpReply ( );
}
// are we site root page?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
//int8_t *hc = getHopCount();
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
XmlDoc *od = NULL;
if ( ! m_isSpiderProxy &&
// don't lookup xyz.com/robots.txt in titledb
! isFirstUrlRobotsTxt() ) {
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
// get ptr to old xml doc, could be NULL if non exists
od = *pod;
}
// sanity check
if ( od && m_recycleContent ) {char *xx=NULL;*xx=0; }
// validate m_firstIpValid
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// robots.txt and css files etc should have m_isChildDoc as true
//if ( ! m_downloadAttempted && ! m_isChildDoc )
// // keep track of spider stats
// cr->m_localCrawlInfo.m_pageDownloadAttempts++;
// we made an attempt to download, so mark it
//m_downloadAttempted = true;
// if we didn't block getting the lock, keep going
setStatus ( "getting web page" );
// sanity check
if ( ! m_masterLoop ) { char *xx=NULL;*xx=0; }
	// shortcut. this will return the redirUrl if it is non-empty.
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
/*
// if on google, make it empty so we do not hit them
if ( strstr(cu->getUrl(),".google.com/") ) {
log("spider: encountered google.com url. emptying.");
m_httpReplyValid = true;
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// need this now too. but don't hurt a nonzero val if we have
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
return &m_httpReply;
}
*/
// no ip found means empty page i guess
//if ( *ip == 0 || *ip == -1 )
// return gotHttpReply ( );
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// sanity check
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// set parms
Msg13Request *r = &m_msg13Request;
// clear it first
r->reset();
// and set the url
//strcpy ( r->m_url , cu->getUrl() );
r->ptr_url = cu->getUrl();
r->size_url = cu->getUrlLen()+1;
// caution: m_sreq.m_hopCountValid is false sometimes for page parser
// this is used for Msg13.cpp's ipWasBanned()
// we use hopcount now instead of isInSeedBuf(cr,r->ptr_url)
bool isInjecting = getIsInjecting();
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// r->m_maxTextDocLen = maxDownload;
// r->m_maxOtherDocLen = maxDownload;
r->m_maxTextDocLen = cr->m_maxTextDocLen;
r->m_maxOtherDocLen = cr->m_maxOtherDocLen;
// max to download in bytes. currently 1MB.
//int32_t maxDownload = (int32_t)MAXDOCLEN;
// but if url is http://127.0.0.1.... or local then
if ( m_ipValid ) {
// make into a string
char *ipStr = iptoa(m_ip);
// is it local?
bool isLocal = false;
if ( strncmp(ipStr,"192.168.",8) == 0) isLocal = true;
if ( strncmp(ipStr,"10." ,3) == 0) isLocal = true;
if ( m_ip == 16777343 ) isLocal = true; // 127.0.0.1 ?
// . if local then make web page download max size unlimited
// . this is for adding the gbdmoz.urls.txt.* files to
// populate dmoz. those files are about 25MB each.
if ( isLocal ) {
//maxDownload = -1;
r->m_maxTextDocLen = -1;
r->m_maxOtherDocLen = -1;
}
}
// m_maxCacheAge is set for getting contact or root docs in
// getContactDoc() and getRootDoc() and it only applies to
// titleRecs in titledb i guess... but still... for Msg13 it applies
// to its cache ... for robots.txt files too
r->m_maxCacheAge = m_maxCacheAge;
r->m_urlIp = *ip;
r->m_firstIp = m_firstIp;
r->m_urlHash48 = getFirstUrlHash48();
if ( r->m_maxTextDocLen < 100000 ) r->m_maxTextDocLen = 100000;
if ( r->m_maxOtherDocLen < 200000 ) r->m_maxOtherDocLen = 200000;
r->m_forwardDownloadRequest = (bool)m_forwardDownloadRequest;
r->m_useTestCache = (bool)useTestCache;
r->m_spideredTime = getSpideredTime();//m_spideredTime;
r->m_ifModifiedSince = 0;
r->m_skipHammerCheck = 0;
//if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
//else r->m_addToTestCache = false;
r->m_addToTestCache = (bool)useTestCache;
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
r->ptr_cookie = m_redirCookieBuf.getBufStart();
r->size_cookie = m_redirCookieBuf.length() + 1;
// . only do once per redirect
// . do not invalidate because we might have to carry it
// through to the next redir... unless we change domain
// . this fixes the nyt.com/nytimes.com bug some more
//m_redirCookieBufValid = false;
}
// . this is -1 if unknown. none found in robots.txt or provided
// in the custom crawl parms.
// . it should also be 0 for the robots.txt file itself
r->m_crawlDelayMS = *cd;
// let's time our crawl delay from the initiation of the download
// not from the end of the download. this will make things a little
// faster but could slam servers more.
r->m_crawlDelayFromEnd = false;
// need this in order to get all languages, etc. and avoid having
// to set words class at the spider compression proxy level
r->m_forEvents = 0;
// new stuff
r->m_contentHash32 = 0;
// if valid in SpiderRequest, use it. if spider compression proxy
// sees the content is unchanged it will not send it back! it will
// send back g_errno = EDOCUNCHANGED or something
if ( m_sreqValid )
r->m_contentHash32 = m_sreq.m_contentHash32;
// if we have the old doc already set use that
if ( od )
r->m_contentHash32 = od->m_contentHash32;
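	// illustrative flow: if the remote side computes the same 32-bit
	// content hash for the freshly downloaded page, it replies with
	// g_errno = EDOCUNCHANGED and gotHttpReply() below maps that to
	// m_recycleContent = true so the old content gets reused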
// force floater usage on even if "use spider proxies" parms is off
// if we're a diffbot crawl and use robots is off.
//if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
// r->m_forceUseFloaters = true;
// for beta testing, make it a collection specific parm for diffbot
// so we can turn on manually
if ( cr->m_forceUseFloaters )
r->m_forceUseFloaters = true;
// eventgurubot is the max
//char *userAgent = g_conf.m_spiderUserAgent;
// hardcode it
//char *userAgent = "EventGuruBot";
//int32_t uaLen = gbstrlen(userAgent);
//if ( uaLen > 12 ) {
// log("spider: user agent string too long");
// uaLen = 12;
//}
//strncpy(r->m_userAgent,userAgent,uaLen);
//r->m_userAgent[uaLen] = '\0';
// turn this off too
r->m_attemptedIframeExpansion = false;
r->m_collnum = (collnum_t)-1;
if ( m_collnumValid )r->m_collnum = m_collnum;
// turn off
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_isCustomCrawl = cr->m_isCustomCrawl;
// set it for this too
if ( g_conf.m_useCompressionProxy &&
// do not use for the test collection ever, that is qa'ing
strcmp(cr->m_coll,"qatest123") ) {
r->m_useCompressionProxy = true;
r->m_compressReply = true;
}
// are we a robots.txt file?
//bool isRobotsTxt = isRobotsTxtFile ( cu->getUrl() , cu->getUrlLen());
char *td = getTestDir();
if ( td ) strncpy ( r->m_testDir, td, 31);
//r->m_isPageParser = getIsPageParser();
//r->m_isPageInject = ( m_sreqValid && m_sreq.m_isInjecting );
// if current url IS NOT EQUAL to first url then set redir flag
if ( strcmp(cu->m_url,m_firstUrl.m_url) )
r->m_skipHammerCheck = 1;
// or if this an m_extraDoc or m_rootDoc for another url then
// do not bother printing the hammer ip msg in msg13.cpp either
if ( m_isChildDoc )
r->m_skipHammerCheck = 1;
if ( m_contentInjected ) // oldsrValid && m_sreq.m_isInjecting )
r->m_skipHammerCheck = 1;
// or if ahrefs
if ( strncmp(cu->m_url,"http://api.ahrefs.com/",22) == 0 )
r->m_skipHammerCheck = 1;
if ( r->m_skipHammerCheck )
log(LOG_DEBUG,"build: skipping hammer check");
// if we had already spidered it... try to save bandwidth and time
if ( od ) {
// sanity check
if ( ! od->m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// only get it if modified since last spider time
r->m_ifModifiedSince = od->m_spideredTime;
}
// tell msg13 he is scraping...
if ( m_sreqValid && m_sreq.m_isScraping )
r->m_isScraping = 1;
// if doing frame expansion on a doc we just downloaded as the
// spider proxy, we are asking ourselves now to download the url
// from an <iframe src=...> tag. so definitely use msg13 again
// so it can use the robots.txt cache, and regular html page cache.
if ( m_isSpiderProxy ) {
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_skipHammerCheck = 1;
//r->m_requireGoodDate = false;
// no frames within frames
r->m_attemptedIframeExpansion = 1;
log(LOG_DEBUG,"build: skipping hammer check 2");
}
// . use msg13 to download the file, robots.txt
// . msg13 will ensure only one download of that url w/ locks
// . msg13 can use the compress the http reply before
// sending it back to you via udp (compression proxy)
// . msg13 uses XmlDoc::getHttpReply() function to handle
// redirects, etc.? no...
bool isTestColl = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
//if ( isTestColl && m_contentType == CT_IMAGE )
// isTestColl = false;
// sanity check. keep injections fast. no downloading!
if ( m_wasContentInjected ) {
log("xmldoc: url injection failed! error!");
char *xx=NULL;*xx=0;
}
// sanity check
if ( m_deleteFromIndex ) {
log("xmldoc: trying to download page to delete");
char *xx=NULL;*xx=0;
}
m_downloadStartTimeValid = true;
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
// return -1 if blocked
return (char **)-1;
return gotHttpReply ( );
}
// . this returns false if blocked, true otherwise
// . sets g_errno on error
char **XmlDoc::gotHttpReply ( ) {
// save it
int32_t saved = g_errno;
// note it
setStatus ( "got web page" );
// sanity check. are we already valid?
if ( m_httpReply && m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// do not re-call
m_httpReplyValid = true;
// assume none
m_httpReply = NULL;
// . get the HTTP reply
// . TODO: free it on reset/destruction, we own it now
// . this is now NULL terminated thanks to changes in
// Msg13.cpp, but watch the buf size, need to subtract 1
// . therefore, we can set the Xml class with it
m_httpReply = m_msg13.m_replyBuf;
m_httpReplySize = m_msg13.m_replyBufSize;
// how much to free?
m_httpReplyAllocSize = m_msg13.m_replyBufAllocSize;
// sanity check
if ( m_httpReplySize > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
// what is this for? that makes it into a length not a size!
//if ( m_httpReplySize > 0 ) m_httpReplySize--;
// . save entire reply length we read from the net so
// SpiderCache
// can use it for its m_avgReplyLen for throttling
// . m_bufLen may change due to filtering
//m_replyLen = m_bufLen;
// . don't let UdpServer free m_buf when socket is
// recycled/closed
// . we own it now and are responsible for freeing it
//slot->m_readBuf = NULL;
m_msg13.m_replyBuf = NULL;
// relabel mem so we know where it came from
relabel( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// breathe
QUICKPOLL ( m_niceness );
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . sanity test -- only if not the test collection
// . i.e. what are you doing downloading the page if there was
// a problem with the page we already know about
if ( m_indexCode && m_indexCodeValid &&
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// fix this
if ( saved == EDOCUNCHANGED ) {
// assign content from it since unchanged
m_recycleContent = true;
// clear the error
saved = 0;
g_errno = 0;
}
// . save the error in download status
// . could now be EDOCUNCHANGED or EDOCNOGOODDATE (w/ tod)
m_downloadStatus = saved; // g_errno;
// validate
m_downloadStatusValid = true;
// update m_downloadEndTime if we should, used for sameIpWait
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
m_downloadEndTimeValid = true;
// make it so
g_errno = saved;
bool doIncrement = true;
if ( m_isChildDoc ) doIncrement = false;
if ( m_incrementedDownloadCount ) doIncrement = false;
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
// if it doesn't match the crawl pattern, just the process pattern
// then do not increment download successes
if ( doIncrement &&
cr->m_isCustomCrawl == 1 &&
// allow seeds to be counted
! isSeed &&
//! sreq->m_isPageReindex &&
//! sreq->m_isInjecting &&
! doesUrlMatchDiffbotCrawlPattern() )
doIncrement = false;
// . do not count bad http status in mime as failure i guess
// . do not inc this count for robots.txt and root page downloads, etc.
if ( doIncrement ) {
cr->m_localCrawlInfo.m_pageDownloadSuccesses++;
cr->m_globalCrawlInfo.m_pageDownloadSuccesses++;
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound++;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound++;
m_incrementedDownloadCount = true;
cr->m_needsSave = true;
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
}
// this means the spider compression proxy's reply got corrupted
// over roadrunner's crappy wireless internet connection
if ( saved == ECORRUPTDATA ) return NULL;
// this one happens too! for the same reason...
if ( saved == EBADREPLYSIZE ) return NULL;
// might as well check this too while we're at it
if ( saved == ENOMEM ) return NULL;
// sanity check -- check after bailing on corruption because
// corrupted replies do not end in NULLs
if ( m_httpReplySize > 0 && m_httpReply[m_httpReplySize-1] ) {
log("http: httpReplySize=%"INT32" http reply does not end in \\0 "
"for %s in collnum=%"INT32". blanking out reply."
,m_httpReplySize
,m_firstUrl.m_url
,(int32_t)m_collnum
);
// free it i guess
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// and reset it
m_httpReplySize = 0;
m_httpReply = NULL;
m_httpReplyAllocSize = 0;
// call it data corruption i guess for now
g_errno = ECORRUPTDATA;
//char *xx=NULL;*xx=0;
}
// if its a bad gzip reply, a compressed http reply, then
// make the whole thing empty? some websites return compressed replies
// even though we do not ask for them. and then the compression
// is corrupt.
if ( saved == ECORRUPTHTTPGZIP ||
// if somehow we got a page too big for MAX_DGRAMS... treat
// it like an empty page...
saved == EMSGTOOBIG ) {
// free it i guess
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// and reset it
m_httpReplySize = 0;
m_httpReply = NULL;
m_httpReplyAllocSize = 0;
}
// if errors were not local, reset g_errno and set m_indexCode
//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
//if ( g_errno == EBADMIME ) m_indexCode = EBADMIME;
// clear g_errno
//if ( m_indexCode ) g_errno = 0;
// return if cancelled, etc.
//if ( g_errno ) return NULL;
// clear this i guess
g_errno = 0;
/*
MDW: 2/8/16 this logic now below in getIsContentTruncated() function
// int16_tcut - convert size to length
int32_t LEN = m_httpReplySize - 1;
m_isContentTruncated = false;
// was the content truncated? these might label a doc is truncated
// when it really is not... but we only use this for link spam stuff,
// so it should not matter too much. it should only happen rarely.
//if ( LEN >= cr->m_maxTextDocLen-1 ) m_isContentTruncated = true;
//if ( LEN >= cr->m_maxOtherDocLen-1 ) m_isContentTruncated = true;
if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
// set this
m_isContentTruncated2 = (bool)m_isContentTruncated;
// validate it
m_isContentTruncatedValid = true;
*/
return &m_httpReply;
}
char *XmlDoc::getIsContentTruncated ( ) {
if ( m_isContentTruncatedValid ) return &m_isContentTruncated2;
setStatus ( "getting is content truncated" );
// if recycling content use its download end time
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (char *)pod;
		// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
m_isContentTruncated = od->m_isContentTruncated;
m_isContentTruncated2 = (bool)m_isContentTruncated;
m_isContentTruncatedValid = true;
return &m_isContentTruncated2;
}
}
// need a valid reply
char **replyPtr = getHttpReply ();
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char *)replyPtr;
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (char *)ct;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// shortcut - convert size to length
int32_t LEN = m_httpReplySize - 1;
m_isContentTruncated = false;
	// was the content truncated? these might label a doc as truncated
// when it really is not... but we only use this for link spam stuff,
// so it should not matter too much. it should only happen rarely.
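	// e.g. (hypothetical numbers) with cr->m_maxTextDocLen = 1048576, an
	// html reply whose length reaches 1048575 or more is flagged truncated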
if ( cr->m_maxTextDocLen >= 0 &&
LEN >= cr->m_maxTextDocLen-1 &&
*ct == CT_HTML )
m_isContentTruncated = true;
if ( cr->m_maxOtherDocLen >= 0 &&
LEN >= cr->m_maxOtherDocLen-1 &&
*ct != CT_HTML )
m_isContentTruncated = true;
//if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
// set this
m_isContentTruncated2 = (bool)m_isContentTruncated;
// validate it
m_isContentTruncatedValid = true;
return &m_isContentTruncated2;
}
int32_t *XmlDoc::getDownloadStatus ( ) {
if ( m_downloadStatusValid ) return &m_downloadStatus;
// log it
setStatus ( "getting download status");
// if recycling content, we're 200!
if ( m_recycleContent ) {
m_downloadStatus = 0;
m_downloadStatusValid = true;
return &m_downloadStatus;
}
// get ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
// . first try ip
// . this means the dns lookup timed out
if ( *ip == -1 ) {
m_downloadStatus = EDNSTIMEDOUT;
m_downloadStatusValid = true;
return &m_downloadStatus;
}
// this means ip does not exist
if ( *ip == 0 ) {
m_downloadStatus = EBADIP;
m_downloadStatusValid = true;
return &m_downloadStatus;
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (int32_t *)reply;
// must be valid now
if ( ! m_downloadStatusValid ) { char *xx=NULL;*xx=0; }
// return it
return &m_downloadStatus;
}
int64_t *XmlDoc::getDownloadEndTime ( ) {
if ( m_downloadEndTimeValid ) return &m_downloadEndTime;
// log it
setStatus ( "getting download end time");
// do not cause us to core in getHttpReply2() because m_deleteFromIndex
// is set to true...
if ( m_deleteFromIndex ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}
// if recycling content use its download end time
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (int64_t *)pod;
// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
m_downloadEndTime = od->m_downloadEndTime;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (int64_t *)reply;
// must be valid now
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0;}
// return it
return &m_downloadEndTime;
}
int16_t *XmlDoc::getHttpStatus ( ) {
// if we got a title rec then return that
if ( m_httpStatusValid ) return &m_httpStatus;
// get mime otherwise
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (int16_t *)mime;
// get from that
m_httpStatus = mime->getHttpStatus();
m_httpStatusValid = true;
return &m_httpStatus;
}
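// . returns the parsed mime of the http reply
// . injections (m_useFakeMime) and recycled content get a fake
//   200/text/html mime since there is no raw reply to parse
// . if the reply's mime is malformed we still return the fake mime
//   (content length 0) rather than setting g_errno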
HttpMime *XmlDoc::getMime () {
if ( m_mimeValid ) return &m_mime;
// log debug
setStatus("getting http mime");
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1) return (HttpMime *)cu;
// injection from SpiderLoop.cpp sets this to true
if ( m_useFakeMime ) {
usefake:
m_mime.set ( NULL , 0 , cu );
m_mime.setHttpStatus ( 200 );
m_mime.setContentType ( CT_HTML );
m_mimeValid = true;
return &m_mime;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if recycling content, fake this mime
if ( cr->m_recycleContent || m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (HttpMime *)pod;
// shortcut
XmlDoc *od = *pod;
// . this is non-NULL if it existed
// . fake it for now
if ( od ) goto usefake;
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (HttpMime *)reply;
// fake it for now
m_mime.set ( NULL , 0 , cu );
m_mime.setHttpStatus ( 200 );
m_mime.setContentType ( CT_HTML );
// shortcut
int32_t LEN = m_httpReplySize - 1;
// validate it
m_mimeValid = true;
// TODO: try again on failures because server may have been overloaded
// and closed the connection w/o sending anything
if ( LEN>0 && ! m_mime.set ( m_httpReply , LEN , cu ) ) {
// set this on mime error
//m_indexCode = EBADMIME;
// return a fake thing. content length is 0.
return &m_mime;
}
// . check the mime status, should be in the 200's for success
// . spider should redirect on 3xx codes
// . 404 means not found, etc.
// . 304 is not modified since
// . >= 300 should only happen if redirect chain was too long to follow
//int32_t httpStatus = m_mime.getHttpStatus();
// sanity check, these must be reserved! no longer, we have
// a separate m_httpStatus in the SpiderReply class now
//if ( mstrerror(httpStatus) ) { char *xx=NULL;*xx=0; }
// sanity check
//if ( m_indexCode ) { char *xx=NULL;*xx=0; }
// set it
//m_indexCode = httpStatus;
// clear if it was ok though
//if ( m_indexCode == 200 ) m_indexCode = 0;
// bail out now
return &m_mime;
}
// need to use "char **" since content might be NULL itself, if none
char **XmlDoc::getContent ( ) {
if ( m_contentValid ) return &m_content;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// recycle?
if ( cr->m_recycleContent || m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
m_content = od-> ptr_utf8Content;
m_contentLen = od->size_utf8Content - 1;
m_contentValid = true;
return &m_content;
}
if ( m_recycleContent )
log("xmldoc: failed to load old title rec "
"when recycle content was true and url = "
"%s",ptr_firstUrl);
// if could not find title rec and we are docid-based then
// we can't go any further!!
if ( m_setFromDocId ) {
log("xmldoc: null content for docid-based titlerec "
"lookup which was not found");
m_content = NULL;
m_contentLen = 0;
m_contentValid = true;
return &m_content;
}
}
if ( m_recycleContent ) {
if ( m_firstUrlValid )
log("xmldoc: failed to recycle content for %s. could "
"not load title rec",m_firstUrl.m_url);
else if ( m_docIdValid )
log("xmldoc: failed to recycle content for %"UINT64". "
"could "
"not load title rec",m_docId );
else
log("xmldoc: failed to recycle content. "
"could not load title rec" );
// let's let it pass and just download i guess, then
// we can get page stats for urls not in the index
//g_errno = EBADENGINEER;
//return NULL;
}
// if we were set from a title rec then we do not have the original
// content, and the caller should be calling getUtf8Content() anyway!!
if ( m_setFromTitleRec ) { char *xx=NULL; *xx=0; }
// query reindex has m_setFromDocId to true and we WANT to re-download
// the content... so why did i have this here? MDW 9/25/2014
//if ( m_setFromDocId ) { char *xx=NULL; *xx=0; }
// recycle?
//if ( m_recycleContent ) { char *xx=NULL; *xx=0; }
// get the mime first
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (char **)mime;
// http reply must be valid
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// make it valid
m_contentValid = true;
// assume none
m_content = NULL;
m_contentLen = 0;
// all done if no reply
if ( ! m_httpReply ) return &m_content;
// set the content, account for mime header
m_content = m_httpReply + mime->getMimeLen() ;
m_contentLen = m_httpReplySize - mime->getMimeLen() ;
// watch out for this!
if ( m_useFakeMime ) {
m_content = m_httpReply;
m_contentLen = m_httpReplySize;
}
// why is this not really the size???
m_contentLen--;
// sanity check
if ( m_contentLen < 0 ) { char *xx = NULL; *xx = 0; }
return &m_content;
}
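// . sniff the content type from a leading <!DOCTYPE ...> declaration
// . only the first 100 bytes of "p" are scanned
// . e.g. "<!DOCTYPE html PUBLIC ...>" gives CT_HTML and
//   "<!DOCTYPE xml ...>" gives CT_XML
// . returns 0 if no doctype is recognized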
char getContentTypeFromContent ( char *p , int32_t niceness ) {
char ctype = 0;
// max
char *pmax = p + 100;
// check that out
for ( ; p && *p && p < pmax ; p++ ) {
QUICKPOLL(niceness);
if ( p[0] != '<' ) continue;
if ( p[1] != '!' ) continue;
if ( to_lower_a(p[2]) != 'd' ) continue;
if ( strncasecmp(p,"<!doctype ",10) ) continue;
char *dt = p + 10;
// skip spaces
for ( ; *dt ; dt++ ) {
QUICKPOLL(niceness);
if ( ! is_wspace_a ( *dt ) ) break;
}
// point to that
if ( ! strncasecmp(dt,"html" ,4) ) ctype = CT_HTML;
if ( ! strncasecmp(dt,"xml" ,3) ) ctype = CT_XML;
if ( ! strncasecmp(dt,"text/html",9) ) ctype = CT_HTML;
if ( ! strncasecmp(dt,"text/xml" ,8) ) ctype = CT_XML;
break;
}
return ctype;
}
uint8_t *XmlDoc::getContentType ( ) {
if ( m_contentTypeValid ) return &m_contentType;
// log debug
setStatus("getting content type");
// get the mime first
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (uint8_t *)mime;
// then get mime
m_contentType = mime->getContentType();
// but if they specify <!DOCTYPE html> in the document that overrides
// the content type in the mime! fixes planet.mozilla.org
char **pp = getContent();
if ( ! pp || pp == (void *)-1 ) return (uint8_t *)pp;
char *p = *pp;
// scan content for content type. returns 0 if none found.
char ctype2 = getContentTypeFromContent ( p , m_niceness );
// valid?
if ( ctype2 != 0 ) m_contentType = ctype2;
// it is valid now
m_contentTypeValid = true;
// give it to them
return &m_contentType;
}
// . similar to getMetaRedirUrl but look for different strings
// . rel="canonical" or rel=canonical in a link tag.
Url **XmlDoc::getCanonicalRedirUrl ( ) {
// return if we got it
if ( m_canonicalRedirUrlValid ) return &m_canonicalRedirUrlPtr;
//if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// assume none in doc
m_canonicalRedirUrlPtr = NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// disable for crawlbot, not good really for deduping
if ( cr->m_isCustomCrawl ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
if ( ! cr->m_useCanonicalRedirects ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
// are we site root page? don't follow canonical url then.
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (Url **)isRoot;
if ( *isRoot ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
// if this page has an inlink, then let it stand
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
if ( info1->getNumGoodInlinks() > 0 ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
// these canonical links only supported in xml/html i think
if ( *ct != CT_HTML && *ct != CT_XML ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Url **)xml;
// scan nodes looking for a <link> node. like getBaseUrl()
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
// breathe some
QUICKPOLL(m_niceness);
// we only want <link> tags here
if ( xml->getNodeId ( i ) != TAG_LINK ) continue;
// get the href field of this base tag
int32_t linkLen;
char *link = (char *) xml->getString ( i, "href", &linkLen );
// skip if not valid
if ( ! link || linkLen == 0 ) continue;
// must also have rel=canonical
int32_t relLen;
char *rel = xml->getString(i,"rel",&relLen);
if ( ! rel ) continue;
// skip if does not match "canonical"
if ( strncasecmp(rel,"canonical",relLen) ) continue;
// allow for relative urls
Url *cu = getCurrentUrl();
// set base to it. addWWW=false
m_canonicalRedirUrl.set(cu,link,linkLen,false);//true
// assume it is not our url
bool isMe = false;
// if it is us, then skip!
if(strcmp(m_canonicalRedirUrl.getUrl(),m_firstUrl.getUrl())==0)
isMe = true;
// might also be our redir url i guess
if(strcmp(m_canonicalRedirUrl.getUrl(),m_redirUrl.getUrl())==0)
isMe = true;
// if it is us, keep it NULL, it's not a redirect. we are
// the canonical url.
if ( isMe ) break;
// ignore if in an expanded iframe (<gbframe>) tag
char *pstart = xml->m_xml;
char *p = link;
// scan backwards
if ( ! m_didExpansion ) p = pstart;
bool skip = false;
for ( ; p > pstart ; p-- ) {
QUICKPOLL(m_niceness);
if ( p[0] != '<' )
continue;
if ( p[1] == '/' &&
p[2] == 'g' &&
p[3] == 'b' &&
p[4] == 'f' &&
p[5] == 'r' &&
p[6] == 'a' &&
p[7] == 'm' &&
p[8] == 'e' &&
p[9] == '>' )
break;
if ( p[1] == 'g' &&
p[2] == 'b' &&
p[3] == 'f' &&
p[4] == 'r' &&
p[5] == 'a' &&
p[6] == 'm' &&
p[7] == 'e' &&
p[8] == '>' ) {
skip = true;
break;
}
}
if ( skip ) continue;
// otherwise, it is not us, we are NOT the canonical url
// and we should not be indexed, but just add the canonical
// url as a spiderrequest into spiderdb, just like
// simplified meta redirect does.
m_canonicalRedirUrlPtr = &m_canonicalRedirUrl;
break;
}
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
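// . "p" should point at the content attribute value of a
//   <meta http-equiv="refresh" ...> tag, e.g. "0; URL=/newpage.html"
// . delays of 10 seconds or more are not treated as redirects
// . relative urls are resolved against "cu"; if the url starts with
//   '?' only the query string of "cu" is replaced
// . urls over 1024 bytes are rejected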
// returns false if none found
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
Url *cu ) {
// limit scan
char *limit = p + 30;
// skip whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must be a num
if ( ! is_digit(*p) ) return false;
// init delay
int32_t delay = atol ( p );
// ignore long delays
if ( delay >= 10 ) return false;
// now find the semicolon, if any
for ( ; *p && p < limit && *p != ';' ; p++ );
// must have semicolon
if ( *p != ';' ) return false;
// skip it
p++;
// skip whitespace some more
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must have URL
if ( strncasecmp(p,"URL",3) ) return false;
// skip that
p += 3;
// skip white space
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// then an equal sign
if ( *p != '=' ) return false;
// skip equal sign
p++;
// then maybe more whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// an optional quote
if ( *p == '\"' ) p++;
// can also be a single quote!
if ( *p == '\'' ) p++;
// set the url start
char *url = p;
// now advance to next quote or space or >
for ( ; *p && !is_wspace_a(*p) &&
*p !='\'' &&
*p !='\"' &&
*p !='>' ;
p++);
// that is the end
char *urlEnd = p;
// get size
int32_t usize = urlEnd - url;
// skip if too big
if ( usize > 1024 ) {
log("build: meta redirurl of %"INT32" bytes too big",usize);
return false;
}
// get our current url
//Url *cu = getCurrentUrl();
// decode what we got
char decoded[MAX_URL_LEN];
// convert &amp; to "&"
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
decoded[decBytes]='\0';
// . then the url
// . set the url to the one in the redirect tag
// . but if the http-equiv meta redirect url starts with a '?'
// then just replace our cgi with that one
if ( *url == '?' ) {
char foob[MAX_URL_LEN*2];
char *pf = foob;
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
gbmemcpy(foob,cu->getUrl(),cuBytes);
pf += cuBytes;
gbmemcpy ( pf , decoded , decBytes );
pf += decBytes;
*pf = '\0';
metaRedirUrl->set(foob);
}
// . otherwise, append it right on
// . use "url" as the base Url
// . it may be the original url or the one we redirected to
// . redirUrl is set to the original at the top
else
// addWWW = false, stripSessId=true
metaRedirUrl->set(cu,decoded,decBytes,false,true);
return true;
}
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
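// . a fast string match over the raw reply finds a candidate first,
//   then the doc is parsed with Xml so that refresh tags buried inside
//   document.write('<meta ...') scripts are not treated as redirects
// . meta redirects are ignored when recycling or injecting content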
Url **XmlDoc::getMetaRedirUrl ( ) {
if ( m_metaRedirUrlValid ) return &m_metaRedirUrlPtr;
// get ptr to utf8 content
//char **u8 = getHttpReply();
//if ( ! u8 || u8 == (void *)-1 ) return (Url **)u8;
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
char *p = m_httpReply;
// subtract one since this is a size not a length
char *pend = p + m_httpReplySize - 1;//size_utf8Content;
// assume no meta refresh url
m_metaRedirUrlPtr = NULL;
// make it valid regardless i guess
m_metaRedirUrlValid = true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are recycling or injecting, do not consider meta redirects
if ( cr->m_recycleContent || m_recycleContent )
return &m_metaRedirUrlPtr;
// will this work in here?
//uint8_t *ct = getContentType();
//if ( ! ct ) return NULL;
Url *cu = getCurrentUrl();
bool gotOne = false;
// advance a bit, we are initially looking for the '=' sign
p += 10;
// begin the string matching loop
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// fix <!--[if lte IE 6]>
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
if ( *p == '!' &&
p[-1]=='<' &&
p[1] == '-' &&
p[2] == '-' ) {
// find end of comment
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if ( p[0] == '-' &&
p[1] == '-' &&
p[2] == '>' )
break;
}
// if found no end of comment, then stop
if ( p >= pend )
break;
// resume looking for meta redirect tags
continue;
}
// base everything off the equal sign
if ( *p != '=' ) continue;
// did we match "http-equiv="?
if ( to_lower_a(p[-1]) != 'v' ) continue;
if ( to_lower_a(p[-2]) != 'i' ) continue;
if ( to_lower_a(p[-3]) != 'u' ) continue;
if ( to_lower_a(p[-4]) != 'q' ) continue;
if ( to_lower_a(p[-5]) != 'e' ) continue;
if ( p[-6] != '-' ) continue;
if ( to_lower_a(p[-7]) != 'p' ) continue;
if ( to_lower_a(p[-8]) != 't' ) continue;
if ( to_lower_a(p[-9]) != 't' ) continue;
if ( to_lower_a(p[-10])!= 'h' ) continue;
// skip the equal sign
p++;
// skip quote if there
if ( *p == '\"' ) p++;
// must be "refresh", continue if not
if ( strncasecmp(p,"refresh",7) ) continue;
// skip that
p += 7;
// skip another quote if there
if ( *p == '\"' ) p++;
// limit the # of white spaces
char *limit = p + 20;
// skip white spaces
while ( *p && p < limit && is_wspace_a(*p) ) p++;
// must be content now
if ( strncasecmp(p,"content=",8) ) continue;
// skip that
p += 8;
// skip possible quote
if ( *p == '\"' ) p++;
// PARSE OUT THE URL
Url dummy;
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
continue;
gotOne = true;
break;
}
if ( ! gotOne )
return &m_metaRedirUrlPtr;
// to fix issue with scripts containing
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
// we have to get the Xml. we can't call getXml() because of
// recursion bugs so just do it directly here
Xml xml;
if ( ! xml.set ( m_httpReply ,
m_httpReplySize - 1, // make it a length
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
// assume html since getContentType() is recursive
// on us.
CT_HTML ) ) // *ct ) )
// return NULL on error with g_errno set
return NULL;
XmlNode *nodes = xml.getNodes();
int32_t n = xml.getNumNodes();
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != 68 ) continue;
// only get content for <meta http-equiv=..>
int32_t tagLen;
char *tag ;
tag = xml.getString ( i , "http-equiv" , &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// if not a refresh, skip it
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
// get the content
tag = xml.getString ( i ,"content", &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// PARSE OUT THE URL
if (!setMetaRedirUrlFromTag(p,&m_metaRedirUrl,m_niceness,cu) )
continue;
// set it
m_metaRedirUrlPtr = &m_metaRedirUrl;
// return it
return &m_metaRedirUrlPtr;
}
// nothing found
return &m_metaRedirUrlPtr;
}
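// . determine the charset of the document in "s"
// . precedence: the charset in the http mime, then a unicode BOM if
//   the mime gave nothing, then (if still unknown) a charset= or
//   encoding= attribute found in a <meta>, <xml> or <?xml> tag
// . a claimed utf-8 charset is verified byte by byte; bad sequences
//   knock it back to unknown (or to ISO-Latin-1 on the final pass)
// . a few GB/EUC-KR charsets are aliased so iconv understands them and
//   we fall back to utf-8 if nothing was detected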
uint16_t getCharsetFast ( HttpMime *mime,
char *url,
char *s ,
int32_t slen ,
int32_t niceness ){
int16_t charset = csUnknown;
if ( slen < 0 ) slen = 0;
char *pstart = s;
char *pend = s + slen;
char *cs = mime->getCharset();
int32_t cslen = mime->getCharsetLen();
if ( cslen > 31 ) cslen = 31;
if ( cs && cslen > 0 ) {
char *p2 = cs + cslen ; char c = *p2; *p2 = '\0';
// get it
charset = get_iana_charset ( cs , gbstrlen(cs) );
// restore
*p2 = c;
}
// look for Unicode BOM first though
cs = ucDetectBOM ( pstart , pend - pstart );
if ( cs && charset == csUnknown ) {
log(LOG_DEBUG, "build: Unicode BOM signature detected: %s",cs);
int32_t len = gbstrlen(cs); if ( len > 31 ) len = 31;
charset = get_iana_charset ( cs , len );
}
// prepare to scan doc
char *p = pstart;
// if the doc claims it is utf-8 let's double check because
// newmexicomusic.org says its utf-8 in the mime header and it says
// it is another charset in a meta content tag, and it is NOT in
// utf-8, so don't trust that!
if ( charset == csUTF8 ) {
// loop over every char
for ( char *s = pstart ; s < pend ; s += getUtf8CharSize(s) ) {
// breathe
QUICKPOLL(niceness);
// sanity check
if ( ! isFirstUtf8Char ( s ) ) {
// note it
log(LOG_DEBUG,
"build: mime says UTF8 but does not "
"seem to be for url %s",url);
// reset it back to unknown then
charset = csUnknown;
break;
}
}
}
// do not scan the doc if we already got it set
if ( charset != csUnknown ) p = pend;
//
// it is inefficient to set xml just to get the charset.
// so let's put in some quick string matching for this!
//
// . how big is one char? usually this is 1 unless we are in utf16...
// . if we are in utf16 natively then this code needs to know that and
// set oneChar to 2! TODO!!
//char oneChar = 1;
// advance a bit, we are initially looking for the = sign
if ( p ) p += 10;
// begin the string matching loop
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( niceness );
// base everything off the equal sign
if ( *p != '=' ) continue;
// must have a 't' or 'g' before the equal sign
char c = to_lower_a(p[-1]);
// did we match "charset="?
if ( c == 't' ) {
if ( to_lower_a(p[-2]) != 'e' ) continue;
if ( to_lower_a(p[-3]) != 's' ) continue;
if ( to_lower_a(p[-4]) != 'r' ) continue;
if ( to_lower_a(p[-5]) != 'a' ) continue;
if ( to_lower_a(p[-6]) != 'h' ) continue;
if ( to_lower_a(p[-7]) != 'c' ) continue;
}
// did we match "encoding="?
else if ( c == 'g' ) {
if ( to_lower_a(p[-2]) != 'n' ) continue;
if ( to_lower_a(p[-3]) != 'i' ) continue;
if ( to_lower_a(p[-4]) != 'd' ) continue;
if ( to_lower_a(p[-5]) != 'o' ) continue;
if ( to_lower_a(p[-6]) != 'c' ) continue;
if ( to_lower_a(p[-7]) != 'n' ) continue;
if ( to_lower_a(p[-8]) != 'e' ) continue;
}
// if not either, go to next char
else
continue;
// . make sure a <xml or a <meta precedes us
// . do not look back more than 500 chars
char *limit = p - 500;
// assume charset= or encoding= did NOT occur in a tag
bool inTag = false;
// check crazy wrap if m_content was close to a NULL ptr...
if ( limit >= pend ) limit = pstart;
if ( limit < pstart ) limit = pstart;
for ( char *s = p ; s >= limit ; s -= 1 ) { // oneChar ) {
// break at > or <
if ( *s == '>' ) break;
if ( *s != '<' ) continue;
// . TODO: this could be in a quoted string too! fix!!
// . is it in a <meta> tag?
if ( to_lower_a(s[1]) == 'm' &&
to_lower_a(s[2]) == 'e' &&
to_lower_a(s[3]) == 't' &&
to_lower_a(s[4]) == 'a' ) {
inTag = true;
break;
}
// is it in an <xml> tag?
if ( to_lower_a(s[1]) == 'x' &&
to_lower_a(s[2]) == 'm' &&
to_lower_a(s[3]) == 'l' ) {
inTag = true;
break;
}
// is it in an <?xml> tag?
if ( to_lower_a(s[1]) == '?' &&
to_lower_a(s[2]) == 'x' &&
to_lower_a(s[3]) == 'm' &&
to_lower_a(s[4]) == 'l' ) {
inTag = true;
break;
}
}
// if not in a tag proper, it is useless
if ( ! inTag ) continue;
// skip over equal sign
p += 1;//oneChar;
// skip over ' or "
if ( *p == '\'' ) p += 1;//oneChar;
if ( *p == '\"' ) p += 1;//oneChar;
// keep start ptr
char *csString = p;
// set a limit
limit = p + 50;
if ( limit > pend ) limit = pend;
if ( limit < p ) limit = pend;
// stop at first special character
while ( p < limit &&
*p &&
*p !='\"' &&
*p !='\'' &&
! is_wspace_a(*p) &&
*p !='>' &&
*p != '<' &&
*p !='?' &&
*p !='/' &&
// fix yaya.pro-street.us which has
// charset=windows-1251;charset=windows-1"
*p !=';' &&
*p !='\\' )
p += 1;//oneChar;
// save it
char d = *p;
// do the actual NULL termination
*p = 0;
// get the character set
int16_t metaCs = get_iana_charset(csString, gbstrlen(csString));
// put it back
*p = d;
// update "charset" to "metaCs" if known, it overrides all
if (metaCs != csUnknown ) charset = metaCs;
// all done, only if we got a known char set though!
if ( charset != csUnknown ) break;
}
// alias these charsets so iconv understands
if ( charset == csISO58GB231280 ||
charset == csHZGB2312 ||
charset == csGB2312 )
charset = csGB18030;
if ( charset == csEUCKR )
charset = csKSC56011987; //x-windows-949
// use utf8 if still unknown
if ( charset == csUnknown ) {
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"doc: forcing utf8 charset");
charset = csUTF8;
}
// once again, if the doc is claiming utf8 let's double check it!
if ( charset == csUTF8 ) {
// use this for iterating
char size;
// loop over every char
for ( char *s = pstart ; s < pend ; s += size ) {
// breathe
QUICKPOLL(niceness);
// set
size = getUtf8CharSize(s);
// sanity check
if ( ! isFirstUtf8Char ( s ) ) {
// but let 0x80 slide? it is for the
// 0x80 0x99 apostrophe i've seen for
// eventvibe.com. it did have a first byte,
// 0xe2 that led that sequence but it was
// converted into &acirc; by something that
// thought it was a latin1 byte.
if ( s[0] == (char)0x80 &&
s[1] == (char)0x99 ) {
s += 2;
size = 0;
continue;
}
// note it
log(LOG_DEBUG,
"build: says UTF8 (2) but does not "
"seem to be for url %s"
" Resetting to ISOLatin1.",url);
// reset it to ISO then! that's pretty common
// no! was causing problems for
// eventvibe.com/...Yacht because it had
// some messed up utf8 in it but it really
// was utf8. CRAP, but really messes up
// sunsetpromotions.com and washingtonia
// if we do not have this here
charset = csISOLatin1;
break;
}
}
}
// breathe
QUICKPOLL ( niceness );
//char *csName = get_charset_str(charset);
// breathe
//QUICKPOLL ( m_niceness );
// if we are not supported, set m_indexCode
//if ( csName && ! supportedCharset(charset) ) {
// log("build: xml: Unsupported charset: %s", csName);
// g_errno = EBADCHARSET;
// return NULL;
// //charset = csUnknown;
// // i guess do not risk it
// //m_indexCode = EBADCHARSET;
//}
// all done
return charset;
}
uint16_t *XmlDoc::getCharset ( ) {
if ( m_charsetValid ) return &m_charset;
// . get ptr to filtered content
// . we can't get utf8 content yet until we know what charset this
// junk is so we can convert it!
char **fc = getFilteredContent();
if ( ! fc || fc == (void *)-1 ) return (uint16_t *)fc;
// scan document for two things:
// 1. charset= (in a <meta> tag)
// 2. encoding= (in an <?xml> tag)
char *pstart = *fc;
//char *pend = *fc + m_filteredContentLen;
// assume known charset
m_charset = csUnknown;
// make it valid regardless i guess
m_charsetValid = true;
// check in http mime for charset
HttpMime *mime = getMime();
m_charset = getCharsetFast ( mime ,
m_firstUrl.getUrl(),
pstart ,
m_filteredContentLen,
m_niceness );
m_charsetValid = true;
return &m_charset;
}
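// . heuristic check for binary junk that slipped through as text
// . counts single-byte chars that is_binary_a() flags and how many
//   DIFFERENT such byte values occur
// . if we see more than 10 binary bytes spread over 5+ distinct values,
//   or more than 6% of the doc is binary, we zero out ptr_utf8Content
//   and set m_isBinary so we do not index the garbage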
char *XmlDoc::getIsBinary ( ) {
if ( m_isBinaryValid ) return &m_isBinary;
// get the content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (char *)u8;
//char *ctype = getContentType();
//if ( ! ctype || ctype == (void *)-1 ) return (char *)ctype;
//bool doBinaryCheck = false;
// the "abq-g" query gives a lot of binary content, use that
// as a testbed to make sure we filter it out!
//if ( *ctype == CT_TEXT ) doBinaryCheck = true;
//if ( *ctype == CT_UNKNOWN ) doBinaryCheck = true;
//if ( *ctype == CT_XML ) doBinaryCheck = true;
//if ( *ctype == CT_HTML ) doBinaryCheck = true;
//if ( csEnum == csUnknown ) doBinaryCheck = true;
//if ( csEnum == csASCII ) doBinaryCheck = true;
//if ( csEnum == csISOLatin1 ) doBinaryCheck = true;
//if ( slen <= 0 ) doBinaryCheck = false;
// why shouldn't we binary check everything? now that we are utf8...
//doBinaryCheck = true;
// assume not
m_isBinary = false;
m_isBinaryValid = true;
// if content is not identified as a type known to us, then check it
// for binary characters. yes, this can be utf8 or utf16 and then
// detected as binary i think, but it should really be identified as
// being html or txt or something...
//if ( ! doBinaryCheck ) return &m_isBinary;
// use a table
char table[256];
memset ( table , 0 , 256 );
// see if we had deceitful binary content
char *s = ptr_utf8Content;
char *send = s + size_utf8Content - 1;
// for now just count the binary chars
int32_t count = 0;
// no content?
if ( ! s ) return &m_isBinary;
for ( ; s < send ; s += getUtf8CharSize(s) ) {
// yield
QUICKPOLL(m_niceness);
// skip valid utf8 characters
if ( getUtf8CharSize(s) > 1 ) continue;
// . do not count \0's
// . the fctypes.cpp isBinary array takes into account
// that people mix windows 1254 characters into
// latin-1. windows 1254 is a superset of latin-1.
// so the more common quotes and dashes are no longer
// counted as binary characters, but some of the
// rarer ones are! however, the "diff" count
// constraint helps us make up for that.
// . the first char of a utf8 character sequence always has
// the high bit off, so just test that...
if ( ! is_binary_a(*s) || ! *s ) continue;
// count it up
count++;
table[(unsigned char)*s]++;
}
// how many DIFFERENT bin chars do we have?
int32_t diff = 0;
for ( int32_t i = 0 ; i < 256 ; i++ )
if ( table[i] ) diff++;
// . is binary if more than 10 bin chars and at least 5
// DIFFERENT binary chars
// . is binary if more than 6% of chars are binary
if ( (count > 10 && diff>=5) || ( 100 * count ) / size_utf8Content>6) {
// note it for now
logf(LOG_DEBUG,"build: Got binary content for %s. "
"Zeroing out content. (diff=%"INT32" count=%"INT32" "
"len=%"INT32")",
m_firstUrl.getUrl(),diff,count,size_utf8Content-1);
// do not try to index binary content, but keep it
// around for site: queries or in case we have
// inlink text for it!
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_isBinary = true;
}
return &m_isBinary;
}
// declare these two routines for using threads
static void filterDoneWrapper ( void *state , ThreadEntry *te ) ;
static void *filterStartWrapper_r ( void *state , ThreadEntry *te ) ;
// filters m_content if its pdf, word doc, etc.
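// . html, text, xml, js, json, arc and warc pass through untouched;
//   other non-filterable types (images, css, ...) come back as NULL
// . pdf/doc/xls/ppt/ps are converted by spawning a FILTER_THREAD that
//   runs filterStart_r(), which shells out to pdftohtml, antiword,
//   xlhtml, ppthtml or pstotext
// . the output buffer is capped at maxOtherDocLen or at 5x the raw
//   content length plus 10k, whichever is smaller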
char **XmlDoc::getFilteredContent ( ) {
// return it if we got it already
if ( m_filteredContentValid ) return &m_filteredContent;
// this must be valid
char **content = getContent();
if ( ! content || content == (void *)-1 ) return content;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
// it needs this
HttpMime *mime = getMime();
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
// make sure NULL terminated always
// Why? pdfs can have nulls embedded
// if ( m_content &&
// m_contentValid &&
// m_content[m_contentLen] ) {
// char *xx=NULL;*xx=0; }
int32_t max , max2;
CollectionRec *cr;
bool filterable = false;
if ( m_calledThread ) goto skip;
// assume we do not need filtering by default
m_filteredContent = m_content;
m_filteredContentLen = m_contentLen;
m_filteredContentValid = true;
m_filteredContentAllocSize = 0;
// empty content?
if ( ! m_content ) return &m_filteredContent;
if ( *ct == CT_HTML ) return &m_filteredContent;
if ( *ct == CT_TEXT ) return &m_filteredContent;
if ( *ct == CT_XML ) return &m_filteredContent;
// javascript - sometimes has address information in it, so keep it!
if ( *ct == CT_JS ) return &m_filteredContent;
if ( m_contentLen == 0 ) return &m_filteredContent;
// we now support JSON for diffbot
if ( *ct == CT_JSON ) return &m_filteredContent;
if ( *ct == CT_ARC ) return &m_filteredContent;
if ( *ct == CT_WARC ) return &m_filteredContent;
// unknown content types are 0 since it is probably binary... and
// we do not want to parse it!!
if ( *ct == CT_PDF ) filterable = true;
if ( *ct == CT_DOC ) filterable = true;
if ( *ct == CT_XLS ) filterable = true;
if ( *ct == CT_PPT ) filterable = true;
if ( *ct == CT_PS ) filterable = true;
// if its a jpeg, gif, text/css etc. bail now
if ( ! filterable ) {
m_filteredContent = NULL;
m_filteredContentLen = 0;
m_filteredContentValid = true;
return &m_filteredContent;
}
// invalidate
m_filteredContentValid = false;
cr = getCollRec();
if ( ! cr ) return NULL;
// . if we have no filter specified...
// . usually "gbfilter" and it is a script in the working directory
//if ( ! cr->m_filter[0] ) {
// m_indexCode = EDOCBADCONTENTTYPE;
// return &m_filteredContent;
//}
// if not text/html or text/plain, use the other max
//max = MAXDOCLEN; // cr->m_maxOtherDocLen;
max = cr->m_maxOtherDocLen;
// now we base this on the pre-filtered length to save memory because
// our maxOtherDocLen can be 30M and when we have a lot of injections
// at the same time we lose all our memory quickly
max2 = 5 * m_contentLen + 10*1024;
if ( max > max2 ) max = max2;
// user uses -1 to specify no maxTextDocLen or maxOtherDocLen
if ( max < 0 ) max = max2;
// make a buf to hold filtered reply
m_filteredContentAllocSize = max;
m_filteredContent = (char *)mmalloc(m_filteredContentAllocSize,"xdfc");
if ( ! m_filteredContent ) {
log("build: Could not allocate %"INT32" bytes for call to "
"content filter.",m_filteredContentMaxSize);
return NULL;
}
// breathe
QUICKPOLL ( m_niceness );
// reset this here in case thread gets killed by the kill() call below
m_filteredContentLen = 0;
// update status msg so its visible in the spider gui
setStatus ( "filtering content" );
// reset this... why?
g_errno = 0;
// . call thread to call popen
// . callThread returns true on success, in which case we block
// . do not repeat
m_calledThread = true;
// reset this since filterStart_r() will set it on error
m_errno = 0;
// how can this be? don't core like this in thread, because it
// does not save our files!!
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0; }
// do it
if ( g_threads.call ( FILTER_THREAD ,
MAX_NICENESS ,
this ,
filterDoneWrapper ,
filterStartWrapper_r ) )
// return -1 if blocked
return (char **)-1;
// clear error!
g_errno = 0;
// note it
log("build: Could not spawn thread for call to "
"content filter.");
// get the data
filterStart_r ( false ); // am thread?
// skip down here if thread has returned and we got re-called
skip:
// if size is 0, free the buf
if ( m_filteredContentLen <= 0 ) {
mfree ( m_filteredContent ,
m_filteredContentAllocSize,"fcas");
m_filteredContent = NULL;
m_filteredContentLen = 0;
m_filteredContentAllocSize = 0;
}
// did we have an error from the thread?
if ( m_errno ) g_errno = m_errno;
// but bail out if it set g_errno
if ( g_errno ) return NULL;
// must be valid now - sanity check
if ( ! m_filteredContentValid ) { char *xx=NULL;*xx=0; }
// return it
return &m_filteredContent;
}
// come back here
void filterDoneWrapper ( void *state , ThreadEntry *te ) {
// jump back into the brawl
XmlDoc *THIS = (XmlDoc *)state;
// if size is 0, free the buf. have to do this outside the thread
// since malloc/free cannot be called in thread
if ( THIS->m_filteredContentLen <= 0 ) {
mfree ( THIS->m_filteredContent ,
THIS->m_filteredContentAllocSize,"fcas");
THIS->m_filteredContent = NULL;
THIS->m_filteredContentLen = 0;
THIS->m_filteredContentAllocSize = 0;
}
// . call the master callback
// . it will ultimately re-call getFilteredContent()
THIS->m_masterLoop ( THIS->m_masterState );
}
// thread starts here
void *filterStartWrapper_r ( void *state , ThreadEntry *te ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->filterStart_r ( true ); // am thread?
return NULL;
}
//int my_system_r ( char *cmd , int32_t timeout ) ;
// sets m_errno on error
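// . runs inside the filter thread (or inline if the thread could not
//   be spawned)
// . writes m_content to <workdir>in.<tid>, runs the external converter
//   with ulimit/timeout guards via gbsystem(), then reads
//   <workdir>out.<tid> back into m_filteredContent
// . e.g. for a pdf the command looks like:
//   ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19
//   <workdir>/pdftohtml -q -i -noframes -stdout <in> > <out>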
void XmlDoc::filterStart_r ( bool amThread ) {
// get thread id
pthread_t id = getpidtid();
// sanity check
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
// shortcut
int32_t ctype = m_contentType;
// assume none
m_filteredContentLen = 0;
//if ( amThread ) id = pthread_self();
//else id = getpid();
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
char in[1024];
snprintf(in,1023,"%sin.%"INT64"", g_hostdb.m_dir , (int64_t)id );
unlink ( in );
// collect the output from the filter from this file
char out[1024];
snprintf ( out , 1023,"%sout.%"INT64"", g_hostdb.m_dir, (int64_t)id );
unlink ( out );
// ignore errno from those unlinks
errno = 0;
// open the input file
retry11:
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry11;
m_errno = errno;
log("build: Could not open file %s for writing: %s.",
in,mstrerror(m_errno));
return;
}
// we are in a thread, this must be valid!
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0;}
retry12:
// write the content into the input file
int32_t w = write ( fd , m_content , m_contentLen );
// valgrind
if ( w < 0 && errno == EINTR ) goto retry12;
// did we get an error
if ( w != m_contentLen ) {
//int32_t w = fwrite ( m_buf , 1 , m_bufLen , pd );
//if ( w != m_bufLen ) {
m_errno = errno;
log("build: Error writing to %s: %s.",in,
mstrerror(m_errno));
close(fd);
return;
}
// close the file
close ( fd );
// shortcut
char *wdir = g_hostdb.m_dir;
// . open a pipe to pdf2html program
// . the output will go to stdout
char cmd[2048];
// different commands to filter different ctypes
// -i : ignore images
// -stdout: send output to stdout
// -c : generate complex document
// Google generates complex docs, but the large ones are horribly slow
// in the browser, but docs with 2 cols don't display right w/o -c.
// damn, -stdout doesn't work when -c is specified.
// These ulimit sizes are max virtual memory in kilobytes. let's
// keep them to 25 Megabytes
if ( ctype == CT_PDF )
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
else if ( ctype == CT_DOC )
// "wdir" include trailing '/'? not sure
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; timeout 30s nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
else if ( ctype == CT_XLS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
// this is too buggy for now... causes hanging threads because it
// hangs, so i added 'timeout 10s' but that only works on newer
// linux version, so it'll just error out otherwise.
else if ( ctype == CT_PPT )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/ppthtml %s > %s" , wdir , in , out );
else if ( ctype == CT_PS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30; timeout 10s nice -n 19 %s/pstotext %s > %s" , wdir , in , out );
else { char *xx=NULL;*xx=0; }
// breach sanity check
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
// execute it
int retVal = gbsystem ( cmd );
if ( retVal == -1 )
log("gb: system(%s) : %s",
cmd,mstrerror(g_errno));
// all done with input file
// clean up the binary input file from disk
if ( unlink ( in ) != 0 ) {
// log error
log("gbfilter: unlink (%s): %s\n",in, strerror(errno));
// ignore it, since it was not a processing error per se
errno = 0;
}
// don't use too much memory, i think xhtml uses so much that it
// swaps out all the gb processes?
//struct rlimit lim;
//lim.rlim_cur = lim.rlim_max = 24 * 1024 * 1024 ;
//if ( setrlimit ( RLIMIT_AS , &lim ) )
// fprintf (stderr,"gbfilter:setrlimit: %s", strerror(errno) );
retry13:
fd = open ( out , O_RDONLY );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry13;
m_errno = errno;
log("gbfilter: Could not open file %s for reading: %s.",
out,mstrerror(m_errno));
return;
}
// sanity -- need room to store a \0
if ( m_filteredContentAllocSize < 2 ) { char *xx=NULL;*xx=0; }
// to read - leave room for \0
int32_t toRead = m_filteredContentAllocSize - 1;
retry14:
// read right from pipe descriptor
int32_t r = read (fd, m_filteredContent,toRead);
// note errors
if ( r < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry14;
log("gbfilter: reading output: %s",mstrerror(errno));
// this is often bad fd from an oom error, so ignore it
//m_errno = errno;
errno = 0;
r = 0;
}
// clean up shop
close ( fd );
// delete output file
unlink ( out );
// validate now
m_filteredContentValid = 1;
// save the new buf len
m_filteredContentLen = r;
// ensure enough room for null term
if ( r >= m_filteredContentAllocSize ) { char *xx=NULL;*xx=0; }
// ensure filtered stuff is NULL terminated so we can set the Xml class
m_filteredContent [ m_filteredContentLen ] = '\0';
// it is good
m_filteredContentValid = true;
// . at this point we got the filtered content
// . bitch if we didn't allocate enough space
if ( r > 0 && r == toRead )
log(LOG_LOGIC,"build: Had to truncate document to %"INT32" bytes "
"because did not allocate enough space for filter. "
"This should never happen. It is a hack that should be "
"fixed right.", toRead );
// if we got something, then we're done
//if ( r > 0 ) return;
// otherwise, free it up
// . NO! not in a thread!!
//mfree ( m_filteredContent , m_filteredContentAllocSize, "fcas" );
//m_filteredContent = NULL;
//m_filteredContentLen = 0;
//m_filteredContentAllocSize = 0;
}
pid_t g_pid = -1;
int32_t g_ticker = 0;
int32_t g_filterTimeout = -1;
/*
static int startUp ( void *cmd ) ;
#include <sys/types.h> // waitpid()
#include <sys/wait.h> // waitpid()
#include <sched.h> // clone()
static char cloneStack[250000];
int my_system_r ( char *cmd , int32_t timeout ) {
// bail if cmd empty
if ( ! cmd ) {
log(LOG_LOGIC,"build: empty command.");
return -1;
}
errno = 0;
// this gives out of memory on newer kernels, was that causing our
// older kernerls to crash, too, in addition to the e1000 driver?
//pid_t pid = fork();
// let's use clone() instead now
// error forking?
pid_t pid = clone ( startUp ,
cloneStack + 250000 ,
CLONE_FS | CLONE_FILES | CLONE_VM | SIGCHLD ,
cmd );
if (pid == -1) {
log("build: fork: %s.",mstrerror(errno));
return -1;
}
// sanity check
if ( g_pid != -1 ) { char *xx = NULL; *xx = 0; }
// set the process group id of this guy to itself, so he becomes
// the process leader, so any processes he spawns should all receive
// the same HUP or kill signals he receives. uhhhh probably not...
//setpgid ( pid , pid );
// save the pid globally so Threads.cpp can kill(9,g_pid) it if it
// stalls too long. but to measure how long it is out for, keep a
// ticker count. this ticker count is incremented in the sleep wrapper
// in Threads.cpp.
g_ticker = 0;
g_pid = pid;
g_filterTimeout = timeout;
loop:
int status;
if ( waitpid ( pid , &status , 0 ) == -1 ) {
// reset g_pid so Threads.cpp's kill wrapper chills out
if ( errno != EINTR ) {
log("build: waitpid pid=%"INT32": %s.",
(int32_t)g_pid,mstrerror(errno));
g_pid = -1;
return -1;
}
// if we got interrupted by a different signal keep waiting
goto loop;
}
// reset g_pid so Threads.cpp's kill wrapper chills out
g_pid = -1;
if ( status < 0 ) log("build: Got bad status from child.");
// we got the signal
return status;
}
int startUp ( void *cmd ) {
char *argv[4];
argv[0] = "sh";
argv[1] = "-c";
argv[2] = (char *)cmd;
argv[3] = 0;
char *envp[2];
char buf[1024];
// antiword needs this environment var so it can find
// the .antiword/ dir , we should put it in gb's working dir
snprintf(buf,1023,"HOME=%s", g_hostdb.m_dir );
envp[0] = buf;
envp[1] = 0;
execve("/bin/sh", argv, envp );
//exit(127);
return 1;
}
*/
// return downloaded content as utf8
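// . converts m_filteredContent from the charset getCharset() detected
//   into utf-8 using ucToUtf8(), budgeting roughly 2x the input plus 4k
// . unsupported charsets and iconv "bad charset" errors just yield an
//   empty doc instead of an indexing error
// . embedded \0 bytes are turned into spaces and the utf-8 is walked
//   once so a truncated multi-byte char cannot make later parsers
//   overrun the buffer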
char **XmlDoc::getRawUtf8Content ( ) {
// if we already computed it, return that
if ( m_rawUtf8ContentValid ) return &m_rawUtf8Content;
// . get our characterset
// . crap! this can be recursive. it calls getXml() which calls
// getUtf8Content() which is us!
uint16_t *charset = getCharset ( );
if ( ! charset || charset == (uint16_t *)-1 ) return (char **)charset;
char *csName = get_charset_str(*charset);
// . if not supported fix that!
// . m_indexCode should be set to EBADCHARSET ultimately, but not here
if ( ! supportedCharset(*charset) && csName ) {
m_rawUtf8Content = NULL;
m_rawUtf8ContentSize = 0;
m_rawUtf8ContentAllocSize = 0;
m_rawUtf8ContentValid = true;
return &m_rawUtf8Content;
}
// get ptr to filtered content
char **fc = getFilteredContent();
if ( ! fc || fc == (void *)-1 ) return (char **)fc;
// make sure NULL terminated always
if ( m_filteredContent &&
m_filteredContentValid &&
m_filteredContent[m_filteredContentLen] ) {
char *xx=NULL;*xx=0; }
// NULL out if no content
if ( ! m_filteredContent ) {
m_rawUtf8Content = NULL;
m_rawUtf8ContentSize = 0;
m_rawUtf8ContentAllocSize = 0;
m_rawUtf8ContentValid = true;
return &m_rawUtf8Content;
}
// assume already utf8
m_rawUtf8Content = m_filteredContent;
m_rawUtf8ContentSize = m_filteredContentLen + 1;
m_rawUtf8ContentAllocSize = 0;
// if we are not ascii or utf8 already, encode it into utf8
if ( m_rawUtf8ContentSize > 1 &&
csName &&
*charset != csASCII &&
*charset != csUTF8 ) {
// ok, no-go
//ptr_utf8Content = NULL;
m_rawUtf8Content = NULL;
// assume utf8 will be twice the size ... then add a little
int32_t need = (m_filteredContentLen * 2) + 4096;
char *buf = (char *) mmalloc(need, "Xml3");
// log oom error
if ( ! buf ) {
log("build: xml: not enough memory for utf8 buffer");
return NULL;
}
// sanity check
if ( ! csName ) { char *xx=NULL;*xx=0; }
// note it
setStatus ( "converting doc to utf8" );
// returns # of bytes i guess
int32_t used = ucToUtf8 ( buf ,
// fix core dump by subtracting 10!
need - 10,
m_filteredContent ,
m_filteredContentLen ,
csName ,
-1 ,//allowBadChars
m_niceness );
// clear this if successful, otherwise, it sets errno
if ( used > 0 ) g_errno = 0;
// unrecoverable error? bad charset is g_errno == 7
// which is like argument list too long or something
// error from Unicode.cpp's call to iconv()
if ( g_errno )
log(LOG_INFO, "build: xml: failed parsing buffer: %s "
"(cs=%d)", mstrerror(g_errno), *charset);
if ( g_errno && g_errno != 7 ) {
mfree ( buf, need, "Xml3");
// do not index this doc, delete from spiderdb/tfndb
//if ( g_errno != ENOMEM ) m_indexCode = g_errno;
// if conversion failed NOT because of bad charset
// then return NULL now and bail out. probably ENOMEM
return NULL;
}
// if bad charset... just make doc empty as a utf8 doc
if ( g_errno == 7 ) {
used = 0;
buf[0] = '\0';
buf[1] = '\0';
// clear g_errno
g_errno = 0;
// and make a note for getIndexCode() so it will not
// bother indexing the doc! nah, just index it
// but with no content...
}
// crazy? this is pretty important...
if ( used + 10 >= need )
log("build: utf8 using too much buf space!!! u=%s",
getFirstUrl()->getUrl());
// re-assign
//ptr_utf8Content = buf;
//size_utf8Content = used + 1;
//m_utf8ContentAllocSize = need;
m_rawUtf8Content = buf;
m_rawUtf8ContentSize = used + 1;
m_rawUtf8ContentAllocSize = need;
}
// convert \0's to spaces. why do we see these in some pages?
// http://www.golflink.com/golf-courses/ has one in the middle after
// about 32k of content.
char *p = m_rawUtf8Content;
char *pend = p + m_rawUtf8ContentSize - 1;
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if ( ! *p ) *p = ' ';
}
//
// VALIDATE the UTF-8
//
// . make a buffer to hold the decoded content now
// . we were just using the m_expandedUtf8Content buf itself, but "n"
// ended up equalling m_expandedUtf8ContentSize one time for a
// doc, http://ediso.net/, which probably had corrupt utf8 in it,
// and that breached our buffer! so verify that this is good
// utf8, and that we can parse it without breaching our buffer!
p = m_rawUtf8Content;
// make sure NULL terminated always
if ( p[m_rawUtf8ContentSize-1]) { char *xx=NULL;*xx=0;}
// make sure we don't breach the buffer when parsing it
char size;
char *lastp = NULL;
for ( ; ; p += size ) {
QUICKPOLL(m_niceness);
if ( p >= pend ) break;
lastp = p;
size = getUtf8CharSize(p);
}
// overflow?
if ( p > pend && lastp ) {
// back up to the bad utf8 char that made us overshoot
p = lastp;
// space it out
for ( ; p < pend ; p++ ) *p = ' ';
// log it; maybe because we are not a keepalive http server?
log("doc: fix bad utf8 overflow (because we are not "
"keepalive?) in doc %s",m_firstUrl.m_url);
}
// overflow?
if ( p != pend ) { char *xx=NULL;*xx=0; }
// sanity check for breach. or underrun in case we encountered a
// premature \0
if (p-m_rawUtf8Content!=m_rawUtf8ContentSize-1) {char*xx=NULL;*xx=0;}
// sanity -- must be \0 terminated
if ( m_rawUtf8Content[m_rawUtf8ContentSize-1] ) {char *xx=NULL;*xx=0; }
// it might have shrunk us
//m_rawUtf8ContentSize = n + 1;
// we are good to go
m_rawUtf8ContentValid = true;
//return &ptr_utf8Content;
return &m_rawUtf8Content;
}
// this is so Msg13.cpp can call getExpandedUtf8Content() to do its
// iframe expansion logic
void getExpandedUtf8ContentWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
char **retVal = THIS->getExpandedUtf8Content();
// return if blocked again
if ( retVal == (void *)-1 ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// now if there are any <iframe> tags let's substitute them for
// the html source they represent here. that way we will get all the
// information you see on the page. this is somewhat critical since
// a lot of pages have their content in the frame.
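// . each <frame>/<iframe> src (except empty, about:blank, non-http,
//   google/bing and self references, or tags inside <script>) is
//   fetched as an extra doc and its utf-8 content is spliced in right
//   after the original tag, wrapped in <gbframe>...</gbframe> (plus
//   <script> tags if the frame content is javascript) so Sections.cpp
//   can tell it apart
// . e.g. <iframe src="/nav.html"> is kept and immediately followed by
//   <gbframe>...the fetched /nav.html content...</gbframe>
// . at most 5 frames are expanded per doc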
char **XmlDoc::getExpandedUtf8Content ( ) {
// if we already computed it, return that
if ( m_expandedUtf8ContentValid ) return &m_expandedUtf8Content;
// if called from spider compression proxy we need to set
// masterLoop here now
if ( ! m_masterLoop ) {
m_masterLoop = getExpandedUtf8ContentWrapper;
m_masterState = this;
}
// get the unexpanded content first
char **up = getRawUtf8Content ();
if ( ! up || up == (void *)-1 ) return up;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
// NULL out if no content
if ( ! *up ) {
m_expandedUtf8Content = NULL;
m_expandedUtf8ContentSize = 0;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
// do not do iframe expansion in order to keep injections fast
if ( m_wasContentInjected ) {
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
bool skip = m_skipIframeExpansion;
// if we are a warc, arc or doc that consists of a sequence of
// sub-docs that we are indexing/injecting then skip iframe expansion
if ( isContainerDoc() )
skip = true;
// or if this is set to true
if ( skip ) {
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
// if we have a json reply, leave it alone... do not expand iframes
// in json, it will mess up the json
if ( *ct == CT_JSON ) {
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
// we need this so getExtraDoc does not core
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
// point to it
char *p = *up;
char *pend = *up + m_rawUtf8ContentSize; // includes \0
// declare crap up here so we can jump into the for loop
int32_t urlLen;
char *url;
char *fend;
Url furl;
XmlDoc **ped;
XmlDoc *ed;
bool inScript = false;
bool match;
// assign saved value if we got that
if ( m_savedp ) {
// restore "p"
p = m_savedp;
// update this
ed = m_extraDoc;
// and see if we got the mime now
goto gotMime;
}
// now loop for frame and iframe tags
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
// breathe
QUICKPOLL(m_niceness);
// if never found a frame tag, just keep on chugging
if ( *p != '<' ) continue;
// <script>?
if ( to_lower_a(p[1]) == 's' &&
to_lower_a(p[2]) == 'c' &&
to_lower_a(p[3]) == 'r' &&
to_lower_a(p[4]) == 'i' &&
to_lower_a(p[5]) == 'p' &&
to_lower_a(p[6]) == 't' )
inScript = 1;
// </script>?
if ( p[1]=='/' &&
to_lower_a(p[2]) == 's' &&
to_lower_a(p[3]) == 'c' &&
to_lower_a(p[4]) == 'r' &&
to_lower_a(p[5]) == 'i' &&
to_lower_a(p[6]) == 'p' &&
to_lower_a(p[7]) == 't' )
inScript = 0;
// . skip if in script
// . fixes guysndollsllc.com which has an iframe tag in
// a script section, "document.write ('<iframe..."
if ( inScript ) continue;
// iframe or frame?
match = false;
if ( to_lower_a(p[1]) == 'f' &&
to_lower_a(p[2]) == 'r' &&
to_lower_a(p[3]) == 'a' &&
to_lower_a(p[4]) == 'm' &&
to_lower_a(p[5]) == 'e' )
match = true;
if ( to_lower_a(p[1]) == 'i' &&
to_lower_a(p[2]) == 'f' &&
to_lower_a(p[3]) == 'r' &&
to_lower_a(p[4]) == 'a' &&
to_lower_a(p[5]) == 'm' &&
to_lower_a(p[6]) == 'e' )
match = true;
// skip tag if not iframe or frame
if ( ! match ) continue;
// check for frame or iframe
//if ( strncasecmp(p+1,"frame " , 6) &&
// strncasecmp(p+1,"iframe ", 7) )
// continue;
// get src tag (function in Words.h)
url = getFieldValue ( p , pend - p ,"src" , &urlLen );
// needs a src field
if ( ! url ) continue;
// "" is not acceptable either. techcrunch.com has
// <iframe src=""> which ends up embedding the root url.
if ( urlLen == 0 )
continue;
// skip if "about:blank"
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
continue;
// get our current url
//cu = getCurrentUrl();
// set our frame url
furl.set ( cu , url , urlLen );
// no recursion
if ( strcmp(furl.getUrl(),m_firstUrl.getUrl()) == 0 )
continue;
// must be http or https, not ftp! ftp was causing us to
// core in Msg22.cpp where it checks the url's protocol
// when trying to lookup the old title rec.
// http://sweetaub.ipower.com/ had an iframe with a ftp url.
if ( ! furl.isHttp() && ! furl.isHttps() ) continue;
// ignore google.com/ assholes for now
if ( strstr(furl.getUrl(),"google.com/" ) ) continue;
// and bing just to be safe
if ( strstr(furl.getUrl(),"bing.com/" ) ) continue;
// save it in case we have to return and come back later
m_savedp = p;
// break here
//log("mdw: breakpoing here");
// . download that. get as a doc. use 0 for max cache time
// . no, use 5 seconds since we often have the same iframe
// in the root doc that we have in the main doc, like a
// facebook iframe or something.
// . use a m_maxCacheAge of 5 seconds now!
ped = getExtraDoc ( furl.m_url , 5 );
// should never block
if ( ! ped ) {
log("xmldoc: getExpandedutf8content = %s",
mstrerror(g_errno));
return NULL;
}
// . return -1 if it blocked???
// . no, this is not supported right now
// . it will mess up our for loop
if ( ped == (void *)-1 ) {char *xx=NULL;*xx=0;}
// cast it
ed = *ped;
// sanity
if ( ! ed ) { char *xx=NULL;*xx=0; }
// jump in here from above
gotMime:
// make it not use the ips.txt cache
//ed->m_useIpsTxtFile = false;
//ed->m_readFromTestCache = false;
// get the mime
HttpMime *mime = ed->getMime();
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
// if not success, do not expand it i guess...
if ( mime->getHttpStatus() != 200 ) {
// free it
nukeDoc ( ed );
// and continue
continue;
}
// update m_downloadEndTime if we should
if ( ed->m_downloadEndTimeValid ) {
// we must already be valid
if ( ! m_downloadEndTimeValid ) {char *xx=NULL;*xx=0;}
// only replace it if it had ip and robots.txt allowed
if ( ed->m_downloadEndTime )
m_downloadEndTime = ed->m_downloadEndTime;
}
// re-write that extra doc into the content
char **puc = ed->getRawUtf8Content();
// this should not block
//if ( puc == (void *)-1 ) { char *xx=NULL;*xx=0; }
// it blocked before! because the charset was not known!
if ( puc == (void *)-1 ) return (char **)puc;
// error?
if ( ! puc ) return (char **)puc;
// cast it
char *uc = *puc;
// or if no content, and no mime (like if robots.txt disallows)
if ( ! uc || ed->m_rawUtf8ContentSize == 1 ) {
// free it
nukeDoc ( ed );
// and continue
continue;
}
// size includes terminating \0
if ( uc[ed->m_rawUtf8ContentSize-1] ) { char *xx=NULL;*xx=0;}
// if first time we are expanding, set this
if ( ! m_oldp ) m_oldp = *up;
// find end of frame tag
fend = p;
for ( ; fend < pend ; fend += getUtf8CharSize(fend) ) {
// breathe
QUICKPOLL(m_niceness);
// if never found a frame tag, just keep on chugging
if ( *fend == '>' ) break;
}
// if no end to the iframe tag was found, bail then...
if ( fend >= pend ) continue;
// skip the >
fend++;
// insert the non-frame crap first AND the frame/iframe tag
m_esbuf.safeMemcpy ( m_oldp , fend - m_oldp );
// end the frame
//m_esbuf.safeMemcpy ( "</iframe>", 9 );
// use our own special tag so Sections.cpp can set
// Section::m_gbFrameNum which it uses internally
m_esbuf.safePrintf("<gbframe>"); // gbiframe
// identify javascript
bool javascript = false;
if ( *ed->getContentType() == CT_JS ) javascript = true;
// so we do not mine javascript for cities and states etc.
// in Address.cpp
if ( javascript ) m_esbuf.safePrintf("<script>");
// store that
m_esbuf.safeMemcpy ( uc , ed->m_rawUtf8ContentSize - 1 );
// our special tag has an end tag as well
if ( javascript ) m_esbuf.safePrintf("</script>");
m_esbuf.safePrintf("</gbframe>");
// free up ed
nukeDoc ( ed );
// end of frame tag, skip over whole thing
m_oldp = fend ;
// sanity check
if ( m_oldp > pend ) { char *xx=NULL;*xx=0; }
// another flag
m_didExpansion = true;
// count how many we did
if ( ++m_numExpansions >= 5 ) break;
}
// default
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
// point to expansion buffer if we did any expanding
if ( m_didExpansion ) {
// copy over the rest
m_esbuf.safeMemcpy ( m_oldp , pend - m_oldp );
// null term it
m_esbuf.pushChar('\0');
// and point to that buffer
m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf;
// include the \0 as part of the size
m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
}
// sanity -- must be \0 terminated
if ( m_expandedUtf8Content[m_expandedUtf8ContentSize-1] ) {
char *xx=NULL;*xx=0; }
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
static SafeBuf s_cookieBuf;
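// . thread entry that wgets m_firstUrl (passing the archive.org cookie
//   in s_cookieBuf) into <workdir>gbarchivefile<ptr>.gz and gunzips it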
void *systemStartWrapper_r ( void *state , ThreadEntry *t ) {
XmlDoc *THIS = (XmlDoc *)state;
char filename[2048];
snprintf(filename,2048,"%sgbarchivefile%"UINT32".gz",
g_hostdb.m_dir,
(int32_t)(int64_t)THIS);
char cmd[MAX_URL_LEN+256];
snprintf( cmd,
MAX_URL_LEN+256,
"wget -q --header=\"Cookie: %s\" \"%s\" -O %s" ,
s_cookieBuf.getBufStart() ,
THIS->m_firstUrl.getUrl() ,
filename );
log("build: wget: %s",cmd );
int ret;
ret = system(cmd);
if ( ret == -1 )
log("build: wget system failed: %s",mstrerror(errno));
else
log("build: wget system returned %"INT32"",ret);
// unzip it now
snprintf ( cmd , MAX_URL_LEN+256, "gunzip -f %s" , filename );
log("build: wget begin: %s",cmd );
ret = system(cmd);
if ( ret == -1 )
log("build: gunzip system failed: %s",mstrerror(errno));
else
log("build: gunzip system returned %"INT32"",ret);
log("build: done with gunzip");
return NULL;
}
// come back here
void systemDoneWrapper ( void *state , ThreadEntry *t ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_masterLoop ( THIS->m_masterState );
}
// we download large files to a file on disk, like warcs and arcs
FILE *XmlDoc::getUtf8ContentInFile () {
setStatus ("wgetting archive file");
// if ( m_calledWgetThread ) {
// char filename[2048];
// snprintf ( filename,
// 2048,
// "gbarchivefile%"UINT32"",
// (int32_t)(int64_t)this);
// m_file.set ( g_hostdb.m_dir , filename );
// m_fileSize = m_file.getFileSize();
// m_fileValid = true;
// *fileSizeArg = m_fileSize;
// m_file.open(O_RDONLY);
// // explicitly set it to false now to make it harder for
// // it not to be true because that messes things up
// m_file.m_usePartFiles = false;
// return &m_file;
// }
// before calling the system wget thread we gotta set the cookiebuf
// HACK: for archive.org
// if getting a page from archive.org then append the cookie
// so we have the proper permissions
static bool s_triedToLoadCookie = false;
char *x = m_firstUrl.getUrl();
	// only scan the first 25 chars looking for the start of .archive.org/
char *xend = x + 25;
bool isArchiveOrg = false;
for ( ; x < xend && *x ; x++ ) {
if ( x[ 0] != '.' && x[0] != '/' ) continue; // /archive.org?
if ( x[ 1] != 'a' ) continue;
if ( x[ 2] != 'r' ) continue;
if ( x[ 3] != 'c' ) continue;
if ( x[ 4] != 'h' ) continue;
if ( x[ 5] != 'i' ) continue;
if ( x[ 6] != 'v' ) continue;
if ( x[ 7] != 'e' ) continue;
if ( x[ 8] != '.' ) continue;
if ( x[ 9] != 'o' ) continue;
if ( x[10] != 'r' ) continue;
if ( x[11] != 'g' ) continue;
if ( x[12] != '/' ) continue;
isArchiveOrg = true;
break;
}
if ( isArchiveOrg && ! s_triedToLoadCookie ) {
// try to load it up if haven't tried yet
s_triedToLoadCookie = true;
SafeBuf tmp;
//int32_t loaded = tmp.load ( "/home/mwells/.config/internetarchive.yml");
int32_t loaded = tmp.load ( "auth/internetarchive.yml");
if(loaded <= 0) {
if ( ! g_errno ) g_errno = EDOCTOOBIG;
log("gb: failed to load auth/internetarchive.yml: "
"%s",mstrerror(g_errno));
// do not restart gb in a loop, so return 0 to shell
exit(0);
//return NULL;
// FIXME
char *xx=NULL;*xx=0;
}
char *s = tmp.getBufStart();
char *line;
		char *lineEnd = NULL;
line = strstr ( s , "logged-in-user: " );
if ( line ) lineEnd = strstr(line,"\n");
if ( lineEnd ) {
s_cookieBuf.safePrintf("logged-in-user=");
line += 16;
s_cookieBuf.safeMemcpy(line,lineEnd-line);
s_cookieBuf.pushChar(';');
s_cookieBuf.pushChar(' ');
s_cookieBuf.nullTerm();
}
line = strstr ( s , "logged-in-sig: " );
if ( line ) lineEnd = strstr(line,"\n");
if ( lineEnd ) {
s_cookieBuf.safePrintf("logged-in-sig=");
line += 15;
s_cookieBuf.safeMemcpy(line,lineEnd-line);
//s_cookieBuf.pushChar(';');
//s_cookieBuf.pushChar(' ');
s_cookieBuf.nullTerm();
}
}
// if we loaded something use it
if ( isArchiveOrg && s_cookieBuf.length() ) {
//cookie = s_cookieBuf.getBufStart();
log("http: using archive cookie %s",s_cookieBuf.getBufStart());
// and set user-agent too
// userAgent = "python-requests/2.3.0 "
// "CPython/2.7.3 Linux/3.5.0-32-generic";
}
char cmd[MAX_URL_LEN+256];
snprintf( cmd,
MAX_URL_LEN+256,
"set -o pipefail|"
"wget --limit-rate=10M -O- --header=\"Cookie: %s\" \"%s\"|" //
"zcat|"
"mbuffer -t -m 10M -o-", //this is useful but we need a new version of mbuffer -W 30
s_cookieBuf.getBufStart() ,
m_firstUrl.getUrl());
log("build: wget: %s",cmd );
FILE* fh = gbpopen(cmd);
int fd = fileno(fh);
int flags = fcntl(fd, F_GETFL, 0);
if(fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
log("build: could not set wget stream to nonblocking %s",
m_firstUrl.getUrl());
//error
}
if(!g_loop.registerReadCallback ( fd,
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return NULL;
}
m_registeredWgetReadCallback = true;
log("build: called popen");
m_calledWgetThread = true;
m_hasMoreToRead = true;
return fh;
// return getUtf8ContentInFile ( fileSizeArg );
// . callThread returns true on success, in which case we block
// if ( g_threads.call ( FILTER_THREAD ,
// MAX_NICENESS ,
// (void *)this , // this
// systemDoneWrapper ,
// systemStartWrapper_r ) )
// // would block, wait for thread
// return (BigFile *)-1;
// // failed?
// log("build: failed to launch wget thread");
// If we run it in this thread then if we are fetching
// a local url it will block forever.
// systemStartWrapper_r(this,NULL);
// return getUtf8ContentInFile ( fileSizeArg );
//g_errno = ETHREADSDISABLED;
//return NULL;
}
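// Illustrative sketch (an assumption about the callback, not the actual
// implementation): with the popen'd wget|zcat|mbuffer pipeline set to
// O_NONBLOCK and registered with g_loop above, the read side can drain
// the fd roughly like this, returning on EAGAIN and resuming when the
// next read callback fires:
//
//	char buf[64*1024];
//	int n = read ( fd , buf , sizeof(buf) );
//	if      ( n >  0 ) { /* append to the warc/arc parse buffer */ }
//	else if ( n == 0 ) { /* pipeline done; clear m_hasMoreToRead */ }
//	else if ( errno == EAGAIN ) { /* no data yet; wait for callback */ }
//	else               { /* real read error */ }
//
// The real logic lives in doneReadingArchiveFileWrapper() and the warc/arc
// parsing code, not here.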
// . get the final utf8 content of the document
// . all html entities are replaced with utf8 chars
// . all iframes are expanded
// . if we are using diffbot then getting the utf8 content should return
// the json which is the output from the diffbot api. UNLESS we are getting
// the webpage itself for harvesting outlinks to spider later.
char **XmlDoc::getUtf8Content ( ) {
// if we already computed it, return that
if ( m_utf8ContentValid ) return &ptr_utf8Content;
if ( m_setFromTitleRec ) {
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus("getting utf8 content");
// recycle?
if ( cr->m_recycleContent || m_recycleContent ||
// if trying to delete from index, load from old titlerec
m_deleteFromIndex ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
		// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
ptr_utf8Content = od-> ptr_utf8Content;
size_utf8Content = od->size_utf8Content;
m_utf8ContentValid = true;
m_contentType = od->m_contentType;
m_contentTypeValid = true;
// sanity check
if ( ptr_utf8Content &&
ptr_utf8Content[size_utf8Content-1] ) {
char *xx=NULL;*xx=0; }
return &ptr_utf8Content;
}
// if could not find title rec and we are docid-based then
// we can't go any further!!
if ( m_setFromDocId ||
// it should be there if trying to delete as well!
m_deleteFromIndex ) {
log("xmldoc: null utf8 content for docid-based "
"titlerec (d=%"INT64") lookup which was not found",
m_docId);
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
m_contentType = CT_HTML;
m_contentTypeValid = true;
return &ptr_utf8Content;
}
}
char **ep = getExpandedUtf8Content();
if ( ! ep || ep == (void *)-1 ) return ep;
// NULL out if no content
if ( ! *ep ) {
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
// if we have a json reply, leave it alone... expanding a &quot;
// into a double quote will mess up the JSON!
if ( *ct == CT_JSON ) {
ptr_utf8Content = (char *)m_expandedUtf8Content;
size_utf8Content = m_expandedUtf8ContentSize;
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
	// why would the spider proxy, which uses msg13.cpp to call
	// XmlDoc::getExpandedUtf8Content(), want to call this??? it seems
	// to destroy expandedutf8content with a call to htmldecode
if ( m_isSpiderProxy ) { char *xx=NULL;*xx=0; }
// not if rss file extension
//bool isRSSExt = false;
//char *ext = m_firstUrl.getExtension();
//if ( ext && strcasecmp(ext,"rss") == 0 ) isRSSExt = true;
//if ( ext && strcasecmp(ext,"xml") == 0 ) isRSSExt = true;
//if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentTypeValid && m_contentType == CT_XML ) isRSSExt = true;
// convert &lt; to <gb and &gt; to gb/> ???? and &amp; to utf32 char
// for a double wide ampersand?
//bool doSpecial = true;
// convert to what it should be if we are an .rss file extension
//if ( isRSSExt ) doSpecial = false;
	// sanity check
if ( m_xmlValid ) { char *xx=NULL;*xx=0; }
if ( m_wordsValid ) { char *xx=NULL;*xx=0; }
QUICKPOLL(m_niceness);
//
// convert illegal utf8 characters into spaces
//
// fixes santaclarachorale.vbotickets.com/tickets/g.f._handels_israel_in_egypt/1062
// which has a 228,0x80,& sequence (3 chars, last is ascii)
uint8_t *x = (uint8_t *)m_expandedUtf8Content;
char size;
for ( ; *x ; x += size ) {
QUICKPOLL(m_niceness);
size = getUtf8CharSize(x);
// ok, make it a space i guess if it is a bad utf8 char
if ( ! isSaneUtf8Char(x) ) {
*x = ' ';
size = 1;
continue;
}
// skip if only one byte
if ( size == 1 ) continue;
// now each byte in the sequence must have 0x80 set...
if ( ! (x[1] & 0x80) ) {
x[0] = ' ';
size = 1;
continue;
}
if ( size == 2 ) continue;
if ( ! (x[2] & 0x80) ) {
x[0] = ' ';
size = 1;
continue;
}
if ( size == 3 ) continue;
if ( ! (x[3] & 0x80) ) {
x[0] = ' ';
size = 1;
continue;
}
}
// sanity
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
// if we are an xml doc, then before we call htmlDecode translate
// all tags like <title> or <link> to <gbtitle> or <gblink> so we
// know they are xml tags. because stuff like &lt;br&gt; will
// become <br> and will be within its xml tag like <gbdescription>
// or <gbtitle>.
// MDW: 9/28/2014. no longer do this since i added hashXmlFields().
/*
if ( m_contentType == CT_XML ) {
// count the xml tags
char *p = m_expandedUtf8Content;
char *pend = p + m_expandedUtf8ContentSize - 1;
int32_t need = m_expandedUtf8ContentSize;
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if ( *p == '<' ) need += 5; // for adding "gbxml"
}
if ( ! m_xbuf.reserve(need) ) return NULL;
// reset ptr
p = m_expandedUtf8Content;
		// point to dst
char *dst = m_xbuf.getBufStart();
// do the copy
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL(m_niceness);
// copy it over
*dst++ = *p;
if ( *p != '<' ) continue;
// if <?xml> copy over as is, do not insert 'gb'
if ( p[1] == '?' ) continue;
// same for comments <!--...-->
if ( p[1] == '!' ) continue;
// point to tagname
char *tagName = p+1;
if ( p[1] == '/' ) tagName++;
// also get the full node now
NodeType *nt; getTagId ( tagName , &nt );
// if it is not an html tag, do not fuss with it!
if ( ! nt ) continue;
// if its in the list but is xml, let it go too
if ( nt->m_isXmlTag ) continue;
// . otherwise, its an html tag being used as an xml
// tag and we need to encode (append gbxml to it)
// . insert / first if there
if ( p[1] == '/' ) {p++;*dst++ = *p;}
// then "gb"
*dst++ = 'g';
*dst++ = 'b';
*dst++ = 'x';
*dst++ = 'm';
*dst++ = 'l';
}
// update
m_xbuf.m_length = dst - m_xbuf.getBufStart();
// final \0
*dst = '\0';
// re-assign these
m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf;
m_expandedUtf8ContentSize = m_xbuf.m_length + 1;
// free esbuf if we were referencing that to save mem
m_esbuf.purge();
}
*/
// richmondspca.org has &quot; in some tags and we do not like
// expanding that to " because it messes up XmlNode::getTagLen()
// and creates big problems. same for www.first-avenue.com. so
	// by setting doSpecial to true we change &lt; &gt; and &quot; to
	// [ ] and ' which have no meaning in html per se.
bool doSpecial = true;
if ( m_contentType == CT_XML ) doSpecial = false;
	// . now decode those html entities into utf8 so that we never have to
	//   check for html entities anywhere else in the code. a big win!!
	// . doSpecial = true, so that &lt;, &gt;, &amp; and &quot; are
// encoded into high value
// utf8 chars so that Xml::set(), etc. still work properly and don't
// add any more html tags than it should
// . this will decode in place
// . MDW: 9/28/2014. no longer do for xml docs since i added
// hashXmlFields()
int32_t n = m_expandedUtf8ContentSize - 1;
if ( m_contentType != CT_XML )
n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
m_expandedUtf8Content,//ptr_utf8Content,
m_expandedUtf8ContentSize-1,//size_utf8Con
doSpecial,
m_niceness);
// can't exceed this! n does not include the final \0 even though
	// we do write it out.
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
// sanity
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
// now rss has crap in it like "&amp;nbsp;" so we have to do another
// decoding pass
// . MDW: 9/28/2014. no longer do for xml docs since i added
// hashXmlFields()
// if ( m_contentType == CT_XML ) // isRSSExt )
// n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
// m_expandedUtf8Content,//ptr_utf8Content,
// n,
// false,//doSpecial,
// m_niceness);
// sanity
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
// sanity
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
	// finally transform utf8 apostrophes into regular apostrophes
// to make parsing easier
uint8_t *p = (uint8_t *)m_expandedUtf8Content;
uint8_t *dst = (uint8_t *)m_expandedUtf8Content;
uint8_t *pend = p + n;
for ( ; *p ; p += size ) {
QUICKPOLL(m_niceness);
size = getUtf8CharSize(p);
// quick copy
if ( size == 1 && p[0] != '<' ) { *dst++ = *p; continue; }
// make "1<super>st</super>" into "1st" so Dates.cpp can
// have an easier time
if ( p[0] == '<' &&
to_lower_a(p[1]) == 's' &&
to_lower_a(p[2]) == 'u' &&
to_lower_a(p[3]) == 'p' ) {
// assume no go!
*dst++ = '<';
// use this
char *s = (char *)p;
			// did a number precede?
char *pn = s - 1;
for (;pn>=m_expandedUtf8Content&&is_wspace_a(*pn);pn--)
QUICKPOLL(m_niceness);
// must be like "1st" or "32nd"
if ( ! is_digit(*pn) ) continue;
// skip the "<sup"
s += 4;
// skip until >
for (; *s && *s != '>' ; s++ )
QUICKPOLL(m_niceness);
// crazy?
if ( ! *s ) continue;
// skip the '>'
s++;
// skip spaces after the "<sup>" tag
for (; *s && is_wspace_a(*s) ; s++ )
QUICKPOLL(m_niceness);
// crazy?
if ( ! *s ) continue;
// check for "st" etc
bool gotIt = false;
char *suffix = s;
if ( (to_lower_a(s[0])=='s'&&to_lower_a(s[1]) == 't')||
(to_lower_a(s[0])=='n'&&to_lower_a(s[1]) == 'd')||
(to_lower_a(s[0])=='r'&&to_lower_a(s[1]) == 'd')||
(to_lower_a(s[0])=='t'&&to_lower_a(s[1]) == 'h'))
gotIt = true;
if ( ! gotIt ) continue;
// skip that
s += 2;
// skip more spaces
for (; *s && is_wspace_a(*s) ; s++ )
QUICKPOLL(m_niceness);
// crazy?
if ( ! *s ) continue;
			// find the </sup> end tag
if ( s[0] != '<' ) continue;
if ( s[1] != '/' ) continue;
if ( to_lower_a(s[2]) != 's' ) continue;
if ( to_lower_a(s[3]) != 'u' ) continue;
if ( to_lower_a(s[4]) != 'p' ) continue;
if ( s[5] != '>' ) continue;
// skip it, point to >
s += 5;
// assign p to that
p = (unsigned char *)s;
			// back up over the no-go '<' we emitted above
dst--;
// rewrite it
*dst++ = to_lower_a(suffix[0]);
*dst++ = to_lower_a(suffix[1]);
// do next round
continue;
}
// check for crazy apostrophes
if ( p[0]==0xe2 &&
p[1]==0x80 &&
(p[2]==0x99 ||
p[2]==0x98 ||
p[2]==0x9b ) ) {
*dst++ = '\'';
continue;
}
// utf8 control character?
if ( p[0] == 0xc2 &&
p[1] >= 0x80 &&
p[1] <= 0x9f ) {
*dst++ = ' ';
continue;
}
// double quotes in utf8
// DO NOT do this if type JSON!! json uses quotes as
// control characters
if ( p[0] == 0xe2 &&
p[1] == 0x80 &&
m_contentType != CT_JSON ) {
if (p[2] == 0x9c ) {
*dst++ = '\"';
continue;
}
if (p[2] == 0x9d ) {
*dst++ = '\"';
continue;
}
}
// and crazy hyphens (8 - 10pm)
if ( p[0]==0xc2 &&
p[1]==0xad ) {
*dst++ = '-';
continue;
}
if ( p[0]==0xe2 &&
p[1]==0x80 &&
p[2]==0x93 ) {
*dst++ = '-';
continue;
}
if ( p[0]==0xe2 &&
p[1]==0x80 &&
p[2]==0x94 ) {
*dst++ = '-';
continue;
}
// . convert all utf8 white space to ascii white space
// . should benefit the string matching algo in
// XmlDoc::getEventSummary() which needs to skip spaces
if ( ! g_map_is_ascii[(unsigned char)*p] &&
is_wspace_utf8(p) ) {
*dst++ = ' ';
continue;
}
// otherwise, just copy it
gbmemcpy(dst,p,size);
dst += size;
}
// null term
*dst++ = '\0';
// now set it up
ptr_utf8Content = (char *)m_expandedUtf8Content;
//size_utf8Content = n+1;//m_expandedUtf8ContentSize;
size_utf8Content = (char *)dst - m_expandedUtf8Content;
// sanity -- skipped over the \0???
if ( p > pend ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) {
char *xx=NULL;*xx=0; }
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
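// Summary of the in-place normalization done above (added commentary; the
// byte sequences are taken from the code, the sample strings are
// hypothetical):
//   - insane/truncated utf8 sequences        -> ' '
//   - html entities (non-XML docs)           -> decoded by htmlDecode()
//   - "1<sup>st</sup>"                       -> "1st"
//   - e2 80 98/99/9b (curly apostrophes)     -> '\''
//   - e2 80 9c/9d (curly double quotes)      -> '"'  (skipped for JSON)
//   - c2 80..9f (C1 control chars)           -> ' '
//   - c2 ad, e2 80 93, e2 80 94 (hyphens)    -> '-'
//   - any other utf8 whitespace              -> ' '
// Every rewrite emits at most as many bytes as it consumes, which is why
// the loop can safely write over the same buffer it reads from.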
// *pend should be \0
int32_t getContentHash32Fast ( unsigned char *p ,
int32_t plen ,
int32_t niceness ) {
// sanity
if ( ! p ) return 0;
if ( plen <= 0 ) return 0;
if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
unsigned char *pend = p + plen;
static bool s_init = false;
static char s_qtab0[256];
static char s_qtab1[256];
static char s_qtab2[256];
static char *s_skips[] = {
"jan",
"feb",
"mar",
"apr",
"may",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
"sun",
"mon",
"tue",
"wed",
"thu",
"fri",
"sat" };
if ( ! s_init ) {
// only call this crap once
s_init = true;
// clear up
memset(s_qtab0,0,256);
memset(s_qtab1,0,256);
memset(s_qtab2,0,256);
for ( int32_t i = 0 ; i < 19 ; i++ ) {
unsigned char *s = (unsigned char *)s_skips[i];
s_qtab0[(unsigned char)to_lower_a(s[0])] = 1;
s_qtab0[(unsigned char)to_upper_a(s[0])] = 1;
// do the quick hash
unsigned char qh = to_lower_a(s[0]);
qh ^= to_lower_a(s[1]);
qh <<= 1;
qh ^= to_lower_a(s[2]);
s_qtab1[qh] = 1;
// try another hash, the swift hash
unsigned char sh = to_lower_a(s[0]);
sh <<= 1;
sh ^= to_lower_a(s[1]);
sh <<= 1;
sh ^= to_lower_a(s[2]);
s_qtab2[sh] = 1;
}
}
bool lastWasDigit = false;
bool lastWasPunct = true;
uint32_t h = 0LL;
//char size = 0;
unsigned char pos = 0;
for ( ; p < pend ; p++ ) { // += size ) {
// breathe
QUICKPOLL ( niceness );
// get size
// this might not be utf8!!!
//size = getUtf8CharSize(p);
// skip if not alnum
// this might not be utf8!!!
//if ( ! is_alnum_utf8 ( (char *)p ) ) {
if ( ! is_alnum_a ( *p ) ) {
lastWasDigit = false;
lastWasPunct = true;
continue;
}
// if its a digit, call it 1
if ( is_digit(*p) ) {
// skip consecutive digits
if ( lastWasDigit ) continue;
// xor in a '1'
h ^= g_hashtab[pos][(unsigned char)'1'];
pos++;
lastWasDigit = true;
continue;
}
// reset
lastWasDigit = false;
		// exclude month names and days of the week so clocks do
// not affect this hash
if ( s_qtab0[p[0]] && lastWasPunct && p[1] && p[2] ) {
// quick hash
unsigned char qh = to_lower_a(p[0]);
qh ^= to_lower_a(p[1]);
qh <<= 1;
qh ^= to_lower_a(p[2]);
// look that up
if ( ! s_qtab1[qh] ) goto skip;
// try another hash, the swift hash
unsigned char sh = to_lower_a(p[0]);
sh <<= 1;
sh ^= to_lower_a(p[1]);
sh <<= 1;
sh ^= to_lower_a(p[2]);
if ( ! s_qtab2[sh] ) goto skip;
// ok, probably a match..
unsigned char *s = p + 3;
// skip to end of word
//char size2;
//for ( ; s < pend ; s += size2 ) {
for ( ; s < pend ; s++ ) {
//size2 = getUtf8CharSize(s);
//if ( ! is_alnum_utf8 ((char *)s) )
if ( ! is_alnum_a ( *s ) )
break;
}
			// s already points to the next char, so clear this
//size = 0;
// advance p now
p = s;
// hash as one type of thing...
h ^= g_hashtab[pos][(unsigned char)'X'];
pos++;
continue;
}
skip:
// reset this
lastWasPunct = false;
// xor this in right
h ^= g_hashtab[pos][p[0]];
pos++;
// assume ascii or latin1
continue;
/*
// one more?
if ( size == 1 ) continue;
// do that
h ^= g_hashtab[pos][p[1]];
pos++;
// one more?
if ( size == 2 ) continue;
// do that
h ^= g_hashtab[pos][p[2]];
pos++;
// one more?
if ( size == 3 ) continue;
// do that
h ^= g_hashtab[pos][p[3]];
pos++;
// that should do it!
continue;
*/
}
return h;
}
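// Added note on the hash above: runs of digits are hashed as a single '1'
// and any word that starts with a month/day abbreviation (jan..dec,
// sun..sat) is hashed as a single 'X', so, for example (hypothetical
// strings), "Posted Jan 3 2014" and "Posted Feb 17 2015" contribute
// identically to the hash. That keeps getContentHash32Fast() stable for
// pages whose only difference is a date/clock widget.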
int32_t *XmlDoc::getContentHash32 ( ) {
// return it if we got it
if ( m_contentHash32Valid ) return &m_contentHash32;
setStatus ( "getting contenthash32" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (int32_t *)ct;
// we do not hash the url/resolved_url/html fields in diffbot json
// because the url field is a mirror of the url and the html field
// is redundant and would slow us down
if ( *ct == CT_JSON )
return getContentHashJson32();
// if we are a diffbot json object, fake this for now, it will
// be set for real in hashJSON()
// no, because we call this before hashJSON() for to set
// EDOCUNCHANGED above... so just hash the json normally for now
//if ( m_isDiffbotJSONObject ) {
// m_contentHash32 = 0;
// return &m_contentHash32;
//}
// . get the content. get the pure untouched content!!!
// . gotta be pure since that is what Msg13.cpp computes right
// after it downloads the doc...
// . if iframes are present, msg13 gives up
char **pure = getContent();
if ( ! pure || pure == (char **)-1 ) return (int32_t *)pure;
// size
//int32_t n = size_utf8Content - 1;
// hash up to first 10,000 chars
//if ( n > 10000 ) n = 10000;
// do it
//m_contentHash32 = hash32 ( ptr_utf8Content , n );
unsigned char *p = (unsigned char *)(*pure);
int32_t plen = m_contentLen;//size_utf8Content - 1;
// no content means no hash32
if ( plen <= 0 ) {//ptr_utf8Content ) {
m_contentHash32 = 0;
m_contentHash32Valid = true;
return &m_contentHash32;
}
// we set m_contentHash32 in ::hashJSON() below because it is special
// for diffbot since it ignores certain json fields like url: and the
// fields are independent, and numbers matter, like prices
//if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; }
// *pend should be \0
m_contentHash32 = getContentHash32Fast ( p , plen , m_niceness );
// validate
m_contentHash32Valid = true;
return &m_contentHash32;
}
// we do not hash the url/resolved_url/html fields in diffbot json
// because the url field is a mirror of the url and the html field
// is redundant and would slow us down
int32_t *XmlDoc::getContentHashJson32 ( ) {
if ( m_contentHash32Valid ) return &m_contentHash32;
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (int32_t *)jp;
JsonItem *ji = jp->getFirstItem();
int32_t totalHash32 = 0;
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
char *topName = NULL;
// what name level are we?
int32_t numNames = 1;
JsonItem *pi = ji->m_parent;
for ( ; pi ; pi = pi->m_parent ) {
// empty name?
if ( ! pi->m_name ) continue;
if ( ! pi->m_name[0] ) continue;
topName = pi->m_name;
numNames++;
}
// if we are the diffbot reply "html" field do not hash this
// because it is redundant and it hashes html tags etc.!
// plus it slows us down a lot and bloats the index.
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"html") == 0 )
continue;
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"url") == 0 )
continue;
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"pageUrl") == 0 )
continue;
// mike will track down how the hash works in article|3|123456
//if ( ji->m_name && numNames==1 &&
// strcmp(ji->m_name,"diffbotUri") == 0 )
// continue;
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"resolved_url") == 0 )
continue;
if ( topName && strcmp(topName,"stats") == 0 )
continue;
if ( topName && strcmp(topName,"queryString") == 0 )
continue;
if ( topName && strcmp(topName,"nextPages") == 0 )
continue;
if ( topName && strcmp(topName,"textAnalysis") == 0 )
continue;
if ( topName && strcmp(topName,"links") == 0 )
continue;
// hash the fully compound name
int32_t nameHash32 = 0;
JsonItem *p = ji;
char *lastName = NULL;
for ( ; p ; p = p->m_parent ) {
// empty name?
if ( ! p->m_name ) continue;
if ( ! p->m_name[0] ) continue;
// dup? can happen with arrays. parent of string
// in object, has same name as his parent, the
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
if ( p->m_name == lastName ) continue;
// update
lastName = p->m_name;
// hash it up
nameHash32 = hash32(p->m_name,p->m_nameLen,nameHash32);
}
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
//
// . get the value of the json field
// . if it's a number or bool it converts into a string
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
//
// for deduping search results we set m_contentHash32 here for
// diffbot json objects.
//
// we use this hash for setting EDOCUNCHANGED when reindexing
// a diffbot reply. we also use to see if the diffbot reply
// is a dup with another page in the index. thirdly, we use
// to dedup search results, which could be redundant because
// of our spider-time deduping.
//
// make the content hash so we can set m_contentHash32
// for deduping. do an exact hash for now...
int32_t vh32 = hash32 ( val , vlen , m_niceness );
// combine
int32_t combined32 = hash32h ( nameHash32 , vh32 );
// accumulate field/val pairs order independently
totalHash32 ^= combined32;
// debug note
//logf(LOG_DEBUG,"ch32: field=%s nh32=%"UINT32" vallen=%"INT32"",
// ji->m_name,
// nameHash32,
// vlen);
}
m_contentHash32 = totalHash32;
m_contentHash32Valid = true;
return &m_contentHash32;
}
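// Added note (hypothetical field names): for a reply like
//   { "title":"Foo", "offerPrice":"$5" }
// each string/number leaf contributes hash32h(nameHash32, hash32(value)),
// where nameHash32 chains the leaf's name with its ancestors' names, and
// the contributions are XOR'd into totalHash32. The XOR makes the final
// m_contentHash32 independent of field order, so a reply with the same
// fields serialized in a different order dedups to the same hash. The
// html/url/pageUrl/resolved_url fields and anything under stats,
// queryString, nextPages, textAnalysis or links are excluded above.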
// do not consider tags except frame, iframe and img... make all months
// and days of the week and digits basically the same
int64_t *XmlDoc::getLooseContentHash64 ( ) {
if ( m_looseContentHash64Valid )
return &m_looseContentHash64;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (int64_t *)xml;
int64_t h64 = 0LL;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if not the right kinda tag
if ( nodes[i].isTag() &&
nodes[i].getNodeId() != TAG_FRAME &&
nodes[i].getNodeId() != TAG_IFRAME &&
nodes[i].getNodeId() != TAG_IMG )
continue;
// hash that node up
int64_t ch64;
// this is really a 32-bit hash
ch64=getContentHash32Fast((unsigned char *)nodes[i].getNode() ,
nodes[i].getNodeLen() ,
m_niceness );
// incorporate hash from that node
h64 = hash64h ( ch64 , h64 );
}
m_looseContentHash64Valid = true;
m_looseContentHash64 = h64;
return &m_looseContentHash64;
}
int32_t XmlDoc::getHostHash32a ( ) {
if ( m_hostHash32aValid ) return m_hostHash32a;
m_hostHash32aValid = true;
Url *f = getFirstUrl();
m_hostHash32a = f->getHostHash32();
return m_hostHash32a;
}
int32_t XmlDoc::getHostHash32b ( ) {
if ( m_hostHash32bValid ) return m_hostHash32b;
m_hostHash32bValid = true;
Url *c = getCurrentUrl();
m_hostHash32b = c->getHostHash32();
return m_hostHash32b;
}
int32_t XmlDoc::getDomHash32( ) {
if ( m_domHash32Valid ) return m_domHash32;
m_domHash32Valid = true;
Url *f = getFirstUrl();
m_domHash32 = hash32 ( f->getDomain(), f->getDomainLen() );
return m_domHash32;
}
// . this will be the actual pnm data of the image thumbnail
// . you can inline it in an image tag like
// <img src="data:image/png;base64,iVBORw0...."/>
// background-image:url(data:image/png;base64,iVBORw0...);
// . FORMAT of ptr_imageData:
// <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg>
char **XmlDoc::getThumbnailData ( ) {
if ( m_imageDataValid ) return &ptr_imageData;
Images *images = getImages();
if ( ! images || images == (Images *)-1 ) return (char **)images;
ptr_imageData = NULL;
size_imageData = 0;
m_imageDataValid = true;
if ( ! images || ! images->m_imageBufValid ) return &ptr_imageData;
if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData;
// this buffer is a ThumbnailArray
ptr_imageData = images->m_imageBuf.getBufStart();
size_imageData = images->m_imageBuf.length();
return &ptr_imageData;
}
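// Illustrative sketch (an assumption based only on the FORMAT comment
// above; the real accessors live in the Images/ThumbnailArray code):
// walking one entry of ptr_imageData would look roughly like
//	char   *p       = ptr_imageData;
//	char   *origUrl = p; p += gbstrlen(origUrl) + 1; // <origimageUrl>\0
//	int32_t tw; gbmemcpy ( &tw , p , 4 ); p += 4;    // thumb width
//	int32_t th; gbmemcpy ( &th , p , 4 ); p += 4;    // thumb height
//	char   *jpg     = p;                             // jpg bytes follow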
Images *XmlDoc::getImages ( ) {
if ( m_imagesValid ) return &m_images;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_makeImageThumbnails ) {
m_images.reset();
m_imagesValid = true;
return &m_images;
}
if ( cr->m_isCustomCrawl ) {
m_images.reset();
m_imagesValid = true;
return &m_images;
}
setStatus ( "getting thumbnail" );
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (Images *)words;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Images *)xml;
Sections *sections = getSections();
if ( ! sections || sections==(Sections *)-1) return (Images *)sections;
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (Images *)site;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Images *)d;
int8_t *hc = getHopCount();
if ( ! hc || hc == (void *)-1 ) return (Images *)hc;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Images *)cu;
// . this does not block or anything
// . if we are a diffbot json reply it should just use the primary
// image, if any, as the only candidate
m_images.setCandidates ( cu , words , xml , sections , this );
setStatus ("getting thumbnail");
// assume valid
m_imagesValid = true;
// now get the thumbnail
if ( ! m_images.getThumbnail ( site ,
gbstrlen(site) ,
*d ,
this ,
cr->m_collnum ,
//NULL , // statusPtr ptr
*hc ,
m_masterState,
m_masterLoop ) )
return (Images *)-1;
return &m_images;
}
// . get different attributes of the Links as vectors
// . these are 1-1 with the Links::m_linkPtrs[] array
TagRec ***XmlDoc::getOutlinkTagRecVector () {
// if page has a <meta name=usefakeips content=1> tag
// then use the hash of the links host as the firstip.
// this will speed things up when adding a gbdmoz.urls.txt.*
// file to index every url in dmoz.
char *useFakeIps = hasFakeIpsMetaTag();
if ( ! useFakeIps || useFakeIps == (void *)-1 )
return (TagRec ***)useFakeIps;
// no error and valid, return quick
if ( m_outlinkTagRecVectorValid && *useFakeIps )
return &m_outlinkTagRecVector;
// error?
if ( m_outlinkTagRecVectorValid && m_msge0.m_errno ) {
g_errno = m_msge0.m_errno;
return NULL;
}
// if not using fake ips, give them the real tag rec vector
if ( m_outlinkTagRecVectorValid )
return &m_msge0.m_tagRecPtrs;
Links *links = getLinks();
if ( ! links || links == (void *) -1 ) return (TagRec ***)links;
if ( *useFakeIps ) {
// set to those
m_fakeTagRec.reset();
// just make a bunch ptr to empty tag rec
int32_t need = links->m_numLinks * sizeof(TagRec *);
if ( ! m_fakeTagRecPtrBuf.reserve ( need ) ) return NULL;
// make them all point to the fake empty tag rec
TagRec **grv = (TagRec **)m_fakeTagRecPtrBuf.getBufStart();
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ )
grv[i] = &m_fakeTagRec;
// set it
m_outlinkTagRecVector = grv;
m_outlinkTagRecVectorValid = true;
return &m_outlinkTagRecVector;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// update status msg
setStatus ( "getting outlink tag rec vector" );
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (TagRec ***)gr;
// assume valid
m_outlinkTagRecVectorValid = true;
// go get it
if ( ! m_msge0.getTagRecs ( links->m_linkPtrs ,
links->m_linkFlags ,
links->m_numLinks ,
false , // skip old?
// make it point to this basetagrec if
// the LF_SAMEHOST flag is set for the link
gr ,
cr->m_collnum ,
m_niceness ,
m_masterState ,
m_masterLoop )) {
// sanity check
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
// we blocked
return (TagRec ***)-1;
}
// error?
if ( g_errno ) return NULL;
// or this?
if ( m_msge0.m_errno ) {
g_errno = m_msge0.m_errno;
return NULL;
}
// set it
//m_outlinkTagRecVector = m_msge0.m_tagRecPtrs;
// ptr to a list of ptrs to tag recs
return &m_msge0.m_tagRecPtrs;
}
char *XmlDoc::hasNoIndexMetaTag() {
if ( m_hasNoIndexMetaTagValid )
return &m_hasNoIndexMetaTag;
// assume none
m_hasNoIndexMetaTag = false;
// store value/content of meta tag in here
char mbuf[16];
mbuf[0] = '\0';
char *tag = "noindex";
int32_t tlen = gbstrlen(tag);
// check the xml for a meta tag
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
xml->getMetaContent ( mbuf, 16 , tag , tlen );
if ( mbuf[0] == '1' ) m_hasNoIndexMetaTag = true;
m_hasNoIndexMetaTagValid = true;
return &m_hasNoIndexMetaTag;
}
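// Added note (hypothetical markup): this helper only recognizes the
// gigablast-specific form
//   <meta name="noindex" content="1">
// i.e. a meta tag literally named "noindex" whose content starts with '1';
// the standard <meta name="robots" content="noindex"> form is not what
// this helper checks.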
char *XmlDoc::hasFakeIpsMetaTag ( ) {
if ( m_hasUseFakeIpsMetaTagValid ) return &m_hasUseFakeIpsMetaTag;
char mbuf[16];
mbuf[0] = '\0';
char *tag = "usefakeips";
int32_t tlen = gbstrlen(tag);
// check the xml for a meta tag
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
xml->getMetaContent ( mbuf, 16 , tag , tlen );
m_hasUseFakeIpsMetaTag = false;
if ( mbuf[0] == '1' ) m_hasUseFakeIpsMetaTag = true;
m_hasUseFakeIpsMetaTagValid = true;
return &m_hasUseFakeIpsMetaTag;
}
int32_t **XmlDoc::getOutlinkFirstIpVector () {
Links *links = getLinks();
if ( ! links ) return NULL;
// if page has a <meta name=usefakeips content=1> tag
// then use the hash of the links host as the firstip.
// this will speed things up when adding a gbdmoz.urls.txt.*
// file to index every url in dmoz.
char *useFakeIps = hasFakeIpsMetaTag();
if ( ! useFakeIps || useFakeIps == (void *)-1 )
return (int32_t **)useFakeIps;
if ( *useFakeIps && m_outlinkIpVectorValid )
return &m_outlinkIpVector;
if ( *useFakeIps ) {
int32_t need = links->m_numLinks * 4;
m_fakeIpBuf.reserve ( need );
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) {
uint64_t h64 = links->getHostHash64(i);
int32_t ip = h64 & 0xffffffff;
m_fakeIpBuf.pushLong(ip);
}
int32_t *ipBuf = (int32_t *)m_fakeIpBuf.getBufStart();
m_outlinkIpVector = ipBuf;
m_outlinkIpVectorValid = true;
return &m_outlinkIpVector;
}
// return msge1's buf otherwise
if ( m_outlinkIpVectorValid )
return &m_msge1.m_ipBuf;
// should we have some kinda error for msge1?
//if ( m_outlinkIpVectorValid && m_msge1.m_errno ) {
// g_errno = m_msge1.m_errno;
// return NULL;
//}
// . we now scrounge them from TagRec's "firstip" tag if there!
// . that way even if a domain changes its ip we still use the
// original ip, because the only reason we need this ip is for
// deciding which group of hosts will store this SpiderRequest and
// we use that for throttling, so we have to be consistent!!!
// . we never add -1 or 0 ips to tagdb though.... (NXDOMAIN,error...)
// . uses m_msgeForTagRecs for this one
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (int32_t **)grv;
// note it
setStatus ( "getting outlink first ip vector" );
// assume valid
m_outlinkIpVectorValid = true;
// sanity check
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// use this
int32_t nowGlobal = getSpideredTime();//m_spideredTime;
// add tags to tagdb?
bool addTags = true;
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
if ( getIsPageParser() ) addTags = false;
// get this
char *testDir = getTestDir();
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . go get it
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
// see if the ip is in there for the given url hostname
// . this will now update Tagdb with the "firstip" tags if it should!!
// . this just dns looks up the DOMAINS of each outlink because these
// are *first* ips and ONLY used by Spider.cpp for throttling!!!
if ( ! m_msge1.getFirstIps ( *grv ,
links->m_linkPtrs ,
links->m_linkFlags ,
links->m_numLinks ,
false , // skip old?
cr->m_coll ,
m_niceness ,
m_masterState ,
m_masterLoop ,
nowGlobal ,
addTags ,
testDir )) {
// sanity check
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
// we blocked
return (int32_t **)-1;
}
// error?
if ( g_errno ) return NULL;
// . ptr to a list of ptrs to tag recs
// . ip will be -1 on error
return &m_msge1.m_ipBuf;
}
/*
// really this could just check titledb in memory tree and tfndb and should
// be really fast!!
char **XmlDoc::getOutlinkIsIndexedVector () {
if ( m_outlinkIsIndexedVectorValid ) return &m_msge2.m_isIndexedBuf;
setStatus ( "getting outlink is indexed vector" );
Links *links = getLinks();
if ( ! links ) return NULL;
// assume valid
m_outlinkIsIndexedVectorValid = true;
// go get it
bool status = m_msge2.getIsIndexed ( links->m_linkPtrs ,
links->m_linkFlags ,
links->m_numLinks ,
false , // skip old?
m_coll ,
m_niceness ,
m_masterState ,
m_masterLoop );
// set it
//m_outlinkIsIndexedVector = m_msge2.m_isIndexedBuf;
// we blocked
if ( ! status ) return (char **)-1;
// error?
if ( g_errno ) return NULL;
// ptr to a list of ptrs to tag recs
return &m_msge2.m_isIndexedBuf;
}
*/
/*
char *XmlDoc::getIsVisible ( ) {
if ( m_isVisibleValid ) return &m_isVisible;
setStatus ( "getting is visible" );
// to get a live reading, invalidate tag rec from title rec
m_oldTagRecValid = false;
// . loop over all regular expression in the url filters table
// . stop at first regular expression it matches
int32_t *rn = getRegExpNum2 ( -1 );
// need to wait for a callback at this point (or we had critical error)
if ( ! rn || rn == (int32_t *)-1 ) return (char *)rn;
// assume yes
m_isVisible = true;
// and valid
m_isVisibleValid = true;
// no match
if ( *rn == -1 ) return &m_isVisible;
// get spider priority
int32_t pr = m_cr->m_spiderPriorities[*rn];
// test it
if ( pr == -2 ) m_isVisible = false;
if ( pr == -3 ) m_isVisible = false;
return &m_isVisible;
}
*/
int32_t *XmlDoc::getUrlFilterNum ( ) {
// return it if already set
if ( m_urlFilterNumValid ) return &m_urlFilterNum;
// note that
setStatus ( "getting url filter row num");
// . make the partial new spider rec
// . we need this for matching filters like lang==zh_cn
// . crap, but then it matches "hasReply" when it should not
// . PROBLEM! this is the new reply not the OLD reply, so it may
// end up matching a DIFFERENT url filter num then what it did
// before we started spidering it...
//SpiderReply *newsr = getNewSpiderReply ( );
// note it
//if ( ! newsr )
// log("doc: getNewSpiderReply: %s",mstrerror(g_errno));
//if ( ! newsr || newsr == (void *)-1 ) return (int32_t *)newsr;
// need language i guess
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (int32_t *)langId;
// make a fake one for now
// SpiderReply fakeReply;
// // fix errors
// fakeReply.reset();
// fakeReply.m_isIndexedINValid = true;
// // just language for now, so we can FILTER by language
// if ( m_langIdValid ) fakeReply.m_langId = m_langId;
int32_t langIdArg = -1;
if ( m_langIdValid ) langIdArg = m_langId;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int32_t spideredTime = getSpideredTime();
// get the spider request
SpiderRequest *oldsr = &m_sreq;
// null it out if invalid...
if ( ! m_sreqValid ) oldsr = NULL;
// do not set the spideredTime in the spiderReply to 0
// so we do not trigger the lastSpiderTime
//int32_t saved = newsr->m_spideredTime;
//newsr->m_spideredTime = 0;
//
// PROBLEM: we end up matching "isIndexed" in the url filters
// even if this is a NEW document because we pass it in the spider
// reply that we generate now even though another spider reply
// may not exist.
//
// SOLUTION: just do not supply a spider reply, we only seem to
// use the urlfilternum to get a diffbot api url OR to see if the
// document is banned/filtered so we should delete it. otherwise
// we were supplying "newsr" above...
// . look it up
// . use the old spidered date for "nowGlobal" so we can be consistent
// for injecting into the "qatest123" coll
int32_t ufn = ::getUrlFilterNum ( oldsr,
NULL,//&fakeReply,
spideredTime,false,
m_niceness,cr,
false, // isOutlink?
NULL,
langIdArg);
// put it back
//newsr->m_spideredTime = saved;
// bad news?
if ( ufn < 0 ) {
log("build: failed to get url filter for xmldoc %s",
m_firstUrl.m_url);
//g_errno = EBADENGINEER;
//return NULL;
}
// store it
m_urlFilterNum = ufn;
m_urlFilterNumValid = true;
// set this too in case the url filters table changes while
// we are spidering this and a row is inserted or deleted or something
//SafeBuf *yy = &cr->m_spiderDiffbotApiUrl[ufn];
// copy to ours
//m_diffbotApiUrl.safeMemcpy ( yy );
// ensure null term
//m_diffbotApiUrl.nullTerm();
//m_diffbotApiUrlValid = true;
return &m_urlFilterNum;
}
// . both "u" and "site" must not start with http:// or https:// or protocol
bool isSiteRootFunc ( char *u , char *site ) {
// get length of each
int32_t slen = gbstrlen(site);//m_siteLen;
int32_t ulen = gbstrlen(u);
// "site" may or may not end in /, so remove that
if ( site[slen-1] == '/' ) slen--;
// same for url
if ( u[ulen-1] == '/' ) ulen--;
// skip http:// or https://
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
if ( strncmp(site,"http://" ,7)==0 ) { site += 7; slen -= 7; }
if ( strncmp(site,"https://",8)==0 ) { site += 8; slen -= 8; }
// subtract default.asp etc. from "u"
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.asp",11)==0 )
// ulen -= 11;
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.html",12)==0 )
// ulen -= 12;
//if ( ulen > 15 && strncasecmp(u+ulen-11,"index.html",10)==0 )
// ulen -= 10;
// now they must match exactly
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
// all done
return false;
}
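// Added usage note (hypothetical inputs): isSiteRootFunc() ignores a
// single trailing slash and any leading http:// or https://, so
//   isSiteRootFunc ( "www.ibm.com/"    , "www.ibm.com" ) -> true
//   isSiteRootFunc ( "www.ibm.com/foo" , "www.ibm.com" ) -> false
// isSiteRootFunc3() below does the same comparison against a precomputed
// hash32 of the normalized site instead of the site string itself.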
bool isSiteRootFunc3 ( char *u , int32_t siteRootHash32 ) {
// get length of each
int32_t ulen = gbstrlen(u);
// remove trailing /
if ( u[ulen-1] == '/' ) ulen--;
// skip http:// or https://
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
// now they must match exactly
int32_t sh32 = hash32(u,ulen);
return ( sh32 == siteRootHash32 );
}
char *XmlDoc::getIsSiteRoot ( ) {
if ( m_isSiteRootValid ) return &m_isSiteRoot2;
// get our site
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (char *)site;
// get our url without the http:// or https://
char *u = getFirstUrl()->getHost();
if ( ! u ) {
g_errno = EBADURL;
return NULL;
}
// assume valid now
m_isSiteRootValid = true;
// get it
bool isRoot = isSiteRootFunc ( u , site );
	// seems like https://twitter.com/ is not getting set to root
if ( m_firstUrl.getPathDepth(true) == 0 && ! m_firstUrl.isCgi() )
isRoot = true;
m_isSiteRoot2 = m_isSiteRoot = isRoot;
return &m_isSiteRoot2;
}
/*
bool XmlDoc::getIsOutlinkSiteRoot ( char *u , TagRec *gr ) {
// get our site
Tag *tag = gr->getTag("site");
// make "host" point to u's hostname
int32_t hostLen; char *host = getHostFast ( u , &hostLen );
// use hostname?
char *site;
int32_t slen;
if ( tag ) {
site = tag->getTagData();
slen = tag->getTagDataSize() - 1;
}
// otherwise, use hostname as site
else {
// must be end, or could be '/'
if ( ! host[hostLen] || ! host[hostLen+1] ) return true;
// i guess we were more than just a hostname, so not site root
return false;
}
// get length of each
int32_t ulen = gbstrlen(u);
// "site" may or may not end in /, so remove that
if ( site[slen-1] == '/' ) slen--;
// same for url
if ( u[ulen-1] == '/' ) ulen--;
// now they must match exactly
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
// all done
return false;
}
*/
int8_t *XmlDoc::getHopCount ( ) {
// return now if valid
if ( m_hopCountValid ) return &m_hopCount;
setStatus ( "getting hop count" );
CollectionRec *cr = this->getCollRec();
if(cr && cr->m_isCustomCrawl ) {
// for diffbot collections, compute hopcount without casting
// site/rss to 0 hopcount -- copied from below
LinkInfo *info1 = getLinkInfo1();
if (!info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
int32_t origHopCount = -1;
if ( m_sreqValid ) {
origHopCount = m_sreq.m_hopCount;
}
int32_t hc = -1;
// if(m_minInlinkerHopCount+1 < hc && m_minInlinkerHopCount>=0)
// hc = m_minInlinkerHopCount + 1;
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
// hc = m_minInlinkerHopCount + 1;
if ( origHopCount < hc && origHopCount >= 0 )
hc = origHopCount;
if ( hc == -1 && origHopCount >= 0 )
hc = origHopCount;
if ( hc == -1 )
hc = 1;
if ( hc > 0x7f ) hc = 0x7f;
m_hopCountValid = true;
m_hopCount = hc;
//printf("Custom hopcount: %d for url: %s",
//m_hopCount, this->ptr_firstUrl);
return &m_hopCount;
}
// the unredirected url
Url *f = getFirstUrl();
// get url as string, skip "http://" or "https://"
//char *u = f->getHost();
// if we match site, we are a site root, so hop count is 0
//char *isr = getIsSiteRoot();
//if ( ! isr || isr == (char *)-1 ) return (int8_t *)isr;
//if ( *isr ) {
// m_hopCount = 0;
// m_hopCountValid = true;
// return &m_hopCount;
//}
// ping servers have 0 hop counts
if ( f->isPingServer() ) {
// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
m_hopCount = 0;
m_hopCountValid = true;
return &m_hopCount;
}
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
// check for site root
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int8_t *)gr;
// and site roots
char *isSiteRoot = getIsSiteRoot();
if (!isSiteRoot ||isSiteRoot==(char *)-1) return (int8_t *)isSiteRoot;
if ( *isSiteRoot ) {
// log("xmldoc: hc1 is 0 (siteroot) %s",m_firstUrl.m_url);
m_hopCount = 0;
m_hopCountValid = true;
return &m_hopCount;
}
// make sure m_minInlinkerHopCount is valid
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
// . fix bad original hop counts
// . assign this hop count from the spider rec
int32_t origHopCount = -1;
if ( m_sreqValid ) origHopCount = m_sreq.m_hopCount;
// derive our hop count from our parent hop count
int32_t hc = -1;
// . BUT use inlinker if better
// . if m_linkInfo1Valid is true, then m_minInlinkerHopCount is valid
// if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 )
// hc = m_minInlinkerHopCount + 1;
// or if parent is unknown, but we have a known inlinker with a
// valid hop count, use the inlinker hop count then
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
// hc = m_minInlinkerHopCount + 1;
// if ( origHopCount == 0 )
// log("xmldoc: hc3 is 0 (spiderreq) %s",m_firstUrl.m_url);
// or use our hop count from the spider rec if better
if ( origHopCount < hc && origHopCount >= 0 )
hc = origHopCount;
// or if neither parent or inlinker was valid hop count
if ( hc == -1 && origHopCount >= 0 )
hc = origHopCount;
// if we have no hop count at this point, i guess just pick 1!
if ( hc == -1 )
hc = 1;
// truncate, hop count is only one byte in the TitleRec.h::m_hopCount
if ( hc > 0x7f ) hc = 0x7f;
// and now so do rss urls.
if ( *isRSS && hc > 1 ) {
// force it to one, not zero, otherwise it gets pounded
// too hard on the aggregator sites. spider priority
// is too high
m_hopCount = 1;
m_hopCountValid = true;
return &m_hopCount;
}
	// unknown hop counts (-1) are propagated, except for root urls
m_hopCountValid = true;
m_hopCount = hc;
return &m_hopCount;
}
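// Added summary of the fallbacks above (no new logic): ping servers and
// site roots get hop count 0; otherwise the hop count from the
// SpiderRequest is used when it is >= 0, else it defaults to 1; the value
// is capped at 0x7f so it fits TitleRec's one-byte m_hopCount; and RSS
// urls with a larger hop count are clamped to 1 so aggregator sites do
// not get hit too hard. Custom (diffbot) crawls take the early branch and
// skip the site-root / rss / ping-server special cases.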
/*
int8_t *XmlDoc::getOutlinkHopCountVector ( ) {
if ( m_outlinkHopCountVectorValid ) return m_outlinkHopCountVector;
// need these of course
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (int8_t *)links;
// and these for seeing if outlink is a site root
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (int8_t *)grv;
// hop count of parent
int8_t *ph = getHopCount();
if ( ! ph || ph == (void *)-1 ) return (int8_t *)ph;
	// shortcut
int32_t n = links->getNumLinks();
// sanity check
if ( m_outlinkHopCountVector ) { char *xx=NULL;*xx=0; }
// make some space
m_outlinkHopCountVector = (int8_t *)mmalloc ( n * 4 ,"xdhc");
// return NULL on error with g_errno set
if ( ! m_outlinkHopCountVector ) return NULL;
// save size
m_outlinkHopCountVectorSize = n * 4;
// stock it
for ( int32_t i = 0 ; i < n ; i++ ) {
// get it
char *u = links->getLinkPtr(i);
// and this
TagRec *gr = (*grv)[i];
// flags
linkflags_t flags = links->m_linkFlags[i];
// hop count. default to 1.
int32_t hc = 1;
if ( getIsOutlinkSiteRoot ( u , gr ) ) hc = 0;
else if ( isPingServer ( u ) ) hc = 0;
else if ( flags & LF_RSS ) hc = 0;
else hc = *ph + 1;
// assign it
m_outlinkHopCountVector[i] = hc;
}
m_outlinkHopCountVectorValid = true;
return m_outlinkHopCountVector;
}
*/
// set to false for injecting and validate it... if &spiderlinks=0
// should we spider links?
char *XmlDoc::getSpiderLinks ( ) {
// set it to false on issues
//if ( m_indexCode ) {
// m_spiderLinks = false;
// m_spiderLinks2 = false;
// m_spiderLinksValid = true ; }
// this slows importing down because we end up doing ip lookups
// for every outlink if "firstip" not in tagdb.
// shoot. set2() already sets m_spiderLinksValid to true so we
// have to override if importing.
if ( m_isImporting && m_isImportingValid ) {
m_spiderLinks = false;
m_spiderLinks2 = false;
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
// return the valid value
if ( m_spiderLinksValid ) return &m_spiderLinks2;
setStatus ( "getting spider links flag");
// do not add links now if doing the parser test
if ( g_conf.m_testParserEnabled ||
m_isDiffbotJSONObject ) {
m_spiderLinks = false;
m_spiderLinks2 = false;
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return (char *)cr;
int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (char *)ufn;
// if url filters forbids it
if ( ! cr->m_harvestLinks[*ufn] ) {
m_spiderLinksValid = true;
m_spiderLinks2 = false;
m_spiderLinks = false;
return &m_spiderLinks2;
}
// hack for bulk job detection. never spider links
//if ( cr->m_isCustomCrawl == 2 ) {
// m_spiderLinks = false;
// m_spiderLinks2 = false;
// m_spiderLinksValid = true;
// return &m_spiderLinks2;
//}
// check the xml for a meta robots tag
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// assume true
m_spiderLinks = true;
// or if meta tag says not to
char buf1 [256];
char buf2 [256];
buf1[0] = '\0';
buf2[0] = '\0';
xml->getMetaContent ( buf1, 255 , "robots" , 6 );
xml->getMetaContent ( buf2, 255 , "gigabot", 7 );
if ( strstr ( buf1 , "nofollow" ) ||
strstr ( buf2 , "nofollow" ) ||
strstr ( buf1 , "none" ) ||
strstr ( buf2 , "none" ) )
m_spiderLinks = false;
// spider links if doing custom crawl or not using robots.txt
if ( ! m_useRobotsTxt || cr->m_isCustomCrawl )
m_spiderLinks = true;
// spider request forbade it? diffbot.cpp crawlbot api when
// specifying urldata (list of urls to add to spiderdb) usually
// they do not want the links crawled i'd imagine.
if ( m_sreqValid && m_sreq.m_avoidSpiderLinks )
m_spiderLinks = false;
// also check in url filters now too
// set shadow member
m_spiderLinks2 = m_spiderLinks;
// validate
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
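// Added example (hypothetical markup): a page containing
//   <meta name="robots"  content="noindex,nofollow">
// or
//   <meta name="gigabot" content="none">
// gets m_spiderLinks set to false above, unless robots.txt use is
// disabled or this is a custom (diffbot) crawl, both of which force link
// harvesting back on; a SpiderRequest with m_avoidSpiderLinks set then
// turns it off again last.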
//
// . DELETE ALL SPAM FROM THE INDEX!!!
//
// . for a page to be spam these must ALL be true, with the current ip:
// . site is not in google
// . site has no "stars" in google's dir
// . site has no authorityinlink tag
// . site has less than 10 fresh inlinks
// . site has less than 500 total inlinks
// . ip is not from ultra dns
// . TODO: site is not linked to by wikipedia.com
// . TODO: site is not linked to by about.com
// . TODO: site is not linked to by a .gov site
// . the page IP address changed significantly since the same since last
// time we indexed it when it was not spam (if applicable)
//
// . if the page was indexed at one time and then we decided it was spam,
// and its ip changed significantly since last time, we just
// reschedule the spider rec for 15 days later and do not touch anything
// else. that way we keep the index somewhat stable.
//
/*
char *XmlDoc::getIsSpam() {
// return it if valid
if ( m_isSpamValid ) return &m_isSpam;
setStatus ("getting is spam");
// assume it is not spam
m_isSpam = false;
// debug
//logf(LOG_DEBUG,"doc: NOT SPAM!!");
//m_isSpamValid = true; return &m_isSpam;
// we disable this check for the contact doc
if ( m_spamCheckDisabled ) { m_isSpamValid = true; return &m_isSpam; }
// . i put this here for debugging purposes
// . some big sites have no easy to find contact info
// . get our domain
Url *fu = getFirstUrl();
char *dom = fu->getDomain ();
int32_t dlen = fu->getDomainLen();
if ( dlen == 12 && !strncmp(dom,"facebook.com",dlen) ) {
m_isSpamValid = true; return &m_isSpam; }
if ( dlen == 9 && !strncmp(dom,"yahoo.com",dlen) ) {
m_isSpamValid = true; return &m_isSpam; }
// get our site's tag rec
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
// are we already in the index?
//char *isIndexed = getIsIndexed();
//if (!isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
// this will update m_oldTagRec with the latest info if its stale
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
//int32_t *ip = getIp();
//if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
//XmlDoc **od = getOldXmlDoc ( );
//if ( ! od || od == (void *)-1 ) return (char *)od;
//int32_t oldIp = 0 ;
//if ( *od ) {
// int32_t *ip2 = (*od)->getIp();
// if ( ! ip2 || ip2 == (int32_t *)-1 ) return (char *)ip2;
// oldIp = *ip2;
//}
// i am guessing that most sites that use ultra dns will have a lot
	// of site inlinks! so comment this out for now
//char *ultra = getIpIsUltraDns();
//if ( ultra || ultra==(char *)-1 ) return (char *)ultra;
// spammers do not use ultradns
//if ( *ultra ) return false;
Url *f = getFirstUrl();
char *u = f->getUrl();
int32_t now = getTimeGlobal();
// this will be valid
m_isSpamValid = true;
// use this routine
m_isSpam = isSpam ( u,
gr,
now,
// *isIndexed,
//oldIp ,
// *ip ,
*hci );
// we are doomed! delete in its entirety
if ( m_isSpam ) m_indexCode = EDOCSPAM;
return &m_isSpam;
}
// . "u" must be NORMALIZED. i.e. start with http:// or https:// etc.
// . we call this on outlinks as well
// . we no longer look at the old and newip to determine ownership change,
// because that is not reliable enough
// . we now maybe rely on a major change to the site root page...
bool XmlDoc::isSpam ( char *u ,
TagRec *gr ,
int32_t now ,
char isIndexed ,
int32_t oldIp ,
int32_t newIp ,
bool hasContactInfo ) {
// we need to mine that same database that firefox does...
Tag *tag = gr->getTag ( "malware" );
if ( tag && tag->getTagData()[0] != '0' ) return true;
// if they have contact info, that is a really good sign
if ( hasContactInfo ) return false;
// .edu and .gov sites are always fine
int32_t tlen; char *tld = getTLDFast(u,&tlen);
if ( tlen == 3 && ! strncmp(tld,"edu",3) ) return false;
if ( tlen == 3 && ! strncmp(tld,"gov",3) ) return false;
// the current top ip address
//int32_t top = newIp & 0x00ffffff;
// TODO: in the case of multiple ips on one domain, ensure we select
// the same IP every time we do a lookup in MsgC.
// ok if in google
if ( gr->getTag ( "ingoogle" ) ) return false;
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
// can also be in google's dmoz dir. must have a decent page rank.
if ( gr->getTag ( "pagerank" ) ) return false;
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
// . if was linked to by a high quality root as a new external outlink
// . TODO: include about.com and wikipedia.com i guess (TODO)
if ( gr->getTag ( "authorityinlink" ) ) return false;
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
tag = gr->getTag("sitenuminlinks");
// i guess if it has no entry for this, assume the best
if ( ! tag ) return false;
// or just a massive amount of any-age inlinks
if ( atol(tag->getTagData()) >= 500 ) return false;
tag = gr->getTag("sitenuminlinksfresh");
// i guess if it has no entry for this, assume the best
if ( ! tag ) return false;
// if site has enough good FRESH inlinks from the last 3 mos, no spam
if( atol(tag->getTagData()) >= 10 ) return false;
// if we are old and the top 3 bytes of the ip is the same as the last
// time we were indexed and thereby not identified as spam...
// then assume we are still not spam! because it was unlikely that
// the domain ownership changed...
//if ( isIndexed (oldIp & 0x00ffffff) == top ) return false;
// if they have contact info, that is a really good sign
//if ( hasContactInfo && (oldIp & 0x00ffffff) == top ) return false;
// if first time... accept them if they got contact info
//if ( ! oldIp && hasContactInfo ) return false;
// . if it has had the same ip for the last 365 days, let it in
// . getTagRec() updates this tag immediately if the ip changes
// . so we can't really use this tag for outlinks, because they might
// never get thrown into spiderdb to where we can add this tag to
// their tag rec... UNLESS msgc/msge were to update their tag rec...
// . i've seen quite a few old spam sites/pages. they just kinda stay
// there. so let's not do this...
//tag = gr->get("iptimestamp");
//int32_t now;
//if ( tag ) now = getTimeGlobal();
//if(tag&&now-atol(tag->getTagData())>365*24*3600&&
// ((tag->m_ip&0x00ffffff)==top))
// return false;
return true;
}
*/
// should we index the doc? if already indexed, and is filtered, we delete it
char *XmlDoc::getIsFiltered ( ) {
if ( m_isFilteredValid ) return &m_isFiltered;
if ( m_isDiffbotJSONObject ) {
m_isFiltered = false;
m_isFilteredValid = true;
return &m_isFiltered;
}
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
if ( *priority == -3 ) m_isFiltered = true;
m_isFilteredValid = true;
return &m_isFiltered;
}
int32_t *XmlDoc::getSpiderPriority ( ) {
if ( m_priorityValid ) return &m_priority;
setStatus ("getting spider priority");
// need tagrec to see if banned
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// this is an automatic ban!
if ( gr->getLong("manualban",0) ) {
m_priority = -3;//SPIDER_PRIORITY_BANNED;
m_priorityValid = true;
return &m_priority;
}
int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (int32_t *)ufn;
// sanity check
if ( *ufn < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_priority = cr->m_spiderPriorities[*ufn];
// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
if ( cr->m_forceDelete[*ufn] ) m_priority = -3;
m_priorityValid = true;
return &m_priority;
}
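// . logs one space-separated line of key=value fields for this spider
//   attempt, terminated by ": <error/status string>" from m_indexCode
// . fields are only printed when their members are valid, so grep on
//   field names rather than positions
// . if a SafeBuf "bb" is provided we print into it and skip the log file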
bool XmlDoc::logIt ( SafeBuf *bb ) {
// set errCode
int32_t errCode = m_indexCode;
if ( ! errCode && g_errno ) errCode = g_errno;
// were we new?
//char isIndexed = -1;
//if ( m_isIndexedValid ) isIndexed = m_isIndexed;
bool isNew = true;
if ( m_sreqValid && m_sreq.m_hadReply ) isNew = false;
// keep track of stats
g_stats.addSpiderPoint ( errCode, isNew ); // !isIndexed );
// do not log if we should not, saves some time
//if ( ! g_conf.m_logSpideredUrls && ! m_forceDelete ) return true;
if ( ! g_conf.m_logSpideredUrls ) return true;
// patch the ip
int32_t ip = m_ip;
// invalid?
if ( ! m_ipValid ) ip = 0;
char *coll = "nuked";
CollectionRec *cr = getCollRec();
if ( cr ) coll = cr->m_coll;
SafeBuf tmpsb;
// print into this now
SafeBuf *sb = &tmpsb;
// log into provided safebuf if not null
if ( bb ) sb = bb;
//
// coll
//
sb->safePrintf("coll=%s ",coll);
sb->safePrintf("collnum=%"INT32" ",(int32_t)m_collnum);
//
// print ip
//
if ( m_ipValid )
sb->safePrintf("ip=%s ",iptoa(m_ip) );
if ( m_firstIpValid )
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
// . first ip from spider req if it is fake
// . we end up spidering the same url twice because it will have
// different "firstips" in the SpiderRequest key. maybe just
// use domain hash instead of firstip, and then let msg13
// make queues in the case of hammering an ip, which i think
// it already does...
if ( m_sreqValid && m_sreq.m_firstIp != m_firstIp )
sb->safePrintf("fakesreqfirstip=%s ",iptoa(m_sreq.m_firstIp) );
//
// print when this spider request was added
//
//if ( m_sreqValid && m_sreq.m_addedTime ) {
// struct tm *timeStruct = gmtime ( &m_sreq.m_addedTime );
// char tmp[64];
// strftime(tmp,64,"requestadded=%b-%d-%Y(%H:%M:%S)", timeStruct);
// sb->safePrintf("%s(%"UINT32") ",tmp,m_sreq.m_addedTime);
//}
//
// print spidered time
//
//if ( m_spideredTimeValid ) {
time_t spideredTime = (time_t)getSpideredTime();
struct tm *timeStruct = gmtime ( &spideredTime );
char tmp[64];
strftime(tmp,64,"spidered=%b-%d-%Y(%H:%M:%S)", timeStruct );
sb->safePrintf("%s(%"UINT32") ",tmp,(uint32_t)spideredTime);
// when it was scheduled to be spidered
if ( m_sreqValid && m_sreq.m_addedTime ) {
time_t ts = m_sreq.m_addedTime;
struct tm *timeStruct = gmtime ( &ts );
char tmp[64];
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("scheduledtime=%s(%"UINT32") ",
tmp,(uint32_t)m_sreq.m_addedTime);
}
// discovery date, first time spiderrequest was added to spiderdb
if ( m_sreqValid && m_sreq.m_discoveryTime ) {
time_t ts = m_sreq.m_discoveryTime;
struct tm *timeStruct = gmtime ( &ts );
char tmp[64];
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("discoverydate=%s(%"UINT32") ",
tmp,(uint32_t)m_sreq.m_discoveryTime);
}
// print first indexed time
if ( m_firstIndexedDateValid ) {
time_t ts = m_firstIndexedDate;
timeStruct = gmtime ( &ts );//m_firstIndexedDate );
strftime(tmp,64,"firstindexed=%b-%d-%Y(%H:%M:%S)", timeStruct);
sb->safePrintf("%s(%"UINT32") ",tmp,
(uint32_t)m_firstIndexedDate);
}
//if ( ! m_isIndexedValid ) { char *xx=NULL;*xx=0; }
// just use the oldurlfilternum for grepping i guess
//if ( m_oldDocValid && m_oldDoc )
// when injecting a request we have no idea if it had a reply or not
if ( m_sreqValid && m_sreq.m_isInjecting )
sb->safePrintf("firsttime=? ");
else if ( m_sreqValid && m_sreq.m_hadReply )
sb->safePrintf("firsttime=0 ");
else if ( m_sreqValid )
sb->safePrintf("firsttime=1 ");
else
sb->safePrintf("firsttime=? ");
//
// print # of link texts
//
if ( m_linkInfo1Valid && ptr_linkInfo1 ) {
LinkInfo *info = ptr_linkInfo1;
int32_t nt = info->getNumLinkTexts();
sb->safePrintf("goodinlinks=%"INT32" ",nt );
// new stuff. includes ourselves i think.
//sb->safePrintf("ipinlinks=%"INT32" ",info->m_numUniqueIps);
//sb->safePrintf("cblockinlinks=%"INT32" ",
//info->m_numUniqueCBlocks);
}
//
// print # of link texts from 2nd coll
//
// this is not used for what it was used for.
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
// LinkInfo *info = ptr_linkInfo2;
// int32_t nt = 0;
// if ( info ) nt = info->getNumLinkTexts();
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
// }
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
sb->safePrintf("probdocid=%"UINT64" ",pd);
sb->safePrintf("probdocidmin=%"UINT64" ",d1);
sb->safePrintf("probdocidmax=%"UINT64" ",d2);
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
// sb->safePrintf("siteipinlinks=%"INT32" ",
// m_siteNumInlinksUniqueIp);
// sb->safePrintf("sitecblockinlinks=%"INT32" ",
// m_siteNumInlinksUniqueCBlock);
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
sb->safePrintf("siterank=%"INT32" ", sr );
}
if ( m_sreqValid )
sb->safePrintf("pageinlinks=%04"INT32" ",
m_sreq.m_pageNumInlinks);
// shortcut
int64_t uh48 = hash64b ( m_firstUrl.m_url );
// mask it
uh48 &= 0x0000ffffffffffffLL;
sb->safePrintf ("uh48=%"UINT64" ",uh48 );
if ( m_charsetValid )
sb->safePrintf("charset=%s ",get_charset_str(m_charset));
if ( m_contentTypeValid )
sb->safePrintf("ctype=%s ",
g_contentTypeStrings [m_contentType]);
if ( m_sreqValid )
sb->safePrintf("parentlang=%02"INT32"(%s) ",
(int32_t)m_sreq.m_parentLangId,
getLanguageAbbr(m_sreq.m_parentLangId));
if ( m_langIdValid )
sb->safePrintf("lang=%02"INT32"(%s) ",(int32_t)m_langId,
getLanguageAbbr(m_langId));
if ( m_countryIdValid )
sb->safePrintf("country=%02"INT32"(%s) ",(int32_t)m_countryId,
g_countryCode.getAbbr(m_countryId));
if ( m_hopCountValid )
sb->safePrintf("hopcount=%02"INT32" ",(int32_t)m_hopCount);
if ( m_contentValid )
sb->safePrintf("contentlen=%06"INT32" ",m_contentLen);
if ( m_contentValid && cr && cr->m_isCustomCrawl )
sb->safePrintf("zeroedout=%i ",(int)m_zeroedOut);
if ( m_isContentTruncatedValid )
sb->safePrintf("contenttruncated=%"INT32" ",
(int32_t)m_isContentTruncated);
if ( m_robotsTxtLenValid )
sb->safePrintf("robotstxtlen=%04"INT32" ",m_robotsTxtLen );
if ( m_isAllowedValid )
sb->safePrintf("robotsallowed=%i ", (int)m_isAllowed);
else
sb->safePrintf("robotsallowed=? " );
if ( m_contentHash32Valid )
sb->safePrintf("ch32=%010"UINT32" ",m_contentHash32);
if ( m_domHash32Valid )
sb->safePrintf("dh32=%010"UINT32" ",m_domHash32);
if ( m_siteHash32Valid )
sb->safePrintf("sh32=%010"UINT32" ",m_siteHash32);
if ( m_isPermalinkValid )
sb->safePrintf("ispermalink=%"INT32" ",(int32_t)m_isPermalink);
if ( m_isRSSValid )
sb->safePrintf("isrss=%"INT32" ",(int32_t)m_isRSS);
if ( m_linksValid )
sb->safePrintf("hasrssoutlink=%"INT32" ",
(int32_t)m_links.hasRSSOutlink() );
if ( m_numOutlinksAddedValid ) {
sb->safePrintf("outlinksadded=%04"INT32" ",
(int32_t)m_numOutlinksAdded);
sb->safePrintf("outlinksaddedfromsamedomain=%04"INT32" ",
(int32_t)m_numOutlinksAddedFromSameDomain);
}
if ( m_metaListValid )
sb->safePrintf("addlistsize=%05"INT32" ",
(int32_t)m_metaListSize);
else
sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)0);
if ( m_addedSpiderRequestSizeValid )
sb->safePrintf("addspiderreqsize=%05"INT32" ",
m_addedSpiderRequestSize);
else
sb->safePrintf("addspiderreqsize=%05"INT32" ",0);
if ( m_addedSpiderReplySizeValid )
sb->safePrintf("addspiderrepsize=%05"INT32" ",
m_addedSpiderReplySize);
else
sb->safePrintf("addspiderrepsize=%05"INT32" ",0);
if ( m_addedStatusDocSizeValid ) {
sb->safePrintf("addstatusdocsize=%05"INT32" ",
m_addedStatusDocSize);
sb->safePrintf("addstatusdocid=%"UINT64" ",
m_addedStatusDocId);
}
else {
sb->safePrintf("addstatusdocsize=%05"INT32" ",0);
sb->safePrintf("addstatusdocid=0 ");
}
if ( m_useSecondaryRdbs ) {
sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
if ( cr )
sb->safePrintf("indexspiderreplies=%i ",(int)
cr->m_indexSpiderReplies);
}
if ( size_imageData && m_imageDataValid ) {
// url is in data now
ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
int32_t nt = ta->getNumThumbnails();
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
sb->safePrintf("thumbnail=%s,%"INT32"bytes,%"INT32"x%"INT32",(%"INT32") ",
ti->getUrl(),
ti->m_dataSize,
ti->m_dx,
ti->m_dy,
nt);
}
else
sb->safePrintf("thumbnail=none ");
/*
if ( m_hasAddressValid && m_addressesValid )
sb->safePrintf("numaddr=%"INT32" ",(int32_t)m_addresses.m_numValid);
//if ( m_skipIndexingValid )
// sb->safePrintf("skipindexing=%"INT32" ",(int32_t)m_skipIndexing);
if ( m_hasTODValid )
sb->safePrintf("hastod=%"INT32" ",(int32_t)m_hasTOD);
*/
// get the content type
uint8_t ct = CT_UNKNOWN;
if ( m_contentTypeValid ) ct = m_contentType;
bool isRoot = false;
if ( m_isSiteRootValid ) isRoot = m_isSiteRoot;
// make sure m_minInlinkerHopCount is valid
LinkInfo *info1 = NULL;
if ( m_linkInfo1Valid ) info1 = ptr_linkInfo1;
//bool isContacty = getIsContacty(&m_firstUrl,
// info1,
// m_hopCount ,
// ct , // contentType
// isRoot ,
// m_niceness );
/*
// just use this now
if ( m_hasContactInfoValid )
sb->safePrintf("iscontacty=%"INT32" ",(int32_t)m_hasContactInfo);
if ( m_hasSiteVenueValid )
sb->safePrintf("hassitevenue=%"INT32" ",(int32_t)m_hasSiteVenue);
*/
// hack this kinda
// . in PageInject.cpp we do not have a valid priority without
// blocking because we did a direct injection!
// so ignore this!!
// . a diffbot json object, an xmldoc we set from a json object
// in a diffbot reply, is a childDoc (m_isChildDoc) is true
// and does not have a spider priority. only the parent doc
// that we used to get the diffbot reply (array of json objects)
// will have the spider priority
if ( ! getIsInjecting() && ! m_isDiffbotJSONObject ) {
//int32_t *priority = getSpiderPriority();
//if ( ! priority ||priority==(void *)-1){char *xx=NULL;*xx=0;}
if ( m_priorityValid )
sb->safePrintf("priority=%"INT32" ",
(int32_t)m_priority);
}
// should be valid since we call getSpiderPriority()
if ( m_urlFilterNumValid )
sb->safePrintf("urlfilternum=%"INT32" ",(int32_t)m_urlFilterNum);
if ( m_diffbotApiUrlValid &&
m_diffbotApiUrl.getBufStart() &&
m_diffbotApiUrl.getBufStart()[0] )
sb->safePrintf("diffbotjsonobjects=%"INT32" ",
(int32_t)m_diffbotJSONCount);
if ( m_diffbotReplyValid )
sb->safePrintf("diffboterror=%"INT32" ",m_diffbotReplyError);
if ( m_siteValid )
sb->safePrintf("site=%s ",ptr_site);
if ( m_isSiteRootValid )
sb->safePrintf("siteroot=%"INT32" ",m_isSiteRoot );
else
sb->safePrintf("siteroot=? ");
// like how we index it, do not include the filename. so we can
// have a bunch of pathdepth 0 urls with filenames like xyz.com/abc.htm
if ( m_firstUrlValid ) {
int32_t pd = -1;
// fix core
if ( m_firstUrl.m_url &&
m_firstUrl.m_ulen > 0 &&
m_firstUrl.m_path )
pd = m_firstUrl.getPathDepth(false);
sb->safePrintf("pathdepth=%"INT32" ",pd);
}
else {
sb->safePrintf("pathdepth=? ");
}
//
// . sometimes we print these sometimes we do not
// . put this at the end so we can awk out the above fields reliably
//
// print when it was last spidered
if ( m_oldDocValid && m_oldDoc ) {
time_t spideredTime = m_oldDoc->getSpideredTime();
struct tm *timeStruct = gmtime ( &spideredTime );
char tmp[64];
strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime);
}
// print new pubdate
if ( m_pubDateValid && m_pubDate!=(uint32_t)-1 && m_pubDate!=0 ) {
char tmp[64];
time_t ts = (time_t)m_pubDate;
struct tm *timeStruct = gmtime ( &ts );
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("pubdate=%s ", tmp );
}
if ( m_linkInfo1Valid && ptr_linkInfo1 && ptr_linkInfo1->hasRSSItem())
sb->safePrintf("hasrssitem=1 ");
// was the content itself injected?
if ( m_wasContentInjected )
sb->safePrintf("contentinjected=1 ");
else
sb->safePrintf("contentinjected=0 ");
// might have just injected the url and downloaded the content?
if ( (m_sreqValid && m_sreq.m_isInjecting) ||
(m_isInjecting && m_isInjectingValid) )
sb->safePrintf("urlinjected=1 ");
else
sb->safePrintf("urlinjected=0 ");
if ( m_sreqValid && m_sreq.m_isAddUrl )
sb->safePrintf("isaddurl=1 ");
else
sb->safePrintf("isaddurl=0 ");
if ( m_sreqValid && m_sreq.m_isPageReindex )
sb->safePrintf("pagereindex=1 ");
if ( m_spiderLinksValid && m_spiderLinks )
sb->safePrintf("spiderlinks=1 ");
if ( m_spiderLinksValid && ! m_spiderLinks )
sb->safePrintf("spiderlinks=0 ");
if ( m_crawlDelayValid && m_crawlDelay != -1 )
sb->safePrintf("crawldelayms=%"INT32" ",(int32_t)m_crawlDelay);
if ( m_recycleContent )
sb->safePrintf("recycleContent=1 ");
if ( m_exactContentHash64Valid )
sb->safePrintf("exactcontenthash=%"UINT64" ",
m_exactContentHash64 );
// . print percent changed
// . only print if non-zero!
if ( m_percentChangedValid && m_oldDocValid && m_oldDoc &&
m_percentChanged )
sb->safePrintf("changed=%.00f%% ",m_percentChanged);
// only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_docId != m_docId )
sb->safePrintf("olddocid=%"UINT64" ",m_oldDoc->m_docId);
// only print if different now! good for grepping changes
if ( m_sreqValid && m_sreq.m_ufn >= 0 &&
m_sreq.m_ufn != m_urlFilterNum )
sb->safePrintf("oldurlfilternum=%"INT32" ",
(int32_t)m_sreq.m_ufn);
if ( m_sreqValid && m_sreq.m_priority >= 0 &&
m_sreq.m_priority != m_priority )
sb->safePrintf("oldpriority=%"INT32" ",
(int32_t)m_sreq.m_priority);
if ( m_oldDoc && m_oldDoc->m_langIdValid &&
m_oldDoc->m_langId != m_langId )
sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_oldDoc->m_langId,
getLanguageAbbr(m_oldDoc->m_langId));
if ( m_useSecondaryRdbs &&
m_useTitledb &&
m_logLangId != m_langId )
sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_logLangId,
getLanguageAbbr(m_logLangId));
if ( m_useSecondaryRdbs &&
m_useTitledb &&
m_logSiteNumInlinks != m_siteNumInlinks )
sb->safePrintf("oldsiteinlinks=%04"INT32" ",m_logSiteNumInlinks);
if ( m_useSecondaryRdbs &&
m_useTitledb &&
m_oldDocValid &&
m_oldDoc &&
strcmp(ptr_site,m_oldDoc->ptr_site) )
sb->safePrintf("oldsite=%s ",m_oldDoc->ptr_site);
// . print old pubdate
// . -1 means unsupported, 0 means could not find one
// . only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc &&
m_oldDoc->m_pubDate!= (uint32_t)-1 &&
m_oldDoc->m_pubDate !=0 &&
m_oldDoc->m_pubDate != m_pubDate ) {
char tmp[64];
time_t ts = m_oldDoc->m_pubDate;
struct tm *timeStruct = gmtime ( &ts );
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("oldpubdate=%s ",tmp );
}
if ( m_isAdultValid )
sb->safePrintf("isadult=%"INT32" ",(int32_t)m_isAdult);
// only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc &&
m_oldDoc->m_siteNumInlinks >= 0 &&
m_oldDoc->m_siteNumInlinks != m_siteNumInlinks ) {
int32_t sni = -1;
if ( m_oldDoc ) sni = m_oldDoc->m_siteNumInlinks;
sb->safePrintf("oldsiteinlinks=%04"INT32" ",sni);
}
// Spider.cpp sets m_sreq.m_errCount before adding it to doledb
if ( m_sreqValid ) // && m_sreq.m_errCount )
sb->safePrintf("errcnt=%"INT32" ",(int32_t)m_sreq.m_errCount );
else
sb->safePrintf("errcnt=? ");
if ( ptr_redirUrl ) { // m_redirUrlValid && m_redirUrlPtr ) {
sb->safePrintf("redir=%s ",ptr_redirUrl);//m_redirUrl.getUrl());
if ( m_numRedirects > 2 )
sb->safePrintf("numredirs=%"INT32" ",m_numRedirects);
}
if ( m_canonicalRedirUrlValid && m_canonicalRedirUrlPtr )
sb->safePrintf("canonredir=%s ",
m_canonicalRedirUrlPtr->getUrl());
if ( m_httpStatusValid && m_httpStatus != 200 )
sb->safePrintf("httpstatus=%"INT32" ",(int32_t)m_httpStatus);
if ( m_updatedMetaData )
sb->safePrintf("updatedmetadata=1 ");
if ( m_isDupValid && m_isDup )
sb->safePrintf("dupofdocid=%"INT64" ",m_docIdWeAreADupOf);
if ( m_firstUrlValid )
sb->safePrintf("url=%s ",m_firstUrl.m_url);
else
sb->safePrintf("urldocid=%"INT64" ",m_docId);
//
// print error/status
//
sb->safePrintf(": %s",mstrerror(m_indexCode));
// breathe
QUICKPOLL ( m_niceness );
// if safebuf provided, do not log to log
if ( bb ) return true;
// log it out
logf ( LOG_INFO ,
"build: %s",
//getFirstUrl()->getUrl(),
sb->getBufStart() );
return true;
}
// . returns false and sets g_errno on error
// . make sure that the title rec we generated creates the exact same
// meta list as what we got
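// . strategy: build a second XmlDoc from the title rec we just made
//   (via set2()), have it generate its own meta list, hash both lists
//   and verify every record in one appears byte-for-byte in the other
// . spiderdb and tagdb records are ignored by hashMetaList() here
// . note: currently short-circuited by the early "return true" below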
bool XmlDoc::doConsistencyTest ( bool forceTest ) {
// skip for now it was coring on a json doc test
return true;
CollectionRec *cr = getCollRec();
if ( ! cr )
return true;
if ( ! m_doConsistencyTesting && strcmp(cr->m_coll,"qatest123") != 0 )
return true;
// if we had an old doc then our meta list will have removed
// stuff already in the database from indexing the old doc.
// so it will fail the parsing consistency check... because of
// the 'incremental indexing' algo above
// disable for now... just a second, for testing cheatcc.com
if ( m_oldDoc && m_oldDocValid && g_conf.m_doIncrementalUpdating )
return true;
// if not test coll skip this
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
// title rec is null if we are reindexing an old doc
// and "unchanged" was true.
if ( m_unchangedValid && m_unchanged ) {
if ( ! m_titleRecBufValid ) return true;
if ( m_titleRecBuf.length()==0 ) return true;
}
// leave this uncommented so we can see if we are doing it
setStatus ( "doing consistency check" );
// log debug
log("spider: doing consistency check for %s",ptr_firstUrl);
// . set another doc from that title rec
// . do not keep on stack since so huge!
XmlDoc *doc ;
try { doc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return false;
}
mnew ( doc , sizeof(XmlDoc),"xmldcs");
if ( ! doc->set2 ( m_titleRecBuf.getBufStart() ,
-1 , cr->m_coll , NULL , m_niceness ,
// no we provide the same SpiderRequest so that
// it can add the same SpiderReply to the metaList
&m_sreq ) ) {
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
return false;
}
// . some hacks
// . do not look up title rec in titledb, assume it is new
doc->m_isIndexed = false;
doc->m_isIndexedValid = true;
// so we don't core in getRevisedSpiderRequest()
doc->m_firstIp = m_firstIp;
doc->m_firstIpValid = true;
// inherit this doc's tag rec since it has not called updateTagdb() yet
//doc->ptr_tagRecData = ptr_tagRecData;
//doc->size_tagRecData = size_tagRecData;
// getNewSpiderReply() calls getDownloadEndTime() which is not valid
// and causes the page to be re-downloaded, so stop that..!
doc->m_downloadEndTime = m_downloadEndTime;
doc->m_downloadEndTimeValid = true;
// inherit doledb key as well to avoid a core there
doc->m_doledbKey = m_doledbKey;
// skip the robots.txt lookup! that was causing this to block!
//doc->m_isAllowed = true;
//doc->m_isAllowedValid = true;
// do not get outlink info for this, that stuff is for adding outlinks
// to spiderdb, and tagdb may have changed. so we can't really compare
// spider recs! if this is false then the call to doc->getMetaList()
// blocks to lookup the tagdb and titledb recs for each outlink!
// therefore, set it to true!
//doc->m_isInjecting = true;
// mdw: shouldn't this have the same effect?
//doc->m_spiderLinks2 = false;
//doc->m_spiderLinksValid = true;
// flag it
doc->m_doingConsistencyCheck = true;
// go get its meta list. rv = return value
char *rv = doc->getMetaList ( );
// sanity check - compare urls
if ( doc->m_firstUrl.m_ulen != m_firstUrl.m_ulen){char *xx=NULL;*xx=0;}
// error setting it?
if ( ! rv ) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// free it
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
// error
return false;
}
// blocked? that is not allowed
if ( rv == (void *)-1 ) { char *xx=NULL; *xx=0; }
// compare with the old list
char *list1 = m_metaList;
int32_t listSize1 = m_metaListSize;
char *list2 = doc->m_metaList;
int32_t listSize2 = doc->m_metaListSize;
// show it for now
//log("build: printing meta list 1");
//printMetaList(list1,list1+listSize1,NULL);
//log("build: printing meta list 2");
//printMetaList(list2,list2+listSize2,NULL);
// do a compare
HashTableX ht1;
HashTableX ht2;
ht1.set ( sizeof(key224_t),sizeof(char *),
262144,NULL,0,false,m_niceness,"xmlht1");
ht2.set ( sizeof(key224_t),sizeof(char *),
262144,NULL,0,false,m_niceness,"xmlht2");
// format of a metalist... see XmlDoc::addTable() where it adds keys
// from a table into the metalist
// <nosplitflag|rdbId><key><dataSize><data>
// where nosplitflag is 0x80
char *p1 = list1;
char *p2 = list2;
char *pend1 = list1 + listSize1;
char *pend2 = list2 + listSize2;
// see if each key in list1 is in list2
if ( ! hashMetaList ( &ht1 , p1 , pend1 , false ) ) {
char *xx=NULL;*xx=0;
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
return log("doc: failed consistency test for %s",ptr_firstUrl);
}
if ( ! hashMetaList ( &ht2 , p2 , pend2 , false ) ) {
char *xx=NULL;*xx=0;
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
return log("doc: failed consistency test for %s",ptr_firstUrl);
}
// . now make sure each list matches the other
// . first scan the guys in "p1" and make sure in "ht2"
hashMetaList ( &ht2 , p1 , pend1 , true );
// . second scan the guys in "p2" and make sure in "ht1"
hashMetaList ( &ht1 , p2 , pend2 , true );
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
log ("spider: passed consistency test for %s",ptr_firstUrl );
// no serious error, although there might be an inconsistency
return true;
}
int32_t XmlDoc::printMetaList ( ) {
SafeBuf sb;
printMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
&sb );
fprintf(stderr,"%s\n",sb.getBufStart());
return 0;
}
#define TABLE_ROWS 25
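// . each meta list record is laid out as:
//   <rdbId byte><key[ks]>[<dataSize int32><data[dataSize]>]
// . the high bit of the rdbId byte is reserved, so it gets masked with
//   0x7f; for posdb the shard-by-termid property is a bit in the key
// . ks and the fixed dataSize come from the rdbId; a dataSize of -1
//   means the 4-byte dataSize field is present in the record
// . negative (delete) keys carry no data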
// print this also for page parser output!
void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
verifyMetaList ( p , pend , false );
SafeBuf tmp;
if ( ! sb ) sb = &tmp;
char *hdr =
"<table border=1>\n"
"<tr>"
"<td><b>rdb</b></td>"
"<td><b>del?</b></td>"
"<td><b>shardByTermId?</b></td>"
// illustrates key size
"<td><b>key</b></td>"
// break it down. based on rdb, of course.
"<td><b>desc</b></td>"
"</tr>\n" ;
sb->safePrintf("%s",hdr);
int32_t recSize = 0;
int32_t rcount = 0;
for ( ; p < pend ; p += recSize ) {
// get rdbid
uint8_t rdbId = *p & 0x7f;
// skip
p++;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// point to it
char *rec = p;
// init this. note: it shadows the outer recSize, and p is advanced
// manually below, so the for-loop's "p += recSize" adds 0.
int32_t recSize = ks;
// convert into a key128_t, the biggest possible key
//key224_t k ;
char k[MAX_KEY_BYTES];
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
//k.setMin();
gbmemcpy ( &k , p , ks );
// is it a negative key?
char neg = false;
if ( ! ( p[0] & 0x01 ) ) neg = true;
// this is now a bit in the posdb key so we can rebalance
char shardByTermId = false;
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
shardByTermId = true;
// skip it
p += ks;
// get datasize
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
// . always zero if key is negative
// . this is not the case unfortunately...
if ( neg ) dataSize = 0;
// if -1, read it in
if ( dataSize == -1 ) {
dataSize = *(int32_t *)p;
// inc this
recSize += 4;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
p += 4;
}
// point to it
char *data = p;
// skip the data
p += dataSize;
// inc it
recSize += dataSize;
// NULL it for negative keys
if ( dataSize == 0 ) data = NULL;
// see if one big table causes a browser slowdown
if ( (++rcount % TABLE_ROWS) == 0 )
sb->safePrintf("<!--ignore--></table>%s",hdr);
//if ( rdbId != RDB_LINKDB ) continue;
// print dbname
sb->safePrintf("<tr>");
char *dn = getDbnameFromId ( rdbId );
sb->safePrintf("<td>%s</td>",dn);
if ( neg ) sb->safePrintf("<td>D</td>");
else sb->safePrintf("<td>&nbsp;</td>");
if ( shardByTermId ) sb->safePrintf("<td>shardByTermId</td>");
else sb->safePrintf("<td>&nbsp;</td>");
sb->safePrintf("<td><nobr>%s</nobr></td>", KEYSTR(k,ks));
if ( rdbId == RDB_POSDB ) {
// get termid et al
key144_t *k2 = (key144_t *)k;
int64_t tid = g_posdb.getTermId(k2);
//uint8_t score8 = g_posdb.getScore ( *k2 );
//uint32_t score32 = score8to32 ( score8 );
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
"termId=%020"UINT64" "
//"score8=%03"UINT32" "
//"score32=%010"UINT32""
"</td>"
,(uint64_t)tid
//(int32_t)score8,
//(int32_t)score32
);
}
else if ( rdbId == RDB_DATEDB ) {
// get termid et al
key128_t *k2 = (key128_t *)k;
int64_t tid = g_datedb.getTermId(k2);
// use indexdb's function for this. should be the same
uint8_t score8 = g_indexdb.getScore ( (char *)k );
int32_t date = g_datedb.getDate ( k2 );
uint32_t score32 = score8to32 ( score8 );
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
"termId=%020"UINT64" "
"date=%010"UINT32" "
"score8=%03"UINT32" "
"score32=%010"UINT32""
"</td>",
tid,
date,
(int32_t)score8,
(int32_t)score32);
}
// key parsing logic from Sections.cpp::gotSectiondbList()
else if ( rdbId == RDB_SECTIONDB ) {
key128_t *k2 = (key128_t *)k;
int32_t secType = g_indexdb.getScore ( (char *)k2);
int32_t tagHash = g_datedb.getDate ( k2 );
int64_t tid = g_datedb.getTermId(k2);
int64_t siteHash = tid; // not quite 64 bits
SectionVote *sv = (SectionVote *)data;
char *dd = "tagHash32";
if ( secType == SV_TAGCONTENTHASH )
dd ="tagcontentHash32";
if ( secType == SV_TAGPAIRHASH )
dd = "tagPairHash32";
// sanity check
int32_t ds = sizeof(SectionVote);
if (!neg&&dataSize!=ds){char*xx=NULL;*xx=0;}
if ( neg&&dataSize!=0 ){char*xx=NULL;*xx=0;}
float score = 0.0;
float numSampled = 0.0;
if ( data ) {
score = sv->m_score;
numSampled = sv->m_numSampled;
}
sb->safePrintf("<td>"
"<nobr>"
"siteHash48=0x%016"XINT64" "
"%s=0x%08"XINT32" "
"secType=%s "
"score=%.02f "
"numSampled=%.02f"
"</nobr>"
"</td>",
siteHash,
dd,tagHash,
getSectionTypeAsStr(secType),
score,
numSampled);
}
else if ( rdbId == RDB_LINKDB ) {
key224_t *k2 = (key224_t *)k;
int64_t linkHash=g_linkdb.getLinkeeUrlHash64_uk(k2);
int32_t linkeeSiteHash = g_linkdb.getLinkeeSiteHash32_uk(k2);
int32_t linkerSiteHash = g_linkdb.getLinkerSiteHash32_uk(k2);
char linkSpam = g_linkdb.isLinkSpam_uk (k2);
int32_t siteRank = g_linkdb.getLinkerSiteRank_uk (k2);
//int32_t hopCount = g_linkdb.getLinkerHopCount_uk (k2);
//int32_t ip24 = g_linkdb.getLinkerIp24_uk (k2);
int32_t ip32 = g_linkdb.getLinkerIp_uk (k2);
int64_t docId = g_linkdb.getLinkerDocId_uk (k2);
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
"<nobr>"
"linkeeSiteHash32=0x%08"XINT32" "
"linkeeUrlHash=0x%016"XINT64" "
"linkSpam=%"INT32" "
"siteRank=%"INT32" "
//"hopCount=%03"INT32" "
"sitehash32=0x%"XINT32" "
"IP32=%s "
"docId=%"UINT64""
"</nobr>"
"</td>",
linkeeSiteHash,
linkHash,
(int32_t)linkSpam,
siteRank,
//hopCount,
linkerSiteHash,
iptoa(ip32),
docId);
}
else if ( rdbId == RDB_CLUSTERDB ) {
key128_t *k2 = (key128_t *)k;
char *r = (char *)k2;
int32_t siteHash26 = g_clusterdb.getSiteHash26 ( r );
char lang = g_clusterdb.getLanguage ( r );
int64_t docId = g_clusterdb.getDocId ( r );
char ff = g_clusterdb.getFamilyFilter ( r );
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
// 26 bit site hash
"siteHash26=0x%08"XINT32" "
"family=%"INT32" "
"lang=%03"INT32" "
"docId=%"UINT64""
"</td>",
siteHash26 ,
(int32_t)ff,
(int32_t)lang,
docId );
}
// key parsing logic taken from Address::makePlacedbKey
else if ( rdbId == RDB_PLACEDB ) {
key128_t *k2 = (key128_t *)k;
int64_t bigHash = g_placedb.getBigHash ( k2 );
int64_t docId = g_placedb.getDocId ( k2 );
int32_t snh = g_placedb.getStreetNumHash ( k2 );
//int32_t smallHash = g_placedb.getSmallHash ( k2 );
// sanity check
if(!neg &&dataSize<=0){char*xx=NULL;*xx=0;}
if( neg &&dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td><nobr>"
"bigHash64=0x%016"XINT64" "
"docId=%"UINT64" "
"streetNumHash25=0x%08"XINT32" "
"dataSize=%010"INT32" "
"address=%s"
"</nobr>"
"</td>",
bigHash,
docId,
snh,
dataSize ,
data );
}
// key parsing logic taken from Address::makePlacedbKey
else if ( rdbId == RDB_SPIDERDB ) {
sb->safePrintf("<td><nobr>");
key128_t *k2 = (key128_t *)k;
if ( g_spiderdb.isSpiderRequest(k2) ) {
SpiderRequest *sreq = (SpiderRequest *)rec;
sreq->print ( sb );
}
else {
SpiderReply *srep = (SpiderReply *)rec;
srep->print ( sb );
}
sb->safePrintf("</nobr></td>");
}
else if ( rdbId == RDB_DOLEDB ) {
key_t *k2 = (key_t *)k;
sb->safePrintf("<td><nobr>");
sb->safePrintf("priority=%"INT32" "
"spidertime=%"UINT32" "
//"uh48=%"XINT64" "
"isdel=%"INT32"",
g_doledb.getPriority(k2),
(uint32_t)g_doledb.getSpiderTime(k2),
//g_doledb.getUrlHash48(k2),
g_doledb.getIsDel(k2));
sb->safePrintf("</nobr></td>");
}
else if ( rdbId == RDB_TITLEDB ) {
//XmlDoc tr;
//SafeBuf tmp;
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
// print each offset and size for the variable crap
sb->safePrintf("<td><nobr>titlerec datasize=%"INT32" "
//"sizeofxmldoc=%"INT32" "
//"hdrSize=%"INT32" "
//"version=%"INT32" "
//"%s"
"</nobr></td>",
dataSize
//(int32_t)sizeof(XmlDoc),
//(int32_t)tr.m_headerSize,
//(int32_t)tr.m_version,
//tmp.getBufStart());
);
}
//else if ( rdbId == RDB_REVDB ) {
// sb->safePrintf("<td><nobr>revdb datasize=%"INT32" ",
// dataSize);
//}
else if ( rdbId == RDB_TAGDB ) {
Tag *tag = (Tag *)rec;
sb->safePrintf("<td><nobr>");
if ( rec[0] & 0x01 ) tag->printToBuf(sb);
else sb->safePrintf("negativeTagKey");
sb->safePrintf("</nobr></td>");
}
else {
char *xx=NULL;*xx=0;
}
// close it up
sb->safePrintf("</tr>\n");
/*
// hash the data into a int32_t for hash table
char *ns = "no";
if ( noSplit ) ns = "yes";
char *del = "";
if ( neg ) del = " (delete)";
if ( ks==12 ) {
key_t *k2 = (key_t *)k;
int64_t tid = g_indexdb.getTermId(k2);
uint8_t score8 = g_indexdb.getScore ( *k2 );
uint32_t score32 = score8to32 ( score8 );
log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"tid=%"UINT64" score8=%"UINT32" score32=%"UINT32" nosplit=%s%s",
count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,tid ,(int32_t)score8,(int32_t)score32,
ns,del);
}
else {
log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"nosplit=%s%s",
count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,ns,del);
}
*/
}
sb->safePrintf("</table>\n");
if ( sb == &tmp )
sb->print();
}
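// . debug-only walk of the meta list (only runs for coll "qatest123")
// . per record it checks: sane key size for that rdb, posdb keys carry
//   our docId (unless spider replies are indexed), spiderdb requests
//   have a url/data payload, and the walk ends exactly on "pend"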
bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// do not do this if not test collection for now
if ( strcmp(cr->m_coll,"qatest123") ) return true;
log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");
// store each record in the list into the send buffers
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// first is rdbId
//char rdbId = -1; // m_rdbId;
//if ( rdbId < 0 ) rdbId = *p++;
uint8_t rdbId = *p++;
// mask off rdbId
rdbId &= 0x7f;
// get the key of the current record
//char *key = p;
// negative key?
bool del ;
if ( *p & 0x01 ) del = false;
else del = true;
// must always be negative if deleting
// spiderdb is exempt because we add a spiderreply that is
// positive and a spiderdoc
// no, this is no longer the case because we add spider
// replies to the index when deleting or rejecting a doc.
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
// char *xx=NULL;*xx=0; }
// get the key size. a table lookup in Rdb.cpp.
int32_t ks ;
if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 ) {
ks = 18;
// no compress bits set!
if ( p[0] & 0x06 ) { char*xx=NULL;*xx=0; }
// alignment bit set or cleared
if ( ! ( p[1] & 0x02 ) ) { char *xx=NULL;*xx=0; }
if ( ( p[7] & 0x02 ) ) { char *xx=NULL;*xx=0; }
int64_t docId = g_posdb.getDocId(p);
if ( docId != m_docId && !cr->m_indexSpiderReplies) {
log("xmldoc: %"INT64" != %"INT64""
, docId
, m_docId );
char *xx=NULL;*xx=0;
}
// else
// log("xmldoc: %"INT64" == %"INT64""
// , docId
// , m_docId );
// uint64_t termId = g_posdb.getTermId(p);
// if ( termId == 59194288760543LL ) {
// log("xmldoc: debug");
// //char *xx=NULL;*xx=0;
// }
}
else if ( rdbId == RDB_DATEDB ) ks = 16;
else ks = getKeySizeFromRdbId ( rdbId );
// sanity
if ( ks < 12 ) { char *xx=NULL;*xx=0; }
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
// another check
Rdb *rdb = getRdbFromId(rdbId);
if ( ! rdb ) { char *xx=NULL;*xx=0; }
if ( rdb->m_ks < 12 || rdb->m_ks > MAX_KEY_BYTES ) {
char *xx=NULL;*xx=0;}
// special linkdb check
//if ( rdbId == RDB_LINKDB ) {
// // parse it up
// key192_t *k = (key192_t *)p;
// unsigned char hc = g_linkdb.getLinkerHopCount_uk(k);
// if ( hc != 0 ){ char *xx=NULL;*xx=0; }
//}
char *rec = p;
// set this
//bool split = true;
//if(rdbId == RDB_POSDB && g_posdb.isShardedByTermId(p) )
// split =false;
// skip key
p += ks;
// . if key belongs to same group as firstKey then continue
// . titledb now uses last bits of docId to determine groupId
// . but uses the top 32 bits of key still
// . spiderdb uses last 64 bits to determine groupId
// . tfndb now is like titledb(top 32 bits are top 32 of docId)
//uint32_t gid = getGroupId ( rdbId , key , split );
// get the record, is -1 if variable. a table lookup.
int32_t dataSize;
if ( rdbId == RDB_POSDB || rdbId==RDB2_POSDB2)dataSize=0;
else if ( rdbId == RDB_DATEDB ) dataSize = 0;
//else if ( rdbId == RDB_REVDB ) dataSize = -1;
else if ( rdbId == RDB2_POSDB2 ) dataSize = 0;
else if ( rdbId == RDB2_DATEDB2 ) dataSize = 0;
//else if ( rdbId == RDB2_REVDB2 ) dataSize = -1;
else dataSize = getDataSizeFromRdbId ( rdbId );
// . for delete never stores the data
// . you can have positive keys without any dataSize member
// when they normally should have one, like titledb
if ( forDelete ) dataSize = 0;
// . negative keys have no data
// . this is not the case unfortunately
if ( del ) dataSize = 0;
// ensure spiderdb request recs have data/url in them
if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)rec ) &&
! forDelete &&
! del &&
dataSize == 0 ) {
char *xx=NULL;*xx=0; }
// if variable read that in
if ( dataSize == -1 ) {
// -1 means to read it in
dataSize = *(int32_t *)p;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
// skip dataSize
p += 4;
}
// skip over the data, if any
p += dataSize;
// breach us?
if ( p > pend ) { char *xx=NULL;*xx=0; }
}
// must be exactly equal to end
if ( p != pend ) return false;
return true;
/*
int32_t recSize = 0;
int32_t count = 0;
for ( ; p < pend ; p += recSize , count++ ) {
// get rdbid
char rdbId = *p & 0x7f;
// get nosplit flag
char noSplit = *p & 0x80;
// skip
p++;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// sanity
if ( ks > 16 ) { char *xx=NULL;*xx=0;}
// negative key?
bool del;
if ( *p & 0x01 ) del = false;
else del = true;
// convert into a key128_t, the biggest possible key
char k[16];
gbmemcpy ( &k , p , ks );
// skip it
p += ks;
// flip this
char split = ! noSplit;
// test it
g_hostdb.getGroupId(rdbId,k,split);
// if negative, no data size allowed
if ( ( k[0] & 0x01 ) == 0x00 ) continue;
// get datasize
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
// no negative key has data
if ( del ) dataSize = 0;
// if -1, read it in
if ( dataSize == -1 ) {
dataSize = *(int32_t *)p;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
p += 4;
}
// skip the data
p += dataSize;
}
*/
}
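// . two-pass helper for doConsistencyTest():
// . checkList=false: add each record's key -> rec ptr into "ht"
// . checkList=true: verify each record in [p,pend) is present in "ht"
//   and byte-identical, else log the term info and crash on purpose
// . spiderdb and tagdb records are skipped entirely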
bool XmlDoc::hashMetaList ( HashTableX *ht ,
char *p ,
char *pend ,
bool checkList ) {
int32_t recSize = 0;
int32_t count = 0;
for ( ; p < pend ; p += recSize , count++ ) {
// breathe
QUICKPOLL(m_niceness);
// get rdbid
char rdbId = *p & 0x7f;
// skip rdb id
p++;
// save that
char *rec = p;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// sanity check
if ( ks > 28 ) { char *xx=NULL;*xx=0; }
// is it a delete key?
char del ;
if ( ( p[0] & 0x01 ) == 0x00 ) del = true;
else del = false;
// convert into a key128_t, the biggest possible key
char k[MAX_KEY_BYTES];//key128_t k ;
// zero out
KEYMIN(k,MAX_KEY_BYTES);
//k.setMin();
gbmemcpy ( k , p , ks );
// skip it
p += ks;
// if negative, no data size allowed -- no
if ( del ) continue;
// get datasize
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
// if -1, read it in
if ( dataSize == -1 ) {
dataSize = *(int32_t *)p;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
p += 4;
}
// hash the data into a int32_t for hash table
//int32_t h32 = 0;
//h32 = hash32 ( p , dataSize );
// do not allow 0
//if ( h32 == 0 ) h32 = 1;
// skip the data
p += dataSize;
// ignore spiderdb recs for parsing consistency check
if ( rdbId == RDB_SPIDERDB ) continue;
if ( rdbId == RDB2_SPIDERDB2 ) continue;
// ignore tagdb as well!
if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;
// skip revdb for now too
//if ( rdbId == RDB_REVDB ) continue;
// set our rec size, includes key/dataSize/data. note: this shadows
// the outer recSize, so the for-loop's "p += recSize" adds 0; p was
// already advanced past the record above.
int32_t recSize = p - rec;
// debug point
//if ( *(uint64_t *)k == 4828936067112479745LL )
// log("hey");
// if just adding, do it
if ( ! checkList ) {
// we now store ptr to the rec, not hash!
if ( ! ht->addKey ( k , &rec ) ) return false;
continue;
}
// check to see if this rec is in the provided hash table
int32_t slot = ht->getSlot ( k );
// bitch if not found
if ( slot < 0 && ks==12 ) {
key144_t *k2 = (key144_t *)k;
int64_t tid = g_posdb.getTermId(k2);
char shardByTermId = g_posdb.isShardedByTermId(k2);
//uint8_t score8 = g_indexdb.getScore ( *k2 );
//uint32_t score32 = score8to32 ( score8 );
log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"tid=%"UINT64" "
"key=%s "
//"score8=%"UINT32" score32=%"UINT32" "
"shardByTermId=%"INT32"",
count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,tid ,
//(int32_t)score8,(int32_t)score32,
KEYSTR(k2,ks),
(int32_t)shardByTermId);
// look it up
// shortcut
HashTableX *wt = m_wts;
// point to keys, termids?
//TermInfo **tp = (TermInfo **)wt->m_keys;
// now print the table we stored all we hashed into
for ( int32_t i = 0 ; i < wt->m_numSlots ; i++ ) {
// skip if empty
if ( wt->m_flags[i] == 0 ) continue;
// get the TermInfo
TermDebugInfo *ti;
ti = (TermDebugInfo *)wt->getValueFromSlot(i);
// skip if not us
if((ti->m_termId & TERMID_MASK)!=tid)continue;
// got us
char *start = m_wbuf.getBufStart();
char *term = start + ti->m_termOff;
char *prefix = "";
if ( ti->m_prefixOff >= 0 ) {
prefix = start + ti->m_prefixOff;
//prefix[ti->m_prefixLen] = '\0';
}
// NULL term it
term[ti->m_termLen] = '\0';
// print it
log("parser: term=%s prefix=%s",//score32=%"INT32"",
term,prefix);//,(int32_t)ti->m_score32);
}
char *xx=NULL; *xx=0;
// count it for PageStats.cpp
g_stats.m_parsingInconsistencies++;
continue;
}
if ( slot < 0 && ks != 12 ) {
// if it is sectiondb and the orig doc did not
// add sectiondb recs because m_totalSiteVoters >=
// MAX_SITE_VOTERS, then that is ok!
if ( (rdbId == RDB_SECTIONDB ||
rdbId == RDB2_SECTIONDB2 ) &&
m_sectionsValid &&
m_sections.m_totalSiteVoters >= MAX_SITE_VOTERS )
continue;
log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"ks=%s "
,count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,KEYSTR(k,ks));
char *xx=NULL; *xx=0;
// count it for PageStats.cpp
g_stats.m_parsingInconsistencies++;
continue;
}
// if in there, check the hashes
//int32_t h2 = *(int32_t *)ht->getValueFromSlot ( slot );
char *rec2 = *(char **)ht->getValueFromSlot ( slot );
// get his dataSize
int32_t dataSize2 = getDataSizeFromRdbId(rdbId);
// his keysize
int32_t ks2 = getKeySizeFromRdbId(rdbId);
// get his recsize
int32_t recSize2 = ks2 ;
// if -1 that is variable
if ( dataSize2 == -1 ) {
dataSize2 = *(int32_t *)(rec2+ks2);
recSize2 += 4;
}
// add it up
recSize2 += dataSize2;
// keep on chugging if they match
if ( recSize2==recSize && !memcmp(rec,rec2,recSize) ) continue;
// otherwise, bitch
char shardByTermId = false;
if ( rdbId == RDB_POSDB )
shardByTermId = g_posdb.isShardedByTermId(rec2);
log("build: data not equal for key=%s "
"rdb=%s splitbytermid=%"INT32" dataSize=%"INT32"",
KEYSTR(k,ks2),
getDbnameFromId(rdbId),(int32_t)shardByTermId,dataSize);
// print into here
SafeBuf sb1;
SafeBuf sb2;
// print it out
if ( rdbId == RDB_SPIDERDB ) {
// get rec
if ( g_spiderdb.isSpiderRequest((key128_t *)rec) ) {
SpiderRequest *sreq1 = (SpiderRequest *)rec;
SpiderRequest *sreq2 = (SpiderRequest *)rec2;
sreq1->print(&sb1);
sreq2->print(&sb2);
}
else {
SpiderReply *srep1 = (SpiderReply *)rec;
SpiderReply *srep2 = (SpiderReply *)rec2;
srep1->print(&sb1);
srep2->print(&sb2);
}
log("build: rec1=%s",sb1.getBufStart());
log("build: rec2=%s",sb2.getBufStart());
}
char *xx=NULL; *xx=0;
// count it for PageStats.cpp
g_stats.m_parsingInconsistencies++;
}
return true;
}
/*
bool checkRegex ( SafeBuf *regex ,
char *target ,
bool *boolVal ,
bool *boolValValid ,
int32_t *compileError ,
CollectionRec *cr ) {
if ( compileError ) *compileError = false;
if ( *boolValValid )
return *boolVal;
// if not using diffbot or there is no regex, it is ok
if ( regex->length() <= 0 ) {
*boolVal = true;
*boolValValid = true;
return boolVal;
}
// whip out the regex shit i guess...
regex_t buf;
// this will store the compiled regular expression into "buf"
int32_t ret = regcomp ( &buf ,
// the actual regular expression to compile
regex->getBufStart() ,
// some flags
REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB);
if ( ret ) {
//g_errno = ret;
if ( compileError ) *compileError = errno;
log("xmldoc: diffbot regcomp failed: %s. This should have "
"been tested before crawl was started. Ignoring.",
mstrerror(errno));
return true;
}
// now see if it is a match
if ( regexec(&buf,target,0,NULL,0) ) *boolVal = true;
else *boolVal = false;
*boolValValid = true;
return boolVal;
}
*/
// . should we send this url off to diffbot for processing?
// . if the url's downloaded content does not match the provided regex
// in m_diffbotPageProcessPattern, then we do not send the url to diffbot
// for processing
// . make sure this regex is pre-tested before starting the crawl
// so we know it compiles
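// . matching logic below: if the collection has a compiled url crawl
//   regex (cr->m_ucr) it alone decides; otherwise the plain substring
//   pattern list in m_diffbotUrlCrawlPattern is checked with
//   doesStringContainPattern()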
bool XmlDoc::doesUrlMatchDiffbotCrawlPattern() {
if ( m_matchesCrawlPatternValid )
return m_matchesCrawlPattern;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// get the compiled regular expressions
regex_t *ucr = &cr->m_ucr;
if ( ! cr->m_hasucr ) ucr = NULL;
if ( ! m_firstUrlValid ) return false;
m_matchesCrawlPatternValid = true;
m_matchesCrawlPattern = false;
Url *furl = getFirstUrl();
char *url = furl->getUrl();
// if we had a url crawl regex then regexec will return non-zero
// if our url does NOT match i guess
if ( ucr && regexec(ucr,url,0,NULL,0) )
return false;
// shortcut
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// do not require a match on ucp if ucr is given
if ( ucp && ! ucr && ! doesStringContainPattern(url,ucp) )
return false;
m_matchesCrawlPattern = true;
return true;
}
/*
bool XmlDoc::doesUrlMatchDiffbotProcessPattern() {
return checkRegex ( &cr->m_diffbotUrlProcessPattern ,
m_firstUrl.m_url ,
&m_diffbotUrlProcessPatternMatch,
&m_diffbotUrlProcessPatternMatchValid,
NULL,
cr);
}
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
return checkRegex ( &cr->m_diffbotPageProcessPattern ,
ptr_utf8Content,
&m_diffbotPageProcessPatternMatch,
&m_diffbotPageProcessPatternMatchValid,
NULL,
cr);
}
*/
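// . an empty/unset page process pattern matches everything; otherwise
//   this is a plain substring pattern check against the raw downloaded
//   content (m_content)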
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
char *p = cr->m_diffbotPageProcessPattern.getBufStart();
// empty? no pattern matches everything.
if ( ! p ) return true;
if ( ! m_content ) return false;
// does the content contain any of the process patterns?
return doesStringContainPattern ( m_content , p );
}
int32_t *XmlDoc::reindexJSONObjects ( int32_t *newTitleHashes,
int32_t numNewHashes ) {
return redoJSONObjects (newTitleHashes,numNewHashes,false );
}
int32_t *XmlDoc::nukeJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ) {
return redoJSONObjects (newTitleHashes,numNewHashes,true );
}
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
// a url. each json object is basically its own url. a json object
// url is the parent page's url with a -diffbotxyz-%"UINT32" appended to it
// where %"INT32" is the object # starting at 0 and incrementing from there.
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
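// . how it works: hash the new title hashes into a dedup table, walk
//   the old title hashes from our title rec, and for each old hash no
//   longer present (or for every one when reindexing) point m_dx at the
//   fake "-diffbotxyz<hash>" url and index/delete that child doc, one
//   at a time, re-entering here via m_masterLoop until m_joc catches up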
int32_t *XmlDoc::redoJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ,
bool deleteFromIndex ) {
// use this
static int32_t s_return = 1;
// if none, we are done
if ( m_diffbotJSONCount <= 0 ) return &s_return;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// i was trying to re-index some diffbot json docs in the global
// index but it wasn't set as custom crawl
//if ( ! cr->m_isCustomCrawl ) return &s_return;
// already did it?
if ( m_joc >= m_diffbotJSONCount ) return &s_return;
// new guy here
if ( ! m_dx ) {
try { m_dx = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
log("xmldoc: failed to alloc m_dx");
return NULL;
}
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
}
//
// index the hashes of the latest diffbot json items for this parent
//
HashTableX dedup;
if ( ! dedup.set(4,0,numNewHashes*4,NULL,0,false,m_niceness,"njodt") )
return NULL;
for ( int32_t i = 0 ; i < numNewHashes ; i++ )
dedup.addKey ( &newTitleHashes[i] );
// get this old doc's current title hashes
int32_t numOldHashes;
int32_t *oldTitleHashes = getDiffbotTitleHashes ( &numOldHashes );
// sanity. should return right away without having to block
if ( oldTitleHashes == (void *)-1 ) { char *xx=NULL;*xx=0; }
//int32_t count = m_diffbotJSONCount;
// sanity again
if ( numOldHashes != m_diffbotJSONCount ) {
log("build: can't remove json objects. "
"jsoncount mismatch %"INT32" != %"INT32
,numOldHashes
,m_diffbotJSONCount
);
g_errno = EBADENGINEER;
return NULL;
//count = 0;
//char *xx=NULL;*xx=0;
}
// scan down each
for ( ; m_joc < m_diffbotJSONCount ; ) {
// only NUKE the json items for which title hashes we lost
int32_t th32 = oldTitleHashes[m_joc];
// . if still in the new diffbot reply, do not DELETE!!!
// . if there was no title, it uses hash of entire object
if ( deleteFromIndex && dedup.isInTable(&th32) ) {
m_joc++;
continue;
}
// if m_dx has no url set, call set4 i guess
if ( ! m_dx->m_firstUrlValid ) {
// make the fake url for this json object for indexing
SafeBuf fakeUrl;
fakeUrl.set ( m_firstUrl.getUrl() );
// get his title hash32
//int32_t jsonTitleHash32 = titleHashes[m_joc];
// append -diffbotxyz%"UINT32" for fake url
fakeUrl.safePrintf("-diffbotxyz%"UINT32"",
(uint32_t)th32);
// set url of new xmldoc
if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
cr->m_coll ,
NULL , // pbuf
m_niceness ) )
// g_errno should be set!
return NULL;
// we are indexing json objects, don't use all these
m_dx->m_useClusterdb = false;
m_dx->m_useSpiderdb = false;
m_dx->m_useTagdb = false;
m_dx->m_usePlacedb = false;
m_dx->m_useLinkdb = false;
m_dx->m_isChildDoc = true;
m_dx->m_parentDocPtr = this;
// are we doing a query reindex or a nuke?
m_dx->m_deleteFromIndex = deleteFromIndex;//true;
// do not try to download this url
if ( ! deleteFromIndex )
m_dx->m_recycleContent = true;
// we need this because only m_dx->m_oldDoc will
// load from titledb and have it set
m_dx->m_isDiffbotJSONObject = true;
// for debug
char *str = "reindexing";
if ( deleteFromIndex ) str = "nuking";
log("xmldoc: %s %s",str,fakeUrl.getBufStart());
}
// when the indexdoc completes, or if it blocks, call us!
// we should just pass through here
m_dx->setCallback ( m_masterState , m_masterLoop );
// . this should ultimately load from titledb and not
// try to download the page since m_deleteFromIndex is
// set to true
// . if m_dx got its msg4 reply it ends up here, in which
// case do NOT re-call indexDoc() so check for
// m_listAdded.
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
return (int32_t *)-1;
// critical error on our part trying to index it?
// does not include timeouts or 404s, etc. mostly just
// OOM errors.
if ( g_errno ) return NULL;
// count as deleted
cr->m_localCrawlInfo.m_objectsDeleted++;
cr->m_globalCrawlInfo.m_objectsDeleted++;
cr->m_needsSave = true;
// but gotta set this crap back
//log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
// clear for next guy if there is one. clears
// m_dx->m_contentValid so the set4() can be called again above
m_dx->reset();
// try to do more json objects indexed from this parent doc
m_joc++;
}
// nuke it
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
delete ( m_dx );
m_dx = NULL;
return &s_return;
}
void getMetaListWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in get meta list wrapper" );
// get it
char *ml = THIS->getMetaList ( );
// sanity check
if ( ! ml && ! g_errno ) {
log("doc: getMetaList() returned NULL without g_errno");
sleep(5);
char *xx=NULL;*xx=0;
}
// return if it blocked
if ( ml == (void *)-1 ) return;
// sanity check
if ( THIS->m_callback1 == getMetaListWrapper ) { char *xx=NULL;*xx=0;}
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
XmlDoc *g_od = NULL;
// . returns NULL and sets g_errno on error
// . make a meta list to call Msg4::addMetaList() with
// . called by Msg14.cpp
// . a meta list is just a buffer of Rdb records of the following format:
// rdbid | rdbRecord
// . meta list does not include title rec since Msg14 adds that using Msg1
// . returns false and sets g_errno on error
// . sets m_metaList ptr and m_metaListSize
// . if "deleteIt" is true, we are a delete op on "old"
// . returns (char *)-1 if it blocks and will call your callback when done
// . generally only Repair.cpp changes these use* args to false
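// . rough flow of what follows: handle the special cases first (spider
//   status docs during a rebuild, force-deletes, docid-based query
//   reindexes of multi-doc children), then gather the validated pieces
//   (content hash, http status, site, old title rec, ...) and assemble
//   the list of rdb records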
char *XmlDoc::getMetaList ( bool forDelete ) {
if ( m_metaListValid ) return m_metaList;
setStatus ( "getting meta list" );
// force it true?
// "forDelete" means we want the metalist to consist of "negative"
// keys that will annihilate with the positive keys in the index,
// posdb and the other rdbs, in order to delete them. "deleteFromIndex"
// means to just call getMetaList(true) on the m_oldDoc (old XmlDoc)
// which is built from the titlerec in Titledb. so don't confuse
// these two things. otherwise when i add this we were not adding
// the spiderreply of "Doc Force Deleted" from doing a query reindex
// and it kept repeating every time we started gb up.
//if ( m_deleteFromIndex ) forDelete = true;
// assume valid
m_metaList = "";
m_metaListSize = 0;
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block, this callback will be called
if ( ! m_masterLoop ) {
m_masterLoop = getMetaListWrapper;
m_masterState = this;
}
// returning from a handler that had an error?
if ( g_errno ) return NULL;
// if we are a spider status doc/titlerec and we are doing a rebuild
// operation, then keep it simple
if ( m_setFromTitleRec &&
m_useSecondaryRdbs &&
m_contentTypeValid &&
m_contentType == CT_STATUS ) {
// if not rebuilding posdb then done, list is empty since
// spider status docs do not contribute to linkdb, clusterdb,..
if ( ! m_usePosdb && ! m_useTitledb ) {
m_metaListValid = true;
return m_metaList;
}
/////////////
//
// if user disabled spider status docs then delete the titlerec
// AND the posdb index list from our dbs for this ss doc
//
/////////////
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_indexSpiderReplies ) {
int64_t uh48 = m_firstUrl.getUrlHash48();
// delete title rec. true = delete?
key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
// shortcut
SafeBuf *ssb = &m_spiderStatusDocMetaList;
// add to list. and we do not add the spider status
// doc to posdb since we deleted its titlerec.
ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
ssb->safeMemcpy ( &tkey , sizeof(key_t) );
m_metaList = ssb->getBufStart();
m_metaListSize = ssb->getLength ();
m_metaListValid = true;
return m_metaList;
}
// set safebuf to the json of the spider status doc
SafeBuf jd;
if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
return NULL;
// set m_spiderStatusDocMetaList from the json
if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
return NULL;
// TODO: support titledb rebuild as well
m_metaList = m_spiderStatusDocMetaList.getBufStart();
m_metaListSize = m_spiderStatusDocMetaList.getLength();
m_metaListValid = true;
return m_metaList;
}
// any other indexing issue? hey! g_errno might not be set here
//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }
// a hacky thing
//XmlDoc *od = (XmlDoc *)1;
//bool diffbotEmptyReply = false;
/*
// fake this for diffbot?
if ( m_useDiffbot &&
! m_isDiffbotJSONObject &&
! doesUrlMatchDiffbotCrawlPattern() ) {
// flag it so we only add the SpiderReply to spiderdb and bail
//diffbotEmptyReply = true;
// we should not delete the json objects for this url
// from the index just because the user decided to remove
// it from her crawl
m_isIndexedValid = true;
m_isIndexed = false;
m_oldDocValid = true;
m_oldDoc = NULL;
}
*/
// if "rejecting" from index fake all this stuff
if ( m_deleteFromIndex ) {
// if we are using diffbot api and diffbot found no json objects
// or we never even processed the url, we really just want to
// add the SpiderReply for this url to spiderdb and nothing more.
// NO! we still want to store the page content in titledb
// so we can see if it has changed i guess
//diffbotEmptyReply ) {
// set these things to bogus values since we don't need them
m_contentHash32Valid = true;
m_contentHash32 = 0;
m_httpStatusValid = true;
m_httpStatus = 200;
m_siteValid = true;
ptr_site = "www.poopoo.com";
size_site = gbstrlen(ptr_site)+1;
m_isSiteRootValid = true;
m_isSiteRoot2 = 1;
//m_tagHash32Valid = true;
//m_tagHash32 = 0;
m_tagPairHash32Valid = true;
m_tagPairHash32 = 0;
m_siteHash64Valid = true;
m_siteHash64 = 0LL;
m_spiderLinksValid = true;
m_spiderLinks2 = 1;
m_langIdValid = true;
m_langId = 1;
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
m_isIndexed = true;
m_isIndexedValid = true;
m_ipValid = true;
m_ip = 123456;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
//
// BEGIN MULTI DOC QUERY REINDEX HACK
//
// this fixes it so we can do a query reindex on fake child urls
// of their original parent multidoc url. the child urls are
// subsections of the original parent url that were indexed as
// separate documents with their own docid. if we try to do a
// query reindex on such things, detect it, and add the request
// for the original parent multidoc url.
//
if ( m_sreqValid && m_sreq.m_isPageReindex &&
// if it is a force delete, then allow the user to delete
// such diffbot reply json children documents, however.
! m_sreq.m_forceDelete ) {
// see if its diffbot json object
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
XmlDoc *od = *pod;
// if no old doc then we might have just been a diffbot
// json url that was directly injected into GLOBAL-INDEX
// like xyz.com/-diffbotxyz12345 (my format) or
if ( ! od ) goto skip9;
// if we are indexing a subdoc piece of a multidoc url
// then parentUrl should return non-NULL
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
if ( ! parentUrl && od->m_contentType != CT_STATUS )
goto skip9;
// in that case we need to reindex the parent url not the
// subdoc url, so make the spider reply gen quick
//SpiderReply *newsr = od->getFakeSpiderReply();
//if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
// use our ip though
//newsr->m_firstIp = od->m_firstIp;
// however we have to use our docid-based spider request
SpiderReply srep;
srep.reset();
// it MUST match up with original spider request so the
// lock key in Spider.cpp can unlock it. that lock key
// uses the "uh48" (48bit hash of the url) and "srep.m_firstIp"
// in this case the SpiderRequest, sreq, is docid-based because
// it was added through PageReindex.cpp (query reindex) so
// it will be the 48 bit hash64b() of the docid
// (see PageReindex.cpp)'s call to SpiderRequest::setKey()
srep.m_firstIp = m_sreq.m_firstIp;
// assume no error
srep.m_errCount = 0;
// do not inherit this one, it MIGHT HAVE CHANGED!
srep.m_siteHash32 = m_sreq.m_siteHash32;
srep.m_domHash32 = m_sreq.m_domHash32;
srep.m_spideredTime = getTimeGlobal();
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = 0LL;
srep.m_contentHash32 = 0;
// were we already in titledb before we started spidering?
// yes otherwise we would have called "goto skip9" above
srep.m_wasIndexed = 1;
srep.m_wasIndexedValid = 1;
srep.m_isIndexed = 1;
srep.m_isIndexedINValid = false;
srep.m_errCode = EREINDEXREDIR; // indexCode
srep.m_downloadEndTime = 0;
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
// lock of request needs to match that of reply so the
// reply, when received by Rdb.cpp which calls addSpiderReply()
// can unlock this url so it can be spidered again.
int64_t lock1 = makeLockTableKey(&m_sreq);
int64_t lock2 = makeLockTableKey(&srep);
if ( lock1 != lock2 ) { char *xx=NULL;*xx=0; }
// make a fake spider reply so this docid-based spider
// request is not used again
//SpiderReply srep;
// store the rdbid
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
if ( ! m_zbuf.pushChar(rd) )
return NULL;
// store that reply to indicate this spider request has
// been fulfilled!
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
return NULL;
// but also store a new spider request for the parent url
SpiderRequest ksr;
int64_t pd;
// skip if doc is a spider status "document". their docids
// often get added during a query reindex but we should ignore
// them completely.
if ( od->m_contentType == CT_STATUS )
goto returnList;
//goto returnList;
// complain
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// just copy original request
gbmemcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
// do not spider links, it's a page reindex of a multidoc url
ksr.m_avoidSpiderLinks = 1;
// avoid EDOCUNCHANGED
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based we set it to parentUrl
ksr.m_urlIsDocId = 0;
// but consider it a manual add. this should already be set.
ksr.m_isPageReindex = 1;
// but it is not docid based, so overwrite the docid
		// in ksr.m_url with the parent multidoc url. strcpy() null-terminates it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
// this must be valid
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
// fake docid
pd = g_titledb.getProbableDocId(parentUrl);
ksr.setKey ( m_sreq.m_firstIp, pd , false );
// store this
if ( ! m_zbuf.pushChar(rd) )
return NULL;
// then the request
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
return NULL;
returnList:
// prevent cores in indexDoc()
m_indexCode = EREINDEXREDIR;
m_indexCodeValid = true;
// for now we set this crap
m_metaList = m_zbuf.getBufStart();
m_metaListSize = m_zbuf.length();
m_metaListValid = true;
return m_metaList;
}
//
// END DIFFBOT OBJECT QUERY REINDEX HACK
//
skip9:
// get our checksum
int32_t *plainch32 = getContentHash32();
if ( ! plainch32 || plainch32 == (void *)-1 ) return (char *)plainch32;
// get this too
int16_t *hs = getHttpStatus ();
if ( ! hs || hs == (void *)-1 ) return (char *)hs;
// make sure site is valid
char *site = getSite();
if ( ! site || site == (void *)-1 ) return (char *)site;
// this seems to be an issue as well for "unchanged" block below
char *isr = getIsSiteRoot();
if ( ! isr || isr == (void *)-1 ) return (char *)isr;
// get hash of all tags from tagdb that affect what we index
//int32_t *tagHash = getTagHash32 ( );
//if ( ! tagHash || tagHash == (void *)-1 ) return (char *)tagHash;
int64_t *sh64 = getSiteHash64();
if ( ! sh64 || sh64 == (void *)-1 ) return (char *)sh64;
// make sure docid valid
int64_t *mydocid = getDocId();
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
// . get the old version of our XmlDoc from the previous spider time
// . set using the old title rec in titledb
// . should really not do any more than set m_titleRec...
// . should not even uncompress it!
// . getNewSpiderReply() will use this to set the reply if
// m_indexCode == EDOCUNCHANGED...
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
// point to the old xml doc if no error, etc.
XmlDoc *od = *pod;
// check if we are already indexed
char *isIndexed = getIsIndexed ();
if ( ! isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
// do not delete anything in these cases, but do remove us from
// spiderdb, and from tfndb (except for EDOCNOTNEW)
//if ( m_indexCode == EDOCNOTNEW || m_indexCode == EDOCNOTOLD )
// od = NULL;
// why call this way down here? it ends up downloading the doc!
int32_t *indexCode = getIndexCode();
if ( ! indexCode || indexCode ==(void *)-1) return (char *)indexCode;
// sanity check
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// this means to abandon the injection
if ( *indexCode == EABANDONED ||
*indexCode == EHITCRAWLLIMIT ||
*indexCode == EHITPROCESSLIMIT ) {
m_metaList = (char *)0x123456;
m_metaListSize = 0;
m_metaListValid = true;
return m_metaList;
}
// if diffbot reply is empty, don't bother adding anything except
// for the spider reply... reply might be "-1" too!
//if ( m_useDiffbot &&
// ! m_isDiffbotJSONObject &&
// m_diffbotReplyValid &&
// m_diffbotReply.length() <= 3 )
// diffbotEmptyReply = true;
// . some index code warrant retries, like EDNSTIMEDOUT, ETCPTIMEDOUT,
// etc. these are deemed temporary errors. other errors basically
// indicate a document that will never be indexable and should,
// if currently indexed, be deleted.
// . just add the spider reply and we're done
if ( *indexCode == EDNSTIMEDOUT
|| *indexCode == ETCPTIMEDOUT
|| *indexCode == EUDPTIMEDOUT
|| *indexCode == EDNSDEAD
|| *indexCode == ENETUNREACH
|| *indexCode == EHOSTUNREACH
// . rejected from a diffbot regex url crawl filter?
// . or no json objects returned from diffbot?
	     // . or rejected from the processing regex filter?
// . then just add the SpiderReply to avoid respidering
// . NO! still need to add outlinks
//|| diffbotEmptyReply
// . treat this as a temporary error i guess
// . getNewSpiderReply() below will clear the error in it and
// copy stuff over from m_sreq and m_oldDoc for this case
|| *indexCode == EDOCUNCHANGED
) {
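		// for these temporary errors (and EDOCUNCHANGED) the meta
		// list we return is minimal: the spider status doc meta
		// list (if any) followed by a single rdbId byte and the
		// new SpiderReply. nothing is added to posdb, titledb or
		// linkdb in this branch.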
// sanity - in repair mode?
if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
// . this seems to be an issue for blocking
// . if we do not have a valid ip, we can't compute this,
// in which case it will not be valid in the spider reply
// . why do we need this for timeouts etc? if the doc is
// unchanged
// we should probably update its siteinlinks in tagdb
// periodically and reindex the whole thing...
// . i think we were getting the sitenuminlinks for
// getNewSpiderReply()
if ( m_ipValid &&
m_ip != 0 &&
m_ip != -1 ) {
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
}
// all done!
bool addReply = true;
// Scraper.cpp uses this
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
// page parser calls set4 and sometimes gets a dns time out!
if ( m_sreqValid && m_sreq.m_isPageParser ) addReply = false;
// return nothing if done
if ( ! addReply ) {
m_metaListSize = 0;
m_metaList = (char *)0x1;
return m_metaList;
}
// save this
int32_t savedCode = *indexCode;
// before getting our spider reply, assign crap from the old
// doc to us since we are unchanged! this will allow us to
// call getNewSpiderReply() without doing any processing, like
// setting the Xml or Words classes, etc.
copyFromOldDoc ( od );
// need this though! i don't want to print out "Success"
// in the log in the logIt() function
m_indexCode = savedCode;
m_indexCodeValid = true;
// but set our m_contentHash32 from the spider request
// which got it from the spiderreply in the case of
// EDOCUNCHANGED. this way ch32=xxx will log correctly.
// I think this is only when EDOCUNCHANGED is set in the
// Msg13.cpp code, when we have a spider compression proxy.
if ( *indexCode == EDOCUNCHANGED &&
m_sreqValid &&
! m_contentHash32Valid ) {
m_contentHash32 = m_sreq.m_contentHash32;
m_contentHash32Valid = true;
}
		// we need these for getNewSpiderReply()
m_wasInIndex = false;
if ( od ) m_wasInIndex = true;
m_isInIndex = m_wasInIndex;
m_wasInIndexValid = true;
m_isInIndexValid = true;
// unset our ptr_linkInfo1 so we do not free it and core
// since we might have set it in copyFromOldDoc() above
ptr_linkInfo1 = NULL;
size_linkInfo1 = 0;
m_linkInfo1Valid = false;
bool indexNewTimeStamp = false;
if ( getUseTimeAxis() &&
od &&
m_hasMetadata &&
*indexCode == EDOCUNCHANGED
//m_spideredTimeValid &&
//od->m_spideredTime != m_spideredTime
)
indexNewTimeStamp = true;
// . if not using spiderdb we are done at this point
// . this happens for diffbot json replies (m_dx)
if ( ! m_useSpiderdb && ! indexNewTimeStamp ) {
m_metaList = NULL;
m_metaListSize = 0;
return (char *)0x01;
}
// get our spider reply
SpiderReply *newsr = getNewSpiderReply();
// return on error
if ( ! newsr ) return (char *)newsr;
// . panic on blocking! this is supposed to be fast!
// . it might still have to lookup the tagdb rec?????
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
// how much we need
int32_t needx = sizeof(SpiderReply) + 1;
// . INDEX SPIDER REPLY (1a)
// . index ALL spider replies as separate doc. error or not.
// . then print out error histograms.
// . we should also hash this stuff when indexing the
// doc as a whole
// i guess it is safe to do this after getting the spiderreply
SafeBuf *spiderStatusDocMetaList = NULL;
// if ( cr->m_indexSpiderReplies &&
// m_useSpiderdb &&
// // doing it for diffbot throws off smoketests.
// // yeah, but we need it, so we'll just have to update
// // the smoketests
// ! cr->m_isCustomCrawl ) {
// get the spiderreply ready to be added
spiderStatusDocMetaList = getSpiderStatusDocMetaList(newsr ,
forDelete);
// error?
if ( ! spiderStatusDocMetaList ) return NULL;
// blocked?
if (spiderStatusDocMetaList==(void *)-1)
return (char *)-1;
// . now append the new stuff.
// . we overwrite the old titlerec with the new one that has
// some more json in the ptr_metaInfo buffer so we hash
// its new timestamp. 'gbspiderdate' and any info from
// the meta info given in the injection request if there.
// this allows you to tag each document, even multiple
// versions of the same url with the same content. so if
// you spider the doc again and it is unchanged since last
// time we still index some of this meta stuff.
if ( indexNewTimeStamp )
appendNewMetaInfo (spiderStatusDocMetaList,forDelete);
// need to alloc space for it too
int32_t len = spiderStatusDocMetaList->length();
needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
//}
// doledb key?
//if ( m_doledbKey.n0 || m_doledbKey.n1 )
// needx += 1 + sizeof(key_t); // + 4;
// the FAKEDB unlock key for msg12 in spider.cpp
//needx += 1 + sizeof(key_t); // FAKEDB
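		// layout of the buffer we are about to allocate:
		//   [spider status doc meta list bytes, if any]
		//   [1 byte rdbId (RDB_SPIDERDB or RDB2_SPIDERDB2)]
		//   [SpiderReply record]
		// needx is an exact size here, hence the
		// (m_p - saved != needx) sanity check below.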
// make the buffer
m_metaList = (char *)mmalloc ( needx , "metalist");
if ( ! m_metaList ) return NULL;
// save size for freeing later
m_metaListAllocSize = needx;
// ptr and boundary
m_p = m_metaList;
m_pend = m_metaList + needx;
// save it
char *saved = m_p;
// first store spider reply "document"
if ( spiderStatusDocMetaList ) {
gbmemcpy ( m_p,
spiderStatusDocMetaList->getBufStart(),
spiderStatusDocMetaList->length() );
m_p += spiderStatusDocMetaList->length();
}
/*
Not any more, now we remove from doledb as soon
as we get all the lock grants in our group (shard)
using Msg4 in Spider.cpp. That way we can add a
"0" entry into the waiting tree (or a time X ms into
the future from now) to try to enforce a sameIpWait
constraint and also allow up to maxSpidersPerIP.
// remove from doledb if we had a valid key
// (BEFORE adding SpiderReply)
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
// note it
setStatus ( "removing key from doledb" );
// . now remove the original spider rec from "doledb"
// . rdbid first
*m_p = RDB_DOLEDB;
m_p++;
// then the key
*(key_t *)m_p = m_doledbKey;
// nukey, clear del bit to delete it
*m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key_t);
// then zero for data size
// *(int32_t *)m_p = 0;
//m_p += 4;
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
}
*/
// sanity check
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
	// . now Spider.cpp uses SpiderReply reception to remove lock
// - mdw 9/28/13
//*m_p++ = RDB_FAKEDB;
//*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
//key_t fakeKey;
//fakeKey.n1 = 0;
//fakeKey.n0 = m_docId;
//gbmemcpy ( m_p , &fakeKey , sizeof(key_t) );
//m_p += sizeof(key_t);
// now add the new rescheduled time
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
*m_p++ = rd;
// get this
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
// store the spider rec
int32_t newsrSize = newsr->getRecSize();
gbmemcpy ( m_p , newsr , newsrSize );
m_p += newsrSize;
m_addedSpiderReplySize = newsrSize;
m_addedSpiderReplySizeValid = true;
// sanity check
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
// verify it
m_metaListValid = true;
// set size
m_metaListSize = m_p - m_metaList;
// all done
return m_metaList;
}
// if using diffbot do not index the content of the web page we
// got the json objects from, although, do keep it cached in titledb
// because that can be useful
// Not any more, now index the pages as well! then restrict search
// to type:json to just search json objects.
//if ( m_useDiffbot && ! m_isDiffbotJSONObject ) {
// m_usePosdb = false;
// m_useClusterdb = false;
//}
// get the old meta list if we had an old doc
char *oldList = NULL;
int32_t oldListSize = 0;
if ( od ) {
od->m_useSpiderdb = false;
od->m_useTagdb = false;
// do not use diffbot for old doc since we call
		// od->nukeJSONObjects() below
od->m_diffbotApiUrlValid = true;
// api url should be empty by default
//od->m_diffbotApiNum = DBA_NONE;
//log("break it here. shit this is not getting the list!!!");
// if we are doing diffbot stuff, we are still indexing this
// page, so we need to get the old doc meta list
oldList = od->getMetaList ( true );
oldListSize = od->m_metaListSize;
if ( ! oldList || oldList ==(void *)-1) return (char *)oldList;
}
// . set whether we should add recs to titledb, posdb, linkdb, etc.
// . if this doc is set by titlerec we won't change these
// . we only turn off m_usePosdb, etc. if there is a
// <meta name=noindex content=1>
// . we will still add to spiderdb, but not posdb, linkdb, titledb
// and clusterdb.
// . so we'll add the spiderreply for this doc and the spiderrequests
// for all outlinks and "firstIp" tagrecs to tagdb for those outlinks
// . we use this for adding the url seed file gbdmoz.urls.txt
// which contains a list of all the dmoz urls we want to spider.
// gbdmoz.urls.txt is generated by dmozparse.cpp. we spider all
// these dmoz urls so we can search the CONTENT of the pages in dmoz,
// something dmoz won't let you do.
char *mt = hasNoIndexMetaTag();
if ( ! mt || mt == (void *)-1 ) return (char *)mt;
if ( *mt ) {
m_usePosdb = false;
m_useLinkdb = false;
m_useTitledb = false;
m_useClusterdb = false;
// do not add the "firstIp" tagrecs of the outlinks any more
// because it might hurt us?
m_useTagdb = false;
}
if ( cr->m_isCustomCrawl )
m_useLinkdb = false;
// . should we recycle the diffbot reply for this url?
// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
// our existing diffbot reply, i.e. recycle it, even though we
// respidered this page.
bool *recycle = getRecycleDiffbotReply();
if ( ! recycle || recycle == (void *)-1) return (char *)recycle;
// in that case inherit this from the old doc...
if ( od && *recycle && od->m_diffbotJSONCount &&
// somehow i am seeing that this is empty!
// this is how many title hashes of diffbot replies we've
// stored in the old doc's titlerec. if these are not equal
// and we call reindexJSONObjects() below then it cores
// in redoJSONObjects().
od->size_linkInfo2/4 == od->m_diffbotJSONCount &&
	     // only call this once otherwise we double-store
// m_diffbotTitleHashBuf
m_diffbotJSONCount == 0 ) {//cr->m_isCustomCrawl){
m_diffbotJSONCount = od->m_diffbotJSONCount;
m_sentToDiffbot = od->m_sentToDiffbot;
m_gotDiffbotSuccessfulReply = od->m_gotDiffbotSuccessfulReply;
// copy title hashes info. it goes hand in hand with the
// NUMBER of diffbot items we have.
int nh = 0;
int32_t *ohbuf = od->getDiffbotTitleHashes ( &nh );
if ( ! m_diffbotTitleHashBuf.safeMemcpy ( ohbuf , nh*4 ) )
return NULL;
ptr_linkInfo2 =(LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
size_linkInfo2=m_diffbotTitleHashBuf.length();
}
// but we might have to call reindexJSONObjects() multiple times if
// it would block
if ( od && *recycle &&
// only reindex if it is a query reindex i guess otherwise
// just leave it alone
m_sreqValid && m_sreq.m_isPageReindex &&
od->m_diffbotJSONCount &&
size_linkInfo2 ) {
// similar to od->nukeJSONObjects
int32_t *ohbuf =(int32_t *)m_diffbotTitleHashBuf.getBufStart();
int32_t nh =m_diffbotTitleHashBuf.length() / 4;
int32_t *status = reindexJSONObjects( ohbuf , nh );
if ( ! status || status == (void *)-1) return (char *)status;
}
// just delete the json items whose "title hashes" are present
// in the "old doc" but NOT i the "new doc".
// we use the title hash to construct a unique url for each json item.
// if the title hash is present in both the old and new docs then
// do not delete it here, but we will reindex it later in
// getMetaList() below when we call indexDoc() on each one after
// setting m_dx to each one.
bool nukeJson = true;
if ( ! od ) nukeJson = false;
if ( od && od->m_diffbotJSONCount <= 0 ) nukeJson = false;
// if recycling json objects, leave them there!
if ( *recycle ) nukeJson = false;
// you have to be a diffbot crawl to do this
	// no, not if you have the diffbot api url set... so take this out
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// do not remove old diffbot json objects if pageparser.cpp test
// because that can not change the index, etc.
if ( getIsPageParser() ) nukeJson = false;
if ( nukeJson ) {
// it should only nuke/delete the json items that we LOST,
// so if we still have the title hash in our latest
// diffbot reply, then do not nuke that json item, which
		// will have a url ending in -diffbotxyz%"UINT32"
// (where %"UINT32" is the json item title hash).
// This will download the diffbot reply if not already there.
int32_t numHashes;
int32_t *th = getDiffbotTitleHashes(&numHashes);
if ( ! th && ! g_errno ) { char *xx=NULL;*xx=0; }
if ( ! th || th == (void *)-1 ) return (char *)th;
// this returns false if it blocks
int32_t *status = od->nukeJSONObjects( th , numHashes );
if ( ! status || status == (void *)-1) return (char *)status;
}
// . need this if useTitledb is true
// . otherwise XmlDoc::getTitleRecBuf() cores because its invalid
// . this cores if rebuilding just posdb because hashAll() needs
// the inlink texts for hashing
//if ( m_useTitledb ) {
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 )
return (char *)info1;
//}
// global debug
g_od = od;
/*
// is the document content unchanged?
bool unchanged = false;
if ( od && od->m_contentHash32 == *ch32 ) unchanged = true;
// http status of 304 means "not modified since"
if ( od && *hs == 304 ) unchanged = true;
// compare to last time
if ( od && *tagHash != od->m_tagHash32 ) unchanged = false;
// do not do this if from pageparser.cpp
//if ( m_sreqValid && m_sreq.m_isPageParser ) unchanged = false;
if ( getIsPageParser() ) unchanged = false;
// force reindex if it was from query reindex (docid based spider req)
if ( m_sreqValid && m_sreq.m_urlIsDocId ) unchanged = false;
// if we were turked... how to tell????
if ( m_sreqValid && m_sreq.m_isInjecting ) unchanged = false;
// just turn it all off for now because our parsing logic might
// have changed
unchanged = false;
// set this i guess for doConsistencyTest()
m_unchanged = unchanged;
m_unchangedValid = true;
// . if doc content was unchanged just add the SpiderReply to the
// meta list so that spiderdb knows we attempted it at this time.
// . copy over member vars of the old titleRec/XmlDoc into us so
// we can save time and cpu
if ( unchanged ) {
// this seems to be an issue for blocking
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// all done!
bool addReply = true;
// Scraper.cpp uses this
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
// return nothing if done
if ( ! addReply ) {
m_metaListSize = 0;
m_metaList = (char *)0x1;
return m_metaList;
}
// before getting our spider reply, assign crap from the old
// doc to us since we are unchanged! this will allow us to
// call getNewSpiderReply() without doing any processing, like
// setting the Xml or Words classes, etc.
copyFromOldDoc ( od );
// and don't forget to validate this
int32_t *ic = getIndexCode();
// should never block since we copied from old doc
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
// get our spider reply
SpiderReply *newsr = getNewSpiderReply();
// return on error
if ( ! newsr ) return (char *)newsr;
// . panic on blocking! this is supposed to be fast!
// . it might still have to lookup the tagdb rec?????
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
// unset our ptr_linkInfo1 so we do not free it and core
// since we might have set it in copyFromOldDoc() above
ptr_linkInfo1 = NULL;
size_linkInfo1 = 0;
// how much we need
int32_t needx = sizeof(SpiderReply) + 1;
// doledb key?
if ( m_doledbKey.n0 || m_doledbKey.n1 )
needx += 1 + sizeof(key_t); // + 4;
// the titledb unlock key for msg12 in spider.cpp
needx += 1 + sizeof(key_t);
// make the buffer
m_metaList = (char *)mmalloc ( needx , "metalist");
if ( ! m_metaList ) return NULL;
// save size for freeing later
m_metaListAllocSize = needx;
// ptr and boundary
m_p = m_metaList;
m_pend = m_metaList + needx;
// save it
char *saved = m_p;
// remove from doledb if we had a valid key (BEFORE adding SpiderReply)
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
// note it
setStatus ( "removing key from doledb" );
// . now remove the original spider rec from "doledb"
// . rdbid first
*m_p = RDB_DOLEDB;
m_p++;
// then the key
*(key_t *)m_p = m_doledbKey;
// nukey, clear del bit to delete it
*m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key_t);
// then zero for data size
// *(int32_t *)m_p = 0;
//m_p += 4;
// sanity check
verifyMetaList( m_metaList , m_p );
}
// sanity check
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
*m_p++ = RDB_FAKEDB;
*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
m_p += sizeof(key_t);
// now add the new rescheduled time
// note it
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
*m_p = RDB_SPIDERDB;
// use secondary?
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
m_p++;
// get this
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
// store the spider rec
int32_t newsrSize = newsr->getRecSize();
gbmemcpy ( m_p , newsr , newsrSize );
m_p += newsrSize;
// sanity check
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p );
// verify it
m_metaListValid = true;
// set size
m_metaListSize = m_p - m_metaList;
// all done
return m_metaList;
}
*/
// so getSiteRank() works
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// so addTable144 works
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
//
// . before making the title rec we need to set all the ptrs!
// . so at least now set all the data members we will need to
	// serialize into the title rec because we can't be blocking further
// down below after we set all the hashtables and XmlDoc::ptr_ stuff
if ( ! m_setFromTitleRec || m_useSecondaryRdbs ) {
// all member vars should already be valid if set from titlerec
char *ptg = prepareToMakeTitleRec ();
// return NULL with g_errno set on error
if ( ! ptg || ptg == (void *)-1 ) return (char *)ptg;
}
// sanity check - if the valid title rec is null, m_indexCode is set!
//if ( ! *tr && ! m_indexCode ) { char *xx=NULL;*xx=0; }
// . bail. return an empty meta list, m_metaListSize should be 0!
// . NO! we need to add a SpiderReply to spiderdb...
//if ( ! *tr )
// log("HEY");
/*
if ( ! *tr ) {
m_metaList = "";
m_metaListSize = 0;
m_metaListValid = true;
return m_metaList;
}
*/
// get this for hashing stuff
//Spam *sp = getSpam();
//if ( ! sp || sp == (void *)-1 ) return (char *)sp;
// our next slated spider priority
char *spiderLinks3 = getSpiderLinks();
if ( ! spiderLinks3 || spiderLinks3 == (char *)-1 )
return (char *)spiderLinks3;
bool spideringLinks = *spiderLinks3;
	// shortcut
XmlDoc *nd = this;
///////////////////////////////////
///////////////////////////////////
//
//
// if we had an error, do not add us regardless to the index
// although we might add SOME things depending on the error.
	// Like add the redirecting url if we had an ESIMPLIFIEDREDIR error.
	// So what we add to the Rdbs depends on the indexCode.
//
if ( m_indexCode ) nd = NULL;
// OR if deleting from index, we just want to get the metalist
// directly from "od"
if ( m_deleteFromIndex ) nd = NULL;
//
//
///////////////////////////////////
///////////////////////////////////
if ( ! nd )
spideringLinks = false;
// set these for getNewSpiderReply() so it can set
// SpiderReply::m_wasIndexed and m_isIndexed...
m_wasInIndex = false;
m_isInIndex = false;
if ( od ) m_wasInIndex = true;
if ( nd ) m_isInIndex = true;
m_wasInIndexValid = true;
m_isInIndexValid = true;
// if we are adding a simplified redirect as a link to spiderdb
if ( m_indexCode == EDOCSIMPLIFIEDREDIR )
spideringLinks = true;
// likewise if there error was ENONCANONICAL treat it like that
if ( m_indexCode == EDOCNONCANONICAL )
spideringLinks = true;
//
// . prepare the outlink info if we are adding links to spiderdb!
// . do this before we start hashing so we do not block and re-hash!!
//
if ( spideringLinks && ! m_doingConsistencyCheck && m_useSpiderdb){
setStatus ( "getting outlink info" );
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
//char **iiv = getOutlinkIsIndexedVector();
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
int32_t **ipv = getOutlinkFirstIpVector();
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
//int8_t *hcv = getOutlinkHopCountVector();
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
//char *ipi = getIsIndexed(); // is the parent indexed?
//if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
}
// get the tag buf to add to tagdb
SafeBuf *ntb = NULL;
if ( m_useTagdb && ! m_deleteFromIndex ) {
ntb = getNewTagBuf();
if ( ! ntb || ntb == (void *)-1 ) return (char *)ntb;
}
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
Words *ww = getWords();
if ( ! ww || ww == (void *)-1 ) return (char *)ww;
int64_t *pch64 = getExactContentHash64();
//int64_t *pch64 = getLooseContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) return (char *)pch64;
// get the voting table which we will add to sectiondb
SectionVotingTable *nsvt = NULL;
SectionVotingTable *osvt = NULL;
// seems like
	// sectiondb takes up about 15% of the disk space like this. no!
// cuz then there is revdb, so we are 30%. so that's a no go.
bool addSectionVotes = false;
if ( nd ) addSectionVotes = true;
if ( ! m_useSectiondb ) addSectionVotes = false;
	// to save disk space no longer add the roots! not only saves sectiondb
// but also saves space in revdb
//if ( nd && *isRoot ) addSectionVotes = true;
if ( addSectionVotes ) {
nsvt = getNewSectionVotingTable();
if ( ! nsvt || nsvt == (void *)-1 ) return (char *)nsvt;
// get the old table too!
osvt = getNewSectionVotingTable();
if ( ! osvt || osvt == (void *)-1 ) return (char *)osvt;
}
// get the addresses for hashing tag hashes that indicate place names
Addresses *na = NULL;
//Addresses *oa = NULL;
if ( nd ) na = getAddresses();
//if ( od ) oa = od->getAddresses();
// get dates ready for hashing
Dates *ndp = NULL;
//Dates *odp = NULL;
if ( nd ) {
ndp = nd->getDates();
if ( ! ndp || ndp==(void *)-1) return (char *)ndp;
}
//if ( od ) {
// odp = od->getDates();
// if ( ! odp || odp==(void *)-1) return (char *)odp;
//}
// need firstip if adding a rebuilt spider request
if ( m_useSecondaryRdbs && ! m_isDiffbotJSONObject && m_useSpiderdb ) {
int32_t *fip = getFirstIp();
if ( ! fip || fip == (void *)-1 ) return (char *)fip;
}
// shit, we need a spider reply so that it will not re-add the
// spider request to waiting tree, we ignore docid-based
// recs that have spiderreplies in Spider.cpp
SpiderReply *newsr = NULL;
if ( m_useSpiderdb ) { // && ! m_deleteFromIndex ) {
newsr = getNewSpiderReply();
if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
}
// the site hash for hashing
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (char *)sh32;
// set ptr_clockCandidatesData
// if ( nd ) {
// HashTableX *cct = nd->getClockCandidatesTable();
// if ( ! cct || cct==(void *)-1) return (char *)cct;
// }
if ( m_useLinkdb && ! m_deleteFromIndex ) {
int32_t *linkSiteHashes = getLinkSiteHashes();
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
return (char *)linkSiteHashes;
}
//SafeBuf *au = getDiffbotApiUrl();
//if ( ! au || au == (void *)-1 ) return (char *)au;
// test json parser
//
/*
char *json = "{\"icon\":\"http://www.pixar.com/sites/default/files/pixar_2012_favicon_0.ico\",\"text\":\"\",\"title\":\"Pixar\",\"type\":\"article\",\"media\":[{\"primary\":\"true\",\"link\":\"http://www.pixar.com/sites/default/files/home_billboard_v7.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/roz1_0.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/home_bu-thumb_v1.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/mu_home_thumb.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/brenda_homepage.jpg\",\"type\":\"image\"}],\"url\":\"http://www.pixar.com/\"}";
char *dd = getNextJSONObject ( json );
if ( *dd ) { char *xx=NULL;*xx=0; }
*/
///////////
//
// BEGIN the diffbot json object index hack
//
// if we are using diffbot, then each json object in the diffbot reply
// should be indexed as its own document.
//
///////////
// . get the reply of json objects from diffbot
// . this will be empty if we are a json object!
// . will also be empty if not meant to be sent to diffbot
// . the TOKENIZED reply consists of \0 separated json objects that
// we create from the original diffbot reply
SafeBuf *tdbr = getTokenizedDiffbotReply();
if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr;
// i guess it is safe to do this after getting the spiderreply
SafeBuf *spiderStatusDocMetaList = NULL;
//if ( indexReply ) {
// get the spiderreply ready to be added to the rdbs w/ msg4
// but if doing a rebuild operation then do not get it, we'll rebuild
// it since it will have its own titlerec
if ( ! m_useSecondaryRdbs ) {
spiderStatusDocMetaList =
getSpiderStatusDocMetaList (newsr,forDelete);
if ( ! spiderStatusDocMetaList ) {
log("build: ss doc metalist null. bad!");
return NULL;
}
}
if ( spiderStatusDocMetaList == (void *)-1)
return (char *)spiderStatusDocMetaList;
//}
int32_t tdbrLen = tdbr->length();
// do not index json items as separate docs if we are page parser
if ( getIsPageParser() ) tdbrLen = 0;
// same goes if appending -diffbotxyz%UINT32 would be too long
if ( m_firstUrl.getUrlLen() + 11 + 10 > MAX_URL_LEN )
tdbrLen = 0;
	// once we have the tokenized diffbot reply we can get a unique
// hash of the title of each json item. that way, if a page changes
// and it gains or loses a diffbot item, the old items will still
// have the same url and we can set their m_indexCode to EDOCUNCHANGED
// if the individual json item itself has not changed when we
// call m_dx->indexDoc() below.
int32_t numHashes = 0;
int32_t *titleHashBuf = NULL;
//
// if we got a json object or two from diffbot, index them
// as their own child xmldocs.
// watch out for reply from diffbot of "-1" indicating error!
//
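	// overview of the loop below: the tokenized diffbot reply is a
	// sequence of \0-separated json items. for each item we build a
	// fake url (our url + "-diffbotxyz" + the item's title hash),
	// call m_dx->set4() with that url and the raw json as CT_JSON
	// content, call m_dx->indexDoc(), then reset m_dx and advance
	// m_diffbotObj to the next item. if indexDoc() blocks we return
	// -1; when the child doc finishes, m_masterLoop re-calls us and
	// we fall back into "jsonloop" since m_dx is already allocated.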
if ( tdbrLen > 3 ) {
// get title hashes of the json items
titleHashBuf = getDiffbotTitleHashes(&numHashes);
if (!titleHashBuf || titleHashBuf == (void *)-1){
char *xx=NULL;*xx=0;}
// make sure diffbot reply is valid for sure
if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
// set status for this
setStatus ( "indexing diffbot json doc");
// new guy here
if ( ! m_dx ) {
try { m_dx = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
log("xmldoc: failed to alloc m_dx");
return NULL;
}
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
// we now parse the array of products out of the
// diffbot reply. each product is an item/object.
m_diffbotObj = tdbr->getBufStart();
m_diffbotJSONCount = 0;
}
// loop back up here to process next json object from below
jsonloop:
// if m_dx has no url set, call set4 i guess
if ( ! m_dx->m_contentValid ) {
// sanity. ensure the json item we are trying to
// index has a title hash in this buf
if(m_diffbotJSONCount>=numHashes){char *xx=NULL;*xx=0;}
// get the title of the json we are indexing
int32_t jth = titleHashBuf [ m_diffbotJSONCount ];
// make the fake url for this json object for indexing
SafeBuf fakeUrl;
fakeUrl.set ( m_firstUrl.getUrl() );
			// append -diffbotxyz<titleHash> for the fake url
fakeUrl.safePrintf("-diffbotxyz%"UINT32"",
//(int32_t)m_diffbotJSONCount);
(uint32_t)jth);
if ( fakeUrl.length() > MAX_URL_LEN ) {
log("build: diffbot enhanced url too long for "
"%s",fakeUrl.getBufStart());
char *xx=NULL;*xx=0;
}
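			// e.g. (hypothetical values) a parent url of
			// http://example.com/page whose json item has a
			// title hash of 1234567890 gets the fake url
			// http://example.com/page-diffbotxyz1234567890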
m_diffbotJSONCount++;
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
// string ptr
char *url = fakeUrl.getBufStart();
// use this as the url
strcpy( sreq.m_url, url );
// parentdocid of 0
int32_t firstIp = hash32n ( url );
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = m_hopCount;
sreq.m_hopCountValid = m_hopCountValid;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// so we can match url filters' "insitelist" directive
// in Spider.cpp::getUrlFilterNum()
sreq.m_domHash32 = m_domHash32;
sreq.m_siteHash32 = m_siteHash32;
sreq.m_hostHash32 = m_siteHash32;
// set this
if (!m_dx->set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have
// to be careful since we are a
// niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
m_diffbotObj,
false, // deleteFromIndex ,
0, // forcedIp ,
CT_JSON, // contentType ,
0, // lastSpidered ,
false )) // hasMime
// g_errno should be set!
return NULL;
// we are indexing json objects, don't use all these
m_dx->m_useClusterdb = false;
m_dx->m_useSpiderdb = false;
m_dx->m_useTagdb = false;
m_dx->m_usePlacedb = false;
m_dx->m_useLinkdb = false;
m_dx->m_isChildDoc = true;
m_dx->m_parentDocPtr = this;
// we like to sort json objects using
// 'gbsortby:spiderdate' query to get the most
// recent json objects, so this must be valid
if ( m_spideredTimeValid ) {
m_dx->m_spideredTimeValid = true;
m_dx->m_spideredTime = m_spideredTime;
}
m_dx->m_isDiffbotJSONObject = true;
}
// when the indexdoc completes, or if it blocks, call us!
// we should just pass through here
//xd->setCallback ( this , getMetaListWrapper );
m_dx->setCallback ( m_masterState , m_masterLoop );
///////////////
// . inject the content of the json using this fake url
// . return -1 if this blocks
// . if m_dx got its msg4 reply it ends up here, in which
// case do NOT re-call indexDoc() so check for
// m_listAdded.
///////////////
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
return (char *)-1;
// critical error on our part trying to index it?
// does not include timeouts or 404s, etc. mostly just
// OOM errors.
if ( g_errno ) return NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
		// count this json object as added
cr->m_localCrawlInfo.m_objectsAdded++;
cr->m_globalCrawlInfo.m_objectsAdded++;
cr->m_needsSave = true;
// we successfully index the json object, skip to next one
m_diffbotObj += gbstrlen(m_diffbotObj) + 1;
// but gotta set this crap back
log(LOG_INFO,"diffbot: resetting %s",m_dx->m_firstUrl.m_url);
// clear for next guy if there is one. clears
// m_dx->m_contentValid so the set4() can be called again above
m_dx->reset();
		// have we reached the end of the buffer of json objects? if not, do the next one
if ( m_diffbotObj < tdbr->getBuf() ) goto jsonloop;
}
/////
//
// END the diffbot json object index hack
//
/////
//
// CAUTION
//
// CAUTION
//
// We should never "block" after this point, lest the hashtables
// we create get messed up.
//
//
//
// START HASHING
//
//
// store what we hash into this table
if ( (m_pbuf || m_storeTermListInfo) && ! m_wts ) {
// init it. the value is a TermInfo class. allowDups=true!
m_wtsTable.set (12,sizeof(TermDebugInfo),
0,NULL,0,true,m_niceness,
"wts-tab");
// point to it, make it active
m_wts = &m_wtsTable;
}
// how much to alloc? compute an upper bound
int32_t need = 0;
// should we index this doc?
bool index1 = true;
setStatus ( "hashing posdb and datedb terms" );
// . hash our documents terms into "tt1"
// . hash the old document's terms into "tt2"
	// . by old, we mean the older versioned doc of this url spidered before
HashTableX tt1;
HashTableX tt2;
// how many words we got?
int32_t nw = m_words.getNumWords();
// . prepare it, 5000 initial terms
	// . make it nw*4 + 5000 to avoid having to re-alloc the table!!!
// . i guess we can have link and neighborhood text too! we don't
// count it here though... but add 5k for it...
int32_t need4 = nw * 4 + 5000;
if ( nd && index1 && m_usePosdb ) {
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,
"posdb-indx"))
return NULL;
int32_t did = tt1.m_numSlots;
//bool index2 = true;
// . hash the document terms into "tt1"
// . this is a biggie!!!
// . only hash ourselves if m_indexCode is false
// . m_indexCode is non-zero if we should delete the doc from
// index
// . i think this only adds to posdb
//log("xmldoc: CALLING HASHALL");
// shit, this blocks which is bad!!!
char *nod = hashAll ( &tt1 ) ;
// you can't block here because if we are re-called we lose tt1
if ( nod == (char *)-1 ) { char *xx=NULL;*xx=0; }
// error?
if ( ! nod ) return NULL;
int32_t done = tt1.m_numSlots;
if ( done != did )
log("xmldoc: reallocated big table! bad. old=%"INT32" "
"new=%"INT32" nw=%"INT32"",did,done,nw);
}
// if indexing the spider reply as well under a different docid
// there is no reason we can't toss it into our meta list here
if ( spiderStatusDocMetaList )
need += spiderStatusDocMetaList->length();
// now we use revdb
// before hashing the old doc into it
//if ( od && index2 ) {
// // if this hash table init fails, return NULL
// if (!tt2.set(12,4,5000,NULL,0,false,m_niceness)) return NULL;
// char *rod = od->hash ( &tt2 ) ;
// if ( ! rod || rod == (char *)-1 ) return rod;
//}
// space for indexdb AND DATEDB! +2 for rdbids
int32_t needIndexdb = 0;
needIndexdb +=tt1.m_numSlotsUsed*(sizeof(key144_t)+2+sizeof(key128_t));
//needIndexdb+=tt2.m_numSlotsUsed * (sizeof(key_t)+2+sizeof(key128_t));
need += needIndexdb;
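	// note: "needIndexdb" budgets sizeof(key144_t)+2 for each posdb
	// key plus rdbId AND sizeof(key128_t) for a datedb key per slot,
	// even though we may no longer add datedb keys, so this is an
	// upper bound rather than an exact size.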
// sanity check
//if ( ! od && m_skipIndexing && needIndexdb ) { char *xx=NULL;*xx=0; }
// . sanity check - must have one or the other!
// . well, not in the case of EDOCNOTNEW or EDOCNOTOLD, in which
// case we just remove ourselves from spiderdb, and in the case
// of EDOCNOTOLD, from tfndb as well
//if ( ! od && ! nd ) { char *xx=NULL;*xx=0; }
// what pub dates do the old and new doc have? -1 means none.
int32_t date1 = -1; if ( nd ) date1 = nd->m_pubDate;
//int32_t date2 = -1; if ( od ) date2 = od->m_pubDate;
// now we also add the title rec. true = ownsCbuf? ret NULL on error
// with g_errno set.
//if ( nd && ! nd->compress( true , m_niceness ) ) return NULL;
/*
now we have the bit in the posdb key, so this should not be needed...
	use Posdb::isShardedByTermId() to see if it is such a special case key
like Hostdb::getShardNum() now does...
setStatus ( "hashing nosplit keys" );
// hash no split terms into ns1 and ns2
HashTableX ns1;
// prepare it, 500 initial terms
if ( ! ns1.set ( 18 , 4 , 500,NULL,0,false,m_niceness,"nosplt-indx" ))
return NULL;
// . hash for no splits
// . like above, but these are "no split" termids
if ( nd && m_usePosdb && ! hashNoSplit ( &ns1 ) ) return NULL;
//if(index2 && od && ! od->hashNoSplit ( &ns2 ) ) return NULL;
// needs for hashing no split terms
int32_t needNoSplit1 = 0;
// add em up. +1 for rdbId. add to both indexdb AND datedb i guess...
needNoSplit1 += ns1.m_numSlotsUsed * (18+1); // +16+1);
//needNoSplit += ns2.m_numSlotsUsed * (12+1+16+1);
// add it in
need += needNoSplit1;
// sanity check
//if ( ! od && m_skipIndexing && needNoSplit ) { char *xx=NULL;*xx=0; }
*/
setStatus ( "hashing sectiondb keys" );
// add in special sections keys. "ns" = "new sections", etc.
// add in the special nosplit datedb terms from the Sections class
// these hash into the term table so we can do incremental updating
HashTableX st1; // <key128_t,char> dt1;
//HashTableX st2; // <key128_t,char> dt2;
// set key/data size
int32_t svs = sizeof(SectionVote);
st1.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness,"sectdb-indx");
// tell hashtable to use the sectionhash for determining the slot,
// not the lower 4 bytes because that is the docid which is the
// same for every key
st1.m_maskKeyOffset = 6;
//st2.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness);
// do not bother if deleting
if ( m_indexCode ) nsvt = NULL;
// . now we hash the root just to get some section votes i guess
//if ( nts && ! *isr ) nsvt = NULL;
	// if the old voting table has more than 100,000 votes forget it!!! do
// not bloat sectiondb that big...
if ( osvt && osvt->m_totalSiteVoters >= MAX_SITE_VOTERS ) nsvt = NULL;
// hash terms into a table that uses full datedb keys
if ( nsvt && ! nsvt->hash (m_docId,&st1,*sh64,m_niceness)) return NULL;
// needs for hashing no split terms
int32_t needSectiondb = 0;
// add em up. plus one for rdbId
needSectiondb += st1.m_numSlotsUsed * (16+svs+1);
//needSectiondb += st2.m_numSlotsUsed * (16+svs+1);
// add it in
need += needSectiondb;
// Sections::respiderLineWaiters() adds one docid-based spider rec
// for every url waiting in line. Sections::m_numLineWaiters. assume
// 64 bytes per line waiter spider rec i guess
//int32_t needLineWaiters = 0;
// +1 for rdbId
//if ( ns ) needLineWaiters = ns->m_numLineWaiters * 64;
// forgot to add this?
//need += needLineWaiters;
// . for adding Sections.cpp keys
// . Sections::hash() does not bother with invalid sections
// . waitInLine might be true in Sections::hash() too, so always add 12
//if ( ns ) need += (ns->m_numSections - ns->m_numInvalids)*12 + 12;
//if ( os ) need += (os->m_numSections - os->m_numInvalids)*12 + 12;
// for adding Addresses::m_keys[] (Addresses::hash())
//if ( na ) need += (na->m_numKeys * 16);
//if ( oa ) need += (oa->m_numKeys * 16);
// don't forget Dates!
//if ( ndp ) need += ndp->m_numPubDates * sizeof(key_t);
//if ( odp ) need += odp->m_numPubDates * sizeof(key_t);
// clusterdb keys. plus one for rdbId
int32_t needClusterdb = 0;
//if ( nd && ! nd->m_skipIndexing ) needClusterdb += 13;
//if ( od && ! od->m_skipIndexing ) needClusterdb += 13;
if ( nd ) needClusterdb += 13;
//if ( od ) needClusterdb += 13;
need += needClusterdb;
// . LINKDB
// . linkdb records. assume one per outlink
// . we may index 2 16-byte keys for each outlink
Links *nl2 = NULL;
//if ( spideringLinks ) nl2 = &m_links;
// if injecting, spideringLinks is false, but then we don't
// add the links to linkdb, which causes the qainlinks() test to fail
nl2 = &m_links;
// do not bother if deleting. but we do add simplified redirects
// to spiderdb as SpiderRequests now.
int32_t code = m_indexCode;
if ( code == EDOCSIMPLIFIEDREDIR ) code = 0;
if ( code == EDOCNONCANONICAL ) code = 0;
if ( code ) nl2 = NULL;
//Links *ol = NULL; if ( od ) ol = od->getLinks();
// . set key/data size
// . use a 16 byte key, not the usual 12
// . use 0 for the data, since these are pure keys, which have no
// scores to accumulate
HashTableX kt1;
//HashTableX kt2;
int32_t nis = 0;
if ( nl2 && m_useLinkdb ) nis = nl2->getNumLinks() * 4;
// pre-grow table based on # outlinks
kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" );
// use magic to make fast
kt1.m_useKeyMagic = true;
// linkdb keys will have the same lower 4 bytes, so make hashing fast.
// they are 28 byte keys. bytes 20-23 are the hash of the linkEE
// so that will be the most random.
kt1.m_maskKeyOffset = 20;
// faster
//kt2.set ( sizeof(key128_t) , 0,0,NULL,0,false,m_niceness );
// do not add these
//bool add1 = true;
// do not add negative key if no old doc
//if ( ! od ) add2 = false;
// . we already have a Links::hash into the Termtable for links: terms,
// but this will have to be for adding to Linkdb. basically take a
// lot of it from Linkdb::fillLinkdbList()
// . these return false with g_errno set on error
if ( m_useLinkdb && nl2 && ! hashLinksForLinkdb(&kt1) ) return NULL;
//if ( add2 && ol && ! !od->m_skipIndexing &&
// ol->hash(&kt2,od,m_niceness) )
// return NULL;
// add up what we need. +1 for rdbId
int32_t needLinkdb = 0;
needLinkdb += kt1.m_numSlotsUsed * (sizeof(key224_t)+1);
//needLinkdb += kt2.m_numSlotsUsed * (sizeof(key128_t)+1);
need += needLinkdb;
// sanity check
//if ( ! od && m_skipIndexing && needLinkdb ) { char *xx=NULL;*xx=0; }
// PLACEDB
HashTableX pt1;
//HashTableX pt2;
// . set key/data size
// . limit every address to 512 bytes
pt1.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness,"placedb-indx");
//pt2.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness);
//
// if this is true, then we just store the placedb recs
// directly into the title rec. That way we do not have
// to store the content of the web page, and we save space.
//
// otherwise, we have to parse out the sections and it is much slower
//else if (oa && !oa->hashForPlacedb(m_docId,*sh32,*od->getIp(),&pt2) )
// return NULL;
// hash terms into a table that uses full datedb keys
if ( na && !na->hashForPlacedb(m_docId,*sh32,*nd->getIp(),&pt1))
return NULL;
setStatus("hashing place info");
int32_t needPlacedb = 0;
// . +1 for rdbId
// . up to 512 bytes per address
needPlacedb += pt1.m_numSlotsUsed * (sizeof(key128_t)+1+512);
//needPlacedb += pt2.m_numSlotsUsed * (sizeof(key128_t)+1+512);
need += needPlacedb;
// sanity check -- coring here because we respider the page and
// the address is gone so it tries to delete it!
//if ( ! od && m_skipIndexing && needPlacedb ) { char *xx=NULL;*xx=0; }
// we add a negative key to doledb usually (include datasize now)
int32_t needDoledb = sizeof(key_t) + 1 ; // + 4;
if ( forDelete ) needDoledb = 0;
need += needDoledb;
// for adding the SpiderReply to spiderdb (+1 for rdbId)
int32_t needSpiderdb1 = sizeof(SpiderReply) + 1;
if ( forDelete ) needSpiderdb1 = 0;
need += needSpiderdb1;
// if injecting we add a spiderrequest to be able to update it
// but don't do this if it is pagereindex. why is pagereindex
// setting the injecting flag anyway?
int32_t needSpiderdb3 = 0;
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
! m_sreq.m_forceDelete &&
// do not rebuild spiderdb if only rebuilding posdb
// this is explicitly for injecting so we need to add
// the spider request to spiderdb...
//m_useSpiderdb &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject ) {
needSpiderdb3 = m_sreq.getRecSize() + 1;
// NO! because when injecting a warc and the subdocs
// it contains, gb then tries to spider all of them !!! sux...
needSpiderdb3 = 0;
}
// or if we are rebuilding spiderdb
else if (m_useSecondaryRdbs && !m_isDiffbotJSONObject && m_useSpiderdb)
needSpiderdb3 = sizeof(SpiderRequest) + m_firstUrl.m_ulen+1;
need += needSpiderdb3;
//int32_t needSpiderdb3 = 0;
//if ( m_sreqValid ) needSpiderdb3 = m_sreq.getRecSize() + 1;
//need += needSpiderdb3;
// . for adding our outlinks to spiderdb
// . see SpiderRequest::getRecSize() for description
// . SpiderRequest::getNeededSize() will include the null terminator
int32_t hsize = SpiderRequest::getNeededSize ( 0 );
int32_t needSpiderdb2 = hsize * m_links.getNumLinks();
// and the url buffer of outlinks. includes \0 terminators i think
needSpiderdb2 += m_links.getLinkBufLen();
	// don't need this if doing consistency check
if ( m_doingConsistencyCheck ) needSpiderdb2 = 0;
// nor for generating the delete meta list for incremental indexing
if ( forDelete ) needSpiderdb2 = 0;
// accumulate it
need += needSpiderdb2;
// the new tags for tagdb
int32_t needTagdb = 0;
if ( ntb ) needTagdb = ntb->length() ;
// add 1 byte for up to 128 rdbids
//needTagdb += needTagdb/sizeof(Tag) + 1;
// add that in
need += needTagdb;
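	// unlike "needx" in the unchanged/error branch above, "need" is
	// only an upper bound for the full meta list. each section below
	// does its own (m_p - saved > needX) sanity check after appending
	// its records.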
// . add in title rec size
// . should be valid because we called getTitleRecBuf() above
// . this should include the key
// . add in possible negative key for deleting old title rec
//int32_t needTitledb = sizeof(key96_t);
// +1 for rdbId
//if ( nd && m_useTitledb ) needTitledb = m_titleRecSize + 1;
//need += needTitledb;
//
// . CHECKSUM PARSING CONSISTENCY TEST
//
// . set m_metaListChecksum member (will be stored in titleRec header)
// . gotta set m_metaListCheckSum8 before making titleRec below
// . also, if set from titleRec, verify metalist is the same!
//
if ( ! m_computedMetaListCheckSum ) {
// do not call twice!
m_computedMetaListCheckSum = true;
// all keys in tt1, ns1, kt1 and pt1
int32_t ck32 = 0;
ck32 ^= tt1.getKeyChecksum32();
// show tt1
//
// UNCOMMENT this to debug parsing inconsistencies!!!
//
// SafeBuf sb;
// tt1.print(&sb);
// if(sb.getBufStart()) fprintf(stderr,"%s", sb.getBufStart());
//ck32 ^= ns1.getKeyChecksum32();
//ck32 ^= kt1.getKeyChecksum32();
//ck32 ^= pt1.getKeyChecksum32();
// set this before calling getTitleRecBuf() below
uint8_t currentMetaListCheckSum8 = (uint8_t)ck32;
// see if matches what was in old titlerec
if ( m_metaListCheckSum8Valid &&
// if we were set from a titleRec, see if we got
// a different hash of terms to index this time around...
m_setFromTitleRec &&
// fix for import log spam
! m_isImporting &&
m_version >= 120 &&
m_metaListCheckSum8 != currentMetaListCheckSum8 ) {
log("xmldoc: checksum parsing inconsistency for %s "
"(old)%i != %i(new). Uncomment tt1.print() "
"above to debug.",
m_firstUrl.getUrl(),
(int)m_metaListCheckSum8,
(int)currentMetaListCheckSum8);
// if doing qa test drop core
CollectionRec *cr = getCollRec();
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 ) {
log("xmldoc: sleep 1000");
sleep(1000);
exit(0);}//char *xx=NULL;*xx=0; }
}
// assign the new one, getTitleRecBuf() call below needs this
m_metaListCheckSum8 = currentMetaListCheckSum8;
m_metaListCheckSum8Valid = true;
}
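	// note: the 8-bit checksum stored in the titlerec header is just
	// the low byte of tt1's 32-bit key checksum. it is cheap, but it
	// is enough to flag the parsing inconsistency logged above when a
	// titlerec written by an older parser version is re-processed.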
//
// now that we've set all the ptr_* members vars, we can make
// the title rec
//
// . MAKE the title rec from scratch, that is all we need at this point
// . sets m_indexCode to EDOCNOTNEW or EDOCNOTOLD sometimes
// . if repairing and not rebuilding titledb, we do not need the
// titlerec
if ( m_useTitledb ) {
// this buf includes key/datasize/compressdata
SafeBuf *tr = getTitleRecBuf ();
// panic if this blocks! it should not at this point because
// we'd have to re-hash the crap above
if ( tr == (void *) -1 ) { char *xx=NULL;*xx=0; }
// return NULL with g_errno set on error
if ( ! tr ) return (char *)tr;
// sanity check - if the valid title rec is null,
// m_indexCode is set!
if ( tr->length()==0 && ! m_indexCode ) { char *xx=NULL;*xx=0;}
}
// . add in title rec size
// . should be valid because we called getTitleRecBuf() above
// . this should include the key
// . add in possible negative key for deleting old title rec
int32_t needTitledb = sizeof(key96_t) + 1;
// +1 for rdbId
if ( nd && m_useTitledb && ! forDelete )
needTitledb += m_titleRecBuf.length();
// set new and old keys for titledb
//key_t ok;
key_t nk;
//ok.setMin();
nk.setMin();
//if ( od ) ok = *od->getTitleRecKey();
if ( nd && m_useTitledb ) nk = *nd->getTitleRecKey();
//if ( od && m_useTitledb && ok != nk ) needTitledb += sizeof(key_t)+1;
if ( m_useTitledb ) {
// then add it in
need += needTitledb;
// the titledb unlock key for msg12 in spider.cpp
need += sizeof(key_t);
}
//
// now space for the revdb record, which is the meta list itself!
//
//need = need + 12 + 4 + need;
// . alloc mem for metalist
// . sanity
if ( m_metaListSize > 0 ) { char *xx=NULL;*xx=0; }
// make the buffer
m_metaList = (char *)mmalloc ( need , "metalist");
if ( ! m_metaList ) return NULL;
// save size for freeing later
m_metaListAllocSize = need;
// ptr and boundary
m_p = m_metaList;
m_pend = m_metaList + need;
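	// from here on each section appends records to m_metaList in the
	// same general format:
	//   [1 byte rdbId][key bytes][dataSize + data for rdbs with
	//    variable-size records like titledb/sectiondb/placedb]
	// rdbs with pure fixed-size keys (posdb, clusterdb, linkdb) just
	// store the bare key after the rdbId byte.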
//
// TITLEDB
//
setStatus ("adding titledb recs");
// checkpoint
char *saved = m_p;
// . delete old title rec key if different
// . Repair.cpp might set useTitledb to false!
//if ( od && m_useTitledb && ok != nk ) {
// // rdbId
// *m_p++ = RDB_TITLEDB;
// // key
// *(key_t *)m_p = *od->getTitleRecKey();
// // make it negative
// *m_p &= 0xfe;
// // skip over it
// m_p += sizeof(key_t);
// // then data size, 0
// //*(int32_t *)m_p = 0;
// //m_p+= 4;
//}
// . store title rec
// . Repair.cpp might set useTitledb to false!
if ( nd && m_useTitledb ) {
// rdbId
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_TITLEDB2;
else *m_p++ = RDB_TITLEDB;
// sanity
if ( ! nd->m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
// key, dataSize, data is the whole rec
int32_t tsize = nd->m_titleRecBuf.length();
// if getting an "oldList" to do incremental posdb updates
// then do not include the data portion of the title rec
if ( forDelete ) tsize = sizeof(key_t);
gbmemcpy ( m_p , nd->m_titleRecBuf.getBufStart() , tsize );
// make it a negative key
//if ( forDelete ) *m_p = *m_p & 0xfe;
m_p += tsize;//nd->m_titleRecSize;
// store a zero datasize, key is still positive until the dt8
// table deletes it
//if ( forDelete ) { *(int32_t *)m_p = 0; m_p += 4; }
}
// sanity check
if ( m_p - saved > needTitledb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD BASIC INDEXDB/DATEDB TERMS
//
setStatus ( "adding posdb and datedb terms");
// checkpoint
saved = m_p;
// store indexdb terms into m_metaList[]
if ( m_usePosdb && ! addTable144 ( &tt1 , m_docId )) return NULL;
//if(!addTable96 ( &tt2, &tt1, date2, date1, true ,false)) return NULL;
//if ( od ) tt2.clear();
// sanity check
if ( m_p - saved > needIndexdb ) { char*xx=NULL;*xx=0; }
// free all mem
tt1.reset();
//tt2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD NOSPLIT INDEXDB/DATEDB TERMS
//
/*
we added these now in hashAll() to tt1, no longer ns1 since we
have the sharded by termid bit in the actual posdb key now so
Rebalance.cpp works
setStatus ( "adding posdb shardByTermId terms");
// checkpoint
saved = m_p;
// no longer anything special now since the
	// Posdb::isShardedByTermId() bit
// is in the key now so Rebalance.cpp can work
if ( m_usePosdb && ! addTable144 ( &ns1 )) return NULL;
//if(! addTable96 ( &ns2, &ns1, -1, -1, true ,true)) return NULL;
// sanity check
if ( m_p - saved > needNoSplit1 ) { char*xx=NULL;*xx=0; }
// free all mem
ns1.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
*/
/*
setStatus ( "adding datedb nosplit terms");
// checkpoint
saved = m_p;
// this is now for datedb
if ( m_useDatedb && ! addTableDate(&ns2,m_docId,RDB_DATEDB,true))
return NULL;
// sanity check
if ( m_p - saved > needNoSplit2 ) { char*xx=NULL;*xx=0; }
// free all mem
ns2.reset();
// sanity check
verifyMetaList( m_metaList , m_p );
*/
//
// ADD SECTIONS SPECIAL TERMS
//
setStatus ( "adding sectiondb keys");
// checkpoint
saved = m_p;
// add that table to the metalist
if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete))
return NULL;
//if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL;
// sanity check
if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; }
// free mem
st1.reset();
//st2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD CLUSTERDB KEYS
//
setStatus ( "adding clusterdb keys" );
// checkpoint
saved = m_p;
// . do we have adult content?
// . should already be valid!
if ( nd && ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
// . get new clusterdb key
// . we use the host hash for the site hash! hey, this is only 26 bits!
key_t newk ; newk.setMin();
if ( nd )
newk = g_clusterdb.makeClusterRecKey ( *nd->getDocId() ,
*nd->getIsAdult() ,
*nd->getLangId(),
nd->getHostHash32a(),
false ); // del?
//key_t oldk; oldk.setMin();
//if ( od ) // && add2 )
// oldk = g_clusterdb.makeClusterRecKey ( *od->getDocId(),
// *od->getIsAdult() ,
// *od->getLangId() ,
// od->getHostHash32a(),
// true ); // del?
// . store old only if new tr is good and keys are different from old
// . now we store even if skipIndexing is true because i'd like to
// see how many titlerecs we have and count them towards the
// docsIndexed count...
if ( nd && m_useClusterdb ) {
// store rdbid
*m_p = RDB_CLUSTERDB;
// use secondary if we should
if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
// skip
m_p++;
// and key
*(key_t *)m_p = newk;
// skip it
m_p += sizeof(key_t);
}
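	// the clusterdb key packs the docid, adult bit, language id and
	// the (26-bit) host hash so that results from the same site can
	// be clustered at query time without having to load each titlerec.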
// store new if different
//if ( od && ( ! nd || newk != oldk ) ) { // && !od->m_skipIndexing ) {
// // store rdbid
// *m_p = RDB_CLUSTERDB;
// // use secondary if we should
// if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
// // skip
// m_p++;
// // turn on last bit (undo del)
// //newk.n0 |= 0x01;
// // and key
// *(key_t *)m_p = oldk;
// // skip it
// m_p += sizeof(key_t);
//}
// sanity check
if ( m_p - saved > needClusterdb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD LINKDB KEYS
//
setStatus ( "adding linkdb keys" );
// checkpoint
saved = m_p;
// add that table to the metalist (LINKDB)
if ( m_useLinkdb && !addTable224(&kt1))
return NULL;
//if(add2&&!addTable128(&kt2,&kt1,RDB_LINKDB, false))return NULL;
// sanity check
if ( m_p - saved > needLinkdb ) { char *xx=NULL;*xx=0; }
// all done
kt1.reset();
//kt2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// . ADD ADDRESSES TO NAMEDB/PLACEDB
// . key is basically a hash of the address (excluding place name
// and street indicators)
//
setStatus ( "adding to placedb" );
// checkpoint
saved = m_p;
// add that table to the metalist
if ( m_usePlacedb && ! addTable128 ( &pt1, RDB_PLACEDB,forDelete))
return NULL;
//if(! addTable128 ( &pt2, &pt1, RDB_PLACEDB, true , true))return NULL;
// sanity check
if ( m_p - saved > needPlacedb ) { char *xx=NULL;*xx=0; }
// free mem
pt1.reset();
//pt2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
/*
//
// ADD REVDB RECORD
//
//
// . add the metalist to itself
// . this way, when we delete this doc from the index, we just
// lookup the original metalist in revdb, set all the
	// delbits, and re-add that. this avoids having to ensure
// parsing consistency, which is a royal pain in the ass
// . now we also update getMetaList() to check revdb to get
// the meta list if the doc is already indexed...
//
// define current meta list
char *x = m_metaList;
char *xend = m_p;
// skip adding to revdb?
if ( ! m_useRevdb ) xend = x;
int32_t *dataSizePtr;
char *savedp;
// if nothing in current list do not add revdb rec
bool hadStuff = ( x < xend);
if ( hadStuff ) {
// put in the rdbId
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_REVDB2;
else *m_p++ = RDB_REVDB;
// the key
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
*(key_t *)m_p = g_revdb.makeKey ( m_docId , false );
m_p += sizeof(key_t);
// data size
dataSizePtr = (int32_t *)m_p;
// skip for now
m_p += 4;
// save it
savedp = m_p;
}
// scan the current metalist and add keys to the revdb record
for ( ; x < xend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save this
char byte = *x;
// get rdbId
char rdbId = byte & 0x7f;
//
// convert if adding to secondary rdbids!!!!!!!!
//
if ( m_useSecondaryRdbs ) {
if ( rdbId == RDB2_POSDB2 )
rdbId = RDB_POSDB;
else if ( rdbId == RDB2_DATEDB2 )
rdbId = RDB_DATEDB;
else if ( rdbId == RDB2_SECTIONDB2 )
rdbId = RDB_SECTIONDB;
else if ( rdbId == RDB2_PLACEDB2 )
rdbId = RDB_PLACEDB;
else if ( rdbId == RDB2_TITLEDB2 )
rdbId = RDB_TITLEDB;
else if ( rdbId == RDB2_LINKDB2 )
rdbId = RDB_LINKDB;
else if ( rdbId == RDB2_CLUSTERDB2 )
rdbId = RDB_CLUSTERDB;
else if ( rdbId == RDB2_SPIDERDB2 )
rdbId = RDB_SPIDERDB;
else if ( rdbId == RDB2_TAGDB2 )
rdbId = RDB_TAGDB;
// must be covered!!
else { char *xx=NULL;*xx=0; }
// rewrite byte now b/c we store it below
byte = (byte & 0x80) | rdbId;
}
// skip that
x++;
// copy that over
*m_p++ = byte;
// sanity check -- no negative keys allowed in here
if ( (x[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
// get key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// copy that over
gbmemcpy ( m_p , x , ks );
// skip that
m_p += ks;
x += ks;
// datasize?
int32_t ds = getDataSizeFromRdbId(rdbId);
if ( ds == -1 ) {
ds = *(int32_t *)x;
x += 4;
}
// skip data
x += ds;
}
// record size of what we wrote
if ( hadStuff )
*dataSizePtr = ( m_p - savedp );
// sanity check
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
// sanity check
verifyMetaList( m_metaList , m_p );
*/
//////
//
	// add SPIDERREPLY BEFORE SPIDERREQUEST!!!
//
// add spider reply first so we do not immediately respider
// this same url if we were injecting it because no SpiderRequest
// may have existed, and SpiderColl::addSpiderRequest() will
// spawn a spider of this url again unless there is already a REPLY
// in spiderdb!!! crazy...
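	// note on layout: each spiderdb entry added below is the 1-byte
	// rdbId followed by the full serialized SpiderReply/SpiderRequest
	// record (key included); its length comes from getRecSize()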
bool addReply = true;
// Scraper.cpp uses this
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
// save it
saved = m_p;
// now add the new rescheduled time
if ( addReply && m_useSpiderdb && ! forDelete ) {
// note it
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
*m_p = RDB_SPIDERDB;
// use secondary?
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
m_p++;
// get this
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
// store the spider rec
int32_t newsrSize = newsr->getRecSize();
gbmemcpy ( m_p , newsr , newsrSize );
m_p += newsrSize;
m_addedSpiderReplySize = newsrSize;
m_addedSpiderReplySizeValid = true;
// sanity check - must not be a request, this is a reply
if ( g_spiderdb.isSpiderRequest( &newsr->m_key ) ) {
char *xx=NULL;*xx=0; }
// sanity check
if ( m_p - saved != needSpiderdb1 ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
}
// if we are injecting we must add the spider request
// we are injecting from so the url can be scheduled to be
// spidered again.
// NO! because when injecting a warc and the subdocs
// it contains, gb then tries to spider all of them !!! sux...
if ( needSpiderdb3 ) {
// note it
setStatus("adding spider request");
// checkpoint
saved = m_p;
// store it here
SpiderRequest revisedReq;
// if doing a repair/rebuild of spiderdb...
if ( m_useSecondaryRdbs )
getRebuiltSpiderRequest ( &revisedReq );
// this fills it in for doing injections
if ( ! m_useSecondaryRdbs ) {
getRevisedSpiderRequest ( &revisedReq );
// sanity log
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( m_firstIp == 0 || m_firstIp == -1 ) {
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error3 getting real firstip of "
"%"INT32" for %s. not adding new request.",
(int32_t)m_firstIp,url);
goto skipNewAdd2;
}
}
// copy it
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_SPIDERDB2;
else *m_p++ = RDB_SPIDERDB;
// store it back
gbmemcpy ( m_p , &revisedReq , revisedReq.getRecSize() );
// skip over it
m_p += revisedReq.getRecSize();
// sanity check
if ( m_p - saved > needSpiderdb3 ) { char *xx=NULL;*xx=0; }
m_addedSpiderRequestSize = revisedReq.getRecSize();
m_addedSpiderRequestSizeValid = true;
}
skipNewAdd2:
//
// ADD SPIDERDB RECORDS of outlinks
//
// - do this AFTER computing revdb since we do not want spiderdb recs
// to be in revdb.
//
setStatus ( "adding spiderdb keys" );
// sanity check. cannot spider until in sync
if ( ! isClockInSync() ) { char *xx=NULL;*xx=0; }
// checkpoint
saved = m_p;
// . should be fixed from Links::setRdbList
// . we should contain the msge that msg16 uses!
// . we were checking m_msg16.m_recycleContent, but i have not done
// that in years!!! MDW
// . we were also checking if the # of banned outlinks >= 2, then
// we would not do this...
	// . should also add with a time of now plus 5 seconds so that if
	//   we spider an outlink, linkdb will already be updated with this
	//   doc pointing to it and the outlink can get its link text then!!
if ( spideringLinks && nl2 && ! m_doingConsistencyCheck &&
m_useSpiderdb && ! forDelete ){
// returns NULL and sets g_errno on error
char *ret = addOutlinkSpiderRecsToMetaList ();
// sanity check
if ( ! ret && ! g_errno ) { char *xx=NULL;*xx=0; }
// return NULL on error
if ( ! ret ) return NULL;
// this MUST not block down here, to avoid re-hashing above
if ( ret == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
// sanity check
if ( m_p - saved > needSpiderdb2 ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD TAG RECORDS TO TAGDB
//
// checkpoint
saved = m_p;
// . only do this if NOT setting from a title rec
// . it might add a bunch of forced spider recs to spiderdb
// . store into tagdb even if indexCode is set!
if ( ntb && m_useTagdb && ! forDelete ) {
// ntb is a safebuf of Tags, which are already Rdb records
// so just gbmemcpy them directly over
char *src = ntb->getBufStart();
int32_t srcSize = ntb->length();
gbmemcpy ( m_p , src , srcSize );
m_p += srcSize;
}
// sanity check
if ( m_p - saved > needTagdb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD INDEXED SPIDER REPLY with different docid so we can
// search index of spider replies! (NEW!)
//
// . index spider reply with separate docid so they are all searchable.
// . see getSpiderStatusDocMetaList() function to see what we index
// and the titlerec we create for it
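	// . that buffer is already in metalist format (rdbId bytes, keys
	//   and data), so it gets appended verbatim below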
if ( spiderStatusDocMetaList ) {
gbmemcpy ( m_p ,
spiderStatusDocMetaList->getBufStart() ,
spiderStatusDocMetaList->length() );
m_p += spiderStatusDocMetaList->length();
m_addedStatusDocSize = spiderStatusDocMetaList->length();
m_addedStatusDocSizeValid = true;
}
/*
//
// ADD FORCED RESPIDER DOCID-BASED SPIDER RECS for Sections
//
// used by Sections.cpp to respider docs because we just identified an
// article section and they need to be re-indexed to take advantage
// of that
//
// checkpoint
saved = m_p;
// . only do this if NOT setting from a title rec
// . it might add a bunch of forced spider recs to spiderdb
if ( ! m_setFromTitleRec && nd ) { // && ! m_isInjecting ) {
Sections *ss = &m_sections;
m_p = ss->respiderLineWaiters ( m_p , m_pend );
if ( ! m_p ) return NULL;
}
// sanity check
if ( m_p - saved > needLineWaiters ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p );
*/
//
// NOW UPDATE OURSELVES (OUR URL) IN SPIDERDB
//
// but not if injecting!
//if ( ! m_sreqValid ) {
// // set the list size, different from the alloc size
// m_metaListSize = m_p - m_metaList;
// // all done
// return m_metaList;
//}
// note it
//setStatus ( "deleting old spider rec key" );
// rdbid first
// *p = RDB_SPIDERDB;
// use secondary?
//if ( m_useSecondaryRdbs ) *p = RDB2_SPIDERDB2;
//p++;
// must be legit
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
// then the key
// *(key_t *)p = m_sreq.m_key;
// nukey, clear del bit to delete it
// *p &= 0xfe;
// skip key
//p += sizeof(key_t);
	// shortcut
saved = m_p;
/*
See comment under DOLEDB above! this approach is no longer used.
// . remove from doledb if we had a valid key
// . DO THIS BEFORE adding the SpiderReply since
// Spider.cpp::addSpiderReply() will
// decrement the count for firstIp in m_doleIpTable
if ( (m_doledbKey.n0 || m_doledbKey.n1) &&
! m_useSecondaryRdbs &&
// do not add if we are generating the meta list for incremental
// indexing purposes from an old doc
! forDelete ) {
// note it
setStatus ( "removing key from doledb" );
// . now remove the original spider rec from "doledb"
// . rdbid first
*m_p = RDB_DOLEDB;
m_p++;
// then the key
*(key_t *)m_p = m_doledbKey;
// nukey, clear del bit to delete it
*m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key_t);
// datasize is 0
// *(int32_t *)m_p = 0;
//m_p += 4;
// sanity check
if ( m_p - saved != needDoledb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
}
*/
// note it
//setStatus ( "removing spider lock");
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
// . no need to do this if called from Repair.cpp
// . the uh48 is zero, that means fake!
// . i added "&& m_useSpiderdb" here because it was messing up
// the cacheTermLists() function which ONLY wants posdb keys and
// any other keys in the metalist messes it up. MDW 1/26/13
	// . now Spider.cpp uses SpiderReply reception to remove lock
// - mdw 9/28/13
//if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) {
// *m_p++ = RDB_FAKEDB;
// ((key_t *)m_p)->n1 = 0;
// ((key_t *)m_p)->n0 = m_docId;
// //= g_titledb.makeKey ( m_docId , 0LL , true );
// m_p += sizeof(key_t);
//}
// MDW: new spider algo does not need this
/*
// save it
saved = m_p;
// re-add the same request since it was removed from Spider.cpp's
// m_urlBuf and the associated orderTree,ipTree, etc. and now
// since we are un-doling (undoling) it we need to re-add and this
// is the easiest way. it really was never removed from spiderdb
// but it will no longer be in the spider's cache since we delete
// it from there when we add it to doledb. so this is just a quick
// way of getting it back into the cache.
	// now, we add this first since now Rdb.cpp calls evaluateAllRequests()
// AFTER the REPLY now
if ( m_sreqValid &&
// page parser has an invalid firstIp which causes printMetaList()
// to core when trying to print this out, so don't add it when
// doing page parser
! m_sreq.m_isPageParser ) {
// note it
setStatus ( "adding SpiderRequest back to spiderdb" );
// rdbid first
*m_p = RDB_SPIDERDB;
// use secondary?
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
m_p++;
// store the spider rec
int32_t size = m_sreq.getRecSize();
gbmemcpy ( m_p , &m_sreq , size );
// set this one bit
SpiderRequest *rr = (SpiderRequest *)m_p;
rr->m_readd = 1;
// and hafta reset this junk otherwise it cores
// (see Spider.h::SpiderRequest::reset())
rr->m_ufn = -1;
rr->m_priority = -1;
rr->m_doled = 0;
// skip over the whole rec
m_p += size;
// sanity check - must not be a request, this is a reply
if ( ! g_spiderdb.isSpiderRequest( &m_sreq.m_key ) ) {
char *xx=NULL;*xx=0; }
// sanity check
if ( m_p - saved != needSpiderdb3 ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p );
}
*/
// sanity check
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
int32_t now = getTimeGlobal();
/////////////////
//
// INCREMENTAL INDEXING / INCREMENTAL UPDATING
//
// now prune/manicure the metalist to remove records that
// were already added, and insert deletes for records that
// changed since the last time. this is how we do deletes
// now that we have revdb. this allows us to avoid
// parsing inconsistency errors.
//
/////////////////
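	//
	// note on metalist layout: each record is
	//   [1-byte rdbId (high bit used as a flag)][key]
	//   [4-byte dataSize, only for variable-size rdbs and positive keys]
	//   [data]
	// the first loop below walks the old list, which is stripped down to
	// rdbIds and keys only; the second loop walks the full current
	// metalist. a rough sketch of that generic walk ("list"/"listEnd"
	// are just placeholder names here):
	//
	/*
	for ( char *p = list ; p < listEnd ; ) {
		char rdbId = *p++ & 0x7f;
		char *key  = p;
		int32_t ks = getKeySizeFromRdbId ( rdbId );
		p += ks;
		int32_t ds = getDataSizeFromRdbId ( rdbId );
		// negative keys (low bit clear) never carry data
		if ( (key[0] & 0x01) == 0x00 ) ds = 0;
		// variable-size data is preceded by its 4-byte length
		if ( ds == -1 ) { ds = *(int32_t *)p; p += 4; }
		p += ds;
	}
	*/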
// disable for parsing consistency testing of already indexed docs
//oldList = NULL;
if ( oldList ) { // && oldList->m_listSize > 16 ) {
// point to start of the old meta list, the first and only
// record in the oldList
char *om = oldList;// + 12 + 4;
// the size
int32_t osize = oldListSize;//*(int32_t *)(oldList + 12);
// the end
char *omend = om + osize;
int32_t needx = 0;
// init these. data is just the rdbid, a single byte.
//HashTableX dt12;
//HashTableX dt16;
//char dbuf12[30000];
//char dbuf16[40000];
//dt12.set ( 12,1,2048,dbuf12,30000,false,m_niceness);
//dt16.set ( 16,1,2048,dbuf16,40000,false,m_niceness);
HashTableX dt8;
char dbuf8[34900];
// value is the ptr to the rdbId/key in the oldList
dt8.set ( 8,sizeof(char *),2048,dbuf8,34900,
false,m_niceness,"dt8-tab");
// just for linkdb:
//HashTableX dt9;
//char dbuf9[30000];
//dt9.set ( 8,4,2048,dbuf9,30000,false,m_niceness,"dt9-tab");
// scan recs in that and hash them
for ( char *p = om ; p < omend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save this
char byte = *p;
// save this
char *rec = p;
// get the rdbid for this rec
char rdbId = byte & 0x7f;
// skip that
p++;
// get the key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// get that
char *k = p;
// unlike a real meta list, this meta list has
// no data field, just rdbIds and keys only! because
// we only use it for deleting, which only requires
// a key and not the data
p += ks;
// tally this up in case we have to add the delete
// version of this key back (add 1 for rdbId)
needx += ks + 1;
// always re-add titledb record!
// if our current/new list is basically empty
// except for a SpiderReply because it got deleted
// from the index, we need to store the titledb key
// in dt8 so we can add it as a negative! so i
// don't really know what this was trying to fix
// because it broke that!
//if ( rdbId == RDB_TITLEDB ) continue;
// for linkdb, sometimes we also add a "lost" link
// key in addition to deleting the old key! see below
if ( rdbId == RDB_LINKDB ) needx += ks + 1;
// do not add it if datasize > 0
uint64_t hk;
// do not include discovery or lost dates in the
// linkdb key...
if ( rdbId == RDB_LINKDB )
hk = hash64 (k+12,ks-12);
else
hk = hash64 (k,ks);
// sanity check
if ( rdbId == RDB_LINKDB &&
g_linkdb.getLinkerDocId_uk((key224_t *)k)!=
m_docId ) {
char *xx=NULL;*xx=0; }
//if ( getDataSize(rdbId) != 0 ) continue;
// hash this key
//bool status;
// sectiondb keys all have the same last few bits...
// so this clogs up the hash table.
// so mix up the key bits for hashing
//uint64_t hk = hash64 ( k,ks);
//if (ks == 12 ) status = dt12.addKey ( k, &byte);
//else if (ks == 16 ) status = dt16.addKey ( k, &byte);
//else { char *xx=NULL; *xx=0; }
if ( ! dt8.addKey(&hk,&rec) ) return NULL;
// return NULL with g_errno set on error
//if ( ! status ) return NULL;
}
// also need all the new keys just to be sure, in case none
// are already in the rdbs
needx += (m_p - m_metaList);
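		// at this point dt8 maps hash64(old key) -> pointer to that
		// key's rdbId byte in the old list, and needx is a worst
		// case: every current record kept plus a delete key for
		// every old record (and an extra lost-link key per old
		// linkdb key)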
// now alloc for our new manicured metalist
char *nm = (char *)mmalloc( needx, "newmeta" );
if ( ! nm ) return NULL;
char *nptr = nm;
char *nmax = nm + needx;
		// scan each rec in the current meta list and see if it's in
		// the dt8 hash table of old recs; if it already is, then
// do NOT add it to the new metalist, nm, because there is
// no need to.
char *p = m_metaList;
char *pend = p + (m_p - m_metaList);
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save it with the flag
char byte = *p;
// get rdbId
char rdbId = byte & 0x7f;
// skip that
p++;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// get key
char *key = p;
// skip that
p += ks;
// get data size
int32_t ds = getDataSizeFromRdbId(rdbId);
// assume we do not store the datasize
bool neg = false;
// . if key is negative, no data is present
// . the doledb key is negative for us here
if ( (key[0] & 0x01) == 0x00 ) { neg = true; ds = 0; }
// if datasize variable, read it in
if ( ds == -1 ) {
// get data size
ds = *(int32_t *)p;
				// skip the 4-byte data size
p += 4;
}
// point to data
char *data = p;
// skip data if not zero
p += ds;
// mix it up for hashtable speed
uint64_t hk ;//= hash64 ( key,ks);
			// for linkdb, exclude the discovery/lost dates
			// from the hash (handled specially below)
if ( rdbId == RDB_LINKDB )
hk = hash64(key+12,ks-12);
else
hk = hash64(key,ks);
// was this key already in the "old" list?
int32_t slot = dt8.getSlot(&hk);
			// do we have a linkdb key that existed last time
// we indexed this doc? if so, inherit its discovery
// date.
if ( slot >= 0 && rdbId == RDB_LINKDB ) {
/*
// get old key from last time
char *oldk=*(char**)dt8.getValueFromSlot(slot);
// skip rdbid
oldk++;
// sanity
if(g_linkdb.getLinkerDocId_uk((key224_t *)oldk)
!=m_docId){
char *xx=NULL;*xx=0; }
// copy rdbid into new meta list
*nptr++ = byte;
// point to where key will be stored in new lst
char *nk = nptr;
// store the new key in the new meta list
gbmemcpy ( nptr , key , ks );
// advance ptr
nptr += ks;
			// get discovery time of old key from last time
int32_t dd = g_linkdb.getDiscoveryDate_uk(oldk);
// sanity
if ( dd < 0 ) { char *xx=NULL;*xx=0; }
// but mod the new key's discovery time
g_linkdb.setDiscoveryDate_uk ( nk, dd );
*/
// . no need to deal with this any further
// . yeah, because there could be dups!
// so don't delete it just yet
// . but make the data ptr NULL so we
// know to disregard it below...???
dt8.removeSlot(slot);
// all done for this key
continue;
}
// see if already in an rdb, IFF dataless, otherwise
// the keys might be the same but with different data!
if ( slot >= 0 ) { // dt8.isInTable(&hk) ) {
// remove from hashtable so we do not add it
// as a delete key below
// dt8.removeKey(&hk);
dt8.removeSlot(slot);
// but do add like a titledb rec that has the
// same key, because its data is probably
// different...
// HACK: enable for now since we lost
// the url:www.geico.com term somehow!!!
// geico got deleted but not the title rec!!
// MAKE SURE TITLEREC gets deleted then!!!
if ( ds==0 && g_conf.m_doIncrementalUpdating )
continue;
}
// ok, it is not already in an rdb, so add it
*nptr++ = byte;
// store key
gbmemcpy ( nptr, key , ks );
// skip over it
nptr += ks;
// store data size. BUT not if negative key!
if ( getDataSizeFromRdbId(rdbId) == -1 && ! neg ) {
*(int32_t *)nptr = ds;
nptr += 4;
}
// store data
if ( ds ) {
gbmemcpy ( nptr , data , ds );
nptr += ds;
}
}
// now scan dt8 and add their keys as del keys
for ( int32_t i = 0 ; i < dt8.m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( ! dt8.m_flags[i] ) continue;
// store rdbid first
char *rec = *(char **)dt8.getValueFromSlot(i);
// get rdbId with hi bit possibly set
char rdbId = rec[0] & 0x7f;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// sanity test - no negative keys
if ( (rec[1] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0;}
// copy the rdbId byte and key
gbmemcpy ( nptr , rec , 1 + ks );
// skip over rdbid
nptr++;
// make it a negative key by clearing lsb
*nptr = *nptr & 0xfe;
// skip it
nptr += ks;
			// if it is from linkdb and unmatched, then it is a
// lost link, so set the lost date of it. we keep
// these so we can graph lost links
if ( rdbId == RDB_LINKDB ) {
// the real linkdb rec is at rec+1
int32_t lost = g_linkdb.getLostDate_uk( rec+1 );
// how can it be non-zero? it should have
// been freshly made from the old titlerec...
if ( lost ) { char *xx=NULL;*xx=0; }
// if zero, set it to now!
//g_linkdb.setLostDate_uk(realRec,now);
// copy the rdbId byte and key
gbmemcpy ( nptr , rec , 1 + ks );
// set it in there now
g_linkdb.setLostDate_uk(nptr+1,now);
// carry it through on revdb, do not delete
// it! we want a linkdb history for seomasters
nptr += 1 + ks;
// and go on to delete the old linkdb key that
// did not have a lost date
//continue;
}
}
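		// whatever was left in dt8 existed in the old list but not
		// in the current one. each such key was re-emitted above
		// with the low bit of its first key byte cleared, making it
		// a negative (delete) key, except linkdb keys which also get
		// a positive copy carrying a lost date so we keep a history
		// of lost links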
// sanity. check for metalist breach
if ( nptr > nmax ) { char *xx=NULL;*xx=0; }
// free the old meta list
mfree ( m_metaList , m_metaListAllocSize , "fm" );
// now switch over to the new one
m_metaList = nm;
m_metaListAllocSize = needx;
m_p = nptr;
}
// if we only removed it from index, set this flag
if ( oldList && ! nd ) m_didDelete = true;
//
// repeat this logic special for linkdb since we keep lost links
// and may update the discovery date or lost date in the keys
//
// 1. hash keys of old linkdb keys into dt9 here
// 2. do not hash the discovery/lost dates when making key hash for dt9
// 3. scan keys in meta list and add directly into new meta list
// if not in dt9
// 4. if in dt9 then add dt9 key instead
// 5. remove dt9 keys as we add them
// 6. then add remaining dt9 keys into meta list but with lost date
// set to now UNLESS it's already set
//
//
// validate us!
//
m_metaListValid = true;
// set the list size, different from the alloc size
m_metaListSize = m_p - m_metaList;//end - m_p;
// sanity check
verifyMetaList( m_metaList , m_metaList + m_metaListSize , forDelete );
// all done
return m_metaList;
}
// . copy from old title rec to us to speed things up!
// . returns NULL and sets g_errno on error
// . returns -1 if blocked
// . returns 1 otherwise
// . when the doc content is unchanged, just inherit crap from the old title
// rec so we can make the spider reply in getNewSpiderReply()
void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
// skip if none
if ( ! od ) return;
// skip if already did it
if ( m_copied1 ) return;
// do not repeat
m_copied1 = true;
// set these
m_percentChanged = 0;
m_percentChangedValid = true;
// copy over bit members
m_contentHash32 = od->m_contentHash32;
//m_tagHash32 = od->m_tagHash32;
m_tagPairHash32 = od->m_tagPairHash32;
//m_sitePop = od->m_sitePop;
m_httpStatus = od->m_httpStatus;
m_hasAddress = od->m_hasAddress;
m_hasTOD = od->m_hasTOD;
//m_hasSiteVenue = od->m_hasSiteVenue;
m_isRSS = od->m_isRSS;
m_isPermalink = od->m_isPermalink;
m_hasContactInfo= od->m_hasContactInfo;
m_hopCount = od->m_hopCount;
m_crawlDelay = od->m_crawlDelay;
// do not forget the shadow members of the bit members
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_isRSS2 = m_isRSS;
m_isPermalink2 = m_isPermalink;
// validate them
m_contentHash32Valid = true;
//m_tagHash32Valid = true;
m_tagPairHash32Valid = true;
//m_sitePopValid = true;
m_httpStatusValid = true;
m_hasAddressValid = true;
m_hasTODValid = true;
//m_hasSiteVenueValid = true;
m_isRSSValid = true;
m_isPermalinkValid = true;
m_hasContactInfoValid= true;
m_hopCountValid = true;
m_crawlDelayValid = true;
m_pubDate = od->m_pubDate;
m_langId = od->m_langId;
m_pubDateValid = true;
m_langIdValid = true;
	// so getSiteNumInlinks() doesn't crash when called by getNewSpiderReply
// because dns timed out. it timed out with EDNSTIMEDOUT before.
// so overwrite it here...
if ( m_ip == -1 || m_ip == 0 || ! m_ipValid ) {
m_ip = od->m_ip;
m_ipValid = true;
m_siteNumInlinks = od->m_siteNumInlinks;
// m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
// m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlo
// m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
m_siteNumInlinksValid =
od->m_siteNumInlinksValid;
// m_siteNumInlinksUniqueIpValid =
// od->m_siteNumInlinksUniqueIpValid;
// m_siteNumInlinksUniqueCBlockValid =
// od->m_siteNumInlinksUniqueCBlockValid;
// m_siteNumInlinksTotal =
// od->m_siteNumInlinksTotalValid;
}
m_indexCode = 0;//od->m_indexCode;
m_indexCodeValid = true;
// we need the link info too!
ptr_linkInfo1 = od->ptr_linkInfo1;
size_linkInfo1 = od->size_linkInfo1;
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
else m_linkInfo1Valid = false;
// turn off for debug
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
}
// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
if ( ! m_tagRecValid ) {
m_tagRec.reset();
m_tagRecValid = true;
}
if ( ! m_siteHash32Valid ) {
m_siteHash32 = 1;
m_siteHash32Valid = true;
}
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
if ( ! m_ipValid ) {
m_ipValid = true;
m_ip = atoip("1.2.3.4");
}
if ( ! m_spideredTimeValid ) {
m_spideredTimeValid = true;
m_spideredTime = getTimeGlobal();//0; use now!
}
// don't let it get the diffbot reply either! it should be empty.
if ( ! m_diffbotReplyValid ) {
m_diffbotReplyValid = true;
}
// if doing diffbot query reindex
// TODO: does this shard the request somewhere else???
if ( ! m_firstIpValid ) {
m_firstIp = m_ip;//atoip("1.2.3.4");
m_firstIpValid = true;
}
// this was causing nsr to block and core below on a bad engineer
// error loading the old title rec
if ( ! m_isPermalinkValid ) {
m_isPermalink = false;
m_isPermalinkValid = true;
}
//if ( ! m_sreqValid ) {
// m_sreqValid = true;
// m_sreq.m_parentDocId = 0LL;
// }
// if error is EFAKEFIRSTIP, do not core
//if ( ! m_isIndexedValid ) {
// m_isIndexed = false;
// m_isIndexedValid = true;
//}
// if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT
// or ECORRUPTDATA (corrupt gzip reply)
// then this should not block. we need a spiderReply to release the
// url spider lock in SpiderLoop::m_lockTable.
// if m_isChildDoc is true, like for diffbot url, this should be
// a bogus one.
SpiderReply *nsr = getNewSpiderReply ();
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
if ( ! nsr ) {
log("doc: crap, could not even add spider reply "
"to indicate internal error: %s",mstrerror(g_errno));
if ( ! g_errno ) g_errno = EBADENGINEER;
//return true;
return NULL;
}
return nsr;
//if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; }
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return true;
}
// getSpiderReply()
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( m_srepValid ) return &m_srep;
setStatus ( "getting spider reply" );
	// diffbot guys, robots.txt, frames, should not be here
if ( m_isChildDoc ) { char *xx=NULL;*xx=0; }
// . get the mime first
// . if we are setting XmlDoc from a titleRec, this causes
// doConsistencyCheck() to block and core
//HttpMime *mime = getMime();
//if ( ! mime || mime == (HttpMime *)-1 ) return (SpiderReply *)mime;
// if we had a critical error, do not do this
int32_t *indexCode = getIndexCode();
if (! indexCode || indexCode == (void *)-1)
return (SpiderReply *)indexCode;
// if it has been abandoned early, i.e. cut-off, then we should
// add a "fake" spider reply to release the lock in
// SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply()
// to see what parts of this are relevant.
/*
if ( *indexCode == EABANDONED ||
// . any internal "error" needs to be here really
// . was there an error unzipping the title rec?
*indexCode == ECORRUPTDATA ||
*indexCode == EHITCRAWLLIMIT ||
*indexCode == EHITPROCESSLIMIT ) {
// clear everything
m_srep.reset();
// get from spider request, if there
int32_t firstIp = 0;
if ( m_sreqValid ) firstIp = m_sreq.m_firstIp;
// otherwise, wtf?
if ( ! firstIp )
log("build: no first ip to make fake spiderReply. "
"injected?");
// we at least need this
m_srep.m_firstIp = firstIp;
Url *fu = getFirstUrl();
// this is the lock key
int64_t uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
m_srep.setKey ( firstIp, 0 , uh48 , false );
// tell it we are fake and not to really add us to
// spiderdb, but just to release the lock
m_srep.m_errCode = *indexCode;
m_srepValid = true;
return &m_srep;
}
*/
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;
// can't call getIsPermalink() here without entering a dependency loop
//char *pp = getIsUrlPermalinkFormat();
//if ( !pp || pp == (char *)-1 ) return (SpiderReply *)pp;
// the site hash
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SpiderReply *)sh32;
int64_t *de = getDownloadEndTime();
if ( ! de || de == (void *)-1 ) return (SpiderReply *)de;
// need to set m_sentToDiffbot!!
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (SpiderReply *)dbr;
	// was the doc indexed when we started trying to spider this url?
//char *wasIndexed = getIsIndexed();
//if ( ! wasIndexed || wasIndexed == (void *)-1 )
// return (SpiderReply *)wasIndexed;
//Tag *vt = m_oldTagRec.getTag("venueaddress");
//bool siteHasVenue = (bool)vt;
	// shortcut
Url *fu = NULL;
// watch out for titlerec lookup errors for docid based spider reqs
if ( m_firstUrlValid ) fu = getFirstUrl();
// reset
m_srep.reset();
int32_t firstIp = -1;
// inherit firstIp
Tag *tag = m_tagRec.getTag("firstip");
// tag must be there?
if ( tag ) firstIp = atoip(tag->getTagData());
// this is usually the authority
if ( m_firstIpValid )
firstIp = m_firstIp;
// otherwise, inherit from oldsr to be safe
// BUT NOT if it was a fakeip and we were injecting because
// the SpiderRequest was manufactured and not actually taken
// from spiderdb! see XmlDoc::injectDoc() because that is where
// it came from!! if it has m_sreq.m_isAddUrl and
// m_sreq.m_fakeFirstIp then we actually do add the reply with that
// fake ip so that they will exist in the same shard.
	// BUT if it is docid based from PageReindex.cpp (a query reindex)
	// we set the injection bit and the pagereindex bit, and we should let
	// these guys keep the firstip because the docid-based spider request
// is in spiderdb. it needs to match up.
if ( m_sreqValid && (!m_sreq.m_isInjecting||m_sreq.m_isPageReindex) )
firstIp = m_sreq.m_firstIp;
// sanity
if ( firstIp == 0 || firstIp == -1 ) {
if ( m_firstUrlValid )
log("xmldoc: BAD FIRST IP for %s",m_firstUrl.getUrl());
else
log("xmldoc: BAD FIRST IP for %"INT64"",m_docId);
firstIp = 12345;
//char *xx=NULL;*xx=0; }
}
// store it
m_srep.m_firstIp = firstIp;
// assume no error
// MDW: not right...
m_srep.m_errCount = 0;
// otherwise, inherit from oldsr to be safe
//if ( m_sreqValid )
// m_srep.m_firstIp = m_sreq.m_firstIp;
	// do not inherit this one, it MIGHT HAVE CHANGED!
m_srep.m_siteHash32 = m_siteHash32;
// need this for updating crawl delay table, m_cdTable in Spider.cpp
if ( fu ) m_srep.m_domHash32 = getDomHash32();
else m_srep.m_domHash32 = 0;
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// . set other fields besides key
// . crap! if we are the "qatest123" collection then m_spideredTime
// was read from disk usually and is way in the past! watch out!!
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// crap, for the test coll this is often a very old time and it
// causes the spider request to be repeatedly executed, so let's
// fix that
if ( ! strcmp(cr->m_coll,"qatest123") )
m_srep.m_spideredTime = getTimeGlobal();
// TODO: expire these when "ownershipchanged" tag is newer!!
if ( gr->getTag ( "ingoogle" ) ) {
m_srep.m_inGoogle = 1;
m_srep.m_inGoogleValid = 1;
}
if ( gr->getTag ( "authorityinlink" ) )
m_srep.m_hasAuthorityInlink = 1;
// automatically valid either way
m_srep.m_hasAuthorityInlinkValid = 1;
// but for this tag, it must exist even if it has no contact info
//tag = gr->getTag ( "hascontactinfo" );
//if ( tag ) {
int64_t uh48 = 0LL;
// we might be a docid based spider request so fu could be invalid
// if the titlerec lookup failed
if ( fu ) uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
int64_t parentDocId = 0LL;
if ( m_sreqValid )
parentDocId = m_sreq.getParentDocId();
//else { char *xx=NULL;*xx=0; }
// for docid based urls from PageReindex.cpp we have to make
// sure to set the urlhash48 correctly from that.
if ( m_sreqValid ) uh48 = m_sreq.getUrlHash48();
// note it
if ( g_conf.m_logDebugSpider )
log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
// set the key, m_srep.m_key
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
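	// the reply key is built from the same firstIp and 48-bit url hash
	// as the originating request, which is how the reply matches up
	// with its request in spiderdb (and, per the firstIp notes above,
	// lands in the same shard)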
// . did we download a page? even if indexcode is set we might have
	// . if this is non-zero that means it's valid
if ( m_contentHash32Valid )
m_srep.m_contentHash32 = m_contentHash32;
// injecting the content (url implied)
if ( m_contentInjected ) // m_sreqValid && m_sreq.m_isInjecting )
m_srep.m_fromInjectionRequest = 1;
// can be injecting a url too, content not necessarily implied
if ( m_sreqValid && m_sreq.m_isInjecting )
m_srep.m_fromInjectionRequest = 1;
if ( m_sentToDiffbotThisTime )
m_srep.m_sentToDiffbotThisTime = true;
else
m_srep.m_sentToDiffbotThisTime = false;
if ( m_diffbotReplyError )
m_srep.m_hadDiffbotError = true;
else
m_srep.m_hadDiffbotError = false;
// if we only had an error code in the diffbot reply, record that
if ( ! m_indexCode && m_diffbotReplyError )
m_srep.m_errCode = m_diffbotReplyError;
// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
// were we already in titledb before we started spidering?
m_srep.m_wasIndexed = m_wasInIndex;
// note whether m_wasIndexed is valid because if it isn't then
// we shouldn't be counting this reply towards the page counts.
// if we never made it this far i guess we should not forcibly call
// getIsIndexed() at this point so our performance is fast in case
// this is an EFAKEFIRSTIP error or something similar where we
// basically just add this reply and we're done.
// NOTE: this also pertains to SpiderReply::m_isIndexed.
m_srep.m_wasIndexedValid = m_wasInIndexValid;
// assume no change
m_srep.m_isIndexed = m_isInIndex;
// we need to know if the m_isIndexed bit is valid or not
// because sometimes like if we are being called directly from
// indexDoc() because of an error situation, we do not know!
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
else m_srep.m_isIndexedINValid = true;
// likewise, we need to know if we deleted it so we can decrement the
// quota count for this subdomain/host in SpiderColl::m_quotaTable
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
	// treat error replies specially i guess, since langId, etc. will be
// invalid
if ( m_indexCode ) {
// validate
m_srepValid = true;
// set these items if valid already, but don't bother
// trying to compute them, since we are not indexing.
if ( m_siteNumInlinksValid ) {
m_srep.m_siteNumInlinks = m_siteNumInlinks;
m_srep.m_siteNumInlinksValid = true;
}
//if ( m_percentChangedValid )
// m_srep.m_percentChangedPerDay = m_percentChanged;
if ( m_crawlDelayValid && m_crawlDelay >= 0 )
// we already multiply x1000 in isAllowed2()
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
else
m_srep.m_crawlDelayMS = -1;
if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
if ( m_langIdValid ) m_srep.m_langId = m_langId;
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
// stuff that is automatically valid
m_srep.m_isPingServer = 0;
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_errCode = m_indexCode;
if ( m_downloadEndTimeValid )
m_srep.m_downloadEndTime = m_downloadEndTime;
else
m_srep.m_downloadEndTime = 0;
// is the original spider request valid?
if ( m_sreqValid ) {
// preserve the content hash in case m_indexCode is
// EDOCUNCHANGED. so we can continue to get that
// in the future. also, if we had the doc indexed,
// just carry the contentHash32 forward for the other
// errors like EDNSTIMEDOUT or whatever.
m_srep.m_contentHash32 = m_sreq.m_contentHash32;
			// shortcuts
SpiderReply *n = &m_srep;
SpiderRequest *o = &m_sreq;
// more stuff
n->m_inGoogle = o->m_inGoogle;
n->m_hasContactInfo = o->m_hasContactInfo;
n->m_isContacty = o->m_isContacty;
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
n->m_isPingServer = o->m_isPingServer;
// the validator flags
n->m_inGoogleValid = o->m_inGoogleValid;
n->m_hasContactInfoValid = o->m_hasContactInfoValid;
n->m_isContactyValid = o->m_isContactyValid;
n->m_hasAuthorityInlinkValid =
o->m_hasAuthorityInlinkValid;
// get error count from original spider request
int32_t newc = m_sreq.m_errCount;
// inc for us, since we had an error
newc++;
// contain to one byte
if ( newc > 255 ) newc = 255;
// store in our spiderreply
m_srep.m_errCount = newc;
}
// . and do not really consider this an error
// . i don't want the url filters treating it as an error reply
// . m_contentHash32 should have been carried forward from
// the block of code right above
if ( m_indexCode == EDOCUNCHANGED ) {
// we should have had a spider request, because that's
// where we got the m_contentHash32 we passed to
// Msg13Request.
if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
// make it a success
m_srep.m_errCode = 0;
// and no error count, it wasn't an error per se
m_srep.m_errCount = 0;
// call it 200
m_srep.m_httpStatus = 200;
}
// copy flags and data from old doc...
if ( m_indexCode == EDOCUNCHANGED &&
m_oldDocValid &&
m_oldDoc ) {
m_srep.m_pubDate = m_oldDoc->m_pubDate;
m_srep.m_langId = m_oldDoc->m_langId;
m_srep.m_isRSS = m_oldDoc->m_isRSS;
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
// they're all valid
m_srep.m_hasAddressValid = true;
m_srep.m_hasTODValid = true;
//m_srep.m_hasSiteVenueValid = true;
m_srep.m_siteNumInlinksValid = true;
}
// do special things if
return &m_srep;
}
// this will help us avoid hammering ips & respect same ip wait
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0; }
m_srep.m_downloadEndTime = m_downloadEndTime;
// . if m_indexCode was 0, we are indexed then...
// . this logic is now above
//m_srep.m_isIndexed = 1;
// get ptr to old doc/titlerec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (SpiderReply *)pod;
// this is non-NULL if it existed
XmlDoc *od = *pod;
// status is -1 if not found
int16_t *hs = getHttpStatus ();
if ( ! hs || hs == (void *)-1 ) return (SpiderReply *)hs;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (SpiderReply *)sni;
float *pc = getPercentChanged();
if ( ! pc || pc == (void *)-1 ) return (SpiderReply *)pc;
// these are "non-dup" addresses (nondup)
bool *hasAddress = getHasAddress();
if ( ! hasAddress || hasAddress == (void *)-1 )
return (SpiderReply *)hasAddress;
// does it have a tod (i.e. 6pm) in there somewhere?
bool *hasTOD = getHasTOD();
if ( ! hasTOD || hasTOD == (void *)-1 )
return (SpiderReply *)hasTOD;
// does it have a venue address?
//bool *hasSiteVenue = getHasSiteVenue();
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
// return (SpiderReply *)hasSiteVenue;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (SpiderReply *)isRoot;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (SpiderReply *)hci;
int32_t *pubDate = getPubDate();
if ( ! pubDate || pubDate == (int32_t *)-1 )
return (SpiderReply *)pubDate;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 )
return (SpiderReply *)langId;
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (char *)-1 )
return (SpiderReply *)isRSS;
char *pl = getIsPermalink();
if ( ! pl || pl == (char *)-1 )
return (SpiderReply *)pl;
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
if ( m_hasContactInfo ) {
m_srep.m_hasContactInfo = 1;
m_srep.m_hasContactInfoValid = 1;
}
	// this is only known if we download the robots.txt...
if ( od && m_recycleContent ) {
m_crawlDelay = od->m_crawlDelay;
m_crawlDelayValid = true;
}
// sanity checks
//if(! m_sreqValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isSpamValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
// httpStatus is -1 if not found (like for empty http replies)
m_srep.m_httpStatus = *hs;
// zero if none
//m_srep.m_percentChangedPerDay = 0;
// . only if had old one
// . we use this in url filters to set the respider wait time usually
if ( od ) {
int32_t spideredTime = getSpideredTime();
int32_t oldSpideredTime = od->getSpideredTime();
		// the spidered times are in seconds, so convert the
		// elapsed time to days to match m_percentChangedPerDay
		float numDays = (spideredTime - oldSpideredTime) / 86400.0;
m_srep.m_percentChangedPerDay = (m_percentChanged+.5)/numDays;
}
// . update crawl delay, but we must store now as milliseconds
// because Spider.cpp like it better that way
// . -1 implies crawl delay unknown or not found
if ( m_crawlDelay >= 0 && m_crawlDelayValid )
// we already multiply x1000 in isAllowed2()
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
else
// -1 means invalid/unknown
m_srep.m_crawlDelayMS = -1;
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
// . we use this to store "bad" spider recs to keep from respidering
// a "bad" url over and over again
// . it is up to the url filters whether they want to retry this
// again or not!
// . TODO: how to represent "ETCPTIMEDOUT"????
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
	// ENETUNREACH,EBADMIME,ECONNREFUSED,EHOSTUNREACH
m_srep.m_siteNumInlinks = m_siteNumInlinks;
m_srep.m_pubDate = *pubDate;
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_langId = *langId;
m_srep.m_isRSS = (bool)*isRSS;
m_srep.m_isPermalink = (bool)*pl;
m_srep.m_isPingServer = (bool)fu->isPingServer();
//m_srep.m_isSpam = m_isSpam;
m_srep.m_siteNumInlinksValid = true;
// . ignore address in dup sections (nondup/non-dup addresses only)
// . this way if the place always has their address in the header or
// footer of every web page we will ignore it
m_srep.m_hasAddress = *hasAddress;
m_srep.m_isContacty = *hci;//getIsContacty(fu,
// info1,
// m_hopCount ,
// *ct , // contentType
// *isRoot ,
// m_niceness );
m_srep.m_hasTOD = *hasTOD;
//m_srep.m_hasSiteVenue = *hasSiteVenue;
// validate all
m_srep.m_inGoogleValid = 1;
m_srep.m_hasContactInfoValid = 1;
m_srep.m_hasAuthorityInlinkValid = 1;
m_srep.m_isContactyValid = 1;
m_srep.m_hasAddressValid = 1;
m_srep.m_hasTODValid = 1;
//m_srep.m_hasSiteVenueValid = 1;
// a quick validation. reply must unlock the url from the lock table.
// so the locks must be equal.
if ( m_sreqValid &&
// we create a new spiderrequest if injecting with a fake firstip
// so it will fail this test...
! m_sreq.m_isInjecting ) {
int64_t lock1 = makeLockTableKey(&m_sreq);
int64_t lock2 = makeLockTableKey(&m_srep);
if ( lock1 != lock2 ) {
log("build: lock1 != lock2 lock mismatch for %s",
m_firstUrl.m_url);
char *xx=NULL;*xx=0;
}
}
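	// note: makeLockTableKey() presumably derives the lock from fields
	// both the request and reply share (the url hash and firstIp), so a
	// mismatch here would mean this reply could never release the lock
	// its request took in SpiderLoop::m_lockTable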
// validate
m_srepValid = true;
return &m_srep;
}
// . so Msg20 can see if we are banned now or not...
// . we must skip certain rules in getUrlFilterNum() when doing this for Msg20
// because things like "parentIsRSS" can be both true or false since a url
// can have multiple spider recs associated with it!
void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq ,
SpiderReply *srep ) {
// sanity checks
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isUrlPermalinkFormatValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
Url *fu = getFirstUrl();
// get this
//TagRec *gr = (TagRec *)ptr_tagRecData;
//Tag *tag = NULL;
//if ( gr ) tag = gr->getTag("sitenuminlinks");
// reset
sreq->reset();
// assume not valid
sreq->m_siteNumInlinks = -1;
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// how many site inlinks?
sreq->m_siteNumInlinks = m_siteNumInlinks;
sreq->m_siteNumInlinksValid = true;
// set other fields besides key
sreq->m_firstIp = m_ip;
sreq->m_hostHash32 = m_hostHash32a;
//sreq->m_domHash32 = m_domHash32;
//sreq->m_siteNumInlinks = m_siteNumInlinks;
//sreq->m_pageNumInlinks = m_pageNumInlinks;
sreq->m_hopCount = m_hopCount;
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
sreq->m_isNewOutlink = 0;
sreq->m_isAddUrl = 0;//m_isAddUrl;
sreq->m_isPingServer = fu->isPingServer();
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
// transcribe from old spider rec, stuff should be the same
sreq->m_addedTime = m_firstIndexedDate;
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
// validate the stuff so getUrlFilterNum() acks it
sreq->m_hopCountValid = 1;
srep->reset();
srep->m_spideredTime = getSpideredTime();//m_spideredTime;
//srep->m_isSpam = isSpam; // real-time update this!!!
srep->m_isRSS = m_isRSS;
srep->m_isPermalink = m_isPermalink;
srep->m_httpStatus = 200;
//srep->m_retryNum = 0;
srep->m_langId = m_langId;
srep->m_percentChangedPerDay = 0;//m_percentChanged;
// we need this now for ucp ucr upp upr new url filters that do
// substring matching on the url
if ( m_firstUrlValid )
strcpy(sreq->m_url,m_firstUrl.m_url);
}
// defined in PageCrawlBot.cpp
int32_t isInSeedBuf ( CollectionRec *cr , char *url, int len ) ;
// . add the spiderdb recs to the meta list
// . used by XmlDoc::setMetaList()
// . returns NULL and sets g_errno on error
// . otherwise returns the "new p"
// . Scraper.cpp, PageAddUrl.cpp and Msg7.cpp should all use the XmlDoc
// class even if just adding links. they should make a fake html page and
// "inject" it, with only m_useSpiderdb set to true...
char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
// do not do this if recycling content
// UNLESS REBUILDING...
if ( m_recycleContent && ! m_useSecondaryRdbs ) return (char *)0x01;
// for now skip in repair tool
if ( m_useSecondaryRdbs && ! g_conf.m_rebuildAddOutlinks )
return (char *)0x01;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (char *)links;
char *spiderLinks = getSpiderLinks();
if ( ! spiderLinks || spiderLinks == (char *)-1 )
return (char *)spiderLinks;
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
//char **iiv = getOutlinkIsIndexedVector();
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
int32_t **ipv = getOutlinkFirstIpVector();
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
//int8_t *hcv = getOutlinkHopCountVector();
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
char *ipi = getIsIndexed(); // is the parent indexed?
if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
Addresses *aa = getAddresses ();
if ( ! aa || aa == (Addresses *)-1 ) return (char *)aa;
// sanity check
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
// . ignore address in dup sections
// . this way if the place always has their address in the header or
// footer of every web page we will ignore it (SEC_DUP section flag)
bool parentHasAddress = (bool)(aa->getNumNonDupAddresses()>0);
// need this
int32_t parentDomHash32 = getDomHash32();
if ( parentDomHash32 != m_domHash32 ) { char *xx=NULL;*xx=0; }
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
int32_t *psni = getSiteNumInlinks();
if ( ! psni || psni == (int32_t *)-1 ) return (char *)psni;
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (char *)pfip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
Url *fu = getFirstUrl();
if ( ! fu || fu == (void *)-1 ) return (char *)fu;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char *)cu;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;
// validate this to prevent core for simplified redirect links
int32_t hostHash32a = getHostHash32a();
// so linkSites[i] is site for link #i in Links.cpp class
int32_t *linkSiteHashes = getLinkSiteHashes ( );
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
return (char *)linkSiteHashes;
XmlDoc *nd = this;
// set "od". will be NULL if no old xml doc, i.e. no old title rec
//XmlDoc **pod = getOldXmlDoc ( );
//if ( ! pod || pod == (void *)-1 ) return (char *)pod;
//XmlDoc *od = *pod;
// if this page is hacked, then do not spider external outlinks
//char *comp = getIsCompromised();
//if ( ! comp || comp == (char *)-1 ) return (char *)comp;
//if ( *comp )
// onlyInternal = true;
bool isParentRSS = false;
bool parentIsPermalink = false;
bool parentIsSiteMap = false;
// PageAddUrl.cpp does not supply a valid new doc, so this is NULL
if ( nd ) {
isParentRSS = *nd->getIsRSS() ;
parentIsPermalink = *nd->getIsPermalink();
parentIsSiteMap = *nd->getIsSiteMap();
}
int32_t n = links->m_numLinks;
// return early if nothing to do. do not return NULL though cuz we
// do not have g_errno set!
if ( n <= 0 ) return (char *)0x01;
// sanity checks
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int64_t myUh48 = m_firstUrl.getUrlHash48();
// . pre-allocate a buffer to hold the spider recs
// . taken from SpiderRequest::store()
int32_t size = 0;
for ( int32_t i = 0 ; i < n ; i++ )
size += SpiderRequest::getNeededSize ( links->getLinkLen(i) );
// append spider recs to this list ptr
char *p = m_p;
// hash table to avoid dups
HashTableX ht;
char buf2[8192];
if ( ! ht.set ( 4,0,1000,buf2 , 8192,false,m_niceness,"linkdedup" ) )
return NULL;
// count how many we add
int32_t numAdded = 0;
int32_t numAddedFromSameDomain = 0;
int32_t linksBanned = 0;
int32_t linksFiltered = 0;
bool isParentPingServer = false;
if ( fu && fu->isPingServer() ) isParentPingServer = true;
if ( cu && cu->isPingServer() ) isParentPingServer = true;
	// shortcut
bool isScraping = (m_sreqValid && m_sreq.m_isScraping);
//bool useTestSpiderDir = (m_sreqValid && m_sreq.m_useTestSpiderDir);
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// do not do this if not test collection for now
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
// turn off for now
isTestColl = false;
//char **wptrs = m_words.getWords();
//int32_t *wlens = m_words.getWordLens();
// need this for setting SpiderRequest::m_spiderTime
//int32_t nowGlobal = getTimeGlobal();
// for setting LF_CONTACTY bit on the outlinks
char disbuf[1000];
HashTableX disqualify;
disqualify.set(4,0,32,disbuf,1000,false,m_niceness,"disqual");
int32_t consec = 0;
int32_t linkTypes[2000];
int32_t lastType = 0;
// if the file we are indexing now has
// "<meta name=spiderlinkslinks value=0>" then that means to
// add the links to spiderdb, but do not spider their links!
// dmozparse uses this to make a file called gbdmoz.urs.txt.0
// that is just filled with urls that are in dmoz. and we want
// to index just those urls.
//
// now just make dmozparse output urls as <a href=> tags.
//
char mbuf[16];
mbuf[0] = '\0';
char *tag = "spiderlinkslinks";
int32_t tlen = gbstrlen(tag);
xml->getMetaContent ( mbuf, 16 , tag , tlen );
bool avoid = false;
if ( mbuf[0] == '0' ) avoid = true;
// if this is a simplified redir and we should not be spidering
// links then turn it off as well! because we now add simplified
// redirects back into spiderdb using this function.
if ( m_spiderLinksValid && ! m_spiderLinks )
avoid = true;
// it also has this meta tag now too
mbuf[0] = '\0';
tag = "ignorelinksexternalerrors";
tlen = gbstrlen(tag);
xml->getMetaContent ( mbuf, 16 , tag , tlen );
bool ignore = false;
if ( mbuf[0] == '1' ) ignore = true;
// for diffbot crawlbot, if we are a seed url and redirected to a
// different domain... like bn.com --> barnesandnoble.com
int32_t redirDomHash32 = 0;
int32_t redirHostHash32 = 0;
//int32_t redirSiteHash32 = 0;
if ( //cr->m_isCustomCrawl == 1 &&
//isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) &&
m_hopCount == 0 &&
m_redirUrlValid &&
ptr_redirUrl &&
//m_redirUrlPtr && (this gets reset to NULL as being LAST redir)
// this is the last non-empty redir here:
m_redirUrl.getUrlLen() > 0 ) {
log("build: seed REDIR: %s",m_redirUrl.getUrl());
redirDomHash32 = m_redirUrl.getDomainHash32();
redirHostHash32 = m_redirUrl.getHostHash32();
}
//SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
//
// serialize each link into the metalist now
//
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// grab our info
TagRec *gr = (*grv)[i];
int32_t firstIp = (*ipv)[i];
//char isIndexed = (*iiv)[i];
//int32_t hc = hcv[i];
// ip lookup failed? do not add to spiderdb then
if ( firstIp == 0 || firstIp == -1 ) continue;
// if firstIp is in the SpiderColl::m_overflowFirstIps list
// then do not add any more links to it. it already has
// more than 500MB worth.
// this was moved to Rdb.cpp's addRecord()
// if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
// m_linkOverflows++;
// g_stats.m_totalOverflows++;
// continue;
// }
// sanity check
//if ( firstIp == 0x03 ) {char *xx=NULL;*xx=0; }
// get flags
linkflags_t flags = links->m_linkFlags[i];
// . skip if we are rss page and this link is an <a href> link
// . we only harvest <link> urls from rss feeds, not href links
// . or in the case of feedburner, those orig tags
if ( isParentRSS && (flags & LF_AHREFTAG) ) continue;
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( links->m_isFeedBurner && !(flags & LF_FBTAG) ) continue;
// do not add self links, pointless
if ( flags & LF_SELFLINK ) continue;
// do not add if no follow
if ( flags & LF_NOFOLLOW ) continue;
// point to url
char *s = links->getLink (i);
int32_t slen = links->getLinkLen(i);
// breathe
QUICKPOLL(m_niceness);
// get hash
int32_t uh = hash32 ( s , slen );
// it does not like keys of 0, that means empty slot
if ( uh == 0 ) uh = 1;
// skip if dup
if ( ht.isInTable ( &uh ) ) continue;
// add it, returns false and sets g_errno on error
if ( ! ht.addKey ( &uh ) ) return NULL;
		// we now support HTTPS
if ( strncmp(s,"http://",7) && strncmp(s,"https://",8) )
continue;
// . do not add if "old"
// . Links::set() calls flagOldOutlinks()
// . that just means we probably added it the last time
// we spidered this page
// . no cuz we might have a different siteNumInlinks now
// and maybe this next hop count is now allowed where as
// before it was not!
//if ( flags & LF_OLDLINK ) continue;
// set it. addWWW = true! no.. make it false because of issues
// like tmblr.co/ZHw5yo1E5TAaW injection where
// www.tmblr.co has no IP
Url url; url.set ( s , slen , false ); // true );
// if hostname length is <= 2 then SILENTLY reject it
if ( url.getHostLen() <= 2 ) continue;
		// is this a new outlink from this page, i.e. a "hot link"? assume so
bool newOutlink = true;
// if no old links, can not be a new outlink then
if ( flags & LF_OLDLINK ) newOutlink = false;
// . do not consider outlinks of new pages to be newOutlinks.
// that is somewhat redundant.
// . you can use "parentisnew" to do what you want in the url
// filters table
//if ( ! isIndexed ) newOutlink = false;
// get # of inlinks to this site... if recorded...
int32_t ksni = -1;
Tag *st = NULL;
if ( gr ) st = gr->getTag ("sitenuminlinks");
if ( st ) ksni = atol(st->getTagData());
int32_t hostHash32 = url.getHostHash32();
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! url.hasSubdomain() ) {
int32_t wwwHash32 = url.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 && ksni < min )
ksni = min;
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//int32_t ksni = m_siteNumInlinks;
// . get possible pub date from url (.../2008/09/23/page.htm)
// . this returns 0 if none found
//int32_t urlPubDate = parseDateFromUrl(s);
// use zero for the timestamp so SiteGetter does not recompute
// any tags in the tagRec thereby blocking!
//SiteGetter sg;
//sg.getSite ( s , gr , 0, m_coll, m_niceness,false,NULL,NULL);
// get this
bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );
//int32_t siteHash32 = hash32n ( linkSite );
// get it quick
bool ispingserver = url.isPingServer();
int32_t domHash32 = url.getDomainHash32();
// is link rss?
//bool isrss = false;
//if (slen>6 && !strncasecmp(s+slen-4,".rss",4)) isrss = true;
bool isRSSExt = false;
char *ext = url.getExtension();
if ( ext && strcasecmp(ext,"rss" ) == 0 ) isRSSExt = true;
if ( ext && strcasecmp(ext,"xml" ) == 0 ) isRSSExt = true;
if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
// make the spider request rec for it
SpiderRequest ksr;
// to defaults (zero out)
ksr.reset();
// set other fields besides key
ksr.m_firstIp = firstIp;
ksr.m_hostHash32 = hostHash32;
ksr.m_domHash32 = domHash32;
ksr.m_siteHash32 = linkSiteHashes[i];//siteHash32;
ksr.m_siteNumInlinks = ksni;
ksr.m_siteNumInlinksValid = true;
ksr.m_isRSSExt = isRSSExt;
// continue using "test-spider" subdir to cache web pages
// if our parent was using that
//ksr.m_useTestSpiderDir = useTestSpiderDir;
ksr.m_parentIsSiteMap = parentIsSiteMap;
ksr.m_hasMediaExtension = url.hasMediaExtension();
ksr.m_hasMediaExtensionValid = 1;
// now we need this so we can share Msg12 spider locks with
// query reindex docid-based spider requests. that way
// we do not spider the same document at the same time.
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_pageNumInlinks = 0;
// hop count is now 16 bits so do not wrap that around
int32_t hc = m_hopCount + 1;
if ( hc > 65535 ) hc = 65535;
ksr.m_hopCount = hc;
// keep hopcount the same for redirs
if ( m_indexCodeValid &&
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
m_indexCode == EDOCNONCANONICAL ) )
ksr.m_hopCount = m_hopCount;
// for diffbot custom crawls we keep the computed hopcount
if ( ! cr->m_isCustomCrawl ) {
if ( issiteroot ) ksr.m_hopCount = 0;
if ( ispingserver ) ksr.m_hopCount = 0;
//if ( isrss ) ksr.m_hopCount = 0;
}
// log("ksr: url=%s hc=%i (isr=%i ips=%i icv=%i ic=%i mhc=%i)",
// url.getUrl(),(int)ksr.m_hopCount,
// (int)issiteroot,(int)ispingserver,(int)m_indexCodeValid,
// (int)m_indexCode,(int)m_hopCount
// );
// validate it
ksr.m_hopCountValid = true;
ksr.m_addedTime = getSpideredTime();//m_spideredTime;
//ksr.m_lastAttempt = 0;
//ksr.m_urlPubDate = urlPubDate;
//ksr.m_errCode = 0;
ksr.m_parentHostHash32 = hostHash32a;
ksr.m_parentDomHash32 = m_domHash32;
ksr.m_parentSiteHash32 = m_siteHash32;
		// if a seed/hopcount0 url redirected to a different domain
		// then treat the redirect's domain/host as the parent's so
		// that outlinks on it can still satisfy the "isonsamedomain"
		// expression in the url filters table.
if ( redirDomHash32 == domHash32 && redirDomHash32 )
ksr.m_parentDomHash32 = redirDomHash32;
if ( redirHostHash32 == hostHash32 && redirHostHash32 )
ksr.m_parentHostHash32 = redirHostHash32;
//ksr.m_parentFirstIp = *pfip;//m_ip;
ksr.m_pageNumInlinks = 0;
ksr.m_parentHasAddress = parentHasAddress;
// get this
bool isupf = ::isPermalink(NULL,&url,CT_HTML,NULL,isRSSExt);
// set some bit flags. the rest are 0 since we call reset()
if ( newOutlink ) ksr.m_isNewOutlink = 1;
if ( isupf ) ksr.m_isUrlPermalinkFormat = 1;
//if ( isIndexed ) ksr.m_isIndexed = 1;
if ( ispingserver ) ksr.m_isPingServer = 1;
// is it like www.xxx.com/* (does not include www.xxx.yyy.com)
// includes xxx.com/* however
ksr.m_isWWWSubdomain = url.isSimpleSubdomain();
// get link text we use for this outlink
/*
char tbuf[200];
int32_t tlen = links->getLinkText2 ( i ,
tbuf ,
200 ,
NULL ,
NULL ,
NULL ,
m_niceness );
*/
// the updated isContacty algo to fix www.apha.org which
// has a ton of apha.org/about/* links
int32_t t = getIsContacty ( &url,
NULL ,
ksr.m_hopCount ,
0 , // content type
(ksr.m_hopCount==0),
m_niceness );
// if same type as last one we might disqualify if 3 in a row
if ( t && t == lastType ) consec++;
else consec = 0;
		// disqualify this pattern as a contacty link if it is abused
if ( consec >= 3 )
if ( ! disqualify.addKey(&t) )
return NULL;
// remember. use numAdded as the index for this since we do
// not add all the outlinks to this list.
if ( numAdded < 2000 ) linkTypes[numAdded] = t;
// set this
lastType = t;
// validate
ksr.m_isContactyValid = 1;
// if parent is a root of a popular site, then it is considered
// an authority linker. (see updateTagdb() function above)
if ( *isRoot && *psni >= 500 )
ksr.m_hasAuthorityInlink = 1;
// this is in request now as well as reply
//Tag *tag;
// hascontactinfo tag can have a value of 0 or 1
//tag = gr->getTag("hascontactinfo");
//if ( tag ) {
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
if ( m_hasContactInfo ) {
ksr.m_hasContactInfo = 1;
ksr.m_hasContactInfoValid = true;
}
// if we just set the contact info, use us, more recent
if ( linkSiteHashes[i]==m_siteHash32 && m_hasContactInfoValid){
ksr.m_hasContactInfo = m_hasContactInfo;
ksr.m_hasContactInfoValid = true;
}
if ( gr->getTag("ingoogle" ) ) {
ksr.m_inGoogle = 1;
ksr.m_inGoogleValid = true;
}
// the mere existence of these tags is good
if ( gr->getTag("authorityinlink"))ksr.m_hasAuthorityInlink =1;
ksr.m_hasAuthorityInlinkValid = true;
// if our url was a seed and redirected to another domain
// allow outlinks on that other domain to be on domain too.
// only used for diffbot crawlbot right now.
if ( domHash32 == redirDomHash32 && redirDomHash32 )
ksr.m_sameDom = 1;
if ( hostHash32 == redirHostHash32 && redirHostHash32 )
ksr.m_sameHost = 1;
// if ( linkSiteHashes[i]==redirSiteHash32 && redirSiteHash32)
// ksr.m_sameSite = 1;
// set parent based info
if ( domHash32 == m_domHash32 ) ksr.m_sameDom = 1;
if ( hostHash32 == m_hostHash32a ) ksr.m_sameHost = 1;
if ( linkSiteHashes[i]==m_siteHash32 ) ksr.m_sameSite = 1;
if ( *ipi ) ksr.m_wasParentIndexed = 1;
if ( isParentRSS ) ksr.m_parentIsRSS = 1;
if ( parentIsPermalink ) ksr.m_parentIsPermalink = 1;
if ( isParentPingServer ) ksr.m_parentIsPingServer= 1;
if ( parentIsSiteMap ) ksr.m_parentIsSiteMap = 1;
// this is used for building dmoz. we just want to index
// the urls in dmoz, not their outlinks.
if ( avoid ) ksr.m_avoidSpiderLinks = 1;
// this is used for building dmoz. we need to index this
// url even in the case of ETCPTIMEDOUT, etc.
if ( ignore ) ksr.m_ignoreExternalErrors = 1;
// . if this is the 2nd+ time we were spidered and this outlink
// wasn't there last time, then set this!
// . if this is the first time spidering this doc then set it
// to zero so that m_minPubDate is set to -1 when the outlink
// defined by "ksr" is spidered.
if ( m_oldDocValid && m_oldDoc ) {
int32_t oldSpideredTime = m_oldDoc->getSpideredTime();
ksr.m_parentPrevSpiderTime = oldSpideredTime;
}
else
ksr.m_parentPrevSpiderTime = 0;
//
	// . inherit the manual add bit if redirecting to a simplified url
	// . so we always spider the seed url even if it is prohibited by
	//   the regex, and even if it redirects to a simplified url
//
if ( m_indexCodeValid &&
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
m_indexCode == EDOCNONCANONICAL ) &&
m_sreqValid ) {
if ( m_sreq.m_isInjecting )
ksr.m_isInjecting = 1;
if ( m_sreq.m_isAddUrl )
ksr.m_isAddUrl = 1;
}
// it is useful to know the primary langid of the parent
// when prioritizing links for spidering in the case of
// focussing the search engine on a particular set of langs
ksr.m_parentLangId = *langId;
// don't forget this one!
//ksr.m_spiderTime = nowGlobal;
// . is it "spam"? XmlDoc.cpp::isSpam()
// . we need to make that root quality into site root quality!
// . let's put spam detection logic into url filters
//if ( isSpam ( s,gr,m_spideredTime,true ) )
// // set the bit flag
// ksr.m_isSpam = 1;
// copy the url into SpiderRequest::m_url buffer
strcpy(ksr.m_url,s);
// this must be valid
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
ksr.setKey ( firstIp, *d , false );
// we were hopcount 0, so if we link to ourselves we override
// our original hopcount of 0 with this guy that has a
// hopcount of 1. that sux... so don't do it.
if ( ksr.getUrlHash48() == myUh48 ) continue;
// if we've recently added this url to spiderdb in Spider.cpp, skip it
//if ( sc && sc->isInDupCache ( &ksr , false ) )
// continue;
// . technically speaking we do not have any reply so we
// should not be calling this! cuz we don't have all the info
// . see if banned or filtered, etc.
// . at least try to call it. getUrlFilterNum() should
// break out and return -1 if it encounters a filter rule
// that it does not have enough info to answer.
// so if your first X filters all map to a "FILTERED"
// priority and this url matches one of them we can
// confidently toss this guy out.
// . show this for debugging!
// int32_t ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
// false, m_niceness, cr,
// false,//true , // outlink?
// NULL ); // quotatable
// logf(LOG_DEBUG,"build: ufn=%"INT32" for %s",
// ufn,ksr.m_url);
// bad?
//if ( ufn < 0 ) {
// log("build: link %s had bad url filter."
// , ksr.m_url );
// g_errno = EBADENGINEER;
// return NULL;
//}
//int32_t priority = -1;
//if ( ufn >= 0 )
// priority = cr->m_spiderPriorities[ufn];
// debug
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {
// print the tag rec out into sb2
SafeBuf sb2;
if ( gr ) gr->printToBuf ( &sb2 );
// get it
//SafeBuf sb1;
char *action = "add";
if ( isScraping ) action = "scrape";
logf(LOG_DEBUG,
"spider: attempting to %s link. "
"%s "
"tags=%s "
"onpage=%s"
,
action ,
ksr.m_url,
//sb1.getBufStart(),
sb2.getBufStart(),
m_firstUrl.m_url);
}
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
// . mdw: oct 24, 2013. now i add so the urls show up in
// the pagecrawlbot.cpp spiderdb dump, so you can examine
// exactly why a url was crawled or not. plus if you change
// your mind about banning/filtering then it'd be nice to
// have these urls readily available.
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
// linksFiltered++; continue; }
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }
// serialize into the buffer
int32_t need = ksr.getRecSize();
// is that what we thought it would be?
//int32_t thought = links->m_linkLens[i] + 1 + hsize;
// sanity check
//if ( need + 12 + 4 > thought ) { char *xx=NULL;*xx=0; }
// sanity check
if ( p + 1 + need > m_pend ) { char *xx=NULL;*xx=0; }
// store the rdbId
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
else *p++ = RDB_SPIDERDB;
// print it for debug
if ( isTestColl ) {
SafeBuf tmp;
ksr.print(&tmp);
log("spider: attempting to add outlink "
"%s",tmp.getBufStart());
}
// store the spider rec
gbmemcpy ( p , &ksr , need );
// skip it
p += need;
// count it
numAdded++;
// check domain
//if ( domHash32 == m_domHash32 ) numAddedFromSameDomain++;
if ( ksr.m_sameDom ) numAddedFromSameDomain++;
}
//
// scan through requests and set m_isContacty
//
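	// each record in [m_p,p) is one rdbId byte followed by a
	// variable-size SpiderRequest, so we step by 1 + getRecSize().
	// linkTypes[] and "disqualify" were filled in the loop above,
	// indexed by the same numAdded counter (capped at 2000)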
char *s = m_p;
int32_t k = 0;
for ( ; s < p ; k++ ) {
// advance over rdbid
s++;
// breathe
QUICKPOLL(m_niceness);
// cast
SpiderRequest *ksr = (SpiderRequest *)s;
// set size
size = ksr->getRecSize();
// advance over that
s += size;
// stop if breach
if ( k >= 2000 ) break;
// must be isContacty
if ( ! linkTypes[k] ) continue;
// and not disqualified
if ( disqualify.isInTable(&linkTypes[k] )) continue;
// ok, we are good to go
ksr->m_isContacty = 1;
}
// . this is just how many urls we tried to index
// . move into Spider::addSpiderRequest()
//cr->m_localCrawlInfo.m_urlsHarvested += numAdded;
//cr->m_globalCrawlInfo.m_urlsHarvested += numAdded;
//cr->m_needsSave = true;
// save it
m_numOutlinksAdded = numAdded;
m_numOutlinksAddedValid = true;
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
m_numOutlinksFiltered = linksFiltered;
m_numOutlinksBanned = linksBanned;
// update end of list once we have successfully added all spider recs
m_p = p;
// return current ptr
return m_p ;
}
/*
// add keys/recs from the table into the metalist
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
int32_t date1 ,
bool nosplit ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
// docid is handy
int64_t d = *getDocId();
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// use secondary rdbs if repairing
//bool useRdb2 = ( g_repair.isRepairActive() &&
// ! g_repair.m_fullRebuild &&
// ! g_repair.m_removeBadPages );
char rdbId1 = RDB_INDEXDB;
char rdbId2 = RDB_DATEDB;
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
rdbId1 = RDB2_INDEXDB2;
rdbId2 = RDB2_DATEDB2;
}
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// get its key
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
// get the score
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
// sanity check
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
// store rdbid
*m_p++ = (rdbId1 | f);
// store it. not a del key.
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,false);
// skip it
m_p += sizeof(key_t);
// add to datedb?
if ( date1 == -1 ) continue;
// yes
*m_p++ = (rdbId2 | f);
// store it. not a del key.
*(key128_t *)m_p=
g_datedb.makeKey(*termId1,date1,score1,d,false);
// advance over that
m_p += sizeof(key128_t);
}
return true;
}
*/
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
uint8_t rdbId ,
bool forDelete ) {
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
// store this rdbId into the list
char useRdbId = rdbId;
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) useRdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) useRdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) useRdbId = RDB2_DATEDB2;
if ( useRdb2 && rdbId == RDB_PLACEDB ) useRdbId = RDB2_PLACEDB2;
if ( useRdb2 && rdbId == RDB_SECTIONDB ) useRdbId = RDB2_SECTIONDB2;
// sanity checks
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
if ( rdbId == RDB_PLACEDB ) {
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
}
else if ( rdbId == RDB_SECTIONDB ) {
int32_t svs = sizeof(SectionVote);
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
}
else {
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
}
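	// per-slot record layout appended to the metalist below:
	//   [1 byte rdbId][key128_t]                     most rdbs
	//   [1 byte rdbId][key128_t][SectionVote]        sectiondb
	//   [1 byte rdbId][key128_t][dataSize][string\0] placedb
	// when forDelete is true only the bare keys are stored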
int32_t count = 0;
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key128_t *k = (key128_t *)tt1->getKey ( i );
// no key is allowed to have the del bit clear at this point
// because we reserve that for making negative keys!
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
// store rdbid
*m_p++ = useRdbId; // (useRdbId | f);
// store it
// *(key128_t *)m_p = *k; does this work?
gbmemcpy ( m_p , k , sizeof(key128_t) );
// all keys must be positive at this point
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
// or if getting for incremental indexing and this is
// from the "oldList"
//if ( forDelete ) *m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key128_t);
// count it
count++;
// do not add the data if deleting
if ( forDelete ) continue;
// skip if not sectiondb or placedb
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
// ok test it out (MDW)
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
//if ( count > 1 ) continue;
// get the data value
char *val = (char *)tt1->getValue ( k );
// get the size of the data to store. assume Sectiondb vote.
int32_t ds = sizeof(SectionVote);
		// placedb is special: its data is a string, so include the \0 terminator
if ( rdbId == RDB_PLACEDB ) {
// "ds" is how many bytes we store as data
ds = gbstrlen(val)+1;
// store dataSize first
*(int32_t *)m_p = ds;
// skip it
m_p += 4;
}
// store possible accompanying date of the rdb record
gbmemcpy (m_p,val, ds );
// skip it
m_p += ds;
}
//if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count);
//if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count);
return true;
}
int32_t XmlDoc::getSiteRank ( ) {
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
return ::getSiteRank ( m_siteNumInlinks );
}
// . add keys/recs from the table into the metalist
// . we store the keys into "m_p" unless "buf" is given
bool XmlDoc::addTable144 ( HashTableX *tt1 , int64_t docId , SafeBuf *buf ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key144_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
// assume we are storing into m_p
char *p = m_p;
// reserve space if we had a safebuf and point into it if there
if ( buf ) {
int32_t slotSize = (sizeof(key144_t)+2+sizeof(key128_t));
int32_t need = tt1->getNumSlotsUsed() * slotSize;
if ( ! buf->reserve ( need ) ) return false;
// get cursor into buf, NOT START of buf
p = buf->getBufStart();
}
int32_t siteRank = getSiteRank ();
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
char rdbId = RDB_POSDB;
if ( m_useSecondaryRdbs ) rdbId = RDB2_POSDB2;
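	// for each slot below we copy the 18-byte posdb key as-is and then
	// patch in the docid. if the key's alignment bit is clear it encodes
	// a float (e.g. a gbsortby: price) in the word-position bits, so we
	// just restore that bit and leave siterank/langid at zero to keep
	// the termlist sorted by the float; otherwise we also stamp in the
	// siterank and langid bits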
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// get its key
char *kp = (char *)tt1->getKey ( i );
// store rdbid
*p++ = rdbId; // (rdbId | f);
// store it as is
gbmemcpy ( p , kp , sizeof(key144_t) );
// sanity check
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//h64 &= TERMID_MASK;
//if ( g_posdb.getTermId(kp) == h64 ) {
// log("hey: docid=%"INT64" float=%f",m_docId,
// g_posdb.getFloat(kp) );
//}
/*
// get the score
int32_t score = tt1->getScoreFromSlot ( i ) ;
// set the M-bits to the score. used to accumulate link texts
// that are the same so pages like google.com do not have
// the word 'google' like 1 million times. this should reduce
// our "score" logarithmacly into the 7-bits or whatever.
//
// NO! now we just always increment the distance cursor
// m_dist so there will never be a collision of any posdb
// key we add... so we think
if ( score ) {
int32_t newScore = score;
if ( score >= 65 ) newScore = 65 +(score/100);
//if ( score >= 65+3200) newScore = 65 +(score/100);
if ( newScore > MAXMULTIPLIER )
newScore = MAXMULTIPLIER;
g_posdb.setMultiplierBits(m_p,(unsigned char)newScore);
}
*/
		// the docid bits were zero when we added these keys to the
		// table, so set them now
g_posdb.setDocIdBits ( p , docId );
// if this is a numeric field we do not want to set
// the siterank or langid bits because it will mess up
// sorting by the float which is basically in the position
// of the word position bits.
if ( g_posdb.isAlignmentBitClear ( p ) ) {
// make sure it is set again. it was just cleared
// to indicate that this key contains a float
// like a price or something, and we should not
// set siterank or langid so that its termlist
// remains sorted just by that float
g_posdb.setAlignmentBit ( p , 1 );
}
// otherwise, set the siterank and langid
else {
// this too
g_posdb.setSiteRankBits ( p , siteRank );
// set language here too
g_posdb.setLangIdBits ( p , m_langId );
}
// advance over it
p += sizeof(key144_t);
}
// all done
if ( ! buf ) { m_p = p; return true; }
// update safebuf otherwise
char *start = buf->getBufStart();
// fix SafeBuf::m_length
buf->setLength ( p - start );
// sanity
if ( buf->length() > buf->getCapacity() ) { char *xx=NULL;*xx=0; }
return true;
}
// add keys/recs from the table into the metalist
bool XmlDoc::addTable224 ( HashTableX *tt1 ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key224_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 0 ) {char *xx=NULL;*xx=0;}
}
char rdbId = RDB_LINKDB;
if ( m_useSecondaryRdbs ) rdbId = RDB2_LINKDB2;
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// get its key
char *kp = (char *)tt1->getKey ( i );
// store rdbid
*m_p++ = rdbId; // (rdbId | f);
// store it as is
gbmemcpy ( m_p , kp , sizeof(key224_t) );
// advance over it
m_p += sizeof(key224_t);
}
return true;
}
/*
// . add table into our metalist pointed to by m_p
// . k.n1 = date (see hashWords() below)
// . k.n0 = termId (see hashWords() below)
// . and the value is the score, 32-bits
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
uint64_t docId ,
uint8_t rdbId ,
bool nosplit ) {
if ( tt1->m_numSlotsUsed == 0 ) return true;
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( nosplit ) {
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
}
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
// sanity checks
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key96_t *k = (key96_t *)tt1->getKey ( i );
// get its value
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
// convert to 8 bits
v = score32to8 ( v );
// . make the meta list key for datedb
// . a datedb key (see Datedb.h)
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
k->n1 , // date
v , // score (8 bits)
docId ,
false );// del key?
// store rdbid with optional "nosplit" flag
*m_p++ = (rdbId | f);
// store it. it is a del key.
*(key128_t *)m_p = mk;
// skip it
m_p += sizeof(key128_t);
}
return true;
}
*/
/*
// add keys/recs from the table into the metalist
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
HashTableX *tt2 ,
int32_t date1 ,
int32_t date2 ,
bool del ,
bool nosplit ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
if ( tt2->m_numSlots ) {
if ( tt2->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
if ( tt2->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
// docid is handy
int64_t d = *getDocId();
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// use secondary rdbs if repairing
//bool useRdb2 = ( g_repair.isRepairActive() &&
// ! g_repair.m_fullRebuild &&
// ! g_repair.m_removeBadPages );
char rdbId1 = RDB_INDEXDB;
char rdbId2 = RDB_DATEDB;
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
rdbId1 = RDB2_INDEXDB2;
rdbId2 = RDB2_DATEDB2;
}
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
// get the score
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
// sanity check
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
// see if in "tt2"
int32_t slot = tt2->getSlot ( termId1 );
// assume 0
uint8_t score2 = 0;
// look it up in the positive key table
if ( slot >= 0 ) {
score2 = score32to8 ( tt2->getScoreFromSlot(slot) );
// sanity check
if ( score2 <= 0 ) { char *xx=NULL;*xx=0; }
}
// we annihilate!
if ( score1 != score2 ) {
// store rdbid
*m_p++ = (rdbId1 | f);
// store it. it is a del key.
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,del);
// skip it
m_p += sizeof(key_t);
}
// add to datedb?
if ( date1 == -1 ) continue;
// same dates too?
if ( date1 == date2 && score1 == score2 ) continue;
// yes
*m_p++ = (rdbId2 | f);
// store it. it is a del key.
*(key128_t *)m_p=g_datedb.makeKey(*termId1,date1,score1,d,del);
// advance over that
m_p += sizeof(key128_t);
}
return true;
}
// . add table into our metalist pointed to by m_p
// . k.n1 = date (see hashWords() below)
// . k.n0 = termId (see hashWords() below)
// . and the value is the score, 32-bits
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
HashTableX *tt2 , // <key128_t,char> *tt2
uint64_t docId ,
uint8_t rdbId ,
bool del ,
bool nosplit ) {
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( nosplit ) {
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
}
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
// sanity checks
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != 4 ) { char *xx=NULL;*xx=0; }
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key96_t *k = (key96_t *)tt1->getKey ( i );
// get its value
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
// convert to 8 bits
v = score32to8 ( v );
// see if in "tt2"
int32_t slot = tt2->getSlot ( k );
// get value if there
if ( slot >= 0 ) {
// get it
uint32_t val =*(uint32_t *)tt2->getValueFromSlot(slot);
// convert to 8 bits
val = score32to8 ( val );
// compare, if same, skip it!
if ( val == v ) continue;
}
// . make the meta list key for datedb
// . a datedb key (see Datedb.h)
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
k->n1 , // date
v , // score (8 bits)
docId ,
del );// del key?
// store rdbid with optional "nosplit" flag
*m_p++ = (rdbId | f);
// store it. it is a del key.
*(key128_t *)m_p = mk;
// skip it
m_p += sizeof(key128_t);
}
return true;
}
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
HashTableX *tt2 , // <key128_t,char> *tt2
uint8_t rdbId ,
bool del ,
bool nosplit ) {
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( nosplit ) {
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
if ( rdbId == RDB_DATEDB ) { char *xx=NULL;*xx=0; }
}
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
// sanity checks
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ks != 16 ) { char *xx=NULL;*xx=0; }
if ( rdbId == RDB_PLACEDB ) {
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != 512 ) { char *xx=NULL;*xx=0; }
}
else if ( rdbId == RDB_SECTIONDB ) {
int32_t svs = sizeof(SectionVote);
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != svs ) { char *xx=NULL;*xx=0; }
}
else {
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != 0 ) { char *xx=NULL;*xx=0; }
}
int32_t count = 0;
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key128_t *k = (key128_t *)tt1->getKey ( i );
// no key is allowed to have the del bit clear at this point
// because we reserve that for making negative keys!
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
// see if in "tt2"
int32_t slot = tt2->getSlot ( k );
// . skip if already indexed
// . do not do incremental indexing for sectiondb/placedb since
// it may have the same key but different data!!!!!!!
if ( slot >= 0 &&
rdbId != RDB_SECTIONDB &&
rdbId != RDB_PLACEDB )
continue;
// store rdbid with optional "nosplit" flag
*m_p++ = (rdbId | f);
// store it
// *(key128_t *)m_p = *k; does this work?
gbmemcpy ( m_p , k , sizeof(key128_t) );
// all keys must be positive at this point
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
// clear the del bit if we are an unmatched key and "del"
// is true. we need to be a negative key now
if ( del ) m_p[0] = m_p[0] & 0xfe;
// skip key
m_p += sizeof(key128_t);
// count it
count++;
// skip if not sectiondb or placedb
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
// ok test it out (MDW)
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
//if ( count > 1 ) continue;
// if we were a negative key, do not add a value, even for
// sectiondb
if ( del ) continue;
// get the data value
char *val = (char *)tt1->getValue ( k );
// get the size of the data to store. assume Sectiondb vote.
int32_t ds = sizeof(SectionVote);
// placedb is special even. include the \0 terminator
if ( rdbId == RDB_PLACEDB ) {
// "ds" is how many bytes we store as data
ds = gbstrlen(val)+1;
// store dataSize first
*(int32_t *)m_p = ds;
// skip it
m_p += 4;
}
// store possible accompanying date of the rdb record
gbmemcpy (m_p,val, ds );
// skip it
m_p += ds;
}
//if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count);
//if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count);
return true;
}
*/
//
// . hash terms that are sharded by TERMID not DOCID!!
//
// . returns false and sets g_errno on error
// . these terms are stored in indexdb/datedb, but all terms with the same
// termId reside in one and only one group. whereas normally the records
// are split based on docid and every group gets 1/nth of the termlist.
// . we do this "no splitting" so that only one disk seek is required, and
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
//if ( m_pbuf )
// m_pbuf->safePrintf("<h3>Terms which are immune to indexdb "
// "splitting:</h3>");
//if ( m_skipIndexing ) return true;
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
//int64_t *pch64 = getLooseContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
	// shortcut
Url *fu = getFirstUrl();
if ( ! hashVectors ( tt ) ) return false;
// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// usually we shard by docid, but these are terms we shard by termid!
hi.m_shardByTermId = true;
// for exact content deduping
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
char cbuf[64];
int32_t clen = sprintf(cbuf,"%"UINT64"",*pch64);
hi.m_prefix = "gbcontenthash";
if ( ! hashString ( cbuf,clen,&hi ) ) return false;
////
//
// let's stop here for now, until other stuff is actually used again
//
////
// let's bring back image thumbnail support for the widget project
//return true;
char *host = fu->getHost ();
//int32_t hlen = fu->getHostLen ();
/*
setStatus ( "hashing no-split qdom keys" );
char *dom = fu->getDomain ();
int32_t dlen = fu->getDomainLen();
// desc is NULL, prefix will be used as desc
hi.m_prefix = "qdom";
if ( ! hashString ( dom,dlen,&hi ) ) return false;
setStatus ( "hashing no-split qhost keys" );
// desc is NULL, prefix will be used as desc
hi.m_prefix = "qhost";
if ( ! hashString ( host,hlen,&hi ) ) return false;
*/
// now hash the site
setStatus ( "hashing no-split SiteGetter terms");
//
// HASH terms for SiteGetter.cpp
//
// these are now no-split terms
//
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->m_plen <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
// hash it
if ( add ) {
// remove the last path component
char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
//Dates *dp = getDates ();
// hash the clocks into indexdb
//if ( ! dp->hash ( m_docId , tt , this ) ) return false;
// . hash special site/hopcount thing for permalinks
// . used by Images.cpp for doing thumbnails
// . this returns false and sets g_errno on error
// . let's try thumbnails for all...
//if ( ! *getIsPermalink() ) return true;
setStatus ( "hashing no-split gbsitetemplate keys" );
// must be valid
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
char buf[MAX_URL_LEN+20];
//uint32_t th = m_tagVector.getVectorHash();
uint32_t tph = *getTagPairHash32();
// . skip this so we can do site:xyz.com queries
// . but if this is https:// then you will have to
// specify that...
char *site = getSite();
// sanity check, must NOT start with http://
if ( ! strncmp ( site , "http://", 7 ) ) { char *xx=NULL;*xx=0;}
// this must match what we search in Images.cpp::getThumbnail()
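	// e.g. with tph=305419896 and site "xyz.com/" the buffer below is
	// "305419896xyz.com/", indexed as a single term under the
	// "gbsitetemplate" prefix (values illustrative)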
int32_t blen = sprintf(buf,"%"UINT32"%s",tph,site);
// use the prefix as the description if description is NULL
hi.m_prefix = "gbsitetemplate";
//if ( ! hashString ( buf,blen,&hi ) ) return false;
if ( ! hashSingleTerm ( buf,blen,&hi ) ) return false;
setStatus ( "hashing no-split gbimage keys" );
hi.m_prefix = "gbimage";
// hash gbimage: for permalinks only for Images.cpp
for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) {
// get the node number
//int32_t nn = m_images.m_imageNodes[i];
// get the url of the image
//XmlNode *xn = m_xml.getNodePtr(nn);
int32_t srcLen;
char *src = m_images.getImageUrl(i,&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
Url *cu = getCurrentUrl();
// we can addwww to normalize since this is for deduping kinda
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
char *u = iu.getUrl ();
int32_t ulen = iu.getUrlLen();
// hash each one
//if ( ! hashString ( u,ulen,&hi ) ) return false;
// hash a single entity
if ( ! hashSingleTerm ( u,ulen,&hi) ) return false;
//log("test: %s",u);
}
return true;
}
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . "sr" is the tagdb Record
// . "ws" store the terms for PageParser.cpp display
char *XmlDoc::hashAll ( HashTableX *table ) {
setStatus ( "hashing document" );
if ( m_allHashed ) return (char *)1;
// sanity checks
if ( table->m_ks != 18 ) { char *xx=NULL;*xx=0; }
if ( table->m_ds != 4 ) { char *xx=NULL;*xx=0; }
if ( m_wts && m_wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
// ptr to term = 4 + score = 4 + ptr to sec = 4
if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){char *xx=NULL;*xx=0;}
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
// need this for hashing
HashTableX *cnt = getCountTable();
if ( ! cnt ) return (char *)cnt;
if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; }
// and this
//Weights *we = getWeights();
//if ( ! we || we == (void *)-1 ) return (char *)we;
// and this
Links *links = getLinks();
if ( ! links ) return (char *)links;
if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; }
// and now this
//Synonyms *syn = getSynonyms();
//if ( ! syn || syn == (void *)-1 ) return (char *)syn;
char *wordSpamVec = getWordSpamVec();
if (!wordSpamVec) return (char *)wordSpamVec;
if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;}
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
if ( ! fragVec ) return (char *)fragVec;
if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; }
// why do we need this?
if ( m_wts ) {
uint8_t *lv = getLangVector();
if ( ! lv ) return (char *)lv;
if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
TagRec *gr = getTagRec();
if ( ! gr ) return (char *)gr;
if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// just keep it somewhat sane...
//if ( nw > 30000 ) nw = 30000;
// then each singleton has one phrase, and 1 empty for good hashing
//if ( ! table->setTableSize ( nw * 4 ) )
// return log("build: Could not allocate %"INT32" bytes for table "
// "for indexing document.",
// (nw*4)*(8+sizeof(int32_t)));
/*
const char *help =
"<table><td bgcolor=lightgreen>\n"
"Each document has several associated pieces. Each piece "
"is indexed individually. The pieces are listed below and "
"are preceeded with a table dictating the parameters with "
"which the piece was indexed."
"<br><br>"
"Below that table the actual text of the piece is displayed. "
"Each alphanumeric word in the text has two subscripts of the "
"form <i>X/Y</i> where X and Y are percentage weights on the "
"score of that particular alphanumeric word. X is the weight "
"on the word itself and Y is the weight on the phrase which "
"is started by that word. A weight of 100% "
"indicates a weight which does not affect the score."
"<br><br>"
"Words that are struck out and in a box with a red background "
"instead of light blue are considered to be spam, meaning "
"they are repeated in a pattern. They "
"contain a number in that box which indicates the probability "
"they are spam and 100 minus that probability is weighted "
"with their score to get a new, spam-adjusted score. "
"<br>\n"
"</tr>\n"
"</table>\n"
"</td></table>\n"
"<br><br>\n";
if ( m_pbuf ) m_pbuf->safePrintf("%s",help);
*/
/*
int32_t inlinks = *getSiteNumInlinks();
int32_t boost1 = getBoostFromSiteNumInlinks ( inlinks );
// . now we hard code "boost2"
// . based on # of alnum words
// . this makes us look at keyword density, not just the
// plain keyword count
int32_t naw = m_words.getNumAlnumWords();
// . keep at 100% for up to 200 words then reduce linearly
// . only do this for newer title recs to avoid undeletable data
// . if we have a huge document, it can still contain a very
// relevant paragraph that is dense in the query terms, so
// we really only want to punish enough so the post query
// reranking has some good candidates for doing proximity
// scoring.
// . back off by .90 every 1000 words
float nn = naw;
float bb = 100.0;
while ( nn > 1000 ) {
nn *= .9;
bb *= .9;
}
// never drop below %1
if ( bb < 1.0 ) bb = 1.0;
// set it
int64_t boost2 = (int64_t)bb;
*/
/*
int32_t siteNumInlinks = *getSiteNumInlinks();
if ( m_pbuf )
m_pbuf->safePrintf(
"<table border=1 cellpadding=2>"
"<tr><td>siteNumInlinks</td><td><b>%"INT32"%%</b></td></tr>"
"<tr><td>siteNumInlinksBoost</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>numAlnumWords</td>"
"<td>%"INT32"</td></tr> "
"<tr><td>scoreWeightFromNumAlnumWords"
"</td><td>%"INT32"%%</td></tr>"
"<tr><td>headerWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>urlPathWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>externalLinkTextWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>internalLinkTextWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>conceptWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>titleWeight</td>"
"<td>%"INT32"%%</td></tr>"
"</table>"
"<br>"
,
(int32_t)siteNumInlinks,
(int32_t)boost1,
//(int32_t)len,
(int32_t)naw,
(int32_t)boost2,
(int32_t)boost1,
(int32_t)boost2,
//(int32_t)boost1,
(int32_t)m_headerWeight,
(int32_t)m_urlPathWeight,
(int32_t)m_externalLinkTextWeight,
(int32_t)m_internalLinkTextWeight,
(int32_t)m_conceptWeight,
(int32_t)m_titleWeight,
(int32_t)m_titleWeight,
(int32_t)boost1,
(int32_t)boost1,
);
*/
// do not repeat this if the cachedb storage call blocks
m_allHashed = true;
// reset distance cursor
m_dist = 0;
// hash diffbot's json output here
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
/*
if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) {
// hash the content type for type:json query
if ( ! hashContentType ( table ) ) return NULL;
// and the url: query support
if ( ! hashUrl ( table ) ) return NULL;
// language support
if ( ! hashLanguage ( table ) ) return NULL;
// country?
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashTagRec ( table ) ) return NULL;
// hash for gbsortby:gbspiderdate
if ( ! hashDateNumbers ( table ) ) return NULL;
// has gbhasthumbnail:1 or 0
if ( ! hashImageStuff ( table ) ) return NULL;
// and the json itself
return hashJSON ( table );
}
*/
if ( ! hashContentType ( table ) ) return NULL;
if ( ! hashUrl ( table ) ) return NULL;
if ( ! hashLanguage ( table ) ) return NULL;
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashSiteNumInlinks( table ) ) return NULL;
if ( ! hashTagRec ( table ) ) return NULL;
if ( ! hashAds ( table ) ) return NULL;
if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) ) return NULL;
// has gbhasthumbnail:1 or 0
if ( ! hashImageStuff ( table ) ) return NULL;
// . hash sectionhash:xxxx terms
// . diffbot still needs to hash this for voting info
if ( ! hashSections ( table ) ) return NULL;
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
// which we use for diffbot custom crawls as well.
if ( ! hashNoSplit ( table ) ) return NULL;
// MDW: i think we just inject empty html with a diffbotreply into
// global index now, so don't need this... 9/28/2014
// stop indexing xml docs
bool indexDoc = true;
if ( cr->m_isCustomCrawl ) indexDoc = false;
if ( ! cr->m_indexBody ) indexDoc = false;
// if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject )
// indexDoc = true;
// always index diffbot json objects for GI (custom crawl is false)
if ( m_isDiffbotJSONObject )
indexDoc = true;
// global index unless this is a json object in which case it is
// hashed above in the call to hashJSON(). this will decrease disk
// usage by about half, posdb* files are pretty big.
if ( ! indexDoc ) return (char *)1;
// hash json fields
if ( *ct == CT_JSON ) {
// this hashes both with and without the fieldname
hashJSONFields ( table );
goto skip;
}
// same for xml now, so we can search for field:value like w/ json
if ( *ct == CT_XML ) {
// this hashes both with and without the fieldname
hashXMLFields ( table );
goto skip;
}
// hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections
if ( ! hashBody2 (table ) ) return NULL;
// hash the title now too so neighborhood singles have more
// to match. plus, we only hash these title terms iff they
// are not already in the hash table, so as to avoid hashing
// repeated title terms because we do not do spam detection
// on them. thus, we need to hash these first before anything
// else. give them triple the body score
if ( ! hashTitle ( table )) return NULL;
// . hash the keywords tag, limited to first 2k of them so far
// . hash above the neighborhoods so the neighborhoods only index
// what is already in the hash table
if ( ! hashMetaKeywords(table ) ) return NULL;
// then hash the incoming link text, NO ANOMALIES, because
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if ( ! hashIncomingLinkText(table,false,true)) return NULL;
// then the meta summary and description tags with half the score of
// the body, and only hash a term if was not already hashed above
// somewhere.
if ( ! hashMetaSummary(table) ) return NULL;
skip:
// this will only increment the scores of terms already in the table
	// because the neighborhoods are not technically in the document
// necessarily and we do not want to ruin our precision
if ( ! hashNeighborhoods ( table ) ) return NULL;
if ( ! hashLinks ( table ) ) return NULL;
if ( ! hashDateNumbers ( table ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
if ( ! hashDMOZCategories( table ) ) return NULL;
if ( ! hashCharset ( table ) ) return NULL;
if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) ) return NULL;
// hash gblang:de last for parsing consistency
if ( ! hashLanguageString ( table ) ) return NULL;
// we set this now in hashWords3()
if ( m_doingSEO )
m_wordPosInfoBufValid = true;
// store the m_wordPosInfoBuf into cachedb
// NO! we are not allowed to block in here it messes shit up!!!
//if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
// return (char *)-1;
// . hash gbkeyword:gbmininlinks where the score is the inlink count
// . the inlink count can go from 1 to 255
// . an ip neighborhood can vote no more than once
// . this is in LinkInfo::hash
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
if ( ! hashMetaData ( table ) ) return NULL;
// return true if we don't need to print parser info
//if ( ! m_pbuf ) return true;
// print out the table into g_bufPtr now if we need to
//table->print ( );
return (char *)1;
}
// . "inlinks" is # of inlinks to the SITE
// . returns a percentage boost
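// . e.g. a site with 500 inlinks falls into the ">= 400" bucket below
//   and gets a 350% boost; fewer than 10 inlinks stays at 100%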
int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) {
// . base on # of site inlinks
// . just hard code this for now
int32_t boost1 = 100;
if ( inlinks >= 10 ) boost1 = 150;
if ( inlinks >= 50 ) boost1 = 200;
if ( inlinks >= 100 ) boost1 = 250;
if ( inlinks >= 200 ) boost1 = 300;
if ( inlinks >= 400 ) boost1 = 350;
if ( inlinks >= 800 ) boost1 = 400;
if ( inlinks >= 1600 ) boost1 = 450;
if ( inlinks >= 3200 ) boost1 = 500;
if ( inlinks >= 6400 ) boost1 = 550;
if ( inlinks >= 12800 ) boost1 = 600;
if ( inlinks >= 25600 ) boost1 = 650;
if ( inlinks >= 51200 ) boost1 = 700;
return boost1;
}
bool XmlDoc::appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) {
// set4() called from the inject sets these two things for meta data
// which is basically json that augments the doc, tags it with stuff
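	// flow below: append the new json metadata to the old doc's
	// metadata, rehash the merged blob into posdb keys via
	// hashMetaData() + addTable144(), rebuild the old doc's titlerec,
	// then push the posdb keys and the updated titlerec onto metaList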
if ( ! m_hasMetadata ) return true;
if ( ! ptr_metadata ) return true;
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod ) { char *xx=NULL;*xx=0; }
if ( pod == (XmlDoc **)-1 ) { char *xx=NULL;*xx=0; }
// this is non-NULL if it existed
XmlDoc *od = *pod;
// wtf?
if ( ! od ) return true;
// dedup. if already in there, do not re-add it
if ( strstr ( od->ptr_metadata , ptr_metadata ) )
return true;
SafeBuf md;
// copy over and append
if ( ! md.safeMemcpy ( od->ptr_metadata , od->size_metadata ) )
return false;
// remove trailing \0 if there
md.removeLastChar ( '\0' );
// separate from the new stuff
if ( ! md.safePrintf(",\n") )
return false;
if ( ! md.safeMemcpy ( ptr_metadata , size_metadata ) )
return false;
if ( ! md.nullTerm ( ) )
return false;
// update his meta data
od->ptr_metadata = md.getBufStart();
od->size_metadata = md.length();
int32_t nw = od->size_metadata * 4;
HashTableX tt1;
int32_t need4 = nw * 4 + 5000;
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,"posdb-i2"))
return false;
od->hashMetaData ( &tt1 );
// store the posdb keys from tt1 into our safebuf, tmp
SafeBuf sb;
if ( m_usePosdb && ! addTable144 ( &tt1 , od->m_docId , &sb ) )
return false;
// this could use time axis so that is taken into account
int64_t uh48 = getFirstUrlHash48();
// and re-formulate (and compress) his new title rec
SafeBuf trec;
if ( ! od->setTitleRecBuf ( &trec , od->m_docId , uh48 ) )
return false;
// force the title rec key to be the same
// if ( od->m_titleRecKeyValid && trec.getLength() >= sizeof(key_t) ) {
// char *p = trec.getBufStart();
// *(key_t *)p = od->m_titleRecKey;
// }
// else {
// log("build: old titlerec invalid docid=%"INT64,od->m_docId);
// }
// store the posdb keys in the meta list
if ( m_usePosdb && ! metaList->safeMemcpy ( &sb ) )
return false;
// store the updated titlerec into the meta list
if ( m_useTitledb && ! metaList->pushChar(RDB_TITLEDB) )
return false;
if ( m_useTitledb && ! metaList->safeMemcpy(&trec) )
return false;
m_updatedMetaData = true;
return true;
}
// . this is kinda hacky because it uses a short XmlDoc on the stack
// . no need to hash this stuff for regular documents since all the terms
// are fielded by gberrorstr, gberrornum or gbisreply.
// . normally we might use a separate xmldoc class for this but i wanted
// something more lightweight
SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
bool forDelete ) {
// set status for this
setStatus ( "getting spider reply meta list");
if ( m_spiderStatusDocMetaListValid )
return &m_spiderStatusDocMetaList;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_indexSpiderReplies || forDelete ) {
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// if docid based do not hash a spider reply. docid-based spider
// requests are added to spiderdb from the query reindex tool.
// do not do for diffbot subdocuments either, usespiderdb should be
// false for those.
// MDW: i disagree, i want to see when these get updated! 9/6/2014
// ok, let's index for diffbot objects so we can see if they are
// a dup of another diffbot object, or so we can see when they get
// revisted, etc.
//if ( m_setFromDocId || ! m_useSpiderdb ) {
if ( ! m_useSpiderdb && ! m_isDiffbotJSONObject ) {
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// do not add a status doc if doing a query delete on a status doc
if ( m_contentTypeValid && m_contentType == CT_STATUS ) {
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// doing it for diffbot throws off smoketests
// ok, smoketests are updated now, so remove this
// if ( strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
// m_spiderStatusDocMetaListValid = true;
// return &m_spiderStatusDocMetaList;
// }
// we double add regular html urls in a query reindex because the
// json url adds the parent, so the parent gets added twice sometimes,
// and for some reason it is adding a spider status doc the 2nd time
// so cut that out. this is kinda a hack b/c i'm not sure what's
// going on. but you can set a break point here and see what's up if
// you want.
// MDW: likewise, take this out, i want these recorded as well..
// if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
// m_spiderStatusDocMetaListValid = true;
// return &m_spiderStatusDocMetaList;
// }
// . fake this out so we do not core
// . hashWords3() uses it i guess
bool forcedLangId = false;
if ( ! m_langIdValid ) {
forcedLangId = true;
m_langIdValid = true;
m_langId = langUnknown;
}
// prevent more cores
bool forcedSiteNumInlinks = false;
if ( ! m_siteNumInlinksValid ) {
forcedSiteNumInlinks = true;
m_siteNumInlinks = 0;
m_siteNumInlinksValid = true;
}
SafeBuf *mbuf = getSpiderStatusDocMetaList2 ( reply );
if ( forcedLangId )
m_langIdValid = false;
if ( forcedSiteNumInlinks ) {
m_siteNumInlinksValid = false;
}
return mbuf;
}
// . the spider status doc
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
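// . the status doc hashed below is a small json blob roughly like this
//   (values illustrative):
//   { "type":"status", "gbssUrl":"http://example.com/page",
//     "gbssStatusCode":0, "gbssStatusMsg":"Success",
//     "gbssDomain":"example.com", "gbssSubdomain":"www.example.com",
//     "gbssHopCount":1, "gbssSpiderTime":1400000000, ... }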
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
setStatus ( "making spider reply meta list");
// . we also need a unique docid for indexing the spider *reply*
// as a separate document
// . use the same url, but use a different docid.
// . use now to mix it up
//int32_t now = getTimeGlobal();
//int64_t h = hash64(m_docId, now );
// to keep qa test consistent this docid should be consistent
// so base it on spidertime of parent doc.
// if doc is being force deleted then this is invalid!
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int64_t h = hash64(m_docId, m_spideredTime );
// mask it out
int64_t d = h & DOCID_MASK;
// try to get an available docid, preferring "d" if available
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
m_addedStatusDocId = *uqd;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
int32_t *ufn = &tmpVal;
// prevent a core if sreq is not valid, these will freak out
// diffbot replies may not have a valid m_sreq
if ( m_sreqValid ) {
priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 )
return (SafeBuf *)priority;
ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 )
return (SafeBuf *)ufn;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
Json *jp1 = NULL;
// i've seen ptr_utf8Content NULL and content type as html for
// some reason when deleting a diffbot object doc so check for that
// here and forget it. we don't want getParsedJson() to core.
if ( m_isDiffbotJSONObject &&
m_contentType == CT_JSON &&
m_contentTypeValid ) {
jp1 = getParsedJson();
if ( ! jp1 || jp1 == (void *)-1) return (SafeBuf *)jp1;
}
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
// sanity
if ( *uqd <= 0 || *uqd > MAX_DOCID ) {
log("xmldoc: avail docid = %"INT64". could not index spider "
"reply or %s",*uqd,m_firstUrl.m_url);
//char *xx=NULL;*xx=0; }
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// the old doc
XmlDoc *od = NULL;
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;
Url *fu = &m_firstUrl;
// . make a little json doc that we'll hash up
// . only index the fields in this doc, no extra gbdocid: inurl:
// hash terms
SafeBuf jd;
jd.safePrintf("{\n");
// so type:status query works
jd.safePrintf("\"type\":\"status\",\n");
jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );
if ( ptr_redirUrl )
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);
// do not index gbssIsSeedUrl:0 because there will be too many usually
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( isSeed )
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
if ( od )
jd.safePrintf("\"gbssWasIndexed\":1,\n");
else
jd.safePrintf("\"gbssWasIndexed\":0,\n");
int32_t now = getTimeGlobal();
if ( od )
jd.safePrintf("\"gbssAgeInIndex\":"
"%"UINT32",\n",now - od->m_spideredTime);
if ( m_isDiffbotJSONObject ) { // && cr->m_isCustomCrawl
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
JsonItem *jsonItem = NULL;
if ( jp1 ) jsonItem = jp1->getItem("diffbotUri");
if ( jsonItem ) {
jd.safePrintf("\"gbssDiffbotUri\":\"");
int32_t vlen;
char *val = jsonItem->getValueAsString( &vlen );
if ( val ) jd.safeMemcpy ( val , vlen );
jd.safePrintf("\",\n");
}
else
jd.safePrintf("\"gbssDiffbotUri\":"
"\"none\",\n");
// show the type as gbssDiffbotType:"article" etc.
JsonItem *dti = NULL;
if ( jp1 )
dti = jp1->getItem("type");
if ( dti ) {
jd.safePrintf("\"gbssDiffbotType\":\"");
int32_t vlen;
char *val = dti->getValueAsString( &vlen );
if ( val ) jd.jsonEncode ( val , vlen );
jd.safePrintf("\",\n");
}
}
else { // if ( cr->m_isCustomCrawl ) {
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
}
jd.safePrintf("\"gbssDomain\":\"");
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssSubdomain\":\"");
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_parentDocPtr && m_isChildDoc && m_parentDocPtr->m_docIdValid )
jd.safePrintf("\"gbssParentDocId\":%"INT64",\n",
m_parentDocPtr->m_docId);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);
// crawlbot round
if ( cr->m_isCustomCrawl )
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
cr->m_spiderRoundNum);
// for -diffbotxyz fake docs addedtime is 0
if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
// in Spider.cpp we try to set m_sreq's m_addedTime to the
// min of all the spider requests, and we try to ensure
// that in the case of deduping we preserve the one with
// the oldest time. no, now we actually use
// m_discoveryTime since we were using m_addedTime in
// the url filters as it was originally intended.
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
m_sreq.m_discoveryTime);
}
if ( m_isDupValid && m_isDup )
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
m_docIdWeAreADupOf);
// how many spiderings were successful vs. failed
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( m_sreqValid ) {
// jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
// m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
// jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
// m_sreq.m_reservedc1);
// jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
// m_sreq.m_reservedc2);
// }
if ( m_spideredTimeValid )
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",
m_spideredTime);
else
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",0);
if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);
if ( m_contentHash32Valid )
jd.safePrintf("\"gbssContentHash32\":%"UINT32",\n",
m_contentHash32);
// so we know what hostid spidered the url. this is not the
// same hostid that will store it necessarily
jd.safePrintf("\"gbssSpideredByHostId\":%"INT32",\n",
(int32_t)g_hostdb.getMyHostId());
// which shard will store the titlerec and index terms? it
// is based on docid.
if ( m_docIdValid ) {
int32_t shardNum = getShardNumFromDocId ( m_docId );
jd.safePrintf("\"gbssStoredOnShard\":%"INT32",\n",shardNum);
}
if ( m_downloadStartTimeValid && m_downloadEndTimeValid ) {
jd.safePrintf("\"gbssDownloadStartTimeMS\":%"INT64",\n",
m_downloadStartTime);
jd.safePrintf("\"gbssDownloadEndTimeMS\":%"INT64",\n",
m_downloadEndTime);
int64_t took = m_downloadEndTime - m_downloadStartTime;
jd.safePrintf("\"gbssDownloadDurationMS\":%"INT64",\n",took);
jd.safePrintf("\"gbssDownloadStartTime\":%"UINT32",\n",
(uint32_t)(m_downloadStartTime/1000));
jd.safePrintf("\"gbssDownloadEndTime\":%"UINT32",\n",
(uint32_t)(m_downloadEndTime/1000));
}
jd.safePrintf("\"gbssUsedRobotsTxt\":%"INT32",\n",
m_useRobotsTxt);
if ( m_linksValid )
jd.safePrintf("\"gbssNumOutlinksOnPage\":%"INT32",\n",
(int32_t)m_links.getNumLinks());
//if ( m_numOutlinksAddedValid )
// crap, this is not right because we only call addOutlinksToMetaList()
// after we call this function.
// jd.safePrintf("\"gbssNumOutlinksAdded\":%"INT32",\n",
// (int32_t)m_numOutlinksAdded);
// how many download/indexing errors we've had, including this one
// if applicable.
if ( m_srepValid )
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",
m_srep.m_errCount);
else
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",0);
if ( m_ipValid )
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
else
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");
if ( m_ipEndTime ) {
int64_t took = m_ipEndTime - m_ipStartTime;
jd.safePrintf("\"gbssIpLookupTimeMS\":%"INT64",\n",took);
}
if ( m_siteNumInlinksValid ) {
jd.safePrintf("\"gbssSiteNumInlinks\":%"INT32",\n",
(int32_t)m_siteNumInlinks);
char siteRank = getSiteRank();
jd.safePrintf("\"gbssSiteRank\":%"INT32",\n",
(int32_t)siteRank);
}
jd.safePrintf("\"gbssContentInjected\":%"INT32",\n",
(int32_t)m_contentInjected);
if ( m_percentChangedValid && od )
jd.safePrintf("\"gbssPercentContentChanged\""
":%.01f,\n",
m_percentChanged);
if ( ! m_isDiffbotJSONObject )
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);
// this could be -1, careful
if ( *ufn >= 0 && ! m_isDiffbotJSONObject )
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
cr->m_regExs[*ufn].getBufStart());
// we forced the langid valid above
if ( m_langIdValid && m_contentLen )
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
getLangAbbr(m_langId));
if ( m_contentTypeValid && m_contentLen )
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
g_contentTypeStrings[m_contentType]);
if ( m_contentValid )
jd.safePrintf("\"gbssContentLen\":%"INT32",\n",
m_contentLen);
if ( m_isContentTruncatedValid )
jd.safePrintf("\"gbssIsContentTruncated\":%"INT32",\n",
(int32_t)m_isContentTruncated);
// do not show the -1 any more, just leave it out then
// to make things look prettier
if ( m_crawlDelayValid && m_crawlDelay >= 0 &&
! m_isDiffbotJSONObject )
// -1 if none?
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
(int32_t)m_crawlDelay);
// was this url ever sent to diffbot either now or at a previous
// spider time?
if ( ! m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
(int)m_sentToDiffbot);
// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
(int)m_sentToDiffbotThisTime);
}
// page must have been downloaded for this one
if ( cr->m_isCustomCrawl &&
m_utf8ContentValid &&
! m_isDiffbotJSONObject &&
m_content &&
m_contentValid &&
cr->m_diffbotPageProcessPattern.getBufStart() &&
cr->m_diffbotPageProcessPattern.getBufStart()[0] ) {
char match = doesPageContentMatchDiffbotProcessPattern();
jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n",
(int)match);
}
if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){
char *url = getFirstUrl()->getUrl();
// the crawl regex
int match = 1;
regex_t *ucr = &cr->m_ucr;
if ( ! cr->m_hasucr ) ucr = NULL;
if ( ucr && regexec(ucr,url,0,NULL,0) ) match = 0;
if ( ucr )
jd.safePrintf("\"gbssMatchesUrlCrawlRegEx\":%i,\n",
match);
// now the substring pattern
match = 1;
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
if ( ucp && ! doesStringContainPattern(url,ucp) ) match = 0;
if ( ucp )
jd.safePrintf("\"gbssMatchesUrlCrawlPattern\":%i,\n",
match);
// now process regex
match = 1;
regex_t *upr = &cr->m_upr;
if ( ! cr->m_hasupr ) upr = NULL;
if ( upr && regexec(upr,url,0,NULL,0) ) match = 0;
if ( upr )
			jd.safePrintf("\"gbssMatchesUrlProcessRegEx\":%i,\n",
match);
// now process pattern
match = 1;
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( upp && ! doesStringContainPattern(url,upp) ) match = 0;
if ( upp )
jd.safePrintf("\"gbssMatchesUrlProcessPattern\":%i,\n",
match);
}
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime &&
! m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
m_diffbotReplyError);
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
jd.jsonEncode(mstrerror(m_diffbotReplyError));
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssDiffbotReplyLen\":%"INT32",\n",
m_diffbotReply.length());
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%"INT64",\n",
took );
jd.safePrintf("\"gbssDiffbotReplyRetries\":%"INT32",\n",
m_diffbotReplyRetries );
// this is not correct at this point we haven't parsed the json
// jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%"INT32",\n",
// m_diffbotJSONCount);
}
// remove last ,\n
jd.incrementLength(-2);
// end the json spider status doc
jd.safePrintf("\n}\n");
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// add the index list for it. it returns false and sets g_errno on err
// otherwise it sets m_spiderStatusDocMetaList
if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
return NULL;
// now make the titlerec
char xdhead[2048];
// just the head of it. this is the hacky part.
XmlDoc *xd = (XmlDoc *)xdhead;
// clear it out
memset ( xdhead, 0 , 2048);
// copy stuff from THIS so the spider reply "document" has the same
// header info stuff
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
gbmemcpy ( xdhead , (char *)this , hsize );
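	// note: hsize covers only the fixed-size header (everything before
	// ptr_firstUrl); the ptr_*/size_* members are assigned by hand below
	// so this fake doc shares our url/site but uses the json above as
	// its content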
// override spider time in case we had error to be consistent
// with the actual SpiderReply record
//xd->m_spideredTime = reply->m_spideredTime;
//xd->m_spideredTimeValid = true;
// sanity
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
// this will cause the maroon box next to the search result to
// say "STATUS" similar to "PDF" "DOC" etc.
xd->m_contentType = CT_STATUS;
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
tmp.safePrintf("<title>%s</title>",
mstrerror(m_indexCode));
// if we are a dup...
if ( m_indexCode == EDOCDUP )
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
// store log output into doc
//logIt(&tmp);
//tmp.safePrintf("\n</div>");
// the content is just the title tag above
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
xd->size_firstUrl = size_firstUrl;
// serps need site, otherwise search results core
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// we can't do this the head is not big enough
// xd->m_collnum = m_collnum;
// xd->m_collnumValid = m_collnumValid;
	// use the same uh48 as our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
SafeBuf titleRecBuf;
// this should not include ptrs that are NULL when compressing
// using its m_internalFlags1
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
return NULL;
// concat titleRec to our posdb key records
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
return NULL;
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
return NULL;
// return the right val
m_dist = savedDist;
// ok, good to go, ready to add to posdb and titledb
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
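	// overview: parse the json doc we just built back into JsonItems,
	// hash each gbss* field into a temporary posdb table (tt4), then
	// serialize that table into posdb records for docid uqd using
	// addTable144() below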
// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return false;
Json jp2;
if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return false;
}
// re-set to 0
m_dist = 0;
// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = &tt4;
hi.m_desc = "json spider status object";
hi.m_useCountTable = false;
hi.m_useSections = false;
// fill up tt4. false -> do not hash without field prefixes.
hashJSONFields2 ( &tt4 , &hi , &jp2 , false );
/*
char buf[64];
int32_t bufLen;
// hash 'type:status' similar to 'type:json' etc.
hi.m_prefix = "type";
if ( ! hashString("status" , &hi ) ) return NULL;
// . hash gbstatus:0 for no error, otherwise the error code
// . this also hashes it as a number so we don't have to
// . so we can do histograms on this #
hi.m_prefix = "gbstatus";
hi.m_desc = "spider error number as string";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode );
if ( ! hashString( buf , &hi ) ) return NULL;
*/
/*
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
logf(LOG_DEBUG,"hashing indexcode=%"INT32"",m_indexCode);
bool ok = false;
if ( m_indexCode ) ok = true;
// scan the keys in tt and make sure the termid fo
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
int32_t recSize = 0;
int32_t rcount = 0;
char *p = m_spiderStatusDocMetaList.getBufStart();
char *pend =m_spiderStatusDocMetaList.getBuf();
for ( ; p < pend ; p += recSize ) {
// get rdbid, RDB_POSDB
uint8_t rdbId = *p & 0x7f;
// skip
p++;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// init this
int32_t recSize = ks;
// convert into a key128_t, the biggest possible key
//key224_t k ;
char k[MAX_KEY_BYTES];
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
//k.setMin();
gbmemcpy ( &k , p , ks );
// is it a negative key?
char neg = false;
if ( ! ( p[0] & 0x01 ) ) neg = true;
// this is now a bit in the posdb key so we can rebalance
char shardByTermId = false;
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
shardByTermId = true;
// skip it
p += ks;
// . always zero if key is negative
// . this is not the case unfortunately...
if ( neg ) {char *xx=NULL;*xx=0; }
// print dbname
if ( rdbId != RDB_POSDB ) { char *xx=NULL;*xx=0; }
// get termid et al
key144_t *k2 = (key144_t *)k;
int64_t tid = g_posdb.getTermId(k2);
log("db: tid=%"INT64"",tid);
if ( tid == 199947062354729LL ) ok = true;
//if ( m_indexCode == 0 && tid != 199947062354729LL ) {
// char *xx=NULL;*xx=0; }
}
if ( ! ok ) { char *xx=NULL;*xx=0; }
goto SKIP;
// was here....
*/
/*
// gbstatus:"tcp timed out"
hi.m_prefix = "gbstatusmsg";
hi.m_desc = "spider error msg";
if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;
//hi.m_prefix = "gbdocid";
//hi.m_desc = "docid";
//bufLen = sprintf ( buf , "%"UINT64"", *uqd ) ;
//if ( ! hashString( buf , &hi ) ) return NULL;
// . then the url. url: site: ip: etc. terms
// . do NOT hash non-fielded terms so we do not get "status"
	//   results polluting the serps => false
if ( ! hashUrl ( &tt4 , true ) ) return NULL;
// false --> do not hash the gbdoc* terms (CT_STATUS)
hashDateNumbers ( &tt4 , true );
*/
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );
// debug this shit
//SafeBuf tmpsb;
//printMetaList ( m_spiderStatusDocMetaList.getBufStart() ,
// m_spiderStatusDocMetaList.getBuf(),
// &tmpsb );
//logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
setStatus ( "hashing meta tags" );
// assume it's empty
char buf [ 32*1024 ];
int32_t bufLen = 32*1024 - 1;
buf[0] = '\0';
int32_t n = m_xml.getNumNodes();
XmlNode *nodes = m_xml.getNodes();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "custom meta tag";
	// scan all the nodes and hash each generic <meta name=...> tag
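	// e.g. <meta name="myfield" content="Bob Smith"> gets hashed below
	// with prefix "myfield", so a query like myfield:bob should match
	// this doc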
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != 68 ) continue;
// only get content for <meta name=..> not <meta http-equiv=..>
int32_t tagLen;
char *tag = m_xml.getString ( i , "name" , &tagLen );
char *tptr = tag;
char tagLower[128];
int32_t j ;
int32_t code;
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// make tag name lower case and do not allow bad chars
if ( tagLen > 126 ) tagLen = 126 ;
to_lower3_a ( tag , tagLen , tagLower );
for ( j = 0 ; j < tagLen ; j++ ) {
// bail if has unacceptable chars
if ( ! is_alnum_a ( tag[j] ) &&
tag[j] != '-' &&
tag[j] != '_' &&
tag[j] != '.' ) break;
// convert to lower
tagLower[j] = to_lower_a ( tag[j] );
}
// skip this meta if had unacceptable chars
if ( j < tagLen ) continue;
// is it recognized?
code = getFieldCode ( tag , tagLen );
// after version 45 or more, do not allow gbrss
// meta tags, because those are now reserved for us
if ( code == FIELD_GBRSS ) continue;
		// allow gbrss: fields for earlier versions though
		// (note: unreachable given the unconditional continue above)
		if ( code == FIELD_GBRSS ) code = FIELD_GENERIC;
// . do not allow reserved tag names
// . title,url,suburl,
if ( code != FIELD_GENERIC ) continue;
// this is now reserved
// do not hash keyword, keywords, description, or summary metas
// because that is done in hashRange() below based on the
// tagdb (ruleset) record
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
continue;
// . don't allow reserved names: site, url, suburl, link and ip
// . actually, the colon is included as part of those
// field names, so we really lucked out...!
// . index this converted tag name
tptr = tagLower;
// get the content
int32_t len;
char *s = m_xml.getString ( i , "content" , &len );
if ( ! s || len <= 0 ) continue;
// . ensure not too big for our buffer (keep room for a \0)
// . TODO: this is wrong, should be len+1 > bufLen,
// but can't fix w/o resetting the index (COME BACK HERE
// and see where we index meta tags besides this place!!!)
// remove those other places, except... what about keywords
// and description?
if ( len+1 >= bufLen ) {
//len = bufLen - 1;
// assume no punct to break on!
len = 0;
			// only cut off at punctuation; scan just the
			// portion that will fit in the buffer
			char *p = s;
			char *pend = s + bufLen - 1;
char *last = NULL;
int32_t size ;
for ( ; p < pend ; p += size ) {
// skip if utf8 char
size = getUtf8CharSize(*p);
// skip if 2+ bytes
if ( size > 1 ) continue;
// skip if not punct
if ( is_alnum_a(*p) ) continue;
// mark it
last = p;
}
if ( last ) len = last - s;
// this old way was faster...:
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
}
// convert html entities to their chars
len = saftenTags ( buf , bufLen , s , len );
// NULL terminate the buffer
buf[len] = '\0';
// temp null term
char c = tptr[tagLen];
tptr[tagLen] = 0;
// custom
hi.m_prefix = tptr;
// desc is NULL, prefix will be used as desc
bool status = hashString ( buf,len,&hi );
// put it back
tptr[tagLen] = c;
// bail on error, g_errno should be set
if ( ! status ) return false;
// return false with g_errno set on error
//if ( ! hashNumber ( buf , bufLen , &hi ) )
// return false;
}
return true;
}
bool XmlDoc::hashMetaData ( HashTableX *tt ) {
if ( ! ptr_metadata || !ptr_metadata[0] ) return true;
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
log("XmlDoc had error parsing json in metadata %s",
ptr_metadata);
return false;
}
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "meta data";
hi.m_useCountTable = false;
// always reset to word pos to 0 now when hashing a json field
// since it shouldn't matter because they are in a field so we
// have to search like myfield:whatever. this way we can
// augment ptr_metadata on an EDOCUNCHANGED error and
// not end up with undeleteable data in posdb. if we have
// duplicate fields in our doc and our doc is json, we could have
// some word position conflicts, which kinda sucks, but can be
	// avoided because this is HASHGROUP_INMETATAG, but should really
// be HASHGROUP_INMETADATA just to be sure.
int32_t saved = m_dist;
m_dist = 0;
hashJSONFields2 ( tt , &hi , &jp , false );
m_dist = saved;
return true;
}
// slightly greater than m_spideredTime, which is the download time.
// we use this for sorting as well, like for the widget so things
// don't really get added out of order and not show up in the top spot
// of the widget list.
int32_t XmlDoc::getIndexedTime() {
if ( m_indexedTimeValid ) return m_indexedTime;
m_indexedTime = getTimeGlobal();
return m_indexedTime;
}
// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
	// bail if the spider time was never set
if ( ! m_spideredTimeValid ) return true;
int32_t indexedTime = getIndexedTime();
// first the last spidered date
HashInfo hi;
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
hi.m_tt = tt;
hi.m_desc = "last spidered date";
hi.m_prefix = "gbspiderdate";
char buf[64];
int32_t bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// and index time is >= spider time, so you want to sort by that for
// the widget for instance
hi.m_desc = "last indexed date";
hi.m_prefix = "gbindexdate";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
//if ( isStatusDoc == CT_STATUS ) return true;
//if ( isStatusDoc ) return true;
	// CT_STATUS spider status "documents" also index gbspiderdate
	// above, so additionally index gbdocspiderdate, which only real
	// documents get; that way a gbsortby:gbdocspiderdate query returns
	// real DOCUMENTS and not the spider status "documents"
hi.m_desc = "doc last spidered date";
hi.m_prefix = "gbdocspiderdate";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
hi.m_desc = "doc last indexed date";
hi.m_prefix = "gbdocindexdate";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// all done
return true;
}
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
setStatus ( "hashing meta zip" );
// . set the score based on quality
// . scores are multiplied by 256 to preserve fractions for adding
uint32_t score = *getSiteNumInlinks8() * 256 ;
if ( score <= 0 ) score = 1;
	// search for the zipcode meta tag
char buf [ 32 ];
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
char *p = buf;
char *pend = buf + bufLen ;
if ( bufLen <= 0 ) return true;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
//hi.m_prefix = "zipcode";
hi.m_prefix = "gbzipcode";
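	// scan the meta content for every run of exactly 5 digits and hash
	// each one (plus its prefixes, see below) as a zip code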
nextZip:
// . parse out the zip codes, may be multiple ones
// . skip non-digits
while ( p < pend && ! is_digit(*p) ) p++;
// skip if no digits
if ( p == pend ) return true;
// need at least 5 consecutive digits
if ( p + 5 > pend ) return true;
// if not a zip code, skip it
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
	// do we have too many consecutive digits?
if ( p + 5 != pend && is_digit(p[5]) ) {
// if so skip this whole string of digits
p += 5; while ( p < pend && is_digit(*p) ) p++;
goto nextZip;
}
// 90210 --> 90 902 9021 90210
for ( int32_t i = 0 ; i <= 3 ; i++ )
// use prefix as description
if ( ! hashString ( p,5-i,&hi ) ) return false;
p += 5;
goto nextZip;
}
// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
uint8_t ctype = *getContentType();
char *s = NULL;
setStatus ( "hashing content type" );
// hash numerically so we can do gbfacetint:type on it
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "type";
char tmp[6];
sprintf(tmp,"%"UINT32"",(uint32_t)ctype);
if ( ! hashString (tmp,gbstrlen(tmp),&hi ) ) return false;
// these ctypes are defined in HttpMime.h
switch (ctype) {
case CT_HTML: s = "html"; break;
case CT_TEXT: s = "text"; break;
case CT_XML : s = "xml" ; break;
case CT_PDF : s = "pdf" ; break;
case CT_DOC : s = "doc" ; break;
case CT_XLS : s = "xls" ; break;
case CT_PPT : s = "ppt" ; break;
case CT_PS : s = "ps" ; break;
// for diffbot. so we can limit search to json objects
// in Diffbot.cpp
case CT_JSON: s = "json" ; break;
}
// bail if unrecognized content type
if ( ! s ) return true;
// hack for diffbot. do not hash type:json because diffbot uses
// that for searching diffbot json objects
if ( cr->m_isCustomCrawl && ctype==CT_JSON && !m_isDiffbotJSONObject )
return true;
// . now hash it
// . use a score of 1 for all
// . TODO: ensure doc counting works ok with this when it does
// it's interpolation
return hashString (s,gbstrlen(s),&hi );
}
// . hash the link: terms
// . ensure that more useful linkers are scored higher
// . useful for computing offsite link text for qdb-ish algorithm
// . NOTE: for now i do not hash links to the same domain in order to
// hopefully save 10%-25% index space
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
// different site links with no link text will be ranked behind them
// . the 8-bit bitmap of the score of a link: term:
// . 00ubdcss u = link is Unbanned? b = link isBanned?
// d = link dirty? c = link clean?
// s = 01 if no link text, 10 if link text
// . NOTE: this is used in Msg18.cpp for extraction
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
// so i moved the bits down
bool XmlDoc::hashLinks ( HashTableX *tt ) {
setStatus ( "hashing links" );
	// shortcuts
bool isRSSFeed = *getIsRSS();
Url *cu = getCurrentUrl() ;
Url *ru = *getRedirUrl() ;
char dbuf[8*4*1024];
HashTableX dedup;
dedup.set( 8,0,1024,dbuf,8*4*1024,false,m_niceness,"hldt");
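	// dedup table keyed on 64-bit hashes of each normalized link url
	// and host, so the link: and sitelink: terms below get hashed at
	// most once per unique url/host in this doc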
// see ../url/Url2.cpp for hashAsLink() algorithm
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
		// skip links with zero length
if ( m_links.m_linkLens[i] == 0 ) continue;
// . skip if we are rss page and this link is an <a href> link
// . we only harvest/index <link> urls from rss feeds
// . or in the case of feedburner, those orig tags
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) )
continue;
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( m_links.m_isFeedBurner &&
!(m_links.m_linkFlags[i] & LF_FBTAG) )
continue;
// normalize the link
Url link;
// now we always add "www" to these links so that any link
// to cnn.com is same as link to www.cnn.com, because either
// we index cnn.com or www.cnn.com but not both providing
// their content is identical (deduping). This way whichever
// one we index, we can take advantage of all link text whether
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
// Since this is just for hashing, it shouldn't matter that
// www.tmblr.co has no IP whereas only tmblr.co does.
link.set ( m_links.m_linkPtrs[i] ,
m_links.m_linkLens[i] ,
true , // addWWW?
m_links.m_stripIds ,
false , // stripPound?
false , // stripCommonFile?
m_version );// used for new session id stripping
// breathe
QUICKPOLL(m_niceness);
// . the score depends on some factors:
// . NOTE: these are no longer valid! (see score bitmap above)
// . 4 --> if link has different domain AND has link text
// . 3 --> if link has same domain AND has link text
// . 2 --> if link has different domain AND no link text
		// . 1 --> if link has same domain AND no link text
// . is domain the same as ours?
// . NOTE: ideally, using the IP domain would be better, but
// we do not know the ip of the linker right now... so scores
// may be topped with a bunch of same-ip domain links so that
// we may not get as much link text as we'd like, since we
// only sample from one link text per ip domain
// . now we also just use the mid domain! (excludes TLD)
bool internal = false;
int32_t mdlen = cu->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(cu->getMidDomain(),link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
// also check the redir url
if ( ru ) {
mdlen = ru->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(ru->getMidDomain(),
link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
}
// now make the score
//unsigned char score ;
// . TODO: consider not hashing link w/o text!
// . otherwise, give it a higher score if it's got link TEXT
//bool gotLinkText = m_links.hasLinkText ( i, m_version );
// otherwise, beginning with version 21, allow internal links,
// but with lower scores
// score
// internal, no link text: 2
// internal, w/ link text: 4
// external, no link text: 6
// external, w/ link text: 8
//if ( internal ) {
// if ( ! gotLinkText ) score = 0x02;
// else score = 0x04;
//}
//else {
// if ( ! gotLinkText ) score = 0x06;
// else score = 0x08;
//}
// dedup this crap
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "link";
// hash link:<url>
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi ))
return false;
h = hash64 ( link.getHost() , link.getHostLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// fix parm
hi.m_prefix = "sitelink";
// hash sitelink:<urlHost>
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi))
return false;
// breathe
QUICKPOLL(m_niceness);
}
// skip this for now
return true;
/*
setStatus ("hashing gbhasbannedoutlink" );
// only lets a domain vote once
int32_t numBannedOutlinks = *getNumBannedOutlinks();
//if ( numBannedOutlinks <= 0 ) return true;
// a score of 235 seems to give a negative return for score8to32()
uint32_t score = score8to32 ( numBannedOutlinks );
// make score at least 1!
if ( score <= 0 ) score = 1;
// a hack fix
if ( score > 0x7fffffff ) score = 0x7fffffff;
// set up the hashing parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbhasbannedoutlink";
// hash this special thing to help us de-spam the index
if ( numBannedOutlinks > 0 ) return hashString ("1",1,&hi );
else return hashString ("0",1,&hi );
*/
}
// . returns false and sets g_errno on error
// . hash for linkdb
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
// sanity check
if ( dt->m_ks != sizeof(key224_t) ) { char *xx=NULL;*xx=0; }
if ( dt->m_ds != 0 ) { char *xx=NULL;*xx=0; }
// this will be different with our new site definitions
uint32_t linkerSiteHash32 = *getSiteHash32();
char siteRank = getSiteRank();
if ( ! m_linksValid ) { char *xx=NULL;*xx=0; }
// we need to store this in the title rec for re-building
// the meta list from the title rec...
// is this just site info?
//TagRec ***pgrv = getOutlinkTagRecVector();
//if ( ! pgrv || pgrv == (void *)-1 ) { char *xx=NULL;*xx=0; }
//TagRec **grv = *pgrv;
int32_t *linkSiteHashes = getLinkSiteHashes();
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ){
char *xx=NULL;*xx=0;}
// convert siteNumInlinks into a score
//int32_t numSiteInlinks = *xd->getSiteNumInlinks();
unsigned char hopCount = *getHopCount();
// use spidered time! might not be current time! like if rebuilding
// or injecting from a past spider time
int32_t discoveryDate = getSpideredTime();//TimeGlobal();
int32_t lostDate = 0;
// add in new links
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
// give up control
QUICKPOLL ( m_niceness );
// skip if empty
if ( m_links.m_linkLens[i] == 0 ) continue;
// . skip if spam, ALWAYS allow internal outlinks though!!
// . CAUTION: now we must version islinkspam()
bool spam = m_links.isLinkSpam(i) ;
// or if it has no link text, skip it
//if ( ! links->hasLinkText(i,TITLEREC_CURRENT_VERSION) )
//continue;
// get site of outlink from tagrec if in there
int32_t linkeeSiteHash32 = linkSiteHashes[i];
/*
TagRec *gr = grv[i];
char *site = NULL;
int32_t siteLen = 0;
if ( gr ) {
int32_t dataSize = 0;
site = gr->getString("site",NULL,&dataSize);
if ( dataSize ) siteLen = dataSize - 1;
}
// otherwise, make it the host or make it cut off at
// a "/user/" or "/~xxxx" or whatever path component
if ( ! site ) {
// GUESS link site... TODO: augment for /~xxx
char *s = m_links.getLink(i);
//int32_t slen = m_links.getLinkLen(i);
//siteLen = slen;
site = ::getHost ( s , &siteLen );
}
uint32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
*/
//
// when setting the links class it should set the site hash
//
// set this key, it is the entire record
key224_t k;
k = g_linkdb.makeKey_uk ( linkeeSiteHash32 ,
m_links.getLinkHash64(i) ,
spam , // link spam?
siteRank , // was quality
hopCount,
*getIp() ,
*getDocId() ,
discoveryDate ,
lostDate ,
false , // new add?
linkerSiteHash32 ,
false );// delete?
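		// so each outlink becomes one linkdb key encoding the
		// linkee's site/url hash plus our (the linker's) ip, docid,
		// siterank, hopcount and discovery date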
/*
// debug
if ( m_links.getLinkHash64(i) != 0x3df1c439a364e18dLL )
continue;
//char c = site[siteLen];
//site[siteLen]=0;
//char tmp[1024];
//sprintf(tmp,"xmldoc: hashinglink site=%s sitelen=%"INT32" ",
// site,siteLen);
//site[siteLen] = c;
log(//"%s "
"url=%s "
"linkeesitehash32=0x%08"XINT32" "
"linkersitehash32=0x%08"XINT32" "
"urlhash64=0x%16llx "
"docid=%"INT64" k=%s",
//tmp,
m_links.getLink(i),
(int32_t)linkeeSiteHash32,
linkerSiteHash32,
m_links.getLinkHash64(i),
*getDocId(),
KEYSTR(&k,sizeof(key224_t))
);
*/
// store in hash table
if ( ! dt->addKey ( &k , NULL ) ) return false;
}
return true;
}
bool XmlDoc::getUseTimeAxis ( ) {
if ( m_useTimeAxisValid )
return m_useTimeAxis;
if ( m_setFromTitleRec )
// return from titlerec header
return m_useTimeAxis;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) return false;
m_useTimeAxis = cr->m_useTimeAxis;
m_useTimeAxisValid = true;
// sanity check
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
// log("build: custom crawls can't use time axis");
// char *xx=NULL;*xx=0;
// m_useTimeAxis = false;
// }
return m_useTimeAxis;
}
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {
setStatus ( "hashing url colon" );
// get the first url
Url *fu = getFirstUrl();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// we do not need diversity bits for this
hi.m_useCountTable = false;
//
// HASH url: term
//
// append a "www." for doing url: searches
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
hi.m_prefix = "url";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "url2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
hi.m_prefix = "gbtimeurl";
SafeBuf *tau = getTimeAxisUrl();
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
}
// use hash of url as score so we can get a # of docs per site est.
//uint16_t score = hash16 ( fu->getUrl() , fu->getUrlLen() );
setStatus ( "hashing inurl colon" );
//
// HASH inurl: terms
//
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
hi.m_prefix = "inurl";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
if ( ! hashString ( s,slen, &hi ) ) return false;
setStatus ( "hashing ip colon" );
//
// HASH ip:a.b.c.d
//
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
// copy it to save it
char ipbuf[64];
int32_t iplen = sprintf(ipbuf,"%s",iptoa(m_ip));
//char *tmp = iptoa ( m_ip );
//int32_t tlen = gbstrlen(tmp);
hi.m_prefix = "ip";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ip2";
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
//
// HASH ip:a.b.c
//
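	// e.g. if m_ip printed as "1.2.3.4" above, this also hashes
	// ip:1.2.3 so a query can cover the whole 1.2.3.* range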
char *end1 = ipbuf + iplen - 1;
while ( *end1 != '.' ) end1--;
if ( ! hashSingleTerm(ipbuf,end1-ipbuf,&hi) ) return false;
// . sanity check
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// get the boost
//floatboost1=(float)getBoostFromSiteNumInlinks(m_siteNumInlinks)/100.0
//
// HASH the url path plain as if in body
//
// get number of components in the path. does not include the filename
int32_t pathDepth = fu->getPathDepth(false);
// make it a density thing
//pathScore /= ( pathDepth + 1 );
// ensure score positive
//if ( pathScore <= 0 ) pathScore = 1;
// get it
char *path = fu->getPath();
int32_t plen = fu->getPathLen();
/*
// update it
float boost2 = (float)m_urlPathWeight / 100;
// again
float boost3 = 1.0 / ((float)pathDepth + 1.0) ;
// make a description
char tmp3[190];
sprintf( tmp3 ,
"path score = "
"siteInlinksBoost * "
"urlPathWeight * "
"pathDepthBoost * "
"256 = %.02f * %.02f * %.02f * 256 " ,
boost1 ,
boost2 ,
boost3 );
*/
//int32_t pathScore = (int32_t) (256.0 * boost1 * boost2 * boost3);
// update parms
//hi.m_desc = tmp3;
hi.m_prefix = NULL;
hi.m_desc = "url path";
hi.m_hashGroup = HASHGROUP_INURL;
// if parm "index article content only" is true, do not index this!
//if ( m_eliminateMenus ) skipIndex=true;
setStatus ( "hashing gbpathdepth");
//
// HASH gbpathdepth:X
//
// xyz.com/foo --> 0
// xyz.com/foo/ --> 1
// xyz.com/foo/boo --> 1
// xyz.com/foo/boo/ --> 2
char buf[20];
int32_t blen = sprintf(buf,"%"INT32"",pathDepth);
// update parms
hi.m_prefix = "gbpathdepth";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
//
// HASH gbhopcount:X
//
setStatus ( "hashing gbhopcount");
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount);
// update parms
hi.m_prefix = "gbhopcount";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
hi.m_hashGroup = HASHGROUP_INTAG;
	// hash gbhopcount:X
if ( ! hashString ( buf,blen,&hi) ) return false;
setStatus ( "hashing gbhasfilename");
//
// HASH gbhasfilename:0 or :1
//
char *hm;
if ( fu->getFilenameLen() ) hm = "1";
else hm = "0";
// update parms
hi.m_prefix = "gbhasfilename";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// hash gbhasfilename:[0|1]
if ( ! hashString ( hm,1,&hi) ) return false;
setStatus ( "hashing gbiscgi");
//
// HASH gbiscgi:0 or gbiscgi:1
//
if ( fu->isCgi() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbiscgi";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
if ( ! hashString ( hm,1,&hi) ) return false;
setStatus ( "hashing gbext");
//
// HASH gbhasext:0 or gbhasext:1 (does it have a fileextension)
//
// . xyz.com/foo --> gbhasext:0
// . xyz.com/foo.xxx --> gbhasext:1
if ( fu->getExtensionLen() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbhasext";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
if ( ! hashString ( hm,1,&hi) ) return false;
//
// HASH the url's mid domain and host as they were in the body
//
setStatus ( "hashing site colon terms");
//
// HASH the site: terms
//
// . hash the pieces of the site
// . http://host.domain.com/~harry/level1/ should hash to:
// . site:host.domain.com/~harry/level1/
// . site:host.domain.com/~harry/
// . site:host.domain.com/~
// . site:host.domain.com/
// . site:domain.com/~harry/level1/
// . site:domain.com/~harry/
// . site:domain.com/~
// . site:domain.com/
// ensure score is positive
//if ( siteScore <= 0 ) siteScore = 1;
// get the hostname (later we set to domain name)
char *name = fu->getHost();
int32_t nameLen = fu->getHostLen();
// . point to the end of the whole thing, including port field
// . add in port, if non default
char *end3 = name + fu->getHostLen() + fu->getPortLen();
loop:
// now loop through the sub paths of this url's path
for ( int32_t i = 0 ; ; i++ ) {
// get the subpath
int32_t len = fu->getSubPathLen(i);
// FIX: always include first /
if ( len == 0 ) len = 1;
// write http://www.whatever.com/path into buf
char buf[MAX_URL_LEN+10];
char *p = buf;
gbmemcpy ( p , "http://" , 7 ); p += 7;
gbmemcpy ( p , name , nameLen ); p += nameLen;
gbmemcpy ( p , fu->getPath() , len ); p += len;
*p = '\0';
// update hash parms
hi.m_prefix = "site";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "site2";
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
// break when we hash the root path
if ( len <=1 ) break;
}
// now keep moving the period over in the hostname
while ( name < end3 && *name != '.' ) { name++; nameLen--; }
// skip the '.'
name++; nameLen--;
	// if there is more hostname left, hash the next smaller host/domain
if ( name < end3 ) goto loop;
setStatus ( "hashing ext colon");
//
// HASH ext: term
//
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
char *ext = fu->getExtension();
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ext2";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
char buf2[32];
sprintf(buf2,"%"UINT64"",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
// if indexing a json diffbot object, index
// gbparenturl:xxxx of the original url from which the json was
// datamined. we use this so we can act as a diffbot json cache.
if ( m_isDiffbotJSONObject ) {
setStatus ( "hashing gbparenturl term");
char *p = fu->getUrl() + fu->getUrlLen() - 1;
// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
for ( ; *p && *p != '-' ; p-- );
// set up the hashing parms
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_desc = "diffbot parent url";
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}
//if ( isStatusDoc ) return true;
setStatus ( "hashing SiteGetter terms");
//
// HASH terms for SiteGetter.cpp
//
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->m_plen <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
char *host = fu->getHost ();
int32_t hlen = fu->getHostLen ();
// tags from here out
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_shardByTermId = true;
// hash it
if ( add ) {
// remove the last path component
char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
hi.m_shardByTermId = false;
setStatus ( "hashing urlhashdiv10 etc");
//
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
//
// this is for proving how many docs are in the index
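	// (presumably each urlhashdiv10:/urlhashdiv100: value buckets 10 or
	// 100 adjacent url hashes, so counting the matches for a few sample
	// values lets us extrapolate an estimate of the total index size)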
uint32_t h = hash32 ( s , slen );
blen = sprintf(buf,"%"UINT32"",h);
hi.m_prefix = "urlhash";
if ( ! hashString(buf,blen,&hi) ) return false;
blen = sprintf(buf,"%"UINT32"",h/10);
// update hashing parms
hi.m_prefix = "urlhashdiv10";
if ( ! hashString(buf,blen,&hi) ) return false;
blen = sprintf(buf,"%"UINT32"",h/100);
// update hashing parms
hi.m_prefix = "urlhashdiv100";
if ( ! hashString(buf,blen,&hi) ) return false;
setStatus ( "hashing url mid domain");
// the final score
//int32_t plainScore = (int32_t)(256.0 * boost1 * boost2 * fw);
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";//tmp3;
hi.m_hashGroup = HASHGROUP_INURL;
// if parm "index article content only" is true, do not index this!
//if ( m_eliminateMenus ) plainScore = 0;
//char *mid = fu->getMidDomain ();
//int32_t mlen = fu->getMidDomainLen();
//hi.m_desc = "url mid dom";
//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
//hi.m_desc = "url host";
if ( ! hashString ( host,hlen,&hi)) return false;
setStatus ( "hashing url path");
// hash the path plain
if ( ! hashString (path,plen,&hi) ) return false;
return true;
}
/////////////
//
// CHROME DETECTION
//
// we search for these terms we hash here in getSectionsWithDupStats()
// so we can remove chrome.
//
/////////////
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashSections ( HashTableX *tt ) {
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentType == CT_HTML ) return true;
setStatus ( "hashing sections" );
if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
Sections *ss = &m_sections;
int32_t siteHash32 = *getSiteHash32();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// the prefix is custom set for each section below
//hi.m_prefix = "gbsectionhash";
// put all guys with the same xpath/site on the same shard
hi.m_shardByTermId = true;
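	// each term is gbxpathsitehash<H> where H mixes the section's xpath
	// hash with the site hash; the facet value is a 32-bit hash of the
	// sentences under that xpath, so boilerplate repeated across a
	// site's pages produces identical values and can be detected as
	// chrome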
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// . skip if empty
// . this needs to be like 48 bits because 32 bits is not
		//   big enough!
//uint64_t ih64 = si->m_sentenceContentHash64;
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 )
continue;
// the termid is now the xpath and the sitehash, the "value"
// will be the hash of the innerhtml, m_sentenceContentHash64
uint64_t thash64 = (uint32_t)si->m_turkTagHash32;
// combine with site hash
thash64 ^= (uint32_t)siteHash32;
// this is a special hack we need to make it the
// hash of the inner html
//hi.m_sentHash32 = (uint32_t)ih64;
// . get section xpath & site hash
// . now if user does a gbfacets:gbxpathsitehashxxxxxx query
// he will get back a histogram of the values it hash,
// which are 32-bit hashes of the innerhtml for that
// xpath on this site.
char prefix[96];
sprintf(prefix,"gbxpathsitehash%"UINT64"",thash64);
// like a normal key but we store "ih64" the innerHTML hash
// of the section into the key instead of wordbits etc.
// similar to hashNumber*() functions.
//if ( ! hashSectionTerm ( term , &hi, (uint32_t)ih64 ) )
// return false;
// i guess use facets
hi.m_prefix = prefix;
// we already have the hash of the inner html of the section
hashFacet2 ( "gbfacetstr",
prefix,
//(int32_t)(uint32_t)ih64 ,
val32,
hi.m_tt ,
// shard by termId?
true );
}
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool hashAnomalies ,
bool hashNonAnomalies ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing link text" );
// . now it must have an rss item to be indexed in all its glory
// . but if it tells us it has an rss feed, toss it and wait for
// the feed.... BUT sometimes the rss feed outlink is 404!
// . NO, now we discard with ENORSS at Msg16.cpp
//if ( ! *getHasRSSItem() && m_eliminateMenus ) return true;
// sanity check
if ( hashAnomalies == hashNonAnomalies ) { char *xx = NULL; *xx =0; }
// display this note in page parser
char *note = "hashing incoming link text";
// sanity
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *info1 = getLinkInfo1 ();
LinkInfo **pinfo2 = getLinkInfo2 ();
LinkInfo *info2 = *pinfo2;
LinkInfo *linkInfo = info1;
// pick the one with the most inlinks with valid incoming link text,
// otherwise, we end up with major bias when we stop importing
// link text from another cluster, because some pages will have
// twice as many links as they should!
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
linkInfo = info2;
note = "hashing incoming link text from other cluster";
}
// sanity checks
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//
// brought the following code in from LinkInfo.cpp
//
int32_t noteLen = 0;
if ( note ) noteLen = gbstrlen ( note );
// count "external" inlinkers
int32_t ecount = 0;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_useSynonyms = true;
// hashstring should update this like a cursor.
hi.m_startDist = 0;
// loop through the link texts and hash them
for ( Inlink *k = NULL; (k = linkInfo->getNextInlink(k)) ; ) {
// is this inlinker internal?
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
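		// an inlinker sharing those 16 ip bits is treated as internal
		// and hashed below with HASHGROUP_INTERNALINLINKTEXT, which is
		// presumably weighted lower than true external inlink text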
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get score
//int64_t baseScore = k->m_baseScore;
// get the weight
//int64_t ww ;
//if ( internal ) ww = m_internalLinkTextWeight;
//else ww = m_externalLinkTextWeight;
// modify the baseScore
//int64_t final = (baseScore * ww) / 100LL;
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->getLinkText();
// sanity check
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 2 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
// if it is anomalous, set this, we don't
//if ( k->m_isAnomaly )
// hi.m_hashIffNotUnique = true;
//hi.m_baseScore = final;
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
// store the siterank of the linker in this and use that
// to set the multiplier M bits i guess
hi.m_linkerSiteRank = k->m_siteRank;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
k->m_wordPosStart = m_dist; // hi.m_startDist;
// . hash the link text into the table
// . returns false and sets g_errno on error
// . we still have the score punish from # of words though!
// . for inlink texts that are the same it should accumulate
// and use the reserved bits as a multiplier i guess...
if ( ! hashString ( txt,tlen,&hi) ) return false;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
//k->m_wordPosEnd = hi.m_startDist;
// spread it out
hi.m_startDist += 20;
}
/*
// . hash gbkeyword:numinlinks where score is # of inlinks from 1-255
// . do not hash gbkeyword:numinlinks if we don't got any
if ( ecount <= 0 ) return true;
// limit it since our score can't be more than 255 (8-bits)
//if ( ecount > 255 ) ecount = 255;
// convert our 32 bit score to 8-bits so we trick it!
//int32_t score = score8to32 ( (uint8_t)ecount );
// watch out for wrap
//if ( score < 0 ) score = 0x7fffffff;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbkeyword";
hi.m_hashGroup = HASHGROUP_INTAG;
// for terms where word position/density/diversity is irrelevant,
// we can store this value...
hi.m_fakeValue = ecount;
// hash gbkeyword:numinlinks term
if ( ! hashString ( "numinlinks",10,&hi ) )return false;
*/
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
// seems like iffUnique is off, so do this
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing neighborhoods" );
//g_tt = table;
// . now we also hash the neighborhood text of each inlink, that is,
// the text surrounding the inlink text.
// . this is also destructive in that it will remove termids that
// were not in the document being linked to in order to save
// space in the titleRec
// . now we only do one or the other, not both
LinkInfo *info1 = getLinkInfo1 ();
LinkInfo **pinfo2 = getLinkInfo2 ();
LinkInfo *info2 = *pinfo2;
LinkInfo *linkInfo = info1;
char *note = " (internal cluster)";
// pick the one with the most inlinks with valid incoming link text
// otherwise, we end up with major bias when we stop importing
// link text from another cluster, because some pages will have
// twice as many links as they should!
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
linkInfo = info2;
note = " (external cluster)";
}
// loop over all the Inlinks
Inlink *k = NULL;
loop:
// get the next inlink
k = linkInfo->getNextInlink( k );
// break if done
if ( ! k ) return true;
// skip if internal, they often have the same neighborhood text
if ( (k->m_ip&0x0000ffff)==(m_ip&0x0000ffff) ) goto loop;
// get the left and right texts and hash both
char *s = k->getSurroundingText();
if ( ! s || k->size_surroundingText <= 1 ) goto loop;
//int32_t inlinks = *getSiteNumInlinks();
// HACK: to avoid having to pass a flag to TermTable, then to
// Words::hash(), Phrases::hash(), etc. just flip a bit in the
// table to make it not add anything unless it is already in there.
tt->m_addIffNotUnique = true;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "surrounding text";
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
// . hash that
// . this returns false and sets g_errno on error
int32_t len = k->size_surroundingText - 1;
if ( ! hashString ( s, len, &hi ) ) return false;
// now turn it back off
tt->m_addIffNotUnique = false;
// get the next Inlink
goto loop;
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
setStatus ( "hashing rss info" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();
// get the xml of the first rss/atom item/entry referencing this url
Xml xml;
// . returns NULL if no item xml
// . this could also be a "channel" blurb now, so we index channel pgs
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;
if ( xml.isEmpty() )
// hash gbrss:0
return hashRSSTerm ( tt , false );
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf(
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
//}
// hash nothing if not a permalink and eliminating "menus"
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
// . IMPORTANT: you must be using the new link algo, so turn it on
// in the spider controls. this allows us to include LinkTexts from
// the same IP in our LinkInfo class in the TitleRec.
// . is it rss or atom? both use title tag, so doesn't matter
// . get the title tag
bool isHtmlEncoded;
int32_t titleLen;
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
char c = 0;
// sanity check
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
bool hashIffUnique = true;
// but if we had no content because we were an mp3 or whatever,
// do not worry about avoiding double hashing
if ( size_utf8Content <= 0 ) hashIffUnique = false;
// decode it?
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && title && titleLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
titleLen = newLen;
// NULL terminate it
c = title[titleLen];
title[titleLen] = '\0';
}
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_TITLE;
hi.m_desc = "rss title";
// . hash the rss title
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
bool status = hashString ( title,titleLen,&hi ) ;
// pop the end back just in case
if ( c ) title[titleLen] = c;
// return false with g_errno set on error
if ( ! status ) return false;
// get the rss description
int32_t descLen;
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );
// for advanced hashing
Xml xml2;
Words w;
//Scores scores;
Words *wordsPtr = NULL;
//Scores *scoresPtr = NULL;
c = 0;
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && desc && descLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
descLen = newLen;
}
// NULL terminate it
if ( desc ) {
c = desc[descLen];
desc[descLen] = '\0';
// set the xml class from the decoded html
if ( ! xml2.set ( desc ,
descLen ,
false , // own data?
0 , // allocSize
false , // pure xml?
m_version ,
true , // set parents?
m_niceness ,
*ct ) )
return false;
// set the words class from the xml, returns false and sets
// g_errno on error
if ( ! w.set ( &xml2 ,
true , // compute Ids
true ))// has html ents? (WERE encoded twice!)
return false;
// pass it in to TermTable::hash() below
wordsPtr = &w;
}
// update hash parms
hi.m_tt = tt;
hi.m_desc = "rss body";
hi.m_hashGroup = HASHGROUP_BODY;
// . hash the rss/atom description
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
status = hashString ( desc, descLen, &hi );
// pop the end back just in case
if ( c ) desc[descLen] = c;
// return false with g_errno set
if ( ! status ) return false;
// hash gbrss:1
if ( ! hashRSSTerm ( tt , true ) ) return false;
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
// "</b><br><br>");
//}
return true;
}
bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
// hash gbinrss:0 or gbinrss:1 (and gbisrss:0|1 below)
char *value;
if ( inRSS ) value = "1";
else value = "0";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbinrss";
hi.m_hashGroup = HASHGROUP_INTAG;
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi ) ) return false;
// hash gbisrss:1 if we are an rss page ourselves
if ( *getIsRSS() ) value = "1";
else value = "0";
// update hash parms
hi.m_prefix = "gbisrss";
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
// . the new Weights class hashes title as part of body now with a high weight
// given by "titleWeight" parm
bool XmlDoc::hashTitle ( HashTableX *tt ) {
// sanity check
if ( m_hashedTitle ) { char *xx=NULL ; *xx=0; }
setStatus ( "hashing title" );
// this has been called, note it
m_hashedTitle = true;
nodeid_t *tids = m_words.m_tagIds;
int32_t nw = m_words.m_numWords;
// find the first <title> tag in the doc
int32_t i ;
for ( i = 0 ; i < nw ; i++ )
if ( tids[i] == TAG_TITLE ) break;
// return true if no title
if ( i >= nw ) return true;
// skip tag
i++;
// mark it as start of title
int32_t a = i;
// limit end
int32_t max = i + 40;
if ( max > nw ) max = nw;
// find end of title, either another <title> tag or the closing </title> tag
for ( ; i < max ; i++ )
if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break;
// empty title? (a title tag immediately follows the opening <title>)
if ( i == a ) return true;
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "title";
hi.m_useSynonyms= true;
// the new posdb info
hi.m_hashGroup = HASHGROUP_TITLE;
// . hash it up! use 0 for the date
// . use XmlDoc::hashWords()
// . use "title" as both prefix and description
//if ( ! hashWords (a,i,&hi ) ) return false;
char **wptrs = m_words.getWords();
int32_t *wlens = m_words.getWordLens();
char *title = wptrs[a];
char *titleEnd = wptrs[i-1] + wlens[i-1];
int32_t titleLen = titleEnd - title;
if ( ! hashString ( title, titleLen, &hi) ) return false;
// now hash it again without the title: prefix so plain, unprefixed
// query terms can also match words in the title
hi.m_prefix = NULL;
if ( ! hashString ( title, titleLen, &hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing body" );
// if more than X% of words are spammed to some degree, index all
// words with a minimum score
//int64_t x[] = {30,40,50,70,90};
//int64_t y[] = {6,8,10,20,30};
//int32_t mp = getY ( *getSiteNumInlinks8() , x , y , 5 );
//int32_t nw = m_words.getNumWords();
// record this
m_bodyStartPos = m_dist;
m_bodyStartPosValid = true;
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "body";
hi.m_useSynonyms= true;
hi.m_hashGroup = HASHGROUP_BODY;
// use NULL for the prefix
return hashWords (&hi );
}
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta keywords" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "meta keywords";
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mk , mklen , &hi);
}
// . hash the meta summary, description and keyword tags
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
// sanity check
if ( m_hashedMetas ) { char *xx=NULL ; *xx=0; }
// this has been called, note it
m_hashedMetas = true;
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta summary" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
int32_t mslen;
char *ms = getMetaSummary ( &mslen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INMETATAG;
// update hashing parms
hi.m_desc = "meta summary";
// hash it
if ( ! hashString ( ms , mslen , &hi )) return false;
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
int32_t mdlen;
char *md = getMetaDescription ( &mdlen );
// update hashing parms
hi.m_desc = "meta desc";
// . TODO: only hash if unique????? set a flag on ht then i guess
if ( ! hashString ( md , mdlen , &hi ) ) return false;
return true;
}
//bool XmlDoc::linksToGigablast ( ) {
// // check m_links for a link to gigablast.com or www.gigablast.com
// return m_links.linksToGigablast();
//}
bool XmlDoc::searchboxToGigablast ( ) {
// . they may have a form variable like
// . <form method=get action=http://www.gigablast.com/cgi/0.cgi name=f>
return m_xml.hasGigablastForm();
}
// . bring back support for dmoz integration
// . when clicking on a "search within this category" it does a gbpdcat:<catid>
// search to capture all pages that have that dmoz category as one of their
// parent topics
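// . note: the term prefixes actually hashed below are gbcatid and
//   gbpcatid (plus gbicatid/gbipcatid for the indirect category ids)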
bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
getDmozTitles();
char *titlePtr = ptr_dmozTitles;
char *sumPtr = ptr_dmozSumms;
//char *anchPtr = ptr_dmozAnchors;
char buf[128];
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
int32_t *catIds = (int32_t *)ptr_catIds;
int32_t numCatIds = size_catIds / 4;
// go through the catIds and hash them
for (int32_t i = 0; i < numCatIds; i++) {
// write the catid as a string
sprintf(buf, "%"UINT32"", (uint32_t)catIds[i]);
// term prefix for hashing
hi.m_prefix = "gbcatid";
// hash it
hashString ( buf , gbstrlen(buf) , &hi );
// we also want to hash the parents
int32_t currCatId = catIds[i];
int32_t currParentId = catIds[i];
int32_t currCatIndex;
// loop to the Top, Top = 1
while ( currCatId > 1 ) {
// hash the parent
sprintf(buf, "%"UINT32"", (uint32_t)currParentId);
hi.m_prefix = "gbpcatid";
hashString ( buf , gbstrlen(buf), &hi );
// next cat
currCatId = currParentId;
// get the index for this cat
currCatIndex = g_categories->getIndexFromId(currCatId);
if ( currCatIndex <= 0 ) break;
// get the parent for this cat
currParentId =
g_categories->m_cats[currCatIndex].m_parentid;
}
// do not hash titles or summaries if "index article content
// only" parm is on
//if ( tr->eliminateMenus() ) continue;
// hash dmoz title
hi.m_prefix = NULL;
// call this DMOZ title as regular title i guess
hi.m_hashGroup = HASHGROUP_TITLE;
// hash the DMOZ title
hashString ( titlePtr , gbstrlen(titlePtr), &hi );
// next title
titlePtr += gbstrlen(titlePtr) + 1;
// hash DMOZ summary
hi.m_prefix = NULL;
// call this DMOZ summary as body i guess
hi.m_hashGroup = HASHGROUP_BODY;
// hash the DMOZ summary
hashString ( sumPtr , gbstrlen(sumPtr), &hi );
// next summary
sumPtr += gbstrlen(sumPtr) + 1;
}
int32_t numIndCatIds = size_indCatIds / 4;
int32_t *indCatIds = (int32_t *)ptr_indCatIds;
// go through the INDIRECT catIds and hash them
for (int32_t i = 0 ; i < numIndCatIds; i++) {
// write the catid as a string
sprintf(buf, "%"UINT32"", (uint32_t)indCatIds[i]);
// use prefix
hi.m_prefix = "gbicatid";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash it
hashString ( buf , gbstrlen(buf), &hi );
// we also want to hash the parents
int32_t currCatId = indCatIds[i];
int32_t currParentId = indCatIds[i];
int32_t currCatIndex;
// loop to the Top, Top = 1
while (currCatId > 1) {
// hash the parent
sprintf(buf, "%"UINT32"", (uint32_t)currParentId);
// new prefix
hi.m_prefix = "gbipcatid";
// hash it
hashString ( buf , gbstrlen(buf), &hi );
// next cat
currCatId = currParentId;
// get the index for this cat
currCatIndex = g_categories->getIndexFromId(currCatId);
if ( currCatIndex <= 0 ) break;
// get the parent for this cat
currParentId =
g_categories->m_cats[currCatIndex].m_parentid;
}
}
return true;
}
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
setStatus ( "hashing language" );
int32_t langId = (int32_t)*getLangId();
char s[32]; // numeric langid
int32_t slen = sprintf(s, "%"INT32"", langId );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
if ( ! hashString ( s, slen, &hi ) ) return false;
// try lang abbreviation
sprintf(s , "%s ", getLangAbbr(langId) );
// note: slen is deliberately NOT updated here. we went back to this
// "broken way" (hashing with the stale numeric-langid length) to fix a
// parsing consistency bug, and hashLanguageString() below hashes the
// abbreviation with the correct length instead
//sprintf(s , "%s ", getLangAbbr(langId) );
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
setStatus ( "hashing language string" );
int32_t langId = (int32_t)*getLangId();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
// try lang abbreviation
char s[32];
int32_t slen = sprintf(s , "%s ", getLangAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashCountry ( HashTableX *tt ) {
setStatus ( "hashing country" );
//uint16_t *cids = getCountryIds();
//if ( ! cids ) return true;
//if ( cids == (uint16_t *)-1 ) return false;
uint16_t *cid = getCountryId();
if ( ! cid || cid == (uint16_t *)-1 ) return false;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcountry";
for ( int32_t i = 0 ; i < 1 ; i++ ) {
// get the ith country id
//int32_t cid = cids[i];
// convert it
char buf[32];
int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
// hash it
if ( ! hashString ( buf, blen, &hi ) ) return false;
}
// all done
return true;
}
bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {
setStatus ( "hashing site num inlinks" );
char s[32];
int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsitenuminlinks";
// hack test
// slen = sprintf(s,"%"UINT32"",
// ((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);
return hashString ( s, slen, &hi );
}
bool XmlDoc::hashCharset ( HashTableX *tt ) {
setStatus ( "hashing charset" );
char s[128]; // charset string
int32_t slen;
// hash the charset as a string
if ( ! get_charset_str(*getCharset()))
slen = sprintf(s, "unknown");
else
slen = sprintf(s, "%s", get_charset_str(*getCharset()));
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcharset";
if ( ! hashString ( s,slen, &hi ) ) return false;
// hash charset as a number
slen = sprintf(s, "%d", *getCharset());
return hashString ( s,slen, &hi ) ;
}
// . only hash certain tags (single byte scores and ST_COMMENT)
// . do not hash clocks, ST_SITE, ST_COMMENT
// . term = gbtag:blog1 score=0-100
// . term = gbtag:blog2 score=0-100
// . term = gbtag:english1 score=0-100
// . term = gbtag:pagerank1 score=0-100, etc. ...
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
// . later we can support query like gbtag:english1>30
bool XmlDoc::hashTagRec ( HashTableX *tt ) {
setStatus ( "hashing tag rec" );
//char *field = "gbtag:";
//int32_t fieldlen = gbstrlen(field);
//bool retval = true;
// . this tag rec does not have the ST_SITE tag in it to save space
// . it does not have clocks either?
TagRec *gr = getTagRec();
// count occurrence of each tag id
//int16_t count [ LAST_TAG ];
//memset ( count , 0 , 2 * LAST_TAG );
// loop over all tags in the title rec
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get id
int32_t type = tag->m_type;
// skip tags we are not supposed to index, like
// ST_CLOCK, etc. or anything with a dataSize not 1
if ( ! tag->isIndexable() ) continue;
// hash these metas below
//if ( type == ST_META ) continue;
//if ( tag->isType("meta") ) continue;
// only single byters. this should have been covered by the
// isIndexable() function.
//if ( tag->getTagDataSize() != 1 ) continue;
// get the name
char *str = getTagStrFromType ( type );
// get data size
//uint8_t *data = (uint8_t *)tag->getTagData();
// make it a string
//char dataStr[6];
//sprintf ( dataStr , "%"INT32"",(int32_t)*data );
// skip if has non numbers
//bool num = true;
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
// skip if it has more than just digits, we are not indexing
// strings at this point
//if ( ! num ) continue;
// point to it, should be a NULL terminated string
char *dataStr = tag->getTagData();
// skip if number is too big
//int32_t val = atol ( dataStr );
// boost by one so we can index "0" score
//val++;
// we really only want to index scores from 0-255
//if ( val > 255 ) continue;
// no negatives
//if ( val <= 0 ) continue;
// count occurrence
//count [ type ]++;
// . make the term name to hash after the gbtag:
// . we want to hash "gbtag:english3" for example, for the
// ST_ENGLISH tag id.
char prefix[64];
// . do not include the count for the first occurrence
// . follows the gbruleset:36 convention
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
//if ( count[type] == 1 )
sprintf ( prefix , "gbtag%s",str);
// assume that is good enough
//char *prefix = tmp;
// store prefix into m_wbuf so XmlDoc::print() works!
//if ( m_pbuf ) {
// int32_t tlen = gbstrlen(tmp);
// m_wbuf.safeMemcpy(tmp,tlen+1);
// prefix = m_wbuf.getBuf() - (tlen+1);
//}
//else
// sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]);
// "unmap" it so when it is hashed it will have the correct
// 8-bit score. IndexList.cpp will convert it back to 8 bits
// in IndexList::set(table), which sets our termlist from
// this "table".
//int32_t score = score8to32 ( val );
// we already incorporate the score as a string when we hash
// gbtagtagname:tagvalue so why repeat it?
//int32_t score = 1;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = prefix;
hi.m_hashGroup = HASHGROUP_INTAG;
// meta is special now
if ( tag->isType("meta") ) {
hi.m_prefix = NULL;
}
// hash it. like "gbtagenglish:1" with a score of 1, etc.
// or "gbtagspam:33" with a score of 33. this would also
// hash gbtagclock:0xfe442211 type things as well.
int32_t dlen = gbstrlen(dataStr);
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
}
return true;
}
bool XmlDoc::hashPermalink ( HashTableX *tt ) {
setStatus ( "hashing is permalink" );
// put a colon in there so it can't be faked using a meta tag.
char *s = "0";
if ( *getIsPermalink() ) s = "1";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbpermalink";
return hashString ( s,1,&hi );
}
//hash the tag pair vector, the gigabit vector and the sample vector
bool XmlDoc::hashVectors ( HashTableX *tt ) {
setStatus ( "hashing vectors" );
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
char buf[32];
uint32_t h;
//char *field;
//char *descr;
//h = m_tagVector.getVectorHash();
uint32_t tph = *getTagPairHash32();
int32_t blen = sprintf(buf,"%"UINT32"", tph);
//field = "gbtagvector";
//descr = "tag vector hash";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbtagvector";
hi.m_desc = "tag vector hash";
hi.m_shardByTermId = true;
// this returns false on failure
if ( ! hashString ( buf,blen, &hi ) ) return false;
h = *getGigabitVectorScorelessHash();
blen = sprintf(buf,"%"UINT32"",(uint32_t)h);
// update hash parms
hi.m_prefix = "gbgigabitvector";
hi.m_desc = "gigabit vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi) ) return false;
// . dup checking uses the two hashes above, not this hash!!! MDW
// . i think this vector is just used to see if the page changed
// significantly since last spidering
// . it is used by getPercentChanged() and by Dates.cpp
// . sanity check
//if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
//int32_t *pc = m_pageSampleVec;
//h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE);
//blen = sprintf(buf,"%"UINT32"",(int32_t unsigned int)h);
//field = "gbsamplevector";
//descr = "sample vector hash";
// this returns false on failure
//if ( ! hashString ( tt,buf,blen,score,field,descr) )
// return false;
// . hash combined for Dup Detection
// . must match XmlDoc::getDupList ( );
//uint64_t h1 = m_tagVector.getVectorHash();
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
//uint64_t h64 = hash64 ( h1 , h2 );
// take this out for now
/*
uint64_t *dh = getDupHash ( );
blen = sprintf(buf,"%"UINT64"", *dh );//h64);
//field = "gbduphash";
//descr = "dup vector hash";
// update hash parms
hi.m_prefix = "gbduphash";
hi.m_desc = "dup vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
*/
// hash the wikipedia docids we match
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) {
blen = sprintf(buf,"%"UINT64"",ptr_wikiDocIds[i]);
// convert to int32_t
//int32_t convScore = (int32_t)ptr_wikiScores[i];
// get score
//uint32_t ws = score8to32 ( convScore );
// update hash parms
hi.m_prefix = "gbwikidocid";
hi.m_desc = "wiki docid";
hi.m_hashGroup = HASHGROUP_INTAG;
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
}
return true;
}
bool XmlDoc::hashAds ( HashTableX *tt ) {
setStatus ( "hashing ad ids" );
for(int32_t i = 0; i < size_adVector / 8 ; i++) {
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
char buf[128];
char *field;
char *descr;
//buflen = snprintf(buf,128,"%s-%s",
// m_adProvider[i],m_adClient[i]);
snprintf(buf,128,"%"UINT64"",ptr_adVector[i] );
int32_t bufLen = gbstrlen(buf);
field = "gbad";
descr = "ad provider and id";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbad";
hi.m_desc = "ad provider and id";
//log(LOG_WARN, "build: url %s indexing ad termid %s:%s",
// getFirstUrl()->getUrl(), field, buf);
//this returns false on failure
if ( ! hashString ( buf,bufLen,&hi ) ) return false;
}
return true;
}
Url *XmlDoc::getBaseUrl ( ) {
if ( m_baseUrlValid ) return &m_baseUrl;
// need this
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
// no longer set addWWW to true since tmblr.co has an IP but
// www.tmblr.co does not
m_baseUrl.set ( cu , false ); // addWWW = true
// look for base url
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
// 12 is the <base href> tag id
if ( xml->getNodeId ( i ) != TAG_BASE ) continue;
// get the href field of this base tag
int32_t linkLen;
char *link = (char *) xml->getString ( i, "href", &linkLen );
// skip if not valid
if ( ! link || linkLen == 0 ) continue;
// set base to it (addWWW is now false, per the note above)
m_baseUrl.set(link, linkLen, false);//true);
break;
}
// fix invalid <base href="/" target="_self"/> tag
if ( m_baseUrl.getHostLen () <= 0 || m_baseUrl.getDomainLen() <= 0 )
m_baseUrl.set ( cu , false );
m_baseUrlValid = true;
return &m_baseUrl;
}
// hash gbhasthumbnail:0|1
bool XmlDoc::hashImageStuff ( HashTableX *tt ) {
setStatus ("hashing image stuff");
char *val = "0";
char **td = getThumbnailData();
if ( *td ) val = "1";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbhasthumbnail";
hi.m_desc = "has a thumbnail";
// this returns false on failure
if ( ! hashString ( val,1,&hi ) ) return false;
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
setStatus ("hashing isadult");
char *ia = getIsAdult();
// this should not block or return error! should have been
// set in prepareToMakeTitleRec() before hashAll() was called!
if ( ! ia || ia == (void *)-1 ) {char *xx=NULL;*xx=0; }
// index gbisadult:1 if adult or gbisadult:0 if not
char *val;
if ( *ia ) val = "1";
else val = "0";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbisadult";
hi.m_desc = "is document adult content";
// this returns false on failure
if ( ! hashString ( val,1,&hi ) ) return false;
return true;
}
// hash destination urls for embedded gb search boxes
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {
setStatus ( "hashing submit urls" );
Url *baseUrl = getBaseUrl();
if ( ! baseUrl || baseUrl == (Url *)-1) { char*xx=NULL;*xx=0;}
for ( int32_t i = 0 ; i < m_xml.getNumNodes() ; i++ ) {
// Find forms
if ( m_xml.getNodeId(i) != TAG_FORM ) continue;
if ( m_xml.isBackTag(i) ) continue;
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
int32_t len;
char *s = m_xml.getString ( i , "action" , &len );
if (!s || len == 0) continue;
Url url; url.set(baseUrl, s, len, true);
char *buf = url.getUrl();
int32_t blen = url.getUrlLen();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsubmiturl";
hi.m_desc = "submit url for form";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
}
return true;
}
//
// STUFF IMPORTED FROM INDEXLIST.CPP
//
// we also assume all scores are above 256
uint8_t score32to8 ( uint32_t score ) {
// a zero score maps to 0 now (we no longer bump it to 1 here; score
// is unsigned, so <= 0 just means == 0)
if ( score <= 0 ) return (unsigned char) 0;
// extremely large scores need an adjustment to avoid wrapping
if ( score < (uint32_t)0xffffffff - 128 )
score += 128;
// scores are multiplied by 256 to preserve fractions, so undo that
score /= 256;
// ensure score is > 0
if ( score <= 0 ) return (unsigned char) 1;
// if score < 128 return it now
if ( score < 128 ) return (unsigned char) score;
// now shrink it so it's now from 1 upwards
score -= 127;
// . take NATURAL log of score now
// . PROBLEM: for low scores logscore may increase by close to 1.0
// for a score increase of 1.0. and since s_maxscore is about 22.0
// we end up moving 1.0/22.0 of 128 total pts causing a jump of
// 2 or more score points!! oops!!! to fix, let's add 10 pts
// to the score
score += 10;
double logscore = ::log ( (double)score );
// now the max it can be
//double maxscore = ::log ( (double)(0x00ffffff - 127));
static double s_maxscore = -1.0;
static double s_minscore = -1.0;
if ( s_maxscore == -1.0 ) {
uint32_t max = ((0xffffffff + 0)/256) - 127 + 10;
uint32_t min = ( 128 ) - 127 + 10;
s_maxscore = ::log((double)max);
s_minscore = ::log((double)min);
// adjust
s_maxscore -= s_minscore;
}
// adjust it
logscore -= s_minscore;
// scale it into [0,127] (add .5 for rounding)
double scaled = (logscore* 127.0) / s_maxscore + .5;
// sanity check
if ( (unsigned char)scaled >= 128 ) { char *xx=NULL;*xx=0; }
// . go into the 8 bit score now
// . set the hi bit so they know we took its log
unsigned char score8 = (unsigned char)scaled | 128;
return score8;
}
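// . a rough worked example of the mapping above (these values follow
//   from the code and the s_scoreMap table below, shown only for
//   illustration):
//     score32to8(1)     == 1    (tiny scores clamp to 1)
//     score32to8(385)   == 2    ((385+128)/256 = 2, linear range)
//     score32to8(32385) == 127  (last value of the linear range)
//     score32to8(32641) == 128  (first log-compressed value, hi bit set)
// . so 8-bit scores 1-127 cover 32-bit scores up to ~32K linearly and
//   128-255 cover the rest logarithmically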
// for score8to32() below
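// . each entry j is (approximately) the smallest 32-bit score that
//   score32to8() maps to j, so score32to8(s_scoreMap[j]) == j for
//   j = 1..255 (see the commented-out generator and sanity test in
//   score8to32() below)
// . that makes score8to32(score32to8(x)) a lossy round trip that lands
//   at or a bit below the original x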
static uint32_t s_scoreMap[] = {
0UL,
1UL,
385UL,
641UL,
897UL,
1153UL,
1409UL,
1665UL,
1921UL,
2177UL,
2433UL,
2689UL,
2945UL,
3201UL,
3457UL,
3713UL,
3969UL,
4225UL,
4481UL,
4737UL,
4993UL,
5249UL,
5505UL,
5761UL,
6017UL,
6273UL,
6529UL,
6785UL,
7041UL,
7297UL,
7553UL,
7809UL,
8065UL,
8321UL,
8577UL,
8833UL,
9089UL,
9345UL,
9601UL,
9857UL,
10113UL,
10369UL,
10625UL,
10881UL,
11137UL,
11393UL,
11649UL,
11905UL,
12161UL,
12417UL,
12673UL,
12929UL,
13185UL,
13441UL,
13697UL,
13953UL,
14209UL,
14465UL,
14721UL,
14977UL,
15233UL,
15489UL,
15745UL,
16001UL,
16257UL,
16513UL,
16769UL,
17025UL,
17281UL,
17537UL,
17793UL,
18049UL,
18305UL,
18561UL,
18817UL,
19073UL,
19329UL,
19585UL,
19841UL,
20097UL,
20353UL,
20609UL,
20865UL,
21121UL,
21377UL,
21633UL,
21889UL,
22145UL,
22401UL,
22657UL,
22913UL,
23169UL,
23425UL,
23681UL,
23937UL,
24193UL,
24449UL,
24705UL,
24961UL,
25217UL,
25473UL,
25729UL,
25985UL,
26241UL,
26497UL,
26753UL,
27009UL,
27265UL,
27521UL,
27777UL,
28033UL,
28289UL,
28545UL,
28801UL,
29057UL,
29313UL,
29569UL,
29825UL,
30081UL,
30337UL,
30593UL,
30849UL,
31105UL,
31361UL,
31617UL,
31873UL,
32129UL,
32385UL,
32641UL,
32897UL,
33488UL,
33842UL,
34230UL,
34901UL,
35415UL,
35979UL,
36598UL,
37278UL,
38025UL,
39319UL,
40312UL,
41404UL,
43296UL,
44747UL,
46343UL,
48098UL,
51138UL,
53471UL,
56037UL,
58859UL,
61962UL,
65374UL,
71287UL,
75825UL,
80816UL,
86305UL,
92342UL,
98982UL,
110492UL,
119326UL,
129042UL,
139728UL,
151481UL,
171856UL,
187496UL,
204699UL,
223622UL,
244437UL,
267333UL,
307029UL,
337502UL,
371022UL,
407893UL,
448450UL,
493062UL,
570408UL,
629783UL,
695095UL,
766938UL,
845965UL,
982981UL,
1088163UL,
1203862UL,
1331130UL,
1471124UL,
1625117UL,
1892110UL,
2097072UL,
2322530UL,
2570533UL,
2843335UL,
3143416UL,
3663697UL,
4063102UL,
4502447UL,
4985726UL,
5517332UL,
6439034UL,
7146599UL,
7924919UL,
8781070UL,
9722836UL,
10758778UL,
12554901UL,
13933735UL,
15450451UL,
17118838UL,
18954063UL,
20972809UL,
24472927UL,
27159874UL,
30115514UL,
33366717UL,
36943040UL,
43143702UL,
47903786UL,
53139877UL,
58899576UL,
65235244UL,
72204478UL,
84287801UL,
93563849UL,
103767501UL,
114991518UL,
127337936UL,
140918995UL,
164465962UL,
182542348UL,
202426372UL,
224298798UL,
248358466UL,
290073346UL,
322096762UL,
357322519UL,
396070851UL,
438694015UL,
485579494UL,
566869982UL,
629274552UL,
697919578UL,
773429105UL,
856489583UL,
947856107UL,
1106268254UL,
1227877095UL,
1361646819UL,
1508793514UL,
1670654878UL,
1951291651UL,
2166729124UL,
2403710344UL,
2664389686UL,
2951136962UL,
3266558965UL,
3813440635UL,
4233267317UL
};
uint32_t score8to32 ( uint8_t score8 ) {
/*
int32_t test = score32to8((uint32_t)0xffffffff);
static bool s_set = false;
if ( ! s_set ) {
s_set = true;
uint8_t lasts = 0;
int32_t step = 128;
int64_t start = gettimeofdayInMilliseconds();
for ( uint64_t i=1 ; i<(uint32_t)0xffffffff ; i+=step) {
// get the score
uint8_t s = score32to8(i);
// print it out now
if ( s != lasts ) {
fprintf(stderr,"\t%"UINT32"UL,\n",i);
}
// if no change, skip it
if (lasts != 0 && s == lasts ) {
if ( s > 128 )
step = (int32_t)((float)step * 1.1);
continue;
}
// otherwise set it
s_scoreMap[s] = i;
// reset
lasts = s;
}
// sanity test
for ( int32_t j = 1 ; j < 256 ; j++ ) {
uint32_t big = s_scoreMap[j];
if ( score32to8(big) != j ) { char *xx=NULL;*xx=0;}
}
int64_t end = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,
"gb: took %"INT64" ms to build score table.",
end-start);
}
// sanity test
static bool s_set = false;
if ( ! s_set ) {
for ( int32_t j = 1 ; j < 256 ; j++ ) {
uint32_t big = s_scoreMap[j];
uint8_t tt;
tt = score32to8(big);
if ( tt != j ) { char *xx=NULL;*xx=0;}
}
s_set = true;
}
*/
return(s_scoreMap[score8]);
}
////////////////////////////////////////////////////////////
//
// Summary/Title generation for Msg20
//
////////////////////////////////////////////////////////////
void XmlDoc::set20 ( Msg20Request *req ) {
// clear it all out
reset();
// this too
m_reply.reset();
m_pbuf = NULL;//pbuf;
m_niceness = req->m_niceness;
// remember this
m_req = req;
// and this!
//m_coll = req->ptr_coll;
//setCollNum ( req->ptr_coll );
m_collnum = req->m_collnum;
m_collnumValid = true;
// make this stuff valid
if ( m_req->m_docId > 0 ) {
m_docId = m_req->m_docId;
m_docIdValid = true;
}
// set url too if we should
if ( m_req->size_ubuf > 1 )
setFirstUrl ( m_req->ptr_ubuf , false );
}
#define MAX_LINK_TEXT_LEN 512
#define MAX_RSSITEM_SIZE 30000
void getMsg20ReplyWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// return if it blocked
if ( THIS->getMsg20Reply ( ) == (void *)-1 ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// . returns NULL with g_errno set on error
// . returns -1 if blocked
Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// return it right away if valid
if ( m_replyValid ) return &m_reply;
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block, this callback will be called
if ( ! m_masterLoop ) {
m_masterLoop = getMsg20ReplyWrapper;
m_masterState = this;
}
// used by Msg20.cpp to time this XmlDoc::getMsg20Reply() function
if ( ! m_startTimeValid && isClockInSync() ) {
m_startTime = gettimeofdayInMilliseconds();
m_startTimeValid = true;
}
// caller should have the callback set
if ( ! m_callback1 && ! m_callback2 ) { char *xx=NULL;*xx=0; }
//char safeStack[100000];
//safeStack[0] = 0;
//safeStack[90000] = 0;
// shortcut
Msg20Reply *reply = &m_reply;
m_niceness = m_req->m_niceness;
m_collnum = m_req->m_collnum;//cr->m_collnum;
m_collnumValid = true;
//char *coll = m_req->ptr_coll;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return NULL;
// set this important member var
//if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll));
// return NULL with g_errno set on error
//if ( ! cr ) return NULL;
// . cache it for one hour
// . this will set our ptr_ and size_ member vars
char **otr = getOldTitleRec ( );
if ( ! otr || otr == (void *)-1 ) return (Msg20Reply *)otr;
// must have a title rec in titledb
if ( ! *otr ) { g_errno = ENOTFOUND; return NULL; }
// sanity
if ( *otr != m_oldTitleRec ) { char *xx=NULL;*xx=0; }
// what is this?
int32_t maxSize = 0;
// . set our ptr_ and size_ member vars from it after uncompressing
// . returns false and sets g_errno on error
if ( ! m_setTr ) {
// . this completely resets us
// . this returns false with g_errno set on error
bool status = set2( *otr, maxSize, cr->m_coll, NULL,
m_niceness);
// sanity check
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
// if there was an error, g_errno should be set.
if ( ! status ) return NULL;
m_setTr = true;
}
// breathe
QUICKPOLL(m_niceness);
// init
reply->m_nextMerged = NULL;
reply->m_collnum = m_collnum;
// MsgE uses this one
if ( m_req->m_getTitleRec ) {
// this is the original compressed titleRec, preceeded
// by key and dataSize and followed by the data
reply-> ptr_tr = m_oldTitleRec;
reply->size_tr = m_oldTitleRecSize;
m_replyValid = true;
return reply;
}
// if they provided a query with gbfacet*: terms then we have
// to get those facet values.
if ( ! m_gotFacets ) {
// only do this once
m_gotFacets = true;
// get facet term
char *qs = m_req->ptr_qbuf;
facetPrintLoop:
for ( ; qs && *qs ; qs++ ) {
if ( qs[0] != 'g' ) continue;
if ( qs[1] != 'b' ) continue;
if ( qs[2] != 'f' ) continue;
if ( strncasecmp(qs,"gbfacet",7) ) continue;
qs += 7;
// gbfacetstr: gbfacetint: gbfacetfloat:
if ( strncasecmp(qs,"str:" ,4) == 0 ) qs += 4;
else if ( strncasecmp(qs,"int:" ,4) == 0 ) qs += 4;
else if ( strncasecmp(qs,"float:",6) == 0 ) qs += 6;
else continue;
break;
}
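// example (the field name here is purely illustrative): for a query
// containing "gbfacetstr:color", qs now points at "color"; if no
// gbfacet*: term was found, qs is NULL or points at the terminating
// \0 and the block below is skipped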
// if we had a facet, get the values it has in the doc
if ( qs && *qs ) {
// need this for storeFacetValues() if we are json
if ( m_contentType == CT_JSON ||
// spider status docs are really json
m_contentType == CT_STATUS ) {
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1)
return (Msg20Reply *)jp;
}
if ( m_contentType == CT_HTML ||
m_contentType == CT_XML ) {
Xml *xml = getXml();
if ( ! xml || xml==(void *)-1)
return (Msg20Reply *)xml;
}
// find end of it
char *e = qs;
for ( ; *e && ! is_wspace_a(*e) ; e++ );
// tmp null it
char c = *e; *e = '\0';
// this is zero if unspecified
FacetValHash_t fvh = m_req->m_facetValHash;
// . this will store facetField/facetValue pairs
// . stores into safebuf, m_tmpBuf2
// . it will terminate all stored strings with \0
// . we check meta tags for html docs
// . otherwise we check xml/json doc fields
// . returns false with g_errno set on error
bool ret = storeFacetValues ( qs , &m_tmpBuf2 , fvh ) ;
// revert the \0
*e = c;
// return NULL with g_errno set on error
if ( ! ret ) return NULL;
// advance
qs = e;
// do another one
goto facetPrintLoop;
}
// assign
reply-> ptr_facetBuf = m_tmpBuf2.getBufStart();
reply->size_facetBuf = m_tmpBuf2.length();
}
if ( m_req->m_justGetFacets ) {
m_replyValid = true;
return reply;
}
if ( m_req->m_getTermListBuf ) {
// ensure content is recycled from title rec
m_recycleContent = true;
//xd->m_recycleLinkInfo = true;
// only get posdb keys really for this stuff
m_useTitledb = false;
m_useTagdb = false;
m_useClusterdb = false;
m_useSpiderdb = false;
m_useLinkdb = false;
// time it
if ( m_tlbufTimer == 0 )
m_tlbufTimer = gettimeofdayInMilliseconds();
// . shit limit content for speed!!!
// . this is for getting matching queries/relatedqueries
// anyway, so should be ok
if ( size_utf8Content > 150000 ) {
char *p = ptr_utf8Content + 150000 - 1;
char *pstart = ptr_utf8Content;
// back up until we hit punct
for ( ; p > pstart ; p-- )
if ( is_punct_utf8(p) ) break;
// set new size then
*p = '\0';
size_utf8Content = p - pstart + 1;
}
// hack: should be sorted by lower 32bits of termids
// so handleRequest8e does not have to sort before doing
// its query matching algo with queries in g_qbuf.
// but these termlists are really mostly used for doing
// the gbdocid:|xxxx queries in handleRequest8e.
SafeBuf *tbuf = getTermListBuf();
if ( ! tbuf || tbuf == (void *)-1 ) return (Msg20Reply *)tbuf;
SafeBuf *tibuf = getTermId32Buf();
if ( ! tibuf || tibuf == (void *)-1)return (Msg20Reply *)tibuf;
// time it
int64_t took = gettimeofdayInMilliseconds() - m_tlbufTimer;
log("seo: tlistbuf gen took %"INT64" ms for docid %"INT64"",
took,m_docId);
// just that
reply-> ptr_tlistBuf = tbuf->getBufStart();
reply->size_tlistBuf = tbuf->length();
reply-> ptr_tiBuf = tibuf->getBufStart();
reply->size_tiBuf = tibuf->length();
m_replyValid = true;
return reply;
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not. but for getting m_getTermListBuf
// and stuff above, skip the tagrec lookup!
// save some time when SPIDERING/BUILDING by skipping fresh
// tagdb lookup and using tags in titlerec
if ( m_req && ! m_req->m_getLinkText && ! m_checkedUrlFilters )
m_tagRecDataValid = false;
// set and validate member vars
//if ( ! m_setFromTitleRec )
// // return NULL with g_errno set on error
// if ( ! set ( tr , NULL , m_niceness ) ) return NULL;
// if shard responsible for tagrec is dead, then
// just recycle!
if ( m_req && ! m_checkedUrlFilters && ! m_tagRecDataValid ) {
char *site = getSite();
TAGDB_KEY tk1 = g_tagdb.makeStartKey ( site );
TAGDB_KEY tk2 = g_tagdb.makeDomainStartKey ( &m_firstUrl );
uint32_t shardNum1 = g_hostdb.getShardNum(RDB_TAGDB,&tk1);
uint32_t shardNum2 = g_hostdb.getShardNum(RDB_TAGDB,&tk2);
// shardnum1 and shardnum2 are often different!
// log("db: s1=%i s2=%i",(int)shardNum1,(int)shardNum2);
if ( g_hostdb.isShardDead ( shardNum1 ) ) {
log("query: skipping tagrec lookup for dead shard "
"# %"INT32""
,shardNum1);
m_tagRecDataValid = true;
}
if ( g_hostdb.isShardDead ( shardNum2 ) && m_firstUrlValid ) {
log("query: skipping tagrec lookup for dead shard "
"# %"INT32""
,shardNum2);
m_tagRecDataValid = true;
}
}
// if we are showing sites that have been banned in tagdb, we dont
// have to do a tagdb lookup. that should speed things up.
TagRec *gr = NULL;
if ( cr && cr->m_doTagdbLookups ) {
gr = getTagRec();
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
}
//reply-> ptr_tagRec = (char *)gr;
//reply->size_tagRec = gr->getSize();
// we use this instead of nowGlobal
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// this should be valid, it is stored in title rec
if ( m_contentHash32Valid ) reply->m_contentHash32 = m_contentHash32;
else reply->m_contentHash32 = 0;
// if this page is potential spam, toss it!
//char *isSpam = getIsSpam();
//if ( ! isSpam || isSpam == (char *)-1 ) return (Msg20Reply *)isSpam;
if ( ! m_checkedUrlFilters ) {
// do it
//int32_t *rn = getRegExpNum2(-1);
//if ( ! rn || rn == (int32_t *)-1 ) return (Msg20Reply *)rn;
// do not re-check
m_checkedUrlFilters = true;
// a non-www url?
/*
now we allow domain-only urls in the index, so this is
hurting us...
if ( ! m_req->m_getLinkText ) {
Url tmp;
tmp.set ( ptr_firstUrl );
if ( tmp.getHostLen() == tmp.getDomainLen() ) {
// set m_errno
reply->m_errno = EDOCFILTERED;
// tmp debug
log("xmldoc: filtering non www url %s",
ptr_firstUrl);
// and this
reply->m_isFiltered = true;
// give back the url at least
reply->ptr_ubuf = getFirstUrl()->getUrl();
reply->size_ubuf =getFirstUrl()->getUrlLen()+1;
// validate
m_replyValid = true;
// and return
return reply;
}
}
*/
// get this
//time_t nowGlobal = getTimeGlobal();
// get this
SpiderRequest sreq;
SpiderReply srep;
setSpiderReqForMsg20 ( &sreq , &srep );//, *isSpam );
int32_t spideredTime = getSpideredTime();
int32_t langIdArg = -1;
if ( m_langIdValid ) langIdArg = m_langId;
// get it
int32_t ufn;
ufn=::getUrlFilterNum(&sreq,&srep,spideredTime,true,
m_niceness,cr,
false, // isOutlink?
NULL ,
langIdArg);
// sanity check
if ( ufn < 0 ) {
log("msg20: bad url filter for url %s", sreq.m_url);
}
// save it
reply->m_urlFilterNum = ufn;
// get spider priority if ufn is valid
int32_t pr = 0;
//if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
if ( ufn >= 0 && cr->m_forceDelete[ufn] ) pr = -3; // guard the ufn<0 case logged above
// this is an automatic ban!
if ( gr && gr->getLong("manualban",0))
pr=-3;//SPIDER_PRIORITY_BANNED;
// is it banned
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
// set m_errno
reply->m_errno = EDOCBANNED;
// and this
reply->m_isBanned = true;
}
//
// for now always allow it until we can fix this better
// we probably should assume NOT filtered unless it matches
// a string match only url filter... but at least we will
// allow it to match "BANNED" filters for now...
//
pr = 0;
// if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
// // set m_errno
// reply->m_errno = EDOCFILTERED;
// // and this
// reply->m_isFiltered = true;
// }
// bail out now if banned/filtered and we are not showing banned results
if ( reply->m_errno && ! m_req->m_showBanned ) {
// give back the url at least
reply->ptr_ubuf = getFirstUrl()->getUrl();
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
m_replyValid = true;
return reply;
}
}
// breathe
QUICKPOLL ( m_niceness );
// a special hack for XmlDoc::getRecommendedLinksBuf() so we exclude
// links that link to the main url's site/domain as well as a
// competitor url (aka related docid)
Links *links = NULL;
if ( m_req->m_ourHostHash32 || m_req->m_ourDomHash32 ) {
links = getLinks();
if ( ! links || links==(Links *)-1) return (Msg20Reply *)links;
}
// breathe
QUICKPOLL ( m_niceness );
// truncate content length if we should
// this was hurting our linkdb lookups! do not do it for those!
/*
if ( size_utf8Content > cr->m_contentLenMaxForSummary &&
// fix for link text fetching!
! req->m_getLinkText ) {
logf(LOG_DEBUG,"summary: truncating doc of len %"INT32" to %"INT32" for "
"generating summary",
size_utf8Content,cr->m_contentLenMaxForSummary);
size_utf8Content = cr->m_contentLenMaxForSummary ;
// null term just in case
ptr_utf8Content[size_utf8Content-1] = '\0';
}
*/
// do they want a summary?
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
char *hsum = getHighlightedSummary();
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
//Summary *s = getSummary();
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
//int32_t sumLen = m_finalSummaryBuf.length();
// is it size and not length?
int32_t hsumLen = 0;
// seems like it can return 0x01 if none...
if ( hsum == (char *)0x01 ) hsum = NULL;
// get len. this is the HIGHLIGHTED summary so it is ok.
if ( hsum ) hsumLen = gbstrlen(hsum);
// must be \0 terminated. not any more, it can be a subset
// of a larger summary used for deduping
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
// assume size is 0
//int32_t sumSize = 0;
// include the \0 in size
//if ( sum ) sumSize = sumLen + 1;
// do not get any more than "me" lines/excerpts of summary
//int32_t max = m_req->m_numSummaryLines;
// grab stuff from it!
//reply->m_proximityScore = s->getProximityScore();
reply-> ptr_displaySum = hsum;//s->getSummary();
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
// this is unhighlighted for deduping, and it might be longer
// . seems like we are not using this for deduping but using
// the gigabit vector in Msg40.cpp, so take out for now
//reply-> ptr_dedupSum = s->m_summary;
//reply->size_dedupSum = s->m_summaryLen+1;
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
//reply->m_diversity = s->getDiversity();
}
reply->m_numAlnumWords = 0;
if ( m_wordsValid )
reply->m_numAlnumWords = m_words.m_numAlnumWords;
// . we filter out search results that do not have all the query terms
// . Matches.cpp checks the link text, dmoz, etc. for all query terms
// . it must get into the results from indexdb corruption?
// . this filtering method is/was known as the "BIG HACK"
// . We also make sure that matches aren't based on
// . "anomalous" link text, where a doc has so many link texts
// . that most common dictionary terms appear in or around
// . a link to the site.
if ( m_req->size_qbuf > 1 ) {
Matches *mm = getMatches();
int32_t numInlinks = getLinkInfo1()->getNumLinkTexts( );
reply->m_hasAllQueryTerms = mm->docHasQueryTerms(numInlinks);
}
// breathe
QUICKPOLL ( m_niceness );
// copy the link info stuff?
if ( ! m_req->m_getLinkText ) {
reply->ptr_linkInfo = (char *)ptr_linkInfo1;
reply->size_linkInfo = size_linkInfo1;
}
// breathe
QUICKPOLL ( m_niceness );
bool getThatTitle = true;
if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
if ( reply->ptr_tbuf ) getThatTitle = false;
// if steve's requesting the inlink summary we will want to get
// the title of each linker even if they are spammy!
// only get title here if NOT getting link text otherwise
// we only get it down below if not a spammy voter, because
// this sets the damn slow sections class
if ( m_req->m_getLinkText &&
! m_useSiteLinkBuf &&
! m_usePageLinkBuf &&
// m_pbuf is used by pageparser.cpp now, not the other two things
// above this.
! m_pbuf )
getThatTitle = false;
// if steve is getting the inlinks, bad and good, for displaying
// then get the title here now... otherwise, if we are just spidering
// and getting the inlinks, do not bother getting the title because
// the inlink might be linkspam... and we check down below...
if ( ! m_req->m_onlyNeedGoodInlinks )
getThatTitle = true;
// ... no more seo so stop it... disable this for sp
if ( m_req->m_getLinkText )
getThatTitle = false;
if ( getThatTitle ) {
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
char *tit = ti->getTitle();
int32_t titLen = ti->getTitleLen();
reply-> ptr_tbuf = tit;
reply->size_tbuf = titLen + 1; // include \0
// sanity
if ( tit && tit[titLen] != '\0' ) { char *xx=NULL;*xx=0; }
if ( ! tit || titLen <= 0 ) {
reply->ptr_tbuf = NULL;
reply->size_tbuf = 0;
}
}
// this is not documented because i don't think it will be popular
if ( m_req->m_getHeaderTag ) {
SafeBuf *htb = getHeaderTagBuf();
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
// . it should be null terminated
// . actually now it is a \0 separated list of the first
// few h1 tags
// . we call SafeBuf::pushChar(0) to add each one
reply->ptr_htag = htb->getBufStart();
reply->size_htag = htb->getLength();
}
// breathe
QUICKPOLL ( m_niceness );
if ( m_req->m_getMatches && ! reply->ptr_mbuf ) {
MatchOffsets *mo = getMatchOffsets();
if ( ! mo || mo == (MatchOffsets *)-1) return (Msg20Reply *)mo;
reply-> ptr_mbuf = (char *)mo->m_matchOffsets;
reply->size_mbuf = mo->m_numMatches*4;
}
// breathe
QUICKPOLL ( m_niceness );
// get site
reply->ptr_site = ptr_site;
reply->size_site = size_site;
// assume unknown
reply->m_noArchive = 0;
// are we noarchive? only check this if not getting link text
if ( ! m_req->m_getLinkText ) {
char *na = getIsNoArchive();
if ( ! na || na == (char *)-1 ) return (Msg20Reply *)na;
reply->m_noArchive = *na;
}
// breathe
QUICKPOLL ( m_niceness );
int32_t nowUTC2 = m_req->m_nowUTC;
if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet;
// . summary vector for deduping
// . does not compute anything if we should not! (svSize will be 0)
if ( ! reply->ptr_vbuf &&
m_req->m_getSummaryVector &&
cr->m_percentSimilarSummary > 0 &&
cr->m_percentSimilarSummary < 100 ) {
int32_t *sv = getSummaryVector ( );
if ( ! sv || sv == (void *)-1 ) return (Msg20Reply *)sv;
reply-> ptr_vbuf = (char *)m_summaryVec;
reply->size_vbuf = m_summaryVecSize;
}
// breathe
QUICKPOLL ( m_niceness );
if ( m_req->m_numSummaryLines > 0 ) {
// turn off for now since we added this to posdb
uint8_t *sl = getSummaryLangId();
if ( ! sl || sl == (void *)-1 ) return (Msg20Reply *)sl;
reply->m_summaryLanguage = *sl;
}
// breathe
QUICKPOLL ( m_niceness );
// returns values of specified meta tags
if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) {
int32_t dsize; char *d;
d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d;
reply->ptr_dbuf = d;
reply->size_dbuf = dsize; // includes \0
}
// breathe
QUICKPOLL ( m_niceness );
// . sample buffer for doing gigabit generation
// . Msg40.cpp calls intersectGigabits on all these samples from
// all the Msg20Replies it gets in the search results
//if ( ! reply->ptr_gigabitQuery && m_req->m_bigSampleMaxLen > 0 ) {
if ( ! reply->ptr_gigabitSample && m_req->m_bigSampleMaxLen > 0 ) {
// before we got a chunk of text from the doc
SafeBuf *gsbuf = getSampleForGigabits();
if ( ! gsbuf||gsbuf ==(void *)-1) return (Msg20Reply *)gsbuf;
reply->ptr_gigabitSample = gsbuf->getBufStart();
reply->size_gigabitSample = gsbuf->length();
// . now we use the gigabit query!
// . this is really used to find out what wikipedia pages
// we match the best...
// . this also sets the vector
/*
char *gq = getGigabitQuery();
if ( ! gq || gq == (char *)-1) return (Msg20Reply *)gq;
reply-> ptr_gigabitQuery = m_gigabitQuery;
reply->size_gigabitQuery = gbstrlen(m_gigabitQuery)+1;
reply-> ptr_gigabitScores = ptr_gigabitScores;
reply->size_gigabitScores = size_gigabitScores;
*/
}
// get full image url. but not if we already have a thumbnail...
if ( ! reply->ptr_imgUrl&&!reply->ptr_imgData&&!m_req->m_getLinkText){
// && m_req->m_getImageUrl ) {
char **iu = getImageUrl();
if ( ! iu || iu == (char **)-1 ) return (Msg20Reply *)iu;
reply-> ptr_imgUrl = *iu;
reply->size_imgUrl = 0;
if ( *iu ) reply->size_imgUrl = gbstrlen(*iu)+1;
}
// get thumbnail image DATA
if ( ! reply->ptr_imgData && ! m_req->m_getLinkText ) {
// && m_req->m_getImageUrl ) {
reply-> ptr_imgData = ptr_imageData;
reply->size_imgData = size_imageData;
}
// . adids contained in the doc
// . get from title rec rather than generating
// . but we need to generate to store in titleRec at index time
// . they are 32 bits each
int64_t **avp = getAdVector();
if ( ! avp || avp == (void *)-1 ) return (Msg20Reply *)avp;
// get firstip
int32_t *fip = getFirstIp();
if ( ! fip || fip == (void *)-1 ) return (Msg20Reply *)fip;
//Url **redir = getRedirUrl();
//if ( ! redir || redir == (Url **)-1 ) return (Msg20Reply *)redir;
//int32_t redirSize = 0;
//if ( *redir ) redirSize = (*redir)->getUrlLen() + 1;
//char *ru = NULL;
//if ( *redir ) ru = (*redir)->getUrl();
char *ru = ptr_redirUrl;
int32_t rulen = 0;
if ( ru ) rulen = gbstrlen(ru)+1;
// . Msg25.cpp uses m_adIdHash for restricting voting
// . these are 64 bit termids hashes
reply-> ptr_gbAdIds = (char *)*avp;
// this size is in bytes and includes the \0
reply->size_gbAdIds = size_adVector;
// need full cached page of each search result?
// include it always for spider status docs.
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
reply-> ptr_content = ptr_utf8Content;
reply->size_content = size_utf8Content;
}
// if ( m_req->m_getSectionVotingInfo && m_tmpBuf3.getCapacity() <=0) {
// Sections *ss = getSections();
// if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
// // will at least store a \0 in there, but will not count
// // as part of the m_tmpBuf.length()
// ss->printVotingInfoInJSON ( &m_tmpBuf3 );
// reply-> ptr_sectionVotingInfo = m_tmpBuf3.getBufStart();
// reply->size_sectionVotingInfo = m_tmpBuf3.length() + 1;
// }
// breathe
QUICKPOLL ( m_niceness );
// do they want to know if this doc has an outlink to a url
// that has the provided site and domain hash, Msg20Request::
// m_ourHostHash32 and m_ourDomHash32?
int32_t nl = 0;
if ( links ) nl = links->getNumLinks();
// scan all outlinks we have on this page
int32_t i ; for ( i = 0 ; i < nl ; i++ ) {
// get the normalized url
//char *url = links->getLinkPtr(i);
// get the site. this will not block or have an error.
int32_t hh32 = (int32_t)((uint32_t)links->getHostHash64(i));
if ( hh32 == m_req->m_ourHostHash32 ) break;
int32_t dh32 = links->getDomHash32(i);
if ( dh32 == m_req->m_ourDomHash32 ) break;
}
reply->m_hasLinkToOurDomOrHost = false;
if ( i < nl )
reply->m_hasLinkToOurDomOrHost = true;
// easy ones
reply->m_isPermalink = m_isPermalink;
reply->m_ip = m_ip;
reply->m_firstIp = *fip;
reply->m_domHash = getDomHash32();//domHash;
reply->m_docId = m_docId;
reply->m_urlHash48 = getFirstUrlHash48();
reply->m_contentLen = size_utf8Content;
reply->m_lastSpidered = getSpideredTime();//m_spideredTime;
reply->m_datedbDate = m_pubDate;
reply->m_firstIndexedDate = m_firstIndexedDate;
reply->m_firstSpidered = m_firstIndexedDate;
reply->m_contentType = m_contentType;
reply->m_hostHash = getHostHash32a();
//reply->m_contentHash = *getContentHash32();
reply->m_language = m_langId;
reply->m_country = *getCountryId();
//reply->m_hasAllQueryTerms = false;
reply->m_hopcount = m_hopCount;
reply->m_siteRank = getSiteRank();
reply->ptr_ubuf = getFirstUrl()->getUrl();
reply->ptr_rubuf = ru;
reply->ptr_catIds = ptr_catIds;
reply->ptr_indCatIds = ptr_indCatIds;
reply->ptr_dmozTitles = ptr_dmozTitles;
reply->ptr_dmozSumms = ptr_dmozSumms;
reply->ptr_dmozAnchors = ptr_dmozAnchors;
reply->ptr_metadataBuf = ptr_metadata;
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
reply->size_rubuf = rulen;
reply->size_catIds = size_catIds;
reply->size_indCatIds = size_indCatIds;
reply->size_dmozTitles = size_dmozTitles;
reply->size_dmozSumms = size_dmozSumms;
reply->size_dmozAnchors = size_dmozAnchors;
reply->size_metadataBuf = size_metadata;
// breathe
QUICKPOLL( m_req->m_niceness );
/*
// truncate if necessary (buzz)
int32_t maxLen = 150000;
// truncate it?
bool trunc = true;
// not if getting link text
if ( req->m_getLinkText ) trunc = false;
// or outlinks
if ( req->m_getOutlinks ) trunc = false;
// or any niceness 1+ for that matter, that indicates a build operation
if ( req->m_niceness > 0 ) trunc = false;
// this is causing us to get EMISSINGQUERYTERMS errors!!!
trunc = false;
// MDW: shorten for speed test
//int32_t maxLen = 1000;
if ( trunc && contentLen > maxLen+1 ) {
contentLen = maxLen;
content [maxLen ] = '\0';
}
*/
// check the tag first
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
//Tag *tag1 = gr->getTag ("sitenuminlinks");
//Tag *tag2 = gr->getTag ("sitepop");
//int32_t sni = 0;
//int32_t spop = 0;
//if ( tag1 ) sni = atol(tag1->m_data);
//if ( tag2 ) spop = atol(tag2->m_data);
reply->m_siteNumInlinks = m_siteNumInlinks;
//reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
//reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
//reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
//reply->m_sitePop = m_sitePop;
// . get stuff from link info
// . this is so fast, just do it for all Msg20 requests
// . no! think about it -- this can be huge for pages like
// google.com!!!
LinkInfo *info1 = ptr_linkInfo1;
if ( info1 ) { // && m_req->m_getLinkInfo ) {
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;
reply->m_pageNumUniqueCBlocks = info1->m_numUniqueCBlocks;
reply->m_pageInlinksLastUpdated = info1->m_lastUpdated;
//reply->m_pagePop = 0;//info1->m_pagePop;
//reply->m_siteNumInlinks = info1->m_siteNumInlinks;
//reply->m_sitePop = info1->m_sitePop;
}
// breathe
QUICKPOLL ( m_niceness );
// getLinkText is true if we are getting the anchor text for a
// supplied url as part of the SPIDER process..
// this was done by Msg23 before
if ( ! m_req->m_getLinkText ) {
m_replyValid = true;
return &m_reply;
}
// use the first url of the linker by default
Url *linker = &m_firstUrl;
// the base url, used for doing links: terms, is the final url,
// just in case there were any redirects
Url redir;
if ( ru ) {
redir.set ( ru );
linker = &redir;
}
// breathe
QUICKPOLL( m_niceness );
// . get score weight of link text
// . phase out the sitedb*.xml files
//int64_t x[] = {0,20,30,40,50,70,90,100}; qualities!
// map these siteNumInlinks (x) to a weight (y)
//int64_t x[] = {0,50,100,200,500,3000,10000,50000};
// these are the weights the link text will receive
//int64_t y[] = {10,30,2000,3000,4000,5000,6000,7000};
// sanity check
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// shortcut
//int32_t sni = m_siteNumInlinks;// *getSiteNumInlinks();
// get the final link text weight as a percentage
//int32_t ltw = getY ( m_siteNumInlinks , x , y , 8 );
// store the weight in the reply
//reply->m_linkTextScoreWeight = ltw;
//log(LOG_DEBUG,"build: got score weight of %"INT32" for sni=%"INT32"",
// (int32_t)reply->m_linkTextScoreWeight, m_siteNumInlinks);
// breathe
//QUICKPOLL( m_niceness );
// . we need the mid domain hash in addition to the ip domain because
// chat.yahoo.com has different ip domain than www.yahoo.com , ...
// and we don't want them both to be able to vote
// . the reply is zeroed out in call the reply->reset() above so
// if this is not yet set it will be 0
if ( reply->m_midDomHash == 0 ) {
char *m = linker->getMidDomain();
int32_t mlen = linker->getMidDomainLen();
reply->m_midDomHash = hash32 ( m , mlen );
}
// breathe
QUICKPOLL( m_niceness );
int64_t start = gettimeofdayInMilliseconds();
// if not set from above, set it here
if ( ! links ) links = getLinks ( true ); // do quick set?
if ( ! links || links == (Links *)-1 ) return (Msg20Reply *)links;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Msg20Reply *)pos;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Msg20Reply *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Msg20Reply *)xml;
//Sections *ss = getSections();
//if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
// . is this page a dynamic page?
// . like a guestbook, access log stats, etc.
// . we don't like to count such pages for links analysis because
// they can be spammed so easily
// . TODO: guestbooks and message boards typically contain cgi links
// can we use that to identify?
// . the coll size includes the \0
//CollectionRec *cr ;
//cr = g_collectiondb.getRec ( m_req->ptr_coll,m_req->size_coll-1);
// g_errno should be ENOCOLLREC
//if ( ! cr ) return NULL;
// . we want link text for this url, "linkee"
// . TODO: true --> add "www" to see if that fixes our problem
// i guess Links.cpp does that with the outlinks, so when
// Linkdb::fillList() uses Links.cpp, the outlinks have "www"
// prepended on them...
//Url linkee;
//linkee.set ( m_req->ptr_linkee , m_req->size_linkee );
// get a ptr to the link in the content. will point to the
// stuff in the href field of the anchor tag. used for seeing if
// we have bad links or not.
int32_t linkNode = -1;
int32_t linkNum = -1;
// . get associated link text from the linker's document for our "url"
// . only gets from FIRST link to us
// . TODO: allow more link text from better quality pages?
// . TODO: limit score based on link text length?
// . should always be NULL terminated
// . should not break in the middle of a word
// . this will return the item/entry if we are extracting from an
// rss/atom feed
char *rssItem = NULL;
int32_t rssItemLen = 0;
// store link text in here
//char linkTextBuf[MAX_LINK_TEXT_LEN];
//
// TODO: for getting siteinlinks just match the site in the url
	// not the full url... and maybe match the one with the shortest path.
//
// . get the link text
// . linkee might be a site if m_isSiteLinkInfo is true in which
// case we get the best inlink to that site, and linkee is
// something like blogspot.com/mary/ or some other site.
int32_t blen = links->getLinkText ( m_req->ptr_linkee ,//&linkee,
m_req->m_isSiteLinkInfo ,
m_linkTextBuf ,
MAX_LINK_TEXT_LEN-2 ,
&rssItem ,
&rssItemLen ,
&linkNode ,
&linkNum ,
m_niceness );
// . BUT this skips the news topic stuff too. bad?
// . THIS HAPPENED before because we were truncating the xml(see above)
if ( linkNode < 0 ) {
int64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 100 )
log("build: took %"INT64" ms to get link text for "
"%s from linker %s",
took,
m_req->ptr_linkee,
m_firstUrl.m_url );
logf(LOG_DEBUG,"build: Got linknode = %"INT32" < 0. Cached "
"linker %s does not have outlink to %s like linkdb "
"says it should. page is probably too big and the "
"outlink is past our limit. contentLen=%"INT32". or "
"a sitehash collision, or an area tag link.",
linkNode,getFirstUrl()->getUrl(),m_req->ptr_linkee,
m_xml.getContentLen());
//g_errno = ECORRUPTDATA;
// do not let multicast forward to a twin! so use this instead
// of ECORRUTPDATA
g_errno = EBADENGINEER;
//char *xx=NULL;*xx=0;
return NULL;
}
// breathe
QUICKPOLL(m_niceness);
if ( ! verifyUtf8 ( m_linkTextBuf , blen ) ) {
log("xmldoc: bad OUT link text from url=%s for %s",
m_req->ptr_linkee,m_firstUrl.m_url);
m_linkTextBuf[0] = '\0';
blen = 0;
}
// verify for rss as well. seems like we end up coring because
// length/size is not in cahoots and [size-1] != '\0' sometimes
	if ( rssItem && ! verifyUtf8 ( rssItem , rssItemLen ) ) {
log("xmldoc: bad RSS ITEM text from url=%s for %s",
m_req->ptr_linkee,m_firstUrl.m_url);
rssItem[0] = '\0';
rssItemLen = 0;
}
// point to it, include the \0.
if ( blen > 0 ) {
reply->ptr_linkText = m_linkTextBuf;
// save the size into the reply, include the \0
reply->size_linkText = blen + 1;
// sanity check
if ( blen + 2 > MAX_LINK_TEXT_LEN ) { char *xx=NULL;*xx=0; }
// sanity check. null termination required.
if ( m_linkTextBuf[blen] ) { char *xx=NULL;*xx=0; }
}
// . the link we link to
// . important when getting site info because the link url
// can be different than the root url!
reply-> ptr_linkUrl = links->getLink (linkNum);
reply->size_linkUrl = links->getLinkLen(linkNum)+1;
// save the rss item in our state so we can point to it, include \0
//if(rssItemLen > MAX_RSSITEM_SIZE-2 ) rssItemLen = MAX_RSSITEM_SIZE-2;
//char rssItemBuf[MAX_RSSITEM_SIZE];
if ( rssItemLen > MAX_RSSITEM_SIZE )
rssItemLen = MAX_RSSITEM_SIZE;
if ( rssItemLen > 0) {
m_rssItemBuf.safeMemcpy ( rssItem , rssItemLen );
m_rssItemBuf.pushChar('\0');
// gbmemcpy ( rssItemBuf, rssItem , rssItemLen );
// // NULL terminate it
// rssItemBuf[rssItemLen] = 0;
}
// point to it, include the \0
if ( rssItemLen > 0 ) {
reply->ptr_rssItem = m_rssItemBuf.getBufStart();
reply->size_rssItem = m_rssItemBuf.getLength();
}
// breathe
QUICKPOLL( m_niceness );
if ( ! m_req->m_doLinkSpamCheck )
reply->m_isLinkSpam = false;
if ( m_req->m_doLinkSpamCheck ) {
// reset to NULL to avoid gbstrlen segfault
char *note = NULL;
// need this
if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; }
// time it
//int64_t start = gettimeofdayInMilliseconds();
Url linkeeUrl;
linkeeUrl.set ( m_req->ptr_linkee );
// get it. does not block.
reply->m_isLinkSpam = ::isLinkSpam ( linker ,
m_ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
m_siteNumInlinks,
&m_xml,
links,
// if doc length more
// than 150k then consider
// it linkspam
// automatically so it
// can't vote
150000,//MAXDOCLEN//150000
&note ,
&linkeeUrl , // url ,
linkNode ,
cr->m_coll ,
m_niceness );
// store it
if ( note ) {
// include the \0
reply->ptr_note = note;
reply->size_note = gbstrlen(note)+1;
}
// log the reason why it is a log page
if ( reply->m_isLinkSpam )
log(LOG_DEBUG,"build: linker %s: %s.",
linker->getUrl(),note);
// sanity
if ( reply->m_isLinkSpam && ! note )
log("linkspam: missing note for d=%"INT64"!",m_docId);
// store times... nah, might have yielded cpu!
reply->m_timeLinkSpam = 0;
}
// breathe
QUICKPOLL(m_niceness);
// sanity check
if ( reply->ptr_rssItem &&
reply->size_rssItem>0 &&
reply->ptr_rssItem[reply->size_rssItem-1]!=0) {
char *xx=NULL;*xx=0; }
//log ("nogl=%"INT32"",(int32_t)m_req->m_onlyNeedGoodInlinks );
// . skip all this junk if we are a spammy voter
// . we get the title above in "getThatTitle"
if ( reply->m_isLinkSpam ) {
m_replyValid = true; return reply; }
// . this vector is set from a sample of the entire doc
// . it is used to dedup voters in Msg25.cpp
// . this has pretty much been replaced by vector2, it was
// also saying a doc was a dup if all its words were
// contained by another, like if it was a small subset, which
// wasn't the best behaviour.
// . yeah neighborhood text is much better and this is setting
// the slow sections class, so i took it out
getPageSampleVector ();
// must not block or error out. sanity check
if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
//st->m_v1.setPairHashes ( ww , -1 , m_niceness );
// breathe
QUICKPOLL( m_niceness );
//st->m_v2.setPairHashes ( ww,linkWordNum, m_niceness );
// . this vector is set from the text after the link text
// . it terminates at at a breaking tag
// . check it out in ~/fff/src/Msg20.cpp
getPostLinkTextVector ( linkNode );
// must not block or error out. sanity check
//if ( ! m_postLinkTextVecValid ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL( m_niceness );
// set from the hashes of the tag id pairs
//st->m_v3.setTagPairHashes ( xml , m_niceness );
// get it
getTagPairHashVector();
// must not block or error out. sanity check
if ( ! m_tagPairHashVecValid ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL( m_niceness );
// this vector is set from the hashes of the path components
// with punctuation stripped out
//v4.set ( xml, NULL , linker, -1 ,buf4,size);
// . the 4th vector is provided, this will point to m_topIps[] buffer
// . this is temporarily disabled
// . this is the top 2 bytes of the ips of each inlink
// . we were looking this info up in linkdb
// . so if two good inlinkers had their inlinks from the same ip
// neighborhoods, then one would have its voting power "deduped".
// . see the old LinkText.cpp for the logic that read these from linkdb
//v5.set2 ( (char *)incomingIps , numIncomingIps );
// reference the vectors in our reply
reply-> ptr_vector1 = m_pageSampleVec;//(char *)&st->m_v1;
reply->size_vector1 = m_pageSampleVecSize;//st->m_v1.getSize();
reply-> ptr_vector2 = m_postVec;//(char *)&st->m_v2;
reply->size_vector2 = m_postVecSize;//st->m_v2.getSize();
reply-> ptr_vector3 = m_tagPairHashVec; // (char *)&st->m_v3;
reply->size_vector3 = m_tagPairHashVecSize;//st->m_v3.getSize();
// crap, we gotta bubble sort these i think
// but only tag pair hash vec
bool flag = true;
uint32_t *d = (uint32_t *)m_tagPairHashVec;
// exclude the terminating 0 int32_t
int32_t nd = (m_tagPairHashVecSize / 4) - 1;
while ( flag ) {
// breathe
QUICKPOLL ( m_niceness );
flag = false;
for ( int32_t i = 1 ; i < nd ; i++ ) {
if ( d[i-1] <= d[i] ) continue;
uint32_t tmp = d[i-1];
d[i-1] = d[i];
d[i] = tmp;
flag = true;
}
}
// just always do it
//if ( ! req->m_getInlinkNeighborhoods ) return true;
// convert "linkNode" into a string ptr into the document
char *node = xml->getNodePtr(linkNode)->m_node;
// . find the word index, "n" for this node
// . this is INEFFICIENT!!
char **wp = ww->getWords();
int32_t nw = ww->getNumWords();
int32_t n;
for ( n = 0; n < nw && wp[n] < node ; n++ )
QUICKPOLL(m_niceness);
// sanity check
//if ( n >= nw ) { char *xx=NULL; *xx=0; }
if ( n >= nw ) {
log("links: crazy! could not get word before linknode");
g_errno = EBADENGINEER;
return NULL;
}
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
// get the ptrs to the sections, 1-1 with words
//Section **sp = NULL;
//if ( ss ) sp = ss->m_sectionPtrs;
// . even tags in the article section have positive scores
// . the scores array is 1-1 with the words in Words, not the nodes
// in Xml. so we had to do that conversion.
//if ( ! sp || !(sp[n]->m_flags & NOINDEXFLAGS) )
// reply->m_outlinkInContent = true;
//
// get the surrounding link text, around "linkNode"
//
// radius of 80 characters around n
char sbuf[1201];
int32_t radius = 80;
char *p = sbuf;
char *pend = sbuf + 600;
// . make a neighborhood in the "words" space [a,b]
// . radius is in characters, so "convert" into words by dividing by 5
int32_t a = n - radius / 5;
int32_t b = n + radius / 5;
if ( a < 0 ) a = 0;
if ( b > nw ) b = nw;
int32_t *pp = pos->m_pos;
int32_t len;
	// if too big shrink the biggest, a or b?
while ( (len=pp[b]-pp[a]) >= 2 * radius + 1 ) {
// decrease the largest, a or b
if ( a<n && (pp[n]-pp[a])>(pp[b]-pp[n])) a++;
else if ( b>n ) b--;
}
// only store it if we can
if ( p + len + 1 < pend ) {
// store it
// FILTER the html entities!!
int32_t len2 = pos->filter(p,pend,ww,a,b,NULL);//ss);
// ensure NULL terminated
p[len2] = '\0';
// store in reply. it will be serialized when sent.
// thanks to isj for finding this bug fix.
m_surroundingTextBuf.safeMemcpy ( p , len2 + 1 );
reply->ptr_surroundingText =m_surroundingTextBuf.getBufStart();
reply->size_surroundingText=m_surroundingTextBuf.getLength();
}
// breathe
QUICKPOLL ( m_niceness );
// get title? its slow because it sets the sections class
if ( m_req->m_titleMaxLen > 0 && ! reply->ptr_tbuf &&
// don't get it anymore if getting link info because it
// is slow...
getThatTitle ) {
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
char *tit = ti->getTitle();
int32_t titLen = ti->getTitleLen();
reply-> ptr_tbuf = tit;
reply->size_tbuf = titLen + 1; // include \0
if ( ! tit || titLen <= 0 ) {
reply->ptr_tbuf = NULL;
reply->size_tbuf = 0;
}
}
int64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 100 )
log("build: took %"INT64" ms to get link text for "
"%s from linker %s",
took,
m_req->ptr_linkee,
m_firstUrl.m_url );
m_replyValid = true;
return reply;
}
//static void gotMsg5ListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
// XmlDoc *THIS = (XmlDoc *)state;
// THIS->m_masterLoop ( THIS->m_masterState );
//}
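// . get the primary image url from the parsed diffbot json reply, if any
// . scans the json items for the first "url" field under an "images" array,
//   or a "link" field under a "media" array in the case of product replies
// . returns a ptr to m_imageUrl2, which is NULL if no such field was found
// . propagates -1 (blocked) or NULL (error) from getParsedJson()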
char **XmlDoc::getDiffbotPrimaryImageUrl ( ) {
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char **)jp;
JsonItem *ji = jp->getFirstItem();
// assume none
m_imageUrl2 = NULL;
m_imageUrl2Valid = true;
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
//char *topName = NULL;
// what name level are we?
// int32_t numNames = 1;
// JsonItem *pi = ji->m_parent;
// for ( ; pi ; pi = pi->m_parent ) {
// // empty name?
// if ( ! pi->m_name ) continue;
// if ( ! pi->m_name[0] ) continue;
// topName = pi->m_name;
// numNames++;
// }
char *name0 = ji->m_name;
char *name1 = NULL;
char *name2 = NULL;
		if ( ji->m_parent )
			name1 = ji->m_parent->m_name;
		if ( ji->m_parent && ji->m_parent->m_parent )
			name2 = ji->m_parent->m_parent->m_name;
		// skip items that have no name at all
		if ( ! name0 ) continue;
		// stop at first image for "images":[{ indicator
		if ( strcmp(name0,"url") == 0 &&
name1 &&
strcmp(name1,"images") == 0 )
break;
// for products
if ( strcmp(name0,"link") == 0 &&
name1 &&
strcmp(name1,"media") == 0 )
break;
}
if ( ! ji )
return &m_imageUrl2;
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
// ok, we got it, just copy that
m_imageUrlBuf2.safeMemcpy ( val , vlen );
m_imageUrlBuf2.nullTerm();
m_imageUrl2 = m_imageUrlBuf2.getBufStart();
return &m_imageUrl2;
}
// get the image url SPECIFIED by the page, so there is no guesswork here
// unlike with the Images.cpp class
char **XmlDoc::getImageUrl() {
// return if valid
if ( m_imageUrlValid ) return &m_imageUrl;
// get first url
Url *f = getFirstUrl();
if ( ! f || f == (Url *)-1 ) return (char **)f;
// assume none
m_imageUrl = NULL;
m_imageUrlValid = true;
// we use getDiffbotPrimaryImageUrl() above for doing thumbs
if ( m_isDiffbotJSONObject || m_contentType == CT_JSON )
return &m_imageUrl;
// all done if not youtube or meta cafe
char *host = f->getHost();
char found = 0;
if ( ! strncmp ( host , "www.youtube.com/" , 16 ) ) found = 1;
if ( ! strncmp ( host , "youtube.com/" , 12 ) ) found = 1;
if ( ! strncmp ( host , "www.metacafe.com/" , 17 ) ) found = 2;
if ( ! strncmp ( host , "metacafe.com/" , 13 ) ) found = 2;
if ( ! found ) return &m_imageUrl;
// char ptr
char *u = f->getUrl();
// make it
if ( found == 1 ) {
char *s = strstr(u,"v=");
// if url does not contain a "v=" then forget it
if ( ! s ) return &m_imageUrl;
// point to the id
s += 2;
//m_imageUrl = m_imageUrlBuf;
//char *p = m_imageUrlBuf;
m_imageUrlBuf.safeStrcpy("http://img.youtube.com/vi/");
// do not break
//char *pend = m_imageUrlBuf + 80;
// copy the id/number
//for ( ; is_digit(*s) && p < pend ; ) *p++ = *s++;
for ( ; is_digit(*s) ; s++ )
m_imageUrlBuf.pushChar(*s);
// wrap it up
m_imageUrlBuf.safeStrcpy ( "/2.jpg" );
// size includes \0;
//m_imageUrlSize = p - m_imageUrl ;
// sanity check
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
m_imageUrl = m_imageUrlBuf.getBufStart();
return &m_imageUrl;
}
// must be meta cafe now
// http://www.metacafe.com/watch/559561/surfer_girls_vol_2/
// http://s2.mcstatic.com/thumb/559561.jpg
// scan url path for first digit
for ( char *t = f->getPath() ; *t ; t++ ) {
		// skip to the first digit in the path
		if ( ! is_digit ( *t ) ) continue;
		// grab that number
		int32_t id = atol ( t );
		// skip if not good
		if ( id <= 0 ) continue;
// make the url
//m_imageUrl = m_imageUrlBuf;
//char *p = m_imageUrlBuf;
//gbmemcpy ( p , "http://s2.mcstatic.com/thumb/" , 29 );
//p += 29;
//p += sprintf ( p , "%"INT32"" , id );
//gbmemcpy ( p , ".jpg\0" , 5 );
//p += 5;
m_imageUrlBuf.safePrintf("http://s2.mcstatic."
"com/thumb/%"INT32".jpg", id);
m_imageUrl = m_imageUrlBuf.getBufStart();
// size includes \0;
//m_imageUrlSize = p - m_imageUrl ;
// sanity check
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
break;
}
return &m_imageUrl;
}
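// . set m_matchOffsets from the xml, the words and the query matches
// . propagates -1 (blocked) or NULL (error) from those dependencies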
MatchOffsets *XmlDoc::getMatchOffsets () {
// return it if it is set
if ( m_matchOffsetsValid ) return &m_matchOffsets;
// need a buncha crap
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (MatchOffsets *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (MatchOffsets *)xml;
Matches *mm = getMatches();
if ( ! mm || mm == (Matches *)-1 ) return (MatchOffsets *)mm;
m_matchOffsets.set ( xml , ww , mm , true ); // getMatches=true
m_matchOffsetsValid = true;
return &m_matchOffsets;
}
Query *XmlDoc::getQuery() {
if ( m_queryValid ) return &m_query;
// bail if no query
if ( ! m_req || ! m_req->ptr_qbuf ) {
m_queryValid = true;
return &m_query;
}
// return NULL with g_errno set on error
if ( ! m_query.set2( m_req->ptr_qbuf ,
m_req->m_langId ,
true ) ) return NULL;
m_queryValid = true;
return &m_query;
}
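// . set m_matches, the occurrences of the query terms in this document
// . if the Msg20Request supplied no query we return an empty Matches class
// . needs the words, bits, sections, positions, title, phrases and query,
//   so this can block (-1) until all of those are computed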
Matches *XmlDoc::getMatches () {
// return it if it is set
if ( m_matchesValid ) return &m_matches;
// if no query, matches are empty
if ( ! m_req->ptr_qbuf ) {
m_matchesValid = true;
return &m_matches;
}
// cache it for one hour
//XmlDoc *od = getOldXmlDoc ( 3600 );
//if ( ! od || od == (XmlDoc *)-1 ) return (Matches *)od;
//if ( od->isEmpty() ) od = NULL;
// need a buncha crap
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Matches *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Matches *)xml;
Bits *bits = getBitsForSummary();
if ( ! bits || bits == (Bits *)-1 ) return (Matches *)bits;
Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (Matches *)ss;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Matches *)pos;
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Matches *)ti;
//Synonyms *syn = getSynonyms();
//if ( ! syn || syn == (void *)-1 ) return (Matches *)syn;
Phrases *phrases = getPhrases();
if ( ! phrases || phrases == (void *)-1 ) return (Matches *)phrases;
Query *q = getQuery();
if ( ! q ) return (Matches *)q;
// set it up
m_matches.setQuery ( q );
// returns false and sets g_errno on error
if ( ! m_matches.set ( this ,
ww ,
//syn ,
phrases ,
ss ,
bits ,
pos ,
xml ,
ti ,
m_niceness ) )
return NULL;
// we got it
m_matchesValid = true;
return &m_matches;
}
// sender wants meta description, custom tags, etc.
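// . "displayMetas" is a whitespace-separated list of meta tag names, each
//   optionally followed by ":<maxLen>" to cap that tag's content length
//   (e.g. something like "description:256 keywords")
// . the contents are stored back to back in m_dbuf, each one \0-terminated,
//   and *dsize is set to the total number of bytes used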
char *XmlDoc::getDescriptionBuf ( char *displayMetas , int32_t *dsize ) {
// return the buffer if we got it
if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// now get the content of the requested display meta tags
//char dbuf [ 1024*64 ];
char *dbufEnd = m_dbuf + 1024;//1024*64;
char *dptr = m_dbuf;
char *pp = displayMetas;
char *ppend = pp + gbstrlen(displayMetas);
// loop over the list of requested meta tag names
while ( pp < ppend && dptr < dbufEnd ) {
// skip initial spaces. meta tag names are ascii always i guess
while ( *pp && is_wspace_a(*pp) ) pp++;
// that's the start of the meta tag name
char *s = pp;
// . find end of that meta tag name
// . can end in :<integer> which specifies max len
while ( *pp && ! is_wspace_a(*pp) && *pp != ':' ) pp++;
// assume no max length to the content of this meta tag
int32_t maxLen = 0x7fffffff;
// save current char
char c = *pp;
// . NULL terminate the name
// . before, overflowed the request buffer and caused core!
// . seems like it is already NULL terminated
if ( *pp ) *pp = '\0';
// always advance regardless though
pp++;
// if ':' was specified, get the max length
if ( c == ':' ) {
if ( is_digit(*pp) ) maxLen = atoi ( pp );
// skip over the digits
while ( *pp && ! is_wspace_a (*pp) ) pp++;
}
// don't exceed our total buffer size (save room for \0 at end)
int32_t avail = dbufEnd - dptr - 1;
if ( maxLen > avail ) maxLen = avail;
// store the content at "dptr" (do not exceed "maxLen" bytes)
int32_t wlen = xml->getMetaContent ( dptr , // write buf
maxLen , // buf length
s , // name value
gbstrlen(s) , // name len
"name" , // http-equiv/name
false );// convert &#'s?
dptr[wlen] = '\0';
// test it out
if ( ! verifyUtf8 ( dptr ) ) {
log("xmldoc: invalid utf8 content for meta tag %s.",s);
continue;
}
// advance and NULL terminate
dptr += wlen;
*dptr++ = '\0';
// bitch if we truncated
if ( dptr >= dbufEnd )
log("query: More than %"INT32" bytes of meta tag "
"content "
"was encountered. Truncating.",
(int32_t)(dbufEnd-m_dbuf));
}
// what is the size of the content of displayed meta tags?
m_dbufSize = dptr - m_dbuf;
m_dbufValid = true;
*dsize = m_dbufSize;
return m_dbuf;
}
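// . concatenate the text of the first few <h1> sections (up to four) into
//   m_htb, each copy \0-terminated
// . m_htb is left empty if the document has no <h1> section with text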
SafeBuf *XmlDoc::getHeaderTagBuf() {
if ( m_htbValid ) return &m_htb;
Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
int32_t count = 0;
// scan sections
Section *si = ss->m_rootSection;
moreloop:
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
if ( si->m_tagId != TAG_H1 ) continue;
		// if it contains no text, this will be -1
// so give up on it
if ( si->m_firstWordPos < 0 ) continue;
if ( si->m_lastWordPos < 0 ) continue;
// ok, it works, get it
break;
}
// if no h1 tag then make buf empty
if ( ! si ) {
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
// otherwise, set it
char *a = m_words.m_words[si->m_firstWordPos];
char *b = m_words.m_words[si->m_lastWordPos] ;
b += m_words.m_wordLens[si->m_lastWordPos];
// copy it
m_htb.safeMemcpy ( a , b - a );
m_htb.pushChar('\0');
si = si->m_next;
// add more?
if ( count++ < 3 ) goto moreloop;
m_htbValid = true;
return &m_htb;
}
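// . set m_title using Title::setTitle()
// . the max title length comes from the collection rec (or the Msg20
//   request if present) and is capped at 256 chars for speed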
Title *XmlDoc::getTitle ( ) {
if ( m_titleValid ) return &m_title;
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Title *)ww;
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) return (Title *)sections;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Title *)pos;
Query *q = getQuery();
if ( ! q ) return (Title *)q;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
int32_t titleMaxLen = cr->m_titleMaxLen;
if ( m_req ) titleMaxLen = m_req->m_titleMaxLen;
// limit for speed, some guys have a 100k word title!
if ( titleMaxLen > 256 ) titleMaxLen = 256;
m_titleValid = true;
if ( ! m_title.setTitle ( this ,
xml ,
ww ,
sections ,
pos ,
titleMaxLen ,
0xffff ,
NULL ,
q ,
cr ,
m_niceness ) )
return NULL;
return &m_title;
}
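// . compute the summary for this document relative to the current query
// . xml and json docs currently get an empty summary
// . we may compute more summary lines than we display so the extra lines
//   can be used for the summary dedup vector (see m_summDedupNumLines)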
Summary *XmlDoc::getSummary () {
if ( m_summaryValid ) return &m_summary;
// xml and json docs have empty summaries for now
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Summary *)ct;
if ( *ct == CT_JSON || *ct == CT_XML ) {
m_summaryValid = true;
return &m_summary;
}
// need a buncha crap
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Summary *)xml;
Bits *bits = getBitsForSummary();
if ( ! bits || bits == (Bits *)-1 ) return (Summary *)bits;
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) return (Summary *)sections;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Summary *)pos;
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (Summary *)site;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Summary *)d;
Matches *mm = getMatches();
if ( ! mm || mm == (Matches *)-1 ) return (Summary *)mm;
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Summary *)ti;
Query *q = getQuery();
if ( ! q ) return (Summary *)q;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . get the highest number of summary lines that we need
// . the summary vector we generate for doing summary-based deduping
// typically has more lines in it than the summary we generate for
// displaying to the user
int32_t numLines = m_req->m_numSummaryLines;
if ( cr->m_percentSimilarSummary > 0 &&
cr->m_percentSimilarSummary < 100 &&
m_req->m_getSummaryVector &&
cr->m_summDedupNumLines > numLines )
// request more lines than we will display
numLines = cr->m_summDedupNumLines;
	// shortcut
Summary *s = &m_summary;
// time cpu set time
int64_t start = gettimeofdayInMilliseconds();
m_cpuSummaryStartTime = start;
// make sure summary does not include title
char *tbuf = ti->m_title;
// this does not include the terminating \0
int32_t tbufLen = ti->m_titleBytes;
// compute the summary
bool status;
status = s->set2( xml ,
ww ,
bits ,
sections ,
pos ,
q ,
(int64_t *)m_req->ptr_termFreqs ,
(float *)m_req->ptr_affWeights ,
false , // doStemming
m_req->m_summaryMaxLen ,
numLines ,
// . displayLines, # lines we are displaying
// . Summary::getDisplayLen() will return the
// length of the summary to display
m_req->m_numSummaryLines ,
m_req->m_summaryMaxNumCharsPerLine,
m_req->m_ratInSummary ,
getFirstUrl() ,
//&reply->m_queryProximityScore ,
mm ,
tbuf ,
tbufLen );
// error, g_errno should be set!
if ( ! status ) return NULL;
m_summaryValid = true;
return &m_summary;
}
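// . return the displayable part of the summary with the query terms
//   wrapped in <b>...</b> tags, stored in m_finalSummaryBuf
// . if highlighting is disabled in the request, or the summary is empty,
//   the summary is copied over unhighlighted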
char *XmlDoc::getHighlightedSummary ( ) {
if ( m_finalSummaryBufValid ) {
//char *fsum = m_finalSummaryBuf.getBufStart();
//if ( ! fsum ) fsum = (char *)0x01;
return m_finalSummaryBuf.getBufStart();
}
Summary *s = getSummary();
if ( ! s || s == (void *)-1 ) return (char *)s;
Query *q = getQuery();
if ( ! q ) return (char *)q;
// get the summary
char *sum = s->getSummary();
//int32_t sumLen = s->getSummaryLen();
int32_t sumLen = s->getSummaryDisplayLen();
//sum[sumLen] = 0;
// assume no highlighting?
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
m_finalSummaryBuf.nullTerm();
m_finalSummaryBufValid = true;
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();
//if ( ! fsum ) fsum = (char *)0x01;
//return fsum;
}
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
Highlight hi;
StackBuf(hb);
// highlight the query in it
int32_t hlen = hi.set ( &hb,
sum,
sumLen,
m_langId,
q,
false , // doStemming?
false , //click&scroll?
NULL , // base url
"<b>" , // front tag
"</b>" , // back tag
0,
m_niceness );
// highlight::set() returns 0 on error
if ( hlen < 0 ) {
log("build: highlight class error = %s",mstrerror(g_errno));
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
// store into our safebuf then
m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 );
m_finalSummaryBufValid = true;
m_finalSummaryBuf.nullTerm();
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();
//if ( ! fsum ) fsum = (char *)0x01;
//return fsum;
}
//
// GET GIGABIT SAMPLE
//
//
// This will get samples surrounding all the query terms for purposes
// of gigabits generation. We don't just generate gigabits from the
// WHOLE document because it takes much longer?? is that still true?
// We assume that the first call to getTopLines() above set
// matches/numMatches. We use those arrays to
// skip directly to just the query terms in the document and save time.
// We may have to reset the Scores array here if we want to use it ltr.
//
// aka getGigabitSample. get gigabit sample
//
SafeBuf *XmlDoc::getSampleForGigabits ( ) {
if ( m_gsbufValid ) return &m_gsbuf;
// assume empty
//m_gsbuf = NULL;
// basically, exit now if no sample needed
if ( m_req->m_bigSampleMaxLen <= 0 ||
m_req->m_bigSampleRadius <= 0 ) {
m_gsbufValid = true;
return &m_gsbuf;
}
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
// if it is json then only return the json fields that are strings
// and json decode them... separate each field with a \0.
if ( *ct == CT_JSON )
return getSampleForGigabitsJSON();
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
// just send back the whole page, but separate each section
// with \0. make only sentences end with ? ! or ., headers
// not with anything, and no menu items
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) return (SafeBuf *)sections;
Section *sp = sections->m_rootSection;
SafeBuf reply;
reply.setLabel("gbtrepbuf");
// m_contentLen is invalid, don't use that here use size_utf8Content
if ( ! reply.reserve ( size_utf8Content + 1000 ) ) return NULL;
// scan the sections of the document
for ( ; sp ; sp = sp->m_next ) {
QUICKPOLL(m_niceness);
// do not allow menu crap
if ( sp->m_flags & ( SEC_MENU |
SEC_MENU_SENTENCE |
SEC_MENU_HEADER ) )
continue;
// must be sentence or header
bool ok = false;
if ( sp->m_flags & SEC_SENTENCE ) ok = true;
// headings are ok, just don't use as sentences...
if ( sp->m_flags & SEC_HEADING ) ok = true;
if ( ! ok ) continue;
// store without tags
char *p = ww->m_words[sp->m_a];
// include period after final word in section
int32_t b = sp->m_b - 1;
char *e = ww->m_words[b] + ww->m_wordLens[b];
// if 3+ commas and one comma for every 4 words, forget it,
// it is probably a list! well, process it, but make sure it
// does not end in a period so we do not display it
// as a fast fact, but we use it for gigabits.
bool isList = false;
int32_t commaCount = 0;
int32_t bracketCount = 0;
for ( char *z = p ; z < e ; z++ ) {
if ( *z == ',' ) commaCount++;
// fix ] [AllTheWeb] [Gigablast] [Google] [HotBot]...
if ( *z == '[' ) bracketCount++;
}
int32_t naw = (b - sp->m_a) / 2;
// just skip even for gigabits if too long. most likely
// a spammy list of nouns.
if ( naw >= 130 ) continue;
if ( commaCount >= 3 && commaCount *4 >= naw )
isList = true;
if ( commaCount >= 10 )
isList = true;
if ( bracketCount >= 3 )
isList = true;
// too much uppercase?
bool yelling = false;
int32_t upper = 0;
int32_t lower = 0;
char cs = 0;
for ( char *z = p ; z < e ; z += cs ) {
cs = getUtf8CharSize(z);
if ( ! is_alpha_utf8(z) ) continue;
if ( is_upper_utf8(z) ) upper++;
if ( is_lower_utf8(z) ) lower++;
}
if ( upper > lower ) yelling = true;
// ending ) or ]
if ( e[0] == ')' ) e++;
else if ( e[0] == ']' ) e++;
// incorporate period etc.
if ( e[0] == '.' ) e++;
else if ( e[0] == '!' ) e++;
else if ( e[0] == '?' ) e++;
else if ( e[0] == ';' ) e++;
// must end in a period, or .) or .]
bool endsInPeriod = false;
if ( e-2 >= p &&
( e[-1] =='.' ||
e[-1] =='!' ||
e[-1] =='?' ) )
endsInPeriod = true;
		if ( e-2 >= p &&
		     (e[-1] == ')' ||
		      e[-1] == ']'   ) &&
		     (e[-2] == '.' ||
		      e[-2] == '?' ||
		      e[-2] == '!'   ) )
			endsInPeriod = true;
//int32_t off = reply.length();
// filter out tags and \n's and \r's and store into "reply"
if ( ! reply.safePrintFilterTagsAndLines ( p , e-p ,false ) )
return NULL;
// if a sentence and does not end in period, toss one in
//if ( sp->m_flags & SEC_SENTENCE ) {
// if ( e[-1] !='.' &&
// e[-1] !='!' &&
// e[-1] !='?' &&
// e[-1] !=']' &&
// e[-1] !=')' )
// reply.pushChar('.');
//}
// too huge? if # of ALNUM words > 70 it's too big.
bool isHuge = false;
if ( naw > 70 ) isHuge = true;
// ending in a * indicates a printable sentence for fast facts
if ( (sp->m_flags & SEC_SENTENCE) &&
! isList &&
! isHuge &&
! yelling &&
endsInPeriod )
reply.pushChar('*');
// delineate sentences/headers/sections with | now so
// we can still allow a word to be a gigabit even if it is
// not in a sentence with a query term
//reply.pushChar('\0');
reply.pushChar('|');
char *pc = reply.getBufStart() + reply.length() - 1;
*pc = '\0';
// debug
//char *x = reply.getBufStart() + off;
// turn off fast fact debug for now
//log("fastfact: fastfact: %s",x);
// revert back to |
*pc = '|';
// stop? this fixes the query 'lesbain vedeo porno' on
// my cluster taking 10 seconds to get gigabits for.
// bigsamplemaxlen is 1000 as of 12/4/2013.
if ( reply.length() >= m_req->m_bigSampleMaxLen )
break;
}
// a final \0
reply.pushChar('\0');
// move it over to m_gsbuf now
m_gsbuf.stealBuf ( &reply );
// we are valid
m_gsbufValid = true;
// success
return &m_gsbuf;
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (SafeBuf *)pos;
Matches *mm = getMatches();
if ( ! mm || mm == (Matches *)-1 ) return (SafeBuf *)mm;
// convert length to number of words
int32_t bigSampleRadius = m_req->m_bigSampleRadius / 5;
// at least 1
if ( bigSampleRadius <= 0 ) bigSampleRadius = 1;
// alloc for whole document?
int32_t max = xml->getContentLen() ;
// do not exceed
if ( max > m_req->m_bigSampleMaxLen ) max = m_req->m_bigSampleMaxLen;
// make sure we have something in words too. i guess no sample?
if ( max <= 2 ) { m_gsbufValid = true; return &m_gsbuf; }
// a flag so we don't overlap samples...
int32_t lastb = -1;
// . set m_buf to where we write the sample
// . add a byte for the terminating \0
int32_t gsbufAllocSize = max + 1;
// temp hack
//m_gsbuf = (char *)mmalloc(m_gsbufAllocSize,"gsbuf");
if ( ! m_gsbuf.reserve ( gsbufAllocSize, "gsbuf" ) ) return NULL;
// g_errno should be set...
//if ( ! m_gsbuf ) return NULL;
//m_freeBuf = true;
// set our pointer
char *pstart = m_gsbuf.getBufStart();
char *p = pstart;
char *pend = pstart + max;
int32_t nw = ww->m_numWords;
// skip to first query term
for ( int32_t i = 0 ; i < mm->m_numMatches ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the match
Match *m = &mm->m_matches[i];
// break out if match is not from the document's Words class
if ( m->m_words != ww ) break;
// the word #
int32_t n = m->m_wordNum;
// got a match, add this samplet, [a,b]
int32_t a = n - bigSampleRadius;
int32_t b = n + bigSampleRadius;
if ( a < 0 ) a = 0;
if ( b > nw ) b = nw;
if ( a < lastb ) a = lastb;
// ensure the samples are separated by \0
else if ( p > pstart && p + 2 < pend ) {
*p++ = '\0';
}
Pos *pos = m->m_pos;
int32_t *pp = pos->m_pos;
int32_t len = pp[b+1] - pp[a];
// if match would send us over, we are done
if ( p + len >= pend ) break;
len = pos->filter(p,pend,m->m_words,a,b,m->m_sections);
// for debug (mdw)
//log("query: gigabitsample#%"INT32"=%s",i,p);
p += len;
// we are the new lastb
lastb = b;
}
// always null terminate
*p++ = '\0';
// . set sample size
// . this includes terminating 0\'s in this case
//int32_t gsbufSize = p - m_gsbuf;
m_gsbuf.setLength( p - m_gsbuf.getBufStart() );
// we are valid
m_gsbufValid = true;
// for debug (mdw)
//log("query: finalgigabitsample=%s",m_gsbuf);
// success
return &m_gsbuf;
}
// if it is json then only return the json fields that are strings
// and json decode them... separate each field with a \0.
SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
SafeBuf tmp;
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (SafeBuf *)jp;
JsonItem *ji = jp->getFirstItem();
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not string
if ( ji->m_type != JT_STRING )
continue;
// store field value
char *val = ji->getValue();
int valLen = ji->getValueLen();
// if it contains html then skip it as a gigabit candidate.
// otherwise our fast facts end up including html tags in them
// in computeFastFacts() in Msg40.cpp
int i;
for ( i = 0 ; i < valLen ; i++ )
if ( val[i] == '<' ) break;
if ( i < valLen ) continue;
if ( ! tmp.pushChar('\n') )
return NULL;
// if ( ! tmp.safePrintf("<p>"))
// return NULL;
// decode the json
//SafeBuf xx;
if ( ! tmp.safeDecodeJSONToUtf8(val,valLen,m_niceness))
return NULL;
// escape out the html
// if ( ! tmp.htmlEncode ( xx.getBufStart() ))
// return NULL;
// two new lines
if ( ! tmp.safePrintf("<hr>"))
return NULL;
if ( ! tmp.pushChar('\n') )
return NULL;
if ( ! tmp.pushChar('\n') )
return NULL;
if ( ! tmp.pushChar('\n') )
return NULL;
}
if ( ! tmp.nullTerm() )
return NULL;
Xml xml;
if ( ! xml.set ( tmp.getBufStart() ,
tmp.length() ,
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
CT_HTML ) ) // *ct ) )
return NULL;
Words ww;
if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL;
Bits bb;
if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL;
Phrases pp;
if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL;
// this uses the sectionsReply to see which sections are
// "text", etc. rather than compute it expensively
Sections sec;
if ( !sec.set ( &ww ,
&pp ,
&bb ,
getFirstUrl() ,
0,//*d ,
0,//*sh64 , // 64 bits
"",//cr->m_coll ,
m_niceness ,
NULL,//m_masterState , // state
NULL,//m_masterLoop , // callback
CT_JSON, // *ct ,
NULL,//&m_dates ,
NULL , // sd // sections data
true , // sections data valid?
NULL , // sv // for m_nsvt
NULL , // buf
0 )) { // bufSize
return NULL;
}
// now add each sentence section into the buffer
// scan the sentences if we got those
char **wptrs = ww.getWords();
int32_t *wlens = ww.getWordLens();
Section *ss = sec.m_firstSent;
for ( ; ss ; ss = ss->m_nextSent ) {
// breathe
QUICKPOLL(m_niceness);
// count of the alnum words in sentence
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
// start with one word!
count--;
// how can it be less than one alnum word
if ( count < 0 ) continue;
// store it
char *wp1 = wptrs[ss->m_senta];
char *wp2 = wptrs[ss->m_sentb-1] + wlens[ss->m_sentb-1];
bool gotTerm = (wp2[0]=='.' || wp2[0]=='?' || wp2[0]=='!' ) ;
//if ( ! gotTerm ) continue;
if ( ! m_gsbuf.safeMemcpy ( wp1 , wp2 - wp1 ) )
return NULL;
// puncty?
if ( gotTerm && ! m_gsbuf.pushChar(wp2[0]))
return NULL;
// to indicate end of header or sentence, in order to
// qualify as a fast fact, we must add a '*'. see
// PageResults.cpp, search for ''*''
if ( gotTerm && ! m_gsbuf.pushChar('*') )
return NULL;
if ( ! m_gsbuf.pushChar('\0') )
return NULL;
}
m_gsbufValid = true;
return &m_gsbuf;
}
// . good sites sometimes have hacked pages
// . try to identify those
char *XmlDoc::getIsCompromised ( ) {
if ( m_isCompromisedValid ) return &m_isCompromised;
Xml *xml = getXml();
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// assume compromised
m_isCompromised = true;
m_isCompromisedValid = true;
	// scan for <font> tags with a suspicious style attribute
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a font tag
		if ( nodes[i].m_nodeId != TAG_FONT ) continue;
		// get its style attribute, if any
		int32_t stlen;
		char *style = nodes[i].getFieldValue ( "style" , &stlen );
// skip if none
if ( ! style || stlen <= 6 ) continue;
// NULL term
char c = style[stlen];
style[stlen] = '\0';
char *hc = strstr(style,"height");
char *wc = strstr(style,"width");
		// skip if neither
		if ( ! hc && ! wc ) continue;
		// advance past the attribute names, guarding against NULL
		if ( hc ) hc += 6;
		if ( wc ) wc += 5;
		while ( hc && is_wspace_a(*hc) ) hc++;
		while ( wc && is_wspace_a(*wc) ) wc++;
		if ( hc && *hc == ':' ) hc++;
		if ( wc && *wc == ':' ) wc++;
		while ( hc && is_wspace_a(*hc) ) hc++;
		while ( wc && is_wspace_a(*wc) ) wc++;
		style[stlen] = c;
		// a zero height or width is a signal of invisible text and of
		// our syzygy compromised-site to compromised-site spammer
		if ( hc && *hc == '0' ) return &m_isCompromised;
		if ( wc && *wc == '0' ) return &m_isCompromised;
}
m_isCompromised = false;
return &m_isCompromised;
}
// <meta name=robots value=noarchive>
// <meta name=gigabot value=noarchive>
char *XmlDoc::getIsNoArchive ( ) {
if ( m_isNoArchiveValid ) return &m_isNoArchive;
Xml *xml = getXml();
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
m_isNoArchive = false;
m_isNoArchiveValid = true;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// find the meta tags
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// continue if not a meta tag
if ( nodes[i].m_nodeId != TAG_META ) continue;
// get robots attribute
int32_t alen; char *att;
// <meta name=robots value=noarchive>
att = nodes[i].getFieldValue ( "name" , &alen );
// need a name!
if ( ! att ) continue;
// get end
char *end = att + alen;
// skip leading spaces
while ( att < end && *att && is_wspace_a(*att) ) att++;
// must be robots or gigabot. skip if not
if ( strncasecmp(att,"robots" ,6) &&
strncasecmp(att,"gigabot",7) ) continue;
// get the content vaue
att = nodes[i].getFieldValue("content",&alen);
// skip if none
if ( ! att ) continue;
// get end
end = att + alen;
// skip leading spaces
while ( att < end && *att && is_wspace_a(*att) ) att++;
		// is it noarchive? skip if no such match
if ( strncasecmp(att,"noarchive",9) ) continue;
// ok, we got it
m_isNoArchive = true;
break;
}
// return what we got
return &m_isNoArchive;
}
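// . scan the document's <script> tags for ad network client ids (google
//   adsense, yahoo publisher, doubleclick) and hash each one into m_adIds
//   as a "gbad:<provider>-<clientid>" term
// . Msg25/LinkInfo can use these ids to dedup voters running the same ads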
// this vector's components are 64-bit, not the usual 32-bit
int64_t **XmlDoc::getAdVector ( ) {
if ( m_adVectorValid ) return &ptr_adVector;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (int64_t **)xml;
setStatus ( "parsing out ad ids");
// assume valid
m_adVectorValid = true;
int32_t na = 0;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// find the meta tags
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// continue if not a script tag
if ( nodes[i].m_nodeId != TAG_SCRIPT ) continue; // 83
// must be a front tag, not a back tag
if ( xml->isBackTag ( i ) ) continue;
// find the back tag for it
		int32_t j;
		for ( j = i ; j < n ; j++ ) {
			// must be another script tag
			if ( nodes[j].m_nodeId != TAG_SCRIPT ) continue;
			// must be a back tag this time
			if ( ! xml->isBackTag ( j ) ) continue;
			// ok, we got it
			break;
		}
// if no back tag, give up
if ( j == n ) break;
// buf/len defines the script area
char *buf = xml->getNode(i);
int32_t len = xml->getNode(j) - buf;
// skip this script tag for next loop
i = j;
bool found = false;
// start off looking for google
char *needles[3] =
{ "google_ad_client" ,
"ctxt_ad_partner",
"http://ad" };
char *providers[3] =
{ "google" ,
"yahoo",
"doubleclick" };
for ( int32_t k = 0 ; k < 3 ; k++ ) {
// try to match this needle
char *match = needles[k];
// try to get a match
char *p = strnstr ( buf, match , len );
// go again
if ( ! p ) continue;
// do not exceed the script area
char *pend = buf + len;
// it is in quotes
// pub-uint64_t for google ad, uint32_t for yahoo
// check for double or single quote
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
// it must have them!... i guess
if ( p >= pend ) continue;
// point to after the quote
char *pbegin = ++p;
// find the ending quote
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
// if none, bail
if ( p >= pend ) continue;
// get length of the ad client id between the quotes
int32_t adClientLen = p - pbegin;
if ( k == 2 ) {
p = strnstr(p,".doubleclick.net/",pend-p);
if ( ! p ) continue;
p += 17;
// look for doubleclick ads
// user name is the second element of the path
while(p < pend && *p != '/') p++;
pbegin = ++p;
while(p < pend && *p != '/') p++;
if(p >= pend) continue;
adClientLen = p - pbegin;
found = true;
}
char *f = pbegin;
char *fend = pbegin + adClientLen;
for ( ; f < fend ; f++ ) {
if ( is_alnum_a ( *f ) ) continue;
if ( *f == '-' || *f == '_' || *f == '.' )
continue;
break;
}
if ( f < fend ) continue;
if ( adClientLen >= 400 ) continue;
if ( adClientLen < 4 ) continue;
// null term temp
char c = *fend;
*fend = '\0';
// hash it
char buf[512];
sprintf(buf,"gbad:%s-%s",providers[k],pbegin);
// put it back
*fend = c;
// . make the query term id
// . first hash the field
uint64_t h = hash64 ( "gbad" , 4 );
// then add in the other junk
h = hash64 ( buf , gbstrlen(buf) , h );
// . now we will index that as-is
// . and Msg25/LinkInfo can use to dedup voters!
			m_adIds[na++] = h;
			// stop if too many. save room for NULL termination.
			if ( na + 1 >= XD_MAX_AD_IDS ) break;
		}
		// also stop scanning script tags once the ad id buffer is full
		if ( na + 1 >= XD_MAX_AD_IDS ) break;
		//look for another if not found or not ok.
	}
// null term it like a good vector! no, those are 32-bit components,
// we are a 64-bit component vector
//m_adIds[na++] = 0;
// point to where we should put them
ptr_adVector = m_adIds;
// store this i guess
size_adVector = na * 8;
// *lastNode = nn;
return &ptr_adVector;
}
char *XmlDoc::getIsLinkSpam ( ) {
if ( m_isLinkSpamValid ) return &m_isLinkSpam2;
setStatus ( "checking if linkspam" );
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (char *)links;
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
int32_t **pici = getIndCatIds();
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
//LinkInfo *info1 = getLinkInfo1();
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// reset note
m_note = NULL;
// . if a doc is "link spam" then it cannot vote, or its
// voting power is reduced
// . look for indications that the link is from a guestbook
// . doc length over 100,000 bytes consider it link spam
m_isLinkSpamValid = true;
m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
*ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
*sni ,
xml,
links,
150000,//MAXDOCLEN,//maxDocLen ,
&m_note ,
NULL , // &linkee , // url ,
-1 , // linkNode ,
cr->m_coll ,
m_niceness );
// set shadow
m_isLinkSpam2 = (bool)m_isLinkSpam;
return &m_isLinkSpam2;
}
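// zlib allocation callbacks. we route zlib's internal allocations through
// our own memory layer (g_mem/mmalloc) so they are tracked like any other
// gigablast allocation.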
void *zliballoc ( void *opaque , unsigned int items , unsigned int size ) {
//log("db: got zlib alloc");
return (void *)mmalloc ( items * size , "zlib" );
}
void zlibfree ( void *opaque , void *address ) {
//log("db: got zlib free");
// -1 will tell Mem.cpp to look it up in the table
mfree ( address , -1 , "zlib" );
}
void *malloc_replace (void *pf , unsigned int nitems , unsigned int size ) {
return g_mem.gbmalloc(size*nitems,"malloc_replace");
}
void free_replace ( void *pf , void *s ) {
// -1 means we don't know the size
g_mem.gbfree(s,-1,"free_replace");
}
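// . inflate "source" into "dest". the 47 window bits passed to
//   inflateInit2() below enable automatic gzip/zlib header detection.
// . on input *destLen is the size of "dest"; on success it is set to the
//   number of uncompressed bytes written
// . returns a zlib error code, Z_OK on success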
int gbuncompress ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ) {
z_stream stream;
int err;
stream.next_in = (Bytef*)source;
stream.avail_in = (uInt)sourceLen;
// Check for source > 64K on 16-bit machine:
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
stream.next_out = dest;
stream.avail_out = (uInt)*destLen;
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
//stream.zalloc = (alloc_func)0;
//stream.zfree = (free_func)0;
stream.zalloc = malloc_replace;//zliballoc;
stream.zfree = free_replace;//zlibfree;
// this calls memcpy so make sure Profiler.cpp doesn't crash
// since when it calls backtrace() that calls memcpy() too
// and it's not async safe
g_inMemcpy = 2;
//we can be gzip or deflate
err = inflateInit2(&stream, 47);
g_inMemcpy = 0;
if (err != Z_OK) return err;
err = inflate(&stream, Z_FINISH);
if (err != Z_STREAM_END) {
inflateEnd(&stream);
if (err == Z_NEED_DICT ||
(err == Z_BUF_ERROR && stream.avail_in == 0))
return Z_DATA_ERROR;
return err;
}
*destLen = stream.total_out;
err = inflateEnd(&stream);
return err;
}
void deflateQuickPoll ( ) {
QUICKPOLL(1);
}
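// . deflate "source" into "dest"
// . if "encoding" is ET_DEFLATE we use deflateInit() (zlib format),
//   otherwise deflateInit2() with windowBits=31, which emits gzip framing
// . on input *destLen is the size of "dest"; on success it is set to the
//   compressed length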
int gbcompress ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ,
int32_t encoding ) {
int level = Z_DEFAULT_COMPRESSION;
z_stream stream;
int err;
int method = Z_DEFLATED;
//lots of mem, faster, more compressed, see zlib.h
int windowBits = 31;
int memLevel = 8;
int strategy = Z_DEFAULT_STRATEGY;
stream.next_in = (Bytef*)source;
stream.avail_in = (uInt)sourceLen;
#ifdef MAXSEG_64K
// Check for source > 64K on 16-bit machine:
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
#endif
stream.next_out = dest;
stream.avail_out = (uInt)*destLen;
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
//stream.zalloc = (alloc_func)0;
//stream.zfree = (free_func)0;
stream.zalloc = malloc_replace;//zliballoc;
stream.zfree = free_replace;//zlibfree;
stream.opaque = (voidpf)0;
//we can be gzip or deflate
if(encoding == ET_DEFLATE) err = deflateInit (&stream, level);
else err = deflateInit2(&stream, level,
method, windowBits,
memLevel, strategy);
if (err != Z_OK) {
// zlib's incompatible version error?
if ( err == -6 ) {
log("zlib: zlib did you forget to add #pragma pack(4) to "
"zlib.h when compiling libz.a so it aligns on 4-byte "
"boundaries because we have that pragma in "
"gb-include.h so its used when including zlib.h");
}
return err;
}
// cygwin uses the system libz.a which is not hacked for our quickpoll
#ifndef CYGWIN
	// tell deflate() to call quickpoll
// MDW: 11/14/2014 don't do this for the 64bit zlib for now just to
// save some time. do it later when it proves to be an issue.
//setQuickPoll ( (char *)&g_loop.m_needsToQuickPoll, deflateQuickPoll);
#endif
// this calls memcpy so make sure Profiler.cpp doesn't crash
// since when it calls backtrace() that calls memcpy() too
// and it's not async safe
g_inMemcpy = 3;
err = deflate(&stream, Z_FINISH);
g_inMemcpy = 0;
if (err != Z_STREAM_END) {
deflateEnd(&stream);
return err == Z_OK ? Z_BUF_ERROR : err;
}
*destLen = stream.total_out;
err = deflateEnd(&stream);
return err;
}
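// a minimal round-trip sketch of the two calls above (buffer sizes here
// are hypothetical, not taken from real callers):
//
//   unsigned char raw[1000], packed[2000], restored[1000];
//   uint32_t rawLen = ... ;              // bytes of data placed in raw[]
//   uint32_t packedLen = sizeof(packed);
//   if ( gbcompress ( packed, &packedLen, raw, rawLen, ET_DEFLATE ) == Z_OK ) {
//       uint32_t outLen = sizeof(restored);
//       gbuncompress ( restored, &outLen, packed, packedLen );
//       // outLen should now equal rawLen and restored should match raw
//   }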
//
// NO NO: don't use this until the fixed-size in[64]/out[64] buffers below
// are replaced with SafeBufs
//
int gbcompress7 ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ,
bool compress ) {
//int32_t id = 1;
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
char in[64];
if ( compress ) sprintf ( in , "%s/in.7z", g_hostdb.m_dir );
else sprintf ( in , "%s/out.7z", g_hostdb.m_dir );
unlink ( in );
// collect the output from the filter from this file
char out[64];
if ( compress ) sprintf ( out , "%s/out.7z", g_hostdb.m_dir );
else sprintf ( out , "%s/in.7z", g_hostdb.m_dir );
if ( ! compress )
unlink ( out );
// ignore errno from those unlinks
errno = 0;
// open the input file
retry11:
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry11;
log("build: Could not open file %s for writing: %s.",
in,mstrerror(errno));
return -1;
}
retry12:
// write the content into the input file
int32_t w = write ( fd , source , sourceLen );
// valgrind
if ( w < 0 && errno == EINTR ) goto retry12;
// did we get an error
if ( w != (int32_t)sourceLen ) {
log("build: Error writing to %s: %s.",in,mstrerror(errno));
close(fd);
return -1;
}
// close the file
close ( fd );
	// . build the 7za shell command
	// . when compressing: "7za a out.7z in.7z"
	// . when decompressing: "7za -o<dir> -y e <archive>" (-y answers yes
	//   to all prompts so we overwrite the existing output file)
	// . stdout is redirected to /dev/null
	//char cmd[2048];
	SafeBuf cmd;
if ( compress )
// 7za a out.7z in.7z
cmd.safePrintf( "%s7za a %s %s > /dev/null",
g_hostdb.m_dir , out,in);
else
// -y = yes on all. so we overwrite "in.7z"
cmd.safePrintf( "%s7za -o%s -y e %s > /dev/null",
g_hostdb.m_dir,g_hostdb.m_dir , in);//,in);
// breach sanity check
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
	// execute it
int retVal = gbsystem ( cmd.getBufStart() );
if ( retVal == -1 )
log("gb: system(%s) : %s",cmd.getBufStart(),
mstrerror(g_errno));
// all done with input file
// clean up the binary input file from disk
//if ( unlink ( in ) != 0 ) {
// // log error
// log("gbfilter: unlink (%s): %s\n",in,strerror(errno));
// // ignore it, since it was not a processing error per se
// errno = 0;
//}
retry13:
fd = open ( out , O_RDONLY );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry13;
log("7zip: Could not open file %s for reading: %s.",
out,mstrerror(errno));
return -1;
}
	// to read - leave room for \0
	//int32_t toRead = MAXDOCLEN + 1000;
	int32_t toRead = 150000 + 1000;
	// never read more bytes than the destination buffer can hold
	if ( toRead > (int32_t)*destLen ) toRead = (int32_t)*destLen;
retry14:
// read right from pipe descriptor
int32_t r = read (fd, dest,toRead);
// note errors
if ( r < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry14;
log("7zip: reading output: %s",mstrerror(errno));
// this is often bad fd from an oom error, so ignore it
errno = 0;
r = 0;
}
// clean up shop
close ( fd );
// delete output file
//unlink ( out );
if ( r > (int32_t)*destLen ) { char *xx=NULL;*xx=0; }
// assign
*destLen = r;
// debug for now
char *pre = "";
if ( ! compress ) pre = "un";
log("7zip: %scompressed %"UINT32" to %"UINT32" bytes"
, pre,sourceLen , *destLen );
return Z_OK;
}
int gbuncompress7 ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ) {
return gbcompress7(dest,destLen,source,sourceLen,false);
}
/*
bool XmlDoc::hashSingleTerm ( int64_t termId , HashInfo *hi ) {
// combine with a non-NULL prefix
if ( hi->m_prefix ) {
int64_t prefixHash = hash64b ( hi->m_prefix );
// sanity test, make sure it is in supported list
if ( getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
char *xx=NULL;*xx=0; }
termId = hash64 ( termId , prefixHash );
}
// save it?
if ( m_wts && ! ::storeTerm ( "binary",6,termId,hi,0,0,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
hi->m_hashGroup,
false,&m_wbuf,m_wts,false) )
return false;
	// shortcut
HashTableX *dt = hi->m_tt;
// sanity check
if ( dt->m_ks != sizeof(key_t) ) { char *xx=NULL;*xx=0; }
// make the key like we do in hashWords()
key96_t k;
k.n1 = hi->m_date;
k.n0 = termId;
// get current score for this wordid
int32_t slot = dt->getSlot ( &k );
// does this termid/date already exist?
if ( slot >= 0 ) {
// done
return true;
}
// otherwise, add a new slot
char val = 1;
if ( ! hi->m_tt->addKey ( (char *)k , &val ) )
return false;
// return true on success
return true;
}
*/
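// . record one hashed term in the "wts" debug table so PageParser.cpp can
//   show exactly which terms were indexed, with their prefix, description,
//   ranks, language bits and posdb key
// . the prefix/description/term strings are appended to "wbuf" and the
//   TermDebugInfo keeps offsets into that buffer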
bool storeTerm ( char *s ,
int32_t slen ,
int64_t termId ,
HashInfo *hi ,
int32_t wordNum ,
int32_t wordPos ,
char densityRank,
char diversityRank ,
char wordSpamRank ,
char hashGroup,
//bool isPhrase ,
SafeBuf *wbuf ,
HashTableX *wts ,
char synSrc ,
char langId ,
POSDBKEY key ) {
// store prefix
int32_t poff = wbuf->length();
	// shortcut
char *p = hi->m_prefix;
// add the prefix too!
if ( p && ! wbuf->safeMemcpy(p,gbstrlen(p)+1)) return false;
// none?
if ( ! p ) poff = -1;
// store description
int32_t doff = wbuf->length();
	// shortcut
char *d = hi->m_desc;
// add the desc too!
if ( d && ! wbuf->safeMemcpy(d,gbstrlen(d)+1) ) return false;
// none?
if ( ! d ) doff = -1;
// store term
int32_t toff = wbuf->length();
// add it
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
// make this
TermDebugInfo ti;
ti.m_termOff = toff;
ti.m_termLen = slen;
ti.m_descOff = doff;
ti.m_prefixOff = poff;
ti.m_date = hi->m_date;
ti.m_shardByTermId = hi->m_shardByTermId;
ti.m_termId = termId;
//ti.m_weight = 1.0;
//ti.m_spam = -1.0;
ti.m_diversityRank = diversityRank;
ti.m_densityRank = densityRank;
ti.m_wordSpamRank = wordSpamRank;
ti.m_hashGroup = hashGroup;
ti.m_wordNum = wordNum;
ti.m_wordPos = wordPos;
ti.m_langId = langId;
ti.m_key = key;
// was sitehash32
//ti.m_facetVal32 = hi->m_facetVal32;//sentHash32 = hi->m_sentHash32;
// save for printing out an asterisk
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
// get language bit vec
ti.m_langBitVec64 = g_speller.getLangBits64(&termId);
//if ( isPhrase ) ti.m_synSrc = SOURCE_NGRAM;
/*
// the weight vec for the words and phrases
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) ti.m_rv[j] = 1.0;
int32_t *wscores = NULL;
if ( weights && ! isPhrase ) wscores = weights->m_ww;
if ( weights && isPhrase ) wscores = weights->m_pw;
	// shortcut
int32_t i = wordNum;
if ( weights && ! weights->m_rvw ) { char *xx=NULL;*xx=0; }
if ( weights && ! weights->m_rvp ) { char *xx=NULL;*xx=0; }
float *rv = NULL;
if ( weights && ! isPhrase ) rv = &weights->m_rvw[i*MAX_RULES];
if ( weights && isPhrase ) rv = &weights->m_rvp[i*MAX_RULES];
if ( weights ) ti.m_weight = (float)wscores[i] / (float)DW;
if ( weights )
gbmemcpy ( &ti.m_rv, rv , MAX_RULES*sizeof(float));
// no, because if this is zero we force it up to 1!
//if ( weights )
// ti.m_score32 = (int32_t)((float)ti.m_score32 * ti.m_weight);
ti.m_score32 = score;
if ( isSynonym )
ti.m_score32 = score;
*/
// make the key
key96_t k;
k.n1 = 0; // date
k.n0 = termId;
// store it
return wts->addKey ( &k , &ti ) ;
}
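// . hash an entire string as a single term (no word splitting) into the
//   posdb term table hi->m_tt
// . if hi->m_prefix is set, the term id is combined with the prefix hash
//   so the term is scoped to that prefix/field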
bool XmlDoc::hashSingleTerm ( char *s ,
int32_t slen ,
HashInfo *hi ) {
// empty?
if ( slen <= 0 ) return true;
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
//
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
//
//if ( ! m_storeTermListInfo )
// return true;
// a single blob hash
int64_t termId = hash64 ( s , slen );
// combine with prefix
int64_t final = termId;
// combine with a non-NULL prefix
int64_t prefixHash = 0LL;
if ( hi->m_prefix ) {
prefixHash = hash64b ( hi->m_prefix );
final = hash64 ( termId , prefixHash );
}
// call the other guy now
//return hashSingleTerm ( final , hi );
	// shortcut
HashTableX *dt = hi->m_tt;
// sanity check
if ( dt->m_ks != sizeof(key144_t) ) { char *xx=NULL;*xx=0; }
// make the key like we do in hashWords()
key144_t k;
g_posdb.makeKey ( &k ,
final,
0LL, // docid
0, // dist
MAXDENSITYRANK, // density rank
MAXDIVERSITYRANK, // diversity rank
MAXWORDSPAMRANK, // wordspamrank
0, // siterank
hi->m_hashGroup,
// we set to docLang in final hash loop
langUnknown,// langid
0, // multiplier
0, // syn?
false , // delkey?
hi->m_shardByTermId );
//
// HACK: mangle the key if its a gbsitehash:xxxx term
// used for doing "facets" like stuff on section xpaths.
//
// no longer do this because we just hash the term
// gbxpathsitehash1234567 where 1234567 is that hash.
// but
//
//static int64_t s_gbsectionhash = 0LL;
//if ( ! s_gbsectionhash ) s_gbsectionhash = hash64b("gbsectionhash");
//if ( prefixHash == s_gbsectionhash )
// g_posdb.setSectionSentHash32 ( &k, hi->m_sentHash32 );
// . otherwise, add a new slot
// . key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
if ( ! dt->addTerm144 ( &k ) ) return false;
// add to wts for PageParser.cpp display
if ( m_wts && ! storeTerm ( s,slen,final,hi,
0, // wordnum
0, // wordPos,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
hi->m_hashGroup,
//false,
&m_wbuf,
m_wts,
SOURCE_NONE, // synsrc
langUnknown,
k) )
return false;
return true;
}
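// Illustrative sketch (comments only, not compiled): for a hypothetical
// prefix "type" and the value "text/html", hashSingleTerm() above composes
// its termid like this:
//
//   int64_t termId = hash64 ( "text/html" , 9 );
//   int64_t final  = hash64 ( termId , hash64b ( "type" ) );
//
// and then calls g_posdb.makeKey() with "final" and the maximum density,
// diversity and wordspam ranks, since a single blob term has no real word
// position of its own.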
bool XmlDoc::hashString ( char *s, HashInfo *hi ) {
return hashString ( s , gbstrlen(s), hi ); }
bool XmlDoc::hashString ( char *s ,
int32_t slen ,
HashInfo *hi ) {
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
int32_t *sni = getSiteNumInlinks();
return hashString3( s ,
slen ,
hi ,
&m_countTable ,
m_pbuf ,
m_wts ,
&m_wbuf ,
m_version ,
*sni ,
m_niceness );
}
bool XmlDoc::hashString3( char *s ,
int32_t slen ,
HashInfo *hi ,
HashTableX *countTable ,
SafeBuf *pbuf ,
HashTableX *wts ,
SafeBuf *wbuf ,
int32_t version ,
int32_t siteNumInlinks ,
int32_t niceness ) {
Words words;
Bits bits;
Phrases phrases;
//Weights weights;
//Synonyms synonyms;
if ( ! words.set ( s , slen , version , true , niceness ) )
return false;
if ( ! bits.set ( &words , version , niceness ) )
return false;
if ( ! phrases.set(&words,&bits,true,false,version,niceness ) )
return false;
// use primary langid of doc
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
// words
//SafeBuf myLangVec;
//if ( ! setLangVec ( &words , &myLangVec , m_niceness ) )
// return false;
//char *langVec = (char *)myLangVec.getBufStart();
/*
// debugBuf for synonyms? yes if we are debugging
SafeBuf synDebugBuf;
SafeBuf *sdbp = NULL;
if ( pbuf || m_storeTermListInfo ) sdbp = &synDebugBuf;
// now we can set it...
if ( hi->m_useSynonyms && !synonyms.set(&words,
NULL, // langVec,
m_langId,
&phrases,
niceness,
sdbp))
return false;
*/
// set weights because of count table
//if ( countTable && ! weights.set ( &words ,
/*
if ( hi->m_useWeights &&
! weights.set ( &words ,
&phrases ,
&bits ,
NULL ,
pbuf ,
false ,
false ,
version ,
100 , // titleWeight
100 , // headerWeight
countTable ,
false , // isLinkText
false , // isCntTable?
siteNumInlinks ,
niceness ) )
return false;
Weights *wp = &weights;
if ( ! hi->m_useWeights ) wp = NULL;
*/
//Synonyms *sp = NULL;
//if ( hi->m_useSynonyms ) sp = &synonyms;
return hashWords3 ( //0 ,
//words.getNumWords() ,
hi ,
&words ,
&phrases ,
NULL,//sp , synonyms
NULL , // sections
countTable ,
NULL , // fragvec
NULL , // wordspamvec
NULL , // langvec
langUnknown , // default langid doclangid
pbuf ,
wts ,
wbuf ,
niceness );
}
bool XmlDoc::hashWords ( //int32_t wordStart ,
//int32_t wordEnd ,
HashInfo *hi ) {
// sanity checks
if ( ! m_wordsValid ) { char *xx=NULL; *xx=0; }
if ( ! m_phrasesValid ) { char *xx=NULL; *xx=0; }
if ( hi->m_useCountTable &&!m_countTableValid){char *xx=NULL; *xx=0; }
if ( ! m_bitsValid ) { char *xx=NULL; *xx=0; }
if ( ! m_sectionsValid) { char *xx=NULL; *xx=0; }
//if ( ! m_synonymsValid) { char *xx=NULL; *xx=0; }
if ( ! m_fragBufValid ) { char *xx=NULL; *xx=0; }
if ( ! m_wordSpamBufValid ) { char *xx=NULL; *xx=0; }
if ( m_wts && ! m_langVectorValid ) { char *xx=NULL; *xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL; *xx=0; }
// . is the word repeated in a pattern?
// . this should only be used for the document body; for meta tags,
// inlink text, etc. we should make sure words are unique
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
char *langVec = m_langVec.getBufStart();
return hashWords3( //wordStart ,
//wordEnd ,
hi ,
&m_words ,
&m_phrases ,
NULL,//&m_synonyms ,
&m_sections ,
&m_countTable ,
fragVec ,
wordSpamVec ,
langVec ,
m_langId , // defaultLangId docLangId
m_pbuf ,
m_wts ,
&m_wbuf ,
m_niceness );
}
// . this now uses posdb exclusively
bool XmlDoc::hashWords3 ( //int32_t wordStart ,
//int32_t wordEnd ,
HashInfo *hi ,
Words *words ,
Phrases *phrases ,
Synonyms *synonyms ,
Sections *sectionsArg ,
HashTableX *countTable ,
char *fragVec ,
char *wordSpamVec ,
char *langVec ,
char docLangId , // default lang id
//Weights *weights ,
SafeBuf *pbuf ,
HashTableX *wts ,
SafeBuf *wbuf ,
int32_t niceness ) {
//
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
//
//if ( ! m_storeTermListInfo )
// return true;
Sections *sections = sectionsArg;
// for getSpiderStatusDocMetaList() we don't use sections; it'll
// mess us up
if ( ! hi->m_useSections ) sections = NULL;
// shortcuts
uint64_t *wids = (uint64_t *)words->getWordIds();
//nodeid_t *tids = words->m_tagIds;
uint64_t *pids2 = (uint64_t *)phrases->m_phraseIds2;
//uint64_t *pids3 = (uint64_t *)phrases->m_phraseIds3;
HashTableX *dt = hi->m_tt;
// . sanity checks
// . posdb just uses the full keys with docid
if ( dt->m_ks != 18 ) { char *xx=NULL;*xx=0; }
if ( dt->m_ds != 4 ) { char *xx=NULL;*xx=0; }
// if provided...
if ( wts ) {
if ( wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( wts->m_ds != sizeof(TermDebugInfo)){char *xx=NULL;*xx=0; }
if ( ! wts->m_allowDups ) { char *xx=NULL;*xx=0; }
}
// ensure caller set the hashGroup
if ( hi->m_hashGroup < 0 ) { char *xx=NULL;*xx=0; }
// handy
char **wptrs = words->getWordPtrs();
int32_t *wlens = words->getWordLens();
// hash in the prefix
uint64_t prefixHash = 0LL;
int32_t plen = 0;
if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && plen ) {
// we gotta make this case insensitive and skip spaces:
// if the field is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
// . sanity test, make sure it is in supported list
// . hashing diffbot json output of course fails this so
// skip in that case if diffbot
//if ( ! m_isDiffbotJSONObject &&
// getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
// if (hi->m_desc&&strcmp(hi->m_desc,"custom meta tag")) {
// char *xx=NULL;*xx=0; }
//}
}
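// illustrative example: a hypothetical field named "Focal Length" hashes
// the same here as "focallength", so it can be queried as a single
// fielded term like focallength:10 instead of two space-separated ones.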
bool hashIffUnique = false;
//if ( hi->m_hashGroup == HASHGROUP_INLINKTEXT ) hashIffUnique = true;
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,niceness,"uqtbl");
///////
//
// diversity rank vector.
//
///////
// the final diversity which is a multiplier
// is converted into a rank from 0-15 i guess.
// so 'mexico' in "new mexico" should receive a low word score but high
// phrase score. thus, a search for 'mexico' should not bring up
// the page for university of new mexico!
SafeBuf dwbuf;
if(!getDiversityVec ( words,phrases,countTable,&dwbuf,niceness))
return false;
char *wdv = dwbuf.getBufStart();
int32_t nw = words->getNumWords();
/////
//
// calculate density ranks
//
/////
//
// this now varies depending on the length of the sentence/header etc.
// so if the hashGroup is not title, link text or meta tag, we have to
// use a safebuf.
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks((int64_t *)wids,
nw,//wordStart,
//wordEnd,
hi->m_hashGroup,
&densBuf,
sections,
m_niceness))
return false;
// a handy ptr
char *densvec = (char *)densBuf.getBufStart();
////////////
//
// get word positions
//
///////////
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
SafeBuf wpos;
if ( ! getWordPosVec ( words ,
sections,
//wordStart,
//wordEnd,
m_dist, // hi->m_startDist,
fragVec,
niceness,
&wpos) ) return false;
// a handy ptr
int32_t *wposvec = (int32_t *)wpos.getBufStart();
/*
// show that for debug
if ( m_docId == 192304365235LL ) {
for ( int32_t i = 0 ; i < nw ; i++ ) {
char buf[1000];
int32_t len = wlens[i];
if ( len > 900 ) len = 900;
gbmemcpy(buf,wptrs[i],len);
buf[len]='\0';
log("seopipe: wptr=%s pos[%"INT32"]=%"INT32"",buf,i,wposvec[i]);
}
}
*/
//int32_t wc = 0;
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
int32_t i;
for ( i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(niceness);
if ( ! wids[i] ) continue;
// ignore if in repeated fragment
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
// ignore if in style section
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
// do not breach wordpos bits
if ( wposvec[i] > MAXWORDPOS ) break;
// . hash the startHash with the wordId for this word
// . we must mask it before adding it to the table because
// this table is also used to hash in IndexLists that come
// from LinkInfo classes (incoming link text). And when
// those IndexLists are hashed they use masked termIds.
// So we should too...
//uint64_t h = g_indexdb.getTermId ( startHash , wids[i] ) ;
uint64_t h ;
if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash );
else h = wids[i];
// . get word spam rank. 0 means not spammed
// . just mod Weights class to use a weight rank...
// . and diversity rank
// . need to separate weights by spam vs. diversity.
// . maybe just have a diversity class and a pattern class
// and leave the poor weights class alone
//int32_t wsr = 0;
int32_t hashGroup = hi->m_hashGroup;
Section *sx = NULL;
if ( sp ) {
sx = sp[i];
// . this is taken care of in hashTitle()
// . it is slightly different if the title is
// multiple sentences because when hashing the
// body the density rank is per sentence, but in
// hashTitle we count all the words in the title
// towards the density rank even if they are
// in different sentences
if ( sx->m_flags & SEC_IN_TITLE )
//hashGroup = HASHGROUP_TITLE;
continue;
if ( sx->m_flags & SEC_IN_HEADER )
hashGroup = HASHGROUP_HEADING;
if ( sx->m_flags & ( SEC_MENU |
SEC_MENU_SENTENCE |
SEC_MENU_HEADER ) )
hashGroup = HASHGROUP_INMENU;
}
// this is for link text and meta tags mostly
if ( hashIffUnique ) {
// skip if already did it
if ( ut.isInTable ( &h ) ) continue;
if ( ! ut.addKey ( &h ) ) return false;
}
char ws = 15;
if ( wordSpamVec ) ws = wordSpamVec[i];
// HACK:
// if this is inlink text, use the wordspamrank to hold the
// inlinker's site rank!
if ( hashGroup == HASHGROUP_INLINKTEXT )
ws = hi->m_linkerSiteRank;
// default to the document's primary language if it is not
// clear what language this word belongs to.
// if the word is only in german it should be german,
// otherwise it will be the document's primary language.
char langId = langUnknown;
if ( m_wts && langVec ) langId = langVec[i];
// keep it as the original vector. i'm not sure we use
// this for anything but for display, so show the user
// how we made our calculation of the document's primary lang
//if ( langId == langUnknown ) langId = docLangId;
char wd;
if ( hi->m_useCountTable ) wd = wdv[i];
else wd = MAXDIVERSITYRANK;
// if using posdb
key144_t k;
// if ( i == 11429 )
// log("foo");
g_posdb.makeKey ( &k ,
h ,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd, // diversityRank 0-15
ws, // wordSpamRank 0-15
0, // siterank
hashGroup ,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
false , // syn?
false , // delkey?
hi->m_shardByTermId );
// get the one we lost
// char *kstr = KEYSTR ( &k , sizeof(POSDBKEY) );
// if (!strcmp(kstr,"0x0ca3417544e400000000000032b96bf8aa01"))
// log("got lost key");
// key should NEVER collide since we are always incrementing
// the distance cursor, m_dist
dt->addTerm144 ( &k );
// . make the m_wordPosInfoBuf here because we need to set
// WordPosInfo::m_wordPtr/m_wordLen.
// . could also use instead of the "wts" buffer?
if ( m_doingSEO ) {
// alloc in 10k chunks
if ( m_wordPosInfoBuf.getAvail() <
(int32_t)sizeof(WordPosInfo) ) {
int32_t newSize = m_wordPosInfoBuf.length();
newSize += 10000;
if ( ! m_wordPosInfoBuf.reserve ( newSize ) )
return false;
}
// make it
WordPosInfo wi;
wi.m_wordPtr = wptrs[i];
wi.m_wordLen = wlens[i];
wi.m_wordPos = wposvec[i];
wi.m_densityRank = densvec[i];
wi.m_wordSpamRank = ws;
wi.m_diversityRank = wd;//v[i];
wi.m_hashGroup = hashGroup;
wi.m_trafficGain = 0;
int32_t cs = sizeof(WordPosInfo);
if(!m_wordPosInfoBuf.safeMemcpy(&wi,cs)) return false;
}
// add to wts for PageParser.cpp display
if ( wts ) {
if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_NONE, // synsrc
langId ,
k))
return false;
}
//
// STRIP POSSESSIVE WORDS for indexing
//
// . for now do simple stripping here
// . if word is "bob's" hash "bob"
//
if ( wlens[i] >= 3 &&
wptrs[i][wlens[i]-2] == '\'' &&
to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
int64_t nah ;
nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 );
if ( plen>0 ) nah = hash64 ( nah , prefixHash );
g_posdb.makeKey ( &k ,
nah,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd,//v[i], // diversityRank ,
ws, // wordSpamRank ,
0, //siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false , // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
// keep going if not debug
if ( ! wts ) continue;
// print the synonym
if ( ! storeTerm(wptrs[i], // synWord,
wlens[i] -2, // gbstrlen(synWord),
nah, // termid
hi,
i, // wordnum
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_GENERATED,
langId,
k) )
return false;
}
/////////////
//
// synonyms (alt words,morphs,synonyms)
//
/////////////
/*
int64_t *aids = NULL;
int16_t naids = 0;
int64_t syh;
if ( synonyms ) {
aids = synonyms->getAltIds (i);
naids = synonyms->getNumAlts(i);
//ascore = saved / 4;
//if ( ascore <= 0 ) ascore = 1;
//asaved = ascore;
}
for ( int32_t j = 0 ; j < naids ; j++ ) {
// skip if same as original
if ( (uint64_t)aids[j] == wids[i] ) continue;
// . hash it with the prefix if any
// . fixes gbwhere:galleries bug...
if ( plen>0 ) syh = hash64 ( aids[j] , prefixHash );
else syh = aids[j];
g_posdb.makeKey ( &k ,
syh ,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wdv[i], // diversityRank ,
ws, // wordSpamRank ,
0, //siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false ); // delkey?
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
// keep going if not debug
if ( ! wts ) continue;
// get the junk
char *synWord = synonyms->getStringFromId(&aids[j]);
// sanity
if ( ! synWord ) { char *xx=NULL;*xx=0; }
// print the synonym
if ( ! storeTerm(synWord,
gbstrlen(synWord),
syh, // termid
hi,
i, // wordnum
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wdv[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
synonyms->m_source[i], // synsrc
langId) )
return false;
}
*/
////////
//
// two-word phrase
//
////////
int64_t npid = pids2[i];
int32_t npw = 2;
uint64_t ph2 = 0;
// repeat for the two word hash if different!
if ( npid ) {
// hash with prefix
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
else ph2 = npid;
g_posdb.makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK, //phrase
ws, // wordSpamRank ,
0,//siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false , // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
}
// add to wts for PageParser.cpp display
if ( wts && npid ) {
// get phrase as a string
int32_t plen;
char *phr=phrases->getPhrase(i,&plen,npw);
// store it
if ( ! storeTerm ( phr,plen,ph2,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK,//phrase
ws,
hashGroup,
//true,
wbuf,
wts,
SOURCE_BIGRAM, // synsrc
langId,
k) )
return false;
}
////////
//
// three-word phrase
//
////////
/*
npid = pids3[i];
npw = 3;
// repeat for the two word hash if different!
if ( npid ) {
// hash with prefix
uint64_t ph2 ;
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
else ph2 = npid;
g_posdb.makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK, //phrase
ws, // wordSpamRank ,
0,//siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false ); // delkey?
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
}
// add to wts for PageParser.cpp display
if ( wts && npid ) {
// get phrase as a string
int32_t plen;
char *phr=phrases->getPhrase(i,&plen,npw);
// store it
if ( ! storeTerm ( phr,plen,ph2,hi,i,
wposvec[i], // wordpos
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK,//phrase
ws,
hashGroup,
//true, // is phrase?
wbuf,
wts,
SOURCE_TRIGRAM, // synsrc
langId ) )
return false;
}
*/
// update for hashIncomingLinkText()
//hi->m_startDist = wposvec[i];
// debug point
//if ( ph2 == (uint64_t)-233869093807964777LL ) {
// log("hey slot=%"INT32" date=%"UINT32" n0=%"INT64" score=%"INT32"",
// slot,
// k.n1,k.n0,
// score);
// //char *xx=NULL;*xx=0;
//}
//
// NUMERIC SORTING AND RANGES
//
// only store numbers in fields this way
if ( prefixHash == 0 ) continue;
// this may or may not be numeric.
if ( ! is_digit ( wptrs[i][0] ) ) continue;
// this might have to "back up" before any '.' or '-' symbols
if ( ! hashNumber ( wptrs[0] ,
wptrs[i] ,
wlens[i] ,
hi ) )
return false;
}
// hash a single term so they can do gbfacet:ext or
// gbfacet:siterank or gbfacet:price. a field on a field.
if ( prefixHash && words->m_numWords )
// hash gbfacet:price and store the price value in the key
hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi );
// between calls? i.e. hashTitle() and hashBody()
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
return true;
}
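// Rough summary (illustrative): for a single indexable body word such as
// "bob's" at word position p, the loop above can emit several posdb keys:
// the plain word key built from wids[i] (combined with the prefix hash if
// hi->m_prefix is set), a possessive-stripped key for "bob" marked as a
// synonym, and, when pids2[i] is non-zero, a two-word phrase key at the
// same position. The numeric gbsortby/gbrevsortby keys are only added when
// a prefix is present and the word starts with a digit.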
// just like hashNumber*() functions but we use "gbfacet" as the
// primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby,
// gbsortbyint, gbrevsortby, gbrevsortbyint
bool XmlDoc::hashFacet1 ( char *term ,
Words *words ,
HashTableX *tt ) {
// need a prefix
//if ( ! hi->m_prefix ) return true;
// hash the ENTIRE content, all words as one blob
int32_t nw = words->getNumWords();
char *a = words->m_words[0];
char *b = words->m_words[nw-1]+words->m_wordLens[nw-1];
// hash the whole string as one value, the value of the facet
int32_t val32 = hash32 ( a , b - a );
if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;
//
// why do this if we already do it for hashNumber() using gbsortby: ?
//
/*
// if it's a number hash as float and int
if ( nw != 1 ) return true;
char **wptrs = words->m_words;
if ( ! is_digit ( wptrs[0][0] ) ) return true;
// hash with a float val
float f = atof(wptrs[0]);
int32_t vf32 = *(int32_t *)&f;
if ( ! hashFacet2 ( "gbfacetfloat",term, vf32 , tt ) ) return false;
// and an int val
int32_t vi32 = atoi(wptrs[0]);
if ( ! hashFacet2 ( "gbfacetint",term, vi32 , tt ) ) return false;
*/
return true;
}
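// Illustrative sketch (comments only, not compiled): hashing a hypothetical
// meta tag "ext" whose content is "html" boils down to
//
//   int32_t val32 = hash32 ( "html" , 4 );
//   hashFacet2 ( "gbfacetstr" , "ext" , val32 , tt );
//
// which is what lets a query like gbfacetstr:ext build a histogram of the
// 32-bit values stored in those posdb keys.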
bool XmlDoc::hashFacet2 ( char *prefix,
char *term ,
int32_t val32 ,
HashTableX *tt ,
// we only use this for gbxpathsitehash terms:
bool shardByTermId ) {
// need a prefix
//if ( ! hi->m_prefix ) return true;
//int32_t plen = gbstrlen ( hi->m_prefix );
//if ( plen <= 0 ) return true;
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
//int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen);
// now any field has to support gbfacet:thatfield
// and store the 32-bit termid into where we normally put
// the word position bits, etc.
//static int64_t s_facetPrefixHash = 0LL;
//if ( ! s_facetPrefixHash )
// s_facetPrefixHash = hash64n ( "gbfacet" );
// this is case-sensitive
int64_t prefixHash = hash64n ( prefix );
// "term" is something like "object.price" or whatever.
// it is the json field itself, or the meta tag name, etc.
int64_t termId64 = hash64n ( term );
// combine with the "gbfacet" prefix. old prefix hash on right.
// like "price" on right and "gbfacetfloat" on left... see Query.cpp.
int64_t ph2 = hash64 ( termId64, prefixHash );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setInt ( &k , val32 );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
HashTableX *dt = tt;//hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
bool isFloat = false;
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
// store in buffer for display on pageparser.cpp output
char buf[130];
if ( isFloat )
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
else
snprintf(buf,128,"facetField=%s facetVal32=%"UINT32"",
term,(uint32_t)val32);
int32_t bufLen = gbstrlen(buf);
// make a special hashinfo for this facet
HashInfo hi;
hi.m_tt = tt;
// the full prefix
char fullPrefix[66];
snprintf(fullPrefix,64,"%s:%s",prefix,term);
hi.m_prefix = fullPrefix;//"gbfacet";
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2, // prefixHash, // s_facetPrefixHash,
&hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
HashTableX *tt = hi->m_tt;
uint64_t val64 = hash64 ( val , vlen );
// "term" is something like "object.price" or whatever.
// it is the json field itself, or the meta tag name, etc.
uint64_t middlePrefix = hash64n ( hi->m_prefix );
// hash "This is a new product." with "object.desc".
// "object.desc" (termId64) is case-sensitive.
uint64_t composite = hash64 ( val64 , middlePrefix );
// hash that with "gbfieldmatch"
char *prefix = "gbfieldmatch";
uint64_t prefixHash = hash64n ( prefix );
uint64_t ph2 = hash64 ( composite , prefixHash );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
false ) ; // shardByTermId? no, by docid.
HashTableX *dt = tt;//hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer for display on pageparser.cpp output
char buf[128];
int32_t bufLen ;
bufLen = sprintf(buf,"gbfieldmatch:%s:%"UINT64"",hi->m_prefix,val64);
// make a special hashinfo for this facet
HashInfo hi2;
hi2.m_tt = tt;
// the full prefix
char fullPrefix[64];
snprintf(fullPrefix,62,"%s:%s",prefix,hi->m_prefix);
hi2.m_prefix = fullPrefix;//"gbfacet";
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2, // prefixHash, // s_facetPrefixHash,
&hi2,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
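// Illustrative sketch (comments only, not compiled): for a hypothetical
// json field "object.desc" holding "This is a new product." the termid
// above composes as three nested hashes:
//
//   uint64_t val64     = hash64 ( "This is a new product." , 22 );
//   uint64_t composite = hash64 ( val64 , hash64n ( "object.desc" ) );
//   uint64_t ph2       = hash64 ( composite , hash64n ( "gbfieldmatch" ) );
//
// so gbfieldmatch:object.desc only matches documents whose field value
// hashes to the same val64 (an exact, case-sensitive match).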
// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of the
// posdb key
// . the termid is the hash of the preceding field
// . in json docs a field is like "object.details.price"
// . in meta tags it is just the meta tag name
// . credit card numbers are 16 digits. we'd need like 58 bits to store those
// so we can't do that here, but we can approximate as a float
// . the binary representation of non-negative floating point numbers is
// ordered the same way as the floats themselves! so we are lucky and can
// keep our usual KEYCMP sorting algos to keep the floats in order.
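// . worked example of that ordering property (for non-negative values):
// 0.5f -> 0x3F000000 , 1.0f -> 0x3F800000 , 2.0f -> 0x40000000 ,
// so comparing the raw 32-bit patterns as unsigned integers gives the same
// order as comparing the floats themselves, which is why we can drop the
// bits straight into the key and keep the normal key sort.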
bool XmlDoc::hashNumber ( char *beginBuf ,
char *buf ,
int32_t bufLen ,
HashInfo *hi ) {
if ( ! is_digit(buf[0]) ) return true;
char *p = buf;
char *bufEnd = buf + bufLen;
// back-up over any .
if ( p > beginBuf && p[-1] == '.' ) p--;
// negative sign?
if ( p > beginBuf && p[-1] == '-' ) p--;
// . convert it to a float
// . this now allows for commas in numbers like "1,500.62"
float f = atof2 ( p , bufEnd - p );
// debug
//log("build: hashing %s %f",hi->m_prefix,f);
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
return false;
// also hash in reverse order for sorting from low to high
f = -1.0 * f;
if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
return false;
//
// also hash as an int, 4 byte-integer so our lastSpidered timestamps
// dont lose 128 seconds of resolution
//
int32_t i = (int32_t) atoll2 ( p , bufEnd - p );
if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) )
return false;
// also hash in reverse order for sorting from low to high
i = -1 * i;
if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) )
return false;
return true;
}
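// Illustrative sketch (comments only, not compiled): for a hypothetical
// field "price" whose digit string is "42", hashNumber() above ends up
// adding four numeric terms for that one value:
//
//   hashNumber2 (  42.0 , hi , "gbsortby"       );
//   hashNumber2 ( -42.0 , hi , "gbrevsortby"    );
//   hashNumber3 (  42   , hi , "gbsortbyint"    );
//   hashNumber3 ( -42   , hi , "gbrevsortbyint" );
//
// the negated copies are what let queries sort the same field ascending or
// descending without having to scan the termlist backwards.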
// . THIS IS NOW replaced by ::hashFacet2() being called by hashSections()
// above. it is a more generic, faceted approach.
// . the term is gbxpathsite123456, the prefix is gbfacet, and the val32
// stored in the posdb key is the inner html hash of the section, and
// the "123456" is the hash of the xpath and site. so the field names
// are very custom, not your typical "ext" or "title"
// . CHROME DETECTION
// . hash a special "gbxpathsitehash12345678" term which has the hash of the
// innerHTML content embedded in it.
// . we do this for doing gbfacetstr:gbxpathsitehash12345678 etc. on every
// section with innerHTML so we can figure out the histogram of each
// section on this page relative to its subdomain. like the distribution
// of the innerHTML for this section as it appears on other pages from
// this site. this allows killer CHROME DETECTION!!!!
/*
bool XmlDoc::hashSectionTerm ( char *term , HashInfo *hi , int32_t sentHash32 ) {
int64_t termId = hash64 ( term , gbstrlen(term) );
key144_t k;
g_posdb.makeKey ( &k ,
termId,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setInt ( &k , sentHash32 );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
int t = g_posdb.getInt ( &k );
if ( t != sentHash32 ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
//char buf[128];
//int32_t bufLen = sprintf(buf,"%"UINT32"",sentHash32);
// if no gbmin or gbmax or gbsorty or gbrevsortby we need gbfacet
//int64_t truePrefix64 = hash64n ( "gbfacet" );
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( term,//buf,
gbstrlen(term),//bufLen,
0LL,//truePrefix64,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k))
return false;
return true;
}
*/
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setFloat ( &k , f );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
float t = g_posdb.getFloat ( &k );
if ( t != f ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
truePrefix64,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
bool XmlDoc::hashNumber3 ( int32_t n , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
//g_posdb.setFloat ( &k , f );
g_posdb.setInt ( &k , n );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
//float t = g_posdb.getFloat ( &k );
int32_t x = g_posdb.getInt ( &k );
if ( x != n ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s int32=%"INT32"",sortByStr, hi->m_prefix,n);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
truePrefix64,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k ) )
return false;
return true;
}
// . many many websites got hijacked pages in them...
// . revkim.org/mcdrt/mgntf/sata/sata.htm
// . collegefootballweekly.net/hswsj/riime/sata/sata.htm
char *XmlDoc::getIsHijacked() {
bool hj = false;
if ( ! hj ) hj = isHijackerFormat ( ptr_firstUrl );
if ( ! hj ) hj = isHijackerFormat ( ptr_redirUrl );
if ( ! hj ) {
m_isHijacked = false;
m_isHijackedValid = true;
return &m_isHijacked;
}
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (void *)-1 ) return (char *)h1;
// TODO: check it for the malicious tag formats here!!
m_isHijacked = false;
m_isHijackedValid = true;
return &m_isHijacked;
}
// is it a custom error page? ppl do not always use status 404!
char *XmlDoc::getIsErrorPage ( ) {
if ( m_isErrorPageValid ) return &m_isErrorPage;
setStatus ( "getting is error page");
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// get local link info
LinkInfo *info1 = getLinkInfo1();
// error or blocked
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
// get remote link info
LinkInfo **pinfo2 = getLinkInfo2();
// error or blocked
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
// convenience
LinkInfo *info2 = *pinfo2;
// default
LinkInfo *li = info1;
//we have to be more sophisticated with longer pages because they
//could actually be talking about an error message.
//if(xml->getContentLen() > 4096) return false;
// assume not
m_isErrorPage = false;
m_isErrorPageValid = true;
int32_t nn = xml->getNumNodes();
int32_t i;
char* s;
int32_t len;
int32_t len2;
char* errMsg = NULL;
int32_t numChecked = 0;
// check the first header and title tag
// limit it to first 32 nodes
if(nn > 32) nn = 32;
for ( i = 0 ; i < nn ; i++ ) {
switch(xml->getNodeId(i)) {
case TAG_TITLE:
case TAG_H1:
case TAG_H2:
case TAG_H3:
case TAG_SPAN:
char* p = xml->getString(i,true,&len);
if(len == 0 || len > 1024) continue;
char* pend = p + len;
errMsg = matchErrorMsg(p, pend );
++numChecked;
break;
}
if(errMsg || numChecked > 1) break;
}
if(!errMsg) return &m_isErrorPage;
len = gbstrlen(errMsg);
// make sure the error message was not present in the link text
loop:
if ( li && li->getNumGoodInlinks() > 5 ) return &m_isErrorPage;
for (Inlink *k=NULL;li && (k=li->getNextInlink(k)); ) {
//int32_t nli = li->getNumLinkTexts();
//if we can index some link text from the page, then do it
//if(nli > 5) return false;
//for ( int32_t i = 0 ; i < nli ; i++ ) {
s = k->getLinkText();
len2 = k->size_linkText - 1; // exclude \0
//if(!s) break;
//allow error msg to contain link text or vice versa
if(len < len2) {
if(strncasestr(errMsg, s,len,len2) != NULL)
return &m_isErrorPage;
}
else {
if(strncasestr(s, errMsg,len2,len) != NULL)
return &m_isErrorPage;
}
}
if ( li ) { li = info2; info2 = NULL; goto loop; }
m_isErrorPage = true;
return &m_isErrorPage;
}
char* XmlDoc::matchErrorMsg(char* p, char* pend ) {
char utf8Buf[1024];
// int32_t utf8Len = 0;
int32_t len = pend - p;
if(len > 1024) len = 1024;
pend = p + len;
char* tmp = utf8Buf;
while(p < pend) {
*tmp = to_lower_a(*p);
tmp++; p++;
}
p = utf8Buf;
pend = p + len;
char* errMsg = NULL;
while(p < pend) {
int32_t r = pend - p;
switch (*p) { //sorted by first letter, then by frequency
case '4':
errMsg = "404 error";
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
errMsg = "403 forbidden";
if(r>=13&&strncmp(p, errMsg, 13) == 0) return errMsg;
break;
case 'd':
errMsg = "detailed error information follows";
if(r>=34&&strncmp(p, errMsg, 34) == 0) return errMsg;
break;
case 'e':
errMsg = "error 404";
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
errMsg = "error was encountered while processing "
"your request";
if(r>=51&&strncmp(p, errMsg,51) == 0) return errMsg;
errMsg = "error occurred while processing request";
if(r>=39&&strncmp(p, errMsg, 39) == 0) return errMsg;
errMsg = "exception error has occurred";
if(r>=28&&strncmp(p, errMsg,28) == 0) return errMsg;
errMsg = "error occurred";
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
//http://www.gnu.org/fun/jokes/unix.errors.html
//errMsg = "error message";
//if(strncmp(p, errMsg, 13) == 0) return errMsg;
break;
case 'f':
errMsg = "file not found";
if(r>=14&&strncmp(p, errMsg, 14) == 0) return errMsg;
break;
case 'h':
errMsg = "has moved";
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
break;
case 'n':
errMsg = "no referrer";
if(r>=11&&strncmp(p, errMsg,11) == 0) return errMsg;
break;
case 'o':
errMsg = "odbc error code = ";
if(r>=18&&strncmp(p, errMsg,18) == 0) return errMsg;
errMsg = "object not found";
if(r>=16&&strncmp(p, errMsg,16) == 0) return errMsg;
break;
case 'p':
errMsg = "page not found";
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
break;
case 's':
errMsg = "system error";
if(r>=12&&strncmp(p, errMsg, 12) == 0) return errMsg;
break;
case 't':
errMsg = "the application encountered an "
"unexpected problem";
if(r>=49&&strncmp(p, errMsg, 49) == 0) return errMsg;
errMsg = "the page you requested has moved";
if(r>=32&&strncmp(p, errMsg, 32) == 0) return errMsg;
errMsg = "this page has moved";
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
break;
case 'u':
errMsg = "unexpected problem has occurred";
if(r>=31&&strncmp(p, errMsg, 31) == 0) return errMsg;
errMsg = "unexpected error has occurred";
if(r>=29&&strncmp(p, errMsg, 29) == 0) return errMsg;
errMsg = "unexpected problem occurred";
if(r>=27&&strncmp(p, errMsg, 27) == 0) return errMsg;
errMsg ="unexpected error occurred";
if(r>=25&&strncmp(p, errMsg, 25) == 0) return errMsg;
errMsg ="unexpected result has occurred";
if(r>=30&&strncmp(p, errMsg, 30) == 0) return errMsg;
errMsg ="unhandled exception";
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
break;
case 'y':
errMsg = "you have been blocked";
if(r>=21&&strncmp(p, errMsg, 21) == 0) return errMsg;
break;
}
//skip to the beginning of the next word
while(p < pend && !is_wspace_a(*p)) p++;
while(p < pend && is_wspace_a(*p)) p++;
}
return NULL;
}
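// Illustrative example: a title like "404 Error - Page Not Found" gets
// lowercased into the local buffer above, the scan hits the leading '4',
// and the candidate phrase "404 error" matches its first 9 bytes, so that
// phrase is returned and getIsErrorPage() then makes sure it does not also
// appear in the page's inlink text before flagging the page as an error.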
#include "Spider.h"
static SafeBuf *s_wbuf = NULL;
// . this is used by gbsort() above
// . sorts TermDebugInfos by prefix, then alphabetically by term
int cmptp (const void *v1, const void *v2) {
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
char *start = s_wbuf->getBufStart();
// prefix first
char *ps1 = start + t1->m_prefixOff;
char *ps2 = start + t2->m_prefixOff;
if ( t1->m_prefixOff < 0 ) ps1 = NULL;
if ( t2->m_prefixOff < 0 ) ps2 = NULL;
int32_t plen1 = 0; if ( ps1 ) plen1 = gbstrlen(ps1);
int32_t plen2 = 0; if ( ps2 ) plen2 = gbstrlen(ps2);
int32_t pmin = plen1;
if ( plen2 < pmin ) pmin = plen2;
int32_t pn = strncmp ( ps1 , ps2 , pmin );
if ( pn ) return pn;
if ( plen1 != plen2 ) return ( plen1 - plen2 );
// return if groups differ
int32_t len1 = t1->m_termLen;
int32_t len2 = t2->m_termLen;
int32_t min = len1;
if ( len2 < min ) min = len2;
char *s1 = start + t1->m_termOff;
char *s2 = start + t2->m_termOff;
int32_t n = strncasecmp ( s1 , s2 , min );
if ( n ) return n;
// . if length same, we are tied
// . otherwise, prefer the shorter
return ( len1 - len2 );
}
// . this is used by gbsort() above
// . sorts TermDebugInfos by their TermDebugInfo::m_wordPos member
int cmptp2 (const void *v1, const void *v2) {
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
// word position first
int32_t d = t1->m_wordPos - t2->m_wordPos;
if ( d ) return d;
// secondly drop back to hashgroup i guess
//d = t1->m_hashGroup - t2->m_hashGroup;
d = t1->m_synSrc - t2->m_synSrc;
if ( d ) return d;
// word len
d = t1->m_termLen - t2->m_termLen;
if ( d ) return d;
return 0;
}
bool printLangBits ( SafeBuf *sb , TermDebugInfo *tp ) {
char printed = false;
if ( tp->m_synSrc ) {
sb->safePrintf("&nbsp;");
printed = true;
}
int32_t j = 0;
if ( printed ) j = MAX_LANGUAGES;
for ( ; j < MAX_LANGUAGES ; j++ ) {
int64_t mask = 1LL << j;
//if ( j == tp->m_langId )
// sb->safePrintf("[%s]",
// getLangAbbr(tp->m_langId));
if ( ! (tp->m_langBitVec64 & mask) ) continue;
char langId = j+1;
// match in langvec? that means even if the
// word is in multiple languages we put it in
// this language because we interesect its lang bit
// vec with its neighbors in the sliding window
// algo in setLangVector.
if ( langId == tp->m_langId )
sb->safePrintf("<b>");
sb->safePrintf("%s ", getLangAbbr(langId) );
if ( langId == tp->m_langId )
sb->safePrintf("</b>");
printed = true;
}
if ( ! printed ) {
sb->safePrintf("??");
}
return true;
}
bool XmlDoc::printDoc ( SafeBuf *sb ) {
if ( ! sb ) return true;
Url *u = getFirstUrl();
// hash the url into 64 bits
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());
// shortcut
char *fu = ptr_firstUrl;
char *allowed = "???";
if ( m_isAllowedValid && m_isAllowed ) allowed = "yes";
else if ( m_isAllowedValid ) allowed = "no";
int32_t ufn = -1;
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
time_t spideredTime = getSpideredTime();
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
"content=\"text/html; charset=utf-8\">"
"<table cellpadding=3 border=0>\n"
"<tr>"
"<td width=\"25%%\">docId</td>"
"<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">uh48</td>"
"<td>%"UINT64"</td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">uh64</td>"
"<td>%"UINT64"</td>"
"</tr>\n"
"<tr>"
"<td>index error code</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>http status</td>"
"<td>%i</td>"
"</tr>\n"
"<tr>"
"<td>url filter num</td>"
"<td>%"INT32"</td>"
"</tr>\n"
"<tr>"
"<td>other - errno</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>robots.txt allows</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>metalist size</td>"
"<td>%"INT32"</td>"
"</tr>\n"
"<tr>"
"<td>url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,
cr->m_coll,
m_docId ,
m_docId ,
getFirstUrlHash48(), // uh48
getFirstUrlHash64(), // uh64
mstrerror(m_indexCode),
m_httpStatus,
ufn,
mstrerror(g_errno),
allowed,
m_metaListSize,
fu,
fu
);
if ( ptr_redirUrl )
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,ptr_redirUrl
,ptr_redirUrl
);
else
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td>--</td>"
"</tr>\n"
);
sb->safePrintf("<tr><td>hostHash64</td><td>0x%"XINT64"</td></tr>",
(uint64_t)getHostHash32a());
sb->safePrintf("<tr><td>site</td><td>");
sb->safeMemcpy(ptr_site,size_site-1);
sb->safePrintf("</td></tr>\n");
if ( m_siteHash32Valid )
sb->safePrintf("<tr><td>siteHash32</td><td>0x%"XINT32"</td></tr>\n",
m_siteHash32);
if ( m_domHash32Valid )
sb->safePrintf("<tr><td>domainHash32</td><td>0x%"XINT32"</td></tr>\n",
m_domHash32);
sb->safePrintf ( "<tr>"
"<td>domainHash8</td>"
"<td>0x%"XINT32"</td>"
"</tr>\n"
,
(int32_t)g_titledb.getDomHash8FromDocId(m_docId)
);
sb->safePrintf(
"<tr>"
"<td>coll</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>spidered date</td>"
"<td>%s UTC</td>"
"</tr>\n"
,
cr->m_coll,
asctime(gmtime ( &spideredTime ))
);
/*
char *ms = "-1";
if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
sb->safePrintf (
"<tr>"
"<td>min pub date</td>"
"<td>%s UTC</td>"
"</tr>\n" , ms );
ms = "-1";
if ( m_maxPubDate != -1 ) ms = asctime(gmtime ( &m_maxPubDate ));
sb->safePrintf (
"<tr>"
"<td>max pub date</td>"
"<td>%s UTC</td>"
"</tr>\n" , ms );
*/
// our html template fingerprint
sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
if ( m_tagPairHash32Valid )sb->safePrintf("%"UINT32"",
(uint32_t)m_tagPairHash32);
else sb->safePrintf("invalid");
sb->safePrintf("</td></tr>\n" );
// print list we added to delete stuff
if ( m_indexCode && m_oldDocValid && m_oldDoc ) {
// skip debug printing for now...
//return true;
sb->safePrintf("</table><br>\n");
sb->safePrintf("<h2>Delete Meta List</h2>");
printMetaList ( m_metaList , m_metaList + m_metaListSize ,sb);
}
if ( m_indexCode || g_errno ) {
printMetaList ( m_metaList , m_metaList + m_metaListSize, sb );
}
if ( m_indexCode ) return true;
if ( g_errno ) return true;
// sanity check
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
/*
sb->safePrintf("<tr><td>next spider date</td>"
"<td>%s UTC</td></tr>\n"
"<tr><td>next spider priority</td>"
"<td>%"INT32"</td></tr>\n" ,
asctime(gmtime( &m_nextSpiderTime )) ,
(int32_t)m_nextSpiderPriority );
*/
// must always start with http i guess!
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
// show the host that should spider it
//int32_t domLen ; char *dom = getDomFast ( fu , &domLen , true );
//int32_t hostId;
if ( m_sreqValid ) {
// must not block
SpiderRequest *oldsr = &m_sreq;
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
"</td>\n"
"<td><b>%"UINT32"</b></td></tr>\n",shard);
}
time_t ts = m_firstIndexedDate;
sb->safePrintf("<tr><td>first indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
ts = m_outlinksAddedDate;
sb->safePrintf("<tr><td>outlinks last added date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
// hop count
sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td></tr>\n",
(int32_t)m_hopCount);
// thumbnails
ThumbnailArray *ta = (ThumbnailArray *) ptr_imageData;
if ( ta ) {
int32_t nt = ta->getNumThumbnails();
sb->safePrintf("<tr><td># thumbnails</td>"
"<td>%"INT32"</td></tr>\n",nt);
for ( int32_t i = 0 ; i < nt ; i++ ) {
ThumbnailInfo *ti = ta->getThumbnailInfo(i);
sb->safePrintf("<tr><td>thumb #%"INT32"</td>"
"<td>%s (%"INT32"x%"INT32",%"INT32"x%"INT32") "
, i
, ti->getUrl()
, ti->m_origDX
, ti->m_origDY
, ti->m_dx
, ti->m_dy
);
ti->printThumbnailInHtml ( sb , 100,100,true,NULL) ;
// end the row for this thumbnail
sb->safePrintf("</td></tr>\n");
}
}
char *ddd;
time_t datedbDate = (time_t)m_pubDate;
if ( datedbDate != -1 ) ddd = asctime ( gmtime(&datedbDate ));
else ddd = "---";
char strLanguage[128];
languageToString(m_langId, strLanguage);
// print tags
//if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
SafeBuf tb;
TagRec *ogr = NULL;
if ( m_tagRecValid ) ogr = &m_tagRec;
if ( ogr ) ogr->printToBufAsHtml ( &tb , "old tag" );
SafeBuf *ntb = NULL;
if ( m_newTagBufValid ) ntb = getNewTagBuf();
if ( ntb ) {
// this is just a sequence of tags like an rdblist
char *pt = ntb->getBufStart();
char *ptend = pt + ntb->length();
for ( ; pt < ptend ; ) {
// skip rdbid
pt++;
// cast it
Tag *tag = (Tag *)pt;
// skip it
pt += tag->getRecSize();
// print tag out
tag->printToBufAsHtml ( &tb, "new tag");
}
}
// prevent (null) from being displayed
tb.pushChar('\0');
//Tag *tag1 = gr->getTag ("sitenuminlinks");
//Tag *tag2 = gr->getTag ("sitepop");
//int32_t sni = 0;
//int32_t spop = 0;
//if ( tag1 ) sni = atol(tag1->m_data);
//if ( tag2 ) spop = atol(tag2->m_data);
int32_t sni = m_siteNumInlinks;
//int32_t spop = m_sitePop;
LinkInfo *info1 = ptr_linkInfo1;
//LinkInfo *info2 = ptr_linkInfo2;
//int32_t sni ;
//int32_t extrapolated = 0;
//if ( info1 ) extrapolated = info1->m_numInlinksExtrapolated;
//if ( info1 ) sni = info1->m_siteNumInlinks;
char *ipString = iptoa(m_ip);
char *estimated = "";
if ( datedbDate & 0x01 ) // tr->datedbDateIsEstimated() )
estimated = "<nobr><b>[estimated from bisection]</b></nobr>";
//char *ls = getIsLinkSpam();
Links *links = getLinks();
// sanity check. should NEVER block!
if ( links == (void *)-1 ) { char *xx=NULL;*xx=0; }
// this is all to get "note"
//char *note = NULL;
// make it a URL
Url uu; uu.set ( ptr_firstUrl , false );
// sanity check
Xml *xml = getXml();
// sanity check
if ( xml == (void *)-1 ) { char *xx=NULL;*xx=0; }
sb->safePrintf (
"<tr><td>datedb date</td><td>%s UTC (%"UINT32")%s"
"</td></tr>\n"
"<tr><td>compressed size</td><td>%"INT32" bytes</td></tr>\n"
"<tr><td>original charset</td><td>%s</td></tr>\n"
//"<tr><td>site num inlinks</td><td><b>%"INT32"%</b></td></tr>\n"
//"<tr><td>total extrapolated linkers</td><td>%"INT32"</td></tr>\n"
"<tr><td><b>title rec version</b></td><td><b>%"INT32"</b>"
"</td></tr>\n"
"<tr><td>adult bit</td><td>%"INT32"</td></tr>\n"
//"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n"
"<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n"
"<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n"
//"<tr><td>index article only?</td><td>%"INT32"</td></tr>\n"
"%s\n"
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
"%s</td></tr>\n"
"<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n"
"<tr><td>content truncated</td><td>%"INT32"</td></tr>\n"
"<tr><td>content type</td><td>%"INT32" (%s)</td></tr>\n"
"<tr><td>language</td><td>%"INT32" (%s)</td></tr>\n"
"<tr><td>country</td><td>%"INT32" (%s)</td></tr>\n"
"<tr><td>time axis used</td><td>%"INT32"</td></tr>\n"
"<tr><td>metadata</td><td>%s</td></tr>\n"
"</td></tr>\n",
ddd ,
(uint32_t)datedbDate ,
estimated ,
m_oldTitleRecSize,
get_charset_str(m_charset),
//sni ,
//ptr_linkInfo1->m_numInlinksExtrapolated,
(int32_t)m_version ,
(int32_t)m_isAdult,
//(int32_t)m_isLinkSpam,
//m_note,
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
//(int32_t)m_eliminateMenus,
// tag rec
tb.getBufStart(),
ipString,
cr->m_coll,
ipString,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
(int32_t)m_contentType,
g_contentTypeStrings[(int)m_contentType] ,
(int32_t)m_langId,
strLanguage,
(int32_t)m_countryId,
g_countryCode.getName(m_countryId),
m_useTimeAxis,
ptr_metadata);
/*
int32_t boost1 = getBoostFromSiteNumInlinks ( sni );
sb->safePrintf (
"<tr><td><b>title weight</b></td>"
"<td><b>%"UINT32"%%</b></td></tr>\n"
"<tr><td>header weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>url path weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>external link text weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>internal link text weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>concept weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>score boost from site num inlinks</td>"
"<td>%"INT32"%%</td>"
"</tr>\n",
(int32_t)m_titleWeight,
(int32_t)m_headerWeight,
(int32_t)m_urlPathWeight,
(int32_t)m_externalLinkTextWeight,
(int32_t)m_internalLinkTextWeight,
(int32_t)m_conceptWeight ,
boost1 );
*/
// print title
//sb->safePrintf( "<tr><td>title</td><td>%s</td></tr>\n" ,
// ti->m_title );
// print the new, unstored, gigabit vector
if ( size_gigabitHashes ) {
// get gigabit vector
int32_t *vec = ptr_gigabitHashes;
// point to scores
int32_t *ss = ptr_gigabitScores;
int32_t count = 0;
int32_t total = 0;
sb->safePrintf ( "<tr><td>stored gigabit vector</td><td>");
while ( *vec ) {
sb->safePrintf ( "%08"XINT32" ", *vec );
sb->safePrintf ( "(%05"INT32") ", *ss );
vec++;
ss++;
count++;
total++;
//if ( total >= GIGABITS_IN_VECTOR ) break;
if ( count < 4 ) continue;
count = 0;
sb->safePrintf ( "<br>\n");
}
sb->safePrintf ( "</tr>\n");
}
// print dmoz stuff
int32_t numCatIds = size_catIds/4;
int32_t numIndCatIds = size_indCatIds/4;
sb->safePrintf( "<tr><td>Number of Category IDs</td>"
"<td>%"INT32"</td></tr>\n", numCatIds );
char *dtp = ptr_dmozTitles;
char *dsp = ptr_dmozSumms;
char *dap = ptr_dmozAnchors;
for (int32_t i = 0; i < numCatIds; i++) {
// print the ID
sb->safePrintf( "<tr><td>ID #%"INT32"</td><td>%"INT32"</td></tr>\n",
i, ptr_catIds[i]);
// print the title
if ( dtp ) {
sb->safePrintf( "<tr><td>Title #%"INT32" </td><td>",i);
sb->safeMemcpy( dtp,gbstrlen(dtp) );
sb->safePrintf( "</td></tr>\n");
dtp += gbstrlen(dtp) + 1;
}
// print the summary
if ( dsp ) {
sb->safePrintf( "<tr><td>Summary #%"INT32"</td><td>", i);
sb->safeMemcpy( dsp , gbstrlen(dsp ) ) ;
sb->safePrintf( "</td></tr>\n");
dsp += gbstrlen ( dsp ) + 1;
}
// print the anchor
if ( dap ) {
sb->safePrintf( "<tr><td>Anchor #%"INT32"</td><td>",i);
sb->safeMemcpy( dap , gbstrlen(dap) );
sb->safePrintf( "</td></tr>\n");
dap += gbstrlen ( dap ) + 1;
}
}
sb->safePrintf( "<tr><td>Number of Indirect Category IDs</td>"
"<td>%"INT32"</td></tr>\n", numIndCatIds);
for (int32_t i = 0; i < numIndCatIds; i++) {
// print the ID
sb->safePrintf( "<tr><td>Indirect ID #%"INT32"</td>"
"<td>%"INT32"</td></tr>\n",
i, ptr_indCatIds[i]);
}
if ( info1 ) {
//sb->safePrintf("<tr><td>page pop</td><td>%"INT32"</td></tr>\n",
// info1->m_pagePop );
//sb->safePrintf("<tr><td>whole site pop</td>"
// "<td>%"INT32"</td></tr>\n",
// spop );
sb->safePrintf("<tr><td>num GOOD links to whole site</td>"
"<td>%"INT32"</td></tr>\n",
sni );
}
// close the table
sb->safePrintf ( "</table></center><br>\n" );
//
// convert document into json representing multiple documents
// if it makes sense. sometimes a single url contains multiple
// subdocuments that each should have their own url, but do not,
// so we fix that here.
//
SafeBuf *dbr = getDiffbotReply();
if ( dbr->length() ) {
sb->safePrintf("<b>START EXACT DIFFBOT REPLY</b><br>\n");
sb->safePrintf("<pre>");
sb->safeMemcpy ( dbr );
sb->safePrintf("</pre>");
sb->safePrintf("<b>END EXACT DIFFBOT REPLY</b><br><br>\n");
}
// print outlinks
links->print( sb );
//
// PRINT ADDRESSES (prints streets first)
//
Addresses *aa = getAddresses ();
if ( ! aa || aa == (Addresses *)-1 ) { char *xx=NULL;*xx=0;}
aa->print(sb,uh64);
//
// PRINT PUB DATE CANDIDATES
//
// print stored pub date candidates which we indexed as clock
// or not clock!
Dates *dp = getDates() ;
// should never block!
if ( dp == (void *)-1 ) { char *xx=NULL;*xx=0; }
// print it out
if ( dp ) dp->printDates ( sb );
//return true;
//
// PRINT SECTIONS
//
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;}
//SectionVotingTable *nsvt = getNewSectionVotingTable();
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
// these are nice
//HashTableX *pt = dp->getPhoneTable();
//HashTableX *et = dp->getEmailTable();
//HashTableX *at = aa->getPlaceTable();
//HashTableX *tt = dp->getTODTable();
//HashTableX *rt = ev->getRegistrationTable();
//HashTableX *priceTable = dp->getPriceTable();
//sections->print ( sb , pt , et , NULL , at , tt , priceTable );
// try the new print function
//sections->print2 ( sb , NULL, NULL , NULL , false );
printRainbowSections ( sb , NULL );
//nsvt->print ( sb , "NEW Sections Voting Table" );
//osvt->print ( sb , "OLD Sections Voting Table" );
//
// PRINT LINKINFO
//
//if ( info1 )
// info1->print ( sb , cr->m_coll );
//if ( info2 ) {
// sb->safePrintf ( "<tr><td><b>IMPORTED LINK INFO:"
// "</b></td></tr>" );
// info2->print ( sb , cr->m_coll );
//}
// cut it short for debugging
logf(LOG_DEBUG,"xmldoc: FIX ME remove return");
//return true;
//
// PRINT LINKINFO
//
char *p = m_pageLinkBuf.getBufStart();
int32_t plen = m_pageLinkBuf.length();
sb->safeMemcpy ( p , plen );
//
// PRINT SITE LINKINFO
//
p = m_siteLinkBuf.getBufStart();
plen = m_siteLinkBuf.length();
sb->safeMemcpy ( p , plen );
//
// BEGIN PRINT GIGABITS
//
// print out for PageParser.cpp
const char *help =
"The <i>Gigabits</i> are words extracted from the document "
"that are deemed to best represent it. The <i>Pop</i> column "
"is the popularity of the word and it ranges from 0 to 1000 "
"and is how many documents out of a sample of 1000 that "
"contained that word. The <i>Score</i> of each Gigabit is "
"based on the popularity and how many times the word appeared "
"in the document. Higher scores are deemed more "
"representative of the document. The hashes of these Gigabits "
"are stored with the cached copy of the document as numeric "
"hashes for purposes of topic clustering. You can see these "
"hashes by clicking on the <i>[info]</i> link next to "
"any search result.<br><br>";
if ( m_numTop > 0 )
sb->safePrintf( "<table width=100%%>"
"<td bgcolor=pink>\n"
"%s"
"<table>"
"<tr><td>#</td><td>"
"<b>%"INT32" Gigabits</b></td><td><b>Score</b>"
"</td>"
"<td><b>Pop</b></td>"
"<td><b>Hash</b></td>"
"</tr>\n",
help,m_numTop);
// . print out the top gigabits we harvested
// . start with the highest scoring node first, the last node since
// nodes are ranked by lowest to highest key
int32_t total = 0;
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
// get the info
GigabitInfo *gi = m_top[i];
// print row
sb->safePrintf("<tr><td>%"INT32"</td><td>",i);
// print gigabit
sb->safeMemcpy(gi->m_ptr , gi->m_len );
// get 32 bit hash
uint32_t h = gi->m_hash & 0xffffffff;
// never allow 0
if ( h == 0 ) h = 1;
// if unicode, pop's hi bit is set
sb->safePrintf( "</td>"
"<td>%"INT32"</td>"
"<td>%"INT32"</td>"
"<td>%08"XINT32"</td>"
"</tr>\n",
(int32_t)gi->m_pts,
(int32_t)gi->m_pop,
(int32_t)h );
// add up all scores
total += gi->m_pts;
}
// close table
if ( m_numTop > 0 ) {
sb->safePrintf("<tr><td></td><td></td><td>"
"<b>%"INT32"</b></td></tr>\n",total);
sb->safePrintf("</table>\n");
}
//
// END PRINT GIGABITS
//
// note this
sb->safePrintf("<h2>NEW Meta List</h2>");
printMetaList ( m_metaList , m_metaList + m_metaListSize , sb );
// all done if no term table to print out
if ( ! m_wts ) return true;
// print out the rules in Weights.cpp
/*
sb->safePrintf ("<br>"
"<table border=1 cellpadding=0>"
"<tr><td>Rule #3</td>"
"<td>First 40 words in ()'s.</td></tr>\n"
"<tr><td>Rule #4</td>"
"<td>Adjacent to bad punct.</td></tr>\n"
"<tr><td>Rule #5</td>"
"<td>In a link.</td></tr>\n"
"<tr><td>Rule #6</td>"
"<td>First occurence in a section. Actual weight "
"depends on section word count.</td></tr>\n"
"<tr><td>Rule #7</td>"
"<td>In a header tag. h1 is most weight.</td></tr>\n"
"<tr><td>Rule #8</td>"
"<td>In a \"ul\" list.</td></tr>\n"
"<tr><td>Rule #9</td>"
"<td>Repeated occurence in the same fragment or "
"sentence.</td></tr>\n"
"<tr><td>Rule #10</td>"
"<td>In a comma-separated list.</td></tr>\n"
"<tr><td>Rule #11</td>"
"<td>Promoted isolated capitalized words, demote "
"if it is in a capitalized phrase.</td></tr>\n"
"<tr><td>Rule #13</td>"
"<td>First occurence in document.</td></tr>\n"
"<tr><td>Rule #15</td>"
"<td>Word to phrase ratio weight.</td></tr>\n"
"<tr><td>Rule #16</td>"
"<td>At the beginning of a fragment or sentence."
"</td></tr>\n"
"<tr><td>Rule #17</td>"
"<td>If immediately after a quote, iff not "
"promoted by Rule #18.</td></tr>\n"
"<tr><td>Rule #18</td>"
"<td>Promote phrase if capitalized. Demote phrase "
"if mixed case without hypehn.</td></tr>\n"
"<tr><td>Rule #22</td>"
"<td>Demote phrases containing bad punct.</td></tr>\n"
"<tr><td>Rule #23</td>"
"<td>In script, style, select or marquee tag. "
"</td></tr>\n"
"<tr><td>Rule #23</td>"
"<td>Follows a number.</td></tr>\n"
"<tr><td>Rule #25</td>"
"<td>Demote non-hyphenated phrases that would split "
"adjacent hyphenated phrases.</td></tr>\n"
"<tr><td>Rule #26</td>"
"<td>Demote if in a repeated fragment.</td></tr>\n"
"<tr><td>Rule #27</td>"
"<td>Demote if in a menu section.</td></tr>\n"
"<tr><td>Rule #28</td>"
"<td>Pattern spam detector.</td></tr>\n"
"</table>\n"
"<br>"
);
*/
//
// BEGIN PRINT HASHES TERMS
//
// shortcut
HashTableX *wt = m_wts;
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
TermDebugInfo **tp = NULL;
// add them with this counter
int32_t nt = 0;
int32_t nwt = 0;
if ( wt ) {
nwt = wt->m_numSlots;
tp = (TermDebugInfo **)wt->m_keys;
}
// now print the table we stored all we hashed into
for ( int32_t i = 0 ; i < nwt ; i++ ) {
// skip if empty
if ( wt->m_flags[i] == 0 ) continue;
// breathe
//QUICKPOLL(m_niceness);
// get its key, date=32bits termid=64bits
//key96_t *k = (key96_t *)wt->getKey ( i );
// get the TermDebugInfo
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
// point to it for sorting
tp[nt++] = ti;
}
// set this for cmptp
s_wbuf = &m_wbuf;
// sort them alphabetically by Term
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
// determine how many non 1.0 weight fields we got in the vectors
/*
int32_t count [ MAX_RULES ];
memset ( count , 0 , MAX_RULES * 4 );
for ( int32_t i = 0 ; i < nt ; i++ ) {
TermDebugInfo *ti = tp[i];
for ( int32_t j = 0 ; j < MAX_RULES ; j++ )
if ( ti->m_rv[j] != 1.0 ) count[j]++;
}
// count the counts
char fbuf[9024];
char *fp = fbuf;
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
if ( ! count[j] ) continue;
fp += sprintf(fp ,"<td><b>R#%"INT32"</b></td>",j);
}
*/
// print them out in a table
char hdr[1000];
sprintf(hdr,
"<table border=1 cellpadding=0>"
"<tr>"
// this messes up Test.cpp diff'ing
//"<td><b>#</b></td>"
"<td><b>Prefix</b></td>"
"<td><b>WordNum</b></td>"
"<td><b>Lang</b></td>"
"<td><b>Term</b></td>"
//"%s"
//"<td><b>Weight</b></td>"
//"<td><b>Spam</b></td>"
"<td><b>Desc</b></td>"
"<td><b>TermId/TermHash48</b></td>"
"<td><b>ShardByTermId?</b></td>"
"<td><b>Note</b></td>"
"</tr>\n"
//,fbuf
);
sb->safePrintf("%s",hdr);
char *start = m_wbuf.getBufStart();
int32_t rcount = 0;
for ( int32_t i = 0 ; i < nt ; i++ ) {
// start a new table every TABLE_ROWS rows so one huge table does not bog down the browser
if ( (++rcount % TABLE_ROWS) == 0 )
sb->safePrintf("<!--ignore--></table>%s",hdr);
char *prefix = "&nbsp;";
if ( tp[i]->m_prefixOff >= 0 )
prefix = start + tp[i]->m_prefixOff;
bool isFacet = false;
if ( prefix &&
prefix[0]=='g' &&
strncmp(prefix,"gbfacet",7)== 0 )
isFacet = true;
sb->safePrintf ( "<tr>"
//"<td><b>%"INT32"</b></td>"
"<td>%s</td>"
//i ,
, prefix
);
if ( isFacet )
sb->safePrintf("<td>--</td>");
else
sb->safePrintf( "<td>%"INT32"</td>"
, tp[i]->m_wordNum );
// print lang
//char langId = tp[i]->m_langId;
// print out all langs word is in if it's not clear
// what language it is. we use a sliding window to
// resolve some ambiguity, but not all, so print out
// the possible langs here
sb->safePrintf("<td>");
if ( isFacet )
sb->safePrintf("--");
else
printLangBits ( sb , tp[i] );
sb->safePrintf("</td>");
// print the term
sb->safePrintf("<td><nobr>");
if ( tp[i]->m_synSrc )
sb->pushChar('*');
char *term = start + tp[i]->m_termOff;
int32_t termLen = tp[i]->m_termLen;
sb->safeMemcpy ( term , termLen );
/*
char *dateStr = "&nbsp;";
int32_t ddd = tp[i]->m_date;
uint8_t *tddd = (uint8_t *)&ddd;
char tbbb[32];
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
sprintf(tbbb,"evIds %"INT32"-%"INT32"",
(int32_t)tddd[1],(int32_t)tddd[0]);
dateStr = tbbb;
}
else if ( ddd )
dateStr = asctime ( gmtime(&ddd ));
*/
//char ss[30];
//if ( tp[i]->m_spam == -1.0 ) sprintf(ss,"&nbsp;");
//else if ( tp[i]->m_spam == 0.0 ) sprintf(ss,"--");
//else sprintf ( ss , "%.03f",1.0-tp[i]->m_spam);
sb->safePrintf ( "</nobr></td>"
);
// print the weight vector before Weight and Spam
/*
float prod = 1.0;
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
if ( ! count[j] ) continue;
if ( tp[i]->m_isSynonym )
sb->safePrintf("<td>&nbsp;</td>" );
else if ( tp[i]->m_rv[j] == 1.0 )
sb->safePrintf("<td>&nbsp;</td>" );
else sb->safePrintf("<td>%.02f</td>",tp[i]->m_rv[j] );
// product up
prod *= tp[i]->m_rv[j];
}
// sanity check
// maybe look into this at some point, but not a big deal!!
//float err = prod - tp[i]->m_weight;
//if ( err > .05 )
// logf(LOG_DEBUG,"weights: prod was %.02f should be "
// "%.02f",prod,tp[i]->m_weight);
*/
//char *desc = "&nbsp;";
//if ( tp[i]->m_descOff >= 0 )
// desc = start + tp[i]->m_descOff;
/*
// synonyms are always 1/4 weight of original
if ( tp[i]->m_isSynonym )
sb->safePrintf("<td>&nbsp;</td>" );
else
sb->safePrintf("<td>%.03f</td>", tp[i]->m_weight );
*/
sb->safePrintf ( //"<td>%s</td>"
//"<td><b>%"UINT32"</b></td>"
//"<td><nobr>%s</nobr></td>"
"<td><nobr>%s",
getHashGroupString(tp[i]->m_hashGroup)
);
//if ( tp[i]->m_synSrc ) {
// char ss = tp[i]->m_synSrc;
// sb->safePrintf(" - %s",g_synonyms.getSourceString(ss));
//}
sb->safePrintf ( "</nobr></td>" );
sb->safePrintf ( "<td>%016"UINT64"</td>"
,
//ss ,
//(uint32_t)tp[i]->m_score32 ,
//dateStr ,
//desc, // start + tp[i]->m_descOff ,
(uint64_t)(tp[i]->m_termId & TERMID_MASK) );
if ( tp[i]->m_shardByTermId ) sb->safePrintf("<td><b>1</b></td>" );
else sb->safePrintf("<td>0</td>" );
sb->safePrintf("<td>");
// there is no prefix for such terms now
// TODO: store actual key in there i guess?? or just this bit.
int32_t val32 = 0;
if ( strncmp(prefix,"gbfacet",7) == 0 )
val32 = g_posdb.getInt(&tp[i]->m_key);
// . this is like gbxpathsitehash1234567
// . the number following it is the hash
// . the value stored in the posdb key is the hash of the
// inner html content of that xpath/site for this page
if ( strncmp(term,"facetField=gbxpathsitehash",26)==0)
sb->safePrintf("<b>Term</b> is a 32-bit hash of the "
"X-path of "
"a section XOR'ed with the 32-bit "
"hash of this document's subdomain. "
"[%"UINT32"] is the 32-bit hash of the "
"Inner HTML of this section stored "
"in the posdb key instead of "
"the usual stuff. This is also "
"sharded by termId!",
(uint32_t)val32
//(int32_t)tp[i]->m_sentHash32
);
sb->safePrintf("</td>");
sb->safePrintf("</tr>\n");
}
sb->safePrintf("</table><br>\n");
//
// END PRINT HASHES TERMS
//
return true;
}
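// . emits the utf-8 content-type meta tag for the doc-info pages
// . the per-page navigation links are currently commented out below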
bool XmlDoc::printMenu ( SafeBuf *sb ) {
// encode it
SafeBuf ue;
ue.urlEncode ( ptr_firstUrl );
// get
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
"content=\"text/html; charset=utf-8\">" );
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
/*
char *coll = cr->m_coll;
int64_t d = m_docId;
// print links at top
sb->safePrintf(
//"<a href=/print?c=%s&u=%s&page=1>general info</a> | "
//"<a href=/print?c=%s&u=%s&page=2>page inlinks</a> | "
//"<a href=/print?c=%s&u=%s&page=3>site inlinks</a> | "
//"<a href=/print?c=%s&u=%s&page=4>sections</a> | "
//"<a href=/print?c=%s&u=%s&page=5>indexed terms</a> | "
// the breakdown of when it was spidered and when it
// is due to be spidered again. and any errors
// encountered when spidering
//"<a href=/print?c=%s&u=%s&page=6>spider stats</a> | "
//"<a href=/print?c=%s&u=%s&page=7>cached page</a>"
"<a href=/print?c=%s&d=%"INT64"&page=1>general info</a> | "
"<a href=/print?c=%s&d=%"INT64"&page=2&recompute=1>"
"page inlinks</a> | "
"<a href=/print?c=%s&d=%"INT64"&page=3>site inlinks</a> | "
//"<a href=/print?c=%s&d=%"INT64"&page=4>sections</a> | "
"<a href=/print?c=%s&d=%"INT64"&page=5>indexed terms</a>"
// the breakdown of when it was spidered and when it
// is due to be spidered again. and any errors
// encountered when spidering
//"<a href=/print?c=%s&d=%"INT64"&page=6>spider stats</a> |"
//" <a href=/print?c=%s&d=%"INT64"&page=7>cached page</a>"
"<br>"
"<br>"
,coll,d//ue.getBufStart()
,coll,d//ue.getBufStart()
,coll,d//ue.getBufStart()
//,coll,d//ue.getBufStart()
,coll,d//ue.getBufStart()
//,coll,d//ue.getBufStart()
//,coll,d//ue.getBufStart()
);
*/
return true;
}
// if printDocForProCog, an entry function, blocks, we gotta re-call it
static void printDocForProCogWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in print doc for pro cog wrapper" );
// get it
bool status = THIS->printDocForProCog ( THIS->m_savedSb ,
THIS->m_savedHr );
// return if it blocked
if ( ! status ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// in PageRoot.cpp
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printGigablast );
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
if ( ! sb ) return true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
m_masterLoop = printDocForProCogWrapper;
m_masterState = this;
m_savedSb = sb;
m_savedHr = hr;
// if we are generating site or page inlinks info for a
// non docid based url, then store that info in the respective
// safe bufs
m_useSiteLinkBuf = true;
m_usePageLinkBuf = true;
int32_t page = hr->getLong("page",1);
// for some reason sections page blocks forever in browser
if ( page != 7 && ! m_printedMenu ) { // && page != 5 )
printFrontPageShell ( sb , "search" , cr , false );
m_printedMenu = true;
//printMenu ( sb );
}
if ( page == 1 )
return printGeneralInfo(sb,hr);
if ( page == 2 )
return printPageInlinks(sb,hr);
if ( page == 3 )
return printSiteInlinks(sb,hr);
if ( page == 4 )
return printRainbowSections(sb,hr);
if ( page == 5 )
return printTermList(sb,hr);
if ( page == 6 )
return printSpiderStats(sb,hr);
if ( page == 7 )
return printCachedPage(sb,hr);
return true;
}
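// . page 1 of the doc-info viewer: docid, host, index error code,
//   robots.txt status, index/spider dates, hop count, charset, language,
//   country and inlink counts
// . honors &xml=1 to emit the same info as an xml <response>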
bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
// shortcut
char *fu = ptr_firstUrl;
// sanity check
Xml *xml = getXml();
// blocked?
if ( xml == (void *)-1 ) return false;
// error?
if ( ! xml ) return true;
char *ict = getIsContentTruncated();
if ( ! ict ) return true; if ( ict == (char *)-1 ) return false;
char *at = getIsAdult();
if ( ! at ) return true; if ( at == (void *)-1 ) return false;
char *ls = getIsLinkSpam();
if ( ! ls ) return true; if ( ls == (void *)-1 ) return false;
uint8_t *ct = getContentType();
if ( ! ct ) return true; if ( ct == (void *)-1 ) return false;
uint16_t *cs = getCharset ( );
if ( ! cs ) return true; if ( cs == (uint16_t *)-1 ) return false;
char *pl = getIsPermalink();
if ( ! pl ) return true; if ( pl == (char *)-1 ) return false;
char *isRSS = getIsRSS();
if ( ! isRSS ) return true; if ( isRSS == (char *)-1 ) return false;
int32_t *ip = getIp();
if ( ! ip ) return true; if ( ip == (int32_t *)-1 ) return false;
uint8_t *li = getLangId();
if ( ! li ) return true; if ( li == (uint8_t *)-1 ) return false;
uint16_t *cid = getCountryId();
if ( ! cid ) return true; if ( cid == (uint16_t *)-1 ) return false;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 ) return true; if ( info1 == (void *)-1 ) return false;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
//char *ls = getIsLinkSpam();
//Links *links = getLinks();
// blocked?
//if ( links == (void *)-1 ) { char *xx=NULL;*xx=0;}//return false;
// error?
//if ( ! links ) return true;
// make it a URL
Url uu; uu.set ( fu , false );
char *allowed = "???";
int32_t allowedInt = 1;
if ( m_isAllowedValid && m_isAllowed ) {
allowed = "yes";
allowedInt = 1;
}
else if ( m_isAllowedValid ) {
allowed = "no";
allowedInt = 0;
}
int32_t ufn = -1;
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
char *es = mstrerror(m_indexCode);
if ( ! m_indexCode ) es = mstrerror(g_errno);
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
//int32_t groupId = g_hostdb.getGroupIdFromDocId(m_docId);
//Host *group = g_hostdb.getGroup(groupId);
int32_t shardNum = getShardNumFromDocId ( m_docId );
Host *hosts = g_hostdb.getShard ( shardNum );
Host *h = &hosts[0];
if ( ! isXml )
sb->safePrintf (
"<table cellpadding=3 border=0>\n"
"<tr>"
"<td width=\"25%%\">docId</td>"
"<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">on host #</td>"
"<td>%"INT32"</td>"
"</tr>\n"
"<tr>"
"<td>index error code</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>robots.txt allows</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,
cr->m_coll,
m_docId ,
m_docId ,
h->m_hostId,
es,
allowed,
fu,
fu
);
else
sb->safePrintf (
"<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
"\t<coll><![CDATA[%s]]></coll>\n"
"\t<docId>%"INT64"</docId>\n"
"\t<indexError><![CDATA[%s]]></indexError>\n"
"\t<robotsTxtAllows>%"INT32""
"</robotsTxtAllows>\n"
"\t<url><![CDATA[%s]]></url>\n"
,
cr->m_coll,
m_docId ,
es,
allowedInt,//(int32_t)m_isAllowed,
fu
);
char *redir = ptr_redirUrl;
if ( redir && ! isXml ) {
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,redir
,redir );
}
else if ( redir ) {
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
"</redirectUrl>\n" ,redir );
}
if ( m_indexCode || g_errno ) {
if ( ! isXml ) sb->safePrintf("</table><br>\n");
else sb->safePrintf("</response>\n");
return true;
}
// must always start with http i guess!
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
time_t ts = (time_t)m_firstIndexedDate;
if ( ! isXml )
sb->safePrintf("<tr><td>first indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts)) );
else
sb->safePrintf("\t<firstIndexedDateUTC>%"UINT32""
"</firstIndexedDateUTC>\n",
(uint32_t)m_firstIndexedDate );
ts = m_spideredTime;
if ( ! isXml )
sb->safePrintf("<tr><td>last indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
else
sb->safePrintf("\t<lastIndexedDateUTC>%"UINT32""
"</lastIndexedDateUTC>\n",
(uint32_t)m_spideredTime );
ts = m_outlinksAddedDate;
if ( ! isXml )
sb->safePrintf("<tr><td>outlinks last added date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
else
sb->safePrintf("\t<outlinksLastAddedUTC>%"UINT32""
"</outlinksLastAddedUTC>\n",
(uint32_t)m_outlinksAddedDate );
// hop count
if ( ! isXml )
sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td>"
"</tr>\n",
(int32_t)m_hopCount);
else
sb->safePrintf("\t<hopCount>%"INT32"</hopCount>\n",
(int32_t)m_hopCount);
char strLanguage[128];
languageToString(m_langId, strLanguage);
// print tags
//SafeBuf tb;
int32_t sni = m_siteNumInlinks;
char *ipString = iptoa(m_ip);
//int32_t sni = info1->getNumGoodInlinks();
time_t tlu = info1->getLastUpdated();
struct tm *timeStruct3 = gmtime ( &tlu );//info1->m_lastUpdated );
char tmp3[64];
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
if ( ! isXml )
sb->safePrintf (
"<tr><td>original charset</td><td>%s</td></tr>\n"
"<tr><td>adult bit</td><td>%"INT32"</td></tr>\n"
//"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n"
"<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n"
"<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n"
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
"%s</td></tr>\n"
"<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n"
"<tr><td>content truncated</td><td>%"INT32"</td></tr>\n"
"<tr><td>content type</td><td>%s</td></tr>\n"
"<tr><td>language</td><td>%s</td></tr>\n"
"<tr><td>country</td><td>%s</td></tr>\n"
"<tr><td><b>good inlinks to site</b>"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique IP inlinks to site"
// "</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique CBlock inlinks to site"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td><b>site rank</b></td><td>%"INT32"</td></tr>\n"
"<tr><td>good inlinks to page"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique IP inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique CBlock inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
// "<tr><td>total inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td><nobr>page inlinks last computed</nobr></td>"
"<td>%s</td></tr>\n"
"</td></tr>\n",
get_charset_str(m_charset),
(int32_t)m_isAdult,
//(int32_t)m_isLinkSpam,
//m_note,
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
ipString,
cr->m_coll,
ipString,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
g_contentTypeStrings[(int)m_contentType] ,
strLanguage,
g_countryCode.getName(m_countryId) ,
sni,
//m_siteNumInlinksUniqueIp,
//m_siteNumInlinksUniqueCBlock,
::getSiteRank(sni),
//info1->getNumTotalInlinks(),
info1->getNumGoodInlinks(),
// info1->m_numUniqueIps,
// info1->m_numUniqueCBlocks,
// info1->m_totalInlinkingDocIds,
tmp3
);
else {
sb->safePrintf (
"\t<charset><![CDATA[%s]]></charset>\n"
"\t<isAdult>%"INT32"</isAdult>\n"
"\t<isLinkSpam>%"INT32"</isLinkSpam>\n"
"\t<siteRank>%"INT32"</siteRank>\n"
"\t<numGoodSiteInlinks>%"INT32"</numGoodSiteInlinks>\n"
//"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
// "\t<numUniqueIpsLinkingToSite>%"INT32""
// "</numUniqueIpsLinkingToSite>\n"
// "\t<numUniqueCBlocksLinkingToSite>%"INT32""
// "</numUniqueCBlocksLinkingToSite>\n"
// how many inlinks, external and internal, we have
// to this page not filtered in any way!!!
//"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
// how many inlinking ips we got, including our own if
// we link to ourself
// "\t<numUniqueIpsLinkingToPage>%"INT32""
// "</numUniqueIpsLinkingToPage>\n"
// how many inlinking cblocks we got, including our own
// if we link to ourself
// "\t<numUniqueCBlocksLinkingToPage>%"INT32""
// "</numUniqueCBlocksLinkingToPage>\n"
"\t<numGoodPageInlinks>%"INT32"</numGoodPageInlinks>\n"
"\t<pageInlinksLastComputed>%"INT32""
"</pageInlinksLastComputed>\n"
,get_charset_str(m_charset)
,(int32_t)m_isAdult
,(int32_t)m_isLinkSpam
,::getSiteRank(sni)
,sni
// ,m_siteNumInlinksTotal
// ,m_siteNumInlinksUniqueIp
// ,m_siteNumInlinksUniqueCBlock
//,info1->m_totalInlinkingDocIds
//,info1->m_numUniqueIps
//,info1->m_numUniqueCBlocks
,info1->getNumGoodInlinks()
//,tmp3
,(int32_t)info1->m_lastUpdated
);
//if ( m_note )
// sb->safePrintf("\t<isLinkSpamReason><![CDATA[%s]]>"
// "</isLinkSpamReason>\n"
// , m_note );
sb->safePrintf("\t<isPermalink>%"INT32"</isPermalink>\n"
"\t<isRSSFeed>%"INT32"</isRSSFeed>\n"
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
"\t<contentLenInBytes>%"INT32""
"</contentLenInBytes>\n"
"\t<isContentTruncated>%"INT32""
"</isContentTruncated>\n"
"\t<contentType><![CDATA[%s]]></contentType>\n"
"\t<language><![CDATA[%s]]></language>\n"
"\t<country><![CDATA[%s]]></country>\n",
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
ipString,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
g_contentTypeStrings[(int)m_contentType] ,
strLanguage,
g_countryCode.getName(m_countryId) );
}
//sb->safePrintf("<tr><td>site</td><td>");
//sb->safeMemcpy(ptr_site,size_site-1);
//sb->safePrintf("</td></tr>\n");
TagRec *ogr = NULL;
if ( m_tagRecDataValid && m_version >= 118 ) {
ogr = getTagRec(); // &m_tagRec;
// sanity. should be set from titlerec, so no blocking!
if ( ! ogr || ogr == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
else if ( ogr ) ogr->printToBufAsXml ( sb );
// show the good inlinks we used when indexing this
if ( ! isXml )
info1->print(sb,cr->m_coll);
// close the table
if ( ! isXml )
sb->safePrintf ( "</table></center><br>\n" );
else
sb->safePrintf("</response>\n");
return true;
}
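// . page 3 of the doc-info viewer: inlinks to the whole site
// . m_useSiteLinkBuf is set in printDocForProCog() so the link info ends
//   up in m_siteLinkBuf, which we copy out here (xml-wrapped if &xml=1)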
bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// use msg25 to hit linkdb and give us a link info class i guess
// but we need paging functionality so we can page through like
// 100 links at a time. clustered by c-class ip.
// do we need to mention how many from each ip c-class then? because
// then we'd have to read the whole termlist, might be several
// separate disk reads.
// we need to re-get both if either is NULL
LinkInfo *sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo ) return true; if ( sinfo == (LinkInfo *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safeMemcpy ( &m_siteLinkBuf );
if ( isXml )
sb->safePrintf ("</response>\n" );
// just print that
//sinfo->print ( sb , cr->m_coll );
return true;
}
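// . page 2 of the doc-info viewer: inlinks to this page
// . prints the stored LinkInfo, or m_pageLinkBuf when &recompute=1 is given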
bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// we need to re-get both if either is NULL
LinkInfo *info1 = getLinkInfo1();
// block or error?
if ( ! info1 ) return true; if ( info1 == (LinkInfo *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
int32_t recompute = hr->getLong("recompute",0);
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
// i guess we need this
if ( ! recompute ) // m_setFromTitleRec )
info1->print ( sb , cr->m_coll );
else
sb->safeMemcpy ( &m_pageLinkBuf );
if ( isXml )
sb->safePrintf ("</response>\n" );
return true;
}
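// callback re-entered when getInlineSectionVotingBuf() blocks; once the
// buffer is ready (or we hit an error) hand control back to the original caller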
static void getInlineSectionVotingBufWrapper ( void *state ) {
XmlDoc *xd = (XmlDoc *)state;
SafeBuf *vb = xd->getInlineSectionVotingBuf();
// return if blocked
if ( vb == (void *)-1 ) return;
// error?
if ( ! vb ) log("xmldoc: error getting inline section votes: %s",
mstrerror(g_errno));
// all done then. call original entry callback
log("xmldoc: returning control to original caller");
xd->m_callback1 ( xd->m_state );
}
// . returns false if blocked, true otherwise
// . returns true with g_errno set on error
// . this actually returns the page content with inserted information
// based on sectiondb data
// . for example, <div id=poo> --> <div id=poo d=5 n=20>
// means that the section is repeated on 20 pages from this site and 5 of
// which have the same innerHtml as us
SafeBuf *XmlDoc::getInlineSectionVotingBuf ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . if we block anywhere below we want to come back here until done
// . this can be a main entry point, so set m_masterLoop
if ( ! m_masterLoop ) {
m_masterLoop = getInlineSectionVotingBufWrapper;
m_masterState = this;
log("xmldoc: getting section voting info from coll=%s",
cr->m_coll);
}
if ( m_inlineSectionVotingBufValid )
return &m_inlineSectionVotingBuf;
Sections *sections = getSectionsWithDupStats();
if ( ! sections || sections == (void *)-1 ) return (SafeBuf *)sections;
Words *words = getWords();
if ( ! words || words == (void *)-1 ) return (SafeBuf *)words;
HttpMime *mime = getMime();
if ( ! mime || mime == (void *)-1 ) return (SafeBuf *)mime;
int32_t siteHash32 = *getSiteHash32();
//int32_t nw = words->getNumWords();
//int64_t *wids = words->getWordIds();
SafeBuf *sb = &m_inlineSectionVotingBuf;
// store mime first then content
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
// we no longer use this through a proxy, so take this out
//sb->safeMemcpy ( m_httpReply , mime->getMimeLen() );
// but hack the Content-Length: field to something alien
// because we mark up the html and the length will be different...
//sb->nullTerm();
// we no longer use this through a proxy so take this out
//char *cl = strstr(sb->getBufStart(),"\nContent-Length:");
//if ( cl ) cl[1] = 'Z';
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
// just print out each word
// map the word to a section.
// if it's the first time we've printed the section then we
// can inject the stuff
// set a printed bit to indicate when we print out a section so
// we do not re-print it...
// these are 1-1 with words
Section **sptrs = sections->m_sectionPtrs;
int32_t nw = words->getNumWords();
char **wptrs = words->m_words;
int32_t *wlens = words->m_wordLens;
for ( int32_t i = 0 ; i < nw ; i++ ) {
char *a = wptrs[i];
// skip if not a front tag
if ( *a != '<' || a[1] == '/' ) {
sb->safeMemcpy(a,wlens[i]);
continue;
}
Section *sa = sptrs[i];
// straight copy if no stats
if ( ! sa || ! sa->m_stats.m_totalEntries ) {
sb->safeMemcpy ( a , wlens[i] );
continue;
}
// should be tag then
char *e = a;
for ( ; *e && *e != '>' && ! is_wspace_a(*e) ; e++);
// copy that
sb->safeMemcpy ( a , e-a);
// the hash of the turktaghash and sitehash32 combined
// so you can do gbfacetstr:gbxpathsitehash12345
// where the 12345 is this h32 value.
uint32_t h32 = sa->m_turkTagHash32 ^ siteHash32;
// insert our stuff into the tag
//sb->safePrintf("<!--");
//sb->safePrintf("<font color=red>");
SectionStats *sx = &sa->m_stats;
// # docs from our site had the same innerHTML?
sb->safePrintf(" _s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32"",
// total # of docs that had an xpath with
// our same innerHtml
(int32_t)sx->m_totalMatches,
// # of docids with this facet
(int32_t)sx->m_totalDocIds,
// . total # of times this xpath occurred
// . can be multiple times per doc
(int32_t)sx->m_totalEntries,
// unique values in the xpath innerhtml
(int32_t)sx->m_numUniqueVals,
// xpathsitehash
h32 );
// copy the rest of the tag
sb->safeMemcpy( e, wlens[i]-(e-a) );
//sb->safePrintf("-->");
//sb->safePrintf("</font>");
// print it here
}
sb->nullTerm();
m_inlineSectionVotingBufValid = true;
return &m_inlineSectionVotingBuf;
}
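// . page 4 of the doc-info viewer: the "rainbow" sections view
// . computes the density, diversity, word-spam and word-position vectors
//   for the body, then either renders the colored html view or, with
//   &xml=1, one <section> element per section with byte offsets, flags,
//   colors and the innerContentHash64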
bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
// what wordposition to scroll to and blink?
int32_t hiPos = -1;
if ( hr ) hiPos = hr->getLong("hipos",-1);
//
// PRINT SECTIONS
//
Sections *sections ;
// hr is NULL if being called from page parser which does not have the
// dup stats! and we core if we block here!
if ( hr ) sections = getSectionsWithDupStats();
else sections = getSections();
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
//SectionVotingTable *nsvt = getNewSectionVotingTable();
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
Words *words = getWords();
if ( ! words ) return true; if ( words == (Words *)-1 ) return false;
Phrases *phrases = getPhrases();
if ( ! phrases ) return true; if (phrases == (void *)-1 ) return false;
HashTableX *cnt = getCountTable();
if ( ! cnt ) return true; if ( cnt == (void *)-1 ) return false;
int32_t nw = words->getNumWords();
//int32_t wordStart = 0;
//int32_t wordEnd = nw;
int64_t *wids = words->getWordIds();
int32_t isXml = false;
if ( hr ) isXml = (bool)hr->getLong("xml",0);
//if ( ! isXml ) printMenu ( sb );
// now complement, cuz bigger is better in the ranking world
//int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY );
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks((int64_t *)wids,
nw,
HASHGROUP_BODY,//hi->m_hashGroup,
&densBuf,
sections,
m_niceness))
return true;
// a handy ptr
char *densityVec = (char *)densBuf.getBufStart();
/*
if ( ! isXml )
sb->safePrintf("<br><b>density rank of body = %"INT32"</b> "
"(out of %"INT32")"
"<br>"
"<br>"
, densityRank
, (int32_t)MAXDENSITYRANK
);
*/
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
SafeBuf dwbuf;
if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true;
char *diversityVec = dwbuf.getBufStart();
// hack fack debug
//m_bodyStartPos =2136;
SafeBuf wpos;
if ( ! getWordPosVec ( words ,
sections,
//wordStart,
//wordEnd,
// we save this in the titlerec, when we
// start hashing the body. we have the url
// terms before the body, so this is necessary.
m_bodyStartPos,//0, // hi->m_startDist,
fragVec,
m_niceness,
&wpos) ) return true;
// a handy ptr
int32_t *wposVec = (int32_t *)wpos.getBufStart();
if ( ! isXml ) {
// put url in for steve to parse out
sb->safePrintf("%s\n",
m_firstUrl.m_url);
sb->safePrintf("<font color=black>w</font>"
"/"
"<font color=purple>x</font>"
//"/"
//"<font color=green>y</font>"
"/"
"<font color=red>z</font>"
": "
"w=wordPosition "
"x=densityRank "
//"y=diversityRank "
"z=wordSpamRank "
"<br>"
"<br>"
""
);
}
if ( ! isXml ) {
// try the new print function
sections->print2 ( sb ,
hiPos,
wposVec,
densityVec,
diversityVec,
wordSpamVec,
fragVec,
NULL,
NULL ,
&m_addresses ,
true );
return true;
}
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
Section *si = sections->m_rootSection;
sec_t mflags = SEC_SENTENCE | SEC_MENU;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// print it out
sb->safePrintf("\t<section>\n");
// get our offset in the array of sections
int32_t num = si - sections->m_sections;
sb->safePrintf("\t\t<id>%"INT32"</id>\n",num);
Section *parent = si->m_parent;
if ( parent ) {
int32_t pnum = parent - sections->m_sections;
sb->safePrintf("\t\t<parent>%"INT32"</parent>\n",pnum);
}
char *byte1 = words->m_words[si->m_a];
char *byte2 = words->m_words[si->m_b-1] +
words->m_wordLens[si->m_b-1];
int32_t off1 = byte1 - words->m_words[0];
int32_t size = byte2 - byte1;
sb->safePrintf("\t\t<byteOffset>%"INT32"</byteOffset>\n",off1);
sb->safePrintf("\t\t<numBytes>%"INT32"</numBytes>\n",size);
if ( si->m_flags & mflags ) {
sb->safePrintf("\t\t<flags><![CDATA[");
bool printed = false;
if ( si->m_flags & SEC_SENTENCE ) {
sb->safePrintf("sentence");
printed = true;
}
if ( si->m_flags & SEC_MENU ) {
if ( printed ) sb->pushChar(' ');
sb->safePrintf("ismenu");
printed = true;
}
sb->safePrintf("]]></flags>\n");
}
int32_t bcolor = (int32_t)si->m_colorHash& 0x00ffffff;
int32_t fcolor = 0x000000;
//int32_t rcolor = 0x000000;
uint8_t *bp = (uint8_t *)&bcolor;
bool dark = false;
if ( bp[0]<128 && bp[1]<128 && bp[2]<128 )
dark = true;
// or if any two channels are less than 100
if ( bp[0]<100 && bp[1]<100 ) dark = true;
if ( bp[1]<100 && bp[2]<100 ) dark = true;
if ( bp[0]<100 && bp[2]<100 ) dark = true;
// if bg color is dark, make font color light
if ( dark ) {
fcolor = 0x00ffffff;
//rcolor = 0x00ffffff;
}
sb->safePrintf("\t\t<bgColor>%06"XINT32"</bgColor>\n",bcolor);
sb->safePrintf("\t\t<textColor>%06"XINT32"</textColor>\n",fcolor);
// count stats
uint64_t ch64 = (int32_t)si->m_sentenceContentHash64;
if ( ! ch64 ) {
sb->safePrintf("\t</section>\n");
continue;
}
/* take this out for now it is not quite right any more.
we now use the xpath hash and site hash as the key
and the "value" is the sentence/innerHtml hash
sb->safePrintf("\t\t<numOnSitePagesThatDuplicateContent>%"INT32""
"</numOnSitePagesThatDuplicateContent>\n",
(int32_t)si->m_stats.m_onSiteDocIds);
sb->safePrintf("\t\t<numOffSitePagesThatDuplicateContent>%"INT32""
"</numOffSitePagesThatDuplicateContent>\n",
(int32_t)si->m_stats.m_offSiteDocIds);
sb->safePrintf("\t\t<numSitesThatDuplicateContent>%"INT32""
"</numSitesThatDuplicateContent>\n",
(int32_t)si->m_stats.m_numUniqueSites);
*/
// you can do a sitehash:xxxxx query with this number to see who the
// dups are!
sb->safePrintf("\t\t<innerContentHash64>%"UINT64""
"</innerContentHash64>\n",
si->m_sentenceContentHash64);
sb->safePrintf("\t</section>\n");
}
// now print out the entire page content so the offsets make sense!
sb->safePrintf("\t<utf8Content><![CDATA[");
if ( ptr_utf8Content )
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,false);
sb->safePrintf("]]></utf8Content>\n");
// end xml response
sb->safePrintf("</response>\n");
return true;
}
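// . page 5 of the doc-info viewer: the list of indexed (posdb) terms
// . sets m_storeTermListInfo so getMetaList() records a TermDebugInfo for
//   every term it hashes, then prints that table sorted by term (&sortby=0)
//   or by word position (&sortby=1)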
bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {
// set debug buffer
m_storeTermListInfo = true;
// default to sorting by wordpos
m_sortTermListBy = hr->getLong("sortby",1);
// cores in getNewSpiderReply() if we do not have this and provide
// the docid...
m_useSpiderdb = false;
char *metaList = getMetaList ( );
if ( ! metaList ) return true; if (metaList==(char *) -1) return false;
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
int32_t isXml = hr->getLong("xml",0);
if ( isXml ) {
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safePrintf(
"\t<maxDens>%"INT32"</maxDens>\n"
//"\t<maxDiv>%"INT32"</maxDiv>\n"
"\t<maxSpam>%"INT32"</maxSpam>\n"
, (int32_t)MAXDENSITYRANK
//, (int32_t)MAXDIVERSITYRANK
, (int32_t)MAXWORDSPAMRANK
);
}
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! isXml ) {
//printMenu ( sb );
//sb->safePrintf("<i>* indicates word is a synonym or "
// "alternative word form<br><br>");
sb->safePrintf("N column = DensityRank (0-%"INT32")<br>"
//"V column = DiversityRank (0-%"INT32")<br>"
"S column = WordSpamRank (0-%"INT32") "
"[or linker "
"siterank if its offsite link text]<br>"
"Lang column = language used for purposes "
"of detecting the document's primary language "
"using a simple majority vote"
"<br>"
"</i>"
"<br>"
"Document Primary Language: <b>%s</b> (%s)"
"<br>"
"<br>"
, (int32_t)MAXDENSITYRANK
//, (int32_t)MAXDIVERSITYRANK
, (int32_t)MAXWORDSPAMRANK
, getLanguageString (m_langId)
, getLangAbbr(m_langId)
);
// encode it
SafeBuf ue;
ue.urlEncode ( ptr_firstUrl );
sb->safePrintf("Sort by: " );
if ( m_sortTermListBy == 0 )
sb->safePrintf("<b>Term</b>");
else
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
"sortby=0>"
"Term</a>"
, cr->m_coll
, ue.getBufStart()
);
sb->safePrintf(" | ");
if ( m_sortTermListBy == 1 )
sb->safePrintf("<b>WordPos</b>");
else
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
"sortby=1>"
"WordPos</a>"
, cr->m_coll
, ue.getBufStart()
);
sb->safePrintf("<br>"
"<br>"
);
}
//
// BEGIN PRINT HASHES TERMS (JUST POSDB)
//
// shortcut
HashTableX *wt = m_wts;
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
TermDebugInfo **tp = NULL;
// add them with this counter
int32_t nt = 0;
int32_t nwt = 0;
if ( wt ) {
nwt = wt->m_numSlots;
tp = (TermDebugInfo **)wt->m_keys;
}
// now print the table we stored all we hashed into
for ( int32_t i = 0 ; i < nwt ; i++ ) {
// skip if empty
if ( wt->m_flags[i] == 0 ) continue;
// breathe
//QUICKPOLL(m_niceness);
// get its key, date=32bits termid=64bits
//key96_t *k = (key96_t *)wt->getKey ( i );
// get the TermDebugInfo
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
// point to it for sorting
tp[nt++] = ti;
}
// set this for cmptp
s_wbuf = &m_wbuf;
if ( m_sortTermListBy == 0 )
// sort them alphabetically
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
else
// sort by word pos
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp2 , m_niceness );
// print the weight tables
//printLocationWeightsTable(sb,isXml);
//printDiversityWeightsTable(sb,isXml);
//printDensityWeightsTable(sb,isXml);
//printWordSpamWeightsTable(sb,isXml);
// print them out in a table
char hdr[1000];
sprintf(hdr,
"<table border=1 cellpadding=0>"
"<tr>"
// this messes up Test.cpp diff'ing
//"<td><b>#</b></td>"
"<td><b>Prefix</b></td>"
"<td><b>WordPos</b></td>"
"<td><b>Lang</b></td>"
"<td><b>Term</b></td>"
//"%s"
//"<td><b>Weight</b></td>"
//"<td><b>Spam</b></td>"
"<td><b>Desc</b></td>"
"<td><b>N</b></td>"
//"<td><b>V</b></td>" // diversityRank
"<td><b>S</b></td>"
"<td><b>Score</b></td>"
//"<td><b>Date</b></td>"
//"<td><b>Desc</b></td>"
//"<td><b>TermId</b></td>"
"</tr>\n"
//,fbuf
);
if ( ! isXml )
sb->safePrintf("%s",hdr);
char *start = m_wbuf.getBufStart();
int32_t rcount = 0;
for ( int32_t i = 0 ; i < nt ; i++ ) {
// start a new table every TABLE_ROWS rows so one huge table does not bog down the browser
if ( (++rcount % TABLE_ROWS) == 0 && ! isXml )
sb->safePrintf("<!--ignore--></table>%s",hdr);
char *prefix = NULL;//"&nbsp;";
if ( tp[i]->m_prefixOff >= 0 )
prefix = start + tp[i]->m_prefixOff;
if ( isXml ) sb->safePrintf("\t<term>\n");
if ( isXml && prefix )
sb->safePrintf("\t\t<prefix><![CDATA[%s]]>"
"</prefix>\n",prefix);
if ( ! isXml ) {
sb->safePrintf ( "<tr>");
if ( prefix )
sb->safePrintf("<td>%s:</td>",prefix);
else
sb->safePrintf("<td>&nbsp;</td>");
}
if ( ! isXml )
sb->safePrintf("<td>%"INT32""
"/%"INT32""
"</td>" ,
tp[i]->m_wordPos
,tp[i]->m_wordNum
);
//char *abbr = getLangAbbr(tp[i]->m_langId);
//if ( tp[i]->m_langId == langTranslingual ) abbr ="??";
//if ( tp[i]->m_langId == langUnknown ) abbr ="--";
//if ( tp[i]->m_synSrc ) abbr = "";
// print out all langs word is in if it's not clear
// what language it is. we use a sliding window to
// resolve some ambiguity, but not all, so print out
// the possible langs here
if ( ! isXml ) {
sb->safePrintf("<td>");
printLangBits ( sb , tp[i] );
sb->safePrintf("</td>");
}
//if ( ! isXml && abbr[0] )
// sb->safePrintf("<td>%s</td>", abbr );
//else if ( ! isXml )
// sb->safePrintf("<td>&nbsp;</td>" );
//else if ( abbr[0] )
// sb->safePrintf("\t\t<lang><![CDATA["
// "]]>%s</lang>\n", abbr );
if ( isXml )
sb->safePrintf("\t\t<s><![CDATA[");
if ( ! isXml )
sb->safePrintf ("<td><nobr>" );
//if ( tp[i]->m_synSrc )
// sb->pushChar('*');
sb->safeMemcpy_nospaces ( start + tp[i]->m_termOff ,
tp[i]->m_termLen );
/*
char *dateStr = "&nbsp;";
int32_t ddd = tp[i]->m_date;
uint8_t *tddd = (uint8_t *)&ddd;
char tbbb[32];
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
sprintf(tbbb,"evIds %"INT32"-%"INT32"",
(int32_t)tddd[1],(int32_t)tddd[0]);
dateStr = tbbb;
}
else if ( ddd )
dateStr = asctime ( gmtime(&ddd ));
char tmp[20];
if ( tp[i]->m_noSplit ) sprintf ( tmp,"<b>1</b>" );
else sprintf ( tmp,"0" );
*/
if ( isXml )
sb->safePrintf("]]></s>\n");
else
sb->safePrintf ( "</nobr></td>" );
if ( isXml )
sb->safePrintf("\t\t<wordPos>%"INT32"</wordPos>\n",
tp[i]->m_wordPos);
char *desc = NULL;
if ( tp[i]->m_descOff >= 0 )
desc = start + tp[i]->m_descOff;
// use hashgroup
int32_t hg = tp[i]->m_hashGroup;
if ( ! desc || ! strcmp(desc,"body") )
desc = getHashGroupString(hg);
if ( isXml && desc )
sb->safePrintf("\t\t<loc>%s</loc>\n", desc);
else if ( ! isXml ) {
if ( ! desc ) desc = "&nbsp;";
sb->safePrintf ( "<td>%s", desc );
char ss = tp[i]->m_synSrc;
if ( ss )
sb->safePrintf(" - %s",
getSourceString(ss));
sb->safePrintf("</td>");
}
int32_t dn = (int32_t)tp[i]->m_densityRank;
if ( isXml )
sb->safePrintf("\t\t<dens>%"INT32"</dens>\n",dn);
if ( ! isXml && dn >= MAXDENSITYRANK )
sb->safePrintf("<td>%"INT32"</td>\n",dn);
else if ( ! isXml )
sb->safePrintf("<td><font color=purple>%"INT32"</font>"
"</td>",dn);
// the diversityrank/wordspamrank
/*
int32_t ds = (int32_t)tp[i]->m_diversityRank;
if ( isXml )
sb->safePrintf("\t\t<div>%"INT32"</div>\n",ds);
if ( ! isXml && ds >= MAXDIVERSITYRANK )
sb->safePrintf("<td>%"INT32"</td>\n",ds);
else if ( ! isXml )
sb->safePrintf("<td><font color=green>%"INT32"</font>"
"</td>",ds);
*/
int32_t ws = (int32_t)tp[i]->m_wordSpamRank;
if ( isXml && hg == HASHGROUP_INLINKTEXT )
sb->safePrintf("\t\t<linkerSiteRank>%"INT32""
"</linkerSiteRank>\n",ws);
else if ( isXml )
sb->safePrintf("\t\t<spam>%"INT32"</spam>\n",ws);
if ( ! isXml && ws >= MAXWORDSPAMRANK )
sb->safePrintf("<td>%"INT32"</td>",ws);
else if ( ! isXml )
sb->safePrintf("<td><font color=red>%"INT32"</font></td>",
ws);
float score = 1.0;
// square this like we do in the query ranking algo
score *= getHashGroupWeight(hg) * getHashGroupWeight(hg);
//score *= getDiversityWeight(tp[i]->m_diversityRank);
score *= getDensityWeight(tp[i]->m_densityRank);
if ( tp[i]->m_synSrc ) score *= SYNONYM_WEIGHT;
if ( hg == HASHGROUP_INLINKTEXT ) score *= getLinkerWeight(ws);
else score *= getWordSpamWeight(ws);
if ( isXml )
sb->safePrintf("\t\t<score>%.02f</score>\n",score);
else
sb->safePrintf("<td>%.02f</td>\n",score);
if ( isXml )
sb->safePrintf("\t</term>\n");
else
sb->safePrintf("</tr>\n");
}
if ( isXml )
sb->safePrintf ("</response>\n" );
else
sb->safePrintf("</table><br>\n");
//
// END PRINT HASHES TERMS
//
return true;
}
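// page 6 of the doc-info viewer: spider stats are not implemented yet,
// so just print a placeholder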
bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
sb->safePrintf("<b>Coming Soon</b>");
return true;
}
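// page 7 of the doc-info viewer: dump the stored utf8 content as-is, or
// CDATA-wrapped inside an xml <response> when &xml=1 is given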
bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {
char **c = getUtf8Content();
if ( ! c ) return true; if ( c==(void *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
int32_t raw = hr->getLong("raw",0);
if ( ! isXml && ! raw ) printMenu ( sb );
if ( ! isXml ) {
// just copy it otherwise
if ( ptr_utf8Content )
sb->safeMemcpy ( ptr_utf8Content ,size_utf8Content -1);
return true;
}
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safePrintf("\t<utf8Content><![CDATA[");
if ( ptr_utf8Content )
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,
false);
sb->safePrintf("]]></utf8Content>\n");
// end xml response
sb->safePrintf("</response>\n");
return true;
}
// . get the possible titles of the root page
// . includes the title tag text
// . includes various inlink text
// . used to match the VERIFIED place name 1 or 2 of addresses on this
// site in order to set Address::m_flags's AF_VENUE_DEFAULT bit which
// indicates the address is the address of the website (a venue website)
char **XmlDoc::getRootTitleBuf ( ) {
// return if valid
if ( m_rootTitleBufValid ) return (char **)&m_rootTitleBuf;
// get it from the tag rec first
setStatus ( "getting root title buf");
// sanity check, root must have been indexed
//if ( ! m_sreq.m_rootIndexed ) { char *xx=NULL;*xx=0; }
// . update it first before reading it!
// . do not update it here, just update it in getTitleRec() because
// this makes doConsistencyCheck() block and core
//bool *status2 = updateSiteTitleBuf();
//if ( ! status2 || status2 == (void *)-1 ) return (char **)status2;
// get it from the tag rec if we can
TagRec *gr = getTagRec ();
if ( ! gr || gr == (void *)-1 ) return (char **)gr;
// clear this if not set from title rec
//if ( ! m_setFromTitleRec ) {
// ptr_siteTitleBuf = NULL;
// size_siteTitleBuf = 0;
//}
// PROBLEM: new title rec is the only thing which has sitetitles tag
// sometimes and we do not store that in the title rec. in this case
// we should maybe store ptr_siteTitleBuf/size_siteTitleBuf in the
// title rec?
Tag *tag = gr->getTag("roottitles");
char *src = NULL;
int32_t srcSize = 0;
if ( ptr_rootTitleBuf || m_setFromTitleRec ) {
src = ptr_rootTitleBuf;
srcSize = size_rootTitleBuf;
}
else if ( tag ) {
src = tag->getTagData();
srcSize = tag->getTagDataSize();
// no need to add to title rec since already in the tag so
// make sure we did not double add
if ( ptr_rootTitleBuf ) { char *xx=NULL;*xx=0; }
}
else {
// . get the root doc
// . allow for a one hour cache of the titleRec
XmlDoc **prd = getRootXmlDoc( 3600 );
if ( ! prd || prd == (void *)-1 ) return (char **)prd;
// shortcut
XmlDoc *rd = *prd;
// . if no root doc, then assume no root title
// . this happens if we are injecting because we do not want
// to download the root page for speed purposes
if ( ! rd ) {
m_rootTitleBuf[0] = '\0';
m_rootTitleBufSize = 0;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
}
// . ONLY do this if root doc was NOT set from titleRec to
// avoid that core in updateSiteTitleBuf(). this can happen
// if the root doc had no title! (or no content)
//if ( rd->m_setFromTitleRec ) {
// // emptyt
// m_siteTitleBuf[0] = '\0';
// // set the size of it
// m_siteTitleBufSize = 0;
// // validate it
// m_siteTitleBufValid = true;
// // return a ptr to it
// return (char **)&m_siteTitleBuf;
//}
// a \0 separated list
char **rtl = rd->getTitleBuf();
if ( ! rtl || rtl == (void *)-1 ) return (char **)rtl;
// ptr
src = rd->m_titleBuf;
srcSize = rd->m_titleBufSize;
}
int32_t max = (int32_t)ROOT_TITLE_BUF_MAX - 5;
// sanity
if ( srcSize >= max ) {
// truncate
srcSize = max;
// back up so we split on a space
for ( ; srcSize>0 && ! is_wspace_a(src[srcSize]); srcSize--);
// null term
src[srcSize] = '\0';
// include it
srcSize++;
}
// copy that over in case root is destroyed
gbmemcpy ( m_rootTitleBuf , src , srcSize );
m_rootTitleBufSize = srcSize;
// sanity check, must include the null in the size
if ( m_rootTitleBufSize > 0 &&
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
log("build: bad root titlebuf size not end in null char for "
"collnum=%i",(int)m_collnum);
ptr_rootTitleBuf = NULL;
size_rootTitleBuf = 0;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
char *xx=NULL;*xx=0;
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
//m_rootTitleBufSize++;
}
// sanity check - breach check
if ( m_rootTitleBufSize > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0;}
// serialize into our titlerec
ptr_rootTitleBuf = m_rootTitleBuf;
size_rootTitleBuf = m_rootTitleBufSize;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
}
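// . like getRootTitleBuf() but with punctuation collapsed so each
//   \0-separated entry is a clean candidate site name (see the
//   walmart.com example below)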
char **XmlDoc::getFilteredRootTitleBuf ( ) {
if ( m_filteredRootTitleBufValid )
return (char **)&m_filteredRootTitleBuf;
// get unfiltered. m_rootTitleBuf should be set from this call.
char **rtbp = getRootTitleBuf();
if ( ! rtbp || rtbp == (void *)-1 ) return (char **)rtbp;
/*
// assume none
m_filteredRootTitleBuf[0] = '\0';
m_filteredRootTitleBufSize = 0;
m_filteredRootTitleBufValid = true;
return (char **)&m_filteredRootTitleBuf;
*/
// filter all the punct to \0 so that something like
// "walmart.com : live better" is reduced to 3 potential
// names, "walmart", "com" and "live better"
char *src = m_rootTitleBuf;
char *srcEnd = src + m_rootTitleBufSize;
char *dst = m_filteredRootTitleBuf;
// save some room to add a \0, so subtract 5
char *dstEnd = dst + ROOT_TITLE_BUF_MAX - 5;
//char *src = tag->getTagData();
//char *srcEnd = src + tag->getTagDataSize();
int32_t size = 0;
bool lastWasPunct = true;
for ( ; src < srcEnd && dst < dstEnd ; src += size ) {
// set the char size
size = getUtf8CharSize(src);
// space?
if ( is_wspace_a (*src) ||
// allow periods too
*src=='.' ) {
// no back to back punct
if ( lastWasPunct ) continue;
// flag it
lastWasPunct = true;
// add it in
*dst++ = '.';
// that's it
continue;
}
// x'y or x-y
if ( ( *src == '\'' ||
*src == '.' ||
*src == '-' ) &&
! lastWasPunct &&
is_alnum_a(src[1]) ) {
// add it in
*dst++ = *src;
// that's it
continue;
}
// x & y is ok
if ( *src == '&' ) {
// assume not punct (stands for and)
lastWasPunct = false;
// add it in
*dst++ = *src;
// that's it
continue;
}
// store alnums right in
if ( is_alnum_a(*src) ) {
// flag it
lastWasPunct = false;
// copy it over
gbmemcpy ( dst , src , size );
// skip what we copied
dst += size;
continue;
}
// if the previous output char was a separator, turn it into a terminator
if ( lastWasPunct ) dst[-1] = '\0';
// store it
else *dst++ = '\0';
}
// make sure we end on a \0
if ( dst > m_filteredRootTitleBuf && dst[-1] != '\0' )
*dst++ = '\0';
// shortcut
char *str = m_filteredRootTitleBuf;
int32_t strSize = dst - m_filteredRootTitleBuf;
// copy that over in case root is destroyed
gbmemcpy ( m_filteredRootTitleBuf , str , strSize );
m_filteredRootTitleBufSize = strSize;
// sanity check, must include the null in the size
if ( m_filteredRootTitleBufSize > 0 &&
m_filteredRootTitleBuf [ m_filteredRootTitleBufSize - 1 ] ) {
char *xx=NULL;*xx=0;
//m_filteredRootTitleBuf [ m_filteredRootTitleBufSize-1]='\0';
//m_filteredRootTitleBufSize++;
}
// sanity check - breach check
if ( m_filteredRootTitleBufSize > ROOT_TITLE_BUF_MAX ) {
char *xx=NULL;*xx=0;}
m_filteredRootTitleBufValid = true;
// make this static so we can return its address (a local's address
// would dangle); re-assign it on every call so it tracks this instance
static char *fp;
fp = m_filteredRootTitleBuf;
return (char **)&fp;
//return (char **)&m_filteredRootTitleBuf;
}
//static bool s_dummyBool = 1;
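// holds one inlink's link text plus a score/hash; used by getTitleBuf()
// below to pick candidate titles for a root page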
class Binky {
public:
char *m_text;
int32_t m_textLen;
int32_t m_score;
int64_t m_hash;
};
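// comparator for gbsort(): ascending by m_score. note the sort call in
// getTitleBuf() is currently commented out.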
int cmpbk ( const void *v1, const void *v2 ) {
Binky *b1 = (Binky *)v1;
Binky *b2 = (Binky *)v2;
return b1->m_score - b2->m_score;
}
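// . builds a \0-separated list of candidate titles for a root page: the
//   <title> tag text, intended to be followed by de-duped inlink link text
// . NOTE: the per-inlink hashing below is commented out, so every
//   bk[i].m_hash stays 0 and the dedup loop zeroes every score, which
//   means only the <title> text actually makes it into m_titleBuf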
char **XmlDoc::getTitleBuf ( ) {
if ( m_titleBufValid ) return (char **)&m_titleBuf;
// recalc this every time the root page is indexed
setStatus ( "getting title buf on root");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
// this should only be called on the root!
// . if the site changed for us, but the title rec of what we
// think is now the root thinks that it is not the root because
// it is using the old site, then it cores here!
// . i.e. if the new root is www.xyz.com/user/ted/ and the old root
// is www.xyz.com then and the old root is stored in ptr_site for
// the title rec for www.xyz.com/user/ted/ then we core here,
// . so take this sanity check out
// . but if the title rec does not think he is the site root yet
// then just wait until he does so we can get his
// ptr_rootTitleBuf below
if ( ! *isRoot ) {
m_titleBuf[0] = '\0';
m_titleBufSize = 0;
m_titleBufValid = true;
return (char **)&m_titleBuf;
}
// sanity check
if ( m_setFromTitleRec ) {
gbmemcpy(m_titleBuf, ptr_rootTitleBuf, size_rootTitleBuf );
m_titleBufSize = size_rootTitleBuf;
m_titleBufValid = true;
return (char **)&m_titleBuf;
}
char *mysite = getSite();
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
// get link info first
LinkInfo *info1 = getLinkInfo1();
// error or blocked
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char **)info1;
// sanity check
Xml *xml = getXml();
// return -1 if it blocked
if ( xml == (void *)-1 ) return (char **)-1;
// set up for title
int32_t tlen ;
char *title ;
// on error, ignore it to avoid hammering the root!
if ( xml == (void *)NULL ) {
// log it
log("build: error downloading root xml: %s",
mstrerror(g_errno));
// clear it
g_errno = 0;
// make it 0
tlen = 0;
title = NULL;
}
else {
// get the title
title = m_xml.getTextForXmlTag ( 0,
999999 ,
"title" ,
&tlen ,
true ); // skip leading spaces
}
// truncate to 100 chars
//for ( ; tlen>0 && (tlen > 100 || is_alnum_a(title[tlen])) ; tlen-- )
// if ( tlen == 0 ) break;
if ( tlen > 100 ) {
char *tpend = title + 100;
char *prev = getPrevUtf8Char ( tpend , title );
// make that the end so we don't split a utf8 char
tlen = prev - title;
}
// store tag in here
char tmp[1024];
// point to it
char *ptmp = tmp;
// set this
char *pend = tmp + 1024;
// add that in
gbmemcpy ( ptmp, title, tlen); ptmp += tlen;
// null terminate it
*ptmp++ = '\0';
// only let the first couple of internal inlinks contribute
int32_t internalCount = 0;
// count inlinkers
int32_t linkNum = 0;
Binky bk[1000];
// init this
//char stbuf[2000];
//HashTableX scoreTable;
//scoreTable.set(8,4,64,stbuf,2000,false,m_niceness,"xmlscores");
// scan each link in the link info
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
// do not breach
if ( linkNum >= 1000 ) break;
// is this inlinker internal?
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->getLinkText();
// skip corrupted
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 4 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
// store these
// zero out hash
bk[linkNum].m_hash = 0;
bk[linkNum].m_text = txt;
bk[linkNum].m_textLen = tlen;
bk[linkNum].m_score = 0;
// internal count
if ( internal && ++internalCount >= 3 ) continue;
// it's good
bk[linkNum].m_score = 1;
linkNum++;
/*
// set into words
Words w;
// return NULL on error with g_errno set
if ( ! w.setx ( txt , tlen , m_niceness ) ) return NULL;
// int16_tcut
int64_t *wids = w.getWordIds();
// init hash
int64_t h = 0LL;
// hash all words together
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
// skip if not hashable
if ( ! wids[i] ) continue;
// mix it up
h <<= 1LL;
// xor it in
h ^= wids[i];
}
// update hash
bk[linkNum].m_hash = h;
// store in table, return NULL with g_errno set on error
if ( ! scoreTable.addTerm ( &h ) ) return NULL;
*/
}
// init this
char dtbuf[1000];
HashTableX dupTable;
dupTable.set(8,0,64,dtbuf,1000,false,m_niceness,"xmldup");
// now set the scores and isdup
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
// skip if ignored
if ( bk[i].m_score == 0 ) continue;
// get hash
int64_t h = bk[i].m_hash;
// assume a dup
bk[i].m_score = 0;
// skip if zero'ed out
if ( ! h ) continue;
// only do each hash once!
if ( dupTable.isInTable(&h) ) continue;
// add to it. return NULL with g_errno set on error
if ( ! dupTable.addKey(&h) ) return NULL;
// is it in there?
bk[i].m_score = 1; // scoreTable.getScore ( &h );
}
// now sort the bk array by m_score
//gbsort ( bk , linkNum , sizeof(Binky), cmpbk , m_niceness );
// sanity check - make sure sorted right
//if ( linkNum >= 2 && bk[0].m_score < bk[1].m_score ) {
// char *xx=NULL; *xx=0; }
// . now add the winners to the buffer
// . skip if score is 0
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
// skip if score is zero
if ( bk[i].m_score == 0 ) continue;
// skip if too big
if ( bk[i].m_textLen + 1 > pend - ptmp ) continue;
// store it
gbmemcpy ( ptmp , bk[i].m_text , bk[i].m_textLen );
// advance
ptmp += bk[i].m_textLen;
// null terminate it
*ptmp++ = '\0';
}
// sanity
int32_t size = ptmp - tmp;
if ( size > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0; }
gbmemcpy ( m_titleBuf , tmp , ptmp - tmp );
m_titleBufSize = size;
m_titleBufValid = true;
// ensure null terminated
if ( size > 0 && m_titleBuf[size-1] ) { char *xx=NULL;*xx=0; }
//ptr_siteTitleBuf = m_siteTitleBuf;
//size_siteTitleBuf = m_siteTitleBufSize;
return (char **)&m_titleBuf;
}
// . now we just get all the tagdb rdb recs to add using this function
// . then we just use the metalist to update tagdb
SafeBuf *XmlDoc::getNewTagBuf ( ) {
if ( m_newTagBufValid ) return &m_newTagBuf;
setStatus ( "getting new tags");
int32_t *ic = getIndexCode();
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
// get our ip
int32_t *ip = getIp();
// this must not block to avoid re-computing "addme" above
if ( ip == (void *)-1 ) { char *xx=NULL;*xx=0; }
if ( ! ip || ip == (int32_t *)-1) return (SafeBuf *)ip;
// . do not bother if there is a problem
// . otherwise if our ip is invalid (0 or 1) we core in
// getNumSiteInlinks() which requires a valid ip
// . if it is robots.txt disallowed, then indexCode will be set, but we
// still want to cache our sitenuminlinks in tagdb! delicious.com was
// recomputing the sitelinkinfo each time because we were not storing
// these tags in tagdb!!
if ( ! *ip || *ip == -1 ) { // *ic ) {
m_newTagBuf.reset();
m_newTagBufValid = true;
return &m_newTagBuf;
}
// get the tags already in tagdb
TagRec *gr = getTagRec ( );
if ( ! gr || gr == (void *)-1 ) return (SafeBuf *)gr;
// get our site
char *mysite = getSite();
// this must not block to avoid re-computing "addme" above
if ( mysite == (void *)-1 ) { char *xx=NULL;*xx=0; }
if ( ! mysite || mysite == (char *)-1 ) return (SafeBuf *)mysite;
// age of tag in seconds
int32_t timestamp;
// always just use the primary tagdb so we can cache our sitenuminlinks
char rdbId = RDB_TAGDB;
//if ( m_useSecondaryRdbs ) rdbId = RDB2_TAGDB2;
//else rdbId = RDB_TAGDB;
// sitenuminlinks special for repair
if ( m_useSecondaryRdbs &&
// and not rebuilding titledb
! m_useTitledb ) {
m_newTagBuf.reset();
m_newTagBufValid = true;
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,&timestamp);
if ( old1 == m_siteNumInlinks &&
old1 != -1 &&
! m_updatingSiteLinkInfoTags )
return &m_newTagBuf;
int32_t now = getTimeGlobal();
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"",
mysite,m_siteNumInlinks);
if ( ! m_newTagBuf.addTag2(mysite,"sitenuminlinks",now,
"xmldoc",
*ip,m_siteNumInlinks,rdbId) )
return NULL;
return &m_newTagBuf;
}
// if doing consistency check, this buf is for adding to tagdb
// so just ignore those. we use ptr_tagRecData in getTagRec() function
// but this is really for updating tagdb.
if ( m_doingConsistencyCheck ) {
m_newTagBuf.reset();
m_newTagBufValid = true;
return &m_newTagBuf;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
char *isIndexed = getIsIndexed();
if ( !isIndexed || isIndexed==(char *)-1 ) return (SafeBuf *)isIndexed;
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (SafeBuf *)isRoot;
int32_t *siteNumInlinks = getSiteNumInlinks();
if ( ! siteNumInlinks ) return NULL;
if ( siteNumInlinks == (int32_t *)-1) return (SafeBuf *)-1;
// ok, get the sites of the external outlinks and they must
// also be NEW outlinks, added to the page since the last time
// we spidered it...
Links *links = getLinks ();
if ( ! links || links == (Links *)-1 ) return (SafeBuf *)links;
// our next slated spider priority
char *spiderLinks = getSpiderLinks();
if ( ! spiderLinks || spiderLinks == (char *)-1 )
return (SafeBuf *)spiderLinks;
// . get ips of all outlinks.
// . use m_msgeForIps class just for that
// . it sucks if the outlink's ip is a dns timeout, then we never
// end up being able to store it in tagdb, that is why when
// rebuilding we need to skip adding firstip tags for the outlinks
int32_t **ipv = NULL;
TagRec ***grv = NULL;
bool addLinkTags = true;
if ( ! *spiderLinks ) addLinkTags = false;
if ( ! m_useSpiderdb ) addLinkTags = false;
if ( addLinkTags ) {
ipv = getOutlinkFirstIpVector ();
if ( ! ipv || ipv == (void *)-1 ) return (SafeBuf *)ipv;
// . uses m_msgeForTagRecs for this one
grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
}
// get root langid of root page
uint8_t *rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (SafeBuf *)hci;
// get the address class
Addresses *aa = getAddresses ();
if ( ! aa || aa == (Addresses *)-1 ) return (SafeBuf *)aa;
// get comma separated list of email address on page
char *emails = getEmailBuf ( );
if ( ! emails || emails == (void *)-1 ) return (SafeBuf *)emails;
#ifdef _USETURKS_
//HashTableX *tvt = getTurkVotingTable ();
//if ( ! tvt || tvt == (void *)-1 ) return (SafeBuf *)tvt;
#endif
//
// init stuff
//
// . this gets the root doc and and parses titles out of it
// . sets our m_rootTitleBuf/m_rootTitleBufSize
char **rtbufp = getRootTitleBuf();
if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf *)rtbufp;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// overwrite "getting root title buf" status
setStatus ("computing new tags");
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tags for mysite=%s",mysite);
// shortcut
//TagRec *tr = &m_newTagRec;
// current time
int32_t now = getTimeGlobal();
// actually, use spider download time if we can. that way
// Test.cpp's injection runs will be more consistent!
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
// store tags into here
SafeBuf *tbuf = &m_newTagBuf;
// allocate space to hold the tags we will add
Tag *tag;
int32_t need = 512;
// add in root title buf in case we add it too
need += m_rootTitleBufSize;
// reserve it all now
if ( ! tbuf->reserve(need) ) return NULL;
//
// add root langid if we need to
//
char *oldrl = gr->getString("rootlang",NULL,&timestamp);
// assume no valid id
int32_t oldrlid = -99;
// convert to id
if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
// if not in old tag, or changed from what was in tag, or it has
// been 10 days or more, then update tagdb with this tag.
bool addRootLang = false;
if ( ! oldrl ) addRootLang = true;
if ( oldrlid != *rl ) addRootLang = true;
if ( now-timestamp > 10*86400 ) addRootLang = true;
// injects do not download the root doc for speed reasons, so do not
// bother for them unless the doc itself is the root.
if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
// . get the two letter (usually) language code from the id
// . i think the two chinese languages are 5 letters
char *newrl = NULL;
if ( addRootLang )
// i've seen this return NULL because *rl is a corrupt 215
// for some reason
newrl = getLanguageAbbr( *rl );
if ( newrl )
tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
//
// add hascontactinfo if we need to
//
int32_t oldhci = gr->getLong("hascontactinfo",-1,NULL,&timestamp);
if ( oldhci == -1 || oldhci != *hci || now-timestamp > 10 *86400 ) {
char *val = "0";
if ( m_hasContactInfo ) val = "1";
tbuf->addTag3 (mysite,"hascontactinfo",now,"xmldoc",*ip,val,
rdbId);
}
//
// add "site" tag
//
char *oldsite = gr->getString("site",NULL);
if ( ! oldsite || strcmp(oldsite,mysite) || now-timestamp > 10*86400)
tbuf->addTag3(mysite,"site",now,"xmldoc",*ip,mysite,rdbId);
//
// add firstip if not there at all
//
char *oldfip = gr->getString("firstip",NULL);
// convert it
int32_t ip3 = 0;
if ( oldfip ) ip3 = atoip(oldfip);
// if not there or if bogus, add it!! should override bogus firstips
if ( ! ip3 || ip3 == -1 ) {
char *ipstr = iptoa(m_ip);
//if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0; }
//int32_t iplen = gbstrlen(ipstr);
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
tbuf->addTag3(mysite,"firstip",now,"xmldoc",*ip,ipstr,
rdbId);
}
//if ( strncmp(m_firstUrl.m_url,"http://delicious.com/",21)==0 )
// log("boo");
// sitenuminlinks
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,&timestamp);
if ( old1 == -1 || old1 != m_siteNumInlinks ||
m_updatingSiteLinkInfoTags ) {
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"",
mysite,m_siteNumInlinks);
if ( ! tbuf->addTag2(mysite,"sitenuminlinks",now,"xmldoc",
*ip,m_siteNumInlinks,rdbId) )
return NULL;
}
//int32_t old2, old3, old4;
// if running for diffbot crawlbot then isCustomCrawl is true
// so do not update the siteinlink info already in tagdb since i
// imported it from my main collection. we do not want to overwrite it.
// NO, because for single site crawls we bottleneck on msg25
// when there are millions of urls. we only skip this
// for the global-index and if already in tagdb!
// No, let's just not invalidate the sitenuminlinks* tags
// in XmlDoc::getSiteNumInlinks()
//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks;
// sitenuminlinksfresh
// old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,&timestamp);
// if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
// now,"xmldoc",
// *ip,m_siteNumInlinksUniqueIp,rdbId))
// return NULL;
// // sitepop
// old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
// &timestamp);
// if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
// now,"xmldoc",
// *ip,m_siteNumInlinksUniqueCBlock,rdbId))
// return NULL;
// // total site inlinks
// old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
// &timestamp);
// if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
// now,"xmldoc",
// *ip,m_siteNumInlinksTotal,rdbId))
// return NULL;
// skipSiteInlinks:
// get root title buf from old tag
char *data = NULL;
int32_t dsize = 0;
Tag *rt = gr->getTag("roottitles");
if ( rt ) {
data = rt->getTagData();
dsize = rt->getTagDataSize();
}
bool addRootTitle = false;
// store the root title buf if we need to. if we had no tag yet...
if ( ! rt )
addRootTitle = true;
// or if differs in size
else if ( dsize != m_rootTitleBufSize )
addRootTitle = true;
// or if differs in content
else if ( memcmp(data,m_rootTitleBuf,m_rootTitleBufSize))
addRootTitle =true;
// or if it is 10 days old or more
if ( now-timestamp > 10*86400 ) addRootTitle = true;
// but not if injected
if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false;
// add it then
if ( addRootTitle &&
! tbuf->addTag(mysite,"roottitles",now,"xmldoc",
*ip,m_rootTitleBuf,m_rootTitleBufSize,
rdbId,true) )
return NULL;
//
// add the VENUEADDRESS tags
//
// init the dedup table so we do not add the same address many times
char dtbuf[1000];
HashTableX dt;
dt.set(8,0,32,dtbuf,1000,false,m_niceness,"xmldt");
// reset counts
int32_t numContactAddressTags = 0;
int32_t numContactEmailTags = 0;
int32_t tagType2 = getTagTypeFromStr ( "contactaddress" );
int32_t tagType3 = getTagTypeFromStr ( "contactemails" );
// before we add the sitevenue to the tagrec let's make sure it is
// not a dedup.. i.e. that we do not already have this address
// in there.
int32_t tagType = getTagTypeFromStr ( "venueaddress" );
// start at the first tag
tag = gr->getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
// count current contact addresses we have
if ( tag->m_type == tagType2 ) numContactAddressTags++;
if ( tag->m_type == tagType3 ) numContactEmailTags++;
// skip if not a venueaddress tag
if ( tag->m_type != tagType ) continue;
// point to the serialized address
char *data = tag->getTagData();
// get that address hash i guess
uint64_t ah = getHashFromAddr ( data );
// add to dedup table - return NULL with g_errno set on error
if ( ! dt.addKey ( &ah ) ) return NULL;
}
int32_t na = aa->getNumAddresses();
// add up to 10 for now
for ( int32_t i = 0 ; i < na ; i++ ) {
// get it
Address *a = (Address *)aa->m_am.getPtr(i);
// check if venue
if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue;
// must have street on the page, not pointing into a tagrec
// from tagdb... otherwise we keep re-adding
if ( a->m_street->m_a < 0 ) continue;
// dedup against the addresses already in tagdb for the
// venueaddress tag. TODO: can we use the dc[] array from
// Address.cpp? we would need another set of bit flags for the
// address class.
if ( dt.isInTable ( &a->m_hash ) ) continue;
// sanity
if ( a->m_hash == 0 ) { char *xx=NULL;*xx=0; }
// . serialize it
// . TODO: get rid of Address::addToTagRec() functions
char abuf[5000];
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
// store in safebuf of tags
if ( ! tbuf->addTag3 (mysite,"venueaddress",now,"xmldoc",
*ip,abuf,rdbId) ) return NULL;
// only add once
if ( ! dt.addKey (&a->m_hash) ) return NULL;
}
//
//
// contact info stuff
//
//
// ensure m_numContactAddresses etc. are valid
Address **ca = getContactAddresses();
// blocked?
if ( ! ca || ca == (void *)-1 ) return (SafeBuf *)ca;
// do not do this for root if multiple addresses. this
// fixes http://obits.abqjournal.com/
if ( *isRoot && aa->m_uniqueStreetHashes > 1 ) na = 0;
// do not store more than 2 contact addresses, or 2 contact emails
// to avoid tagdb bloat. and also because we do not need that many.
// . store contact address if we had one
// . this is a buffer of Address ptrs
for ( int32_t i = 0 ; i < m_numContactAddresses ; i++ ) {
// stop on breach
if ( numContactAddressTags >= 2 ) break;
// inc it
numContactAddressTags++;
// breathe
QUICKPOLL(m_niceness);
// get it
Address *a = ca[i];
// . serialize it
// . TODO: get rid of Address::addToTagRec() functions
char abuf[5000];
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
// store in safebuf of tags
if ( ! tbuf->addTag3 (mysite,"contactaddress",now,"xmldoc",
*ip,abuf,rdbId) ) return NULL;
}
// . add email addresses and submission forms to tag
// . this does not block, so make sure only called once!
// . contact emails. comma separated list
if ( emails && numContactEmailTags <= 1 ) {
numContactEmailTags++;
if ( ! tbuf->addTag3 (mysite,"contactemails",now,"xmldoc",
*ip,emails,rdbId) ) return NULL;
}
//
//
// NOW add tags for our outlinks
//
//
bool oldHighQualityRoot = true;
// if we are new, do not add anything, because we only add a tagdb
// rec entry for "new" outlinks that were added to the page since
// the last time we spidered it
if ( ! *isIndexed ) oldHighQualityRoot = false;
// special tags for google search results pages for scraping
char inGoogle = false;
if ( strstr(mysite,"google.com") ) inGoogle = true;
// no updating if we are not root
if ( ! inGoogle && ! *isRoot ) oldHighQualityRoot = false;
// must be high quality, too
if ( ! inGoogle && *siteNumInlinks < 500 ) oldHighQualityRoot = false;
// . if we are a google url then add tags for each outlink!
// . more google special tags to replace Scraper.cpp
char *fu = m_firstUrl.getUrl();
//char *name = NULL;
bool inGoogleBlogs = false;
bool inGoogleNews = false;
if ( ! strncmp ( fu , "http://www.google.com/blogsearch?", 33 ) )
inGoogleBlogs = true;
if ( ! strncmp ( fu , "http://blogsearch.google.com/blogsearch?", 40 ))
inGoogleBlogs = true;
if ( ! strncmp ( fu , "http://news.google.com/", 23 ))
inGoogleNews = true;
// only do once per site
char buf[1000];
HashTableX ht; ht.set (4,0,-1 , buf , 1000 ,false,m_niceness,"sg-tab");
// get site of outlink
SiteGetter siteGetter;
// . must be from an EXTERNAL DOMAIN and must be new
// . we should already have its tag rec, if any, since we have msge
int32_t n = links->getNumLinks();
// not if not spidering links
if ( ! addLinkTags ) n = 0;
// get the flags
linkflags_t *flags = links->m_linkFlags;
// scan all outlinks we have on this page
for ( int32_t i = 0 ; i < n ; i++ ) {
// get its tag rec
TagRec *gr = (*grv)[i];
// does this hostname have a "firstIp" tag?
char *ips = gr->getString("firstip",NULL);
bool skip = false;
// skip if we are not "old" high quality root
if ( ! oldHighQualityRoot ) skip = true;
// . skip if not external domain
// . we added this above, so just "continue"
if ( flags[i] & LF_SAMEDOM ) continue;//skip = true;
// skip links in the old title rec
if ( flags[i] & LF_OLDLINK ) skip = true;
// skip if determined to be link spam! should help us
// with the text ads we hate so much
if ( links->m_spamNotes[i] ) skip = true;
// if we should skip, and they have firstip already...
if ( skip && ips ) continue;
// get the normalized url
char *url = links->getLinkPtr(i);
// get the site. this will not block or have an error.
siteGetter.getSite(url,gr,timestamp,cr->m_collnum,m_niceness);
// these are now valid and should reference into
// Links::m_buf[]
char *site = siteGetter.m_site;
int32_t siteLen = siteGetter.m_siteLen;
int32_t linkIp = (*ipv)[i];
// get site hash
uint32_t sh = hash32 ( site , siteLen );
// ensure site is unique
if ( ht.getSlot ( &sh ) >= 0 ) continue;
// add it. returns false and sets g_errno on error
if ( ! ht.addKey ( &sh ) ) return NULL;
// . need to add firstip tag for this link's subdomain?
// . this was in Msge1.cpp but now we do it here
if ( ! ips && linkIp && linkIp != -1 ) {
// make it
char *ips = iptoa(linkIp);
if (!tbuf->addTag3(site,"firstip",now,"xmldoc",*ip,ips,
rdbId))
return NULL;
}
if ( skip ) continue;
// if outlink is a .gov or .edu site, do not bother, because
// getIsSpam() always returns false for those
// TODO: verify this
//if ( flags[i] & LF_EDUTLD ) continue;
//if ( flags[i] & LF_GOVTLD ) continue;
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
//int32_t timestamp = m_spideredTime;
// how much avail for adding tags?
int32_t avail = tbuf->getAvail();
// reserve space
int32_t need = 512;
// make sure enough
if ( need > avail && ! tbuf->reserve ( need ) ) return NULL;
// add tag for this outlink
if ( inGoogle ) {// && ! gr->getTag("ingoogle") ) {
if ( ! tbuf->addTag(site,"ingoogle",now,"xmldoc",
*ip,"1",2,rdbId,true) )
return NULL;
}
if ( inGoogleBlogs && //! gr->getTag("ingoogleblogs") &&
!tbuf->addTag(site,"ingoogleblogs",now,"xmldoc",*ip,"1",2,
rdbId,true))
return NULL;
if ( inGoogleNews && //! gr->getTag("ingooglenews") &&
!tbuf->addTag(site,"ingooglenews",now,"xmldoc",*ip,"1",2,
rdbId,true))
return NULL;
// link is linked to by a high quality site! 500+ inlinks.
if ( gr->getNumTagTypes("authorityinlink") < 5 &&
! tbuf->addTag(site,"authorityinlink",now,"xmldoc",
*ip,"1",2,rdbId,true) )
return NULL;
}
m_newTagBufValid = true;
return &m_newTagBuf;
}
//
//
// BEGIN OLD SPAM.CPP class
//
//
#define WTMPBUFSIZE (MAX_WORDS *21*3)
// . RULE #28, repetitive word/phrase spam detector
// . sets the "spam" member of each word from 0 (no spam) to 100 (100% spam)
// . "bits" describe each word in phrasing terminology
// . if more than maxPercent of the words are spammed to some degree then we
// consider all of the words to be spammed, and give each word the minimum
// score possible when indexing the document.
// . returns NULL and sets g_errno on error
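// . illustrative example (hypothetical counts): if a doc has 200 distinct
//   candidate words and 60 of them get flagged by setSpam(), that is 30%,
//   which exceeds maxPercent (25 here), so every indexable word is bumped
//   to 99% spam and m_totallySpammed is set
// . the returned vector holds ranks, not percents: each entry is converted
//   to 0..MAXWORDSPAMRANK below, where MAXWORDSPAMRANK means "no spam"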
char *XmlDoc::getWordSpamVec ( ) {
if ( m_wordSpamBufValid ) {
char *wbuf = m_wordSpamBuf.getBufStart();
if ( ! wbuf ) return (char *)0x01;
return wbuf;
}
setStatus("getting word spam vec");
// assume not the repeat spammer
m_isRepeatSpammer = false;
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (char *)words;
m_wordSpamBuf.purge();
int32_t nw = words->getNumWords();
if ( nw <= 0 ) {
m_wordSpamBufValid = true;
return (char *)0x01;
}
Phrases *phrases = getPhrases ();
if ( ! phrases || phrases == (void *)-1 ) return (char *)phrases;
Bits *bits = getBits();
if ( ! bits ) return (char *)NULL;
m_wordSpamBufValid = true;
//if ( m_isLinkText ) return true;
//if ( m_isCountTable ) return true;
// shortcuts
//Words *words = m_words;
//Bits *bits = m_bits;
// if 20 words totally spammed, call it all spam?
m_numRepeatSpam = 20;
// shortcut
int32_t sni = m_siteNumInlinks;
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// set "m_maxPercent"
int32_t maxPercent = 6;
if ( sni > 10 ) maxPercent = 8;
if ( sni > 30 ) maxPercent = 10;
if ( sni > 100 ) maxPercent = 20;
if ( sni > 500 ) maxPercent = 30;
// fix this a bit so we're not always totally spammed
maxPercent = 25;
// assume not totally spammed
m_totallySpammed = false;
// get # of words we have to set spam for
int32_t numWords = words->getNumWords();
// set up the size of the hash table (number of buckets)
int32_t size = numWords * 3;
// . add a tmp buf as a scratch pad -- will be freed right after
// . allocate this second to avoid mem fragmentation more
// . * 2 for double the buckets
char tmpBuf [ WTMPBUFSIZE ];
char *tmp = tmpBuf;
int32_t need = (numWords * 21) * 3 + numWords;
if ( need > WTMPBUFSIZE ) {
tmp = (char *) mmalloc ( need , "Spam" );
if ( ! tmp ) {
log("build: Failed to allocate %"INT32" more "
"bytes for spam detection: %s.",
need,mstrerror(g_errno));
return NULL;
}
}
QUICKPOLL(m_niceness);
// set up ptrs
char *p = tmp;
// first this
unsigned char *spam = (unsigned char *)p; p += numWords ;
// . this allows us to make linked lists of indices of words
// . i.e. next[13] = 23--> word #23 FOLLOWS word #13 in the linked list
int32_t *next = (int32_t *)p; p += size * 4;
// hash of this word's stem (or word itself if useStem if false)
int64_t *bucketHash = (int64_t *)p; p += size * 8;
// that word's position in document
int32_t *bucketWordPos = (int32_t *)p; p += size * 4;
// profile of a word
int32_t *profile = (int32_t *)p; p += size * 4;
// is it a common word?
char *commonWords = (char *)p; p += size * 1;
// sanity check
if ( p - tmp > need ) { char *xx=NULL;*xx=0; }
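// illustrative sizing (hypothetical numWords of 1000): size is 3000, so
// the scratch space is 1000 (spam) + 12000 (next) + 24000 (bucketHash) +
// 12000 (bucketWordPos) + 12000 (profile) + 3000 (commonWords) = 64000
// bytes, which matches need = 1000*21*3 + 1000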
// clear all our spam percentages for these words
memset ( spam , 0 , numWords );
int32_t np;
// clear the hash table
int32_t i;
for ( i = 0 ; i < size ; i++ ) {
bucketHash [i] = 0;
bucketWordPos[i] = -1;
commonWords [i] = 0;
}
// count position since Words class can now have tags in it
//
//int32_t pos = 0;
//bool usePos = false;
//if ( words->m_tagIds ) usePos = true;
int64_t *wids = words->getWordIds();
// . loop through each word
// . hash their stems and place in linked list
// . if no stemming then don't do stemming
for ( i = 0 ; i < numWords ; i++ ) {
// . skip punctuation
// . this includes tags now , too i guess
//if ( words->isPunct(i) ) continue;
if ( wids[i] == 0 ) continue;
// skip if will not be indexed cuz score is too low
//if ( wscores && wscores[i] <= 0 ) continue;
QUICKPOLL(m_niceness);
// TODO: get phrase stem if stemming is on
// store the phrase stem of this word into the buffer
// blen = words->getPhraseStem(i,buf,100);
// if (blen<=0) continue;
// get the hash of the ith word
int64_t h = words->getWordId(i);
// use secondary wordId if available
//if ( words->getStripWordId(i) )
// h = words->getStripWordId(i);
// "j" is the bucket index
int32_t j = (uint64_t)h % size;
// make sure j points to the right bucket
while (bucketHash[j]) {
if ( h == bucketHash[j] ) break;
if (++j == size) j = 0;
}
// if this bucket is occupied by a word then replace it but
// make sure it adds onto the "linked list"
if (bucketHash[j]) {
// if Words class contain tags as words, do this
//if ( usePos ) {
// next [pos] = bucketWordPos[j];
// bucketWordPos[ j] = pos++;
//}
//else {
// add onto linked list for the ith word
next[i] = bucketWordPos[j];
// replace bucket with index to this word
bucketWordPos[j] = i;
//}
}
// otherwise, we have a new occurrence of this word
else {
bucketHash [j] = h;
// if Words class contain tags as words, do this
//if ( usePos ) {
// bucketWordPos[ j] = pos++;
// next [pos] = -1;
//}
//else {
// store our position # (i) in bucket
bucketWordPos[j] = i;
// no next occurrence of the ith word yet
next[i] = -1;
//}
}
// if stop word or number then mark it
if ( bits->isStopWord(i) ) commonWords[j] = 1;
if ( words->isNum ( i ) ) commonWords[j] = 1;
}
// count distinct candidates that had spam and did not have spam
int32_t spamWords = 0;
int32_t goodWords = 0;
// . now cruise down the hash table looking for filled buckets
// . grab the linked list of indices and make a "profile"
for ( i = 0 ; i < size ; i++ ) {
// skip empty buckets
if (bucketHash[i] == 0) continue;
np=0;
// word #j is in bucket #i
int32_t j = bucketWordPos[i];
// . cruise down the linked list for this word
while ( j!=-1) {
// store position of occurence of this word in profile
profile [ np++ ] = j;
// get the position of next occurence of this word
j = next[ j ];
}
// if 2 or fewer occurrences of this word, don't check for spam
if ( np < 3 ) { goodWords++; continue; }
//
// set m_isRepeatSpammer
//
// look for a word repeated in phrases, in a big list,
// where each phrase is different
//
int32_t max = 0;
int32_t count = 0;
int32_t knp = np;
// must be 3+ letters, not a stop word, not a number
if ( words->m_wordLens[profile[0]] <= 2 || commonWords[i] )
knp = 0;
// scan to see if they are a tight list
for ( int32_t k = 1 ; k < knp ; k++ ) {
// breathe
QUICKPOLL(m_niceness);
// are they close together? if not, bail
if ( profile[k-1] - profile[k] >= 25 ) {
count = 0;
continue;
}
// otherwise inc it
count++;
// must have another word in between or tag
int32_t a = profile[k];
int32_t b = profile[k-1];
bool gotSep = false;
bool inLink = false;
for ( int32_t j = a+1 ; j <b ; j++ ) {
// if in link do not count, chinese spammer
// does not have his crap in links
if ( words->m_words[j][0] == '<' &&
words->m_wordLens[j]>=3 ) {
// get the next char after the <
char nc;
nc=to_lower_a(words->m_words[j][1]);
// now check it for anchor tag
if ( nc == 'a' ) {
inLink = true; break; }
}
if ( words->m_words[j][0] == '<' )
gotSep = true;
if ( is_alnum_a(words->m_words[j][0]) )
gotSep = true;
}
// . the chinese spammer always has a separator,
// usually another tag
// . and fix "BOW BOW BOW..." which has no separators
if ( ! gotSep ) count--;
else if ( inLink ) count--;
// get the max
if ( count > max ) max = count;
}
// a count of 50 such monsters indicates the chinese spammer
if ( max >= 50 )
m_isRepeatSpammer = true;
//
// end m_isRepeatSpammer detection
//
// . determine the probability this word was spammed by looking
//   at the distribution of its positions in the document
// . sets "spam" member of each word in this profile
// . don't check if word occurred 2 or fewer times
// . TODO: what about TORA! TORA! TORA!
// . returns true if 1+ occurrences were considered spam
QUICKPOLL(m_niceness);
bool isSpam = setSpam ( profile , np , numWords , spam );
// don't count stop words or numbers towards this threshold
if ( commonWords[i] ) continue;
// tally them up
if ( isSpam ) spamWords++;
else goodWords++;
}
// what percent of distinct candidate words were spammed?
int32_t totalWords = spamWords + goodWords;
// if there are no or very few candidate words, skip the percentage check
int32_t percent;
if ( totalWords <= 10 ) goto done;
percent = ( spamWords * 100 ) / totalWords;
// if more than maxPercent of the words were spammed, punish everybody now to 100% spam
// if we had < 100 candidates and < 20% spam, don't bother
//if ( percent < 5 ) goto done;
if ( percent <= maxPercent ) goto done;
// set flag so linkspam.cpp can see if all is spam and will not allow
// this page to vote
m_totallySpammed = true;
// now only set to 99 so each singleton usually gets hashed
for ( i = 0 ; i < numWords ; i++ )
if ( words->getWordId(i) && spam[i] < 99 )
spam[i] = 99;
done:
// update the weights for the words
//for ( i = 0 ; i < numWords ; i++ ) {
// m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
//}
// TODO: use the min word spam algo as in Phrases.cpp for this!
//for ( i = 0 ; i < numWords ; i++ ) {
// m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100;
//}
// convert from percent spammed into rank.. from 0 to 10 i guess
for ( i = 0 ; i < numWords ; i++ )
spam[i] = (MAXWORDSPAMRANK * (100 - spam[i])) / 100;
// copy into our buffer
if ( ! m_wordSpamBuf.safeMemcpy ( (char *)spam , numWords ) )
return NULL;
// free our temporary table stuff
if ( tmp != tmpBuf ) mfree ( tmp , need , "Spam" );
return m_wordSpamBuf.getBufStart();
}
// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we map "each" subProfile to a probability of spam (from 0 to 100)
// . if the profile is really big we get really slow (O(n^2)) iterating through
// many subProfiles
// . so after the first 50 occurrences the rest are automatically considered spam
// . return true if one word was spammed w/ probability > 20%
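// . illustrative example (hypothetical positions): a word repeated at
//   evenly spaced positions like 10, 20, 30, 40, ... makes getProbSpam()
//   return a high probability for some window/step combination, so those
//   occurrences get a high spam value; one occurrence is always left with
//   spam 0 so the term still gets indexed at least once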
bool XmlDoc::setSpam ( int32_t *profile, int32_t plen , int32_t numWords ,
unsigned char *spam ) {
// don't bother detecting spam if 2 or fewer occurrences of the word
if ( plen < 3 ) return false;
int32_t i;
// if we have more than 10 words and this word is 20% or more of
// them then all but the first occurrence is spammed
//log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam);
if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) {
for (i=1; i<plen; i++) spam[profile[i]] = 100;
return true ;
}
// . over 50 repeated words is ludicrous
// . set all past 50 to spam and continue detecting
// . no, our doc length based weight takes care of that kind of thing
//if (plen > 50 && m_version < 93 ) {
// // TODO: remember, profile[i] is in reverse order!! we should
// // really do i=0;i<plen-50, but this is obsolete anyway...
// for (i=50; i<plen;i++) m_spam[profile[i]] = 100;
// plen = 50;
//}
// we have to do this otherwise it takes FOREVER to do for plens in
// the thousands, like i saw a plen of 8338!
if ( plen > 50 ) { // && m_version >= 93 ) {
// . set all but the last 50 to a spam of 100%
// . the last 50 actually occur as the first 50 in the doc
for (i=0; i<plen-50;i++) spam[profile[i]] = 100;
// we now have only 50 occurences
plen = 50;
// we want to skip the first plen-50 because they actually
// occur at the END of the document
profile += plen - 50;
}
QUICKPOLL(m_niceness);
// higher quality docs allow more "freebies", but only starting with
// version 93... (see Titledb.h)
// profile[i] is actually in reverse order so we subtract off from wlen
//int32_t off ;
//if ( m_version >= 93 ) {
// off = (m_docQuality - 30) / 3;
// if ( off < 0 ) off = 0;
//}
// just use 40% "quality"
int32_t off = 3;
// . now the nitty-gritty part
// . compute all sub sequences of the profile
// . similar to a compression scheme (wavelets?)
// . TODO: word positions should count by two's since punctuation is
// not included so start step @ 2 instead of 1
// . if "step" is 1 we look at every word position in the profile
// . if "step" is 2 we look at every other word position
// . if "step" is 3 we look at every 3rd word position, etc...
int32_t maxStep = plen / 4;
if ( maxStep > 4 ) maxStep = 4;
// . loop through all possible tuples
int32_t window, wlen, step, prob;
for ( step = 1 ; step <= maxStep ; step++ ) {
for ( window = 0 ; window + 3 < plen ; window+=1) {
for (wlen = 3; window+wlen <= plen ; wlen+=1) {
// continue if step isn't aligned with window
// length
if (wlen % step != 0) continue;
// . get probability that this tuple is spam
// . returns 0 to 100
prob = getProbSpam ( profile + window ,
wlen , step);
// printf("(%i,%i,%i)=%i\n",step,window,
// wlen,prob);
// . if the probability is too low continue
// . was == 100
if ( prob <= 20 ) continue;
// set the spammed words spam to "prob"
// only if it's bigger than their current spam
for (i=window; i<window+wlen;i++) {
// first occurrences can have immunity
// due to doc quality being high
if ( i >= plen - off ) break;
if (spam[profile[i]] < prob)
spam[profile[i]] = prob;
}
QUICKPOLL(m_niceness);
}
}
}
// was this word spammed at all?
bool hadSpam = false;
for (i=0;i<plen;i++) if ( spam[profile[i]] > 20 ) hadSpam = true;
// make sure at least one word survives
for (i=0;i<plen;i++) if ( spam[profile[i]] == 0) return hadSpam;
// clear the spam level on this guy
spam[profile[0]] = 0;
// return true if we had spam, false if not
return hadSpam;
}
bool getWordPosVec ( Words *words ,
Sections *sections,
//int32_t wordStart,
//int32_t wordEnd,
int32_t startDist, // m_dist
char *fragVec,
int32_t niceness ,
SafeBuf *wpos ) {
int32_t dist = startDist; // 0;
Section *lastsx = NULL;
int32_t tagDist = 0;
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
nodeid_t *tids = words->m_tagIds;
int64_t *wids = words->m_wordIds;
int32_t *wlens = words->getWordLens();
char **wptrs = words->getWords();
int32_t nw = words->getNumWords();
if ( ! wpos->reserve ( nw * 4 ) ) return false;
int32_t *wposvec = (int32_t *)wpos->getBufStart();
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(niceness);
// save it
wposvec[i] = dist;
// tags affect the distance/wordposition cursor
if ( tids && tids[i] ) {
// tag distance affects
nodeid_t tid = tids[i] & BACKBITCOMP;
if ( isBreakingTagId ( tid ) ) tagDist += SENT_UNITS;
dist++;
continue;
}
// . and so do sequences of punct
// . must duplicate this code in Query.cpp for setting
// QueryWord::m_posNum
if ( ! wids[i] ) {
// simple space or sequence of just white space
if ( words->isSpaces(i) )
dist++;
// 'cd-rom'
else if ( wptrs[i][0]=='-' && wlens[i]==1 )
dist++;
// 'mr. x'
else if ( wptrs[i][0]=='.' && words->isSpaces2(i,1))
dist++;
// animal (dog)
else
dist += 2;
continue;
}
// ignore if in repeated fragment
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) {
dist++; continue; }
Section *sx = NULL;
if ( sp ) {
sx = sp[i];
// ignore if in style tag, etc. and do not
// increment the distance
if ( sx->m_flags & NOINDEXFLAGS )
continue;
}
// different sentence?
if ( sx &&
( ! lastsx ||
sx->m_sentenceSection != lastsx->m_sentenceSection ) ) {
// separate different sentences with 30 units
dist += SENT_UNITS; // 30;
// limit this!
if ( tagDist > 120 ) tagDist = 120;
// and add in tag distances as well here, otherwise
// we do not want "<br>" to really increase the
// distance if the separated words are in the same
// sentence!
dist += tagDist;
// new last then
lastsx = sx;
// store the vector AGAIN
wposvec[i] = dist;
}
tagDist = 0;
dist++;
}
return true;
}
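// illustrative example for getWordPosVec() above (hypothetical fragment):
// in "cheap hotels. <br> best rates", "cheap" and "hotels" get consecutive
// positions, the punctuation and <br> only nudge the cursor, and, assuming
// Sections puts "best rates" in a new sentence, "best" jumps ahead by
// SENT_UNITS (30) plus any accumulated breaking-tag distance, which keeps
// cross-sentence word pairs from looking adjacent when these positions are
// used for posdb keys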
bool getDensityRanks ( int64_t *wids ,
int32_t nw ,
int32_t hashGroup ,
SafeBuf *densBuf ,
Sections *sections ,
int32_t niceness ) {
//int32_t nw = wordEnd - wordStart;
// make the vector
if ( ! densBuf->reserve ( nw ) ) return false;
// convenience
char *densVec = densBuf->getBufStart();
// clear i guess
memset ( densVec , 0 , nw );
if ( hashGroup != HASHGROUP_BODY &&
hashGroup != HASHGROUP_HEADING )
sections = NULL;
// scan the sentences if we got those
Section *ss = NULL;
if ( sections ) ss = sections->m_firstSent;
// sanity
//if ( sections && wordStart != 0 ) { char *xx=NULL;*xx=0; }
for ( ; ss ; ss = ss->m_nextSent ) {
// breathe
QUICKPOLL(niceness);
// count of the alnum words in sentence
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
// start with one word!
count--;
// how can it be less than one alnum word
if ( count < 0 ) continue;
// . base density rank on that
// . count is 0 for one alnum word now
int32_t dr = MAXDENSITYRANK - count;
// ensure not negative. make it at least 1. zero means un-set.
if ( dr < 1 ) dr = 1;
// mark all in sentence then
for ( int32_t i = ss->m_senta ; i < ss->m_sentb ; i++ ) {
// breathe
QUICKPOLL(niceness);
// assign
densVec[i] = dr;
}
}
// all done if using sections
if ( sections ) return true;
// count # of alphanumeric words in this string
int32_t na = 0;
for ( int32_t i = 0 ; i < nw ; i++ ) if ( wids[i] ) na++;
// a single alnum should map to 0 "na"
na--;
// wtf?
if ( na < 0 ) return true;
// compute density rank
int32_t dr = MAXDENSITYRANK - na ;
// at least 1 to not be confused with 0 which means un-set
if ( dr < 1 ) dr = 1;
// assign
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(niceness);
// assign
densVec[i] = dr;
}
return true;
}
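// illustrative example for getDensityRanks() above (hypothetical counts):
// a sentence with 8 alnum words gives count = 8 - 1 = 7, so each word in
// it gets dr = MAXDENSITYRANK - 7; a one-word heading gets count = 0 and
// the full MAXDENSITYRANK; ranks are floored at 1 because 0 means un-set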
// . called by hashString() for hashing purposes, i.e. creating posdb keys
// . string is usually the document body or inlink text of an inlinker or
// perhaps meta keywords. it could be anything. so we need to create this
// vector based on that string, which is represented by words/phrases here.
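// . illustrative example (hypothetical counts): if "beds" occurs 10 times
//   and every occurrence sits inside the same phrase ("dog beds"), its
//   word-to-phrase ratio is high and getWordToPhraseRatioWeights() returns
//   a small weight, so "beds" gets a low diversity rank; if those 10
//   occurrences are spread over many different phrases the ratio is low
//   and "beds" keeps a much higher rank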
bool getDiversityVec ( Words *words ,
Phrases *phrases ,
HashTableX *countTable ,
SafeBuf *sbWordVec ,
//SafeBuf *sbPhraseVec ,
int32_t niceness ) {
int64_t *wids = words->getWordIds ();
//nodeid_t *tids = words->getTagIds ();
int32_t nw = words->getNumWords();
int64_t *pids = phrases->getPhraseIds2();
// . make the vector
// . it will be diversity ranks, so one float per word for now
// cuz we convert to rank below though, one byte rank
if ( ! sbWordVec ->reserve ( nw*4 ) ) return false;
//if ( ! sbPhraseVec->reserve ( nw*4 ) ) return false;
// get it
float *ww = (float *)sbWordVec ->getBufStart();
//float *pw = (float *)sbPhraseVec->getBufStart();
int32_t nexti = -10;
int64_t pidLast = 0;
// . now consider ourselves the last word in a phrase
// . adjust the score of the first word in the phrase to be
for ( int32_t i = 0 ; i < nw ; i++ ) {
// yield
QUICKPOLL ( niceness );
// skip if not alnum word
if ( ! wids[i] ) { ww[i] = 0.0; continue; }
// try to inline this
int64_t nextWid = 0;
int64_t lastPid = 0;
// how many words in the bigram?
int32_t nwp = phrases->getNumWordsInPhrase2(i);
if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ;
if ( i == nexti ) lastPid = pidLast;
// get current pid
int64_t pid = pids[i];
// get the word and phrase weights for term #i
float ww2;
//float pw2;
getWordToPhraseRatioWeights ( lastPid , // pids[i-1],
wids[i] ,
pid ,
nextWid , // wids[i+1] ,
&ww2 ,
//&pw2 ,
countTable ,
1);//m_version );
// 0 to 1.0
if ( ww2 < 0 || ww2 > 1.0 ) { char *xx=NULL;*xx=0; }
// save the last phrase id
if ( nwp > 0 ) {
nexti = i + nwp - 1;
pidLast = pid; // pids[i] ;
}
// . apply the weights
// . do not hit all the way down to zero though...
// . Words.cpp::hash() will not index it then...
//if ( ww[i] > 0 ) {
ww[i] = ww2;
//}
/*
//if ( pw[i] > 0 ) {
pw[i] = (int32_t)(pw[i] * pw2);
if ( pw[i] <= 0 ) pw[i] = 1;
//}
// MDW: why was this here?
//if ( isLinkText ) continue;
// do not demote all the way to 0
//if ( ww[i] <= 0 ) ww[i] = 2;
// skip if phrase score is 0
if ( ! pw[i] ) continue;
if ( pid == 0 ) { pw[i] = 0; continue; }
// skip if does not start phrase
if ( nwp <= 0 ) continue;
// sanity check
if ( nwp == 99 ) { char *xx = NULL; *xx = 0; }
// now mod the score
float avg = pw[i];
// weight by punct in between
//for ( int32_t j = i+1 ; j < i+nwp ; j++ ) {
// if ( wids[j] ) continue;
// avg = (avg * (int64_t)pw[j]) / DW;
//}
// do not demote all the way to zero, we still want to index it
// and when normalized on a 100 point scale, like when printed
// out by PageParser.cpp, a score of 1 here gets normalized to
// 0, so make sure it is at least 2.
if ( avg < 2 )
avg = 2;
// set that as our new score
pw[i] = avg;
*/
}
// overwrite the array of floats with an array of chars (ranks)
char *nww = (char *)ww;
//char *npw = (char *)pw;
// convert from float into a rank from 0-15
for ( int32_t i = 0 ; i < nw ; i++ ) {
if ( ! ww[i] ) { nww[i] = 0; continue; }
// scale the float weight from getWordToPhraseRatioWeights() into a rank;
// .55 approximates the top of the usual weight range, and anything that
// maps above MAXDIVERSITYRANK is clamped just below
char wrank = (char) ((ww[i] * ((float)MAXDIVERSITYRANK))/.55);
// sanity
if ( wrank > MAXDIVERSITYRANK ) wrank = MAXDIVERSITYRANK;
if ( wrank < 0 ) { char *xx=NULL;*xx=0; }
//char prank = (char) ((pw[i] * 15.0) / 2.50);
// assign now
nww[i] = wrank;
//npw[i] = prank;
}
return true;
}
// match word sequences of NUMWORDS or more words
#define NUMWORDS 5
// . repeated sentence frags
// . 1-1 with words in body of doc
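// illustrative sketch of the rolling hash used below (w0..w5 stand for the
// alnum word ids in document order; hypothetical names):
//
//   h = w0 ^ w1 ^ w2 ^ w3 ^ w4   // hash of the window starting at word 0
//   h ^= w0; h ^= w5;            // slide: now h covers words 1..5
//
// identical 5-word fragments hash to the same bucket wherever they appear,
// and since xor is order-independent, longer matches are then confirmed
// word-by-word in the matchLoop below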
char *XmlDoc::getFragVec ( ) {
if ( m_fragBufValid ) {
char *fb = m_fragBuf.getBufStart();
if ( ! fb ) return (char *)0x01;
return fb;
}
setStatus("getting frag vec");
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (char *)words;
Bits *bits = getBits();
if ( ! bits ) return NULL;
m_fragBuf.purge();
// ez vars
int64_t *wids = words->getWordIds ();
int32_t nw = words->getNumWords();
// if no words, nothing to do
if ( nw == 0 ) {
m_fragBufValid = true;
return (char *)0x01;//true;
}
// truncate for performance reasons. i've seen this be over 4M
// and it was VERY VERY SLOW... over 10 minutes...
// - i saw this take over 200MB for an alloc for
// WeightsSet3 below, so lower from 200k to 50k. this will probably
// make parsing inconsistencies for really large docs...
if ( nw > MAXFRAGWORDS ) nw = MAXFRAGWORDS;
int64_t ringWids [ NUMWORDS ];
int32_t ringPos [ NUMWORDS ];
int32_t ringi = 0;
int32_t count = 0;
uint64_t h = 0;
// . make the hash table
// . make it big enough so there are gaps, so chains are not too long
int32_t minBuckets = (int32_t)(nw * 1.5);
uint32_t nb = 2 * getHighestLitBitValue ( minBuckets ) ;
int32_t need = nb * (8+4+4);
char *buf = NULL;
char tmpBuf[50000];
if ( need < 50000 ) buf = tmpBuf;
else buf = (char *)mmalloc ( need , "WeightsSet3" );
char *ptr = buf;
uint64_t *hashes = (uint64_t *)ptr; ptr += nb * 8;
int32_t *vals = (int32_t *)ptr; ptr += nb * 4;
float *ww = (float *)ptr; ptr += nb * 4;
if ( ! buf ) return NULL;
for ( int32_t i = 0 ; i < nw ; i++ ) ww[i] = 1.0;
if ( ptr != buf + need ) { char *xx=NULL;*xx=0; }
// make the mask
uint32_t mask = nb - 1;
// clear the hash table
memset ( hashes , 0 , nb * 8 );
// clear ring of hashes
memset ( ringWids , 0 , NUMWORDS * 8 );
// for sanity check
int32_t lastStart = -1;
// . hash EVERY NUMWORDS-word sequence in the document
// . if we get a match look and see what sequences it matches
// . we allow multiple instances of the same hash to be stored in
// the hash table, so keep checking for a matching hash until you
// chain to a 0 hash, indicating the chain ends
// . check each matching hash to see if more than NUMWORDS words match
// . get the max words that matched from all of the candidates
// . demote the word and phrase weights based on the total/max
// number of words matching
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// yield
QUICKPOLL ( m_niceness );
// add new to the 5 word hash
h ^= wids[i];
// . remove old from 5 word hash before adding new...
// . initial ring wids are 0, so should be benign at startup
h ^= ringWids[ringi];
// add to ring
ringWids[ringi] = wids[i];
// save our position
ringPos[ringi] = i;
// wrap the ring ptr if we need to, that is why we are a ring
if ( ++ringi >= NUMWORDS ) ringi = 0;
// this 5-word sequence starts with word # "start"
int32_t start = ringPos[ringi];
// need at least NUMWORDS words in ring buffer to do analysis
if ( ++count < NUMWORDS ) continue;
// . skip if it starts with a word which can not start phrases
// . that way "a new car" being repeated a lot will not
// decrease the weight of the phrase term "new car"
// . setCountTable() calls set3() with this set to NULL
//if ( bits && ! bits->canStartPhrase(start) ) continue;
// sanity check
if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
// reset max matched
int32_t max = 0;
// look up in the hash table
uint32_t n = h & mask;
// sanity breach check
if ( n >= nb ) { char *xx=NULL;*xx=0; }
loop:
// all done if empty
if ( ! hashes[n] ) {
// sanity check
//if ( n >= nb ) { char *xx = NULL; *xx = 0; }
// add ourselves to the hash table now
hashes[n] = h;
// sanity check
//if ( wids[start] == 0 ) { char *xx = NULL; *xx = 0; }
// this is where the 5-word sequence starts
vals [n] = start;
// save it
lastStart = start;
// debug point
//if ( start == 7948 )
// log("heystart");
// do not demote words if less than NUMWORDS matched
if ( max < NUMWORDS ) continue;
// . how much should we demote
// . 10 matching words pretty much means 0 weights
float demote = 1.0 - ((max-5)*.10);
if ( demote >= 1.0 ) continue;
if ( demote < 0.0 ) demote = 0.0;
// . RULE #26 ("long" phrases)
// . if we got 3, 4 or 5 in our matching sequence
// . basically divide by the # of *phrase* terms
// . multiply by 1/(N-1)
// . HOWEVER, should we also look at HOW MANY other
// sequences matches this too!???
//float demote = 1.0 / ((float)max-1.0);
// set3() is still called from setCountTable() to
// discount the effects of repeated fragments, and
// the count table only understands score or no score
//if ( max >= 15 ) demote = 0.0;
// demote the next "max" words
int32_t mc = 0;
int32_t j;
for ( j = start ; mc < max ; j++ ) {
// sanity
if ( j >= nw ) { char *xx=NULL;*xx=0; }
if ( j < 0 ) { char *xx=NULL;*xx=0; }
// skip if not an alnum word
if ( ! wids[j] ) continue;
// count it
mc++;
// demote it
ww[j] = (int32_t)(ww[j] * demote);
if ( ww[j] <= 0 ) ww[j] = 2;
}
// save the original i
int32_t mini = i;
// advance i, it will be incremented by 1 immediately
// after hitting the "continue" statement
i = j - 1;
// must be at least the original i, we are monotonic
// otherwise ringPos[] will not be monotonic and core
// dump ultimately cuz j and k will be equal below
// and we increment matched++ forever.
if ( i < mini ) i = mini;
// get next word
continue;
}
// get next in chain if hash does not match
if ( hashes[n] != h ) {
// wrap around the hash table if we hit the end
if ( ++n >= nb ) n = 0;
// check out bucket #n now
goto loop;
}
// how many words match so far
int32_t matched = 0;
// . we have to check starting at the beginning of each word
// sequence since the XOR compositional hash is order
// independent
// . see what word offset this guy has
int32_t j = vals[n] ;
// k becomes the start of the current 5-word sequence
int32_t k = start;
// sanity check
if ( j == k ) { char *xx = NULL; *xx = 0; }
// skip to next in chain to check later
if ( ++n >= nb ) n = 0;
// keep advancing k and j as long as the words match
matchLoop:
// get next wid for k and j
while ( k < nw && ! wids[k] ) k++;
while ( j < nw && ! wids[j] ) j++;
if ( k < nw && wids[k] == wids[j] ) {
matched++;
k++;
j++;
goto matchLoop;
}
// keep track of the max matched for i0
if ( matched > max ) max = matched;
// get another matching string of words, if possible
goto loop;
}
if ( nw <= 0 ) { char *xx=NULL;*xx=0;}
// make space
if ( ! m_fragBuf.reserve ( nw ) ) {
// save it
int32_t saved = g_errno;
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
// reinstate it
g_errno = saved;
return NULL;
}
// validate
m_fragBufValid = true;
// handy ptr
char *ff = m_fragBuf.getBufStart();
// convert from floats into frag score, 0 or 1 really
for ( int32_t i = 0 ; i < nw ; i++ ) {
if ( ww[i] <= 0.0 ) ff[i] = 0;
else ff[i] = 1;
}
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
// wtf?
if ( ! ff ) { char *xx=NULL;*xx=0; }
return ff;
}
float g_wtab[30][30];
// . inline this for speed
// . if a word repeats in different phrases, promote the word
// and demote the phrase
// . if a word repeats in pretty much the same phrase, promote
// the phrase and demote the word
// . if you have the window of text "new mexico good times"
// and word #i is mexico, then:
// pid1 is "new mexico"
// wid1 is "mexico"
// pid2 is "mexico good"
// wid2 is "good"
// . we store sliderParm in titleRec so we can update it along
// with title and header weights on the fly from the spider controls
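// . illustrative walk-through (hypothetical counts): if "mexico" occurs 10
//   times and every occurrence is inside "new mexico", the ratio is
//   10/10 = 1.0, so g_wtab[10][10] bottoms out at 0.0001 and the bare word
//   is worth almost nothing; if "new mexico" accounts for only 2 of those
//   10 occurrences the ratio is 0.2 and g_wtab[10][2] is 0.8, so the word
//   keeps most of its weight. the final *retww is further scaled by "mod"
//   (based on phrcountMax) and blended toward 1.0 by the sliderParm.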
void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
int64_t wid1 ,
int64_t pid2 ,
int64_t wid2 , // post word
float *retww ,
//float *retpw ,
HashTableX *tt1 ,
int32_t titleRecVersion ) {
static float s_fsp;
// from 0 to 100
char sliderParm = g_conf.m_sliderParm;
// i'm not too keen on putting this as a parm in the CollectionRec
// because it is so cryptic...
//static char sliderParm = 25;
// . to support RULE #15 (word to phrase ratio)
// . these weights are based on the ratio of word to phrase count
// for a particular word
static char s_sp = -1;
if ( s_sp != sliderParm ) {
// . set it to the newly updated value
// . should range from 0 up to 100
s_sp = sliderParm;
// the float version
s_fsp = (float)sliderParm / 100.0;
// sanity test
if ( s_fsp < 0.0 || s_fsp > 1.0 ) { char *xx = NULL; *xx = 0; }
// i is the word count, how many times a particular word
// occurs in the document
for ( int32_t i = 0 ; i < 30 ; i++ ) {
// . k is the phrase count, how many times a particular phrase
// occurs in the document
// . k can be GREATER than i because we index only phrase terms
// sometimes when indexing neighborhoods, and not the
// single words that compose them
for ( int32_t k = 0 ; k < 30 ; k++ ) {
// do not allow phrase count to be greater than
// word count, even though it can happen since we
// add imported neighborhood pwids to the count table
int32_t j = k;
if ( k > i ) j = i;
// get ratio
//float ratio = (float)phrcount / (float)wrdcount;
float ratio = (float)j/(float)i;
// it should be impossible that this can be over 1.0
// but might happen due to hash collisions
if ( ratio > 1.0 ) ratio = 1.0;
// restrict the range we can weight a word or phrase
// based on the word count
//float r = 1.0;
//if ( i >= 20 ) r = 2.1;
//else if ( i >= 10 ) r = 1.8;
//else if ( i >= 4 ) r = 1.5;
//else r = 1.3;
//g_ptab[i][k] = 1.00;
g_wtab[i][k] = 1.00;
if ( i <= 1 ) continue;
// . we used to have a sliding bar between 0.0 and 1.0.
// word is weighted (1.0 - x) and phrase is weighted
// by (x). however, x could go all the way to 1.0
// even when i = 2, so we need to restrict x.
// . x is actually "ratio"
// . when we have 8 or fewer word occurrences, do not
// remove more than 80% of its score, a 1/5 penalty
// is good enough for now. but for words that occur
// a lot in the link text or pwids, go to town...
if ( i <= 2 && ratio >= .50 ) ratio = .50;
else if ( i <= 4 && ratio >= .60 ) ratio = .60;
else if ( i <= 8 && ratio >= .80 ) ratio = .80;
else if ( i <= 12 && ratio >= .95 ) ratio = .95;
// round up, so many "new mexico" phrases but only
// make it up to 95%...
if ( ratio >= .95 ) ratio = 1.00;
// if word's phrase is repeated 3 times or more then
// is a pretty good indication that we should weight
// the phrase more and the word itself less
//if ( k >= 3 && ratio < .90 ) ratio = .90;
// compute the weights
//float pw = 2.0 * ratio;
//float ww = 2.0 * (1.0 - ratio);
float ww = (1.0 - ratio);
// . punish words a little more
// . if we got 50% ratio, words should not get as much
// weight as the phrase
//ww *= .45;
// do not weight to 0, no less than .15
if ( ww < 0.0001 ) ww = 0.0001;
//if ( pw < 0.0001 ) pw = 0.0001;
// do not overpromote either
//if ( ww > 2.50 ) ww = 2.50;
//if ( pw > 2.50 ) pw = 2.50;
// . do a sliding weight of the weight
// . a "ww" of 1.0 means to do no weight
// . can't do this for ww cuz we use "mod" below
//float newWW = s_fsp*ww + (1.0-s_fsp)*1.00;
//float newPW = s_fsp*pw + (1.0-s_fsp)*1.00;
// limit how much we promote a word because it
// may occur 30 times total, but have a phrase count
// of only 1. however, the other 29 times it occurs it
// is in the same phrase, just not this particular
// phrase.
//if ( ww > 2.0 ) ww = 2.0;
g_wtab[i][k] = ww;
//g_ptab[i][k] = newPW;
//logf(LOG_DEBUG,"build: wc=%"INT32" pc=%"INT32" ww=%.2f "
//"pw=%.2f",i,k,g_wtab[i][k],g_ptab[i][k]);
}
}
}
int32_t phrcount1 = 0;
int32_t phrcount2 = 0;
int32_t wrdcount1 = 0;
int32_t wrdcount2 = 0;
if ( tt1->m_numSlotsUsed > 0 ) {
if (pid1) phrcount1 = tt1->getScore(&pid1);
if (pid2) phrcount2 = tt1->getScore(&pid2);
if (wid1) wrdcount1 = tt1->getScore(&wid1);
if (wid2) wrdcount2 = tt1->getScore(&wid2);
}
// if we are always ending the same phrase, like "Mexico"
// in "New Mexico"... get the most popular phrase this word is
// in...
int32_t phrcountMax = phrcount1;
int32_t wrdcountMin = wrdcount1;
// these must actually exist to be part of the selection
if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2;
if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2;
// . but if we are 'beds' and in a popular phrase like 'dog beds'
// there may be a lot of other phrases mentioned that have 'beds'
// in them like 'pillow beds', 'pet beds', but we need to assume
// that if phrcountMax is high enough, we should not give much weight to
// the word... otherwise you can subvert this algorithm by just
// adding other random phrases with the word 'bed' in them.
// . BUT, if a page has 'X beds' with a lot of different X's then you
// still want to index 'beds' with a high score!!! we are trying to
// balance those 2 things.
// . do this up here before you truncate phrcountMax below!!
float mod = 1.0;
if ( phrcountMax <= 6 ) mod = 0.50;
else if ( phrcountMax <= 8 ) mod = 0.20;
else if ( phrcountMax <= 10 ) mod = 0.05;
else if ( phrcountMax <= 15 ) mod = 0.03;
else mod = 0.01;
// scale wrdcount1/phrcountMax down for the g_wtab table
if ( wrdcount1 > 29 ) {
float ratio = (float)phrcountMax / (float)wrdcount1;
phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
wrdcount1 = 29;
}
if ( phrcountMax > 29 ) {
float ratio = (float)wrdcount1 / (float)phrcountMax;
wrdcount1 = (int32_t)((29.0 * ratio) + 0.5);
phrcountMax = 29;
}
// . sanity check
// . neighborhood.cpp does not always have wid/pid pairs
// that match up right for some reason... so we can't do this
//if ( phrcount1 > wrdcount1 ) { char *xx = NULL; *xx = 0; }
//if ( phrcount2 > wrdcount2 ) { char *xx = NULL; *xx = 0; }
// apply the weights from the table we computed above
*retww = mod * g_wtab[wrdcount1][phrcountMax];
// slide it
*retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00;
// ensure we do not punish too hard
if ( *retww <= 0.0 ) *retww = 0.01;
if ( *retww > 1.0 ) { char *xx=NULL;*xx=0; }
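// illustrative arithmetic (hypothetical values): with
// g_wtab[wrdcount1][phrcountMax] = 0.8, mod = 0.50 and sliderParm = 25
// (s_fsp = 0.25), the raw weight is 0.40 and the blended result is
// 0.25*0.40 + 0.75*1.00 = 0.85, so the slider softens how hard the
// word-to-phrase ratio can punish a term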
/*
if ( phrcountMax >= 0 ) {
int64_t sh = getPrefixHash ( (char *)NULL , 0 , NULL , 0 );
int64_t tid = g_indexdb.getTermId ( sh , wid1 );
logf(LOG_DEBUG,"build: phrcountMax=%"INT32" wrdCount1=%"INT32" "
"*ww=%.4f for word with tid=%"UINT64"",
phrcountMax,wrdcount1,(float)*ww,tid);
//if ( phrcountMax < 10 && tid == 16944700235015LL )
// log("hey");
}
*/
// sanity check
//if ( *ww == 0.0 ) { char *xx = NULL; *xx = 0; }
/*
// scale wrdcountMin/phrcount down for the g_ptab table
if ( wrdcountMin > 29 ) {
float ratio = (float)phrcount2 / (float)wrdcountMin;
phrcount2 = (int32_t)((29.0 * ratio) + 0.5);
wrdcountMin = 29;
}
if ( phrcount2 > 29 ) {
float ratio = (float)wrdcountMin / (float)phrcount2;
wrdcountMin = (int32_t)((29.0 * ratio) + 0.5);
phrcount2 = 29;
}
*/
// . if the word is Mexico in 'New Mexico good times' then
// phrase term #i which is, say, "Mexico good" needs to
// get the min word count when doings its word to phrase
// ratio.
// . it has two choices, it can use the word count of
// "Mexico" or it can use the word count of "good".
// . say, each is pretty high in the document so the phrase
// ends up getting penalized heavily, which is good because
// it is a nonsense phrase.
// . if we had "united socialist soviet republic" repeated
// a lot, the phrase "socialist soviet" would score high
// and the individual words would score low. that is good.
// . try to seek the highest weight possible for this phrase
// by choosing the lowest word count possible
// . NO LONGER AFFECT phrase weights because just because the
// words occur a lot in the document and this may be the only
// occurence of this phrase, does not mean we should punish
// the phrase. -- MDW
//*retpw = 1.0;
return;
// do it the old way...
//*pw = g_ptab[wrdcountMin][phrcount2];
// sanity check
//if ( *pw == 0.0 ) { char *xx = NULL; *xx = 0; }
}
// for registerSleepCallback
static void clockSyncWaitWrapper ( int fd , void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_masterLoop ( THIS->m_masterState );
}
// . a special call
// . returns -1 if blocked, 1 otherwise, 0 on error
char XmlDoc::waitForTimeSync ( ) {
// unregister?
if ( isClockInSync() && m_alreadyRegistered ) {
// note it
log("build: clock now synced for %s",m_firstUrl.m_url);
g_loop.unregisterSleepCallback(m_masterState,
clockSyncWaitWrapper);
}
// return 1 if synced!
if ( isClockInSync() ) return 1;
// already registered? wait another 1000ms
if ( m_alreadyRegistered ) return -1;
// flag it
m_alreadyRegistered = true;
// note it
log("build: waiting for clock to sync for %s",m_firstUrl.m_url);
// this should mean it is re-called later
if ( g_loop.registerSleepCallback ( 1000 , // 1000 ms
m_masterState ,
clockSyncWaitWrapper ,
m_niceness ))
// wait for it, return -1 since we blocked
return -1;
// if was not able to register, ignore delay
log("doc: failed to register clock wait callback");
return 0;
}
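// a minimal usage sketch (hypothetical caller, not from this file):
//
//   char rc = waitForTimeSync();
//   if ( rc == -1 ) return false; // blocked; m_masterLoop re-called later
//   if ( rc ==  0 ) { /* could not register, proceed without waiting */ }
//   // rc == 1: clock is synced, safe to use getTimeGlobal()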
////////////////////////////
//
// SCRAPING TOOLS
//
////////////////////////////
void doInjectLoopWrapper ( void *state ) {
XmlDoc *XD = (XmlDoc *)state;
// if it blocked, wait
if ( ! XD->doInjectLoop ( ) ) return;
// . if we did not inject any links, i guess we are done!
// . this happens if the ahrefs.com doc had the same outlinks
// as the ahrefs.com doc for another search result, they are all
// deduped and it does not block.
XD->m_finalCallback ( XD->m_finalState );
}
// . return false if blocks, true otherwise
// . return true and set error on error, with no blocks outstanding
// . TODO: make this work for the ahrefs.com list of links in xml feed
bool XmlDoc::injectLinks (HashTableX *linkDedupTablePtr ,
HashTableX *domDedupTablePtr,
void *finalState ,
void (* finalCallback)(void *)) {
// INJECT 10 at a time. xmldoc is 1MB.
int32_t i; for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
XmlDoc *nd;
// continue if already set it. this was overwriting it
// and causing a mem leak before
if ( m_xmlDocs[i] ) continue;
try { nd = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
break;
}
mnew ( nd , sizeof(XmlDoc),"xmldocarr");
m_xmlDocs[i] = nd;
}
// did the alloc loop above break early (out of memory)?
if ( i < (int32_t)MAX_XML_DOCS ) {
log("scrape: one xmldoc alloc failed");
return true;
}
m_masterLoop = doInjectLoopWrapper;
m_masterState = this;
m_finalState = finalState;
m_finalCallback = finalCallback;
// note it
//log("xmldoc: injecting outlinks of %s",m_firstUrl.getUrl());
m_linkDedupTablePtr = linkDedupTablePtr;
m_domDedupTablePtr = domDedupTablePtr;
// loop over all links
m_i = 0;
m_blocked = 0;
memset ( m_used , 0 , (int32_t)MAX_XML_DOCS );
return doInjectLoop();
}
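// Illustrative sketch only: how a caller might wire up the dedup tables and
// kick off injectLinks(). The table sizes, "state" and "doneCallback" are
// assumptions, not taken from a real call site.
//
//   HashTableX linkDedup;
//   HashTableX domDedup;
//   linkDedup.set ( 4,0,1024,NULL,0,false,0,"lnkddt" );
//   domDedup .set ( 4,0,1024,NULL,0,false,0,"domddt" );
//   if ( ! xd->injectLinks ( &linkDedup, &domDedup, state, doneCallback ) )
//           return false; // blocked; doneCallback(state) is called once
//                         // every outlink injection has completed
//   // returned true: nothing blocked (check g_errno for errors)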
void doneInjectingWrapper ( void *state ) {
XmlDoc *xd = (XmlDoc *)state;
XmlDoc *XD = (XmlDoc *)xd->m_hack;
XD->doneInjecting ( xd );
}
// . return false if blocks, true otherwise
// . return true and set error on error, with no blocks outstanding
bool XmlDoc::doInjectLoop ( ) {
setStatus("inject outlinks");
//Links *links = getLinks();
//if ( ! links ) return (m_blocked == 0);
//if ( links == (void *)-1 ) return false;
Sections *sections = getSections();
if ( ! sections ) return (m_blocked == 0);
if ( sections == (void *)-1 ) return false;
Links *links = getLinks();
if ( ! links ) return (m_blocked == 0);
if ( links == (void *)-1 ) return false;
Words *words = getWords();
if ( ! words ) return (m_blocked == 0);
if ( words == (void *)-1 ) return false;
Bits *bp = getBits();
if ( ! bp ) return (m_blocked == 0);
if ( bp == (void *)-1 ) return false;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t n = links->getNumLinks();
Url tmpUrl;
Section *prev = NULL;
// scan the links now
for ( ; m_i < n ; ) {
// get xml doc then
int32_t j; for ( j = 0 ; j < MAX_XML_DOCS ; j++ )
if ( ! m_used[j] ) break;
// no free slot? we are waiting on outstanding injections, so return false (blocked).
if ( j >= MAX_XML_DOCS ) return false;
// get the m_ith link
char *link = links->getLink ( m_i );
int32_t linkLen = links->getLinkLen ( m_i );
// temp term
if ( link[linkLen] ) { char *xx=NULL;*xx=0; }
// skip to next link to index
m_i++;
// skip injecting if it's an internal bing/google outlink
if ( strncmp(link,"http://www.bing.com/",20) == 0 )
continue;
// skip youtube query links. they contain our exact
// query!! so almost always come up #1
if ( strstr(link,".youtube.com/") && strstr(link,"&q="))
continue;
if ( strstr(link,".msn.com/") )
continue;
if ( strstr(link,".microsoft.com/") )
continue;
if ( strstr(link,".discoverbing.com/") )
continue;
if ( strstr(link,".googleusercontent.com/") )
continue;
//if(!strncmp(link,"http://webcache.googleusercontent.com/",38)
if(!strncmp(link,"http://www.google.com/url?q=http",32)){
// grab the real url from that
char *embed = strstr(link,"url?q=http");
if ( ! embed ) continue;
link = embed+6;
char *end = embed;
for ( ; *end && *end != '&' ; end++) {
// google appends query to url.. strange
//if ( end[0] == '%' &&
// end[1] == '2' &&
// to_lower_a(end[2]) == 'b' )
// break;
}
SafeBuf mbuf;
mbuf.reserve ( end - link + 100 );
int32_t dlen;
char *bs = mbuf.getBufStart();
dlen=urlDecode(bs,link , end - link );
bs[dlen] = '\0';
tmpUrl.set ( bs );
link = tmpUrl.getUrl();
linkLen = tmpUrl.getUrlLen();
}
// skip maps.google.com etc.
if ( strstr(link,".google.com/") )
continue;
// ok, point to title and summary for this result!
// go up to prev node for first non-clickable text which
// should be summary
//Section **sp = sections->m_sectionPtrs;
// get the section
int32_t ln = links->getNodeNum(m_i-1);
// get node ptr
XmlNode *node = m_xml.getNodePtr(ln);
char *ptr = node->m_node;
// find section that contains it i guess
Section *sx = sections->m_rootSection;
Section *last = NULL;
char **wptrs = words->getWords();
//nodeid_t *tids = words->getTagIds();
for ( ; sx ; sx = sx->m_next ) {
// get section ptr
char *sw = wptrs[sx->m_b-1];
if ( sw < ptr ) continue;
// over?
sw = wptrs[sx->m_a];
if ( sw > ptr ) break;
last = sx;
}
// assign
sx = last;
// telescope section up one i guess
//sx = sx->m_parent;
// shortcut
wbit_t *bits = bp->m_bits;
// if still same first alnum, go another
//for ( ; sx ; sx = sx->m_parent ) {
// // skip if same word starts this section
// //if ( sx->m_firstWordPos == fa ) continue;
// // must have alnum
// if ( sx->m_firstWordPos <= 1 ) continue;
// // must be in link! should be the result TITLE
// if ( bits[sx->m_firstWordPos] & D_IN_LINK ) break;
// // word must not be "cached" or whatever...
//}
// if in bold tag, should telescope up some more
//if ( sx && sx->m_tagId == TAG_B ) sx = sx->m_parent;
//if ( sx && sx->m_tagId == TAG_STRONG ) sx = sx->m_parent;
// save
//int32_t fa = sx->m_firstWordPos;
// that's the title so telescope up as long as that is the
// first alnum!!!
for ( ; sx ; sx = sx->m_parent ) {
//Section *ps = sx->m_parent;
// do we have a next brother? stop then! that means
// we are in a list!
//if ( sx->m_nextBrother ) break;
//if ( ps->m_firstWordPos != fa ) break;
// stop when we hit a result delimiter!!
if ( sx->m_tagId == TAG_LI ) {
// bing...
if ( strncmp(wptrs[sx->m_a],
"<li class=\"sa_wr\">",
17) == 0 ) {
break;
}
// google...
if ( strncmp(wptrs[sx->m_a],
"<li class=\"g\">",
13) == 0 ) {
break;
}
}
}
// if no indicator, bail
if ( ! sx ) continue;
// skip link if contained in prev section
if ( prev == sx )
continue;
// save it
prev = sx;
// record search result details
Section *title = NULL;
Section *cite = NULL;
Section *summary = NULL;
// . that is probably the full result then...
// . title is first sentence
for ( ; sx ; sx = sx->m_next ) {
// only sentences
if ( ! ( sx->m_flags & SEC_SENTENCE ) ) continue;
// grab it
if ( ! title ) {
title = sx;
continue;
}
// skip section if in link
if ( bits[sx->m_firstWordPos] & D_IN_LINK ) continue;
// we are sentence section so fix it so we are one
// above!
Section *rs = sx; // ->m_parent;
// telescope up to a div or whatever...
//for ( ; rs ; rs = rs->m_parent ) {
// if ( rs->m_tagId == TAG_DIV ) break;
// if ( rs->m_tagId == TAG_P ) break;
//}
// and out of bold
if ( rs && rs->m_tagId == TAG_B ) rs = rs->m_parent;
if ( rs && rs->m_tagId == TAG_STRONG) rs=rs->m_parent;
// bail if no good!
if ( ! rs ) continue;
// then site if google
if ( ! cite ) {
cite = rs;
continue;
}
// then summary
summary = rs;
break;
}
m_serpBuf.safePrintf("\t\t<result>\n");
// print <title> tag
if ( title ) printSerpFiltered(title,"title");
// print <sum> tag
if ( summary ) printSerpFiltered(summary,"sum");
m_serpBuf.safePrintf("\t\t\t<url>");
m_serpBuf.safeMemcpy ( link , linkLen );
m_serpBuf.safePrintf("</url>\n");
m_serpBuf.safePrintf("\t\t</result>\n");
// if not injecting, skip
//continue;
if ( ! m_reallyInjectLinks ) continue;
// dedup
int32_t linkHash32 = hash32 ( link , linkLen );
if ( m_linkDedupTablePtr &&
m_linkDedupTablePtr->isInTable (&linkHash32) ) continue;
// add it otherwise
if ( m_linkDedupTablePtr )
m_linkDedupTablePtr->addKey ( &linkHash32 );
// we use this when injecting ahrefs links
if ( m_domDedupTablePtr ) {
int32_t domLen;
char *dom = getDomFast ( link , &domLen );
int32_t dh32 = hash32 ( dom , domLen );
if ( m_domDedupTablePtr->isInTable (&dh32) ) continue;
m_domDedupTablePtr->addKey ( &dh32 );
}
// get it
XmlDoc *xd = m_xmlDocs[j];
if ( ! xd ) { char *xx=NULL;*xx=0; }
// add www to it
Url lu;
lu.set ( link , linkLen , true );
char *wwwLink = lu.getUrl();
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
// index this link!
strcpy(sreq.m_url,wwwLink);
// parentdocid of 0
int32_t firstIp = hash32n(wwwLink);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = 0;//hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
setStatus("injecting an outlink");
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( ! xd->set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
NULL, // content ,
false, // deleteFromIndex ,
0, // forcedIp ,
0, // contentType ,
0, // lastSpidered ,
false )) { // hasMime
// . g_errno should be set if that returned false
// . return true if does not need to block
log("xmldoc: outlink inject: %s",mstrerror(g_errno));
break;
}
xd->m_hack = this;
// make this our callback in case something blocks
xd->setCallback ( xd , doneInjectingWrapper );
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
xd->m_recycleContent = false;//true;
// avoid looking up ip of each outlink to add "firstip" tag to
// tagdb because that can be slow!!!!!!!
xd->m_spiderLinks = false;
xd->m_spiderLinks2 = false;
xd->m_spiderLinksValid = true;
// . newOnly is true --> do not inject if document is already
// indexed!
// . maybe just set indexCode
xd->m_newOnly = true;//false;//newOnly;
// need to refresh it!!
//xd->m_newOnly = false;//newOnly;
// turn off robots.txt lookups
xd->m_isAllowed = true;
xd->m_isAllowedValid = true;
xd->m_crawlDelay = -1; // unknown
xd->m_crawlDelayValid = true;
// log it now
log("inject: indexing outlink %s (hash=%"UINT32")",wwwLink,
(uint32_t)linkHash32);
// costs one API unit, which is one cent. but if we do
// top 50 on google, top 50 on procog, it can be like
// $1 every time we do this.
//xd->injectAhrefsLinks();
bool status = true;
// this will tell it to index ahrefs first before indexing
// the doc. but do NOT do this if we are from ahrefs.com
// ourselves to avoid recursive explosion!!
xd->m_downloadLevel = m_downloadLevel + 1;
xd->m_useAhrefs = m_useAhrefs;
// inherit dedup tables as well!
xd->m_linkDedupTablePtr = m_linkDedupTablePtr;
// . now tell it to index
// . this returns false if blocked
status = xd->indexDoc ( );
// log it. i guess only for errors when it does not block?
// because xmldoc.cpp::indexDoc calls logIt()
if ( status ) xd->logIt();
// otherwise, it blocks
else {
m_blocked++;
log("xmldoc: blockedout=%"INT32" slotj=%"INT32" "
"(this=0x%"PTRFMT",xd=0x%"PTRFMT")",
m_blocked,j,(PTRTYPE)this,(PTRTYPE)xd);
m_used[j] = true;
}
}
// return true if all done
return (m_blocked == 0);
}
void XmlDoc::doneInjecting ( XmlDoc *xd ) {
// find it in our list
int32_t i;
for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
if ( ! m_used[i] ) continue;
if ( m_xmlDocs[i] != xd ) continue;
break;
}
// core if not found in our list, it must be there
if ( i >= MAX_XML_DOCS ) { char *xx=NULL;*xx=0; }
// free it up now!
m_used[i] = 0;
// free it up
//mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
//delete ( m_xmlDocs[i] );
//m_xmlDocs[i] = NULL;
m_xmlDocs[i]->reset();
// uncount it as being outstanding
m_blocked--;
// log debug
log("xmldoc: blockedin=%"INT32" (this=0x%"PTRFMT")",
m_blocked,(PTRTYPE)this);
// return if still blocked
if ( ! doInjectLoop() ) return;
// log debug
log("xmldoc: final callback");
// ok, all have been indexed
m_finalCallback ( m_finalState );
}
bool XmlDoc::injectAhrefsLinks ( ) {
setStatus("get inlinks from ahrefs.com");
// skip for now
//return true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// make the ahrefs urls
try { m_ahrefsDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew ( m_ahrefsDoc , sizeof(XmlDoc),"xmldocah");
// make the url
SafeBuf ubuf;
// count is 350 for now; could be turned down to 10 if needed
ubuf.safePrintf("http://api.ahrefs.com/get_backlinks.php?count=350&mode=exact&output=xml&AhrefsKey=0452f27fd5a7fec5e9702e23ba4af223&target=");
//ubuf.safePrintf("http://www.gigablast.com/?q=poo&u=");
ubuf.urlEncode (m_firstUrl.getUrl() );
Url url;
url.set ( ubuf.getBufStart() );
char *up = url.getUrl();
// set by url i guess
SpiderRequest sreq;
sreq.reset();
strcpy(sreq.m_url,up);
// parentdocid of 0
int32_t firstIp = hash32n(up);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = 0;//hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// shortcut
XmlDoc *ah = m_ahrefsDoc;
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( ! ah->set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
NULL, // content ,
false, // deleteFromIndex ,
0, // forcedIp ,
0, // contentType ,
0, // lastSpidered ,
false )) { // hasMime
log("xmldoc: ahref doc error %s",mstrerror(g_errno));
// g_errno should be set if that returned false
return true;
}
// do not re-call the set
//m_needsSet = false;
// make this our callback in case something blocks
//ah->setCallback ( state , callback );
// do not re-lookup the robots.txt
ah->m_isAllowed = true;
ah->m_isAllowedValid = true;
ah->m_crawlDelay = -1; // unknown
ah->m_crawlDelayValid = true;
ah->m_downloadLevel = m_downloadLevel + 1;
// reset domain table for deduping ahref's links by domain
// before injecting them... only inject one per domain
if ( ! m_domDedupTablePtr ) {
m_domDedupTable.set(4,0,512,NULL,0,false,m_niceness,"dmtab2");
m_domDedupTablePtr = &m_domDedupTable;
}
// log it now
//log("inject: indexing injected doc %s",url);
// if we are a url like api.ahrefs.com/get_backlinks... then
// our links can use our table for deduping based on domain, AND
// they can use our link dedup table in case one outlink is also
// a search result on google's page...
if ( ! ah->injectLinks ( m_linkDedupTablePtr,
m_domDedupTablePtr,
m_masterState ,
m_masterLoop ) )
return false;
return true;
}
bool XmlDoc::printSerpFiltered ( Section *sx , char *tagName ) {
//int64_t *wids = m_words.getWordIds();
char **wptrs = m_words.getWords();
int32_t *wlens = m_words.getWordLens();
int32_t fa = sx->m_firstWordPos;
nodeid_t *tids = m_words.getTagIds();
if ( fa > 0 && tids[fa-1] == TAG_B ) fa--;
if ( fa > 0 && tids[fa-1] == TAG_STRONG ) fa--;
int32_t la = sx->m_b;
int32_t nw = m_words.getNumWords();
if ( la+1 < nw && tids[la+1] == (TAG_B|BACKBIT) ) la++;
if ( la+1 < nw && tids[la+1] == (TAG_STRONG|BACKBIT) ) la++;
// advance la even more if regular words or br tags or b or strong tags
for ( ; la < nw ; la++ ) {
if ( ! tids[la] ) continue;
if ( (tids[la]&BACKBITCOMP) == TAG_BR ) continue;
if ( (tids[la]&BACKBITCOMP) == TAG_STRONG ) continue;
if ( tids[la] == TAG_BR ) continue;
break;
}
m_serpBuf.safePrintf("\t\t\t<%s>",tagName);
// cdata!
m_serpBuf.safePrintf("<![CDATA[");
// print the word range [fa, la), skipping <br> tags
for ( int32_t i = fa ; i < la ; i++ ) {
// skip if br
if ( tids[i] == TAG_BR ) continue;
m_serpBuf.cdataEncode ( wptrs[i] , wlens[i] );
}
// cdata!
m_serpBuf.safePrintf("]]>");
m_serpBuf.safePrintf("</%s>\n",tagName);
return true;
}
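// For reference, one scraped search result serialized into m_serpBuf by
// doInjectLoop() + printSerpFiltered() looks roughly like this (the values
// are made up):
//
//   <result>
//           <title><![CDATA[Example Result Title]]></title>
//           <sum><![CDATA[First non-link sentence used as the summary...]]></sum>
//           <url>http://www.example.com/page.html</url>
//   </result>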
//////////
//
// BEGIN NEW SEO MATCHING QUERIES TOOL CODE
//
//////////
static void loadTitleRecFromDiskOrSpiderWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
if ( ! THIS->loadTitleRecFromDiskOrSpider() ) return;
THIS->m_callback1 ( THIS->m_state );
}
// . if we can't load titlerec from titledb, spider it, index it and
// use that new titlerec
// . returns false if blocks
// . returns true and sets g_errno on error
bool XmlDoc::loadTitleRecFromDiskOrSpider() {
if ( ! m_masterLoop ) {
m_masterState = this;
m_masterLoop = loadTitleRecFromDiskOrSpiderWrapper;
}
// fix a core when getTermListBuf() calls getMetaList()
// which calls getNewSpiderReply() which calls
// getDownloadEndTime() and tries to download the page
// even though we have a valid titlerec!
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTimeValid = true;
m_downloadEndTime = 0;
}
// . try to recycle the content first
// . try to load it from title rec first
// . we have to do this otherwise our ptr_linkInfo link texts
// will be somewhat random and cause us to get different scores
// for the queries we match!!
// . so do this not just for speed, but to be consistent.
if ( ! loadFromOldTitleRec() ) return false;
// if the old title rec was found we are done; otherwise just spider and index it
if ( m_oldTitleRecValid && m_oldTitleRec )
return true;
// ok, we gotta index it
if ( ! m_loggedMsg3 ) {
m_loggedMsg3 = true;
log("xmldoc: url %s not in titledb, spidering and indexing",
m_firstUrl.m_url);
}
// clear that
g_errno = 0;
// turn off recycling i guess since we don't have it
m_recycleContent = false;
// first index it, but only if not already indexed
// did it block?
// error indexing doc? indexCode should be set then
if ( ! indexDoc() ) return false;
// no blocking
return true;
}
/*
void getSEOQueryInfoWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// note it
THIS->setStatus ( "seoqueryinfowrapper" );
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in seo query info wrapper" );
// return if it blocked
if ( THIS->getSEOQueryInfo( ) == (void *)-1 ) return;
// print any error
if ( g_errno )
log("seopipe: getSeoQueryInfo error: %s",mstrerror(g_errno));
// all done
else
log("seopipe: getSeoQueryInfo is done");
// show timing info
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - THIS->m_beginSEOTime;
log("seopipe: time: getSeoQueryInfo took %"INT64"ms",took);
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
return;
}
void getSEOQueryInfoWrapper2 ( int fd , void *state ) {
// just pump! otherwise we might re-launch a msg3a request while
// one is outstanding causing a core in Multicast::reset()
XmlDoc *THIS = (XmlDoc *)state;
// debug log
THIS->setStatus ("getseoqueryinfowrapper2");
// if we are waiting just on the pump i guess we are all done!
if ( ! THIS->m_socketWriteBufValid ) {
log("seopipe: pumping socket");
THIS->pumpSocketWriteBuf();
return;
}
// not pumping?
log("seopipe: pumping socket ready wrapper");
// otherwise, let it call the callback
getSEOQueryInfoWrapper ( state );
}
// . return safebuf of xml containing matching and related queries and
// related urls/titles
// . this transmits the xml as it generates it to "m_seoSocket" if non-null
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . stores the xml in the m_socketWriteBuf SafeBuf
// . will keep blocking (returning -1) until the xml is delivered to socket
// if it is non-NULL
SafeBuf *XmlDoc::getSEOQueryInfo ( ) {
setStatus ( "seo query info" );
// only set to valid once it has been all written out!!
if ( m_socketWriteBufValid ) {
// all done?
if ( ! m_seoSocket ) return &m_socketWriteBuf;
// pump
pumpSocketWriteBuf();
// if socket not done being pumped... we block. it's
// ready wrappers should re-call our wrapper.
if ( m_socketWriteBufSent >= m_socketWriteBuf.length() )
return &m_socketWriteBuf;
// wait for write to finish
return (SafeBuf *)-1;
}
// the g_errno could be a title rec not found reply coming back
// so do not process that here! it needs to be processed
// by the function whose request resulted in an error reply.
// for instance, the getTitle() call below needs to set g_errno
// when we call it now, responding to its msg22 reply.
//if ( g_errno ) return NULL;
// a good place to init stuff we need here
if ( ! m_masterState ) {
m_printedQueries = false;
m_printedRelatedDocIds = false;
m_printedRelatedQueries = false;
m_printedRecommendedLinks = false;
m_printedScoredInsertableTerms = false;
//m_docIndexed = false;
// time it
m_beginSEOTime = gettimeofdayInMilliseconds();
// for our m_masterLoop function, it uses this as the state
m_masterState = this;
// this is a main entry point function so anything that blocks
// should re-call this function
m_masterLoop = getSEOQueryInfoWrapper;
// assume indexed
m_docIndexed = true;
// fix a core when getTermListBuf() calls getMetaList()
// which calls getNewSpiderReply() which calls
// getDownloadEndTime() and tries to download the page
// even though we have a valid titlerec!
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTimeValid = true;
m_downloadEndTime = 0;
}
}
// . try to load it from title rec first
// . we have to do this otherwise our ptr_linkInfo link texts
// will be somewhat random and cause us to get different scores
// for the queries we match!!
// . so do this not just for speed, but to be consistent.
if ( m_recycleContent && ! loadFromOldTitleRec()) return (SafeBuf *)-1;
// did that fail? i.e. not found!?!?! ignore and just index it
if ( m_oldTitleRecValid && ! m_oldTitleRec && m_recycleContent ) {
// just skip this asshole then
log("xmldoc: url %s load3 failed",m_firstUrl.m_url);
// clear that
g_errno = 0;
// need to index it
m_docIndexed = false;
}
// first index it, but only if not already indexed
if ( ! m_docIndexed ) {
// turn off recycling i guess since we don't have it
m_recycleContent = false;
// did it block?
// error indexing doc? indexCode should be set then
if ( ! indexDoc() ) return (SafeBuf *)-1;
// do not re-call
m_docIndexed = true;
}
// was indexing successful?
int32_t *indexCode = getIndexCode();
if ( ! indexCode || indexCode == (void *)-1 )
return (SafeBuf *)indexCode;
// if not successfully indexed send back error msg
if ( *indexCode && m_seoSocket ) {
m_socketWriteBuf.safePrintf(
"\t<errorMsg><![CDATA[%s]]>"
"</errorMsg>\n"
"</response>"
, mstrerror(*indexCode) );
// send on socket
pumpSocketWriteBuf();
// if socket not done being pumped... we block
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
return (SafeBuf *)-1;
// otherwise, we are done sending
return &m_socketWriteBuf;
}
// seo.cpp needs this in printDupSentences
Sections *sections = getSectionsWithDupStats();
if ( ! sections || sections == (void *)-1) return (SafeBuf *)sections;
// seo.cpp needs this now when it calls getSiteRank()
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (void *)-1 ) return (SafeBuf *)sni;
// . find all logged queries that this document matches
// . this will launch msg99 requests to each host in the network
// . then it scores them
// . don't worry about sending back in real-time for this since it
// should be fast
SafeBuf *qpbuf = getMatchingQueriesScored();
if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
// . how many queries do we have that match this url?
// . they should be sorted by our url's score
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
// shortcut
SafeBuf *sb = &m_socketWriteBuf;
// cast the msg99 reply ptrs, i.e. query ptrs
Msg99Reply **queryPtrs = (Msg99Reply **)qpbuf->getBufStart();
// store each one as xml then into m_headerBuf99
if ( ! m_printedQueries && m_seoSocket ) {
m_printedQueries = true;
// do not flood the socket! so limit to 1000 queries
// they should be sorted by queryImportance!
// cheatcodes.com has like 50,000 matching queries.
int32_t max = numQueryPtrs;
if ( max > 1000 ) max = 1000;
for ( int32_t i = 0 ; i < max ; i++ ) {
// shortcut
Msg99Reply *qp = queryPtrs[i];
// sometimes queries like 'gallery-view' are
// hard-phrased and do not show up for us, so skip.
// they should be at the very end so we should be
// trimming the tail for them, so don't worry about
// <queryNum> having holes in it.
if ( qp->m_myDocId == 0LL && qp->m_myScore == 0.0 )
continue;
// shortcut
QueryLogEntry *qe = &qp->m_queryLogEntry;
sb->safePrintf("\t<seoQuery>\n"
"\t\t<queryNum>%"INT32"</queryNum>\n"
"\t\t<query><![CDATA[%s]]></query>\n"
"\t\t<queryTrafficPerDay>%"INT32""
"</queryTrafficPerDay>\n"
// our url's score
"\t\t<myDocId>%"INT64"</myDocId>\n"
"\t\t<myScore>%f</myScore>\n"
//"\t\t<mySiteHash32>%"UINT32""
//"</mySiteHash32>\n"
"\t\t<queryImportance>%f"
"</queryImportance>\n"
"\t</seoQuery>\n"
, i
, qp->m_queryStr
// x 10 to estimate google?
, qe->m_gigablastTraffic *
GB_TRAFFIC_MODIFIER
, qp->m_myDocId
, qp->m_myScore
//, qp->m_mySiteHash32
, qp->m_queryImportance
//,qp->m_queryInfo.m_numUniqueWordForms
//,qp->m_queryInfo.m_numRepeatWordForms
//qp->m_queryInfo.m_smallestNormTermFreq
);
}
}
// pump it some. i.e. send m_socketWriteBuf contents back to
// m_seoSocket if it is non-NULL
pumpSocketWriteBuf();
// . now instead try getting the top "imax" queries scored on the
// whole index
// . transmit them back on m_seoSocket AS WE GET THEM by calling
// pumpSocketWriteBuf() function and storing into m_socketWriteBuf
//qpbuf = getMatchingQueriesScoredForFullQuery ( );
//if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
// how many related docids do we have?
int32_t nr = rdbuf->length() / sizeof(RelatedDocId);
//
// print out the related urls
//
if ( ! m_printedRelatedDocIds && nr && m_seoSocket ) {
m_printedRelatedDocIds = true;
int32_t max = 200; // m_maxRelatedUrls;
if ( max == -1 ) max = nr;
if ( nr < max ) max = nr;
sb->safePrintf("\t<relatedUrls>\n");
for ( int32_t i = 0 ; i < max ; i++ ) {
RelatedDocId *rd = &rds[i];
// fix for titlerec not found errors
char *title = rd->ptr_rd_title;
char *url = rd->ptr_rd_url;
if ( ! title ) title = "";
if ( ! url ) url = "";
// print it out
sb->safePrintf("\t\t<relatedUrl>\n"
"\t\t\t<urlNum>%"INT32"</urlNum>\n"
"\t\t\t<url><![CDATA[%s]]></url>\n"
"\t\t\t<docId>%"INT64"</docId>\n"
"\t\t\t<siteHash32>%"UINT32"</siteHash32>\n"
"\t\t\t<title><![CDATA["
, i
, url
, rd->m_docId
, rd->m_siteHash32
);
// encode CDATA stuff in title
sb->cdataEncode(title);
sb->safePrintf("]]></title>\n"
"\t\t\t<queriesInCommon>%"INT32""
"</queriesInCommon>\n"
"\t\t\t<similarityScore>%f"
"</similarityScore>\n"
, rd->m_numCommonQueries
, rd->m_dotProduct // similarityScore
);
// print the actual query nums in common
int32_t firstOff = rd->m_firstCommonQueryNumOff;
int32_t offset = firstOff;
sb->safePrintf("\t\t\t<queriesInCommon>\n");
for ( ; offset >= 0 ; ) {
// get that node
char *buf = m_commonQueryNumBuf.getBufStart();
// and offset
buf += offset;
// then cast
QueryNumLinkedNode *qn;
qn = (QueryNumLinkedNode *)buf;
// print that
sb->safePrintf("\t\t\t\t<queryNum>%"INT32""
"</queryNum>\n"
, qn->m_queryNum );
// advance. will be -1 when done
offset = qn->m_nextOff;
}
sb->safePrintf("\t\t\t</queriesInCommon>\n");
sb->safePrintf("\t\t</relatedUrl>\n");
}
sb->safePrintf("\t</relatedUrls>\n");
}
//
// recommended inlinks!
//
// pump it some. i.e. send m_socketWriteBuf contents back to
// m_seoSocket if it is non-NULL
pumpSocketWriteBuf();
SafeBuf *kbuf = getRecommendedLinksBuf();
if ( ! kbuf || kbuf == (void *)-1 ) return kbuf;
// print out the recommended links in xml
if ( ! m_printedRecommendedLinks && m_seoSocket ) {
sb->safePrintf("\t<recommendedLinks>\n");
char *p = kbuf->getBufStart();
char *pend = kbuf->getBuf();
for ( ; p < pend ; ) {
// cast it
RecommendedLink *ri = (RecommendedLink *)p;
// skip it
p += ri->getSize();
// print it out
sb->safePrintf("\t\t<link>\n"
"\t\t\t<url><![CDATA[%s]]></url>\n"
"\t\t\t<title><![CDATA[%s]]></title>\n"
"\t\t\t<score>%f</score>\n"
"\t\t\t<siteRank>%"INT32"</siteRanke>\n"
,ri->getUrl(kbuf)
,ri->getTitle(kbuf)
,ri->m_totalRecommendedScore
,(int32_t)ri->m_siteRank
);
}
sb->safePrintf("\t</recommendedLinks>\n");
m_printedRecommendedLinks = true;
}
//
// related queries
//
// write out
pumpSocketWriteBuf();
SafeBuf *relBuf = getRelatedQueryBuf();
if ( ! relBuf || relBuf == (void *)-1 ) return relBuf;
QueryRel **rels = (QueryRel **)relBuf->getBufStart();
int32_t numRels = relBuf->length() / sizeof(QueryRel *);
//
// print out the related queries
//
if ( ! m_printedRelatedQueries && numRels && m_seoSocket ) {
sb->safePrintf("\t<relatedQueries>\n");
int32_t max = 200; // m_maxRelatedQueries;
if ( max == -1 ) max = numRels;
if ( numRels < max ) max = numRels;
for ( int32_t i = 0 ; i < max ; i++ ) {
QueryRel *rel = rels[i];
// must be a first!
if ( ! rel->m_isFirst ) { char *xx=NULL;*xx=0; }
// shortcut
//QueryInfo *qi = &rel->m_queryInfo;
// print it out
sb->safePrintf("\t\t<relatedQuery>\n"
"\t\t\t<query><![CDATA[%s]]></query>\n"
"\t\t\t<relatedDocIdsInCommon>%"INT32""
"</relatedDocIdsInCommon>\n"
"\t\t\t<relatedQueryImportance>%f"
"</relatedQueryImportance>\n"
//"\t</relatedUrl>\n"
, rel->m_queryStr
, rel->m_docIdVotes
//, qi->m_numUniqueWordForms
//, qi->m_numRepeatWordForms
//, qi->m_smallestNormTermFreq
, rel->m_totalRelatedQueryImportance
//, qi->m_myScoreRelated
);
// print details!
sb->safePrintf("\t\t\t<matchingDocIds>\n");
// linked list of Msg99Replies for the related queries.
// all in linked list are for the same query but
// restricted to a different docid!
for ( ; rel ; rel = rel->m_next ) {
// get his related docid
RelatedDocId *rd = rel->m_relatedDocId;
// print that
sb->safePrintf("\t\t\t\t<match>\n"
"\t\t\t\t\t<relatedDocId>%"INT64""
"</relatedDocId>\n"
"\t\t\t\t\t<siteHash32>%"UINT32""
"</siteHash32>\n"
//"\t\t\t\t\t"
//"<queryImportance>%f"
//"</queryImportance>\n"
"\t\t\t\t\t<docIdSimilarity>%f"
"</docIdSimilarity>\n"
"\t\t\t\t\t<docIdScore>%f"
"</docIdScore>\n"
"\t\t\t\t</match>\n"
, rd->m_docId
, rd->m_siteHash32
//, rd->m_similarityScore
, rd->m_dotProduct
, rel->m_myScore
);
}
sb->safePrintf("\t\t\t</matchingDocIds>\n");
sb->safePrintf("\t\t</relatedQuery>\n");
}
sb->safePrintf("\t</relatedQueries>\n");
m_printedRelatedQueries = true;
}
// write out
pumpSocketWriteBuf();
// this is the Keyword Insertion Tool data (KIT data)
SafeBuf *sits = getScoredInsertableTerms();
if ( ! sits || sits == (void *)-1 ) return sits;
// try to store into cachedb in case user clicks a different
// insertable term and we have to update the wordposinfo::m_rankChange
// stuff in the html src display
//if ( ! storeIntoCachedb() )
// // return -1 if it blocked and wait for store to complete
// return (SafeBuf *)-1;
// print out query changes
if ( ! m_printedScoredInsertableTerms && m_seoSocket ) {
// dump out each insertable term and it's corresponding
// QueryChanges
if ( ! printScoredInsertableTerms ( sb ) )
return NULL;
m_printedScoredInsertableTerms = true;
// end of xml response?
sb->safePrintf("</response>\n");
}
// even if not fully pumped, set it to valid here
m_socketWriteBufValid = true;
if ( ! m_seoSocket ) return &m_socketWriteBuf;
// write out
pumpSocketWriteBuf();
// if socket not done being pumped... we block
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
return (SafeBuf *)-1;
// ok, we are done
return &m_socketWriteBuf;
}
*/
// have the smallest twids on top!
int twidcmp ( const void *a, const void *b ) {
TermInfo *ua = (TermInfo *)a;
TermInfo *ub = (TermInfo *)b;
//uint32_t ua = *(uint32_t *)a;
//uint32_t ub = *(uint32_t *)b;
// HACKY: sort by lower 32 bits of the 64 bit termids so
// seo.cpp can use them with its QueryLogEntries which use 32 bit
// termids to save mem.
uint32_t ta = (uint32_t)ua->m_termId64;
uint32_t tb = (uint32_t)ub->m_termId64;
// lower first
if ( ta > tb ) return 1; // swap
if ( ta < tb ) return -1;
return 0;
}
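// What the truncation above means in practice (values are made up): two
// 64-bit termids compare equal here whenever their low 32 bits agree, which
// is what lets seo.cpp intersect against the 32-bit termids stored in its
// QueryLogEntries.
//
//   int64_t  tid64 = 0x123456789abcdef0LL;
//   uint32_t tid32 = (uint32_t)tid64; // 0x9abcdef0 -- the part twidcmp sorts by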
// . 1. make a vector of the words in the title, headers, page-inlink-text,
// and site-inlink-text
//
// . 2. pass that word vector to every machine in network to see what queries
// in the query logs we match. use Msg99.cpp. it should initialize
// on startup and load in its share of the query logs. the query log file
// should be sorted by filtered query and then split. it should also
// remove queries from the most aggressive IPs (bots). we would need
// a program, filterquerylog.cpp to do all that on gk37, our query log
// storage server. it needs to store # of times query was done, too.
// all queries should have back to back spaces removed and made lowercase.
// remove queries that have double quotes or colon operators in them.
// index each query term in the query log into HashTableX, which will
// point to the query in the buffer. then we just store the termlist
// in a SafeBuf that we save on disk. 40GB of queries split 256 ways
// is still like 175MB per server! (if one server is dead, skip it)
//
// . 3. merge all queries received from all hosts and sort by traffic.
//
// . 4. perform the queries on procog and cache the scores of the top 10
// results for each query. should be cached on machine that houses the
// query. try a 60-day cache max age.
//
// . 5. now redo the queries but with a "url:thisurl |" to get this page's
// score for each query. if the min score of the query on procog is
// well beyond our grasp, we could just skip it.
//
// . 6. then determine the # of inlinks we need to add to get more traffic
// for each query. assume siterank of 0 per inlink. if that would be
// impossible then increment the siterank until it gets us in the top 10.
//
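// A rough sketch of the step-2 query-log index described above. The real
// handler lives in seo.cpp / Msg99, so the names and table geometry below
// are assumptions for illustration only: each host keeps its slice of the
// query log in one big buffer and indexes every 32-bit termid to the byte
// offset of the query it came from, so a document's sorted termid vector
// can be intersected against it quickly.
//
//   SafeBuf    queryBuf;  // query strings, NUL-terminated, back to back
//   HashTableX termIndex; // 32-bit termid -> 32-bit offset into queryBuf
//   termIndex.set ( 4,4,65536,NULL,0,true,0,"qlogidx" ); // allow dup keys
//   int32_t off = queryBuf.length();
//   queryBuf.safeMemcpy ( "used car prices" , 16 ); // 15 chars + NUL
//   uint32_t tid32 = (uint32_t)hash64n ( "used" );
//   termIndex.addKey ( &tid32 , &off );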
// just use getTopTermsVector
HashTableX *XmlDoc::getTermIdBufDedupTable32 ( ) {
SafeBuf *tiBuf = getTermInfoBuf();
if ( ! tiBuf || tiBuf == (void *)-1 ) return (HashTableX *)tiBuf;
return &m_tidTable32;
}
// . used by handleRequest8e() which uses msg20::getSummary() with
// m_getTermListBuf to call this in the local host msg20 handler.
// . this buf is used to determine what queries this document matches
SafeBuf *XmlDoc::getTermId32Buf() {
if ( m_termId32BufValid )
return &m_termId32Buf;
SafeBuf *tiBuf = getTermInfoBuf ();
if ( ! tiBuf || tiBuf == (void *) -1 ) return tiBuf;
int32_t need = 4 * (tiBuf->length() / sizeof(TermInfo));
if ( ! m_termId32Buf.reserve(need) ) return NULL;
// scan those
char *p = tiBuf->getBufStart();
char *pend = tiBuf->getBuf();
uint32_t last = 0;
for ( ; p < pend ; ) {
TermInfo *ti = (TermInfo *)p;
p += sizeof(TermInfo);
uint32_t tid32 = (uint32_t)(ti->m_termId64);
m_termId32Buf.pushLong(tid32);
// sanity
if ( last && tid32 <= last ) { char *xx=NULL;*xx=0; }
last = tid32;
}
m_termId32BufValid = true;
return &m_termId32Buf;
}
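// The buf above is sorted ascending, so the receiving handler can intersect
// it against another sorted 32-bit termid list with one linear merge. A
// minimal sketch of that merge (hypothetical helper; the real matching code
// is in seo.cpp's handleRequest8e()):
//
//   int32_t countCommonTermIds32 ( uint32_t *a , int32_t na ,
//                                  uint32_t *b , int32_t nb ) {
//           int32_t i = 0, j = 0, common = 0;
//           while ( i < na && j < nb ) {
//                   if      ( a[i] < b[j] ) i++;
//                   else if ( a[i] > b[j] ) j++;
//                   else { common++; i++; j++; }
//           }
//           return common;
//   }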
// . used by getTermId32Buf() for getting this document's matching queries
// . serialize the words in the title and inlink text into a vector
// . SafeBuf is filled with class TermInfos! defined in seo.h. currently
// just a int64_t m_termId64 though!
// . get synonyms of each word too!
// . we sort them by the 32-bit termid so handleRequest8e() can do its fast
// compare algo to find matching queries which are also sorted by the lower
// 32 bits of terms in the query.
SafeBuf *XmlDoc::getTermInfoBuf ( ) {
setStatus ( "getterminfobuf" );
if ( m_termInfoBufValid ) return &m_termInfoBuf;
bool includeSynonyms = true;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (SafeBuf *)langId;
if (!m_tidTable32.set(4,0,16384,NULL,0,false,m_niceness,"twidtabl"))
return NULL;
//
// add document body words now to m_termInfoBuf
//
if ( ! addUniqueWordsToBuf ( &m_termInfoBuf ,
&m_tidTable32 , // dedup table
NULL, // filter table
NULL, // mincounttable
false ,
ww ,
includeSynonyms) )
return NULL;
//
// store count of each term we hash after this into "TMP"
//
HashTableX TMP;
if(!TMP.set(4,4,4096,NULL,0,false,m_niceness,"tmttt") )
return NULL;
//
// hash meta desc into TMP table
//
int32_t mdlen;
char *md = getMetaDescription( &mdlen );
if ( md ) {
Words ww3;
ww3.setx ( md , mdlen , m_niceness );
if (!addUniqueWordsToBuf(NULL,
NULL , // dedup table
NULL, // filter table
&TMP, // mincounttable
true, // store counts?
&ww3,
includeSynonyms))
return NULL;
}
//
// hash meta keywords into TMP table
//
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
if ( mk ) {
Words ww4;
ww4.setx ( mk , mklen , m_niceness );
if (!addUniqueWordsToBuf(NULL,
NULL, // dedup table
NULL, // filter table
&TMP, // mincounttable
true, // store counts?
&ww4,
includeSynonyms))
return NULL;
}
//
// hash each link text into TMP table
//
// loop over every link text to this page
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// get the link text
if ( k->size_linkText <= 1 ) continue;
// set Url
Url u;
u.set ( k->getUrl() , k->size_urlBuf );
// do not allow anomalous link text to match query
//if ( k->m_isAnomaly ) continue;
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s",
k->getUrl());
continue;
}
// debug
//log("seo: counttable for link text '%s'",k->getLinkText());
// now the words.
Words ww2;
if ( ! ww2.set ( k->getLinkText() ,
k->size_linkText-1, // len
TITLEREC_CURRENT_VERSION ,
true , // computeIds
m_niceness ))// niceness
// g_errno set on error, return NULL
return NULL;
// shortcuts on link text
if ( ! addUniqueWordsToBuf( NULL,
NULL, // dedup table
NULL, // filter table
&TMP, // mincounttable
true, // store counts?
&ww2,
includeSynonyms))
return NULL;
}
//
// now only add a link-text term to the main table and buffer if it
// occurs already in the body, or occurs TWICE in "TMP"
//
// loop over every link text to this page
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// get the link text
if ( k->size_linkText <= 1 ) continue;
// set Url
Url u;
u.set ( k->getUrl() , k->size_urlBuf );
// do not allow anomalous link text to match query
//if ( k->m_isAnomaly ) continue;
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s",
k->getUrl());
continue;
}
// now the words.
Words ww2;
if ( ! ww2.set ( k->getLinkText() ,
k->size_linkText-1, // len
TITLEREC_CURRENT_VERSION ,
true , // computeIds
m_niceness ))// niceness
// g_errno set on error, return NULL
return NULL;
if ( !addUniqueWordsToBuf( &m_termInfoBuf,
&m_tidTable32, // dedup table
NULL, // filter table
&TMP, // mincounttable, >=2 counts
false, // store counts?
&ww2,
includeSynonyms))
return NULL;
}
// how many 32-bit twids do we got?
//m_numTwids = m_twbuf.length() / 4;
//m_twids = (int32_t *)m_twbuf.getBufStart();
QUICKPOLL(m_niceness);
// . sort that buf now
// . HACK: only sorts by last 32 bits of termid!!!!
qsort ( m_termInfoBuf.getBufStart(),
m_termInfoBuf.length() / sizeof(TermInfo),
sizeof(TermInfo), // element size (one 64-bit termid per TermInfo)
twidcmp );
QUICKPOLL(m_niceness);
// if no twids then return a -2 ptr, not NULL, that means error
// not -1 that means blocked!
//if ( m_numTwids == 0 ) m_twids = (int32_t *)-2;
// do not repeat this logic
//m_twidsValid = true;
m_termInfoBufValid = true;
// return the vector
return &m_termInfoBuf;
}
// . just like getTermInfoBuf but also includes terms from related queries
// that our document does not have!
// . we do it this way because for seo.cpp::handleRequest95() it finds
// matching queries locally based on getNewTermInfoBuf()'s m_newTermInfoBuf.
SafeBuf *XmlDoc::getNewTermInfoBuf ( ) {
setStatus ( "getnewterminfobuf" );
if ( m_newTermInfoBufValid ) return &m_newTermInfoBuf;
SafeBuf *oldBuf = getTermInfoBuf ();
if ( ! oldBuf || oldBuf == (void *) -1 ) return oldBuf;
SafeBuf *itBuf = getInsertableTerms();
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
// this should be valid automatically
HashTableX *oldDedupTable = getTermIdBufDedupTable32 ( );
// get old guy
if ( ! m_newTermInfoBuf.safeMemcpy ( oldBuf ) )
return NULL;
// a dedup table on stack
HashTableX newDedup32;
if (! newDedup32.set(4,0,16384,NULL,0,false,m_niceness,"newdtabl"))
return NULL;
// now scan the insertable terms buf
char *p = itBuf->getBufStart();
char *pend = itBuf->getBuf();
// scan each "term" which might be one or more words
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
char *term = it->getTerm();
Words ww;
ww.set9 ( term , m_niceness );
// we add entries to the dedup table, "newDedup32",
// but only filter and not add to "oldDedupTable"
if ( ! addUniqueWordsToBuf ( &m_newTermInfoBuf,
&newDedup32 , // dedup table
oldDedupTable, // filter table
NULL, // mincounttable
false,
&ww ,
true ) )
return NULL;
}
QUICKPOLL(m_niceness);
// . sort that buf now.
// . HACK: only sorts by last 32 bits of termid!!!!
qsort ( m_newTermInfoBuf.getBufStart(),
m_newTermInfoBuf.length() / sizeof(TermInfo),
sizeof(TermInfo), // element size (one 64-bit termid per TermInfo)
twidcmp );
QUICKPOLL(m_niceness);
/*
// set the term freq of each one
p = m_newTermInfoBuf.getBufStart();
pend = m_newTermInfoBuf.getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
TermInfo *ti = (TermInfo *)p;
p += sizeof(TermInfo);
// look it up
int64_t tf = g_posdb.getTermFreq (cr->m_coll,ti->m_termId64);
// store it
ti->m_termFreq64 = tf;
}
*/
// do not repeat this logic
m_newTermInfoBufValid = true;
// return the vector
return &m_newTermInfoBuf;
}
bool XmlDoc::addUniqueWordsToBuf ( SafeBuf *termInfoBuf ,
HashTableX *dedupTable ,
HashTableX *filterTable ,
HashTableX *minCountTable ,
bool storeCounts,
Words *ww ,
bool getSynonyms ) {
int32_t nw = ww->getNumWords ();
uint64_t *wids = (uint64_t *)ww->getWordIds ();
//nodeid_t *tids = ww->getTagIds ();
uint8_t *langId = getLangId();
// this should have been set by parent caller
if ( ! langId || langId == (uint8_t *)-1 ) {char *xx=NULL;*xx=0; }
// store the langId here
uint8_t useLangId = *langId;
// default that to english i guess if unknown
if ( useLangId == langUnknown ) {
static XmlDoc *s_lastPrint = NULL;
if ( s_lastPrint != this ) {
log("seopipe: langid of page is unknown for twid "
"synonyms. assuming english.");
s_lastPrint = this;
}
useLangId = langEnglish;
}
Synonyms syn;
//bool inTitle = false;
// scan for title
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// out of a link
//if(tids && tids[i] == TAG_TITLE ) inTitle = true;
//if(tids && tids[i] == (TAG_TITLE | BACKBIT)) inTitle = false;
// count it, limit to 30
//if ( inTitle ) tw++;
// skip if not alnumword
if ( ! wids[i] ) continue;
// make it 32 bit
uint32_t wid32 = (uint32_t)wids[i];
// filter table
if ( filterTable && filterTable->isInTable(&wid32) ) continue;
/*
// debug
if ( minCountTable && storeCounts ) {
int32_t wlen = ww->m_wordLens[i];
char *wptr = ww->m_words[i];
char c= wptr[wlen];
wptr[wlen] = '\0';
log("seo: storecount wid=%"UINT32" word=%s",
(uint32_t)((uint64_t)wids[i]),wptr);
wptr[wlen] = c;
}
*/
// to avoid link text anomalies, the word must have been
// repeated in another link text or a meta tag. should
// fix ibm.com from getting 'lincoln' or 'unc' as high-scoring
// matching queries. should fix artdaily.com from getting
// that foreign language phrase in danish. (bedste pa nettet)
// (best of the web)
if ( minCountTable &&
! storeCounts &&
minCountTable->getScore32(&wid32) <= 1 )
continue;
// get slot
if ( dedupTable && dedupTable->isInTable(&wid32) ) continue;
// count it!
if ( storeCounts && ! minCountTable->addTerm32(&wid32) )
return false;
// show it
//if ( wid32 == 1174583722 && storeCounts ) {
// log("seo: storing occurence. current count=%"INT32"",
// (int32_t)minCountTable->getScore32(&wid32) );
//}
// add it to vector
TermInfo ti;
ti.m_termId64 = wids[i];
//ti.m_termFreq64 = -1;
if ( termInfoBuf && !
termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
return false;
// add it then
if ( dedupTable && ! dedupTable->addKey ( &wid32 ) )
return false;
// do synonyms now?
if ( ! getSynonyms ) continue;
// get its synonyms into tmpBuf
char tmpBuf[TMPSYNBUFSIZE];
int32_t naids = syn.getSynonyms(ww,i,useLangId,tmpBuf,m_niceness);
for ( int32_t j = 0 ; j < naids ; j++ ) {
// get it
uint32_t aid32 = (uint32_t)syn.m_aids[j];
// get slot
if ( dedupTable && dedupTable->isInTable(&aid32) )
continue;
// add it to vector
TermInfo ti;
ti.m_termId64 = syn.m_aids[j]; // 64 bit version
//ti.m_termFreq64 = -1;
if ( termInfoBuf &&
! termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
return false;
// add it then
if ( dedupTable && ! dedupTable->addKey(&aid32) )
return false;
// count it!
if ( storeCounts && ! minCountTable->addTerm32(&aid32))
return false;
}
}
return true;
}
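// How the callers above drive addUniqueWordsToBuf() (see getTermInfoBuf()):
//   body pass:      termInfoBuf + dedup table, no min-count filtering, so
//                   every unique body termid (and synonym) is appended.
//   counting pass:  termInfoBuf=NULL, minCountTable=&TMP, storeCounts=true,
//                   which just tallies meta-tag and link-text termids.
//   link-text pass: storeCounts=false, minCountTable=&TMP, so a link-text
//                   termid is only appended if TMP counted it at least twice
//                   (termids already added by the body pass are dropped by
//                   the dedup table).
// filterTable, when supplied (see getNewTermInfoBuf()), excludes termids
// already present in another buffer.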
/*
static void gotMsg99ReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotMsg99Reply ( slot );
}
void XmlDoc::gotMsg99Reply ( UdpSlot *slot ) {
// get replying hostid
int32_t hostId = slot->m_hostId;
// log
setStatus ( "gotmsg99reply" );
// sanity
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
// save it
int32_t i = m_numMsg99Replies;
m_msg99ReplyPtrs [i] = slot->m_readBuf;
m_msg99ReplySizes[i] = slot->m_readBufSize;
m_msg99ReplyAlloc[i] = slot->m_readBufMaxSize;
m_msg99HostIds [i] = hostId;
// steal it so it doesn't free it
slot->m_readBuf = NULL;
// note it
//log("seopipe: got msg99 reply from host #%"INT32" i=%"INT32" alloc=%"INT32"",
// hostId,i,slot->m_readBufMaxSize);
// inc the counter
m_numMsg99Replies++;
// sanity!
if ( m_numMsg99Replies > m_numMsg99Requests ) { char *xx=NULL;*xx=0; }
if ( m_numMsg99Replies > g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
// don't free the sendbuf, it is shared between all hosts UNLESS
// we are the last reply received!!!
if ( m_numMsg99Replies < g_hostdb.m_numHosts )
slot->m_sendBufAlloc = NULL;
// return control to transmit function. it will call m_callback1
// if the function is done. but if a different parent function than
// transmit called us then we call that. it just depends on the
// intial entry function that called getMatchingQueries()
m_masterLoop ( m_masterState );
}
*/
/*
float getQueryImportance2 ( QueryInfo *qi , float myScore ) {
// now divide by the top score (or 50th score) for the query
// so we can see how high we score relatively speaking...
// although, if all search results for this query have the
// same score this method kinda sux...
float imp = myScore / qe->m_minTop50Score;
return imp;
// mod because one word query terms get higher scores than
// multi-word queries because they are divided by distance in
// the search algo.
// this hurts 'gigablast' query.
if ( qi->m_numUniqueWordForms <= 1 ) score /= 10.0;
// multiply by it?
score *= qi->m_numUniqueWordForms;
// until we have the code to fix things like 'coast to coast'
// where the term is repeated, we have to punish...
if ( qi->m_numRepeatWordForms >= 1 ) score /= 30.0;
// kill 'search+engine+search+engine'
if ( qi->m_numRepeatWordForms >= 2 ) score /= 30.0;
// if every word in query is repeated... push it down
// try to fix 'bot+bot' and 'search+search' 'http+http'
if ( qi->m_numUniqueWordForms == qi->m_numRepeatWordForms )
score /= 2000.0;
// fix 'web search search'
if ( qi->m_numRepeatWordForms > 0 &&
qi->m_numUniqueWordForms == qi->m_numRepeatWordForms + 1 )
score /= 200.0;
// try to kill those queries that are just a single stop word
// or forms of stop words.
// this hurts 'gigablast' query, so make it > .9. no, then crap like
// 'web' and 'http' come up too high...
if ( qi->m_numUniqueWordForms == 1 ) {
score *= (1.1 - qi->m_smallestNormTermFreq);
score *= (1.1 - qi->m_smallestNormTermFreq);
}
// http is very common! so make the 'http' or 'http+http' queries
// very low importance
if ( qi->m_numControlWordForms == qi->m_numUniqueWordForms )
score /= 1000000.0;
// TODO: if query is a single term and it's exact syn min
// hash is that for 'and' then kill it. fix 'anding'
// boost it for more accuracy since we gotta make it into anint
//score *= 1000;
return score;
}
// set Msg99Reply::m_queryImportance for all msg99replies
void setQueryImportance ( Msg99Reply **qptrs , int32_t numQueryPtrs ) {
}
void setQueryImportanceRelated ( QueryRel **qptrs , int32_t numQueryPtrs ) {
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
QueryRel *qrel = qptrs[i];
float score = qrel->m_queryInfo.m_myScoreRelated;
QueryInfo *qi = &qrel->m_queryInfo;
float imp = getQueryImportance2 ( qi , score );
qi->m_queryImportance = imp;
}
}
*/
/*
int qp99cmp ( const void *a, const void *b ) {
Msg99Reply *qa = *(Msg99Reply **)a;
Msg99Reply *qb = *(Msg99Reply **)b;
// make sure manually added queries are on top
if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
//QueryInfo *qia = &qa->m_queryInfo;
//QueryInfo *qib = &qb->m_queryInfo;
// get scores
float scorea = qa->m_queryImportance;
float scoreb = qb->m_queryImportance;
if ( scorea < scoreb ) return 1;
if ( scorea > scoreb ) return -1;
// fallback to traffic otherwise i guess
int32_t traffica = qa->m_queryLogEntry.m_gigablastTraffic;
int32_t trafficb = qb->m_queryLogEntry.m_gigablastTraffic;
if ( qa->m_queryLogEntry.m_googleTraffic != -1 )
traffica = qa->m_queryLogEntry.m_googleTraffic;
if ( qb->m_queryLogEntry.m_googleTraffic != -1 )
trafficb = qb->m_queryLogEntry.m_googleTraffic;
if ( traffica < trafficb ) return 1;
if ( traffica > trafficb ) return -1;
// fallback alphabetical otherwise?
char *qsa = qa->m_queryStr;
char *qsb = qb->m_queryStr;
if ( ! qsa ) return 0;
if ( ! qsb ) return 0;
return strcmp( qsa , qsb );
//return 0;
}
*/
#include "Cachedb.h"
// . only check cachedb once per url
// . return false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::checkCachedb ( ) {
if ( ! m_readFromCachedb ) return true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// already set?
//if ( m_seoInfoSetFromCache )
// return true;
// return -1 if this blocked
if ( ! m_checkedCachedb ) {
// we now use the contenthash as part of the key because the
// data we cache is dependent on the content. i guess we don't
// need to use the user id then...
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
// first check cachedb. enum type cr_MatchingQueries
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t sk = g_cachedb.makeStartKey ( uh32 , ch32 );
key_t ek = g_cachedb.makeEndKey ( uh32 , ch32 );
// debug
log("seo: checking cachedb uh32=%"UINT32" ch32=%"UINT32"",
(uint32_t)uh32,
(uint32_t)ch32);
// do not repeat
m_checkedCachedb = true;
// . get it from the appropriate host
// . get cachedb rec for all types of safebufs for this
// url/content
// . then we will set safebufs based on what recs we find
// in the returned list
if ( ! m_msg0.getList ( -1, // hostid
0 , // ip
0 , // port
0 , // maxcacheage
false, // addtocache?
RDB_CACHEDB,
cr->m_collnum ,
&m_cacheList,
(char *)&sk ,
(char *)&ek ,
30000000, // minrecsizes 30MB
m_masterState,
m_masterLoop,
m_niceness ) )
// return FALSE if this blocks
return false;
}
if ( m_processedCachedbReply ) return true;
// only scan list once
m_processedCachedbReply = true;
// if empty, that was easy
if ( m_cacheList.isEmpty() ) return true;
// we might have one rec set from cache and another not, and we
// still want to cache the one that is not in storeIntoCachedb()!
//m_seoInfoSetFromCache = true;
// otherwise, parse out the cache recs
for ( ; ! m_cacheList.isExhausted() ; m_cacheList.skipCurrentRec() ) {
// breathe
QUICKPOLL(m_niceness);
// get it
char *rec = m_cacheList.getCurrentRec();
// . get type of cached rec
// . enum types cr_MatchingQueries etc. as in Cachedb.h
char recType = g_cachedb.getTypeFromKey(rec);
int32_t dataSize = m_cacheList.getCurrentDataSize();
// sanity. must at least have the cached date
if ( dataSize < 4 ) { char *xx=NULL;*xx=0; }
char *data = m_cacheList.getCurrentData ();
// in data, first int32_t is the cached time in utc
//int32_t cachedDate = *(int32_t *)data;
// skip the TIMESTAMP!
//int32_t timestamp = *(int32_t *)data;
data += 4;
dataSize -= 4;
// and version
data += 4;
dataSize -= 4;
// . 1
// . is it a cached rec for matching queries?
// . getSeoQueryInfo() needs this
if (recType == cr_MatchingQueries && !m_matchingQueryBufValid){
// debug
log("seo: found matching queries");
// total size of the msg99replies (totalMsg99ReplySize)
int32_t size1 = *(int32_t *)data;
data += 4;
// just point into the list itself. we will
// free m_cacheList on reset then.
m_matchingQueryBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
data += size1;
// now the m_matchingQueryStringBuf
size1 = *(int32_t *)data;
data += 4;
m_matchingQueryStringBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding
data += size1;
m_matchingQueryBufValid = true;
continue;
}
// . 2
// . is it a cached rec for related docids with titles?
// . getSeoQueryInfo() calls getRelatedDocIdsWithTitles()
// . m_relatedDocIdBuf SafeBuf is a buf of RelatedDocId classes
// . m_relatedTitleBuf is buf of titles and urls referenced
// by those classes
if ( recType == cr_RelatedDocIds &&
! m_relatedDocIdsWithTitlesValid ) {
// debug
log("seo: found related docids");
// first is the safebuf of RelatedDocId classes
int32_t size1 = *(int32_t *)data;
data += 4;
// point into it
//char *p = data;
//char *pend = data + size1;
// just point into the list itself. we will
// free m_cacheList on reset then.
m_relatedDocIdBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
// save this
//char *rtbuf = data;
// now the string buffer
m_relatedTitleBuf.setBuf ( data ,
size1 ,
size1 ,
false ,
0 );
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
// now the string buffer
m_commonQueryNumBuf.setBuf ( data ,
size1 ,
size1 ,
false ,
0 );
// now the RelatedDocId::ptr_url/ptr_rd_title members
// were hacked to be offsets into this for storage
// into the cache!
/*
for ( ; p < pend ; p += sizeof(RelatedDocId) ) {
QUICKPOLL(m_niceness);
// cast it
RelatedDocId *rd = (RelatedDocId *)p;
// get offsets
int32_t off1 = (int32_t)rd->ptr_rd_title;
int32_t off2 = (int32_t)rd->ptr_rd_url;
int32_t off3 = (int32_t)rd->ptr_rd_site;
// normalize/store back
rd->ptr_rd_title = rtbuf + off1;
rd->ptr_rd_url = rtbuf + off2;
rd->ptr_rd_site = rtbuf + off3;
}
*/
m_relatedDocIdsWithTitlesValid = true;
m_relatedTitleBufValid = true;
m_relatedDocIdBufValid = true;
continue;
}
// . 3
// . is it a cached rec for related queries?
// . getSeoQueryInfo() calls getRelatedQueryBuf()
if ( recType == cr_RelatedQueries && ! m_queryLinkBufValid ) {
// we changed the format of relatedquerystringbuf
// to be a bunch of QueryLogEntries now. so ignore
// if old format.
//if ( timestamp <= 1367704324 ) continue;
// debug
log("seo: found related queries");
int32_t size1;
// first is the m_relatedQueryBuf safebuf
size1 = *(int32_t *)data;
data += 4;
m_relatedQueryBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
data += size1;
// now the m_relatedQueryStringBuf
size1 = *(int32_t *)data;
data += 4;
m_relatedQueryStringBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding
data += size1;
/*
// now the ptrs, sorted
size1 = *(int32_t *)data;
data += 4;
m_relPtrs.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// test sorting
char *p = m_relPtrs.getBufStart();
char *pend = m_relPtrs.getBuf();
char *base = m_queryLinkBuf.getBufStart();
QueryLink *lastqr = NULL;
for ( ; p < pend ; p += 4 ) {
QUICKPOLL(m_niceness);
int32_t qkOff = *(int32_t *)p;
QueryLink *qr = (QueryRel *)(base+qkOff);
// no, longer, it is more complicated because
// if m_uniqueRound scoring addition
//if ( lastqr &&
// lastqr->m_totalRelatedQueryImportance <
// qr ->m_totalRelatedQueryImportance ) {
// char *xx=NULL;*xx=0;}
lastqr = qr;
}
*/
// validate
//m_relPtrsValid = true;
//m_queryLinkStringBufValid = true;
m_relatedQueryBufValid = true;
continue;
}
// if it is debug and we are not, skip it!!
//if(recType == cr_ScoredInsertableTermsDebug && ! m_seoDebug )
// continue;
// or if we are debug and it is not, skip it!
//if (recType == cr_ScoredInsertableTerms && m_seoDebug )
// continue;
/*
if ( (recType == cr_MissingTermBuf ) &&
! m_missingTermBufValid ) {
// debug
log("seo: found missingtermbuf");
int32_t size1;
size1 = *(int32_t *)data;
data += 4;
m_missingTermBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
m_missingTermBufValid = true;
}
*/
// 3b
if ( (recType == cr_WordPosInfoBuf ) &&
! m_wordPosInfoBufValid ) {
// debug
log("seo: found wordposinfo");
int32_t size1;
size1 = *(int32_t *)data;
data += 4;
m_wordPosInfoBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// WordPosInfo::m_wordPtr was stored as an offset relative to ptr_utf8Content
char *p = m_wordPosInfoBuf.getBufStart();
char *pend = m_wordPosInfoBuf.getBuf();
for ( ; p < pend ; p += sizeof(WordPosInfo) ) {
QUICKPOLL(m_niceness);
WordPosInfo *wp = (WordPosInfo *)p;
int64_t off = (int64_t)wp->m_wordPtr;
char *ptr = ptr_utf8Content + off;
if ( off == -1 ) ptr = NULL;
wp->m_wordPtr = ptr;
}
m_wordPosInfoBufValid = true;
}
// . 4
// . and the insertable terms buffer with its querychanges
// linked lists!
if ( recType == cr_ScoredInsertableTerms &&
! m_scoredInsertableTermsBufValid ) {
// debug
log("seo: found scored insertable terms");
int32_t size1;
// first is the m_insertableTermsBuf safebuf
size1 = *(int32_t *)data;
data += 4;
// just point into the list itself. we will
// free m_cacheList on reset then.
m_insertableTermsBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
// now the buffer of query changes
// these are normally just referenced by
// InsertableTerm and in the linked list directly
// into the Msg95Reply::ptr_queryChanges, but for
// caching we have to use a new safebuf
m_queryChangeBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_queryLogBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
/*
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_itStrBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
*/
/*
// debug scoring. QueryChange::m_debugScoreInfoOffset
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_debugScoreInfoBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// debug scoring. QueryChange::m_origScoreInfoOffset
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_origScoreInfoBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
*/
// insertable terms deserialization logic
char *p = m_insertableTermsBuf.getBufStart();
char *pend = m_insertableTermsBuf.getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// normalize m_firstQueryChange
int64_t off =(int64_t)(it->m_firstQueryChange);
// fix this
char *buf = m_queryChangeBuf.getBufStart();
				// shortcut
QueryChange *fqc = (QueryChange *)(buf+off);
// -1 means NULL
if ( off == -1 ) fqc = NULL;
// put back
it->m_firstQueryChange = fqc;
// terms
//off = (int32_t)it->m_termStr;
// to this
//buf = m_itStrBuf.getBufStart();
// cast it
//it->m_termStr = (char *)(buf+off);
}
// . now we set QueryChange::m_next and
// InsertableTerm::m_firstQueryChange to be offsets
// into the new m_queryChangeBuf before we stored
// into the cache....
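			// . e.g. a stored m_next of 96 means "the QueryChange
			//   at m_queryChangeBuf.getBufStart() + 96", and a
			//   stored -1 means NULL / end of the list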
p = m_queryChangeBuf.getBufStart();
pend = m_queryChangeBuf.getBuf();
for ( ; p < pend ; p += sizeof(QueryChange) ) {
QUICKPOLL(m_niceness);
// cast it
QueryChange *qc = (QueryChange *)p;
// normalize m_next
int64_t off = (int64_t)qc->m_next;
// offset into this
char *buf = m_queryChangeBuf.getBufStart();
// put back
qc->m_next = (QueryChange *)(buf + off);
// -1 means NULL
if ( off == -1 ) qc->m_next = NULL;
}
// now all ptrs should be set correctly
m_scoredInsertableTermsBufValid = true;
m_insertableTermsBufValid = true;
continue;
}
		// . is it a cached rec for the recommended links buf?
		// . m_recommendedLinksBuf is restored here exactly as it was
		//   serialized by storeRecommendedLinksBuf() below
if ( recType == cr_RecommendedLinks &&
! m_recommendedLinksBufValid ) {
// debug
log("seo: found recommended links buf");
			// first is the size of the recommended links buf
int32_t size1 = *(int32_t *)data;
data += 4;
			// now the recommended links buf itself
m_recommendedLinksBuf.setBuf ( data ,
size1 ,
size1 ,
false ,
0 );
m_recommendedLinksBufValid = true;
continue;
}
}
return true;
}
#define CACHEDB_CURRENT_VERSION 1
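// . each store*IntoCachedb() function below writes a single cachedb record
//   with (roughly) this layout, and the cachedb-reading code above parses
//   it back out the same way:
//
//     key_t   key        // g_cachedb.makeKey ( uh32 , ch32 , recType )
//     int32_t dataSize   // size of everything after this field
//     int32_t cachedTime // getTimeGlobal() when the record was stored
//     int32_t version    // CACHEDB_CURRENT_VERSION
//     [ int32_t len , <len bytes> ]*  // one (length,data) pair per SafeBuf
//
// . pointers inside the serialized SafeBufs (QueryChange::m_next,
//   InsertableTerm::m_firstQueryChange, WordPosInfo::m_wordPtr, ...) are
//   converted into offsets (-1 for NULL) before storing and converted back
//   into pointers on load, since the bufs land at different addresses then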
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . adds the list via m_msg1 and calls m_masterLoop back when it completes
bool XmlDoc::storeMatchingQueriesIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// all these things should already be validated so they should
// not block or have errors
//SafeBuf *qpbuf = getMatchingQueriesScored();
//SafeBuf *qpbuf = &m_queryPtrs;
if ( ! m_matchingQueryBufValid ) { char *xx=NULL;*xx=0; }
int32_t now = getTimeGlobal();
// calc how much space we need
//int32_t totalMsg99ReplySize = 0;
//int32_t numQueryPtrs = 0;
//Msg99Reply **qptrs = NULL;
// 1. msg99replies for matchingQueries
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize+cacheDate(now)+ver
need += 4 + m_matchingQueryBuf.length();
need += 4 + m_matchingQueryStringBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: mq listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
//
// 1. first add the matching queries, msg99 replies
//
k = g_cachedb.makeKey ( uh32, ch32 , cr_MatchingQueries );
// note it
log("seo: cachedb storing matchingqueries "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_matchingQueryBuf.length();
dataSize += 4 + m_matchingQueryStringBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_matchingQueryBuf.length() );
listBuf.safeMemcpy ( &m_matchingQueryBuf );
listBuf.pushLong ( m_matchingQueryStringBuf.length() );
listBuf.safeMemcpy ( &m_matchingQueryStringBuf );
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding matching query list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
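// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error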
bool XmlDoc::storeRelatedDocIdsIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_queryPtrsWholeValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
if ( ! m_relatedDocIdsWithTitlesValid ) { char *xx=NULL;*xx=0;}
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0;}
// 2. related docids
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_relatedDocIdBuf.length();
need += 4 + m_relatedTitleBuf.length();
need += 4 + m_commonQueryNumBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: rd listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
char *p1;
char *p2;
int32_t now = getTimeGlobal();
// 2. then add related docids
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedDocIds );
// note it
log("seo: cachedb storing relateddocids "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_relatedDocIdBuf.length();
dataSize += 4 + m_relatedTitleBuf.length();
dataSize += 4 + m_commonQueryNumBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_relatedDocIdBuf.length() );
p1 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_relatedDocIdBuf );
p2 = listBuf.getBuf();
listBuf.pushLong ( m_relatedTitleBuf.length() );
listBuf.safeMemcpy ( &m_relatedTitleBuf );
//char *tbuf = m_relatedTitleBuf.getBufStart();
listBuf.pushLong ( m_commonQueryNumBuf.length() );
listBuf.safeMemcpy ( &m_commonQueryNumBuf );
// make ptrs into offsets into m_relatedTitleBuf
/*
for ( ; p1 < p2 ; p1 += sizeof(RelatedDocId )) {
QUICKPOLL(m_niceness);
RelatedDocId *rd = (RelatedDocId *)p1;
int32_t off;
off = rd->ptr_rd_url - tbuf;
rd->ptr_rd_url = (char *)off;
off = rd->ptr_rd_title - tbuf;
rd->ptr_rd_title = (char *)off;
off = rd->ptr_rd_site - tbuf;
rd->ptr_rd_site = (char *)off;
}
*/
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding related docids list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::storeRecommendedLinksBuf ( ) {
if ( ! m_writeToCachedb ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
if ( ! m_recommendedLinksBufValid ) { char *xx=NULL;*xx=0;}
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_recommendedLinksBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: reclnx listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
int32_t now = getTimeGlobal();
// 2. then add related docids
k = g_cachedb.makeKey ( uh32 , ch32, cr_RecommendedLinks );
// note it
log("seo: cachedb storing recommendedlinksbuf "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_recommendedLinksBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_recommendedLinksBuf.length() );
listBuf.safeMemcpy ( &m_recommendedLinksBuf );
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding recommendedlinksbuf list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::storeRelatedQueriesIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_relatedQueryBufValid ) { char *xx=NULL;*xx=0; }
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
//SafeBuf *relBuf = NULL;
//if ( m_relPtrsValid ) relBuf = &m_relPtrs;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
// 3. related queries. buf of QueryLinks
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
need += 4 + m_relatedQueryBuf.length();
need += 4 + m_relatedQueryStringBuf.length();
//need += 4 + m_relPtrs.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: rq listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
//
// 3. then related queries (STORED by m_queryImportanceRelated)
//
//int32_t sizeRels = (m_relPtrs.length() / 4) * sizeof(QueryLink);
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedQueries );
// note it
log("seo: cachedb storing relatedqueries "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_relatedQueryBuf.length(); // sizeRels;
dataSize += 4 + m_relatedQueryStringBuf.length();
//dataSize += 4 + m_relPtrs.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_relatedQueryBuf.length() );
//char *p3 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_relatedQueryBuf );
//char *p4 = listBuf.getBuf();
listBuf.pushLong ( m_relatedQueryStringBuf.length() );
listBuf.safeMemcpy ( &m_relatedQueryStringBuf );
//listBuf.pushLong ( m_relPtrs.length() );
//char *p5 = listBuf.getBuf();
//listBuf.safeMemcpy ( &m_relPtrs );
// sanity tests
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding related queries list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
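// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error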
bool XmlDoc::storeWordPosInfoBufIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_wordPosInfoBufValid ) { char *xx=NULL;*xx=0; }
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_wordPosInfoBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: wpi listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
	// 3b. the word position info buf (read back by the cr_WordPosInfoBuf
	// branch of the cachedb reader above)
uint8_t cr8 = cr_WordPosInfoBuf;
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
// note it
log("seo: cachedb storing wordposinfobuf "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_wordPosInfoBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_wordPosInfoBuf.length() );
char *p8 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_wordPosInfoBuf );
char *p9 = listBuf.getBuf();
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
	// store WordPosInfo::m_wordPtr as an offset relative to the html
	// ptr_utf8Content
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
QUICKPOLL(m_niceness);
WordPosInfo *wp = (WordPosInfo *)p8;
int64_t off = wp->m_wordPtr - ptr_utf8Content;
		// if it's a tag or fielded term like ext:html or
		// filetype:html it won't be in the html itself
if ( wp->m_wordPtr< ptr_utf8Content )
off = -1;
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
off = -1;
wp->m_wordPtr = (char *)off;
}
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding wordposinfobuf list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
/*
bool XmlDoc::storeMissingTermBufIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_missingTermBufValid ) { char *xx=NULL;*xx=0; }
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_missingTermBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: wpi listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
// does not trigger a reserve
if ( ! listBuf.reserve ( need + 4 ) ) return true;
// ensure no reallocating - that would screw logic below up
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
// 4. then the insertable terms and their query changes and log buf
// mangle key a little if in debug mode because that is the only
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
uint8_t cr = cr_MissingTermBuf;
k = g_cachedb.makeKey ( uh32 , ch32, cr );
// note it
log("seo: cachedb storing missingtermbuf "
"uh32=%"UINT32" ch32=%"UINT32"",uh32,ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_missingTermBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_missingTermBuf.length() );
listBuf.safeMemcpy ( &m_missingTermBuf );
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding missingtermbuf list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
*/
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . adds the list via m_msg1 and calls m_masterLoop back when it completes
bool XmlDoc::storeScoredInsertableTermsIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_scoredInsertableTermsBufValid ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
need += 4 + m_insertableTermsBuf.length();
	// InsertableTerm::m_firstQueryChange:
need += 4 + m_queryChangeBuf.length();
	// QueryChange::m_replyQueryOffset:
need += 4 + m_queryLogBuf.length();
//InsertableTerm::m_termStr reference
//need += 4 + m_itStrBuf.length();
//need += 4 + m_wordPosInfoBuf.length();
	// TOO BIG to store into cachedb!
//need += 4 + m_debugScoreInfoBuf.length(); // debug only
//need += 4 + m_origScoreInfoBuf.length(); // debug only
// sanity
if ( need > 20000000 ) {
log("cachedb: listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
char *p1;
char *p2;
// 4. then the insertable terms and their query changes and log buf
// mangle key a little if in debug mode because that is the only
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
uint8_t cr8 = cr_ScoredInsertableTerms;
//if ( m_seoDebug ) cr = cr_ScoredInsertableTermsDebug;
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
// note it
log("seo: cachedb storing scoredinsertableterms "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_insertableTermsBuf.length();
dataSize += 4 + m_queryChangeBuf.length();
dataSize += 4 + m_queryLogBuf.length();
//dataSize += 4 + m_itStrBuf.length();
//dataSize += 4 + m_wordPosInfoBuf.length();
//dataSize += 4 + m_debugScoreInfoBuf.length(); // debug only
//dataSize += 4 + m_origScoreInfoBuf .length(); // debug only
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
// m_insertableTermsBuf
listBuf.pushLong ( m_insertableTermsBuf.length() );
p1 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_insertableTermsBuf );
char *p1End = listBuf.getBuf();
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_queryChangeBuf
listBuf.pushLong ( m_queryChangeBuf.length() );
p2 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_queryChangeBuf );
char *p2End = listBuf.getBuf();
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_queryLogBuf
listBuf.pushLong ( m_queryLogBuf.length() );
listBuf.safeMemcpy ( &m_queryLogBuf );
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_itStrBuf referenced by InsertableTerm::m_termStr
//listBuf.pushLong ( m_itStrBuf.length() );
//listBuf.safeMemcpy ( &m_itStrBuf );
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_itStrBuf referenced by InsertableTerm::m_termStr
//listBuf.pushLong ( m_wordPosInfoBuf.length() );
//char *p8 = listBuf.getBuf();
//listBuf.safeMemcpy ( &m_wordPosInfoBuf );
//char *p9 = listBuf.getBuf();
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// debug buffers, QueryChange::m_*Offset parms ref them if
// m_seoDebug is true. TOO BIG TO STORE INTO CACHEDB!
//listBuf.pushLong ( m_debugScoreInfoBuf.length() );
//listBuf.safeMemcpy ( &m_debugScoreInfoBuf );
//listBuf.pushLong ( m_origScoreInfoBuf.length() );
//listBuf.safeMemcpy ( &m_origScoreInfoBuf );
	// convert the InsertableTerm::m_firstQueryChange pointers into
	// offsets into m_queryChangeBuf (-1 means NULL)
for ( ; p1 < p1End ; ) { // p1 += sizeof(InsertableTerm) ) {
QUICKPOLL(m_niceness);
InsertableTerm *it = (InsertableTerm *)p1;
p1 += it->getSize();
QueryChange *qc = it->m_firstQueryChange;
int64_t qoff =(char *)qc - m_queryChangeBuf.getBufStart();
if ( qc == NULL ) qoff = -1;
it->m_firstQueryChange = (QueryChange *)qoff;
// and m_termStr
//int32_t off = it->m_termStr - m_itStrBuf.getBufStart();
//it->m_termStr = (char *)off;
}
// make QueryChange::m_next ptrs into offsets as well
for ( ; p2 < p2End ; p2 += sizeof(QueryChange) ) {
QUICKPOLL(m_niceness);
QueryChange *qc = (QueryChange *)p2;
QueryChange *next = qc->m_next;
int64_t noff =(char *)next-m_queryChangeBuf.getBufStart();
if ( next == NULL ) noff = -1;
qc->m_next = (QueryChange *)noff;
}
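	// . the cr_ScoredInsertableTerms branch of the cachedb reader above
	//   undoes these two loops on load, turning the stored offsets back
	//   into pointers (and -1 back into NULL)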
	// WordPosInfo::m_wordPtr relative to html ptr_utf8Content!
/*
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
QUICKPOLL(m_niceness);
WordPosInfo *wp = (WordPosInfo *)p8;
int32_t off = wp->m_wordPtr - ptr_utf8Content;
// if its a tag or fielded term it won't be in the
// html like ext:html or filetype:html
if ( wp->m_wordPtr< ptr_utf8Content )
off = -1;
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
off = -1;
wp->m_wordPtr = (char *)off;
}
*/
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding insertable terms list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
#define MAX_TOP_MATCHING_QUERIES 300
/*
// returns -1 if blocked, NULL with g_errno set on error
SafeBuf *XmlDoc::getMatchingQueriesScored ( ) {
setStatus ( "getmatchingqueriesscored" );
// try to set m_queryPtrs from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
// just re-use the same m_queryPtrs SafeBuf we used above but we
// set the Msg99Reply::m_myScore here and sort them by that
if ( m_queryPtrsSortedValid )
return &m_queryPtrs;
// get the queries from msg99 replies first
SafeBuf *mq = getMatchingQueries(false,-1);
if ( mq == NULL || mq == (void *)-1 ) return mq;
// time it
if ( ! m_beginTimeMatchUrl )
m_beginTimeMatchUrl = gettimeofdayInMilliseconds();
	// i'm assuming this is query ptrs!?!?!
int32_t numQueryPtrs = mq->length() / sizeof(Msg99Reply *);
// get the qptrs
Msg99Reply **qptrs = (Msg99Reply **)mq->getBufStart();
// score them in parallel over all hosts in network
if ( ! scoreDocIdRestrictedQueries ( qptrs,NULL,numQueryPtrs) )
return (SafeBuf *)-1;
// error?
if ( g_errno ) return NULL;
// total pages indexed!
int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
// take 25% of that. i think 'the', the most common term, is in about
// 25% of those pages
numPagesIndexed /= 4;
//
// SET QUERY IMPORTANCE
//
// . set the m_queryImportance float and sort by that
// . how important is the matching query for the main url?
// . just divide the main url's score by the
// QueryLogEntry::m_mintop50Score for the query to normalize it
// . however, when we compute RelatedDocId::m_dotProduct we normalize
// using the score of the #1 result because we executed the full
// query, so keep that in mind. we can't mix the two.
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
Msg99Reply *qp = qptrs[i];
		// shortcut
QueryLogEntry *qe = &qp->m_queryLogEntry;
// get # results
int64_t numResults = qe->m_numTotalResultsInSlice;
// fix it to be global
numResults *= (int64_t)g_hostdb.getNumGroups();
// big indexes did the "slice logic" restricting docid
// range to MAX_DOCID * .10 when setting this!
if ( numPagesIndexed > 10000000 ) numResults *= 10;
// point to query
char *qstr = qp->m_queryStr;
// if not processed assume like 1M?
if ( numResults < 0 ) {
log("seo: guessing query importance for '%s' from "
"hostid #%"INT32"",
qstr,(int32_t)qp->m_replyingHostId);
qp->m_queryImportance = 0.0;
continue;
}
// zero means make it 1 to avoid div by zero below
if ( numResults == 0 ) numResults = 1;
// and also weight by traffic! the more traffic the
// more important perhaps...
// NO! with this we get 'www' 'view' etc for
// jezebelgallery.com coming up in the top 50 matching
// queries by importance. crap, but it hurts cheatcodes.com
// then.
// fix
//if ( strcmp(qstr,"search engine") == 0 )
// log("poo");
// adjust since numPagesIndexed is actually a quarter of
// the # of pages indexed since 'the' is only in about
// 1/4 of the pages and it is the most common term
if ( numResults > numPagesIndexed )
numResults = numPagesIndexed;
// try doubling this to get rid of www problem for
// jezebelgallery.com. it put www and view down some more.
float popRatio = (float)numResults / (float)numPagesIndexed;
// stuff like 'www' and 'view' will be near 1.0
float weight = 1.0 - popRatio;//(popRatio * popRatio);
// go crazy
weight *= weight;
weight *= weight;
weight *= weight;
weight *= weight;
// do not let this be 1.0 because 'web page searching' is
// getting 1.0 for it and getting a weight of 0.0 and making
// it the same as the ignored matching queries for
// gigablast.com, so we end up using the ignored common
// word matching queries for getting competitor pages and it
// is bad! we need to fix that to not use such queries if
// their importance is 0!
if ( weight < .01 ) weight = .01;
// because you are in the top 50
//numResults = (int32_t)powf ( (float)numResults , .4 );
//if ( numResults == 0 )
// imp /= 1;
// otherwise, normalize by division
//else
// imp /= numResults;
// boost it!
//imp *= 10000;
//QueryInfo *qi = &qp->m_queryInfo;
//float imp = getQueryImportance2 ( qi , score );
// just try sorting by your serp score, hopefully we remove
		// shit like 'www' because isCommonQueryWordInEnglish()
// takes care of it below.
// consider *= weight on this
// the idea is to ignore the top serp score because
// you do not want terms that you may be able to be #1
// for but are not really relevant for your doc. so for this
// let's focus on just getting the queries that best represent
// your doc...
double imp = qp->m_myScore * weight;
qp->m_queryImportance = (float)imp;
// just use this!!!
//qp->m_queryImportance = qp->m_myScore /
// (float)(numResults*numResults);
// set importance to 0 for queries with minus sign in them
// that indicates negative terms...
for ( char *p = qstr; *p ; p++ ) {
if ( *p != ' ' ) continue;
if ( p[1] != '-' ) continue;
// 'a - b' is ok
if ( p[2] == ' ' ) continue;
qp->m_queryImportance = 0.00;
log("seo: ignoring query '%s' with minus sign", qstr);
break;
}
// avoid common queries with just common words in them:
// http web www com org us we 1 2 3 by on i https one page
Words ww;
ww.set3 ( qstr );
int32_t i; for ( i = 0 ; i < ww.m_numWords ; i++ ) {
int64_t wid = ww.m_wordIds[i];
if ( wid == 0 ) continue;
if ( ! isCommonQueryWordInEnglish ( wid ) ) break;
}
if ( i >= ww.m_numWords ) {
qp->m_queryImportance = 0.00;
log("seo: ignoring common query '%s'", qstr);
}
// skip debug for now
if ( ! m_seoDebug ) continue;
// note it
log("seo: "
"imp=%f "
"numresults=%"INT64" "
"numpagesindexed=%"INT64" "
"popweight=%f "
"myscore=%f "
"topscore=%f "
"qstr=%s",
qp->m_queryImportance,
numResults,
numPagesIndexed,
weight,
qp->m_myScore,
qe->m_topSERPScore,
qstr);
}
// let's sort them first
qsort ( qptrs ,
numQueryPtrs ,
sizeof(Msg99Reply *),
qp99cmp );
// log for debug
int32_t maxk = numQueryPtrs;
// limit to logging 300 to avoid log spam
if ( maxk > MAX_TOP_MATCHING_QUERIES )
maxk = MAX_TOP_MATCHING_QUERIES; // 300;
// limit to top 300 dammit, otherwise we can't store all
// into cachedb!!!
int32_t newLen = maxk * sizeof(Msg99Reply *);
m_queryPtrs.setLength ( newLen );
for ( int32_t k = 0 ; k < maxk ; k++ ) {
Msg99Reply *kp = qptrs[k];
log("seopipe: newquery=\"%s\" myscore=%f imp=%f",
kp->m_queryStr,
kp->m_myScore,
kp->m_queryImportance);
}
// time it
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginTimeMatchUrl;
log("seopipe: time: matchingscoredqueries took %"INT64" ms",took);
m_queryPtrsSortedValid = true;
if ( ! storeMatchingQueriesIntoCachedb() )
// return -1 if it blocked and wait for store to complete
return (SafeBuf *)-1;
return mq;
}
*/
static void gotMsg3aReplyForFullQueryWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->setStatus ( "gotmsg3areplyforfullquerywrapper" );
THIS->gotMsg3aReplyForFullQuery();
// . go back to the main entry function
// . make sure g_errno is clear from a msg3a g_errno before calling
// this lest it abandon the loop
THIS->m_masterLoop ( THIS->m_masterState );
}
/*
void XmlDoc::gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
Msg99Reply *qp ) {
// try again for next guy
m_triedCache = false;
char *p = cachedRec;
// # docids
int32_t numDocIds = *(int32_t *)p;
p += 4;
// total # results
int32_t numTotalResults = *(int32_t *)p;
p += 4;
// docids
int64_t *docIds = (int64_t *)p;
p += 8 * numDocIds;
// scores
float *scores = (float *)p;
p += sizeof(float) * numDocIds;
// site hashes
int32_t *siteHashes = (int32_t *)p;
p += 4 * numDocIds;
// store score info into this class
TopDocIds *td = qp->m_topDocIds;
// store reply info, like # docids, in the query ptr
int32_t max = numDocIds;
if ( max > (int32_t)NUM_TOP_RESULTS ) max = (int32_t)NUM_TOP_RESULTS;
td->m_numDocIds = max;
// count replies
m_numMsg3aReplies++;
// log to log as well
char tmp[50000];
p = tmp;
p += sprintf(p,
"seopipe: got full results CACHED "
"qrynum=%"INT32"of%"INT32" docids=%"INT32" "
"query=\"%s\" ",
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
m_maxFullQueries ,
td->m_numDocIds,
qp->m_queryStr );
// log each docid
for ( int32_t i = 0 ; i < max ; i++ ) {
//float score = m_msg3a->getScores()[i];
int64_t d = docIds[i];
//int32_t sh32 = m_msg3a->getSiteHash32(i);
p += sprintf(p,"d%"INT32"=%"INT64" ",i,d);
}
log(tmp);
	// shortcut. pumpSocket() sends the contents of this to m_seoSocket
SafeBuf *sb = &m_socketWriteBuf;
sb->safePrintf(
"\t<seoQueryScoreInfo>\n"
"\t\t<queryNum>%"INT32"</queryNum>\n"
"\t\t<numTotalEstimatedSearchResults>%"INT32""
"</numTotalEstimatedSearchResults>\n"
"\t\t<numDocIds>%"INT32"</numDocIds>\n"
, m_msg3a->m_hackQNum
, numTotalResults
, numDocIds
);
// print the top 50 scores
for ( int32_t i = 0 ; i < max ; i++ ) {
float score = scores[i];
int64_t d = docIds[i];
int32_t sh32 = siteHashes[i];
sb->safePrintf("\t\t<searchResult>\n");
sb->safePrintf("\t\t\t<rank>%"INT32"</rank>\n",i+1);
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
sb->safePrintf("\t\t\t<docId>%"INT64"</docId>\n",d);
sb->safePrintf("\t\t\t<siteHash32>%"UINT32"</siteHash32>\n",sh32);
sb->safePrintf("\t\t</searchResult>\n");
// store results for this Msg99Reply
td->m_topDocIds[i] = d;
td->m_topScores[i] = score;
td->m_topSiteHashes[i] = sh32;
}
// reset rest so it prints pretty on gdb debug print cmd
for ( int32_t i = max ; i < (int32_t)NUM_TOP_RESULTS ; i++ ) {
td->m_topDocIds[i] = 0LL;
td->m_topScores[i] = 0.0;
td->m_topSiteHashes[i] = 0;
}
sb->safePrintf("\t</seoQueryScoreInfo>\n");
// pump m_socketWriteBuf to m_seoSocket
pumpSocketWriteBuf ( );
}
*/
// . this is the msg3a reply for related docids only
// . the full replies we get for determining ranks from scores for the
//   HTML simulator are handled in seo.cpp using State95::m_msg3a.
void XmlDoc::gotMsg3aReplyForFullQuery ( ) {
int32_t err = g_errno;
// save it so we know related docid generation had an error...
if ( g_errno && ! m_msg3aErrno )
m_msg3aErrno = g_errno;
setStatus ( "gotmsg3areplyforfullquery" );
if ( g_errno ) {
log("seopipe: got msg3a reply error: %s",mstrerror(g_errno));
g_errno = 0;
}
// try again for next guy
//m_triedCache = false;
// how many docids in the search results were returned to us?
int32_t numDocIds = m_msg3a->getNumDocIds();
// total # search results estimated
//int32_t numTotalResults = m_msg3a->getNumTotalEstimatedHits();
// get the query as we received it in the msg99 reply
//Msg99Reply *qp = (Msg99Reply *)m_msg3a->m_hackQPtr;
int32_t queryNum = (int32_t)m_msg3a->m_hackQNum;
// . point to the empty class we reserved in the buf
// . store score info into this class
//TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBuf();//Start();
// ensure enough room
//if ( m_topDocIdsBuf.getAvail() < sizeof(TopDocIds) )
// m_topDocIdsBuf.reserve(sizeof(TopDocIds) )
// get next available spot to store this
TopDocIds *td = (TopDocIds *)m_topDocIdsBuf.getBuf();
int32_t tdnum = m_topDocIdsBuf.length() / sizeof(TopDocIds);
m_topDocIdsBuf.incrementLength(sizeof(TopDocIds));
if ( m_topDocIdsBuf.length() > m_topDocIdsBuf.m_capacity ) {
char *xx=NULL;*xx=0; }
QueryLink *qks = (QueryLink *)m_matchingQueryBuf.getBufStart();
QueryLink *qk = &qks[queryNum];
// the relateddocidnum hack
if ( tdnum > 32000 ) { char *xx=NULL;*xx=0; }
qk->m_relatedDocIdNum = tdnum;
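	// . TopDocIds::m_queryNum and QueryLink::m_relatedDocIdNum tie a
	//   query to its full result set: getRelatedDocIds() walks
	//   m_topDocIdsBuf and uses m_queryNum to get back to the QueryLink.
	//   the 32000 sanity check above suggests m_relatedDocIdNum is a
	//   small (16-bit) field.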
// store reply info, like # docids, in the query ptr
int32_t max = numDocIds;
if ( max > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS )
max = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
td->m_numDocIds = max;
// QueryLink # in the m_matchingQueryBuf buffer we represent
td->m_queryNum = queryNum;
// keep it clean
//qp->m_docIdVotes = 0;
// get the query base hash and use that to
// dedup. the query base hash ignores common
// words and converts words to their synonym
// with the smallest hash
//int64_t qbh = getQueryBaseHash(qstr);
//m_msg3a->m_hackQNum = m_queryNum;
//m_msg3a->m_hackQPtr = (char *)qp;
// count replies
m_numMsg3aReplies++;
// log to log as well
//char tmp[50000];
SafeBuf tmp;
//char *p = tmp;
tmp.safePrintf(
"seopipe: got list of %"INT32" related docids for "
"qrynum=%"INT32" "
//"of%"INT32""
"numDocids=%"INT32" "
"query=\"",
numDocIds,
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
//m_maxFullQueries ,
td->m_numDocIds);
char *qqq = qk->getQueryString(&m_matchingQueryStringBuf);
tmp.safeStrcpy(qqq);
tmp.safePrintf("\" (err=%s)",
mstrerror(err));
// log each docid
//for ( int32_t i = 0 ; i < max ; i++ ) {
// //float score = m_msg3a->getScores()[i];
// int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
// //int32_t sh32 = m_msg3a->getSiteHash32(i);
// p += sprintf(p,"d%"INT32"=%"INT64" ",i,d);
//}
char *msg = tmp.getBufStart();
log("%s",msg);
/*
	// shortcut. pumpSocket() sends the contents of this to m_seoSocket
SafeBuf *sb = &m_socketWriteBuf;
sb->safePrintf(
"\t<seoQueryScoreInfo>\n"
"\t\t<queryNum>%"INT32"</queryNum>\n"
"\t\t<numTotalEstimatedSearchResults>%"INT32""
"</numTotalEstimatedSearchResults>\n"
"\t\t<numDocIds>%"INT32"</numDocIds>\n"
, m_msg3a->m_hackQNum
, numTotalResults
, numDocIds
);
*/
// print the top 50 scores
for ( int32_t i = 0 ; i < max ; i++ ) {
float score = m_msg3a->m_scores[i];//getScores()[i];
int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
int32_t sh26 = m_msg3a->getSiteHash26(i);
/*
sb->safePrintf("\t\t<searchResult>\n");
sb->safePrintf("\t\t\t<rank>%"INT32"</rank>\n",i+1);
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
sb->safePrintf("\t\t\t<docId>%"INT64"</docId>\n",d);
sb->safePrintf("\t\t\t<siteHash32>%"UINT32"</siteHash32>\n",sh32);
sb->safePrintf("\t\t</searchResult>\n");
*/
// store results for this Msg99Reply
td->m_topDocIds[i] = d;
td->m_topScores[i] = score;
td->m_topSiteHashes26[i] = sh26;
}
// reset rest so it prints pretty on gdb debug print cmd
for ( int32_t i = max ; i < (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; i++ ) {
td->m_topDocIds[i] = 0LL;
td->m_topScores[i] = 0.0;
td->m_topSiteHashes26[i] = 0;
}
/*
sb->safePrintf("\t</seoQueryScoreInfo>\n");
*/
// give front-end the progress bar info
if ( m_seoSocket && m_progressBar ) {
// tmp buf
char tmp[16];
float percent = (float)m_numMsg3aReplies ;
//percent /= (float)m_maxFullQueries;
percent *= 100.0;
// these are 80% of the pipeline if getting competitor
// backlinks
if ( m_progressBar == 2 ) percent *= .80;
int32_t percentLong = (int32_t)percent;
if ( percentLong >= 100 ) percentLong = 99;
int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong);
if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen);
// try a send on non-blocking socket
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n);
// forget error
errno = 0;
}
}
bool XmlDoc::clientClosedConnection ( ) {
if ( ! m_seoSocket ) return false;
if ( m_clientClosed ) return true;
if ( g_now - m_lastCheckTime < 50 ) return m_clientClosed;
m_lastCheckTime = g_now;
char buffer[100];
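	// recv() returning 0 on a connected TCP socket means the client did
	// an orderly shutdown. MSG_PEEK leaves any pending bytes in the
	// socket buffer and MSG_DONTWAIT keeps this probe from blocking.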
if ( recv(m_seoSocket->m_sd,buffer,99,MSG_PEEK|MSG_DONTWAIT) == 0 ) {
m_clientClosed = true;
log("xmldoc: CLIENT CLOSED CONNECTION!!");
}
return m_clientClosed;
}
// . returns -1 if blocked, NULL with g_errno set on error
// . we do this to get related docids
SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
setStatus ( "getmatchingqueriesscoredforfullquery" );
// just re-use the same m_queryPtrs SafeBuf we used above but we
// set the Msg99Reply::m_myScore here and sort them by that
if ( m_queryPtrsWholeValid )
return &m_matchingQueryBuf;
// get the queries sorted by the url: | scores for our main url
SafeBuf *mq = getMatchingQueryBuf();
if ( mq == NULL || mq == (void *)-1 ) return mq;
// setup timer
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! m_beginTimeFullQueries )
m_beginTimeFullQueries = gettimeofdayInMilliseconds();
// this buffer holds a ptr to each query in each msg99 reply we
// received from all hosts in the network
QueryLink *qks = (QueryLink *)mq->getBufStart();
int32_t nks = mq->length()/sizeof(QueryLink);
int32_t maxFullQueries = 50;
int32_t tneed = maxFullQueries * sizeof(TopDocIds);
if ( m_topDocIdsBuf.length() == 0 && ! m_topDocIdsBuf.reserve(tneed) )
return NULL;
// . now launch msg3as at them
// . this is 60k so new it here
if ( ! m_msg3a ) {
// reset the query # we are processing
m_queryNum = 0;
m_numMsg3aRequests = 0;
m_numMsg3aReplies = 0;
if ( ! m_fullQueryDedup.set(8,0,256,NULL,0,
false,m_niceness,"fqdd"))
return NULL;
try { m_msg3a = new ( Msg3a ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_msg3a, sizeof(Msg3a),"xdmsg3a");
// need this too now i guess since it is 65k
try { m_query3a = new ( Query ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_query3a, sizeof(Query),"xdqry3a");
}
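	// . the loop below is a launch-and-wait state machine: walk the
	//   matching QueryLinks in order, dedup them by synonym base hash,
	//   skip zero-importance ones and launch a full msg3a for up to
	//   maxFullQueries of them. each reply is handled by
	//   gotMsg3aReplyForFullQuery() which appends a TopDocIds entry,
	//   and we keep returning -1 (blocked) until every launched
	//   request has replied.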
loop:
// breath in case we hit all cache
QUICKPOLL(m_niceness);
// have we launched all the requests we need to
bool exhausted = false;
if ( m_queryNum >= nks ) exhausted = true;
if ( m_numMsg3aRequests >= maxFullQueries ) exhausted = true;
// if client closed browser connection by hitting the stop sign
// then stop here!
if ( clientClosedConnection() ) m_hadMatchError = ESOCKETCLOSED;
if ( m_hadMatchError ) exhausted = true;
// if nothing to launch
if ( exhausted &&
// and all replies received
m_numMsg3aReplies >= m_numMsg3aRequests ) {
// nuke the msg3a to save mem
mdelete ( m_msg3a, sizeof(Msg3a) , "msg3a" );
delete ( m_msg3a );
m_msg3a = NULL;
mdelete ( m_query3a , sizeof(Query), "qry3a" );
delete ( m_query3a );
m_query3a = NULL;
// time it
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginTimeFullQueries;
log("seopipe: time: fullqueries took %"INT64" ms",took);
// force closed?
if ( m_hadMatchError ) return NULL;
// we are done!
m_queryPtrsWholeValid = true;
return &m_matchingQueryBuf;//queryPtrs;
}
// if nothing to launch wait for all replies
if ( exhausted )
return (SafeBuf *)-1;
// get the current query to process
//Msg99Reply *qp = queryPtrs[m_queryNum];
QueryLink *qk = &qks[m_queryNum];
int32_t savedQueryNum = m_queryNum;
QueryLogEntry *qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
	// shortcut
//int64_t h64 = qk->m_querySynBaseHash64;
int64_t h64 = getSynBaseHash64 ( qe->getQueryString(),qe->m_langId);
// . if we already did a similar query, then skip it
// . Msg99Reply::m_topDocIds will be NULL so getRelatedDocIds() will
// know we skipped this query and to ignore it
if ( m_fullQueryDedup.isInTable(&h64) ) {
m_queryNum++;
goto loop;
}
// or if importance is 0, which means to ignore!
if ( qk->m_queryImportance <= 0.0 ) {
m_queryNum++;
goto loop;
}
	// shortcut
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
// sanity
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
// this is required for synonyms!
// TODO: use whatever language the query is!!!
uint8_t langId = langEnglish;
	// shortcut
int32_t qlen = gbstrlen(qstr);
//int32_t collLen = gbstrlen(cr->m_coll);
// set the request
m_mr2.reset();
m_mr2.ptr_query = qstr;
m_mr2.size_query = qlen+1;
//m_mr2.ptr_coll = cr->m_coll;
//m_mr2.size_coll = collLen+1;
m_mr2.m_collnum = cr->m_collnum;
m_mr2.m_queryExpansion = 1;
m_mr2.m_language = langId;
m_mr2.m_niceness = m_niceness;
// . get top 50 results now
// . then related docids will have to be in there
m_mr2.m_docsToGet = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
m_mr2.m_useSeoResultsCache = true;
// we do not need this, we just want the related docids/scores
m_mr2.m_getDocIdScoringInfo = false;
// use cache for 7 days since it is just for getting related docids
// right now. make sure that that cache saves to disk.
// MDW: why is this not working?
//m_mr2.m_maxAge = 86400 * 7;
//m_mr2.m_addToCache = true;
//m_mr2.m_debug = 1;
// prepend to the query?
int32_t ulen = m_firstUrl.m_ulen;
// go to next guy if this query is too big already
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
m_queryNum++;
goto loop;
}
// support for the new TopDocIds class which holds detailed search
// results for selected matching queries QueryLinks
//int32_t maxt = numQueryPtrs;
//if ( maxt > m_maxQueries ) maxt = m_maxQueries;
//if ( ! maxt ) { char *xx=NULL;*xx=0; }
// we also need the top docids
//if ( ! m_topDocIdsBuf.m_capacity ) {
// int32_t need = sizeof(TopDocIds) * (int32_t)MAX_MATCHING_QUERIES;
// if ( ! m_topDocIdsBuf.reserve ( need ,"tdbuf" ) ) return NULL;
// //m_nextAvailTopDocIdsOffset = 0;// = m_topDocIdsBuf;
//}
// make matching query, "qk", point to the topdocids that we
// will fill in when we execute this query in full below
// sanity!
//int32_t off3 = m_nextAvailTopDocIdsOffset ;
//if ( off3/(int32_t)sizeof(TopDocIds)>=maxt){char *xx=NULL;*xx=0;}
// seo.cpp's handleRequest99() should have set it to -1
//if ( qp->m_topDocIdsBufOffset != -1 ) { char *xx=NULL;*xx=0; }
// assign this TopDocIds class to this query ptr now
//qp->m_topDocIdsBufOffset = m_nextAvailTopDocIdsOffset;
// get that ptr to reset its count to 0
//TopDocIds *ttt = qp->getTopDocIds(&m_topDocIdsBuf);
//ttt->m_numDocIds = 0;
// inc it
//m_nextAvailTopDocIdsOffset += sizeof(TopDocIds);
// update length since we store topdocids buf based on its m_length
//m_topDocIdsBuf.setLength ( m_nextAvailTopDocIdsOffset );
// advance for next guy
m_queryNum++;
// add it to dedup table
if ( ! m_fullQueryDedup.addKey(&h64) ) {
m_hadMatchError = g_errno;
goto loop;
}
	// count this request as launched (outstanding)
m_numMsg3aRequests++;
// . set the query class for msg3a
// . queryExpansion = true
m_query3a->set2 ( qstr , langId , true );
// a debug thing
m_query3a->m_containingParent = (void *)this;
// secret variable latchon
m_msg3a->m_hack = this;
m_msg3a->m_hackQNum = savedQueryNum;
m_msg3a->m_hackQPtr = NULL;//(char *)qp;
// note it
setStatus("launching msg3a");
// . get the docIds
// . this sets m_msg3a.m_clusterLevels[] for us
// . it sends a msg39 request to each alive host in the network
bool status = m_msg3a->getDocIds ( &m_mr2,
m_query3a,
this,//m_msg3a , // this ,
gotMsg3aReplyForFullQueryWrapper);
// return false if msg3a blocked
if ( ! status ) return (SafeBuf *)-1;
// error?
if ( g_errno ) {
m_hadMatchError = g_errno;
m_numMsg3aReplies++;
goto loop;
}
// i guess did not block... can this happen? cached?
//log("xmldoc: msg3a did not block");
// not supported yet. we need to process reply.
//char *xx=NULL;*xx=0;
// yeah, msg17 in there can cache in seoresults cache now
gotMsg3aReplyForFullQuery();
// try looping
goto loop;
}
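// qsort comparator: sort RelatedDocIds by m_relatedWeight, highest first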
static int rdCmp ( const void *a, const void *b ) {
RelatedDocId *da = (RelatedDocId *)a;
RelatedDocId *db = (RelatedDocId *)b;
// get scores
float scorea = da->m_relatedWeight;//dotProduct;//similarityScore;
float scoreb = db->m_relatedWeight;//dotProduct;//similarityScore;
if ( scorea < scoreb ) return 1;
if ( scorea > scoreb ) return -1;
return 0;
}
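// qsort comparator: sort QueryNumLinkedNode ptrs by related docid rank
// (unranked entries last), falling back to the related docid serp score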
static int lkCmp ( const void *a, const void *b ) {
QueryNumLinkedNode *ka = *(QueryNumLinkedNode **)a;
QueryNumLinkedNode *kb = *(QueryNumLinkedNode **)b;
// get scores
int32_t ra = ka->m_relatedDocIdRank;
int32_t rb = kb->m_relatedDocIdRank;
if ( ra >= 0 && rb >= 0 ) {
if ( ra < rb ) return -1;
if ( ra > rb ) return 1; // swap
}
if ( ra >= 0 ) return -1;
if ( rb >= 0 ) return 1; // swap
// if neither ranked, go by serp score i guess
float sa = ka->m_relatedDocIdSerpScore;
float sb = kb->m_relatedDocIdSerpScore;
if ( sa > sb ) return -1;
if ( sa < sb ) return 1; // swap
return 0;
}
// buf is an array of RelatedDocId members
SafeBuf *XmlDoc::getRelatedDocIds ( ) {
setStatus ( "getrelateddocids" );
if ( m_relatedDocIdBufValid )
return &m_relatedDocIdBuf;
// get the full replies with the top 50 docids and scores listed
// for each query. should be sorted by m_myScore.
SafeBuf *mq = getMatchingQueriesScoredForFullQuery ( );
if ( ! mq || mq == (void *)-1 ) return mq;
// . how many queries do we have that match this url?
// . they should be sorted by our url's score
//QueryLink *qks = (QueryLink *)mq->getBufStart();
//int32_t nks = mq->length()/sizeof(QueryLink);
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SafeBuf *)sh32;
int32_t dh32 = getDomHash32();
//if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
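	// keep only the low 26 bits so these are comparable to the 26-bit
	// per-result site hashes from msg3a (m_topSiteHashes26 below)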
int32_t ourSiteHash26 = *sh32 & 0x03ffffff;
int32_t ourDomHash26 = dh32 & 0x03ffffff;
// for deduping queries with the same "base hash" we do not want
// them to count twice for RelatedDocId::m_numCommonQueries
//HashTableX dedup;
//if ( ! dedup.set(8,0,1024,NULL,0,false,0,"dddtab"))
// return NULL;
// scan the top docids
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
for ( int32_t i = 0 ; i < ntds ; i++ ) {
TopDocIds *td = &tds[i];
int32_t queryNum = td->m_queryNum;
//QueryLink *qk = &qks[queryNum];
// sanity
int32_t nd = td->m_numDocIds;
if( nd < 0) { char *xx=NULL;*xx=0; }
if( nd > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS){
char *xx=NULL;*xx=0;}
// get main url score for query
//float ourScore = qp->m_myScore;
// and the score of the top result
//float normScore = td->m_topScores[0];
// norm main url score
//ourScore /= normScore;
// scan the top 50 (or more) docids for this query
for ( int32_t j = 0 ; j < nd ; j++ ) {
			// . do not allow a related docid (aka competitor
			//   page) to be from our own site! that also excludes
			//   our url itself. otherwise the competitor backlinks
			//   would mention links that already point at us, and
			//   we don't care about those - we already have them.
			//   we just want to see recommended backlinks we do
			//   not yet have, so we can go get them.
			// . so skip it if it is from our same sitehash26
if ( td->m_topSiteHashes26[j] == ourSiteHash26 )
continue;
// fix cheatcodes.com being a competitor page when
// our main url is www.cheatcodes.com
if ( td->m_topSiteHashes26[j] == ourDomHash26 )
continue;
// skip twitter facebook, etc
int64_t docId = td->m_topDocIds[j];
if ( docId == 114607849462LL || // https://www.twitter
docId == 273941610476LL || // twitter.com
docId == 1628437294LL || // facebook.com
docId == 146394931444LL ) // cnn.com/video/
continue;
// add RelatedDocId into m_relatedDocIdBuf and/or
// augment its linked list of query/score pairs
addRelatedDocIdInfo ( td->m_topDocIds[j],
queryNum ,
td->m_topScores[j], // score
j , // rank
td->m_topSiteHashes26[j] );
}
}
QUICKPOLL(m_niceness);
// this is now in getRelatedDocIdsScored()!!!!!!!
/*
char *rdbuf = m_relatedDocIdBuf.getBufStart();
int32_t numDocIds = m_relatedDocIdBuf.length()/sizeof(RelatedDocId);
// now sort by RelatedDocId::m_relatedWeight
qsort ( rdbuf , numDocIds, sizeof(RelatedDocId),qp99docIdCmp );
QUICKPOLL(m_niceness);
// limit to top MAX_RELATED_DOCIDS related docids
// will take longer to get titles/urls and related queries the
// higher this number is, but we will have more competitor backlinks
// and terms etc.
int32_t maxLen = sizeof(RelatedDocId) * MAX_RELATED_DOCIDS;
int32_t currentLen = m_relatedDocIdBuf.length();
if ( currentLen > maxLen ) currentLen = maxLen;
m_relatedDocIdBuf.setLength(currentLen);
numDocIds = currentLen / sizeof(RelatedDocId);
*/
int32_t numDocIds = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
/*
// log out for debug
char *rdbuf = m_relatedDocIdBuf.getBufStart();
RelatedDocId *rds = (RelatedDocId *)rdbuf;
for ( int32_t i = 0 ; g_conf.m_logDebugSEO && i < numDocIds ; i++ ) {
log("seopipe: related docId #%"INT32" docid=%"INT64" "
"score=?? common=%"INT32"",
i,
rds[i].m_docId,
//rds[i].m_relatedWeight,//dotProduct, // similarityScore,
rds[i].m_numCommonQueries);
}
*/
log("seo: got %"INT32" related docids in buf",numDocIds);
m_relatedDocIdBufValid = true;
return &m_relatedDocIdBuf;
}
// used as part of the msg4f request
SafeBuf *XmlDoc::getTopMatchingQueryBuf ( ) {
if ( m_topMatchingQueryBufValid )
return &m_topMatchingQueryBuf;
// scan matching queries that we evaluated fully using msg3a
SafeBuf *qkbuf = getMatchingQueriesScoredForFullQuery ( );
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
//int32_t nks = qkbuf->length()/sizeof(QueryLink);
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
for ( int32_t i = 0 ; i < ntds ; i++ ) {
TopDocIds *td = &tds[i];
int32_t queryNum = td->m_queryNum;
QueryLink *qk = &qks[queryNum];
// ok, get it
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
int32_t qlen = gbstrlen(qstr);
// store query #
if ( ! m_topMatchingQueryBuf.pushLong(queryNum) )
return NULL;
// then query
if ( ! m_topMatchingQueryBuf.safeMemcpy(qstr,qlen+1))
return NULL;
}
m_topMatchingQueryBufValid = true;
return &m_topMatchingQueryBuf;
}
static void gotMsg4fReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
// a bit of a hack
THIS->m_savedSlot = slot;
// ultimately, getRelatedDocIdsScored() will be called from this
THIS->m_masterLoop ( THIS->m_masterState );
}
// . lets just put everything in this one function
// . launch a msg4f request for each relateddocid
// . get the msg4f reply back and add the positive scoring queries to the
// related docids linked list of QueryNumLinkedNodes in the
// m_commonQueryNumBuf, avoid dups.
// . then score each related docid by calling setRelatedDocIdScores()
SafeBuf *XmlDoc::getRelatedDocIdsScored ( ) {
setStatus ( "getrelateddocidsscored");
if ( m_relatedDocIdsScoredBufValid ) {
// and return the buf of RelatedDocIds
return &m_relatedDocIdBuf;
}
// what docids share our TOP-scoring matching queries?
SafeBuf *rdbuf = getRelatedDocIds();
if ( ! rdbuf || rdbuf == (void *)-1) return (SafeBuf *) rdbuf;
SafeBuf *tmq = getTopMatchingQueryBuf();
if ( ! tmq || tmq == (void *)-1) return (SafeBuf *) tmq;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// the top 50 or so matching queries will each be scored for
// every related docid we have in m_relatedDocIdBuf. these are
// the same queries we got the full results for above!!!
// we have to score them for each related docid here because we only
// get the top 300 or so results above for each one. so if the
// related docid matched the query but was not in the top 300 results,
// it would have appeared to NOT match the query. bad. that was
// causing google to come up high in related docids because it
// ranked high for so many generic queries. and the other good
// related docids did not rank in the top 300 for those same
// generic queries. so at least this logic will show that the
// related docids do indeed match those generic queries, too.
// and they will get higher scores (RelatedDocId::m_relatedWeight)
// we must be an incoming reply if we already sent out all the requests
if ( m_numMsg4fRequests > 0 ) {
// increment our reply counter
m_numMsg4fReplies++;
// . m_savedSlot is a hack
// . now parse the reply and add QueryNumLinkedNode
// into m_commonQueryNumBuf.
char *p = m_savedSlot->m_readBuf;
char *pend = m_savedSlot->m_readBufSize + p;
// now scan the reply
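		// each record in the reply is:
		//   [int32_t queryNum][int64_t docId][float score]
		// where queryNum indexes into our full matching-query list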
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// the queryNum is relative to the m_queryPtrs array
// which has all the matching queries of this document,
// not just the "top" 50 matching queries by score.
int32_t queryNum = *(int32_t *)p;
// sanity
if ( queryNum<0 ) {char *xx=NULL;*xx=0; }
p += 4;
// then docid of related docid that had this score
int64_t docId = *(int64_t *)p;
p += 8;
// then score
float score = *(float *)p;
p += 4;
// this will add the query/score pair into the
// related docid buf. it will not add dups if already
// ranked!
addRelatedDocIdInfo ( docId ,
queryNum ,
score ,
-1 , // rank unknown
-1 ); // sitehash26 unknown
}
// return if awaiting more replies
if ( m_numMsg4fReplies < m_numMsg4fRequests )
return (SafeBuf *)-1;
// point to buffer of related docids
char *rdbuf = m_relatedDocIdBuf.getBufStart();
RelatedDocId *rds = (RelatedDocId *)rdbuf;
int32_t nr = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
for ( int32_t i = 0 ; i < nr ; i++ ) {
			// shortcut
RelatedDocId *rd = &rds[i];
// now score it since we have all the serpscores for
// all top matching queries.
setRelatedDocIdWeightAndRank(rd);
}
// breathe
QUICKPOLL(m_niceness);
// now sort by RelatedDocId::m_relatedWeight
qsort ( rdbuf , nr , sizeof(RelatedDocId),rdCmp );
// breathe
QUICKPOLL(m_niceness);
		// limit to top MAX_RELATED_DOCIDS related docids. the higher
		// this limit, the longer it takes to get titles/urls and
		// related queries, but we get more competitor backlinks and
		// terms etc. capping it also keeps the cachedb record smaller.
int32_t maxLen = MAX_RELATED_DOCIDS * sizeof(RelatedDocId);
int32_t newLen = m_relatedDocIdBuf.length();
if ( newLen > maxLen ) newLen = maxLen;
m_relatedDocIdBuf.setLength(newLen);
//
// make a new buffer for m_commonQueryNumBuf just for the
// related docids we picked, and sort them by rel docid rank.
// so it will be smaller and sorted.
//
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( m_commonQueryNumBuf.length() ) )
return NULL;
// scan each related docid in the top 300 or so
for ( int32_t i = 0 ; i < nr ; i++ ) {
			// shortcut
RelatedDocId *rd = &rds[i];
// store ptrs to query nums so we can sort them
QueryNumLinkedNode *links[1024];
int32_t nn = 0;
int32_t fo = rd->m_firstCommonQueryNumOff;
char *base = m_commonQueryNumBuf.getBufStart();
// scan down the linked list and store ptrs to links[]
for ( ; fo >= 0 ; ) {
// cast it
QueryNumLinkedNode *qn;
qn = (QueryNumLinkedNode *)(base + fo);
// point to next
fo = qn->m_nextOff;
// store this guy for sorting
links[nn] = qn;
nn++;
if ( nn >= 1024 ) break;
}
// now sort them by m_relatedDocIdRank
qsort( links, nn,sizeof(QueryNumLinkedNode *),lkCmp);
// point to our new linked list in tmpBuf, we will
// store them here.
rd->m_firstCommonQueryNumOff = tmpBuf.length();
QueryNumLinkedNode *prev = NULL;
// now store into tmpbuf
			for ( int32_t k = 0 ; k < nn ; k++ ) {
				QueryNumLinkedNode *qn = links[k];
				int32_t size = sizeof(QueryNumLinkedNode);
				if ( !tmpBuf.reserve(size) ) return NULL;
				// use a distinct name here so we do not
				// shadow the int32_t "nn" count above
				QueryNumLinkedNode *node ;
				node = (QueryNumLinkedNode *)tmpBuf.getBuf();
				int32_t clen = tmpBuf.length();
				tmpBuf.safeMemcpy(qn,size);
				// we are the previous guy's next node
				if ( prev ) prev->m_nextOff = clen;
				// assume nobody follows us
				node->m_nextOff = -1;
				// we are now next guy's prev
				prev = node;
			}
}
// now steal tmpbuf, and free our old stuff
m_commonQueryNumBuf.stealBuf ( &tmpBuf );
// i guess we are done now!
m_relatedDocIdsScoredBufValid = true;
return &m_relatedDocIdBuf;
}
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
// . there's a massive # of related docids at this point
// . possibly 50 x 300 = 15,000
// . so launch one msg4f for each host in our network
// . just specify all the related docids in the msg4f request and have
// the handleRequest4f() function in seo.cpp get the title rec.
// . make sure all docids are local to that host
// . dispatch the msg4f request to the machine that has that docid
// local so it can just hit disk
	// . handleRequest4f() can follow the same logic as in
	//   getRelatedQueryLinks() which makes a new xmldoc. then it can
	//   call newxd->getTermListBuf() instead of us passing it in.
// . so each host has a bin, a host bin
//#ifdef __APPLE__
SafeBuf hostBin[MAX_HOSTS];
//#else
//SafeBuf hostBin[g_hostdb.m_numHosts];
//#endif
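	// each per-host request is laid out as:
	//   [collName + NUL][int32_t len][topMatchingQueryBuf bytes]
	//   [int64_t docId][int64_t docId]...
	// handleRequest4f() in seo.cpp consumes this; a rough sketch of how
	// such a request could be walked (illustration only, not the actual
	// handler code):
	//   char *p        = req;
	//   char *coll     = p; p += gbstrlen(coll) + 1;
	//   int32_t  qlen  = *(int32_t *)p; p += 4;
	//   char *queryBuf = p; p += qlen;
	//   while ( p < req + reqSize ) {
	//           int64_t docId = *(int64_t *)p; p += 8;
	//           // ... look up the local title rec for docId ...
	//   }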
// scan the related docids and send the requests if we have not already
for ( int32_t i = 0 ; ! m_sentMsg4fRequests && i < numRelated ; i++ ) {
RelatedDocId *rd = &rds[i];
//uint32_t gid=g_hostdb.getGroupIdFromDocId (rd->m_docId);
// pick host in that group
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNumFromDocId ( rd->m_docId );
Host *group = g_hostdb.getShard ( shardNum );
int32_t nh = g_hostdb.m_numHostsPerShard;
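		// spread the title rec lookups across the hosts in this
		// shard by taking the docid modulo the number of hosts
		// per shard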
int32_t hostNum = rd->m_docId % nh;
Host *h = &group[hostNum];
int32_t hostId = h->m_hostId;
// skip if dead
int32_t count = 0;
if ( g_hostdb.isDead(hostId) && h->m_wasEverAlive ) {
// increment hostnum if that one is dead
if ( ++hostNum >= nh ) hostNum = 0;
// set these again
h = &group[hostNum];
hostId = h->m_hostId;
// if all dead, just pick this one i guess
if ( ++count >= nh ) break;
}
		// shortcut
SafeBuf *hbin = &hostBin[hostId];
// if bin is empty initialize
if ( hbin->length() == 0 ) {
// provide only collection to handleRequest4f()
if ( ! hbin->safeMemcpy(cr->m_coll,
gbstrlen(cr->m_coll)+1) )
return NULL;
			// . store the queries we want it to evaluate
			// . these are null-terminated query strings preceded
			//   by their corresponding query number in our
			//   m_queryPtrs[] array which pts to a Msg99Reply
if ( ! hbin->pushLong(tmq->length()))
return NULL;
if ( ! hbin->safeMemcpy(tmq))
return NULL;
}
// store this new docid, which is local to this host
if ( ! hbin->pushLongLong(rd->m_docId) ) return NULL;
}
// shotgun out the msg4f requests now
for ( int32_t i = 0 ;
! m_sentMsg4fRequests && i < g_hostdb.getNumHosts() ; i++ ) {
		// shortcut
SafeBuf *hbin = &hostBin[i];
// get that host
Host *host = g_hostdb.getHost(i);
// make a copy for sending out
SafeBuf copy;
if ( ! copy.safeMemcpy ( hbin ) ) continue;
// get the bin copy
char *req = copy.getBufStart();
int32_t reqSize = copy.length();
// detach it so udpserver can free it when done transmitting
copy.detachBuf ();
// free this guy now i guess
hbin->purge();
// count as launched
m_numMsg4fRequests++;
// launch it
if ( ! g_udpServer.sendRequest ( req ,
reqSize,
0x4f , // msgtype
host->m_ip , // ip
host->m_port , // port
host->m_hostId,
NULL, // retslot
this,
gotMsg4fReplyWrapper,
10000 , // timeout
-1 , // backoff
-1 , // maxwait
NULL, // replybuf
0, // replybufmaxsize
m_niceness // niceness
)) {
// let admin know about error
log("seopipe: sendRequest 4f had error: %s",
mstrerror(g_errno));
// count it as replied then
m_numMsg4fReplies++;
continue;
}
}
// do not re-send the requests
m_sentMsg4fRequests = true;
// wait for all replies to come in
if ( m_numMsg4fRequests > m_numMsg4fReplies ) return (SafeBuf *)-1;
// how can they all be done? all errors!
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
// remote host will alloc an xmldoc, about 1MB each...
#define MAX_OUT_MSG20S 30
// . like getRelatedDocIds() but with titles, etc.
// . return a list of competing docids/titles/etc.
SafeBuf *XmlDoc::getRelatedDocIdsWithTitles ( ) {
setStatus ( "getrelateddocidswithtitles" );
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_relatedDocIdsWithTitlesValid )
return &m_relatedDocIdBuf;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
SafeBuf *rdbuf = getRelatedDocIdsScored();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
// now look up each docid in titledb and store the url title
// into m_relatedTitleBuf safebuf and set the RelatedDocId::
// rd_title_off and rd_url_off into that when done. store offsets for
// now and make into full out ptrs when done in case the
// m_relatedTitleBuf reallocs.
if ( ! m_msg20Buf.length() ) {
int32_t need = sizeof(Msg20) * MAX_OUT_MSG20S ;
if ( ! m_msg20Buf.reserve ( need,"m20buf" ) ) return NULL;
// mark it all in use
m_msg20Buf.setLength(need);
// init them
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
// reset cursor to start with first related docid
m_rdCursor = 0;
m_relatedDocIdError = 0;
m_numMsg20Replies = 0;
}
// point to buffer of related docids
	RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
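	// this function is re-entered via m_masterLoop each time a msg20
	// reply comes back. we harvest any completed msg20s below, then
	// re-launch the free slots on the next unprocessed related docids.
	// m_hack2 holds the index of the RelatedDocId a msg20 was launched
	// for so its reply can be matched back to it.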
// scan the msg20s we allocated to see if any got a reply
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
		// shortcut
Msg20 *msg20 = &mp[i];
// skip if never launched
if ( ! msg20->m_launched ) continue;
// skip if it is in progress, awaiting its reply
if ( msg20->m_inProgress ) continue;
// get the reply from it (might be NULL iff g_errno is set)
Msg20Reply *reply = msg20->getReply(); // m_r
// get the corresponding related docid
int32_t hisCursor = msg20->m_hack2;
		// shortcut
RelatedDocId *rd = &rds[hisCursor];
// ok, it has a reply. could be NULL if g_errno was set.
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , reply ) )
m_relatedDocIdError = g_errno;
		// reset it for later use... or not...
msg20->reset();
// count reply as back now
m_numMsg20Replies++;
}
// launch more if we can. one launch per msg20.
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// no more related docids left to launch?
if ( m_rdCursor >= numRelated ) break;
		// shortcut
Msg20 *msg20 = &mp[i];
// skip if already launched/inuse
if ( msg20->m_inProgress ) continue;
// get current related docid
RelatedDocId *rd = &rds[m_rdCursor];
// make the request
Msg20Request req;
//req.ptr_coll = cr->m_coll;
//req.size_coll = gbstrlen(cr->m_coll)+1;
req.m_collnum = cr->m_collnum;
req.m_docId = rd->m_docId;
req.m_expected = true;
req.m_niceness = m_niceness;
req.m_state = m_masterState;
req.m_callback2 = m_masterLoop;
// do not get summary stuff. too slow.
req.m_numSummaryLines = 0;
// if it has an outlink to our site/domain set
// Msg20Reply::m_hasLinkToOurDomOrHost
req.m_ourHostHash32 = getHostHash32a();
req.m_ourDomHash32 = getDomHash32();
// store cursor in msg20 itself so we know what rd it's using
msg20->m_hack2 = m_rdCursor;
// advance cursor!!!
m_rdCursor++;
// launch it
if ( ! msg20->getSummary ( &req ) ) continue;
// it did not block... wtf? g_errno might be set. ENOMEM?
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , NULL ) )
m_relatedDocIdError = g_errno;
// reset it
msg20->reset();
// count reply as back now
m_numMsg20Replies++;
		// the call completed without blocking, so this msg20 slot is
		// free again. back up so we reuse it for the next docid.
		i--;
}
// wait for one reply per related docid
if ( m_numMsg20Replies < numRelated )
return (SafeBuf *)-1;
// call msg20 destructor
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
Msg20 *msg20 = &mp[i];
msg20->destructor();
}
// purge the mem they used
m_msg20Buf.purge();
// now we are done
m_relatedDocIdsWithTitlesValid = true;
m_relatedTitleBufValid = true;
// store it in cachedb
if ( ! storeRelatedDocIdsIntoCachedb( ))
return (SafeBuf *)-1;
return &m_relatedDocIdBuf;
}
bool XmlDoc::setRelatedDocIdInfoFromMsg20Reply ( RelatedDocId *rd ,
Msg20Reply *reply ) {
// get error. g_errno can be ENOTFOUND if titlerec not found
int32_t error = g_errno;
// . or could be EDOCBANNED/EDOCFILTERED etc.
// . if reply is NULL then g_errno MUST be set
if ( ! error ) error = reply->m_errno;
	// shortcuts
char *urlStr = NULL;
char *titleStr = NULL;
char *siteStr = NULL;
if ( reply ) {
urlStr = reply->ptr_ubuf;
titleStr = reply->ptr_tbuf;
siteStr = reply->ptr_site;
}
// did that fail? i.e. docid not found!?!?!
if ( error ) {
// . just skip this asshole then
// . might be EDOCBANNED or EDOCFILTERED!
// . some are filtered because they are domain-only urls
// which should not be in the index because we force
// a "www." prepend on all urls now.
log("seo: msg20 reply for docid=%"INT64" url=%s had "
"error: %s", rd->m_docId,urlStr,mstrerror(error));
// clear that
g_errno = 0;
ignoreRelatedDocId:
		// mark those offsets as not-found
rd->rd_title_off = -1;
rd->rd_url_off = -1;
rd->rd_site_off = -1;
return true;
}
// bar facebook.com and twitter.com roots... too popular for all!
// was coming up for jezebelgallery.com
if ( strcmp(urlStr,"http://www.twitter.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"https://www.twitter.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"http://www.facebook.com/") == 0 )
goto ignoreRelatedDocId;
// "/home.php?" or "home.*"
if ( strncmp(urlStr,"http://www.facebook.com/home.",29) == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"https://www.facebook.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"http://www.cnn.com/video/") == 0 )
goto ignoreRelatedDocId;
// fix robothits.com competitor pages
if ( strcmp(urlStr,"http://www.google.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"http://www.msn.com/") == 0 )
goto ignoreRelatedDocId;
// null means no title i guess
if ( ! titleStr ) titleStr = "";
// or if he links to us
if ( reply->m_hasLinkToOurDomOrHost ) {
log("seo: related docid=%"INT64" url=%s links to our domain",
reply->m_docId,
urlStr);
goto ignoreRelatedDocId;
}
// store title
int32_t titleOffset = m_relatedTitleBuf.length();
if ( ! m_relatedTitleBuf.safeStrcpy ( titleStr ) ) return false;
m_relatedTitleBuf.pushChar('\0');
// then url
int32_t urlOffset = m_relatedTitleBuf.length();
if ( ! m_relatedTitleBuf.safeStrcpy ( urlStr ) ) return false;
m_relatedTitleBuf.pushChar('\0');
// then site
int32_t siteOffset = m_relatedTitleBuf.length();
if ( ! m_relatedTitleBuf.safeStrcpy ( siteStr ) ) return false;
m_relatedTitleBuf.pushChar('\0');
// then linkinfo
//int32_t linkInfo1Offset = m_relatedTitleBuf.length();
//if(!m_relatedTitleBuf.safeMemcpy(info1,info1->getSize()))return NULL;
// store as offset for easy serialization for storage into cachedb
//rd->m_linkInfo1Offset = linkInfo1Offset;
rd->m_relatedFirstIp = reply->m_firstIp;
rd->m_relatedCurrentIp = reply->m_ip;
rd->m_rd_siteRank = reply->m_siteRank;
rd->m_rd_langId = reply->m_language;
rd->m_rd_siteHash32 = 0;
if ( reply->ptr_site )
rd->m_rd_siteHash32 = hash32n ( reply->ptr_site );
// record the offsets of title/url/site in the m_relatedTitleBuf
rd->rd_title_off = titleOffset;
rd->rd_url_off = urlOffset;
rd->rd_site_off = siteOffset;
SafeBuf *rdbuf = getRelatedDocIds();
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
// log out for debug
log(LOG_DEBUG,
"seopipe: related docid (%"INT32"of%"INT32") docid=%"INT64" score=%f "
"title=\"%s\" url=\"%s\"",
m_numMsg20Replies,
numRelated-1,
rd->m_docId,
rd->m_relatedWeight,
titleStr,
urlStr);
return true;
}
/*
HashTableX *XmlDoc::getMatchingQueryHashTable ( ) {
setStatus ( "getmatchingqueryhashtable" );
if ( m_queryHashTableValid )
return &m_queryHashTable;
SafeBuf *qpbuf = getMatchingQueries(false);
if ( ! qpbuf || qpbuf == (void *)-1) return (HashTableX *)qpbuf;
// how many queries do we have that match this url?
Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
// init it
if ( ! m_queryHashTable.set(8,
0,
numQueryPtrs*4,
NULL,
0,
false,
m_niceness,
"qdht") )
return NULL;
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
// cast it
Msg99Reply *qp = qptrs[i];
		// shortcut
int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
// hash it up
if ( ! m_queryHashTable.addKey ( &eh64 ) )
return NULL;
}
// all done
m_queryHashTableValid = true;
return &m_queryHashTable;
}
*/
/*
HashTableX *XmlDoc::getMatchingQueryOffsetTable ( ) {
setStatus ( "getmatchingqueryoffsettable" );
if ( m_queryOffsetTableValid )
return &m_queryOffsetTable;
SafeBuf *qkbuf = getMatchingQueryBuf();
if ( ! qkbuf || qkbuf == (void *)-1) return (HashTableX *)qkbuf;
// how many queries do we have that match this url?
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
int32_t nks = qkbuf->length()/sizeof(QueryLink);
// init it
if ( ! m_queryOffsetTable.set(8,
0,
nks*4,
NULL,
0,
false,
m_niceness,
"qdot") )
return NULL;
for ( int32_t i = 0 ; i < nks ; i++ ) {
// cast it
QueryLink *qk = &qks[i];
		// shortcut
//int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
int64_t eh64 = qp->m_replyingHostId;
eh64 <<= 32;
eh64 |= qp->m_qbufOffset;
// hash it up
if ( ! m_queryOffsetTable.addKey ( &eh64 ) )
return NULL;
}
// all done
m_queryOffsetTableValid = true;
return &m_queryOffsetTable;
}
//static char *s_base = NULL;
// related QUERY compare
int qp99relatedCmp ( const void *a, const void *b ) {
// these are offsets
//int32_t offa = *(int32_t *)a;
//int32_t offb = *(int32_t *)b;
QueryLink *qa = *(QueryLink **)a;
QueryLink *qb = *(QueryLink **)b;
// make sure manually added queries are on top
//if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
//if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
//QueryInfo *qia = &qa->m_queryInfo;
//QueryInfo *qib = &qb->m_queryInfo;
// get scores
float scorea = qa->m_rq_totalScore;
float scoreb = qb->m_rq_totalScore;
if ( scorea < scoreb ) return 1;
if ( scorea > scoreb ) return -1;
//return 0;
// let docidsincommon break ties
return qb->m_docIdVotes - qa->m_docIdVotes;
}
*/
/*
static int qlCmp ( const void *a, const void *b ) {
QueryLink *qa = (QueryLink *)a;
QueryLink *qb = (QueryLink *)b;
// let docid break ties
int64_t da = qa->getRelatedDocId(s_rdBuf)->m_docId;
int64_t db = qb->getRelatedDocId(s_rdBuf)->m_docId;
//int64_t da = qa->m_relatedDocId->m_docId;
//int64_t db = qb->m_relatedDocId->m_docId;
// always niceness 1 i guess
QUICKPOLL(1);
if ( da > db )
return 1; // 1 means to swap!
if ( da < db )
return -1;
return 0;
}
*/
#include <math.h> // sqrtf()
// now we can do square roots in gdb by calling this
float gbsqrt ( float x ) {
return sqrtf(x);
}
/*
// sort the related query links intersected buf by docid
QueryLink *ptrs;
ptrs = (QueryLink *)m_relatedQueryLinksIntersected.getBufStart();
int32_t nk = m_relatedQueryLinksIntersected.length() / sizeof(QueryLink);
qsort ( ptrs ,
nk,
sizeof(QueryLink),
qlCmp );
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - start;
log("seopipe: time: relatedqueryintersection took %"INT64" ms",took);
*/
/*
void XmlDoc::gotMsg98Reply ( UdpSlot *slot ) {
// get replying hostid
int32_t hostId = slot->m_hostId;
// log
setStatus ( "gotmsg98reply" );
// sanity
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
// point to it
char *p = slot->m_readBuf;
char *pend = p + slot->m_readBufSize;
	// shortcuts
QueryLink *qks = (QueryLink *)m_tmpBuf5.getBufStart();
// sanity, i guess if oom
int32_t maxLinkOff = m_tmpBuf5.length() ;
maxLinkOff /= sizeof(QueryLink);
// make some space
int32_t need = slot->m_readBufSize;
if ( ! m_tmpStringBuf5.reserve(need,"rqdbuf") ) {
m_msg98ReplyError = g_errno;
// do not bother scanning the reply
p = pend;
}
// init table
if ( m_qstringTable.m_numSlots == 0 ) {
// 1M slots!
if ( ! m_qstringTable.set(4,4,1000000,NULL,0,false,
m_niceness,"qstrtbl") ) {
m_msg98ReplyError = g_errno;
// do not bother scanning the reply
p = pend;
}
}
//int32_t numQueryLinks = m_relatedQueryLinksIntersected.length() ;
//numQueryLinks /= sizeof(QueryLink);
// put strings into m_tmpStringBuf5
// parse these strings
// maybe index so we can assign to QueryLinks::m_queryStringOffset
// maybe include querylink # so we can assign quickly!
QueryLink *qk;
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// offset of query link
int32_t queryLinkOff = *(int32_t *)p;
p += 4;
// crazy? maybe we went oom on m_relatedQueryLinksIntersected
if ( queryLinkOff >= maxLinkOff ) {
log("seopipe: msg98 reply link off breach %"INT32">=%"INT32"",
queryLinkOff,maxLinkOff);
m_msg98ReplyError = ENOMEM;
break;
}
// get that
QueryLogEntry *qe = (QueryLogEntry *)p;
// skip it
p += qe->getSize();
// point to it
qk = &qks[queryLinkOff];
// do not duplicate query strings!
int32_t qh32 = hash32n ( qe->getQueryString() );
int32_t slot = m_qstringTable.getSlot ( &qh32 );
if ( slot >= 0 ) {
int32_t qeOff;
qeOff =*(int32_t *)m_qstringTable.getValueFromSlot(slot);
qk->m_queryStringOffset = qeOff;
qk->m_queryHostId = -1;
continue;
}
		// get offset of string in string buf
int32_t stringOff = m_tmpStringBuf5.length();
// store good serp score
if ( ! m_tmpStringBuf5.safeMemcpy(qe,qe->getSize() ) ) {
m_msg98ReplyError = g_errno;
break;
}
// add to table
if ( ! m_qstringTable.addKey(&qh32,&stringOff) ) {
m_msg98ReplyError = g_errno;
break;
}
// show it
//log("seopipe: DEBUG. mapped remote off %"INT32" (hostid%"INT32") to "
// "local off %"INT32" (%s)"
// ,qk->m_queryStringOffset,qk->m_queryHostId,stringOff,qstr);
// . save string offset
// . THIS OVERWRITES the g_qbuf offset that was in there!!!
qk->m_queryStringOffset = stringOff;
// to indicate that this QueryLink::m_queryStringOffset is now
// an offset into m_relatedQueryStringBuf and no longer an
// offset into g_qbuf of the specific hostid, we set hostid
// to -1
qk->m_queryHostId = -1;
}
// steal it so it doesn't free it
//slot->m_readBuf = NULL;
// inc the counter
m_numMsg98Replies++;
// return control to transmit function. it will call m_callback1
// if the function is done. but if a different parent function than
// transmit called us then we call that. it just depends on the
	// initial entry function that called getMatchingQueries()
m_masterLoop ( m_masterState );
}
static void gotMsg3fReplyWrapper ( void *state , void *state2 ) {
XmlDoc *THIS = (XmlDoc *)state;
//Multicast *m = (Multicast *)state2;
Bin *bin = (Bin *)state2;
THIS->gotMsg3fReply ( bin ); // m
}
*/
static int mtCmp ( const void *a, const void *b ) {
MissingTerm *wa = *(MissingTerm **)a;
MissingTerm *wb = *(MissingTerm **)b;
if ( wb->m_importance > wa->m_importance ) return 1; // swap
if ( wb->m_importance < wa->m_importance ) return -1;
if ( wb->m_votes > wa->m_votes ) return 1; // swap
if ( wb->m_votes < wa->m_votes ) return -1;
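	// final tie-break on the pointer addresses just to get a
	// deterministic ordering among otherwise-equal terms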
if ( (int64_t)b < (int64_t)a ) return 1; // swap
if ( (int64_t)b > (int64_t)a ) return -1;
return 0;
}
// . called by getMissingTermBuf() and getMatchingTermBuf()
// . returns false and sets g_errno on error
bool XmlDoc::addTermsFromQuery ( char *qstr,
uint8_t queryLangId,
int32_t gigablastTraffic,
int32_t googleTraffic2,
//QueryLogEntry *qe ,
int32_t hackqoff,
SafeBuf *tmpBuf ,
HashTableX *scoreTable ,
HashTableX *topTermsTable ,
float imp, // importance
bool isRelatedQuery ) {
// sanity
if ( hackqoff < 0 ) { char *xx=NULL;*xx=0; }
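	// "hackqoff" is the byte offset of the originating QueryLink in its
	// query buf; each term stashes up to 10 of these in
	// m_hackQueryOffsets[] so we can later show which queries the term
	// came from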
	// parse the query so we can pull out its terms
Query qq;
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
// doQueryExpansion = false
//char *qstr = qe->getQueryString ( );
qq.set2 ( qstr , queryLangId , false );
int32_t lastStart = -1;
for ( int32_t k = 0 ; k < qq.m_numWords ; k++ ) {
QUICKPOLL(m_niceness);
QueryWord *qw = &qq.m_qwords[k];
int32_t tid32 = qw->m_wordId & 0xffffffff;
		// skip punctuation / words with no term id
if ( ! tid32 ) continue;
// skip if we contain it already
if ( isRelatedQuery && topTermsTable->isInTable ( &tid32 ) )
continue;
// skip if common word like "on" "at" etc.
if ( isCommonQueryWordInEnglish(tid32) ) continue;
// get start of wikipedia phrase it is in
int32_t start = qw->m_wikiPhraseStart;
int32_t nwk = qw->m_numWordsInWikiPhrase;
// if not in wiki phrase at all, just use single word
if ( qw->m_wikiPhraseId == 0 ) {
start = k;
nwk = 1;
}
// do not re-do any words in here
if ( start == lastStart ) continue;
lastStart = start;
// hash each wordid in the term into the th64 hash
int64_t th64 = 0LL;
//int32_t alnumWordCount = 0;
for ( int32_t j = start ; j < start + nwk ; j++ ) {
			// shortcut
QueryWord *qw = &qq.m_qwords[j];
// skip punct
if ( qw->m_wordId == 0 ) continue;
// hash otherwise
th64 ^= qw->m_wordId;
// count it
//alnumWordCount++;
}
// get traffic of related query
int32_t traffic = gigablastTraffic;
// make gb traffic into google monthly traffic
traffic *= GB_TRAFFIC_MODIFIER;
		// use google numbers if we have them, more accurate
int32_t googleTraffic = googleTraffic2;
if ( googleTraffic >= 0 ) traffic = googleTraffic;
// now score that term
int32_t slot = scoreTable->getSlot ( &th64 );
if ( slot >= 0 ) {
int32_t off;
off=*(int32_t *)scoreTable->getValueFromSlot(slot);
char *base = tmpBuf->getBufStart();
MissingTerm *pt=(MissingTerm *)(base + off);
pt->m_importance += imp;
pt->m_votes++;
pt->m_traffic += traffic;
// store first 10 related query strings
// we got this term from
for ( int32_t x = 1 ; x < 10 ; x++ ) {
if ( pt->m_hackQueryOffsets[x] != -1 )
continue;
// grab it. querylogentry ptr!!
pt->m_hackQueryOffsets[x] = hackqoff;
break;
}
continue;
}
// set a class to store in safebuf
MissingTerm mt;
mt.m_importance = imp;
//mt.m_numAlnumWords = alnumWordCount;
mt.m_synOf = NULL;
mt.m_votes = 1;
mt.m_traffic = traffic;
mt.m_hackQueryOffsets[0] = hackqoff;
// if not a missing term, we are a MATCHING term
mt.m_isMissingTerm = isRelatedQuery;
// invalidate the remaining 9 query offsets
for ( int32_t x = 1 ; x < 10 ; x++ )
mt.m_hackQueryOffsets[x] = -1;
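		// each record we append to tmpBuf is the MissingTerm class
		// followed by the term's words (punctuation collapsed to a
		// single space) and a NUL. we record the string portion's
		// size in m_termSize below so getSize() can step over the
		// whole record later (see sortTermsIntoBuf()).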
int32_t offset = tmpBuf->length();
int32_t toCopy = sizeof(MissingTerm);
if ( ! tmpBuf->safeMemcpy(&mt,toCopy))
return false;
// for calculating length of stored term string
int32_t startLen = tmpBuf->length();
// . if first time in scoretable, add stuff
// . store the string, each word separately
for ( int32_t j = start ; j < start + nwk ; j++ ) {
			// shortcut
QueryWord *qw = &qq.m_qwords[j];
// point to word as string
char *str = qw->m_word;
int32_t len = qw->m_wordLen;
// make all punct a space
if ( qw->m_wordId == 0 ) {
str = " ";
len = 1;
}
// store term string after MissingTerm class
if ( ! tmpBuf->safeMemcpy(str,len) )
return false;
}
tmpBuf->pushChar('\0');
// record MissingTerm::m_termSize
int32_t delta = tmpBuf->length() - startLen;
char *base = tmpBuf->getBufStart();
MissingTerm *pmt = (MissingTerm *)(base + offset);
pmt->m_termSize = delta;
// now score table entry
if ( ! scoreTable->addKey ( &th64 , &offset ) )
return false;
}
return true;
}
// this is used to sort the MissingTerm instances in a safeBuf,
// missingTermBuf. it is now also used to sort the matching terms from
// getMatchingTermBuf().
bool XmlDoc::sortTermsIntoBuf ( HashTableX *scoreTable ,
SafeBuf *tmpBuf ,
SafeBuf *missingTermBuf ) {
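	// build an array of ptrs to the MissingTerm records in tmpBuf, sort
	// the ptrs with mtCmp() (importance desc, then votes desc), then
	// copy each record into missingTermBuf in that order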
// make ptrs for sorting
int32_t numTerms = scoreTable->getNumUsedSlots();
int32_t need = numTerms * 4;
SafeBuf ptrBuf;
if ( ! ptrBuf.reserve ( need ,"srtbuf") ) return false;
char *p = tmpBuf->getBufStart();
char *pend = tmpBuf->getBuf();
for ( ; p < pend ; ) {
MissingTerm *mt = (MissingTerm *)p;
p += mt->getSize();
ptrBuf.pushPtr ( mt );
}
gbqsort ( ptrBuf.getBufStart(),
numTerms,
sizeof(MissingTerm *),
mtCmp,
m_niceness);
// now write the missingTerm instances into m_missingTermBuf
int32_t need2 = tmpBuf->length();
if ( ! missingTermBuf->reserve ( need2 ,"mtbuf") ) return false;
// now write back into the real buf
MissingTerm **pp = (MissingTerm **)ptrBuf.getBufStart();
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
MissingTerm *mt = pp[i];
missingTermBuf->safeMemcpy ( mt , mt->getSize() );
}
return true;
}
// . now this uses the related queries
// . use logic from getInsertableTerms()!!!
SafeBuf *XmlDoc::getMissingTermBuf ( ) {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_missingTermBufValid )
return &m_missingTermBuf;
SafeBuf *qkbuf = getRelatedQueryBuf ();
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
HashTableX *topTermsTable = getTermIdBufDedupTable32();
if ( ! topTermsTable || topTermsTable == (void *)-1 )
return (SafeBuf *)topTermsTable;
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( 100000 ,"t3buf" ) ) return NULL;
// maps 64-bit term hash (can be multiple words in a term) to
// an offset into tmpBuf.
HashTableX scoreTable;
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
return NULL;
//
// taken from seo.cpp's printRelatedQueries() function
//
//int32_t *qrOffs = (int32_t *)relBuf->getBufStart();
//int32_t numRels = relBuf->length() / sizeof(int32_t);
//char *base = m_queryRelBuf.getBufStart();
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
int32_t nks = qkbuf->length() / sizeof(QueryLink);
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
int32_t i;
for ( i = 0 ; i < nks ; i++ ) {
QUICKPOLL(m_niceness);
// stop at 300?
//if ( i >= 300 ) break;
QueryLink *qk = &qks[i];
int32_t qkOff = (char *)qk - qkbuf->getBufStart();
//int32_t relOff = qrOffs[i];
//QueryRel *rel = (QueryRel *)(base+relOff);
// skip if not head of a linked list
if ( ! qk->m_isFirst ) continue;
QueryLogEntry *qe ;
qe = qk->getQueryLogEntry(&m_relatedQueryStringBuf);
// relative to rqsb! m_relatedQueryStringBuf
float imp = qk->m_totalQueryImportance;
// modify by unique round? not yet...
//imp -= rel->m_uniqueRound * 1000;
// now use this function
if ( ! addTermsFromQuery ( qe->getQueryString() ,
qe->m_langId,
qe->m_gigablastTraffic,
qe->m_googleTraffic,
qkOff, // hackqoff
&tmpBuf ,
&scoreTable ,
topTermsTable ,
imp ,
true ) ) // is related query?
return NULL;
}
// sort MissingTerms from tmpBuf into m_missingTermBuf by
// MissingTerm::m_importance
if ( ! sortTermsIntoBuf ( &scoreTable,
&tmpBuf,
&m_missingTermBuf ) )
return NULL;
m_missingTermBufValid = true;
//m_numMissingTerms = i;
// store it
//if ( ! storeMissingTermBufIntoCachedb() )
// return (SafeBuf *)-1;
return &m_missingTermBuf;
}
// . now get the best terms from our matching queries
// . basically the exact same algo as getMissingTermBuf
SafeBuf *XmlDoc::getMatchingTermBuf ( ) {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_matchingTermBufValid )
return &m_matchingTermBuf;
SafeBuf *mq = getMatchingQueryBuf();
if ( mq == NULL || mq == (void *)-1 ) return mq;
HashTableX *topTermsTable = getTermIdBufDedupTable32();
if ( ! topTermsTable || topTermsTable == (void *)-1 )
return (SafeBuf *)topTermsTable;
// tmpBuf will hold the MissingTerms we add.
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( 100000 ,"t4buf") ) return NULL;
// maps 64-bit term hash (can be multiple words in a term) to
// an offset into tmpBuf. tmpBuf holds the missing terms, so we
// use scoreTable to accumulate MissingTerm::m_importance for
// the same term in different queries.
HashTableX scoreTable;
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
return NULL;
// scan the queries this doc matches and add MissingTerms for them
// into tmpBuf
int32_t nks = mq->length() / sizeof(QueryLink);
QueryLink *qks = (QueryLink *)mq->getBufStart();
int32_t i; for ( i = 0 ; i < nks ; i++ ) {
QUICKPOLL(m_niceness);
QueryLink *qk = &qks[i];
// stop at 300?
if ( i >= 300 ) break;
// "matching terms" have different hackqoff than missing terms
int32_t qkOff = (char *)qk - mq->getBufStart();
// relative to rqsb! m_relatedQueryStringBuf
float imp = qk->m_queryImportance;
// querylogentry does not have string info here! it is
// just the basic class
QueryLogEntry *qe ;
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
// . now use this function
if ( ! addTermsFromQuery ( qe->getQueryString(),
qe->m_langId,
qe->m_gigablastTraffic,
qe->m_googleTraffic,
qkOff, // hackqoff
&tmpBuf ,
&scoreTable ,
topTermsTable ,
imp ,
false ) ) // is related query?
return NULL;
}
// sort MatchingTerms from tmpBuf into m_matchingTermBuf by
// MatchingTerm::m_importance
if ( ! sortTermsIntoBuf ( &scoreTable,
&tmpBuf,
&m_matchingTermBuf ) )
return NULL;
m_matchingTermBufValid = true;
//m_numMatchingTerms = i;
// store it
//if ( ! storeMatchingTermBufIntoCachedb() )
// return (SafeBuf *)-1;
return &m_matchingTermBuf;
}
/*
// . max # of outstanding msg3f requests we can send to one host
// . now just make it 1 since it is msg3f NOT msg39
#define MAXOUT 1
//#define BINSIZE 100000
class Bin {
public:
// the current position for adding queries into m_buf
int32_t m_cursor;
int32_t m_maxCursor;
int32_t m_allocSize;
// some hack storage
Host *m_hackHost;
bool m_hackIsMsg99ReplyPtr;
// for sending the m_buf to its host
Multicast m_mcast;
// allocates size of BINSIZE bytes
char m_buf[0];
};
// . returns false and sets g_errno on error
// . returns true on successful launch of request, it will block always
bool XmlDoc::sendBin ( int32_t i ) {
Bin *bin = m_currentBinPtrs[i];
// get host
Host *h = g_hostdb.getHost(i);
// copy it
//int32_t reqSize = p - tmpBuf;
//char *req = mdup ( tmpBuf , reqSize , "3freq" );
//if ( ! req ) return true;
// increment outstanding requests he has
h->m_numOutstandingRequests++;
// this could be a ptr to a msg99reply or a querylink
Multicast *mcast = &bin->m_mcast;
//bin->m_hackxd = this;
//bin->m_hackPtrCursor = firstPtrCursor;
bin->m_hackHost = h;
// get his group id
uint32_t groupId = h->m_groupId;
char *req = bin->m_buf;
int32_t reqSize = bin->m_cursor;
// disown it so mcast can free it when its udpslot is destroyed
m_currentBinPtrs[i] = NULL;
// note that
setStatus("launching msg3f");
// log it too
//log("seopipe: launching msg3f request of %"INT32" gbdocid queries to "
// "score to host %"INT32"", queryCount,h->m_hostId);
// get the docIds for this query using msg3f.cpp's handleRequest3f()
bool status = mcast->send ( req ,
reqSize,
0x3f ,
false, // mcast frees request? no!!!
groupId, // group to send to
false, // send to whole group?
0 , // query hash for host in group select
this , // state1
bin,//mcast, // state2
gotMsg3fReplyWrapper,
86401, // timeout in seconds. LONG TIME!
m_niceness,
false, // realtime?
h->m_hostId // firsthostid to try
);
// mark it out
m_numMsg3fRequests++;
// if this is true then it was a success and we BLOCKED
if ( status ) {
// must BE IN USE!
if ( ! mcast->m_inUse ) { char *xx=NULL;*xx=0; }
// success
return true;
}
// it came back?
m_numMsg3fReplies++;
// undo this
h->m_numOutstandingRequests--;
// errno should be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// set error
m_binError = g_errno;
// note it
log("seopipe: mcast had error: %s", mstrerror(g_errno));
// free that bin i guess
mfree ( bin , bin->m_allocSize, "delbin" );
// return false on error
return false;
}
// . this is called from two places:
// 1. getMatchingQueriesScored() (Msg99Reply ptrs)
// 2. getRelatedQueryBuf() (QueryLink ptrs)
// . this can take Msg99Reply ptrs or it can take QueryLink ptrs
// . it will glean the docid from either of these two ptrs types as well
// as glean the pointer to the query string.
// . THEN it can create a 'gbdocid:xxxx | <queryString>' query which
// it will send to a host in the network.
// . it will try to keep each host in the network answering 5 such queries
// at any one time. bins are no longer used.
// . we need to implement heavy termlist caching remotely and locally to
// ensure optimal speed
// . returns false if blocked, true otherwise
// . returns true with g_errno set on error
bool XmlDoc::scoreDocIdRestrictedQueries ( Msg99Reply **replyPtrs ,
QueryLink *linkPtrs ,
int32_t numPtrs ) {
//log("debug: entered scoredocidrestrictedqueries");
if ( numPtrs == 0 ) return true;
// . sanity check
// . you can only score your Msg99Replies or your QueryLinks
// . score your Msg99Replies for queries that match the main url
// . score your QueryLinks for queries that match a related docid
if ( ! replyPtrs && ! linkPtrs ) { char *xx=NULL;*xx=0; }
if ( replyPtrs && m_setForReplyPtrs ) return true;
if ( linkPtrs && m_setForLinkPtrs ) return true;
// we now send the termlistbuf to each host receiving a msg3f
// request so when it performs the msg39 on a query we provide it
// will set QueryTerm::m_posdbListPtr to point to the termlists we
// provided only, just for this docid
SafeBuf *termListBuf = NULL;
if ( ! linkPtrs ) {
termListBuf = getTermListBuf();
if ( ! termListBuf ) return true;
if ( termListBuf==(void *)-1 ) return false;
}
// force to ten for debug
//numPtrs = 20;
sendLoop:
//
// cleanup if got all replies we can
//
if ( m_numMsg3fReplies == m_numMsg3fRequests &&
((m_qcursor >= numPtrs) || m_binError) ) {
//log("debug: cleanup");
// there might be remnant bins if we stopped trying to
// call sendBin because we hit m_binError
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
// see if that bin is still around
Bin *bin = m_currentBinPtrs[i];
if ( ! bin ) continue;
// this will core if the multicast is in use
bin->m_mcast.destructor();
// now nuke it then
mfree ( bin , bin->m_allocSize, "delbin" );
// now make it null
m_currentBinPtrs[i] = NULL;
}
// nuke this too!
if ( m_newxd2 ) {
mdelete ( m_newxd2 , sizeof(XmlDoc) , "newxd2" );
delete ( m_newxd2 );
m_newxd2 = NULL;
}
// free table's mem if used
m_tmpDupTable.reset();
// do not repeat this logic!
if ( replyPtrs ) {
m_setForReplyPtrs = true;
m_binErrorForReplyPtrs = m_binError;
}
if ( linkPtrs ) {
m_setForLinkPtrs = true;
m_binErrorForLinkPtrs = m_binError;
}
// inherit error? pass it on to caller
//if ( m_binError ) g_errno = m_binError;
// reset for another call to this function since we call
// if from two different places above
m_numMsg3fRequests = 0;
m_numMsg3fReplies = 0;
m_qcursor = 0;
m_binError = 0;
// all done!
g_errno = 0;
return true;
}
	// shortcut
char *base = m_tmpStringBuf5.getBufStart();
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
// store the queries in our buffer into the various bins and send
// a bin off when it gets full
queryLoop:
// breathe
QUICKPOLL(m_niceness);
// nothing left to do except wait for replies?
if ( m_qcursor >= numPtrs ) return false;
// assume ptr is good
bool good = true;
// set these
int64_t docId;
// the query as a string
char *qstr = NULL;
// for passing to mcast::m_hackQPtrs
void *vptr;
// get the ith QueryLink?
if ( linkPtrs ) {
QueryLink *qk = &linkPtrs[m_qcursor];
// skip if was not successfully processed above
		// because its hostid was dead perhaps?
if ( qk->m_queryHostId != -1 ) good = false;
// get from related docid in this case
SafeBuf *rdbuf = &m_relatedDocIdBuf;
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
docId = rd->m_docId;
// get it
QueryLogEntry *qe ;
qe = (QueryLogEntry *)(qk->m_queryStringOffset + base);
// and this. skip over goodserpscore, gigablastTraffic and
// googleTraffic
qstr = qe->getQueryString();
// save it
vptr = qk;
}
// make a new one for the first time
if ( linkPtrs && ! m_newxd2 ) {
try { m_newxd2 = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
m_binError = g_errno;
goto sendLoop;
}
mnew ( m_newxd2, sizeof(XmlDoc),"newxd2");
}
// set the xmldoc to this new docid, if it is new...
if ( linkPtrs && m_newxd2->m_docId != docId ) {
// a good stopping point?
if ( clientClosedConnection() ) {
m_binError = ESOCKETCLOSED;
goto sendLoop;
}
// set it from related doc's docid
if ( ! m_newxd2->set3 ( docId ,cr->m_coll, m_niceness ) ) {
m_binError = g_errno;
goto sendLoop;
}
// alloc space for tablen
if ( m_tmpDupTable.getNumSlots() <= 0 &&
! m_tmpDupTable.set ( 8,0,1024,NULL,0,false,m_niceness,
"tdtbl") ) {
m_binError = g_errno;
goto sendLoop;
}
// must not be in there already!
if ( m_tmpDupTable.isInTable ( &docId ) ) {
char *xx=NULL;*xx=0; }
// add it
if ( ! m_tmpDupTable.addKey ( &docId ) ) {
m_binError = g_errno;
goto sendLoop;
}
// ensure content is recycled from title rec
m_newxd2->m_recycleContent = true;
// newxd2 needs to use our master functions. so
// anytime one of its internal functions blocks, then
// our m_masterLoop will be called
// and we'll end up right here again!
m_newxd2->m_masterLoop = m_masterLoop;
m_newxd2->m_masterState = m_masterState;
// only get posdb keys really for this stuff
m_newxd2->m_useTitledb = false;
m_newxd2->m_useTagdb = false;
m_newxd2->m_useClusterdb = false;
m_newxd2->m_useSpiderdb = false;
m_newxd2->m_useLinkdb = false;
// debug
log("seopipe: setting newxd2 docid=%"INT64"",docId);
}
// pump this
if ( linkPtrs && ! m_newxd2->m_loaded ) {
// . CRAP, blocking here sucks because when this function
// is re-entered it can also be from a Msg3f reply
// not because this document is back from msg22a...
//log("debug: loading newxd2");
// try to set from title rec first. return false if blocks.
if ( ! m_newxd2->loadFromOldTitleRec() ) {
m_newxd2Blocked = true;
//log("debug: newxd2 blocked");
return false;
}
}
// i guess no longer out
if ( linkPtrs && m_newxd2->m_loaded )
m_newxd2Blocked = false;
//if ( linkPtrs )
// log("debug: newxd2 loaded=%"INT32"",(int32_t)m_newxd2->m_loaded);
// sanity check
if ( linkPtrs && ! m_newxd2->m_oldTitleRecValid ) {
char *xx=NULL;*xx=0;
}
// . did that fail? i.e. docid not found!?!?!
// . do not increment m_qcursor if m_binError is set
if ( linkPtrs && ! m_newxd2->m_oldTitleRec && ! m_binError ) {
// just skip this asshole then
if ( m_lastPrintedDocId != docId ) {
log("seopipe: related docid %"INT64" titlerec "
"load failed99",
docId);
}
m_lastPrintedDocId = docId;
// clear that
g_errno = 0;
// skip it
m_qcursor++;
// try the next one
goto queryLoop;
}
if ( linkPtrs ) {
// . CRAP, blocking here sucks because when this function
// is re-entered it can also be from a Msg3f reply
// not because it has the termlistbuf ready
// . use termlist buf of related docid
// . we need to ENSURE that the QueryLinks are clustered
// by related docid so this logic is efficient here
termListBuf = m_newxd2->getTermListBuf();
// return false if it blocked
if ( termListBuf == (void *)-1 ) {
//log("debug: newxd2 blocked in termlistbuf");
m_newxd2Blocked = true;
return false;
}
// this sucks. error!
if ( ! termListBuf ) {
m_binError = g_errno;
goto sendLoop;
}
}
// i guess no longer out
if ( linkPtrs ) {
//log("debug: newxd2 UNblocked in termlistbuf");
m_newxd2Blocked = false;
}
// wait for replies to come in so we can stop even if m_qcursor
// did not complete its scan!
// shit, but what if we are a msg22 coming in for m_newxd2? that
// is why i moved this check down here so we can set m_newxd2Blocked
// to false and allow the msg3f replies to come back in and free
// all the bins. this is kinda fucked up because everything is
// asynchronous.
if ( m_binError ) return false;
// otherwise the Msg99Reply
if ( ! linkPtrs ) {
Msg99Reply *qp = replyPtrs[m_qcursor];
// tis us!
docId = m_docId;
// sanity
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// and query string
qstr = qp->m_queryStr;
// save it
vptr = qp;
}
int32_t qlen = gbstrlen(qstr);
// mark as bad if this query is too big already
if ( m_firstUrl.m_ulen + qlen + 10 > MAX_QUERY_LEN )
good = false;
// if ptr was bad, do not evaluate at all
if ( ! good ) {
m_qcursor++;
goto queryLoop;
}
// sanity
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
// . get hash of query to determine bin
// . this keeps our term freqs consistent since every query goes
// back TO THE SAME HOST!!! thus our scores remain consistent.
// each host has a slightly different TermFreq/Weight for the
// exact same query because the termfreq is based on the termlist
// length for that termid. and each host has a different set of
// docids in its index for the most part.
uint32_t h32 = hash32n ( qstr );
int32_t numHosts = g_hostdb.getNumHosts();
// do not send to host #0 if we got a lot of hosts
if ( g_hostdb.getNumHosts() >= 8 ) numHosts--;
int32_t hostNum = h32 % numHosts;
// skip host #0 which is us i guess!
if ( g_hostdb.getNumHosts() >= 8 ) hostNum++;
// sanity for that
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
// get the current bin for that host
Bin *bin = m_currentBinPtrs [ hostNum ];
// alloc on demand
if ( ! bin ) {
// how big is the termlistbuf?
int32_t tsize = termListBuf->length();
int32_t collLen = gbstrlen(cr->m_coll);
// how much space do we need for a good bin?
int32_t alloc = sizeof(Bin) + 8 +1+ collLen + 1 + tsize + 100000;
// make that
char *mem = (char *)mmalloc ( alloc ,"binreq" );
if ( ! mem ) {
m_binError = g_errno;
goto sendLoop;
}
// cast it
bin = (Bin *)mem;
// store it
m_currentBinPtrs [ hostNum ] = bin;
// this includes a Multicast in the Bin
bin->m_mcast.constructor();
// for freeing
bin->m_allocSize = alloc;
// the end of it
char *memEnd = mem + alloc;
// reset offset into Bin::m_buf
bin->m_cursor = 0;
// is it to a msg99reply? so the reply handler knows how to
// handle mcast::m_hackQPtr and what action to take. it is
// slightly different.
if ( linkPtrs ) bin->m_hackIsMsg99ReplyPtr = 0;
else bin->m_hackIsMsg99ReplyPtr = 1;
// . before we add any queries, store langid of QUERY
// . crap just use doc langid for now
char *bp = bin->m_buf;
// first is docid. if doing QueryLinks this is the docid
// of the related docid, otherwise, it is that of our main doc
*(int64_t *)bp = docId; bp += 8;
// then langid
*bp = m_langId; bp++;
// then the coll
gbmemcpy ( bp , cr->m_coll , collLen );
bp += collLen;
*bp++ = '\0';
// sanity!
if ( bp >= memEnd ) { char *xx=NULL;*xx=0; }
// the size of the termlist buf
*(int32_t *)bp = tsize; bp += 4;
		// then the termlistbuf that has all the termlists for our docid
gbmemcpy ( bp , termListBuf->getBufStart(), tsize ); bp += tsize;
// update bin's cursor
bin->m_cursor = bp - bin->m_buf;
// for breach detection. send off Bin when breach happens.
bin->m_maxCursor = alloc - sizeof(Bin);
}
// can we store the current query into this bin?
bool storeInBin = true;
// is there enough room for this query in the bin?
int32_t need = qlen + 40;
if ( bin->m_cursor + need >= bin->m_maxCursor )
storeInBin = false;
// does docid of bin match?
int64_t binDocId = *(int64_t *)(bin->m_buf);
if ( docId != binDocId )
storeInBin = false;
// if we can't store this query into the bin, send it off now
if ( ! storeInBin ) {
// use its multicast to send this bin off if too full
if ( ! sendBin ( hostNum ) ) {
m_binError = g_errno;
goto sendLoop;
}
// . now the current bin should have been emptied
// . go back to top to realloc Bin::m_buf to hold this query
goto queryLoop;
}
char *p = bin->m_buf + bin->m_cursor;
// first store the offset from the buf so we can return it
// in the reply which is a list of scores basically and we know
// what score goes with what m_qcursor
*(int32_t *)p = m_qcursor;
p += 4;
// now store queries in the request buf for the msg3f
p += sprintf(p,"gbdocid:%"UINT64" | %s",docId,qstr);
*p++ = '\0';
// update cursor
bin->m_cursor = p - bin->m_buf;
// skip to next query/docid to evaluate
m_qcursor++;
// if we have more queries left, add them to bins now
if ( m_qcursor < numPtrs ) goto queryLoop;
// now send every bin, we have no queries left.
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( ! m_currentBinPtrs[i] ) continue;
// this will transfer the request buffer over to mcast
// so it will be freed when mcast returns
sendBin ( i );
}
goto sendLoop;
}
// we got back the score for each query link in
// the bin that we sent out for the docid specified in the bin header request
void XmlDoc::gotMsg3fReply ( Bin *bin ) { // Multicast *mcast ) {
setStatus ( "gotmsg3freply" );
// do some housekeeping
Host *h = bin->m_hackHost;
h->m_numOutstandingRequests--;
m_numMsg3fReplies++;
// sanity
Multicast *mcast = &bin->m_mcast;
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0; }
// get the reply
bool freeIt = false;
int32_t replySize = 0;
int32_t replyMaxSize;
char *rbuf = mcast->getBestReply ( &replySize ,
&replyMaxSize ,
&freeIt ,
true ); // steal it?
// log it too
//log("seopipe: got msg3f reply from host %"INT32" size=%"INT32" bytes",
// h->m_hostId,replySize);
// cast it
//Msg3fReply *mr = (Msg3fReply *)rbuf;
// in case of mem-leak this helps
//if ( rbuf ) relabel(rbuf,replyMaxSize,"xx-rb");
// . we must be able to free it... we must own it
// . this is true if we should free it, but we should not have
// to free it since it is owned by the slot?
if ( freeIt ) {
log(LOG_LOGIC,"query: msg3f: Steal failed.");
char *xx = NULL; *xx=0;
}
// if it failed for some reason i guess just bail
if ( ! rbuf ) {
// clean up the bin and the multicast and the request buffer
mfree ( bin , bin->m_allocSize, "delbin" );
g_errno = EBADREPLYSIZE;
log(LOG_LOGIC,"seopipe: bad msg3f empty reply");
return;
}
// reply is just sequence of docid/score pairs
char *rp = rbuf;
char *rpEnd = rbuf + replySize;
//int32_t firstCursor = bin->m_hackPtrCursor;
// scan the msg99 replies and insert the scores we got for each
// query from the msg3f reply in "rbuf"
for ( ; rp < rpEnd ; ) {
// breathe
QUICKPOLL(m_niceness);
// . first is index, what query # in the request are we
// processing now, might not be in order because we launch
// a bunch of msg39s in parallel in handleRequest3f()'s call
// to processQueries()
// . but the corresponding msg99reply is reply # "qcursor"
int32_t qcursor = *(int32_t *)rp;
rp += 4;
int64_t docId = *(int64_t *)rp;
rp += 8;
float score = *(float *)rp;
rp += 4;
// . if this is true that means qcursor is referencing a
// msg99reply and we should set the score of that msg99
// reply to what the handlerequest3f provided
// . so store the docid and score for our url for this query
if ( bin->m_hackIsMsg99ReplyPtr ) {
SafeBuf *mqbuf = getMatchingQueries(false,-1);
Msg99Reply **qptrs=(Msg99Reply **)mqbuf->getBufStart();
Msg99Reply *qr = qptrs[qcursor];
qr->m_myScore = score;
qr->m_myDocId = docId;
int32_t numQueryPtrs=mqbuf->length()/sizeof(Msg99Reply *);
// if too many skip some
if ( numQueryPtrs > 1000 && (qcursor%1000)!=0)continue;
// if too many skip some
if ( numQueryPtrs > 400 && (qcursor%100) !=0)continue;
char *qstr = qr->m_queryStr;
log("seopipe: got query #%"INT32"of%"INT32" score=%f qstr=%s"
,qcursor+1
,numQueryPtrs
,score
,qstr
);
continue;
}
// might be storing in a QueryLink (doing related docids)
//SafeBuf *ibuf = getRelatedQueryLinksWithStrings();
QueryLink *qks =(QueryLink *)m_tmpBuf5.getBufStart();
//int32_t numQueryLinks = ibuf->length() / sizeof(QueryLink);
QueryLink *qk = &qks[qcursor];
// sanity. make sure qk->m_queryStringOffset is related to our
// local m_tmpStringBuf5 and not relative to the
// g_qbuf of the hostid that sent back the msg99 reply.
if ( qk->m_queryHostId != -1 ) { char *xx=NULL;*xx=0; }
// how many related query links do we got? for logging.
int32_t nks = m_tmpBuf5.length()/sizeof(QueryLink);
		// shortcuts
char *base = m_tmpStringBuf5.getBufStart();
// skip over gigablastTraffic and googleTraffic
QueryLogEntry *qe;
qe = (QueryLogEntry *)(base + qk->m_queryStringOffset);
SafeBuf *rdbuf = &m_relatedDocIdBuf;
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
// note it
if ( (qcursor % 1000) == 0 ) // || qcursor < 100 )
log("seopipe: got msg3f reply for related query "
"#%"INT32"of%"INT32" "
"query \"gbdocid:%"INT64" | %s\" gigablasttraffic=%"INT32" "
"googletraffic=%"INT32" serpscore=%f goodscore=%f"
,qcursor+1
,nks
,rd->m_docId
,qe->getQueryStr()
,qe->m_gigablastTraffic
,qe->m_googleTraffic
,score
,qe->m_topSERPScore // of a docid slice on 1 host
);
//
// no longer used queryrel!
//
// if we are scoring QueryLinks then we add a QueryRel
//QueryRel qr;
// clear that mem to zero
//memset ( &qr , 0 , sizeof(QueryRel));
// then add the info we know
//qr.m_relatedDocId = qk->m_relatedDocId;
//char *base2 = m_relatedDocIdBuf.getBufStart();
//int32_t rdOff = (char *)qk->m_relatedDocId - base2;
//qr.m_relatedDocIdOff = rdOff;
//qr.m_offsetIntoRelQStrBuf = qk->m_queryStringOffset;
//qr.m_myScore = score;
//qr.m_nextOff = -1;
//qr.m_tailOff = -1;
qk->m_serpScore = score;
// save that. WHAT IF THIS ERRORS?!?!?!
//if ( ! m_queryRelBuf.safeMemcpy(&qr,sizeof(QueryRel)) ) {
// m_binError = g_errno;
// log("xmldoc: panic. failed to store query rel");
// break;
//}
// debug test
//m_binError = EBADENGINEER;
//log("xmldoc: panic2. failed to store query rel");
//break;
}
// ok, we got the docid and score, now free it
mfree ( rbuf , replyMaxSize , "fmsg3f" );
// clean up the bin and the multicast and the request buffer
mfree ( bin , bin->m_allocSize, "delbin" );
//if ( m_newxd2Blocked )
// log("debug: got reply, but returning because newxd2 "
// "had blocked");
// prevent double entry bug from entering scoreDocIdRestrictedQueries()
// from a newxd2 function blocking and coming in through msg22
// callback or whatever, vs. coming in from here
if ( m_newxd2Blocked ) return;
//log("debug: got reply and calling masterloop");
// go back to the transmit function
m_masterLoop ( m_masterState );
// if not done, just return... otherwise we double enter
// scoreDocIdRestrictedQueries() along with it's call to
// getTermListBuf()... and all hell breaks loose
return;
}
*/
/*
// send contents of m_socketWriteBuf to m_seoSocket
void XmlDoc::pumpSocketWriteBuf ( ) {
if ( ! m_seoSocket ) return;
setStatus ( "pumpsocketwritebuf" );
SafeBuf *sb = &m_socketWriteBuf;
// insert http header into m_socketWriteBuf if not there
char *wbuf = sb->getBufStart();
bool insertIt = false;
if ( ! wbuf ) insertIt = true;
if ( wbuf && strncmp(wbuf,"HTTP/1.0 ",9 ) ) insertIt = true;
// add http header first
if ( insertIt ) {
// reset # bytes sent
m_socketWriteBufSent = 0;
m_registeredSocketCallback = false;
// xml-itize each query reply without scoring info
sb->insert("HTTP/1.0 200 OK\r\n"
"Content-Type: text/xml ; "
"charset=utf-8\r\n"
"\r\n"
"<response>\n",0);
}
// come back here to do another send
sendLoop:
// try sending out our xml buffer on the socket
// the very first things we do is send the queries over without
// the ranking info which we compute by calling msg39 on each query,
// so at least we can display something quite quickly.
if ( m_socketWriteBufSent < sb->length() ) {
int32_t sd = m_seoSocket->m_sd;
// just in case
if ( m_registeredSocketCallback ) {
g_loop.unregisterWriteCallback(sd,this,
getSEOQueryInfoWrapper2);
m_registeredSocketCallback = false;
}
// send that off
int32_t sendLen = sb->length();
char *sendStr = sb->getBufStart();
char *sendEnd = sendStr + sendLen;
// if we sent SOME last time, skip over that
sendStr += m_socketWriteBufSent;
// how much left?
int32_t remaining = sendEnd - sendStr;
// wtf?
if ( remaining <= 0 ) { char *xx=NULL;*xx=0; }
// try a send on non-blocking socket
int32_t n = ::send ( sd , sendStr , remaining , 0 );
// did we send something?
if ( n > 0 ) {
m_socketWriteBufSent += n;
goto sendLoop;
}
// maybe it sent 0 because it was waiting for something
// so set our callback for when the socket is ready for
// writing again. try sending more later.
g_loop.registerWriteCallback ( sd ,
this ,
getSEOQueryInfoWrapper2,
0 ); // niceness = 0
// flag it so we don't leak these
m_registeredSocketCallback = true;
}
}
*/
bool XmlDoc::getIsInjecting ( ) {
bool isInjecting = false;
//if ( g_inPageInject ) isInjecting = true;
if ( m_sreqValid && m_sreq.m_isInjecting ) isInjecting = true;
if ( m_isInjecting && m_isInjectingValid ) isInjecting = true;
return isInjecting;
}
int posdbKeyCmp ( const void *a, const void *b ) {
char *ka = (char *)a;
char *kb = (char *)b;
//int64_t tid64a = g_posdb.getTermId(ka);
//int64_t tid64b = g_posdb.getTermId(kb);
// a bit of a hack so handleRequest8e already has these
// guys sorted by their lower 32-bits of termids so it can
// match this doc to queries without having to sort first.
//uint32_t tid32a = (uint32_t)tid64a;
//uint32_t tid32b = (uint32_t)tid64b;
//if ( tid32a < tid32b ) return -1;
//if ( tid32a > tid32b ) return 1; // swap
//if ( tid64a < tid64b ) return -1;
//if ( tid64a > tid64b ) return 1; // swap
char val = KEYCMP(ka,kb,sizeof(POSDBKEY));
if ( val > 0 ) return 1;
if ( val < 0 ) return -1;
return 0;
}
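// Illustrative note (not compiled): KEYCMP() above compares the full
// 18-byte POSDBKEY, so sorting a buffer of full keys with posdbKeyCmp()
// groups all keys for the same termid together, which is exactly what
// getTermListBuf() below relies on when it builds one termlist per
// termid. A minimal sketch using only the g_posdb accessors already
// used in this file:
/*
char *p    = sortedBuf->getBufStart();
char *pend = sortedBuf->getBuf();
int64_t lastTermId = -1LL;
for ( ; p < pend ; p += sizeof(POSDBKEY) ) {
int64_t termId = g_posdb.getTermId ( p );
// a change in termid marks the start of a new per-termid run
if ( termId != lastTermId ) lastTermId = termId;
}
*/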
// . used by XmlDoc::getTermListBuf() below
// . sorted by posdb key straight up
SafeBuf *XmlDoc::getTermIdSortedPosdbListBuf ( ) {
if ( m_sortedPosdbListBufValid )
return &m_sortedPosdbListBuf;
// get the lists. forDelete = false.
char *metaList = getMetaList ( false );
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
// sanity
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// make a tmp buf to hold posdb keys
//SafeBuf tmp;
if ( ! m_sortedPosdbListBuf.reserve(m_metaListSize,"spbuf"))
return NULL;
// point into it
char *dst = m_sortedPosdbListBuf.getBufStart();
// debug test
//verifyMetaList ( m_metaList ,
// m_metaList + m_metaListSize ,
// false );
// scan the meta list for posdb keys
char *p = metaList;
char *pend = p + m_metaListSize;
// stole this loop from getMetaList()
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save it with the flag
char byte = *p;
// get rdbId
char rdbId = byte & 0x7f;
// skip that
p++;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// get key
char *key = p;
// skip that
p += ks;
// get datasize
int32_t ds = getDataSizeFromRdbId(rdbId);
// assume we do not store the datasize
//bool neg = false;
// . if key is negative, no data is present
// . the doledb key is negative for us here
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
// if datasize variable, read it in
if ( ds == -1 ) {
// get data size
ds = *(int32_t *)p;
// skip data size int32_t
p += 4;
}
// point to data
//char *data = p;
// skip data if not zero
p += ds;
// if not posdb skip rec
if ( rdbId != RDB_POSDB ) continue;
// skip negative keys
if ( (key[0] & 0x01) == 0x00 ) continue;
// add to new buf now
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
// advance
dst += sizeof(POSDBKEY);
}
char *start = m_sortedPosdbListBuf.getBufStart();
// update tmp
m_sortedPosdbListBuf.incrementLength ( dst - start );
// sanity
if ( m_sortedPosdbListBuf.length() > m_metaListSize ) {
char *xx=NULL;*xx=0; }
// point
char *pbuf = m_sortedPosdbListBuf.getBufStart();
int32_t numKeys = m_sortedPosdbListBuf.length()/sizeof(POSDBKEY);
// sort keys by termid
gbqsort ( pbuf ,
numKeys,
sizeof(POSDBKEY),
posdbKeyCmp,
m_niceness );
m_sortedPosdbListBufValid = true;
return &m_sortedPosdbListBuf;
}
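// Illustrative note (not compiled): the meta list scanned above is a flat
// sequence of variable-size records, each laid out as
//   [1 byte rdbId+flag][key of getKeySizeFromRdbId() bytes]
//   [4-byte dataSize, only present when the rdb has variable-size data
//    and the key is positive][dataSize bytes of data]
// A minimal record walker under those assumptions:
/*
for ( char *p = metaList ; p < metaList + m_metaListSize ; ) {
char rdbId = *p & 0x7f;                  // strip the flag bit
p++;                                     // skip the rdbId byte
char *key = p;
p += getKeySizeFromRdbId ( rdbId );      // fixed-size key
int32_t ds = getDataSizeFromRdbId ( rdbId );
if ( (key[0] & 0x01) == 0x00 ) ds = 0;   // negative key: no data
if ( ds == -1 ) { ds = *(int32_t *)p; p += 4; } // variable-size data
p += ds;                                 // skip the data
}
*/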
#define TLBUFSIZE 5000
// . used by the seo pipeline
// . this is a list of posdb termlists, one termlist per termid.
// . we store each termlist in this termlistbuf into g_termListCache
// . we use g_termListCache for evaluating gbdocid:xxx| restricted queries
// very quickly without having to hit disk because all the posdb termlists
// for that docid should be in g_termListCache
SafeBuf *XmlDoc::getTermListBuf ( ) {
if ( m_termListBufValid )
return &m_termListBuf;
// . ensure content is recycled from title rec
// . no, because if we had to download the doc fresh for the first
// time, this caused us headaches around line 30657 and we ended
// up setting m_docIndexed to false there and calling logIt() twice!
//m_recycleContent = true;
//m_recycleLinkInfo = true;
// try to set from title rec first. return false if it blocked.
//if ( ! loadFromOldTitleRec() ) return (SafeBuf *)-1;
// did that fail? i.e. docid not found!?!?!
//if ( m_oldTitleRecValid && ! m_oldTitleRec ) {
// g_errno = ENOTFOUND;
// return NULL;
//}
// only get posdb keys in getMetaList()
/*
m_useTitledb = false;
m_useTagdb = false;
m_useClusterdb = false;
m_useSpiderdb = false;
m_useLinkdb = false;
*/
// . these are FULL 18-byte keys, no compression
// . sorted by posdbkeys straight up, so by termid
SafeBuf *posdbBuf = getTermIdSortedPosdbListBuf ();
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
// . reserve mem for new termlistbuf
// . include 4 bytes for listsize
// . this buffer will be a list of lists
int32_t need = numKeys * (sizeof(POSDBKEY) + 4);
if ( ! m_termListBuf.reserve ( need ,"tlstbuf" ) )
return NULL;
int64_t lastTermId = -1LL;
/*
char tmpBuf[TLBUFSIZE];
// build termlists from the posdb records
RdbList termList;
// stolen from RdbList::set
termList.m_list = tmpBuf;
termList.m_listSize = 0;
termList.m_listEnd = tmpBuf;
termList.m_alloc = tmpBuf;
termList.m_allocSize = TLBUFSIZE;
termList.m_ownData = false;
termList.m_ks = sizeof(POSDBKEY);
termList.m_fixedDataSize = 0;
termList.m_ownData = false;
termList.m_useHalfKeys = true;
termList.resetListPtr();
bool breakOut = false;
*/
// start a size bookmark
int32_t *bookmark = NULL;
// scan all the sorted posdb keys and build posdb termlists and
// store the termlists into "m_termListBuf"
char *p = posdbBuf->getBufStart();
char *pend = p + posdbBuf->length();
for ( ; p < pend ; ) {
// get the key
char *key = p;
// must be full 18 byte keys!
if ( p[0] & 0x06 ) { char *xx=NULL;*xx=0; }
// skip it
p += sizeof(POSDBKEY);
// get key termid
int64_t termId = g_posdb.getTermId ( key );
// sanity
int64_t docId = g_posdb.getDocId ( key );
if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
// sanity. is it sorted by termid?
if ( termId < lastTermId && lastTermId != -1 ) {
char *xx=NULL;*xx=0; }
// log it for debug
//if ( docId == 192304365235LL )
// log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
// docId,
// termId,
// g_posdb.getWordPos(key));
// . store size of keys following that have same termid
// . assume just one for now!
if ( termId != lastTermId ) {
bookmark = (int32_t *)m_termListBuf.getBuf();
m_termListBuf.pushLong(sizeof(POSDBKEY));
}
// store the key
m_termListBuf.safeMemcpy ( key , sizeof(POSDBKEY) );
// if not first in the list, update size
if ( termId == lastTermId ) *bookmark += sizeof(POSDBKEY);
// . cache currently made list then
// . set startkey/endkey
//char startKey[sizeof(POSDBKEY)];
//char endKey [sizeof(POSDBKEY)];
//g_posdb.makeStartKey(startKey,lastTermId,m_docId);
//g_posdb.makeEndKey (endKey,lastTermId,m_docId);
// update it for next list
lastTermId = termId;
// . add to ongoing list? will use compression bit.
// . return true with g_errno set on error
// . use g_termListCache in Msg0.cpp
//if(!addToTermListCache(cr->m_coll,startKey,endKey,&termList))
// return true;
// first store the list size
//m_termListBuf.pushLong(termList.m_listSize);
// then the list data itself
//m_termListBuf.safeMemcpy(termList.m_list,termList.m_listSize)
// now reset
//termList.m_listSize = 0;
//termList.m_list = tmpBuf;
//termList.m_listEnd = tmpBuf;//ermList.m_list;
//termList.resetListPtr();
// if we are a loopback, bail
//if ( breakOut ) break;
// are we the last record?
//if ( p >= pend ) breakOut = true;
// add fresh to the new termlist
//goto addIt;
}
// sanity
if ( m_termListBuf.length() &&
g_posdb.getDocId(m_termListBuf.getBufStart()+4) != m_docId ) {
char *xx=NULL;*xx=0; }
m_termListBufValid = true;
return &m_termListBuf;
// print timing
//int64_t now = gettimeofdayInMilliseconds();
//int64_t took = now - m_cacheStartTime;
//log("seopipe: took %"INT64" ms to parse docid %"INT64"",took,m_docId);
// . flag it as being completely cached now
// . returns false and sets g_errno on error
//return addDocIdToTermListCache ( m_docId , cr->m_coll );
}
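// Illustrative note (not compiled): m_termListBuf built above is a "list
// of lists". Each per-termid termlist is a 4-byte size followed by that
// many bytes of full 18-byte posdb keys sharing one termid:
//   [int32 listSize][POSDBKEY ...][int32 listSize][POSDBKEY ...] ...
// which is also why the sanity check above reads the docid at offset +4.
// A minimal consumer sketch under that layout:
/*
char *p    = m_termListBuf.getBufStart();
char *pend = m_termListBuf.getBuf();
for ( ; p < pend ; ) {
int32_t listSize = *(int32_t *)p;
p += 4;
// all keys in this termlist share the same termid
int64_t termId = g_posdb.getTermId ( p );
p += listSize;
}
*/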
//int32_t XmlDoc::getNumInsertableTerms ( ) {
// // make sure they called getInsertableTerms() first!
// if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0;}
// return m_insertableTermsBuf.length() / sizeof(InsertableTerm);
//}
// . return a list of InsertableTerms
// . these are just terms we will try to insert into the document in every
// possible place to see how they affect ranking of this document for
// all the applicable queries
// . then when we call getScoredInsertableTerms() it will fill in the
// m_queryChangeBuf array
SafeBuf *XmlDoc::getInsertableTerms ( ) {
if ( m_insertableTermsBufValid )
return &m_insertableTermsBuf;
// make sure related query string buf is valid
//SafeBuf *rrr = getRelatedQueryLinksWithStrings();
//if ( ! rrr || rrr == (void *)-1 ) return rrr;
// just use this now
SafeBuf *mtBuf = getMissingTermBuf();
if ( ! mtBuf || mtBuf == (void *)-1 ) return mtBuf;
// get buffer of ptrs to the msg99 replies for this url
//SafeBuf *mqbuf = getMatchingQueries ( false );
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
// just use the MissingTerm class for these as well!!
SafeBuf *maBuf = getMatchingTermBuf();
if ( ! maBuf || maBuf == (void *)-1 ) return maBuf;
//
// alloc space for the insertable terms in its safebuf
//
int32_t need = 0;
char *p;
char *pend;
p = mtBuf->getBufStart();
pend = mtBuf->getBuf();
for ( ; p < pend ; ) {
MissingTerm *mt = (MissingTerm *)p;
p += mt->getSize();
need += sizeof(InsertableTerm);
need += mt->getTermSize();
}
// these are the matching terms, but use the same MissingTerm class
p = maBuf->getBufStart();
pend = maBuf->getBuf();
for ( ; p < pend ; ) {
MissingTerm *mt = (MissingTerm *)p;
p += mt->getSize();
need += sizeof(InsertableTerm);
need += mt->getTermSize();
}
if ( ! m_insertableTermsBuf.reserve ( need ,"itblbuf" ) ) return NULL;
//
// now interleave the matching terms with the related terms
//
char *p1 = mtBuf->getBufStart();
char *p1End = mtBuf->getBuf();
char *p2 = maBuf->getBufStart();
char *p2End = maBuf->getBuf();
// shortcut
SafeBuf *ib = &m_insertableTermsBuf;
int32_t count; for ( count = 0 ; ; count++ ) {
// . just get top 50 insertable terms
// . use #define MAX_INSERTABLE_TERMS 50?
if ( count >= 50 ) break;
bool add1 = false;
bool add2 = false;
if ( ( count % 2 ) == 0 && p1 < p1End ) add1 = true;
if ( ( count % 2 ) == 1 && p2 < p2End ) add2 = true;
if ( ! add1 && ! add2 ) break;
MissingTerm *mt;
if ( add1 ) {
mt = (MissingTerm *)p1;
p1 += mt->getSize();
}
if ( add2 ) {
mt = (MissingTerm *)p2;
p2 += mt->getSize();
}
// make an insertable term
InsertableTerm it;
if ( add1 ) it.m_isRelatedTerm = true;
else it.m_isRelatedTerm = false;
// sum of traffic of the queries that contained this term
it.m_trafficSum = mt->m_traffic;
// hash it up
char *term = mt->getTerm();
int32_t termSize = mt->getTermSize();
it.m_termHash64 = hash64 ( term , termSize - 1 );
it.m_termSize = termSize;
// reset this for later use
it.m_bestTrafficGain = -1;
it.m_bestInsertPos = -1;
// store that insertable term
ib->safeMemcpy(&it,sizeof(InsertableTerm));
// then the term string itself follows for easy serialization
// into cachedb...
ib->safeMemcpy(term,termSize);
}
if ( ib->length() > need ) { char *xx=NULL;*xx=0; }
//m_numInsertableTerms = count;
m_insertableTermsBufValid = true;
return &m_insertableTermsBuf;
}
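// Illustrative note (not compiled): each element appended above is a
// fixed-size InsertableTerm immediately followed by its NUL-terminated
// term string of m_termSize bytes, which is why consumers advance with
// it->getSize() instead of sizeof(InsertableTerm). The loop alternates
// one missing term (m_isRelatedTerm = true) with one matching term and
// caps the list at 50 entries. A minimal scan sketch:
/*
char *p    = m_insertableTermsBuf.getBufStart();
char *pend = m_insertableTermsBuf.getBuf();
for ( ; p < pend ; ) {
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();            // struct + trailing term string
}
*/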
static void gotMsg95ReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotMsg95Reply( slot );
}
void XmlDoc::gotMsg95Reply ( UdpSlot *slot ) {
// . store each msg95reply BEFORE the "still waiting" early return below,
// otherwise only the final reply ever gets saved and the earlier read
// buffers are lost with their slots
// . TODO: do we need m_msg95ReplyAlloc[] like m_msg99 has?
m_msg95ReplyPtrs [slot->m_hostId] = slot->m_readBuf;
m_msg95ReplySizes[slot->m_hostId] = slot->m_readBufSize;
// do not let it free it, we will free it
slot->m_readBuf = NULL;
// count it
m_numMsg95Replies++;
// return if still waiting
if ( m_numMsg95Replies < m_numMsg95Requests ) return;
// all done! should call getScoredInsertableTerms() indirectly
m_masterLoop ( m_masterState );
}
#include "seo.h" // for Msg95Request class
/*
// return a buffer of WordFreqInfo instances for every word in the
// insertable terms buffer. we use this so the msg95 handler can get the
// term freqs of any term in any matching query consistently, because
// we are host #0 calling this presumably. msg95 handler will use these
// to set the termfreqs in the Msg39Request when calling msg39.
// TODO: run through related queries as well! why didn't insertable terms
// work!?!?! it should...
SafeBuf *XmlDoc::getInsertableWordFreqInfoBuf ( ) {
// must always be host 0 or it's twin! we have to ensure
// consistency always when calling getTermFreq()...
if ( g_hostdb.m_groupId != 0 ) { char *xx=NULL;*xx=0; }
if ( m_iwfiBufValid )
return &m_iwfiBuf;
// get the same top word ids we pass to the msg95 request,
// because handleRequest95() uses those to get the queries
// that we match, and it evaluates each of those queries on each
// insertion we do.
// So that is the ptr_twid32Buf, which MUST include all
// insertable terms as well, like those insertable terms that are
// new to us!!
// scan list of insertable terms
SafeBuf *itBuf = getInsertableTerms();
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
// . true means to get synonyms
// . itBuf non-null will append new insertable terms we don't have
int32_t *twids = getTopTermsVectorWithNewTerms ( true , itBuf );
if ( ! twids || twids==(void *)-1 ) return (SafeBuf *)twids;
// shortcut
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
//int32_t ni = itBuf->length() / sizeof(InsertableTerm);
// get buffer of ptrs to the msg99 replies for this url
//SafeBuf *mqbuf = getMatchingQueries ( false );
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
//Msg99Reply **mrp = (Msg99Reply **)mqbuf->getBufStart();
//int32_t nmrp = mqbuf->length() / 4;
// use table to dedup so we do not store dups
HashTableX dups;
if ( ! dups.set ( 8,0,8192,NULL,0,false,m_niceness,"iwfidup") )
return NULL;
// . first store the langid in the buf!!!
// . then the wordfreqinfos follow!
if ( ! m_iwfiBuf.safeMemcpy ( &docLangId , 1 ) )
return NULL;
char *p = itBuf->getBufStart();
char*pend = itBuf->getBuf();
// scan each "term" which might be one or more words
for ( ; p < pend ; ) {
//for ( int32_t i = 0 ; i < nmrp ; i++ ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// add it in
if ( ! addTermFreqsForTerm ( it->getTerm() , &dups ) )
return NULL;
}
// do the same for all words and bigram terms in doc as well
m_iwfiBufValid = true;
return &m_iwfiBuf;
}
bool XmlDoc::addTermFreqsForTerm ( char *term , HashTableX *dups ) {
// we need this for synonyms
//uint8_t langId = langEnglish;
uint8_t *langIdPtr = getLangId();
// this should have been set by parent caller
if ( ! langIdPtr || langIdPtr == (uint8_t *)-1 ) {char *xx=NULL;*xx=0;}
// get the language this doc is in
uint8_t docLangId = *langIdPtr;
// if unknown, use english!
if ( docLangId == langUnknown ) docLangId = langEnglish;
//Msg99Reply *mr = mrp[i];
//Words ww;
//ww.set3 ( it->m_termStr );
//ww.set3(it->getTerm() );//mr->m_queryStr );//it->m_termStr );
Query qq;
// false = query expansion? i.e. use synonyms?
//qq.set2 ( it->getTerm(),docLangId,true);
qq.set2 ( term,docLangId,true);
//if ( strstr ( mr->m_queryStr, "bio wagner"))
// log("hey");
log("adding %s",term);
//int64_t *wids = ww.getWordIds();
// scan each word for term freq
for ( int32_t j = 0 ; j < qq.m_numTerms ; j++ ) {
// shortcut
QueryTerm *qt = &qq.m_qterms[j];
// get the full 64-bit hash of the word
int64_t wid = qt->m_rawTermId;
// skip if punct
if ( ! wid ) continue;
// dup?
if ( dups->isInTable ( &wid ) ) continue;
// add it
int64_t tf = g_posdb.getTermFreq ( cr->m_coll, wid );
if ( ! dups->addKey ( &wid ) ) return NULL;
WordFreqInfo wfi;
wfi.m_wordId64 = wid;
wfi.m_wordFreq64 = tf;
// note it
SafeBuf bb;
bb.safePrintf("seo: tf for term=\"");
bb.safeMemcpy ( qt->m_term, qt->m_termLen);
bb.safePrintf("\" = %"INT64"",tf);
log("seo: %s",bb.getBufStart());
// store it
if(!m_iwfiBuf.safeMemcpy(&wfi,sizeof(WordFreqInfo)))
return NULL;
}
return true;
}
*/
// 2. now transmit all the insertable terms to each host in the network. each
// host will evaluate each term in the list for every query that that
// host has in its memory for every new word position. kick this process
// off with the getNewRanks() function which returns a list of
// query terms where each query term has a wordposition/trafficgain
// array. [try to also insert entire phrases not just words]
// Each host will return an InsertedTerm class for each term. But then
// WE have to merge the InsertedTerm classes together for a particular
// term. That can be a bit tricky since we do not list a wordposition
// if its traffic gain was the same as its previous wordposition.
// PASS in the entire doc's termlist with each request in case not in cache
// so it can evaluate each query's scores very quickly!
//
// . send a msg95 request to each host consisting of a list of terms to
// insert, and the entire termlists of this document.
// . then merge the replies into a final list of InsertedTerms.
// . returned is buffer of InsertableTerms
SafeBuf *XmlDoc::getScoredInsertableTerms ( ) {
setStatus ( "getscoredinsertableterms" );
if ( m_scoredInsertableTermsBufValid )
return &m_insertableTermsBuf;
uint8_t *langIdPtr = getLangId();
if ( ! langIdPtr || langIdPtr == (void *)-1 )
return (SafeBuf *)langIdPtr;
SafeBuf *itBuf = getInsertableTerms();
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
// these are the posdb keys of our document, makes it fast
// and easy for msg39 to return a serp score restricted to our docid
SafeBuf *termListBuf = getTermListBuf();
if ( ! termListBuf || termListBuf==(void *)-1 )
return termListBuf;
// this has all our documents terms and their synonyms in it,
// as well as the new terms we plan to insert that our doc does not
// have, from the getMissingTerms() buffer. in addition it
// has the term freq of each one!
SafeBuf *ntiBuf = getNewTermInfoBuf();
if ( ! ntiBuf || ntiBuf == (void *)-1 ) return (SafeBuf *)ntiBuf;
// get list of TermFreqInfo instances for all words in the
// list of insertable terms
//SafeBuf *wfib = getInsertableWordFreqInfoBuf ( );
//if ( ! wfib || wfib == (void *)-1 ) return wfib;
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) return wpib;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if still waiting for replies to come in, return -1
if ( m_numMsg95Requests > 0 && m_numMsg95Replies < m_numMsg95Requests )
return (SafeBuf *)-1;
top:
// otherwise, we are done!
if ( m_numMsg95Requests > 0 && m_numMsg95Replies >=m_numMsg95Requests){
// . calculate the best insertable position for each
// Insertable Term.
// . we get a QueryChange array back from each host for
// the same term, but for queries local on that host,
// so add them all up here and set
// InsertableTerm::m_bestTrafficGain/m_bestTermPosition
// . queries that did not have us in the top 50 will not
// be in the reply
processMsg95Replies();
// show how long it took
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginMsg95s;
log("seopipe: time: getscoredinsertableterms took %"INT64" ms",
took);
// return the list of InsertableTerms, scored
m_scoredInsertableTermsBufValid = true;
// cache it! if it blocks that is ok, since it is valid now
// disable for debug... MDW!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
if ( ! storeScoredInsertableTermsIntoCachedb() )
return (SafeBuf *)-1;
return &m_insertableTermsBuf;
}
// now send every term in this list to every host in the
// network so it can evaluate with each of the queries it contains
// in memory from the query log for every position in the doc.
// then it will return InsertableTerm::m_wordPositions/m_trafficGain
// arrays for each InsertableTerm.
// time how long this whole thing takes
m_beginMsg95s = gettimeofdayInMilliseconds();
// reset this crap i guess
m_numMsg95Requests = 0;
m_numMsg95Replies = 0;
// from seo.h
Msg95Request mr;
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
mr.m_docId = m_docId;
mr.m_docLangId = *langIdPtr;
mr.m_seoDebug = m_seoDebug;
mr.ptr_posdbTermList = termListBuf->getBufStart();
// a buffer of TermInfos. used to set the termFreq of each term
// and used to determine what queries match the doc and should be
// evaluated for every insertion.
mr.ptr_termInfoBuf = ntiBuf->getBufStart();
mr.ptr_coll = cr->m_coll;
//mr.ptr_wordFreqInfoBuf = wfib->getBufStart();
mr.ptr_wordPosInfoBuf = wpib->getBufStart();
// why do we need this? doesn't termInfoBuf have all that? no,
// because we limit insertableterms to like the top 300 highest
// scoring, so they are separate. the termInfoBuf is sorted by
// termid (lower 32-bits) and has a termfreq and is used to
// get the matching queries in seo.cpp:handlerequest95()
mr.ptr_insertableTerms = m_insertableTermsBuf.getBufStart();
mr.size_posdbTermList = termListBuf->length();
mr.size_termInfoBuf = ntiBuf->length();//m_numTwids * 4;
mr.size_coll = gbstrlen(cr->m_coll)+1;
//mr.size_wordFreqInfoBuf = wfib->length();
mr.size_wordPosInfoBuf = wpib->length();
mr.size_insertableTerms = m_insertableTermsBuf.length();
int32_t requestSize;
char *req = serializeMsg ( sizeof(Msg95Request),
&mr.size_posdbTermList ,// firstSizeParm
&mr.size_insertableTerms,//lastSizeP
&mr.ptr_posdbTermList ,// firststrptr
&mr ,// thisPtr
&requestSize ,
NULL ,
0 ,
true );
if ( ! req ) return NULL;
int32_t numHosts = g_hostdb.m_numHosts;
// do not re-send if we already did this!
if ( m_numMsg95Requests > 0 ) numHosts = 0;
// send one msg95 request to each host. skip if dead.
for ( int32_t i = 0; i < numHosts ; i++ ) {
// get ptr to the host
Host *host = g_hostdb.getHost(i);
// get hostid of host #i
int32_t hostId = host->m_hostId;
// count it
m_numMsg95Requests++;
// skip if dead. i guess no queries from that guy. we can't
// send to a twin because the twin does not have the same
// queries in its in-memory query log. once we get more
// machines we should probably make the twin have the same
// copy so we can be redundant.
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive ) {
log("seo: warning. host %"INT32" is dead so we could "
"not do the keyword tool right",hostId);
m_numMsg95Replies++;
continue;
}
// . send our posdb termlist to each host so it can
// call msg39 restricted to our docid very quickly
// . also send a ALL of the insertable terms to each
// host so they can evaluate the insertion for all of the
// relevant queries.
// . each host should be smart enough to realize that some
// queries need not be performed for an insertion because
// it is impossible to break the minimum score to be in the
// top 50 for that query. but we'll only have a minimum
// score for each query once we run a batch to eval
// each query at least partially to get a rough idea of
// the score needed to be in the top 50.
// . reply should be an array of QueryChanges for each
// insertable term for every query that matches this doc
// in the g_qlog buffer.
// . in most cases these arrays will be empty because we are
// not in the top 50 for that query
if ( ! g_udpServer.sendRequest ( req ,
requestSize ,
0x95 , // msgtype
host->m_ip , // ip
host->m_port , // port
hostId,
NULL, // retslot
this,
gotMsg95ReplyWrapper,
10000 , // timeout
-1 , // backoff
-1 , // maxwait
NULL, // replybuf
0, // replybufmaxsize
m_niceness // niceness
)) {
// let admin know about error
log("seopipe: sendRequest 95 had error: %s",
mstrerror(g_errno));
// count it as replied then
m_numMsg95Replies++;
continue;
}
}
// wait for all msg95 replies to come in
if ( m_numMsg95Requests > m_numMsg95Replies )
return (SafeBuf *)-1;
// somehow we finished without blocking
goto top;
// dummy return
return NULL;
}
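// Illustrative note (not compiled): getScoredInsertableTerms() follows the
// same blocking convention as the other getters in this file. It returns
// (SafeBuf *)-1 while msg95 replies are outstanding (the reply wrapper
// re-enters through m_masterLoop), NULL with g_errno set on error, and
// the scored InsertableTerm buffer once every host has replied. A minimal
// caller sketch:
/*
SafeBuf *itBuf = getScoredInsertableTerms();
if ( ! itBuf )                return NULL;           // error, g_errno set
if ( itBuf == (SafeBuf *)-1 ) return (SafeBuf *)-1;  // blocked, will re-enter
// itBuf now holds the scored InsertableTerms
*/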
// now sort the huge ptr buffer to QueryChanges first by:
// 1: QueryChange::m_termHash64
// 2: QueryChange::m_queryHash32
// 3: QueryChange::m_insertPos
int queryChangeCmp ( const void *a, const void *b ) {
QueryChange *qa = *(QueryChange **)a;
QueryChange *qb = *(QueryChange **)b;
// smallest term hash should be at the head of the list
if ( qa->m_termHash64 < qb->m_termHash64 ) return -1;
if ( qa->m_termHash64 > qb->m_termHash64 ) return 1;
if ( qa->m_queryHash32 < qb->m_queryHash32 ) return -1;
if ( qa->m_queryHash32 > qb->m_queryHash32 ) return 1;
if ( qa->m_insertPos < qb->m_insertPos ) return -1;
if ( qa->m_insertPos > qb->m_insertPos ) return 1;
return 0;
}
// . make each InsertableTerm point to a linked list of QueryChanges for it.
// . each QueryChange is a word position and a rank change
// . the linked list will be sorted by QueryChange::m_insertPos
// . there can be multiple QueryChanges for a single m_insertPos, but
// they will be for different queries.
bool XmlDoc::processMsg95Replies() {
int32_t need = 0;
// each reply is a list of QueryChanges
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get reply
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
// skip if empty, error?
if ( ! mr ) continue;
// deserialize the msg95replies first
deserializeMsg ( sizeof(Msg95Reply) ,
(int32_t *)&mr->size_queryChangeBuf,//1stszparm
(int32_t *)&mr->size_queryLogBuf,//lastszparm
(char **)&mr->ptr_queryChangeBuf,//1ststrptr
mr->m_buf );
// scan the QueryChanges
//QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
need += ncs * sizeof(QueryChange *);
}
// alloc now
SafeBuf hugePtrBuf;
if ( ! hugePtrBuf.reserve ( need ,"hpbuf" ) ) return false;
// how big are all query log bufs?
int32_t sumTotal = 0;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get reply
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
// skip if empty, error?
if ( ! mr ) continue;
// how big
sumTotal += mr->size_queryLogBuf;
}
m_queryLogBuf.reset();
if ( ! m_queryLogBuf.reserve ( sumTotal ,"qlogbuf") ) return false;
char *orig = m_queryLogBuf.getBufStart();
int32_t ongoingOffset = 0;
int32_t ongoingDebugOffset = 0;
int32_t ongoingOrigOffset = 0;
// . fill up hugePtrBuf for sorting below
// . also fill up m_queryLogBuf now for store*IntoCachedb()
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get reply
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
// skip if empty, error?
if ( ! mr ) continue;
// ref it
//char *ref = m_queryLogBuf.getBuf();
//int32_t ref = m_queryLogBuf.length();
// add to our big buffer
m_queryLogBuf.safeMemcpy ( mr->ptr_queryLogBuf ,
mr->size_queryLogBuf );
// debug scores. should be length 0 if not debugging.
m_debugScoreInfoBuf.safeMemcpy ( mr->ptr_debugScoreInfoBuf ,
mr->size_debugScoreInfoBuf );
// original scores buf
m_origScoreInfoBuf.safeMemcpy ( mr->ptr_origScoreInfoBuf ,
mr->size_origScoreInfoBuf );
// scan the QueryChanges
QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
for ( int32_t j = 0 ; j < ncs ; j++ ) {
QueryChange *qc = &qcs[j] ;
// this is relative to ptr_queryLogBuf
qc->m_replyQueryOffset += ongoingOffset;
// if we have debug score info
if ( m_seoDebug >= 2 ) {
if ( qc->m_debugScoreInfoOffset < 0 ) {
char *xx=NULL;*xx=0; }
if ( qc->m_origScoreInfoOffset < 0 ) {
char *xx=NULL;*xx=0; }
qc->m_debugScoreInfoOffset +=
ongoingDebugOffset;
qc->m_origScoreInfoOffset +=
ongoingOrigOffset;
}
// that's relative to the msg95reply's ptr_queryLogBuf
//QueryLogEntry *qe;
//qe = (QueryLogEntry *)(mr->ptr_queryLogBuf + qoff);
//qe = (QueryLogEntry *)(ref + qoff);
// HACK that in. RELATIVE to m_queryLogBuf!!!
//qc->m_queryOffset3 = ref;//(int32_t)qe;
// add ptr to our global buffer
hugePtrBuf.pushPtr ( qc );
}
// sum it up
ongoingOffset += mr->size_queryLogBuf;
ongoingDebugOffset += mr->size_debugScoreInfoBuf;
ongoingOrigOffset += mr->size_origScoreInfoBuf;
}
// sanity. make sure doesn't grow since we reference it
if ( m_queryLogBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// now sort the huge ptr buffer to QueryChanges first by:
// 1: QueryChange::m_termHash64
// 2: QueryChange::m_queryHash32
// 3: QueryChange::m_insertPos
char *hhh = hugePtrBuf.getBufStart();
int32_t size = hugePtrBuf.length();
// this should breathe with niceness!!
gbqsort ( hhh ,
size/sizeof(QueryChange *) ,
sizeof(QueryChange *),
queryChangeCmp ,
m_niceness ) ;
// now store those sorted query changes into m_queryChangeBuf
// so we can cache them in store*IntoCached() easily
int32_t nqc = (need / sizeof(QueryChange *)) ;
if ( ! m_queryChangeBuf.reserve ( nqc * sizeof(QueryChange),"qcbuf") )
return false;
// for sanity check
char *orig2 = m_queryChangeBuf.getBufStart();
// copy over sorted into m_queryChangeBuf so we can cache it in cachedb
char *p = hhh;
char *pend = hhh + size;
for ( ; p < pend ; p += sizeof(QueryChange *) ) {
// cast it
QueryChange *qc = *(QueryChange **)p;
// save ptr to it
char *ref = m_queryChangeBuf.getBuf();
// save it
m_queryChangeBuf.safeMemcpy ( qc , sizeof(QueryChange) );
// now ref that instead
*(QueryChange **)p = (QueryChange *)ref;
}
// sanity test
if ( m_queryChangeBuf.getBufStart() != orig2 ) { char *xx=NULL;*xx=0;}
// now we can free the replies since we stored the replies into
// m_queryLogBuf and m_queryChangeBuf for store*IntoCachedb()
for ( int32_t i = 0;i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg95ReplyPtrs[i] ) continue;
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
m_msg95ReplyPtrs[i] = NULL;
}
// . now set QueryChange::m_next to make our linked list
// . if it is for a different query or termhash then end the linked
// list by setting m_next to NULL
QueryChange *lastqc = NULL;
for ( p = hhh ; p < pend ; p += sizeof(QueryChange *) ) {
// cast it
QueryChange *qc = *(QueryChange **)p;
// assume we are the last one in the linked list
qc->m_next = NULL;
// make linked list
if ( lastqc &&
// terms must match to be in same linked list
lastqc->m_termHash64 == qc->m_termHash64 )
// link them
lastqc->m_next = qc;
// set this for next qc
lastqc = qc;
}
// now set InsertableTerm::m_firstQueryChange to point to the head
// of the linked list for that term based on its m_termHash64.
// but the insertable terms are sorted by m_trafficSum.
// map a termHash64 to its corresponding first QueryChange.
HashTableX tit;
if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0; }
int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
if ( ! tit.set ( 8,4, ni*4,NULL,0,false,m_niceness,"tittbl") )
return false;
int64_t lastHash64 = 0LL;
// . store ptr to first querychange for each termhash64 into hash table
// . should be the head of the linked list for a termid
for ( p = hhh ; p < pend ; p += sizeof(QueryChange *) ) {
// cast it
QueryChange *qc = *(QueryChange **)p;
// skip if not a new term hash
if ( qc->m_termHash64 == lastHash64 ) continue;
// update it
lastHash64 = qc->m_termHash64;
// . map it in the hash table then
// . it should be pre-allocated!
if (!tit.addKey(&qc->m_termHash64,&qc)){char *xx=NULL;*xx=0;}
}
// now scan the insertable terms and set their
// InsertableTerm::m_firstQueryChange ptr. points to the head
// of the QueryChange linked list for this insertable term
SafeBuf *itBuf = getInsertableTerms();
p = itBuf->getBufStart();
pend = itBuf->getBuf();
for ( ; p < pend ; ) {
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// assume none
it->m_firstQueryChange = NULL;
char *val = (char *)tit.getValue(&it->m_termHash64);
// i guess there is none
if ( ! val ) continue;
// cast it
QueryChange *qc = *(QueryChange **)val;
// and assign
it->m_firstQueryChange = qc;
}
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
// now set InsertableTerm::m_bestTrafficGain/m_bestInsertPos/
// m_bestQueryChange by scanning the linked list and scoring each
// QueryChange::m_insertPos to see which is the highest traffic gain.
// and in the case of ties prefer the lowest word position.
p = itBuf->getBufStart();
pend = itBuf->getBuf();
for ( ; p < pend ; ) {
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// . use this function now so seo.cpp can call it too!
// . sets WordPosInfo::m_trafficGain members
setWordPosInfosTrafficGain ( it );
// now find the insert position with the most traffic gain!
int32_t bestTrafficGain = -1;
int32_t bestInsertPos = -1;
for ( int32_t j = 0 ; j < nwpis ; j++ ) {
// skip if not the best scoring position
if ( wpis[j].m_trafficGain <= bestTrafficGain &&
// and if not first time!
bestInsertPos != -1 )
continue;
// we got a new winner
bestTrafficGain = wpis[j].m_trafficGain;
bestInsertPos = wpis[j].m_wordPos;//insertPos;
}
// set it
it->m_bestTrafficGain = bestTrafficGain;
it->m_bestInsertPos = bestInsertPos;
}
return true;
}
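// Illustrative note (not compiled): after processMsg95Replies() each
// InsertableTerm::m_firstQueryChange points at the head of a linked list
// of QueryChanges for that term, ordered by query hash and then by insert
// position (per queryChangeCmp above). A minimal walk sketch:
/*
for ( QueryChange *qc = it->m_firstQueryChange ; qc ; qc = qc->m_next ) {
// qc->m_queryHash32            : which matching query this change is for
// qc->m_insertPos              : word position the term was inserted at
// qc->m_oldRank, qc->m_newRank : rank before/after the insertion
}
*/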
void XmlDoc::setWordPosInfosTrafficGain ( InsertableTerm *it ) {
// get the wordposinfobuf!
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
// . use the wordposinfo array to accumulate traffic gains
// for each word position, WordPosInfo::m_insertPos.
// . TODO: ignore tags like gblangid:
// . so reset the traffic gains first
for ( int32_t j = 0 ; j < nwpis ; j++ )
wpis[j].m_trafficGain = 0;
if ( ! it ) return;
// head of the linked list of QueryChanges for this InsertableTerm
QueryChange *qc = it->m_firstQueryChange;
// skip if no list. leave traffic gains set to 0 for all
if ( ! qc ) return;
// accumulate traffic gains
int32_t k = 0;
int32_t lastQueryHash32 = 0;
//bool firstQueryChangeForQuery;
QueryChange *lastqc = NULL;
// . scan the linked list of query changes
// . this is sorted by query first then m_insertPos
for ( ; qc ; qc = qc->m_next ) {
// assume NOT the first QueryChange for this query
//firstQueryChangeForQuery = false;
// . reset stuff for each different query
// . QueryChanges are sorted by m_queryHash32 secondly
// and by m_insertPos thirdly now...
if ( qc->m_queryHash32 != lastQueryHash32 ) {
// reset our WordPosInfo cursor
k = 0;
// for detecting the next set of QueryChanges
// for a different query
lastQueryHash32 = qc->m_queryHash32;
//firstQueryChangeForQuery = true;
lastqc = NULL;
}
// sanity
if ( lastqc && lastqc->m_insertPos > qc->m_insertPos ) {
char *xx=NULL;*xx=0; }
// compute the traffic in advance from the rank changes
int32_t trafficGain = getTrafficGain( qc );
// checkpoint
/*
if ( trafficGain > 0 )
log("got some traffic gain qh=%"UINT32" "
"pos=%"INT32" term=%s gain=%"INT32"",
qc->m_queryHash32,
qc->m_insertPos,
it->m_termStr,
trafficGain);
*/
// get next query change
QueryChange *nqc = qc->m_next;
// make it NULL if for a different query
if ( nqc && nqc->m_queryHash32 != qc->m_queryHash32 )
nqc = NULL;
// . we use a compression where we only store a
// QueryChange if different than the last QueryChange
// . so advance the WordPosInfos cursor "k" until
// we catch up to the qc->m_insertPos.
for ( ; k < nwpis ; k++ ) {
// stop if we are caught up
if ( wpis[k].m_wordPos >= qc->m_insertPos )
break;
}
// now this position and up to next qc "nqc" gets the traffic
for ( ; k < nwpis ; k++ ) {
// stop if we are caught up
if ( nqc && wpis[k].m_wordPos >= nqc->m_insertPos )
break;
wpis[k].m_trafficGain += trafficGain;
}
}
/*
// print out positives - debug
for ( int32_t k = 0 ; k < nwpis ; k++ ) {
// stop if we are caught up
if ( ! wpis[k].m_trafficGain ) continue;
if ( wpis[k].m_trafficGain <= 0 ) continue;
// note it
log("seo: gain pos=%"INT32" gain=%"INT32"",
wpis[k].m_wordPos,
wpis[k].m_trafficGain);
}
*/
}
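// Illustrative note: the QueryChanges consumed above are run-length
// compressed per query. A change recorded at insert position P applies to
// every candidate WordPosInfo position from P up to (but not including)
// the position of the next recorded change for the same query; with no
// next change it applies to all remaining positions. Example, assuming
// candidate positions 10,12,14,16 and two stored changes for one query,
// one at position 10 (gain +30) and one at position 14 (gain +5):
//   positions 10 and 12 each accumulate +30
//   positions 14 and 16 each accumulate +5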
double getTrafficPercent ( int32_t rank ) {
// from aol's query logs from that same searchenginewatch.com url
static double s_posClicks[1000] = {
.4230, // #1
.1192,
.0844,
.0603,
.0486,
.0399,
.0337,
.0298,
.0283,
.0270 // #10 (was .0297 but for our purposes, make it <)
};
//static float s_pageClicks[5];
// set total of clicks each page gets
static bool s_init = false;
if ( ! s_init ) {
s_init = true;
//float sum = 0.0;
//for ( int32_t i = 0 ; i < 10 ; i++ )
// sum += s_posClicks[i];
// this is about .11 or so
//float pageFactor = 1.0 - sum;
// HACK! make it pass the sanity check below!
//pageFactor *= .50;
// sanity. do not allow top result on 2nd page
// to rank higher!!
//if ( pageFactor * s_posClicks[0] > s_posClicks[9] ) {
// char *xx=NULL;*xx=0; }
// will be like .11 for second page, .01 for 3rd, etc.
//float pageMult = 1.0;
// fill in the rest
for ( int32_t i = 10 ; i < 1000 ; i++ ) {
// just make it linear since there is too much
// chaos as to our diffs with google. so this is
// a good estimation way...
s_posClicks[i] = .0270 - .0007 * i;
if ( s_posClicks[i] < 0 )
s_posClicks[i] = 0.0;
}
// sanity to make sure all in order
for ( int32_t i = 1 ; i < 1000 ; i++ ) {
if ( s_posClicks[i-1] < s_posClicks[i] ) {
char *xx=NULL;*xx=0; }
if ( s_posClicks[i] < 0 ) {
char *xx=NULL;*xx=0; }
}
}
if ( rank >= 1000 ) rank = 999;
if ( rank < 0 ) { char *xx=NULL;*xx=0; }
return s_posClicks[rank];
}
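// Illustrative note: example values from the table and linear fill-in
// above (rank is 0-based, so rank 0 is the #1 result):
//   getTrafficPercent(0)  = .4230
//   getTrafficPercent(9)  = .0270
//   getTrafficPercent(19) = .0270 - .0007*19 = .0137
// and the linear tail hits zero around rank index 39, so results at #40
// and beyond contribute no estimated traffic.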
// . based on difference between m_oldRank and m_newRank
// . m_*Rank starts at 0 and goes to 9 for first page of results
int32_t XmlDoc::getTrafficGain ( QueryChange *qc ) {
// no rank change? this can both be -1 if it is a missing
// term i guess... and we're not inserting it.
if ( qc->m_oldRank == qc->m_newRank ) return 0;
// get old clicks
int32_t oldRank = qc->m_oldRank;
double oldp;
// if not ranked before because this was inserting a brand new
// missing term, this will be -1
if ( oldRank == -1 ) oldp = 0.0;
else oldp = getTrafficPercent ( oldRank );
//if ( oldRank < 50 ) oldp = s_posClicks[oldRank];
// get new clicks
int32_t newRank = qc->m_newRank;
float newp = getTrafficPercent ( newRank );
//if ( newRank < 50 ) newp = s_posClicks[newRank];
// HACK
// we stored the entire querylogreply buf in here
char *ref = m_queryLogBuf.getBufStart();
// so we can use the replyqueryoffset then...
QueryLogEntry *qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
int32_t traffic = qe->m_gigablastTraffic;
traffic *= GB_TRAFFIC_MODIFIER;
int32_t trafficChange = (int32_t)((newp - oldp) * traffic);
// sanity.
if ( qc->m_oldRank > qc->m_newRank && trafficChange < 0 ) {
char *xx=NULL;*xx=0; }
// ignore this sanity check if not ranked before. i.e. inserting
// a new missing term...
if ( qc->m_oldRank != -1 &&
qc->m_oldRank < qc->m_newRank && trafficChange > 0 ) {
char *xx=NULL;*xx=0; }
// return the change. it might be negative!
return trafficChange;
}
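// Illustrative note: the gain above works out to
//   ( getTrafficPercent(newRank) - getTrafficPercent(oldRank) )
//       * m_gigablastTraffic * GB_TRAFFIC_MODIFIER
// with oldRank == -1 (inserting a brand new missing term) treated as 0%
// traffic. Hypothetical numbers: moving from rank index 4 (.0486) to rank
// index 1 (.1192) on a query whose adjusted traffic is 1000 searches gives
// about (0.1192 - 0.0486) * 1000 = 70 extra visits; moving down instead
// yields a negative change.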
// 4. then we just dump out all the InsertedTerms into xml so they can be
// displayed on the front end.
// dump the list of InsertedTerms into "sbuf" as xml
bool XmlDoc::printScoredInsertableTerms ( SafeBuf *sbuf ) {
// print the header
sbuf->safePrintf("\t<insertableTerms>\n");
// scan each term
SafeBuf *itBuf = getInsertableTerms();
// has to be there
if ( ! itBuf || itBuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
// cast it
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
// how many terms do we have?
//int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
// dedup queries used in query changes
HashTableX qdups;
if ( ! qdups.set(4,0,32,NULL,0,false,m_niceness,"qddd") ) return false;
//
// . print query map
// . print all query ids we use and their strings
//
bool firstTime = true;
char *p = itBuf->getBufStart();
char *pend = itBuf->getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// scan its query changes
QueryChange *qc = it->m_firstQueryChange;
for ( ; qc ; qc = qc->m_next ) {
// skip if already printed
if ( qdups.isInTable(&qc->m_queryHash32) ) continue;
if ( firstTime ) {
sbuf->safePrintf("\t\t<queryMap>\n");
sbuf->safePrintf("\t\t\t<desc>"
"<![CDATA["
"32bitSignedQueryHash,"
"queryString"
"]]></desc>\n"
);
}
firstTime = false;
// HACK
char *ref = m_queryLogBuf.getBufStart();
QueryLogEntry *qe;
qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
// new query, print it. map the hash to the string
// so we can just show the hash when printing
// out all the QueryChanges below to save space
sbuf->safePrintf("\t\t\t<queryPoint>"
"<![CDATA[%"INT32",%s]]>"
"</queryPoint>\n"
, qc->m_queryHash32
// hack...
, qe->getQueryStr()
);
// do not re-print
if ( ! qdups.addKey(&qc->m_queryHash32) )return false;
}
}
if ( ! firstTime )
sbuf->safePrintf("\t\t</queryMap>\n");
// . now the word position map
// . we only provided querychange if it has a different score than
// the previously stored querychange. this is a kind of compression
// . so you need to know all the possible word positions we tried
// for each insertion we did
sbuf->safePrintf("\t\t<wordInsertionMap>\n");
sbuf->safePrintf("\t\t\t<desc>"
"<![CDATA["
"Describes all positions we attempt to insert each "
"insertable term into. The terms at that position "
"and up are pushed forward by the insertion. "
"&lt;sent&gt; is the sentence number."
"]]></desc>\n"
);
for ( int32_t i = 0 ; i < nwpis ; i++ ) {
WordPosInfo *wpi = &wpis[i];
sbuf->safePrintf("\t\t\t<word>\n"
"\t\t\t\t<pos>%"INT32"</pos>\n"
"\t\t\t\t<sent>%"INT32"</sent>\n"
"\t\t\t\t<hashGroup>%s</hashGroup>\n"
"\t\t\t\t<densityRank>%"INT32"</densityRank>\n"
"\t\t\t\t<spamRank>%"INT32"</spamRank>\n"
"\t\t\t</word>\n"
,wpi->m_wordPos
,wpi->m_sentNum
,getHashGroupString(wpi->m_hashGroup)
,(int32_t)wpi->m_densityRank
,(int32_t)wpi->m_wordSpamRank
);
}
sbuf->safePrintf("\t\t</wordInsertionMap>\n");
// scan all the insertable terms
p = itBuf->getBufStart();
pend = itBuf->getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// print the term
sbuf->safePrintf("\t\t<term>\n");
// the string
sbuf->safePrintf("\t\t\t<string><![CDATA[%s]]></string>\n",
it->getTerm());
// sum of traffic of all queries containing this term
sbuf->safePrintf("\t\t\t<importance>%"INT32"</importance>\n",
it->m_trafficSum);
// is it contained in the doc/linktext or is it "related"
sbuf->safePrintf("\t\t\t<isRelatedTerm>%"INT32"</isRelatedTerm>\n",
(int32_t)it->m_isRelatedTerm);
// get the first query change if any
QueryChange *qc = it->m_firstQueryChange;
// limit to fix firefox crash
//int32_t queryChangeLimit = 30;
// skip if no list
if ( ! qc ) goto skip;
// print the insert position that gives us the most traffic
sbuf->safePrintf("\t\t\t<bestInsertPosition>%"INT32""
"</bestInsertPosition>\n",
it->m_bestInsertPos);
sbuf->safePrintf("\t\t\t<bestTrafficGain>%"INT32""
"</bestTrafficGain>\n",
it->m_bestTrafficGain);
// print query changes
if ( it->m_firstQueryChange )
sbuf->safePrintf("\t\t\t<queryChanges><![CDATA["
);
// print out query changes for this term
for ( qc = it->m_firstQueryChange ; qc ; qc = qc->m_next ) {
// fix firefox crash for now
//if ( --queryChangeLimit <= 0 ) break;
// now store in binary
sbuf->pushLong(qc->m_insertPos);
sbuf->pushLong(qc->m_queryHash32);
sbuf->pushChar(qc->m_oldRank);
sbuf->pushChar(qc->m_newRank);
/*
// . TODO: make sure to remove QueryChanges that have
// the same old and new rank
// . print it
sbuf->safePrintf("\t\t\t<queryChange>\n");
sbuf->safePrintf("\t\t\t\t<insertPos>%"INT32""
"</insertPos>\n", qc->m_insertPos);
sbuf->safePrintf("\t\t\t\t<oldRank>%"INT32""
"</oldRank>\n",(int32_t)qc->m_oldRank);
sbuf->safePrintf("\t\t\t\t<newRank>%"INT32""
"</newRank>\n",(int32_t)qc->m_newRank);
sbuf->safePrintf("\t\t\t\t<queryId>%"INT32""
"</queryId>\n",
qc->m_queryHash32 );
sbuf->safePrintf("\t\t\t</queryChange>\n");
*/
}
if ( it->m_firstQueryChange )
sbuf->safePrintf("]]></queryChanges>\n");
skip:
// print the term end
sbuf->safePrintf("\t\t</term>\n");
}
sbuf->safePrintf("\t</insertableTerms>\n");
return true;
}
/*
static int wordPosInfoCmp ( const void *a, const void *b ) {
WordPosInfo *wa = (WordPosInfo *)a;
WordPosInfo *wb = (WordPosInfo *)b;
// smallest word position should be at the head of the list
if ( wa->m_wordPos < wb->m_wordPos ) return -1;
if ( wa->m_wordPos > wb->m_wordPos ) return 1;
return 0;
}
*/
static int wpPosdbKeyCmp ( const void *a, const void *b ) {
int32_t wpa = g_posdb.getWordPos((char *)a);
int32_t wpb = g_posdb.getWordPos((char *)b);
return wpa - wpb;
}
SafeBuf *XmlDoc::getWordPosSortedPosdbListBuf ( ) {
if ( m_wpSortedPosdbListBufValid )
return &m_wpSortedPosdbListBuf;
// get the lists. forDelete = false.
char *metaList = getMetaList ( false );
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
// sanity
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// make a tmp buf to hold posdb keys
//SafeBuf tmp;
if ( ! m_wpSortedPosdbListBuf.reserve ( m_metaListSize,"wpsbuf" ) )
return NULL;
// point into it
char *dst = m_wpSortedPosdbListBuf.getBufStart();
// scan the meta list for posdb keys
char *p = metaList;
char *pend = p + m_metaListSize;
// stole this loop from getMetaList()
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save it with the flag
char byte = *p;
// get rdbId
char rdbId = byte & 0x7f;
// skip that
p++;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// get key
char *key = p;
// skip that
p += ks;
// get datasize
int32_t ds = getDataSizeFromRdbId(rdbId);
// assume we do not store the datasize
//bool neg = false;
// . if key is negative, no data is present
// . the doledb key is negative for us here
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
// if datasize variable, read it in
if ( ds == -1 ) {
// get data size
ds = *(int32_t *)p;
// skip data size int32_t
p += 4;
}
// point to data
//char *data = p;
// skip data if not zero
p += ds;
// if not posdb skip rec
if ( rdbId != RDB_POSDB ) continue;
// skip negative keys
if ( (key[0] & 0x01) == 0x00 ) continue;
// add to new buf now
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
// advance
dst += sizeof(POSDBKEY);
}
char *start = m_wpSortedPosdbListBuf.getBufStart();
// update tmp
m_wpSortedPosdbListBuf.incrementLength ( dst - start );
// sanity
if ( m_wpSortedPosdbListBuf.length() > m_metaListSize ) {
char *xx=NULL;*xx=0; }
// point
char *pbuf = m_wpSortedPosdbListBuf.getBufStart();
int32_t numKeys = m_wpSortedPosdbListBuf.length()/sizeof(POSDBKEY);
// sort keys by word position
gbqsort ( pbuf ,
numKeys,
sizeof(POSDBKEY),
wpPosdbKeyCmp ,
m_niceness );
m_wpSortedPosdbListBufValid = true;
return &m_wpSortedPosdbListBuf;
}
// now pass this into Msg95Request so we only try to insert right before
// or after m_wordPos values in this WordPosInfo vector.
SafeBuf *XmlDoc::getWordPosInfoBuf ( ) {
// if it is valid and we have not yet added to cachedb...
if ( m_wordPosInfoBufValid && ! m_triedToAddWordPosInfoToCachedb ) {
// only do this once
m_triedToAddWordPosInfoToCachedb = true;
// store the m_wordPosInfoBuf into cachedb
if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
return (SafeBuf *)-1;
}
if ( m_wordPosInfoBufValid )
return &m_wordPosInfoBuf;
// it should be valid now from our logic in hashWords3() if
// m_doingSEO is set to true
char *xx=NULL; *xx=0;
// these are FULL 18-byte keys, no compression, sorted by word pos
SafeBuf *posdbBuf = getWordPosSortedPosdbListBuf ();
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
// scan posdb keys
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
// . reserve mem for new buf
int32_t need = numKeys * sizeof(WordPosInfo);
if ( ! m_wordPosInfoBuf.reserve ( need ,"wpibuf" ) )
return NULL;
int32_t sentNum = 0;
int32_t lastWordPos = -1;
//int32_t lastwp = -1;
int32_t lastSentNum = -1;
// scan all the sorted posdb keys and build posdb termlists and
// store the termlists into "m_termListBuf"
char *p = posdbBuf->getBufStart();
char *pend = p + posdbBuf->length();
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// get the key
char *key = p;
// sanity
if ( g_posdb.getKeySize(p) != 18 ) { char *xx=NULL;*xx=0; }
// skip del keys
if ( (p[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
// skip it
p += sizeof(POSDBKEY);
// get key termid
//int64_t termId = g_posdb.getTermId ( key );
// sanity
//int64_t docId = g_posdb.getDocId ( key );
//if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
// log it for debug
//if ( docId == 192304365235LL )
// log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
// docId,
// termId,
// g_posdb.getWordPos(key));
WordPosInfo wpi;
int32_t wp = g_posdb.getWordPos(key);
// set "m_sentNum"
if ( wp >= lastWordPos + 50 ) sentNum++;
wpi.m_wordPos = wp;
wpi.m_sentNum = sentNum;
wpi.m_hashGroup = g_posdb.getHashGroup (key);
wpi.m_densityRank = g_posdb.getDensityRank (key);
wpi.m_wordSpamRank = g_posdb.getWordSpamRank (key);
wpi.m_trafficGain = 0;
// log it
/*
log("seopipe: term=%"INT64" pos=%"INT32" sent=%"INT32" hg=%s dr=%"INT32"",
g_posdb.getTermId(key),
(int32_t)wp,
sentNum,
getHashGroupString(wpi.m_hashGroup),
(int32_t)wpi.m_densityRank);
*/
// bigrams share the same word position as the single term.
// so ignore them. we only want unique insertion positions.
if ( wp == lastWordPos ) continue;
// . i thought sorted by word position??
// . word position 0 is used by generic terms, like tags
if ( wp < lastWordPos ) { char *xx=NULL;*xx=0; }
// additional position at the end of a sentence?
//if ( lastwp != wp && lastSentNum == sentNum )
// // store it
// m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
// to the right as well! so it can be in the same sentence, if this
// word is at the end of the sentence.
//wpi.m_wordPos = wp;// + 2;
// add it
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
int32_t nextSent = -1;
if ( p < pend ) {
// assume same as current sentence
nextSent = sentNum;
// get word position of next term
int32_t nextwp = g_posdb.getWordPos(p);
// same as us? then it is a bigram, so try the
// word after that!
if ( nextwp == wp && p+18<pend )
nextwp = g_posdb.getWordPos(p+18);
// if the following word position is in a new sentence
// it will be separated by at least SENT_UNITS positions! that is
// our base for sentence skip.
if ( nextwp >= wp + SENT_UNITS )
nextSent = sentNum+1;
}
// HACK. if next word starts a new sentence, add a WordPosInfo
// here so we can insert term at end of THIS sentence.
// otherwise we are inserted BEFORE the term whose position
// we use.
if ( nextSent != sentNum ) {
wpi.m_wordPos += 2;
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
}
// set these
lastWordPos = wp;
//lastwp = wp;// + 2;
lastSentNum = sentNum;
}
/*
// point to raw buf
char *raw = m_wordPosInfoBuf.getBufStart();
int32_t size = m_wordPosInfoBuf.length();
// this shit is sorted by termid then pos, so sort just by pos
// this should breathe with niceness!!
gbqsort ( raw ,
size / sizeof(WordPosInfo),
sizeof(WordPosInfo) ,
wordPosInfoCmp ,
m_niceness ) ;
*/
m_wordPosInfoBufValid = true;
return &m_wordPosInfoBuf;
}
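// Illustrative note (not compiled): the loop above bumps the sentence
// counter when word positions jump by 50 or more, uses SENT_UNITS for the
// end-of-sentence lookahead, skips bigram keys that repeat the previous
// word position, and appends an extra WordPosInfo at wordPos+2 when the
// next key starts a new sentence so a term can also be inserted at the
// END of the current sentence. Each resulting entry looks like:
/*
WordPosInfo wpi;
wpi.m_wordPos      = g_posdb.getWordPos      ( key );
wpi.m_sentNum      = sentNum;
wpi.m_hashGroup    = g_posdb.getHashGroup    ( key );
wpi.m_densityRank  = g_posdb.getDensityRank  ( key );
wpi.m_wordSpamRank = g_posdb.getWordSpamRank ( key );
wpi.m_trafficGain  = 0;  // filled in later by setWordPosInfosTrafficGain()
*/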
// . i made this easy to serialize by using offsets and not ptrs
// . so we can add to cachedb easily
// . and so it's immune to reallocs() on m_linkSourceBuf SafeBuf
class LinkSource {
public:
int32_t m_linkSiteRank;
// the actual url of the link, references into m_buf
int32_t m_linkUrlOffset;
// the title of the link, references into m_buf
int32_t m_linkTitleOffset;
// . we store the offsets of the RelatedDocIds in m_relatedDocIdBuf
// . these are the related docids that are linked to by this link src
int32_t m_offsetOfRelatedDocIdOffsets;
int32_t m_numRelatedDocIds;
char m_buf[0];
char *getLinkUrl ( SafeBuf *linkSourceBuf ) {
char *buf = linkSourceBuf->getBufStart();
buf += m_linkUrlOffset;
return buf;
};
char *getLinkTitle ( SafeBuf *linkSourceBuf ) {
char *buf = linkSourceBuf->getBufStart();
buf += m_linkTitleOffset;
return buf;
};
// crap, do we store RelatedDocIds into cachedb? we should
// make it use offsets and not ptrs too...
int32_t *getRelatedDocIdOffsets ( SafeBuf *linkSourceBuf ) {
// how can this be?
//if ( m_numRelatedDocIds == 0 ) return NULL;
char *buf = linkSourceBuf->getBufStart();
buf += m_offsetOfRelatedDocIdOffsets;
return (int32_t *)buf;
};
};
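// Illustrative note (not compiled): LinkSource stores offsets rather than
// pointers so a record stays valid across SafeBuf reallocs and can be
// copied into cachedb verbatim. A minimal read sketch, assuming the
// records live in an m_linkSourceBuf SafeBuf and the related docids live
// in m_relatedDocIdBuf as the comments above say:
/*
LinkSource *ls = (LinkSource *)recPtr;   // recPtr points into the buf
char *url      = ls->getLinkUrl   ( &m_linkSourceBuf );
char *title    = ls->getLinkTitle ( &m_linkSourceBuf );
int32_t *offs  = ls->getRelatedDocIdOffsets ( &m_linkSourceBuf );
for ( int32_t i = 0 ; i < ls->m_numRelatedDocIds ; i++ ) {
// offs[i] is an offset into m_relatedDocIdBuf
}
*/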
/*
static void gotLinkInfoReplyWrapper ( void *state ) {
//XmlDoc *newxd = (XmlDoc *)state;
Msg25 *msg25 = (Msg25 *)state;
XmlDoc *xd = msg25->m_xd;
// count it as returned
xd->m_numLinkRequestsIn++;
// this will nuke the msg25 as well after copying its linkinfo
xd->processLinkInfoMsg20Reply ( msg25 );
// try to send out more requests or intersect them if done
xd->m_masterLoop ( xd->m_masterState );
}
// . before we were just looking at the LinkInfo the msg25 makes from
// all the Msg20Replies it gets, but let's keep the msg20 replies
// intact because they have the titles we need!
// . return false on error, true otherwise
bool XmlDoc::processLinkInfoMsg20Reply ( Msg25 *msg25 ) {
// shortcut
//LinkInfo *info = msg25->getLinkInfo ();
// store into our buffer
//bool status ;
// i guess info can be NULL on error
//if ( info )
// status = m_linkInfoReplyBuf.safeMemcpy (info, info->getSize());
// give front-end the progress bar info
if ( m_seoSocket && m_progressBar ) {
// tmp buf
char tmp[16];
float percent = (float)m_rdCursor;
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
percent /= (float)numRelated;
// 80% of the pipeline was doing the full queries
percent *= .20;
percent += .80;
percent *= 100.0;
int32_t percentLong = (int32_t)percent;
if ( percentLong >= 100 ) percentLong = 99;
int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong);
if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen);
// try a send on non-blocking socket
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n);
// forget error
errno = 0;
}
// store this
int32_t nr = msg25->m_numReplyPtrs;
// reserve space
if ( ! m_msg20ReplyPtrBuf.reserve ( 8 + nr * 4 * 2 ) ) {
m_hadLinkInfoError = g_errno;
nr = 0;
}
// first store related docid ptr into m_relatedDocIdBuf safebuf
RelatedDocId *rd = (RelatedDocId *)msg25->m_hackrd;
m_msg20ReplyPtrBuf.pushLong((int32_t)rd);
// then store the # of msg20 replies
m_msg20ReplyPtrBuf.pushLong(nr);
// . scan each msg20reply it got, each msg20reply is an inlink
// for this docid
// . seems like they are only freed in Msg25::reset()
for ( int32_t i = 0 ; i < nr ; i++ ) {
// get one
Msg20Reply *r = msg25->m_replyPtrs[i];
int32_t size = msg25->m_replySizes[i];
// steal it, we will free them ourselves below
m_msg20ReplyPtrBuf.pushLong((int32_t)r);
// we need this since we need to free it when done
m_msg20ReplyPtrBuf.pushLong(size);
}
// . do not allow Msg25 to free it, we will free it below
// . on OOM error above we set nr to 0 on error, so allow msg25
// to free the replies in that case
if ( nr ) msg25->m_numReplyPtrs = 0;
// nuke it
mdelete ( msg25 , sizeof(Msg25), "m25li" );
delete ( msg25 );
return true;
}
*/
static int riCmp ( const void *a, const void *b ) {
RecommendedLink *wa = *(RecommendedLink **)a;
RecommendedLink *wb = *(RecommendedLink **)b;
int32_t diff = wb->m_votes - wa->m_votes;
if ( diff ) return diff;
if ( wb->m_totalRecommendedScore > wa->m_totalRecommendedScore )
return 1;
if ( wb->m_totalRecommendedScore < wa->m_totalRecommendedScore )
return -1;
// docid to break all ties
if ( wb->m_rl_docId > wa->m_rl_docId )
return 1;
if ( wb->m_rl_docId < wa->m_rl_docId )
return -1;
return 0;
}
static void gotLinkdbListWrapper ( void *state ) {
Msg0 *msg0 = (Msg0 *)state;
XmlDoc *xd = msg0->m_hackxd;
// free its memory here lest we have a leak
//msg0->reset();
xd->m_numLinkRequestsIn++;
xd->m_masterLoop ( xd->m_masterState );
}
#define MAX_RECOMMENDED_LINKS 300
// . returns safebuf of RecommendedLinks
// . use RecommendedLink::getSize() to skip over element in array/safebuf
// . these are the recommended link sources
// . these are the links that your relateddocids (i.e. competing pages) have
// in common the most
// . TODO: store the returned safebuf in cachedb as well!
SafeBuf *XmlDoc::getRecommendedLinksBuf ( ) {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_recommendedLinksBufValid )
return &m_recommendedLinksBuf;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// what docids share our matching queries?
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
// cast then
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
// how many related docids do we have?
int32_t numRelatedDocIds = rdbuf->length() / sizeof(RelatedDocId);
if ( m_numLinkRequestsOut == 0 ) {
// reset these on first call
m_rdCursor = 0;
m_numLinkRequestsIn = 0;
m_hadLinkInfoError = 0;
m_numMsg20sIn = 0;
m_numMsg20sOut = 0;
m_numValidMsg20s = 0;
m_titleCursor = 0;
m_msg20Phase = 0;
m_recommendedLinkError = 0;
}
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0; }
// if we are looking up the title/url of each docid in
// the m_recommendedLinksBuf now, go back there
if ( m_msg20Phase )
return lookupTitles();
for ( ; m_rdCursor < numRelatedDocIds ; m_rdCursor++ ) {
// wait if too many are out. only allow ~60 outstanding, otherwise
// each one can send out like 500 msg20s
if ( m_numLinkRequestsOut - m_numLinkRequestsIn > 60 )
// wait for 1 to come back
return (SafeBuf *)-1;
// skip the rest on error
if ( m_hadLinkInfoError ) continue;
// cast it
RelatedDocId *rd = &rds[m_rdCursor];
// bogus? a not found, EDOCBANNED/EDOCFILTERED or it
// linked to our domain
if ( rd->rd_url_off < 0 )
continue;
// bogus?
if ( ! rd->getUrl( &m_relatedTitleBuf ) ) {
log("seo: skipping null url");
continue;
}
if ( ! rd->getSite( &m_relatedTitleBuf ) ) {
log("seo: skipping null site");
continue;
}
// allocate msg0 array into m_tmpMsg0Buf safebuf
if ( ! m_tmpMsg0Buf.length() ) {
// fill tmpmsg0 buf
int32_t need = sizeof(Msg0) * numRelatedDocIds;
if ( ! m_tmpMsg0Buf.reserve ( need , "tmp20s" ) )
return NULL;
// do not re-call!
m_tmpMsg0Buf.setLength(need);
char *p = m_tmpMsg0Buf.getBufStart();
char *pend = p + need;
for ( ; p < pend ; p += sizeof(Msg0) ) {
Msg0 *msg0 = (Msg0 *)p;
msg0->constructor();
}
}
// debug it
if ( m_seoDebug >= 2 )
log("seo: getting inlinks to related docid=%"INT64" "
"weight=%f "
"url=%s",
rd->m_docId,
rd->m_relatedWeight,
rd->getUrl(&m_relatedTitleBuf));
// just get his linkdb list!
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
Msg0 *msg0 = &array[m_rdCursor];
key224_t startKey;
key224_t endKey;
char *rdurl = rd->getUrl(&m_relatedTitleBuf);
// by default, just hash of hostname, unless overridden
// with "site" tag in tagdb, or has a path like /~mwells
int32_t siteHash32 = rd->m_rd_siteHash32;
int64_t linkHash64 = hash64n(rdurl);
startKey = g_linkdb.makeStartKey_uk (siteHash32,linkHash64 );
endKey = g_linkdb.makeEndKey_uk (siteHash32,linkHash64 );
// hack that thing
msg0->m_hackxd = this;
// consider it outstanding
m_numLinkRequestsOut++;
// shortcut, piggyback on the msg0
RdbList *list = &msg0->m_handyList;
//RdbList list2;
if ( ! msg0->getList ( -1 , // hostId, -1 if none
0 , // hostId ip
0 , // hostId port
0 , // max cache age -secs
false , // addToCache?
RDB_LINKDB ,
cr->m_collnum ,
list , // linkdb list to fill
(char*)&startKey,
(char*)&endKey ,
1000000 , // 1MB minrecsizes
msg0 ,
gotLinkdbListWrapper ,
m_niceness ,
true , // error correct?
true , // includeTree
true , // do merge
-1,//hostId
0 , // startFileNum
-1 , // numFiles
60*60*24*365 )){//timeout of one year
// blocked? keep chugging
continue;
}
// . maybe it was cached or something, or we had an error!
// . this will nuke the msg25
// . returns false and sets g_errno on error
//processLinkInfoMsg20Reply ( msg25 );
m_numLinkRequestsIn++;
// save g_errno
int32_t saved = g_errno;
// free its memory here lest we have a leak
//msg0->reset();
// error? it will not have blocked then
if ( ! saved ) continue;
// save error, and stop launching any more requests
m_hadLinkInfoError = saved;
log("xmldoc: linksrc error3 = %s",mstrerror(saved));
}
// return -1 if waiting for more requests to come in
if ( m_numLinkRequestsOut > m_numLinkRequestsIn )
return (SafeBuf *)-1;
// vote table to allow inlink voting
HashTableX riTable;
// do not return on error setting this table because we'll leave
// the msg20 replies unfreed!
if ( ! riTable.set ( 8,4,1024,NULL,0,false,m_niceness,"ritbl") )
m_hadLinkInfoError = g_errno;
RecommendedLink *ri;
HashTableX dedupVotesTable;
if ( ! dedupVotesTable.set(8,0,1024,NULL,0,false,m_niceness,"dvtt") )
return NULL;
// need this for computing rdOff
char *rdStart = m_relatedDocIdBuf.getBufStart();
// store recommended links bufs here temporarily
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( 10000000 ,"tt5buf" ) ) return NULL;
// all done. scan linkdb lists and intersect. there is one list
// per related docid.
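// data structures used by this intersection (a sketch):
// - riTable maps linker docid -> offset of its RecommendedLink in tmpBuf,
//   so a repeat linker just gets its vote/score bumped
// - dedupVotesTable keys on (linker docid ^ ipdom(related firstip)) so one
//   page can only vote once per competitor c-block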
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
// get related docid that had the following msg20replies
RelatedDocId *rd = &rds[i];
// his offset in his buf
int32_t rdOff = (char *)rd - rdStart;
// get linkdb list loaded from msg0 call above
Msg0 *msg0 = &((Msg0 *)m_tmpMsg0Buf.getBufStart())[i];
RdbList *list = &msg0->m_handyList;
list->resetListPtr();
// scan the docids in list
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// get the current key if list has more left
key224_t key;
list->getCurrentKey( &key );
//int32_t itop = g_linkdb.getLinkerIp24_uk ( &key );
int32_t ip32 = g_linkdb.getLinkerIp_uk ( &key );
//bool isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
int64_t docId = g_linkdb.getLinkerDocId_uk ( &key );
//int32_t discovered = g_linkdb.getDiscoveryDate_uk(&key);
// skip if no longer there on page, we keep these
// only to graph lost links over time
int32_t lostDate = g_linkdb.getLostDate_uk ( &key );
if ( lostDate )
continue;
// if the inlink is from the same c-block IP as the
// related docid it links to, then do not consider.
// the ip used in linkdb is the current ip not the
// first ip actually.
if ( ipdom(ip32)==ipdom(rd->m_relatedCurrentIp))
continue;
if ( ipdom(ip32)==ipdom(rd->m_relatedFirstIp))
continue;
// if the linking document links to the same related
// docid multiple times/ we need to dedup so m_votes
// is not incremented multiple times!
// actually make it use c-block not docid to fix
// links/pages getting two m_votes for linking to
// two competitors, where each competitor linked to
// is on the same c-block... kinda strange.
int64_t dkey = docId ^ ipdom(rd->m_relatedFirstIp);
if ( dedupVotesTable.isInTable(&dkey) )
continue;
if ( ! dedupVotesTable.addKey(&dkey) ) return NULL;
// now we associate a new class with each unique linker
int32_t *poff = (int32_t *)riTable.getValue ( &docId );
// if there, it will be an offset into the links buf
if ( poff ) {
char *ptr = tmpBuf.getBufStart();
ptr += *poff;
RecommendedLink *rip = (RecommendedLink *)ptr;
rip->m_totalRecommendedScore +=
rd->m_relatedWeight;
rip->m_votes++;
// add to array of rd offs
int32_t k; for ( k = 0 ; k < 10 ; k++ ) {
if ( rip->m_relatedDocIdOff[k]==-1)
break;
}
if ( k < 10 )
rip->m_relatedDocIdOff[k] = rdOff;
continue;
}
// reserve space
int32_t need = sizeof(RecommendedLink);
// reserve
if ( ! tmpBuf.reserve ( need , "tt5buf" ) ) {
m_hadLinkInfoError = g_errno;
continue;
}
// save this
int32_t firstOff = tmpBuf.length();
// ref it
char *buf = tmpBuf.getBuf();
ri = (RecommendedLink *)buf;
// advance over that
int32_t over = sizeof(RecommendedLink);
// increase buf length
tmpBuf.incrementLength(over);
// this is how similar the relatedDocId is to the
// main url. these dotproducts are all relative
// with the other relatedDocIds for this url.
// the dotproduct was basically a dotproduct
// of the score vector of "rd" with that of
// the main url for the same queries. and that
// was normalized by the score of the top result
// for each query that have in common. see the
// the algo above for the "m_dotProduct" computation.
ri->m_totalRecommendedScore = rd->m_relatedWeight;
ri->m_votes = 1;
ri->m_rl_docId = docId;
// we do not know these things until we call msg20
// on the docid:
ri->m_rl_siteRank = -1;//reply->m_siteRank;
ri->m_rl_firstIp = 0;//reply->m_firstIp;
// each recommended link links to one or more
// related docids. so record them!
ri->m_relatedDocIdOff[0] = rdOff;
ri->m_relatedDocIdOff[1] = -1;
ri->m_relatedDocIdOff[2] = -1;
ri->m_relatedDocIdOff[3] = -1;
ri->m_relatedDocIdOff[4] = -1;
ri->m_relatedDocIdOff[5] = -1;
ri->m_relatedDocIdOff[6] = -1;
ri->m_relatedDocIdOff[7] = -1;
ri->m_relatedDocIdOff[8] = -1;
ri->m_relatedDocIdOff[9] = -1;
ri->m_urlSize = 0;
ri->m_titleSize = 0;
// store it in table then, pointing into the new buf
if ( ! riTable.addKey ( &docId, &firstOff ) )
m_hadLinkInfoError = g_errno;
}
// free that list now to save mem
list->freeList();
}
// free the msg0s now, including Msg0::m_handyList, what we used
// to hold the linkdb list
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
Msg0 *msg0 = &array[i];
// free the mem and the handylist now that we've processed them
msg0->reset();
}
// no longer need the msg0s and linkdb lists (Msg0::m_handyLists)
m_tmpMsg0Buf.purge();
//
// now sort RecommendedLinks in tmpBuf by their scores
//
// get the top 300 recommended links so we can save mem and
// store this beastie in cachedb
SafeBuf ptrBuf;
int32_t maxNumPtrs = tmpBuf.length() / sizeof(RecommendedLink);
if ( ! ptrBuf.reserve(maxNumPtrs *sizeof(RecommendedLink *),"ptrbuf"))
return NULL;
char *p = tmpBuf.getBufStart();
char *pend = tmpBuf.getBuf();
int32_t numPtrs = 0;
for ( ; p < pend ; ) {
RecommendedLink *ri = (RecommendedLink *)p;
ptrBuf.pushPtr ( ri );
p += sizeof(RecommendedLink);
// we have no title or url at this point...
if ( ri->getSize() != sizeof(RecommendedLink) ) {
char *xx=NULL;*xx=0; }
numPtrs++;
}
// now sort!
RecommendedLink **ptrs = (RecommendedLink **)ptrBuf.getBufStart();
gbqsort ( ptrs ,
numPtrs ,
sizeof(RecommendedLink *),
riCmp,
m_niceness );
// copy over the top recommended links into permanent buffer in order
// of score
// we only copy the fixed-size RecommendedLink headers here; the
// titles/urls get appended later by gotLinkerTitle() into m_newLinkerBuf
int32_t need2 = numPtrs * sizeof(RecommendedLink);
// allocate that now
if ( ! m_recommendedLinksBuf.reserve ( need2 ,"rlkbuf") ) return NULL;
// and copy over from tmpBuf, sorted by the score
for ( int32_t i = 0 ; i < numPtrs ; i++ )
m_recommendedLinksBuf.safeMemcpy(ptrs[i],
sizeof(RecommendedLink));
// this can be really huge! > 30MB
tmpBuf.purge();
// free the ptrs too!
ptrBuf.purge();
//
// now m_recommendedLinksBuf is a bunch of RecommendedLinks sorted
// by score. now use msg20 to lookup the top 300 or so that
// do not link to our main doc
//
m_msg20Phase = true;
return lookupTitles ();
}
//static void gotLinkerTitleWrapper ( void *state ) {
// Msg20 *msg20 = (Msg20 *)state;
// XmlDoc *THIS = (XmlDoc *)msg20->m_state2;
// THIS->gotLinkerTitle ( msg20 );
// THIS->m_masterLoop ( THIS->m_masterState );
//}
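// . lookupTitles() walks m_recommendedLinksBuf, which at this point holds
//   fixed-size RecommendedLink records with no url/title appended yet, and
//   uses a pool of Msg20s (at most ~60 outstanding) to fetch each linker's
//   url/title and to check if it links to our site/domain. it stops
//   launching once MAX_RECOMMENDED_LINKS valid replies are in. survivors
//   get appended, with url and title, to m_newLinkerBuf by gotLinkerTitle()
//   and that buf replaces m_recommendedLinksBuf when all replies are back.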
SafeBuf *XmlDoc::lookupTitles ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// none have a title/url following them in here yet
int32_t numLinkers = m_recommendedLinksBuf.length();
numLinkers /= sizeof(RecommendedLink);
if ( ! m_msg20Array.length() ) {
int32_t need = numLinkers * sizeof(Msg20);
if ( ! m_msg20Array.reserve ( need,"m20arr" ) )
return (SafeBuf *)-1;
// do not re-call!
m_msg20Array.setLength(need);
char *p = m_msg20Array.getBufStart();
char *pend = p + need;
for ( ; p < pend ; p += sizeof(Msg20) )
((Msg20 *)p)->constructor();
}
Msg20 *msg20s = (Msg20 *)m_msg20Array.getBufStart();
// one per linker
int32_t numMsg20s = numLinkers;
// we can use the array model because each element is fixed size
// because they do not have the url/title string following them
// yet...
char *ppp = m_recommendedLinksBuf.getBufStart();
RecommendedLink *ptr = (RecommendedLink *)ppp;
// scan the msg20s we allocated to see if any got a reply
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// shortcut
Msg20 *msg20 = &msg20s[i];
// skip if never launched
if ( ! msg20->m_launched ) continue;
// skip if it is in progress, awaiting its reply
if ( msg20->m_inProgress ) continue;
// ok, it has a reply. could be NULL if g_errno was set.
if ( ! gotLinkerTitle ( msg20 ) )
m_recommendedLinkError = g_errno;
// reset it for later use... or not...
msg20->reset();
}
//
// call a msg20 on each recommendedlink to get url/title and
// see if it links to any url on our main url's site/domain
//
for ( ; m_titleCursor < numLinkers ; m_titleCursor++ ) {
// bail?
if ( m_numMsg20sOut - m_numMsg20sIn > 60 )
break;
// stop launching if got enough
if ( m_numValidMsg20s >= MAX_RECOMMENDED_LINKS )
break;
// cast it
RecommendedLink *rl = &ptr[m_titleCursor];
// get an available msg20. bound the scan by numMsg20s too so we
// never read past the end of the array when numMsg20s < 100
int32_t i; for ( i = 0 ; i < numMsg20s && i < 100 ; i++ ) {
if ( msg20s[i].m_inProgress ) continue;
break;
}
// sanity!
if ( i >= numMsg20s || i >= 100 ) { char *xx=NULL;*xx=0; }
// look it up
Msg20 *msg20 = &msg20s[i];
// make request
Msg20Request req;
req.m_docId = rl->m_rl_docId;
//req.m_state = msg20;
req.m_state = m_masterState;//this;
req.m_callback2 = m_masterLoop;//gotLinkerTitleWrapper;
//req.ptr_coll = cr->m_coll;
//req.size_coll = gbstrlen(cr->m_coll)+1;
req.m_collnum = cr->m_collnum;
req.m_expected = true;
req.m_niceness = m_niceness;
// do not get summary stuff. too slow.
req.m_numSummaryLines = 0;
// if it has an outlink to our site/domain set
// Msg20Reply::m_hasLinkToOurDomOrHost
req.m_ourHostHash32 = getHostHash32a();
req.m_ourDomHash32 = getDomHash32();
// store cursor in msg20 itself so we know what rd it's using
msg20->m_hack2 = m_titleCursor;
// assume outstanding
m_numMsg20sOut++;
// debug
//log("seo: DEBUG: launching msg20 d=%"INT64"",req.m_docId);
// get it. continue if blocked
if ( ! msg20->getSummary ( &req ) ) continue;
// error?
if ( ! gotLinkerTitle ( msg20 ) )
m_recommendedLinkError = g_errno;
// save mem
msg20->reset();
}
// wait for all to return?
if ( m_numMsg20sOut > m_numMsg20sIn )
return (SafeBuf *)-1;
// we called gotLinkerTitle() on all msg20s, so destroy them
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// shortcut
Msg20 *msg20 = &msg20s[i];
// free
msg20->destructor();
}
// and free the lot of them
m_msg20Array.purge();
// now swap in the buf that has the urls/titles appended
m_recommendedLinksBuf.stealBuf ( &m_newLinkerBuf );
// . this is an array of Inlinks
// . shit, but we need to add a count of how many related docids
// had the inlink, and what the weight or score of it was
// . it should be based on the weights/scores of the related docids
// . maybe just hijack "Inlink::m_numUniqueIPs" or something
// . crap, we also need to store the RelatedDocIds, i guess we
// could store a list of offsets to them in m_relatedDocIdBuf
m_recommendedLinksBufValid = true;
// store in cachedb. if it blocks return -1. bufvalid is set to
// true so when this function is re-entered it should return
// the safebuf ptr right away.
if ( ! storeRecommendedLinksBuf () )
return (SafeBuf *)-1;
return &m_recommendedLinksBuf;
}
// returns false and sets g_errno on error
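// . on success it appends a variable-size record to m_newLinkerBuf:
//   [RecommendedLink header][url bytes][title bytes]
//   m_urlSize/m_titleSize are set first so RecommendedLink::getSize() can
//   skip over the whole record later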
bool XmlDoc::gotLinkerTitle ( Msg20 *msg20 ) {
// count it as returned
m_numMsg20sIn++;
// debug
//log("seo: DEBUG: got msg20 reply");
// get the recommendedlink for this (titleCursor)
char *vvv = m_recommendedLinksBuf.getBufStart();
RecommendedLink *rptrs = (RecommendedLink *)vvv;
int32_t titleCursor = msg20->m_hack2;
RecommendedLink *rl = &rptrs[titleCursor];
// sanity
if ( titleCursor < 0 ) {char *xx=NULL;*xx=0;}
// not found?
if ( g_errno ) {
log("seo: lookuptitles: %s",mstrerror(g_errno));
// ignore
g_errno = 0;
return true;
}
// get reply
Msg20Reply *reply = msg20->getReply();
// skip if linked to our site!
if ( reply->m_hasLinkToOurDomOrHost ) {
if ( m_seoDebug >= 2 )
log("seo: inlinker %s links to our "
"domain. ignoring.",
reply->ptr_ubuf);
return true;
}
// or if banned/filtered.. then skip
if ( reply->m_errno ) {
if ( m_seoDebug >= 2 )
log("seo: inlinker %s had error: %s",
reply->ptr_ubuf,
mstrerror(reply->m_errno));
return true;
}
// wtf?
if ( reply->size_ubuf <= 1 ) {
return true;
}
// set basic info
rl->m_rl_siteRank = reply->m_siteRank;
rl->m_rl_firstIp = reply->m_firstIp;
// sanity
if ( rl->m_rl_docId != reply->m_docId ) { char *xx=NULL;*xx=0; }
char *title = reply->ptr_tbuf;
int32_t titleSize = reply->size_tbuf;
if ( titleSize == 0 ) {
title = "\0";
titleSize = 1;
}
// debug
//log("seo: DEBUG: got VALID msg20 reply #%"INT32"",m_numValidMsg20s);
// count as valid
m_numValidMsg20s++;
rl->m_urlSize = reply->size_ubuf;
rl->m_titleSize = titleSize;
if ( ! m_newLinkerBuf.safeMemcpy ( rl , sizeof(RecommendedLink) ) )
return false;
if ( ! m_newLinkerBuf.safeMemcpy ( reply->ptr_ubuf,reply->size_ubuf))
return false;
if ( ! m_newLinkerBuf.safeMemcpy ( title , titleSize ) )
return false;
// i guess we are done then
return true;
}
/*
// returns false if blocked, true otherwise. sets g_errno on error
bool XmlDoc::printRecommendedLinksBuf ( SafeBuf *sb ) {
SafeBuf *recBuf = getRecommendedLinksBuf();
if ( ! recBuf ) return true;
if ( recBuf == (void *)-1 ) return false;
int32_t count = 1;
char *p = recBuf->getBufStart();
char *pend = recBuf->getBuf ();
for ( ; p < pend ; ) {
// cast it
RecommendedLink *ri = (RecommendedLink *)p;
// skip it
p += ri->getSize();
// print it out
sb->safePrintf("%"INT32") %.04f %s | %s<br>"
,count++
,ri->m_totalRecommendedScore
,ri->getUrl(recBuf)
,ri->getTitle(recBuf)
);
}
return true;
}
*/
// . use Msg25::m_numReplyPtrs and Msg25::m_replyPtrs[i] to access the
// Msg20s of the inlinks
// . NOT the same as getLinkInfo() because this does not filter out the
// "bad" inlinks, it gets everything and keeps the full Msg20Replies!!
Msg25 *XmlDoc::getAllInlinks ( bool forSite ) {
// if valid, return it now
if ( forSite && m_tempMsg25SiteValid )
return m_tempMsg25Site;
if ( ! forSite && m_tempMsg25PageValid )
return m_tempMsg25Page;
Msg25 *myMsg25 ;
if ( forSite ) myMsg25 = m_tempMsg25Site;
else myMsg25 = m_tempMsg25Page;
int32_t *ipp = getIp();
if ( ! ipp || ipp == (void *)-1 ) return (Msg25 *)ipp;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Msg25 *)d;
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (Msg25 *)site;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
Url *fu = getFirstUrl();
// make a new one
if ( ! myMsg25 ) {
Msg25 *msg25 = NULL;
try { msg25 = new ( Msg25 ); }
catch ( ... ) {
g_errno = ENOMEM;
log("xmldoc: linksrc error2 = %s",mstrerror(g_errno));
m_hadLinkInfoError = g_errno;
// bail now, otherwise we would call mnew() and
// getLinkInfo2() on a NULL msg25 below
return NULL;
}
mnew ( msg25, sizeof(Msg25),"m25li");
// record it for freeing/deleting later
if ( forSite ) m_tempMsg25Site = msg25;
else m_tempMsg25Page = msg25;
// reference it
myMsg25 = msg25;
}
int32_t type ;
if ( forSite ) type = cr_Msg25SiteInfo;
else type = cr_Msg25PageInfo;
// get list
RdbList *myList;
if ( forSite ) myList = &m_siteReplyList;
else myList = &m_pageReplyList;
int32_t uh32 =(uint32_t)((uint64_t)getFirstUrlHash64());
// first check cachedb!
bool checkIt = false;
if ( forSite && ! m_checkedCachedbForSite ) checkIt = true;
if ( ! forSite && ! m_checkedCachedbForPage ) checkIt = true;
if ( checkIt ) {
// do not repeat
if ( forSite ) m_checkedCachedbForSite = true;
else m_checkedCachedbForPage = true;
// use 0 for content hash since the link info is independent
// of your page's or site's content
key_t sk = g_cachedb.makeStartKey2 ( uh32 , 0 , type );
key_t ek = g_cachedb.makeEndKey2 ( uh32 , 0 , type );
// . get it from the appropriate host
// . get cachedb rec for all types of safebufs for this
// url/content
// . then we will set safebufs based on what recs we find
// in the returned list
if ( ! m_msg0.getList ( -1, // hostid
0 , // ip
0 , // port
0 , // maxcacheage
false, // addtocache?
RDB_CACHEDB,
cr->m_collnum ,
myList, // &m_cacheList,
(char *)&sk ,
(char *)&ek ,
30000000, // minrecsizes 30MB
m_masterState,
m_masterLoop,
m_niceness ) )
// blocked?
return (Msg25 *)-1;
}
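// the cachedb rec we parse below (and serialize further down) looks
// roughly like:
//   key_t key            (sizeof(key_t), 12 bytes)
//   int32 dataSize
//   int32 numReplies
//   then for each reply:  int32 replySize, serialized Msg20Reply
// the Msg20Reply ptrs we keep just point into myList's memory, which is
// why we set m_ownReplies to false below.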
Msg20Reply *reply;
// even if it had 0 msg20replies, list should be non-zero length
if ( ! myList->isEmpty() ) {
// get # replies
char *p = myList->getList();
// first is key
p += 12;
// then datasize
p += 4;
// then # msg20 replies
int32_t numReplies = *(int32_t *)p;
p += 4;
myMsg25->m_numReplyPtrs = numReplies;
// do not free any replies, they reference into m_pageList
myMsg25->m_ownReplies = false;
// loop over replies
for ( int32_t i = 0 ; i < numReplies ; i++ ) {
// get reply size
int32_t replySize = *(int32_t *)p;
p += 4;
// reply itself
reply = (Msg20Reply *)p;
// reconstruct ptrs from the offsets relative
// to start of "reply"
int32_t used = reply->deserialize();
if ( used < 0 ) {
log("xmldoc: reply deserialize error");
g_errno = ECORRUPTDATA;
return NULL;
}
// skip reply
p += replySize;
// store it
myMsg25->m_replyPtrs[i] = reply;
}
// validate!
if ( forSite ) m_tempMsg25SiteValid = true;
else m_tempMsg25PageValid = true;
// all done!
return myMsg25;
}
bool *calledItPtr ;
if ( forSite ) calledItPtr = &m_calledMsg25ForSite;
else calledItPtr = &m_calledMsg25ForPage;
// ok, get it the hard way
// send out the request now
if ( ! *calledItPtr ) {
// do not re-call!
*calledItPtr = true;
// call it now
if ( ! myMsg25->getLinkInfo2( site,
fu->getUrl() , // url
false , // isSiteLinkInfo?
*ipp,
*d, // docid
m_collnum,//cr->m_coll,
NULL, // qbuf
0, // qbufSize
m_masterState, // state
m_masterLoop, // callback
false, // isInjecting?
false, // pbuf (for printing)
//this, // xd holder (Msg25::m_xd
false, // printInXml
// this is irrelevant since we
// are getting all inlinks:
0, // siteNumInlinks, irrelevant
NULL, // oldlinkinfo
m_niceness,
true, // doLinkSpamCheck?
true, // onevoteperip. unused?
false,// can be cancelled?
0, // lastupdatetime
// !!!!!!!!!!
// we want all!!!!!!!!!!!!!!!!!!!
// !!!!!!!!!!
false ,//onlyneedgoodinlinks?
false,//getlinkertitles?
0, // ourhosthash32 (special)
0, // ourdomhash32 (special)
&m_myTempLinkInfoBuf ) )
// blocked?
return (Msg25 *)-1;
}
// validate it so when msg1 below returns and calls this function
// again at the top we return the ptr right away
if ( forSite ) m_tempMsg25SiteValid = true;
else m_tempMsg25PageValid = true;
// serialize the msg20 reply ptrs into a buf for list
SafeBuf listBuf;
// compute datasize
int32_t dataSize = 0;
// # of replies
dataSize += 4;
// each reply
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
// reply size
dataSize += 4;
// reply data
//dataSize += myMsg25->m_replySizes[i];
// we can't use replySizes[i] because Linkdb.cpp will
// MODIFY the msg20 replies to add ptr_note/size_note
reply = myMsg25->m_replyPtrs[i];
// so we have to calculate the new serialized size
dataSize += reply->getStoredSize();
}
// how much to reserve?
int32_t need = sizeof(key_t) + 4 + dataSize;
// reserve that space!
if ( ! listBuf.reserve ( need ,"listbuf" ) ) {
// just ignore error
g_errno = 0;
// and return
if ( forSite ) return m_tempMsg25Site;
else return m_tempMsg25Page;
}
// make key for it, contenthash is 0, since it is irrelevant
key_t kk = g_cachedb.makeKey ( uh32 , 0 , type );
// store key
listBuf.safeMemcpy ( &kk , sizeof(key_t) );
// store datasize
listBuf.pushLong ( dataSize );
// # of replies
listBuf.pushLong ( myMsg25->m_numReplyPtrs );
// store each reply then
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
// get reply
reply = myMsg25->m_replyPtrs[i];
// . how many bytes to store the MODIFIED msg20reply?
// . Linkdb.cpp adds the ptr_note AFTER it receives all replies
// so we can't just use Msg25::m_replySizes[i]
int32_t replySize = reply->getStoredSize();
listBuf.pushLong ( replySize );
// store that
int32_t stored = reply->serialize ( listBuf.getBuf() ,
listBuf.getAvail() );
// skip that
listBuf.incrementLength ( stored );
// sanity
if ( stored != replySize ) { char *xx=NULL;*xx=0; }
}
// sanity
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// make the list to add to cachedb. use the m_storeList member (not a
// local) since it must stay alive if the addList() below blocks.
key_t startKey = g_cachedb.makeStartKey2 ( uh32, 0 , type );
key_t endKey = g_cachedb.makeEndKey2 ( uh32, 0 , type );
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
//m_storeList.printList();
QUICKPOLL(m_niceness);
char *tt ;
if ( forSite ) tt = "site";
else tt = "page";
log("xmldoc: adding msg20%slinkreplies list of %"INT32" bytes to cachedb",
tt,m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
// blocked?
return (Msg25 *)-1;
if ( forSite ) return m_tempMsg25Site;
else return m_tempMsg25Page;
}
// . returns false and sets g_errno on error
// . sets RelatedDocId::m_relatedWeight
// . when printing the competitor pages, we sort by this, highest first
// 1. then scan the list of queries for each related docid
// 2. determine each of those matching queries weights
// 3. add up the weights and set RelatedDocId::m_relatedWeight to that
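// . roughly, each common query starts with weight 1.0 and gets multiplied
//   by:
//   - 100/10/1 depending on how generic the query is (numResults vs
//     point0/point1 below)
//   - a 1-10 bucket based on the related docid's serp score for it
//   - a 1-10 bucket based on the main url's serp score for it
//   and is then damped by 0.1 if it is a dup or subquery of an earlier
//   query. the sum of the weights becomes rd->m_relatedWeight.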
bool XmlDoc::setRelatedDocIdWeightAndRank ( RelatedDocId *rd ) {
// get our site hash
int32_t *shp = getSiteHash32();
if ( ! shp ) return false;
if ( shp == (int32_t *)-1 ) { char *xx=NULL;*xx=0; }
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
int32_t mainUrlSiteRank = getSiteRank();
// max queries
int32_t nc = rd->m_numCommonQueries;
int32_t unit = 0;
unit += sizeof(float);
//unit += sizeof(Msg99Reply *);
unit += sizeof(Query);
unit += sizeof(HashTableX);
unit += sizeof(QueryNumLinkedNode *);
int32_t need = nc * unit;
char *mem = (char *)mmalloc ( need , "qrybuf" );
if ( ! mem ) {
log("seo: failed to set related docid weight: %s",
mstrerror(g_errno));
return false;
}
char *p = mem;
float *queryWeights = (float *)p;
p += nc * sizeof(float);
//Msg99Reply **replyPtrs = (Msg99Reply **)p;
//p += nc * sizeof(Msg99Reply *);
Query *queries = (Query *)p;
p += nc * sizeof(Query);
QueryNumLinkedNode **qnPtrs = (QueryNumLinkedNode **)p;
p += nc * sizeof(QueryNumLinkedNode *);
HashTableX *htables = (HashTableX *)p;
p += nc * sizeof(HashTableX);
// sanity
if ( p != mem + need ) { char *xx=NULL;*xx=0; }
// initialize the mem
for ( int32_t i = 0 ; i < nc ; i++ ) {
queryWeights[i] = 1.0;
qnPtrs[i] = NULL;
queries[i].constructor();
htables[i].constructor();
}
// total pages indexed!
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
float totalWeight;
// get matching queries
//SafeBuf *qpbuf = getMatchingQueriesScored();
//if ( ! qpbuf || qpbuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
// cast it
//Msg99Reply **qptrs=(Msg99Reply **)qpbuf->getBufStart();
SafeBuf *mq = getMatchingQueryBuf();
if ( mq == NULL || mq == (void *)-1 ) { char *xx=NULL;*xx=0; }
int32_t nks = mq->length() / sizeof(QueryLink);
QueryLink *qks = (QueryLink *)mq->getBufStart();
// print the queries in common!
int32_t firstOff = rd->m_firstCommonQueryNumOff;
int32_t offset = firstOff;
int32_t qc = 0;
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
// this is fixed at the time we set QueryLogEntry::m_numResultsInSlice
int64_t numPagesIndexed = 1114000000;
int64_t point0 = numPagesIndexed / 119LL;
int64_t point1 = numPagesIndexed / 15LL;
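// point0/point1 are just genericness thresholds on the global result
// count: below point0 (~1/119th of the index) a query is considered rare
// and later gets a 100x weight, below point1 (~1/15th) it gets 10x,
// otherwise 1x. see the weight assignment further down.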
// loop over the query/score pairs this related docid matched
for ( ; offset >= 0 ; qc++ ) {
// get that node
char *buf = m_commonQueryNumBuf.getBufStart();
// and offset
buf += offset;
// then cast
QueryNumLinkedNode *qn;
qn = (QueryNumLinkedNode *)buf;
// advance. will be -1 when done
if ( qn ) offset = qn->m_nextOff;
else offset = -1;
// get #qn into there
//Msg99Reply *rp = qptrs[qn->m_queryNum];
if ( qn->m_queryNum < 0 || qn->m_queryNum >= nks ) {
char *xx=NULL;*xx=0; }
QueryLink *qk = &qks[qn->m_queryNum];
QueryLogEntry *qe ;
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
char *qstr = qe->getQueryString();
qnPtrs[qc] = qn;
// save ptrs too
//replyPtrs[qc] = rp;
// get main url score for query
//float mainUrlScore = rp->m_myScore;
int32_t mainUrlSiteHash26 = m_siteHash32;
// seems like clusterdb masks them a bit in
// Clusterdb::getSiteHash()
mainUrlSiteHash26 &= 0x03ffffff;
int32_t mainUrlRank = -1;
int32_t rdRank = -1;
//float mainUrlSerpScore = -1.0;
// . the relateddocidnumhack
// . this is used as the topdocidnum # in the case of
// m_matchingQueryBuf (doMatchingQueries)
int32_t tdnum = qk->m_relatedDocIdNum;
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
int32_t maxnum = m_topDocIdsBuf.length()/sizeof(TopDocIds);
if ( tdnum < 0 || tdnum >= maxnum ) { char *xx=NULL;*xx=0; }
TopDocIds *td = &tds[tdnum];
// assume none
//float rdScore = 0.0;
// find docid for this related docid
//TopDocIds *td = rp->getTopDocIds(&m_topDocIdsBuf);
int32_t nd = td->m_numDocIds;
for ( int32_t y = 0 ; y < nd ; y++ ) {
// if we first encounter a result from the same
// site as the main url then stop! you don't get
// the 10x bonus then!
if ( td->m_topSiteHashes26[y] == mainUrlSiteHash26 &&
mainUrlRank == -1 ) {
//mainUrlSerpScore = td->m_topScores[y];
mainUrlRank = y;
}
// set our score?
if ( td->m_topDocIds[y] == rd->m_docId ) {
//rdScore = td->m_topScores[y];
rdRank = y;
}
}
// these should always be set! even if not ranked in the
// top 300 because of our new logic using msg4f in
// getRelatedDocIdsScored()
float rdScore = qn->m_relatedDocIdSerpScore;
float mainUrlSerpScore = qk->m_serpScore;
bool better = false;
// give it a weight of 10 if higher-scoring!
//if ( rdRank < mainUrlRank ) better = true;
if ( rdScore >= mainUrlSerpScore ) better = true;
// if your site not in top 300 or so, and he is, he's better
//if ( mainUrlRank == -1 && rdRank >= 0 ) better = true;
// this is the specific url, not the SITE, like
// mainUrlRank is, for the entire site
//if ( rdScore > mainUrlScore ) better = true;
// how many search results does this query have total?
int64_t numResults = qe->m_numTotalResultsInSlice;
// fix it to be global
numResults *= (int64_t)g_hostdb.getNumShards();
// big indexes did the "slice logic" restricting docid
// range to MAX_DOCID * .10 when setting this!
if ( numPagesIndexed > 10000000 ) numResults *= 10;
////////////////////
//
// Scoring is what we do when the number of combinations
// it too high to effectively compute. - matt
//
////////////////////
// lower from 10 so google still won't dominate generic queries?
// crap, at 2.0 gigablast.com had bad competitors because
// they all match queries with gigablast in them.
// i put it down from 30.0 to 5.0 to fix chessusa.com
// which was getting bad competitor pages that had just
// 'ccc' matching non-generic queries, making them come up with too
// high a score.
//if ( better )
// queryWeights[qc] = 1.0;//30.0;//100.0; // 10.0;
//
// do not give related docid query that has YOUR brand in it
// much weight. we do not want it talking about you, because
// it is a competitor.
//
// PROBLEM: "cheatcodes.com"'s brand is descriptive!
//
// . if not generic and it beats YOU, give more!
// . try to fix ibm.com gigablast.com seomoz.org ahrefs.com
// that suffer because of matching their brand. actually
// maybe only do this if seomoz.org matches this query
// with their link text only...??? thus, pages that contain
// "seo moz" will match the "seo moz" query but will gain
// RELATIVELY little because they can't be seomoz.org on it.
// . crap though this will hurt chessusa.com right?? try again
// since algo changed a lot since then
bool isBrand = true;
// if other guy ranks better than you, probably not
// your brand, or if it is, it could be his brand too?
if ( better ) // && numResults < point0 )
isBrand = false;
// or if you are not in the top 100 it is probably not
// your brand name either!
if ( mainUrlRank == -1 )
isBrand = false;
// fix chessusa.com for 'chess' by lowering from 100 to 20...
if ( mainUrlRank >= 20 )
isBrand = false;
// fix 'corporation' for ibm.com. it is too generic to
// be a brand. on our 1.1B page index, point0 is like 9.3M.
// 'ibm' is 5.5M, 'corporation' is 25M,...
if ( numResults >= point0 )
isBrand = false;
// or for ibm.com ... or other pages with high siteranks,
// your brand queries should be in the top 10!! otherwise,
// ibm has so many other matching queries in the top 100 that
// are not brands for it because its siterank is so high.
if ( mainUrlSiteRank >= 10 && mainUrlRank >= 10 )
isBrand = false;
// top 5 for brands in siterank 11 sites
if ( mainUrlSiteRank >= 11 && mainUrlRank >= 5 )
isBrand = false;
// . good competitors will be in top 30 for a query
// . let's keep in mind though that we use these competitors
// to find backlinks AND to generate related terms, so
// it's not so important that they dominate a query, but
// rather that they match your content...
/*
if ( better &&
numResults < point0 &&
rdRank >= 0 &&
rdRank < 20 )
queryWeights[qc] *= 1.2;//50.0;
// top ten???
if ( better &&
numResults < point0 &&
rdRank >= 0 &&
rdRank < 10 )
queryWeights[qc] *= 1.3;//51.0;
// top 5?
if ( better &&
numResults < point0 &&
rdRank >= 0 &&
rdRank < 5 )
queryWeights[qc] *= 1.4;//52.0;
*/
// weight it by how relevant the query it matches is to us
//if ( better && numResults < point0 )
// queryWeights[qc] = (qk->m_serpScore / 1000000.0);
//
// generic query?
//
float weight = 1.0;
if ( numResults < point0 ) weight = 100.0;
else if ( numResults < point1 ) weight = 10.0;
queryWeights[qc] *= weight;
//
// weight by related docid's serp score
//
float ss = qk->m_serpScore;
float w2 = 1.0;
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
else if ( ss > 100.0 ) w2 = 3.0; // > 100
else if ( ss > 10.0 ) w2 = 2.0; // > 10
queryWeights[qc] *= w2;
//
// weight by main url's serp score as well!
//
ss = mainUrlSerpScore;//qk->m_serpScore;
w2 = 1.0;
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
else if ( ss > 100.0 ) w2 = 3.0; // > 100
else if ( ss > 10.0 ) w2 = 2.0; // > 10
queryWeights[qc] *= w2;
// punish query weight if it is your brand most likely
//if ( isBrand )
// queryWeights[qc] = 0.01;
// . store related docid rank and your rank
// . then we do not need cache m_topDocIdsBuf and seo.cpp
// has this info readily available.
qn->m_relatedDocIdRank = rdRank;
qn->m_mainUrlRank = mainUrlRank;
//qn->m_mainUrlSerpScore = mainUrlSerpScore;
/*
int64_t numResults = qe->m_numTotalResultsInSlice;
// fix it to be global
numResults *= (int64_t)g_hostdb.getNumGroups();
// big indexes did the "slice logic" restricting docid
// range to MAX_DOCID * .10 when setting this!
if ( numPagesIndexed > 10000000 ) numResults *= 10;
// fix divide by zero and make all rare queries similar weight
//if ( numResults < 1000 ) numResults = 1000;
// divide by # results query has so more generic stuff
// is down weighted
//queryWeights[qc] /= (float)numResults;
if ( numResults < 1000 )
queryWeights[qc] /= 1;
else if ( numResults < 10000 )
queryWeights[qc] /= 2;
else if ( numResults < 100000 )
queryWeights[qc] /= 4;
else if ( numResults < 1000000 ) // 1M
queryWeights[qc] /= 8;
else if ( numResults < 10000000 ) // 10M
queryWeights[qc] /= 16;
else if ( numResults < 10000000 ) // 100M
queryWeights[qc] /= 32;
else
queryWeights[qc] /= 64;
*/
//int32_t qlen = gbstrlen(qstr);
// shortcuts
Query *qp = &queries[qc];
HashTableX *ht = &htables[qc];
// this is currently a int64_t bit vector
int32_t vs = sizeof(qvec_t);
if ( ! ht->set ( 8,vs,128,NULL,0,false,m_niceness,"wbvbuf") )
// hopefully g_errno is preserved
goto done;
// if unknown use english so pandora's -> pandora,pandoras?
// because 'pandora's tower' was not matching
// 'pandoras tower' because both words could have been
// english or german, thus the queries were thought to be
// independent! giving rise to high-scoring competitive pages
// that matched only those two queries.
uint8_t qlangId = qe->m_langId;
if ( ! qlangId ) qlangId = langEnglish;
qp->set2 ( qstr , qlangId , true );
// hash it up
for ( int32_t i = 0 ; i < qp->m_numTerms ; i++ ) {
// shortcut
QueryTerm *qt = &qp->m_qterms[i];
// bigrams imply 2 explicit bits, one from each term
// in the bigram. synonym terms should share the same
// bit as the term they are a synonym of
int64_t bits = qt->m_implicitBits;
// . add bit vec. use rawTermId?
// . hash to wordbit vector of query words contained
if ( ! ht->addKey ( &qt->m_termId , &bits ) )
goto done;
}
}
// . set the dup flags!
// . scan queries related docid matches
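// sketch of the subset test: htables[x] maps each termId of query #x to
// that query's implicit term-bit vector. below we OR the vectors of the
// terms that query #j shares with query #i and mask with query #i's
// m_requiredBits; if that covers all of #i's required bits then query #i
// is a subquery of #j. the second pass does the same thing in reverse.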
for ( int32_t i = 0 ; i < qc ; i++ ) {
// get it
Query *qpi = &queries[i];
HashTableX *hti = &htables[i];
// scan all queries above
for ( int32_t j = i+1 ; j < qc ; j++ ) {
// reset
bool jIsSubQueryOfi = false;
bool iIsSubQueryOfj = false;
// skip ourselves
//if ( j == i ) continue;
// get it
Query *qpj = &queries[j];
HashTableX *htj = &htables[j];
// scan every query term in query #j and map each
// termid to the term bit vector that indicates what
// terms query #j has in query #i.
qvec_t totalVec = 0LL;
// is it a dup?
for ( int32_t k = 0 ; k < qpj->m_numTerms ; k++ ) {
// shortcut
QueryTerm *qt = &qpj->m_qterms[k];
// see if in there
char *val ;
val = (char *)hti->getValue(&qt->m_termId);
if ( ! val ) continue;
// get implied term bits
qvec_t vec = *(qvec_t *)val;
// this is the termbit vector for query #i.
// it tells us what terms query #j shares.
totalVec |= vec;
}
// we only care about "required" terms. i.e. bigrams
// are essentially ignored if not in quotes.
totalVec &= qpi->m_requiredBits;
// how many words do we match?
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
int32_t numSharedWithQueryi = getNumBitsOn64(totalVec);
// how many required bits does it have?
int32_t needi = getNumBitsOn64(qpi->m_requiredBits);
// if all terms in query #i are in query #j then subset
if ( numSharedWithQueryi == needi )
iIsSubQueryOfj = true;
//
// now go the other way
//
totalVec = 0LL;
// is it a dup?
for ( int32_t k = 0 ; k < qpi->m_numTerms ; k++ ) {
// shortcut
QueryTerm *qt = &qpi->m_qterms[k];
// see if in there
char *val;
val = (char *)htj->getValue(&qt->m_termId);
if ( ! val ) continue;
// get implied term bits
qvec_t vec = *(qvec_t *)val;
// this is the termbit vector for query #j.
// it tells us what terms query #i shares.
totalVec |= vec;
}
// we only care about "required" terms. i.e. bigrams
// are essentially ignored if not in quotes.
totalVec &= qpj->m_requiredBits;
// how many words do we match?
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
int32_t numSharedWithQueryj = getNumBitsOn64(totalVec);
// how many required bits does it have?
int32_t needj = getNumBitsOn64(qpj->m_requiredBits);
// if all terms in query #i are in query #j then subset
if ( numSharedWithQueryj == needj )
jIsSubQueryOfi = true;
// now set dup bit if query #i is same as query #j
// taking into account "missing spaces" so that we
// have two terms in one query , and their bigram
// in the other query. OR we have synonyms. OR we
// have differences of "ignored" words.
// "leg" = "legs"
// "cheat code" = "cheatcodes"
// "the tigers" = "tigers"
if ( jIsSubQueryOfi &&
iIsSubQueryOfj &&
queryWeights[j] > .02 ) {
// debug?
if ( m_seoDebug >= 2 )
log("seo: %s ISDUPOF %s",
qpj->m_orig,
qpi->m_orig);
// the dup weight is .02
queryWeights[j] *= .1; // = .02
}
// proper subquery examples:
// "leg" is subquery of "nice legs"
else if ( jIsSubQueryOfi &&
! iIsSubQueryOfj &&
queryWeights[j] > .05 ) {
// debug?
if ( m_seoDebug >= 2 )
log("seo: %s SUBQUERYOF %s",
qpj->m_orig,
qpi->m_orig);
// the subquery weight is .05
queryWeights[j] *= 0.1; // = 5.0;//.05;
}
// is query #i a PROPER subquery of query #j
else if ( iIsSubQueryOfj &&
! jIsSubQueryOfi &&
queryWeights[i] > .05 ) {
// debug?
if ( m_seoDebug >= 2 )
log("seo: %s SUBQUERYOF %s",
qpi->m_orig,
qpj->m_orig);
// the subquery weight is .05
// increase to 5.0 to try to drown out the
// anomaly queries promoting poker sites
// for cheatcodes.com competitors
queryWeights[i] *= 0.1; // = 5.0;//.05;
}
else {
// debug?
//if ( debug )
//log("seo: %s UNRELATEDTO %s",
// qpi->m_orig,
// qpj->m_orig);
}
}
}
// scan the queries again and add up their weights this time!
totalWeight = 0.0;
for ( int32_t i = 0 ; i < qc ; i++ ) {
totalWeight += queryWeights[i];
qnPtrs[i]->m_queryScoreWeight = queryWeights[i];
//Msg99Reply *ptr = replyPtrs[i];
Query *qp = &queries[i];
char *qstr = qp->m_orig;//ptr->m_queryStr;
// log it
if ( m_seoDebug >= 2 )
log("seo: docid=%"INT64" weight=%f qry=%s",
rd->m_docId,
queryWeights[i],
qstr);
}
// that is the docid related weight now
rd->m_relatedWeight = totalWeight;
done:
for ( int32_t i = 0 ; i < nc ; i++ ) {
queries[i].destructor();
htables[i].destructor();
}
mfree ( mem , need , "qrybuf" );
return true;
}
// returns false and sets g_errno on error
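// . each related docid keeps a singly-linked list of QueryNumLinkedNodes
//   in m_commonQueryNumBuf, one node per query it has in common with us.
//   nodes are chained by byte offsets, new nodes are prepended, and
//   rd->m_firstCommonQueryNumOff points at the head (-1 == empty list)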
bool XmlDoc::addRelatedDocIdInfo ( int64_t docId ,
int32_t queryNum ,
float score ,
int32_t rank ,
int32_t siteHash26 ) {
// do not add if does not match the query
if ( score <= 0.0 ) return true;
// alloc space if first time calling
if ( ! m_rdtab.m_numSlots ) {
if ( ! m_rdtab.set(8,sizeof(RelatedDocId),1024,NULL,0,
false,0,"rdtab"))
return false;
}
// get the related docid as it exists in m_relatedDocIdBuf
RelatedDocId *rd = NULL;
// now we also store these for intersecting
// in phase 2 to see what urls are most
// similar to us
int32_t slot = m_rdtab.getSlot(&docId);
// if not there, add it
if ( slot < 0 ) {
// make one
RelatedDocId rdx;
// the most important thing is the docid!
rdx.m_docId = docId;
// and now the 32-bit site hash
rdx.m_siteHash26 = siteHash26;
// how many search results we are in
rdx.m_numCommonQueries = 0;
// the queryImportance should be our score
// for this query divided by m_minTop50Score
// to normalize it.
//float qimp=qp->m_queryInfo.m_queryImportance;
// just add up the query importance for
// each query we share in common with main url
//rd.m_similarityScore = qip;
// now we do a dot product of this related
// docids score vector with the main url's
// score vector. both vector's are normalized
// using the score of the 1st result!
//rd.m_dotProduct = score;
// reset this
rdx.m_rd_siteRank = -1;
rdx.m_rd_langId = 255;
rdx.rd_title_off = -1;
rdx.rd_url_off = -1;
rdx.rd_site_off = -1;
// point to beginning of linked list of qrynums
rdx.m_firstCommonQueryNumOff = -1;//off;
//rdx.m_lastCommonQueryNumOff = -1;//off;
// remember offset
int32_t rdOff = m_relatedDocIdBuf.length();
// store it
m_relatedDocIdBuf.safeMemcpy ( &rdx , sizeof(RelatedDocId) );
// add OFFSET to table. data is 12 bytes
if(! m_rdtab.addKey(&docId,&rdOff)) return false;
// all done then
//continue;
// set this for adding to the linked list
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
// cast it
rd = (RelatedDocId *)p;
}
else {
// get the data
int32_t rdOff = *(int32_t *)m_rdtab.getValueFromSlot(slot);
// point to it
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
// cast it
rd = (RelatedDocId *)p;
}
// before we add the querynumlinkednode make sure not a dup!
char *qnbuf = m_commonQueryNumBuf.getBufStart();
// . offset of first node for this related docid
// . this is the start of his linked list of query/score nodes
int32_t firstOff = rd->m_firstCommonQueryNumOff;
// sanity
if ( firstOff == -1 && rd->m_numCommonQueries ) { char *xx=NULL;*xx=0;}
// assume no linked list
QueryNumLinkedNode *node = NULL;
// only a linked list if firstOff is not -1
if ( firstOff >= 0 ) node = (QueryNumLinkedNode *)(qnbuf + firstOff);
// scan the nodes (query/score pairs) we got for this related docid
for ( ; node ; ) {
// if this query is already in the linked list, stop! we
// do not want to add dup QueryNumLinkedNode nodes.
if ( node->m_queryNum == queryNum ) return true;
// end of linked list?
if ( node->m_nextOff == -1 ) break;
// advance to next node in linked list
node = (QueryNumLinkedNode *)(qnbuf+node->m_nextOff);
}
// store query num element in a linked list so
// we can print the actual queryNums a related
// docid has in common with the main url
int32_t nodeOff = m_commonQueryNumBuf.length();
// we can record our rank and your rank in this!
QueryNumLinkedNode qn;
qn.m_queryNum = queryNum; // qp->m_queryNum;
qn.m_nextOff = -1;
qn.m_relatedDocIdRank = rank;
qn.m_relatedDocIdSerpScore = score;
qn.m_mainUrlRank = -1;
//qn.m_mainUrlSerpScore = -1.0;
int32_t sq = sizeof(QueryNumLinkedNode);
// point to it
if ( ! m_commonQueryNumBuf.safeMemcpy(&qn,sq) )
return false;
// point to node we stored in the buf so we can adjust it below
QueryNumLinkedNode *stored ;
stored = (QueryNumLinkedNode *)(m_commonQueryNumBuf.getBuf() - sq);
// increment the count. the # of nodes in his linked list.
rd->m_numCommonQueries++;
// continue the linked list
qnbuf = m_commonQueryNumBuf.getBufStart();
// the first node?
if ( firstOff == -1 ) {
rd->m_firstCommonQueryNumOff = nodeOff;
//rd->m_lastCommonQueryNumOff = nodeOff;
return true;
}
// get the current first
int32_t oldFirstOff = rd->m_firstCommonQueryNumOff;
//char *vv = qnbuf + rd->m_firstCommonQueryNumOff;
//QueryNumLinkedNode *first = (QueryNumLinkedNode *)vv;
// we are the new first
rd->m_firstCommonQueryNumOff = nodeOff;
// we point to old first as our next
stored->m_nextOff = oldFirstOff;
// and update that node's next link
//last->m_nextOff = nodeOff;
// and our new tail
//rd->m_lastCommonQueryNumOff = nodeOff;
return true;
}
// . safebuf returned is a buffer of QueryLinks
// . use m_matchingQueryBuf/m_matchingStringBuf
SafeBuf *XmlDoc::getMatchingQueryBuf ( ) {
setStatus ( "getmatchingqueries" );
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_matchingQueryBufValid )
return &m_matchingQueryBuf;
if ( ! m_beginTimeAllMatch )
m_beginTimeAllMatch = gettimeofdayInMilliseconds();
if ( m_docIdListBuf.length() == 0 )
m_docIdListBuf.pushLongLong(m_docId);
// true = doMatchingQueries?
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , true );
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
m_matchingQueryBuf .stealBuf ( qkbuf );
m_matchingQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginTimeAllMatch;
log("seopipe: time: getMatchingQueries took %"INT64" ms",took);
m_matchingQueryBufValid = true;
// if getRelatedQueryBuf calls getQueryLinkBuf() it should
// do a recompute, so set this to false
m_queryLinkBufValid = false;
m_docIdListBuf.purge();
// store it
if ( ! storeMatchingQueriesIntoCachedb() )
return (SafeBuf *)-1;
return &m_matchingQueryBuf;
}
// . returns safebuf of QueryLinks, representing the intersected matching
// queries of all the related docids
SafeBuf *XmlDoc::getRelatedQueryBuf () {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_relatedQueryBufValid )
return &m_relatedQueryBuf;
// we need these
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
if ( ! m_beginRelatedQueries )
m_beginRelatedQueries = gettimeofdayInMilliseconds();
if ( m_docIdListBuf.length() == 0 ) {
int32_t numRelatedDocIds = rdbuf->length()/sizeof(RelatedDocId);
// just use the top 50 for related queries for speed!
if ( numRelatedDocIds > 50 ) numRelatedDocIds = 50;
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
RelatedDocId *rd = &rds[i];
m_docIdListBuf.pushLongLong(rd->m_docId);
}
}
// false = doMatchingQueries?
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , false );
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
m_relatedQueryBuf .stealBuf ( qkbuf );
m_relatedQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
m_relatedQueryBufValid = true;
m_queryLinkBufValid = false;
m_docIdListBuf.purge();
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginRelatedQueries;
log("seopipe: time: getRelatedQueries took %"INT64" ms",took);
// store it
if ( ! storeRelatedQueriesIntoCachedb() )
return (SafeBuf *)-1;
return &m_relatedQueryBuf;
}
static void gotMsg8eReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
int32_t hostId = slot->m_hostId;
THIS->m_msg8eReply [hostId] = slot->m_readBuf;
THIS->m_msg8eReplySize[hostId] = slot->m_readBufSize;
// do not let udpserver.cpp free it, we will later
slot->m_readBuf = NULL;
log("seo: got msg8e reply #%"INT32" of %"INT32" from host #%"INT32"",
(int32_t)THIS->m_numMsg8eReplies,
(int32_t)THIS->m_numMsg8eRequests,
(int32_t)hostId);
THIS->m_numMsg8eReplies++;
// do not free send buf until last reply!
if ( THIS->m_numMsg8eReplies < THIS->m_numMsg8eRequests ) {
slot->m_sendBufAlloc = NULL;
return;
}
// ok, sendBuf will auto free in UdpServer.cpp when we return from this
THIS->m_masterLoop ( THIS->m_masterState );
}
//static void gotMsg20ReplyWrapper ( void *state ) {
// XmlDoc *THIS = (XmlDoc *)state;
// THIS->m_numMsg20Replies++;
// if ( THIS->m_numMsg20Replies < THIS->m_numMsg20Requests )
// return;
// THIS->m_masterLoop ( THIS->m_masterState );
//}
// . returned safebuf is array of QueryLinks
// . gets all matching queries from all related docids and store them
// compactly as QueryLinks, otherwise we'd run out of memory because
// each docid has like 50,000 matching queries on avg.
// . we now get matching queries in modulus parts to avoid OOM, because
// with my new changes i made we are getting like a few hundred thousand
// matching queries per related docid.
// . we do not store the query string, etc, for the QueryLink,
// just the query offset and the hostid that has the query in its
// memory (g_qbuf). after we intersect the QueryLinks we will get the
// query strings, etc. there will be a lot fewer in the intersection.
SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
if ( m_queryLinkBufValid )
return &m_queryLinkBuf;
bool doRelatedQueries = true;
if ( doMatchingQueries ) doRelatedQueries = false;
// get the 32-bit terms the main doc matches, so we may determine
// what terms in a related query are novel to this document.
SafeBuf *mainUrlTwidBuf32 = NULL;
if ( doRelatedQueries ) {
mainUrlTwidBuf32 = getTermId32Buf() ;//InfoBuf();
if ( ! mainUrlTwidBuf32 || mainUrlTwidBuf32 == (void *)-1 )
return mainUrlTwidBuf32;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
//
// SHIT! we can't use the keys in the termlistbuf for dual purpose
// role as terms the doc contains, because they do not have the
// synonym forms!!! So we have to get this terminfobuf as well
// as the termlistbuf for each docid!!!!
//
// so we might as well not sort by the lower 32 bit hack as well
//
//
//
// 1. get termlistbuf for each docid possibly using msg20s
//
// we need this for getting the QueryLink::m_serpScores in
// handleRequest8e
//
//
//int32_t numDocIds = docIdList->length() / 8;
//int64_t *docIds = (int64_t *)docIdList->getBufStart();
//SafeBuf *tlistBuf = NULL;
//SafeBuf *twidBuf32 = NULL;
// . we just want the termlistbuf of each related docid
// . hack: it should be sorted by the LOWER 32 bits of termid
// so handlerequest8e does not need to sort its termid32/twid32 buf
//if ( doMatchingQueries ) {
// tlistBuf = getTermListBuf();
// if ( ! tlistBuf || tlistBuf == (void *)-1 ) return tlistBuf;
// twidBuf32 = getTermId32Buf();
// if ( ! twidBuf32 || twidBuf32 == (void *)-1 ) return twidBuf32;
//}
/*
if ( doRelatedQueries && ! m_launchedAll ) {
int32_t need = sizeof(Msg20) * numDocIds;
// we also use this same buf in getRelatedDocIdsWithTitles
if ( ! m_msg20Buf.reserve ( need,"m20buf3" ) ) return NULL;
// mark it all in use
m_msg20Buf.setLength(need);
// init them
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
// reset cursor to start with first related docid
m_numMsg20Replies = 0;
m_numMsg20Requests = 0;
// launch all!
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// shortcut
Msg20 *msg20 = &mp[i];
// get current related docid
//RelatedDocId *rd = &rds[i];
// make the request
Msg20Request req;
req.ptr_coll = cr->m_coll;
req.size_coll = gbstrlen(cr->m_coll)+1;
req.m_docId = docIds[i];
req.m_expected = true;
req.m_niceness = m_niceness;
//req.m_state = m_masterState;
//req.m_callback2 = m_masterLoop;
req.m_state = this;
req.m_callback2 = gotMsg20ReplyWrapper;
// do not get summary stuff. too slow.
req.m_numSummaryLines = 0;
// get this
req.m_getTermListBuf = true;
// count these!
m_numMsg20Requests++;
// store cursor in msg20 itself so we know the rd
//msg20->m_hack2 = i;
// launch it
if ( ! msg20->getSummary ( &req ) ) continue;
// error?
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("seo: error getting termlistbuf docid=%"INT64"",
docIds[i]);
// reset it
//msg20->reset();
// count reply as back now
m_numMsg20Replies++;
}
m_launchedAll = true;
}
// wait for one reply per related docid
if ( doRelatedQueries && m_numMsg20Replies < m_numMsg20Requests )
return (SafeBuf *)-1;
*/
//
//
// 2. send one msg8e request to each host with those termlistbufs
//
// it has one termlistbuf per relateddocid, enough info
// for handlerequest8e to return the list of matching QueryLinks
// intersected for all related docids.
//
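// the msg8e request we build below is laid out roughly as:
//   1 byte  : 1 = matching queries, 0 = related queries
//   coll\0  : collection name
//   int32   : byte length of the docid list, then the docids themselves
//   int32   : byte length of the main url's 32-bit twid buf, then the
//             twids (related-query case only)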
if ( m_numMsg8eRequests == 0 ) {
SafeBuf request;
// how big is the request?
int32_t need = 0;
need += 1; // for the byte flag
int32_t collLen = gbstrlen(cr->m_coll);
need += collLen + 1;
// list of docids (just one for matching queries)
need += 4;
need += docIdList->length();
// twidtable alloc
if ( doRelatedQueries ) {
need += 4;
need += mainUrlTwidBuf32->length();
}
//if ( doMatchingQueries ) {
// // just our main url's termlistbuf
// need += 4;
// need += tlistBuf->length();
// need += 4;
// need += twidBuf32->length();
//}
//
// make the 8e request
//
if ( ! request.reserve ( need ,"rep8ebuf" ) )
return NULL;
// first store flag to indicate if getting matching or
// related queries
if ( doMatchingQueries ) request.pushChar(1);
else request.pushChar(0);
// then coll\0
request.safeMemcpy ( cr->m_coll, collLen );
request.pushChar ( 0 );
// then docids after the collection name
request.pushLong ( docIdList->length() );
request.safeMemcpy ( docIdList );
// then if doing related queries we need to store our
// 32-bit twids of the main url for setting m_uniqueRound
if ( doRelatedQueries ) {
request.pushLong(mainUrlTwidBuf32->length());
request.safeMemcpy(mainUrlTwidBuf32->getBufStart(),
mainUrlTwidBuf32->length() );
}
/*
// then store each termlistbuf from each msg20
for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ) {
// shortcut
Msg20 *mp = &mps[i];
Msg20Reply *rep = mp->getReply();
if ( rep ) {
request.pushLong ( rep->size_tlistBuf );
request.safeMemcpy ( rep->ptr_tlistBuf ,
rep->size_tlistBuf );
// then the 32-bit termid buf with synonyms
// that the above posdblist termlists don't
// have so we can match queries
request.pushLong ( rep->size_tiBuf );
request.safeMemcpy ( rep->ptr_tiBuf,
rep->size_tiBuf );
}
// make them empty i guess
else {
request.pushLong ( 0 );
request.pushLong ( 0 );
}
}
*/
/*
// just our main url's termlistbuf
if ( doMatchingQueries ) {
request.pushLong (tlistBuf->length());
request.safeMemcpy (tlistBuf);
// then the 32-bit termid buf with synonyms that
// the above posdblist termlists don't have so
// we can match queries
request.pushLong (twidBuf32->length());
request.safeMemcpy (twidBuf32);
}
*/
// sanity
if ( request.length() != need ) { char *xx=NULL;*xx=0; }
// do not free it here, let udpserver free it
char *req = request.getBufStart();
int32_t reqSize = request.length();
request.detachBuf();
// we've formulated the 8e request, no need for msg20s anymore
//for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ){
// // shortcut
// Msg20 *mp = &mps[i];
// mp->destructor();
//}
// free the mem as well
//m_msg20Buf.purge();
// must be host #0 for this next algo to work
if ( g_hostdb.m_hostId != 0 ) { char *xx=NULL;*xx=0; }
//
// send msg8e request to each host. skip if dead.
//
for ( int32_t k = 1; k <= g_hostdb.m_numHosts ; k++ ) {
// breathe
QUICKPOLL(m_niceness);
// send to ourselves last so we can do all in parallel
int32_t hosti = k;
if ( k == g_hostdb.m_numHosts ) hosti = 0;
// get ptr to the host
Host *host = g_hostdb.getHost(hosti);
// get hostid of host #i
int32_t hostId = host->m_hostId;
if ( hostId != hosti ) { char *xx=NULL;*xx=0; }
// count it
m_numMsg8eRequests++;
// skip if dead. i guess no queries from that guy. we
// can't send to a twin because the twin does not have
// the same queries in its in-memory query log.
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive) {
log("seo: skipping msg8e to dead host %"INT32"",
hostId);
m_msg8eReply [hostId] = NULL;
m_msg8eReplySize[hostId] = 0;
m_numMsg8eReplies++;
continue;
}
// . send request to him
// . reply is the query strings
// . when reply comes in we store it in the query
// string buf and make the QueryLinks reference it
// with their QueryLink::m_queryStringOffset
if ( ! g_udpServer.sendRequest ( req ,
reqSize ,
0x8e , // msgtype
host->m_ip , // ip
host->m_port , // port
hostId,
NULL, // retslot
this,
gotMsg8eReplyWrapper,
999999, // timeout
-1 , // backoff
-1 , // maxwait
NULL, // replybuf
0, // replybufmaxsize
m_niceness // niceness
)) {
// let admin know about error
log("seopipe: sendRequest 8e had error: %s",
mstrerror(g_errno));
// count it as replied then
m_numMsg8eReplies++;
continue;
}
}
}
// this should never happen now with our new wrapper
if ( m_numMsg8eReplies < m_numMsg8eRequests )
return (SafeBuf *)-1;
//
//
// 3. MERGE the msg8e replies from all hosts
//
//
// gotMsg8eReplyWrapper() should have recorded each one into
// m_msg8eReply[i], the msg8e reply ptr. set up for merging.
char *bestPtr[MAX_HOSTS];
char *bufEnd [MAX_HOSTS];
for ( int32_t i = 0; i < g_hostdb.m_numHosts ; i++ ) {
char *reply = m_msg8eReply [i];
// this happens if host is dead...
if ( ! reply ) {
bestPtr[i] = NULL;
bufEnd [i] = NULL;
continue;
}
//int32_t replySize = m_msg8eReplySize [i];
// it should be a list of QueryLinks
char *p = reply;
int32_t queryLinkBufSize = *(int32_t *)p;
p += 4;
bestPtr[i] = p;
// bufEnd[i] also marks the start of the querystringbuf
bufEnd [i] = p + queryLinkBufSize;
}
int32_t count = 0;
int32_t maxQueryLinks = MAX_RELATED_QUERIES;
if ( doMatchingQueries ) maxQueryLinks = MAX_MATCHING_QUERIES;
// now merge the top "max" highest scoring
// QueryLinks and their corresponding QueryLogEntries into
// m_queryLinkBuf/m_queryLinkStringBuf
storeMore:
// get the max scoring QueryLink from the 8e replies
int32_t maxi = -1;
float maxScore = -1.0;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// skip if exhausted
if ( bestPtr[i] >= bufEnd[i] ) continue;
// cast it
QueryLink *qk = (QueryLink *)bestPtr[i];
// sanity: core dump if this is not a list head
if ( ! qk->m_isFirst ) { char *xx=NULL;*xx=0; }
// skip if score is not the current maximum
if ( qk->m_totalQueryImportance < maxScore ) continue;
// we got a new max!
maxScore = qk->m_totalQueryImportance;
maxi = i;
}
// store max into m_queryLinkBuf and m_queryLinkStringBuf
if ( maxi >= 0 ) {
// shortcut
QueryLink *best = (QueryLink *)bestPtr[maxi];
// get # to copy
int32_t toCopy = sizeof(QueryLink);
if ( doRelatedQueries )
// how many querylinks in this list? i.e. those
// that all share the same query, but different
// relateddocid?
toCopy = best->m_numInList * sizeof(QueryLink);
// copy the querylink
if ( ! m_queryLinkBuf.reserve ( toCopy ) ) return NULL;
// point to it
QueryLink *qk = (QueryLink *)m_queryLinkBuf.getBuf();
// THEN store it
m_queryLinkBuf.safeMemcpy( best , toCopy );
// point to its querylogentry buf, it occurs right
// after the list of QueryLinks!
char *p = bufEnd[maxi];
// and the query it is for
p += qk->m_queryStringOffset;
// cast that
QueryLogEntry *qe = (QueryLogEntry *)p;
// ensure enough space
if ( ! m_queryLinkStringBuf.reserve(qe->getSize(),"rqbb" ) )
return NULL;
// we are moving it into the final buf
qk->m_queryStringOffset = m_queryLinkStringBuf.length();
// store query log entry here now
m_queryLinkStringBuf.safeMemcpy ( qe, qe->getSize() );
// advance
bestPtr[maxi] += toCopy;
}
// limit
if ( ++count < maxQueryLinks ) goto storeMore;
// free the msg8e reply buffers now that they are merged
for ( int32_t i = 0; i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg8eReply[i] ) continue;
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
m_msg8eReply[i] = NULL;
}
// reset our parms if we are re-called for related queries
m_numMsg8eReplies = 0;
m_numMsg8eRequests = 0;
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginRelatedQueries;
log("seopipe: getrelatedquerybuftook %"INT64" ms",took);
m_beginRelatedQueries = 0LL;
// validate
m_queryLinkBufValid = true;
/*
// log for debug
qks = (QueryLink *)m_queryLinkBuf->getBufStart();
nks = m_queryLinkBuf->length() / sizeof(QueryLink);
for ( int32_t k = 0 ; k < nks ; k++ ) {
// now we use offsets into m_relatedQueryBuf.m_buf[]
QueryRel *qk = &qks[k];
// skip if not a head
if ( ! qk->m_isFirst ) continue;
char *qstr = qk->getQueryString(&m_queryLinkStringBuf);
log("seopipe: relquery=\"%s\" imp=%f votes=%"INT32"",
qstr,
qk->m_rq_totalScore,
qk->m_docIdVotes);
}
*/
return &m_queryLinkBuf;
}
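// Illustrative sketch (not part of the build): the merge above is an
// N-way "take the best head" merge. Each host's 8e reply holds
// fixed-size records already sorted by score, and we repeatedly copy
// the highest-scoring head among all lists and advance that list.
// Generic version of the same pattern (names are hypothetical):
/*
#include <string.h> // memcpy

static int32_t mergeTopRecords ( char **heads , char **ends ,
				 int32_t numLists , int32_t recSize ,
				 float (*getScore)(char *rec) ,
				 char *dst , int32_t maxRecords ) {
	int32_t stored = 0;
	while ( stored < maxRecords ) {
		int32_t besti     = -1;
		float   bestScore = -1.0;
		for ( int32_t i = 0 ; i < numLists ; i++ ) {
			// skip exhausted lists
			if ( heads[i] >= ends[i] ) continue;
			float s = getScore ( heads[i] );
			if ( s < bestScore ) continue;
			bestScore = s;
			besti     = i;
		}
		// all lists exhausted?
		if ( besti < 0 ) break;
		memcpy ( dst + stored * recSize , heads[besti] , recSize );
		heads[besti] += recSize;
		stored++;
	}
	return stored;
}
*/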
// scan matches like XmlDoc::getSummary() does and get all sentences
// containing a query term...
//void XmlDoc::getGigabitExcerpts ( ) {
//}
// this is still used by Title.cpp to get the title: field quickly
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {
if ( ! json ) return NULL;
// get length
int32_t fieldLen = gbstrlen(field);
// keep track of in a quote or not
bool inQuotes = false;
char *stringStart = NULL;
char *p = json;
bool gotOne = false;
int32_t depth = 0;
// scan
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
// count {} depth
if ( ! inQuotes ) {
if ( *p == '{' ) depth++;
if ( *p == '}' ) depth--;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
// set start of the string if quote is beginning
if ( inQuotes ) stringStart = p + 1;
// if quote is ending and a colon follows then
// it was a json field name. so if it matches the
// field we want, return the string value that follows it.
else if ( ! inQuotes &&
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
// could be product:{title:... depth=2
(depth == 1 ||depth==2) &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {
// now, the next time we set stringStart
// it will be set to the VALUE of this field
// assuming the field is a STRING!!!!
gotOne = true;
// return after the quote
//return p + 2;
}
// ok, we got the string after the field string...
else if ( ! inQuotes && gotOne ) {
if ( valueLen ) *valueLen = p - stringStart;
return stringStart;
}
// keep chugging
continue;
}
}
// done, not found
return NULL;
}
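// Illustrative usage (not part of the build): getJSONFieldValue()
// returns a pointer INTO the json buffer plus a length, it does not
// NUL-terminate, so a caller like Title.cpp would do something like:
/*
	int32_t vlen = 0;
	char *json  = (char *)"{\"title\":\"Cool Page\",\"type\":\"a\"}";
	char *value = getJSONFieldValue ( json , (char *)"title" , &vlen );
	if ( value )
		log("json title = %.*s", vlen, value); // "Cool Page"
*/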
Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;
// core if not a json object
if ( m_contentTypeValid && m_contentType != CT_JSON &&
// spider status docs are now really json
m_contentType != CT_STATUS ) {
char *xx=NULL;*xx=0; }
// \0 terminated
char **pp = getUtf8Content();
if ( ! pp || pp == (void *)-1 ) return (Json *)pp;
// point to the json
char *p = *pp;
// empty? all done then.
//if ( ! p ) return (char *)pp;
// . returns NULL and sets g_errno on error
// . if p is NULL i guess this should still be ok and be empty
if ( ! m_jp.parseJsonStringIntoJsonItems ( p , m_niceness ) ) {
g_errno = EBADJSONPARSER;
return NULL;
}
m_jpValid = true;
return &m_jp;
}
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . hash each json VALUE (not FIELD) ... AND ... hash each json
// VALUE with its FIELD like "title:cool" or "description:whatever"
// . example:
// [{"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":1378322570280,"matched":64,"status":"Stopped","start":1378322184332,"token":"poo","parameterMap":{"token":"poo","seed":"www.alleyinsider.com","api":"article"},"crawled":64},{"id":"830e0584-7f69-4bdd-
#include "Json.h"
char *XmlDoc::hashJSONFields ( HashTableX *table ) {
setStatus ( "hashing json fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "json object";
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
return hashJSONFields2 ( table , &hi , jp , true );
}
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {
JsonItem *ji = jp->getFirstItem();
char nb[1024];
SafeBuf nameBuf(nb,1024);
//int32_t totalHash32 = 0;
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
// reset, but don't free mem etc. just set m_length to 0
nameBuf.reset();
// get its full compound name like "meta.twitter.title"
JsonItem *p = ji;
char *lastName = NULL;
char *nameArray[20];
int32_t numNames = 0;
for ( ; p ; p = p->m_parent ) {
// empty name?
if ( ! p->m_name ) continue;
if ( ! p->m_name[0] ) continue;
// dup? can happen with arrays: the parent of a string
// in an object has the same name as its own parent,
// the name of the array. "dupname":[{"a":"b"},{"c":"d"}]
if ( p->m_name == lastName ) continue;
// update
lastName = p->m_name;
// add it up
nameArray[numNames++] = p->m_name;
// breach?
if ( numNames < 15 ) continue;
log("build: too many names in json tag");
break;
}
// if we are the diffbot reply "html" field do not hash this
// because it is redundant and it hashes html tags etc.!
// plus it slows us down a lot and bloats the index.
if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"html")==0)
continue;
// assemble the names in reverse order which is correct order
for ( int32_t i = 1 ; i <= numNames ; i++ ) {
// copy into our safebuf
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
return NULL;
// separate names with periods
if ( ! nameBuf.pushChar('.') ) return NULL;
}
// remove last period
nameBuf.removeLastChar('.');
// and null terminate
if ( ! nameBuf.nullTerm() ) return NULL;
// change all :'s in names to .'s since : is reserved!
char *px = nameBuf.getBufStart();
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
//for ( px = nameBuf.getBufStart(); *px ; px++ ) if ( *px == '-' ) *px = '_';
//
// DIFFBOT special field hacks
//
char *name = nameBuf.getBufStart();
hi->m_hashGroup = HASHGROUP_BODY;
if ( strstr(name,"title") )
hi->m_hashGroup = HASHGROUP_TITLE;
if ( strstr(name,"url") )
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"resolved_url") )
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"tags") )
hi->m_hashGroup = HASHGROUP_INTAG;
if ( strstr(name,"meta") )
hi->m_hashGroup = HASHGROUP_INMETATAG;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
//
// . get the value of the json field
// . if it's a number or bool it converts into a string
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
char tbuf[32];
// if the value is clearly a date, just hash it as
// a number, so use a temporary value that holds the
// time_t and hash with that... this will hash
// diffbot's article date field as a number so we can
// sortby and constrain by it in the search results
if ( name && (strcasecmp(name,"date") == 0 || strcasecmp(name,"estimatedDate") == 0)) {
// this is in HttpMime.cpp
int64_t tt = atotime1 ( val );
// we can't store 64-bit dates... so truncate to -2147483648
// which is Dec 13 1901. so we don't quite get the 1898 date
// for the new york times dbpedia entry. maybe if we added
// an extra termlist for more precision to indicate century or
// something.
if ( tt && tt < (int32_t)0x80000000 )
tt = (int32_t)0x80000000;
// likewise, we can't be too big, past 2038
if ( tt && tt > 0x7fffffff )
tt = (int32_t)0x7fffffff;
if ( tt ) {
// print out the time_t in ascii
vlen = sprintf(tbuf,"%"INT32"",(int32_t)tt);
// and point to it for hashing/indexing
val = tbuf;
}
}
//
// for deduping search results we set m_contentHash32 here for
// diffbot json objects.
// we can't do this here anymore, we have to set the
// contenthash in ::getContentHash32() because we need it to
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
// do an exact hash for now...
int32_t vh32 = hash32 ( val , vlen , m_niceness );
// accumulate, order independently
totalHash32 ^= nh32;
totalHash32 ^= vh32;
}
*/
// index like "title:whatever"
hi->m_prefix = name;
hashString ( val , vlen , hi );
//log("hashing json var as %s %s %d", name, val, vlen);
// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
if ( name )
hashFieldMatchTerm ( val , (int32_t)vlen , hi );
if ( ! hashWithoutFieldNames )
continue;
// hash without the field name as well
hi->m_prefix = NULL;
hashString ( val , vlen , hi );
/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;
// use prefix for this though
hi->m_prefix = name;
// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumber2 ( f , hi ) )
return NULL;
*/
}
//m_contentHash32 = totalHash32;
//m_contentHash32Valid = true;
return (char *)0x01;
}
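// Illustrative sketch (not part of the build): two behaviors of
// hashJSONFields2() worth calling out. (1) For a doc like
//   {"meta":{"twitter":{"title":"Cool Page"}}}
// the value is hashed with its compound name as the prefix (roughly
// "meta.twitter.title:cool" style terms) and again with no prefix.
// (2) "date"/"estimatedDate" values get squeezed into the signed
// 32-bit time_t range so they can be indexed as sortable numbers;
// ignoring the tt==0 "no date" case preserved above, that clamp is:
/*
static int32_t clampTimeToInt32 ( int64_t tt ) {
	// before Dec 13 1901, the earliest representable second
	if ( tt < (int64_t)(int32_t)0x80000000 )
		return (int32_t)0x80000000;
	// past Jan 19 2038, the latest representable second
	if ( tt > (int64_t)0x7fffffff )
		return (int32_t)0x7fffffff;
	return (int32_t)tt;
}
*/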
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
setStatus ( "hashing xml fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "xml object";
hi.m_hashGroup = HASHGROUP_BODY;
Xml *xml = getXml();
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
SafeBuf nameBuf;
// scan the xml nodes
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// . skip if it's a tag node, not a text node
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
// log("hey:hy");
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
// this is \0 terminated
char *tagName = nameBuf.getBufStart();
// get the utf8 text
char *val = nodes[i].m_node;
int32_t vlen = nodes[i].m_nodeLen;
// index like "title:whatever"
if ( tagName && tagName[0] ) {
hi.m_prefix = tagName;
hashString ( val , vlen , &hi );
}
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
}
return (char *)0x01;
}
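// Illustrative example (not part of the build): for xml like
//   <item><title>Cool Page</title></item>
// the text node "Cool Page" gets the compound name "item.title", so
// hashXMLFields() above indexes it once with that prefix (roughly
// "item.title:cool", "item.title:page") and once with no prefix,
// mirroring what hashJSONFields2() does for json fields.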
// if our url is that of a subdoc, then get the url of the parent doc
// from which we were a subsection
char *XmlDoc::getDiffbotParentUrl( char *myUrl ) {
// remove -diffbotxyz
if ( ! m_kbuf.safeStrcpy( myUrl ) ) return NULL;
char *p = m_kbuf.getBufStart();
char *s = strstr(p,"-diffbotxyz");
if ( s ) { *s = '\0'; return p; }
// temporarily until we inject "diffbotreply" uncomment this
/*
// otherwise i guess we got dan's format of -article|%"INT32"|%"INT32"
char *e = m_kbuf.getBuf() - 1;
for ( ; *e && is_digit(*e) ; e-- );
if ( *e != '|' ) return NULL;
e--;
for ( ; *e && is_digit(*e) ; e-- );
if ( *e != '|' ) return NULL;
e--;
// now to hyphen
char *estart = m_kbuf.getBufStart();
for ( ; e>estart && *e !='-' ; e-- );
if ( *e != '-' ) return NULL;
*e = '\0';
return p;
*/
return NULL;
}
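// Illustrative usage (not part of the build, "xd" is a hypothetical
// XmlDoc pointer): the "-diffbotxyz" suffix marks a diffbot
// subdocument url, so
//   char *parent = xd->getDiffbotParentUrl (
//       (char *)"http://host.com/page.html-diffbotxyz123456" );
// returns "http://host.com/page.html" (a pointer into m_kbuf), while
// urls without that suffix currently return NULL.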
bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
// sanity
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
storeFacetValuesSite ( qs, sb, fvh );
if ( m_hasMetadata) {
Json jpMetadata;
if (jpMetadata.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
storeFacetValuesJSON ( qs, sb, fvh, &jpMetadata );
}
}
// if "qa" is a gbxpathsitehash123456 type of beastie then we
// gotta scan the sections
if ( strncasecmp(qs,"gbxpathsitehash",15) == 0 )
return storeFacetValuesSections ( qs , sb , fvh );
// if a json doc, get json field
// spider status docs are really json now
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
return storeFacetValuesJSON ( qs , sb , fvh, getParsedJson());
if ( m_contentType == CT_HTML )
return storeFacetValuesHtml ( qs , sb , fvh );
if ( m_contentType == CT_XML )
return storeFacetValuesXml ( qs , sb , fvh );
return true;
}
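// Illustrative sketch (not part of the build): each of the
// storeFacetValues*() helpers below appends pairs of NUL-terminated
// strings to "sb":
//   <facet field>\0<value>\0
// where string facets prefix the value with its 32-bit hash and a
// comma ("12345,actual text") so the caller, e.g.
// Msg40::lookUpFacets(), can map hashes back to strings. A consumer
// could walk that buffer like so (hypothetical helper):
/*
static void printFacetPairs ( char *buf , int32_t bufLen ) {
	char *p    = buf;
	char *pend = buf + bufLen;
	while ( p < pend ) {
		char *field = p;
		p += gbstrlen(p) + 1;       // skip field + '\0'
		if ( p >= pend ) break;
		char *value = p;
		p += gbstrlen(p) + 1;       // skip value + '\0'
		log("facet %s = %s", field, value);
	}
}
*/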
// Store facet for site
bool XmlDoc::storeFacetValuesSite ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
char* val = getSite();
int vlen = gbstrlen(val);
FacetValHash_t val32 = hash32 ( val , vlen );
// skip if not for us
if ( fvh && val32 != fvh ) return false;
if ( strcmp("gbtagsite",qs) ) return false;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false;
if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
if ( ! sb->pushChar('\0') ) return false;
return true;
}
bool XmlDoc::storeFacetValuesSections ( char *qs , SafeBuf *sb ,
FacetValHash_t fvh ) {
// scan all sections
Sections *ss = getSections();
if ( ! ss ) return false;
if ( ss == (void *)-1 ) { char *xx=NULL;*xx=0; }
Words *ww = getWords();
if ( ! ww ) return false;
if ( ww == (void *)-1 ) { char *xx=NULL;*xx=0; }
int32_t siteHash32 = *getSiteHash32();
// qs is like gbxpathsitehash1234567
// so get the digit part
char *p = qs;
for ( ; *p && ! is_digit(*p); p++ );
uint64_t xsh = (uint64_t)atoll(p);
// NOTE: qs presumably points just past the "str:"/"int:" part of the
// full facet term (e.g. "gbfacetstr:..."), so peeking at qs-4 here,
// and in the other storeFacetValues*() functions below, stays inside
// that original term buffer.
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
Section *si = ss->m_rootSection;
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// is it a match?
uint64_t mod;
mod = (uint32_t)si->m_turkTagHash32;
mod ^= (uint32_t)siteHash32;
if ( mod != xsh ) continue;
// . then add facet VALUE
// . hash of the innerhtml of sentence
// . get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 ) continue;
// if a facetvalhash was provided we must match
if ( fvh && val32 != fvh ) continue;
// got one print the facet field
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
if ( isString && ! sb->safePrintf("%"UINT32",",val32) )
return false;
// but ALSO print the string itself, somewhat truncated
char *a = m_words.m_words[si->m_next->m_a];
char *b = m_words.m_words[si->m_next->m_b-1];
b += m_words.m_wordLens [si->m_next->m_b-1];
if ( ! sb->safeTruncateEllipsis (a,b-a,160) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// if wanted a specific string, we are done
if ( fvh ) return true;
}
return true;
}
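// Illustrative example (not part of the build): a sections facet term
// looks like "gbxpathsitehash3086947519" where the trailing number
// identifies one xpath on one site and is matched above against
// (si->m_turkTagHash32 ^ siteHash32). The digit scan at the top of
// this function recovers that number:
/*
	char    *qs  = (char *)"gbxpathsitehash3086947519";
	char    *p   = qs;
	for ( ; *p && ! is_digit(*p) ; p++ );
	uint64_t xsh = (uint64_t)atoll(p);   // 3086947519
*/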
bool XmlDoc::storeFacetValuesHtml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {
Xml *xml = getXml();
int32_t qsLen = gbstrlen(qs);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
// check for gblang:en etc.
// if ( isString && strncmp(qs,"gblang",6)==0 ) {
// if (!sb->safeStrcpy(qs) ) return false;
// if (!sb->pushChar('\0') ) return false;
// // find the lang that has that hash!
// if (!sb->safePrintf("%"UINT32",",(uint32_t)val32))return false;
// if (!sb->safeMemcpy(content,contentLen) ) return false;
// if (!sb->pushChar('\0') ) return false;
//}
char *content;
int32_t contentLen;
int32_t nameLen;
char *s;
int32_t i = 0;
bool uniqueField = false;
// a title tag can count now too
if ( strcmp(qs,"title") == 0 ) {
// skip leading spaces = false
content = xml->getString ("title",&contentLen,false);
uniqueField = true;
goto skip;
}
// scan the meta nodes for ones whose name attribute matches qs
for ( i = 0 ; i < xml->m_numNodes ; i++ ) {
// continue if not a meta tag
if ( xml->m_nodes[i].m_nodeId != TAG_META ) continue;
// . does it have a type field that's "summary"
// . <meta name=summary content="...">
// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
s = xml->getString ( i , "name", &nameLen );
// "s" can be "summary","description","keywords",...
if ( nameLen != qsLen ) continue;
if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
// point to the summary itself
content = xml->getString ( i , "content" , &contentLen );
if ( ! content || contentLen <= 0 ) continue;
skip:
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( content, contentLen);
if ( fvh && fvh != val32 ) continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
return false;
if ( !sb->safeMemcpy(content,contentLen) ) return false;
if ( !sb->pushChar('\0') ) return false;
// if only one specified, we are done
if ( fvh ) return true;
if ( uniqueField ) return true;
}
return true;
}
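// Illustrative example (not part of the build): for html containing
//   <title>Cool Page</title>
//   <meta name="description" content="A very cool page.">
// storeFacetValuesHtml() above with qs="title" stores the <title>
// text and stops after that single hit (uniqueField), while
// qs="description" keeps scanning and stores the content attribute of
// every matching <meta> tag, each written as field\0 value\0 with the
// "hash," prefix for string facets as described above.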
bool XmlDoc::storeFacetValuesXml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {
Xml *xml = getXml();
int32_t qsLen = gbstrlen(qs);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
int32_t i = 0;
bool uniqueField = false;
SafeBuf nameBuf;
// scan all the xml nodes for tags whose compound name matches qs
for ( i = 0 ; i < xml->m_numNodes ; i++ ) {
// skip text nodes
if ( xml->m_nodes[i].m_nodeId == 0 ) continue;
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
int32_t nameLen = nameBuf.length();
char *s = nameBuf.getBufStart();
// does the compound tag name match the requested facet field?
if ( nameLen != qsLen ) continue;
if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
// got a matching tag name; its text should be in the next node
if ( i + 1 >= xml->m_numNodes ) continue;
// point to the content! this is a text node?
// skip if not a text node, we don't return tag nodes i guess
if ( xml->m_nodes[i+1].m_nodeId ) continue;
char *content = xml->m_nodes[i+1].m_node;
int32_t contentLen = xml->m_nodes[i+1].m_nodeLen;
// skip if empty
if ( ! content || contentLen <= 0 ) continue;
// skip common cases too, like whitespace
if ( contentLen == 1 && is_wspace_a(content[0]) ) continue;
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( content, contentLen);
if ( fvh && fvh != val32 ) continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
return false;
if ( !sb->safeMemcpy(content,contentLen) ) return false;
if ( !sb->pushChar('\0') ) return false;
// if only one specified, we are done
if ( fvh ) return true;
if ( uniqueField ) return true;
}
return true;
}
bool XmlDoc::storeFacetValuesJSON (char *qs,
SafeBuf *sb,
FacetValHash_t fvh,
Json *jp ) {
JsonItem *ji = jp->getFirstItem();
char nb[1024];
SafeBuf nameBuf(nb,1024);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
// reset, but don't free mem etc. just set m_length to 0
nameBuf.reset();
// get its full compound name like "meta.twitter.title"
ji->getCompoundName ( nameBuf );
// skip if not for us
if ( strcmp(nameBuf.getBufStart(),qs) ) continue;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
//
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( val , vlen );
if ( fvh && val32 != fvh )
continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
return false;
if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// if wanted a specific string, then we are done
if ( fvh ) return true;
}
return true;
}
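// Illustrative example (not part of the build): storeFacetValuesJSON()
// matches on the full compound name, so for a doc like
//   {"product":{"price":"4.99","title":"Cool Widget"}}
// qs="product.price" stores "4.99" while qs="price" alone matches
// nothing, since getCompoundName() yields "product.price" for that
// item.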