Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Conflicts:
	Spider.cpp
Matt Wells 2015-06-18 08:40:53 -07:00
commit 18dbaf89c9
10 changed files with 234 additions and 109 deletions

@@ -3573,6 +3573,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
float respiderFreq = m_collectiveRespiderFrequency;
if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
// lower from 7 to 1 since we have so many collections now
int32_t diffbotipms = 1; // 7
// make the gigablast regex table just "default" so it does no
// filtering, but accepts all urls. we will add code to pass the urls
@@ -3587,10 +3589,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// domains it slows diffbot back-end down, so change this
// from 100 to 7 if doing a bulk job
if ( m_isCustomCrawl == 2 )
m_maxSpidersPerRule[i] = 7;
m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;

@@ -537,7 +537,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
if ( ! s_initialized ) {
//m_memtablesize = m_maxMem / 6510;
// support 1.2M ptrs for now. good for about 8GB
m_memtablesize = 1200*1024;//m_maxMem / 6510;
m_memtablesize = 3000*1024;//m_maxMem / 6510;
//if ( m_maxMem < 8000000000 ) { char *xx=NULL;*xx=0; }
}

@@ -785,7 +785,7 @@ bool Msg3::doneScanning ( ) {
// if shutting down gb then limit to 20 so we can shut down because
// it can't shut down until all threads are out of the queue i think
if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
log("msg3: forcing retries to 0 because shutting down");
//log("msg3: forcing retries to 0 because shutting down");
max = 0;
}

@@ -17851,7 +17851,8 @@ void Parms::init ( ) {
// can use those to sort regular docs and not have spider reply
// status docs in the serps.
// back on 4/21/2015 seems pretty stable.
m->m_def = "1";
// but it uses disk space so turn off for now again. 6/16/2015
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;

Rdb.cpp

@@ -1833,6 +1833,14 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
m_rdbId == RDB_DOLEDB ||
m_rdbId == RDB_SPIDERDB ||
m_rdbId == RDB_REVDB ) ) {
// exception, spider status docs can be deleted from titledb
// if user turns off 'index spider replies' before doing
// the rebuild, when not rebuilding titledb.
if ( m_rdbId == RDB_TITLEDB &&
list->m_listSize == 12 )
goto exception;
// allow banning of sites still
//m_rdbId == RDB_TAGDB ) ) {
log("db: How did an add come in while in repair mode?"
@@ -1840,6 +1848,9 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = EREPAIRING;
return false;
}
exception:
/*
if ( g_repair.isRepairActive() &&
g_repair.m_fullRebuild &&
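
A hedged aside on the listSize == 12 check in the hunk above (my reading, not repo code): a titledb key is 96 bits and a delete op is serialized as the bare key with no payload, so a single-delete list is exactly 12 bytes long. That is how this change lets "delete the stale spider-status titlerec" through while repair mode still rejects ordinary adds. A minimal sketch, assuming gb's 12-byte key layout and not the real RdbList API:

#include <cstdint>
#include <cstdio>

static const int32_t GB_KEY96_SIZE = 12;   // assumed sizeof(key_t) for titledb keys

// a delete record carries only the key itself -- no dataSize, no data --
// so a list holding exactly one delete is GB_KEY96_SIZE bytes long
bool isLoneTitledbDelete ( int32_t listSize ) {
	return listSize == GB_KEY96_SIZE;
}

int main () {
	printf ( "12-byte list is a lone delete: %d\n" , (int)isLoneTitledbDelete(12) );
	return 0;
}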

@@ -839,6 +839,22 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
// this returns false and sets g_errno on error
if ( ! m->generateMap ( f ) ) {
log("db: Map generation failed.");
log("db: Moving .dat and .map file to trash dir");
SafeBuf tmp;
tmp.safePrintf("%s",f->getFilename());
// take off .dat and make it * so we can move map file
int32_t len = tmp.getLength();
char *str = tmp.getBufStart();
str[len-3] = '*';
str[len-2] = '\0';
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
m_dir.getDir(),
str,
g_hostdb.m_dir);
log("db: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
exit(0);
mdelete ( f , sizeof(BigFile),"RdbBase");
delete (f);
mdelete ( m , sizeof(RdbMap),"RdbBase");
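
A hedged illustration of the filename rewrite in the hunk above (made-up file and directory names, not the RdbBase code): overwriting the last three characters of "posdb0001.dat" turns it into "posdb0001.*", so a single mv can sweep both the .dat file and its .map into the trash dir.

// sketch only: shows the len-3 / len-2 overwrite on a hypothetical name
#include <cstdio>
#include <cstring>

int main () {
	char name[64] = "posdb0001.dat";      // hypothetical data file
	size_t len = strlen ( name );
	name[len-3] = '*';                    // "dat" -> "*"
	name[len-2] = '\0';                   // terminate right after the '*'
	printf ( "mv ./coll.main.0/%s ./trash/\n" , name );  // -> mv ... posdb0001.*
	return 0;
}
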
@@ -1359,6 +1375,9 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( g_merge.m_isSuspended ) return;
if ( g_merge2.m_isSuspended ) return;
// shutting down? do not start another merge then
if ( g_process.m_mode == EXIT_MODE ) return;
// sanity checks
if ( g_loop.m_inQuickPoll ) {
log("rdb: cant attempt merge in quickpoll");

@@ -323,10 +323,11 @@ bool RdbMap::verifyMap2 ( ) {
KEYSET(lastKey,k,m_ks); continue; }
// just bitch for now
log(
"db: Key out of order in map file %s. "
"db: Key out of order in map file %s%s. "
"page = %"INT32". key offset = %"INT64". Map or data file is "
"corrupt, but it is probably the data file.",
m_file.getFilename() ,
"corrupt, but it is probably the data file. Please "
"delete the map file and restart.",
m_file.m_dir,m_file.getFilename() ,
i,(int64_t)m_pageSize*(int64_t)i+getOffset(i));
//log("db: oldk.n1=%08"XINT32" n0=%016"XINT64"",
@@ -336,6 +337,7 @@ bool RdbMap::verifyMap2 ( ) {
KEY1(lastKey,m_ks),KEY0(lastKey));
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
log("db: m_numPages = %"INT32"",m_numPages);
exit(0);
//char *xx=NULL;*xx=0;
// was k too small?
//if ( i + 1 < m_numPages && lastKey <= getKey(i+1) ) {
@@ -1371,6 +1373,9 @@ bool RdbMap::chopHead ( int32_t fileHeadSize ) {
bool RdbMap::generateMap ( BigFile *f ) {
reset();
if ( g_conf.m_readOnlyMode ) return false;
log("db: Generating map for %s/%s",f->m_dir,f->getFilename());
// we don't support headless datafiles right now
if ( ! f->doesPartExist(0) ) {
g_errno = EBADENGINEER;

@@ -20354,6 +20354,17 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("addstatusdocsize=%05"INT32" ",0);
if ( m_useSecondaryRdbs ) {
sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
if ( cr )
sb->safePrintf("indexspiderreplies=%i ",(int)
cr->m_indexSpiderReplies);
}
if ( size_imageData && m_imageDataValid ) {
// url is in data now
ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
@@ -21913,6 +21924,58 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// returning from a handler that had an error?
if ( g_errno ) return NULL;
// if we are a spider status doc/titlerec and we are doing a rebuild
// operation, then keep it simple
if ( m_setFromTitleRec &&
m_useSecondaryRdbs &&
m_contentTypeValid &&
m_contentType == CT_STATUS ) {
// if not rebuilding posdb then done, list is empty since
// spider status docs do not contribute to linkdb, clusterdb,..
if ( ! m_usePosdb && ! m_useTitledb ) {
m_metaListValid = true;
return m_metaList;
}
/////////////
//
// if user disabled spider status docs then delete the titlerec
// AND the posdb index list from our dbs for this ss doc
//
/////////////
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_indexSpiderReplies ) {
int64_t uh48 = m_firstUrl.getUrlHash48();
// delete title rec. true = delete?
key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
// shortcut
SafeBuf *ssb = &m_spiderStatusDocMetaList;
// add to list. and we do not add the spider status
// doc to posdb since we deleted its titlerec.
ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
ssb->safeMemcpy ( &tkey , sizeof(key_t) );
m_metaList = ssb->getBufStart();
m_metaListSize = ssb->getLength ();
m_metaListValid = true;
return m_metaList;
}
// set safebuf to the json of the spider status doc
SafeBuf jd;
if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
return NULL;
// set m_spiderStatusDocMetaList from the json
if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
return NULL;
// TODO: support titledb rebuild as well
m_metaList = m_spiderStatusDocMetaList.getBufStart();
m_metaListSize = m_spiderStatusDocMetaList.getLength();
m_metaListValid = true;
return m_metaList;
}
// any other indexing issue? hey! g_errno might not be set here
//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }
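
A hedged sketch of what the pushChar(RDB_TITLEDB) + safeMemcpy(&tkey,...) pair above serializes (layout inferred; the enum value below is a placeholder, not gb's real one): each meta-list entry is a one-byte rdbId tag followed by the raw key, and a delete carries no payload, so this entry is 1 + 12 = 13 bytes.

// illustrative builder only; not the SafeBuf-based code from the hunk
#include <cstdint>
#include <vector>

enum { FAKE_RDB_TITLEDB = 3 };   // placeholder tag value

void appendTitledbDelete ( std::vector<char> &metaList , const char key96[12] ) {
	metaList.push_back ( (char)FAKE_RDB_TITLEDB );            // 1-byte rdbId tag
	metaList.insert ( metaList.end() , key96 , key96 + 12 );  // bare delete key
}

int main () {
	std::vector<char> ml;
	char tkey[12] = {0};
	appendTitledbDelete ( ml , tkey );
	return (int)( ml.size() != 13 );   // 13 bytes: tag + 96-bit key
}
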
@@ -22937,11 +23000,20 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// i guess it is safe to do this after getting the spiderreply
SafeBuf *spiderStatusDocMetaList = NULL;
//if ( indexReply ) {
// get the spiderreply ready to be added to the rdbs w/ msg4
spiderStatusDocMetaList = getSpiderStatusDocMetaList (newsr,forDelete);
// block?
if ( ! spiderStatusDocMetaList ||
spiderStatusDocMetaList == (void *)-1)
// but if doing a rebuild operation then do not get it, we'll rebuild
// it since it will have its own titlerec
if ( ! m_useSecondaryRdbs ) {
spiderStatusDocMetaList =
getSpiderStatusDocMetaList (newsr,forDelete);
if ( ! spiderStatusDocMetaList ) {
log("build: ss doc metalist null. bad!");
return NULL;
}
}
if ( spiderStatusDocMetaList == (void *)-1)
return (char *)spiderStatusDocMetaList;
//}
@@ -24070,6 +24142,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
spiderStatusDocMetaList->getBufStart() ,
spiderStatusDocMetaList->length() );
m_p += spiderStatusDocMetaList->length();
m_addedStatusDocSize = spiderStatusDocMetaList->length();
m_addedStatusDocSizeValid = true;
}
/*
@@ -27739,21 +27813,121 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
// end the json spider status doc
jd.safePrintf("\n}\n");
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// add the index list for it. it returns false and sets g_errno on err
// otherwise it sets m_spiderStatusDocMetaList
if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
return NULL;
// now make the titlerec
char xdhead[2048];
// just the head of it. this is the hacky part.
XmlDoc *xd = (XmlDoc *)xdhead;
// clear it out
memset ( xdhead, 0 , 2048);
// copy stuff from THIS so the spider reply "document" has the same
// header info stuff
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
gbmemcpy ( xdhead , (char *)this , hsize );
// override spider time in case we had error to be consistent
// with the actual SpiderReply record
//xd->m_spideredTime = reply->m_spideredTime;
//xd->m_spideredTimeValid = true;
// sanity
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
// this will cause the maroon box next to the search result to
// say "STATUS" similar to "PDF" "DOC" etc.
xd->m_contentType = CT_STATUS;
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
tmp.safePrintf("<title>%s</title>",
mstrerror(m_indexCode));
// if we are a dup...
if ( m_indexCode == EDOCDUP )
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
// store log output into doc
//logIt(&tmp);
//tmp.safePrintf("\n</div>");
// the content is just the title tag above
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
xd->size_firstUrl = size_firstUrl;
// serps need site, otherwise search results core
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
SafeBuf titleRecBuf;
// this should not include ptrs that are NULL when compressing
// using its m_internalFlags1
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
return NULL;
// concat titleRec to our posdb key records
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
return NULL;
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
return NULL;
// return the right val
m_dist = savedDist;
// ok, good to go, ready to add to posdb and titledb
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return NULL;
return false;
Json jp2;
if (! jp2.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return NULL;
return false;
}
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// re-set to 0
m_dist = 0;
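
A hedged illustration of the xdhead trick used in getSpiderStatusDocMetaList2 above (toy struct and sizes, not the real XmlDoc layout): copy only the flat members that sit before the first ptr_ field into a zeroed stack buffer, then re-point the handful of ptr_/size_ fields the fake status doc actually needs.

// sketch under assumed layout; mirrors the buffer-cast hack, not production code
#include <cstddef>
#include <cstring>

struct Doc {
	long  m_docId;
	char  m_contentType;
	// ---- pointer section starts here; everything above is the "head" ----
	const char *ptr_firstUrl;
	int         size_firstUrl;
};

int main () {
	Doc src = { 1234 , 'S' , "http://example.com/" , 20 };
	alignas(Doc) char head[64];
	memset ( head , 0 , sizeof(head) );
	size_t hsize = offsetof ( Doc , ptr_firstUrl );  // bytes before first ptr_
	memcpy ( head , &src , hsize );                  // clone just the header part
	Doc *fake = (Doc *)head;
	fake->ptr_firstUrl  = src.ptr_firstUrl;          // keep the parent's url
	fake->size_firstUrl = src.size_firstUrl;
	return fake->m_docId == 1234 ? 0 : 1;
}
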
@@ -27859,7 +28033,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
*/
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );
// debug this shit
//SafeBuf tmpsb;
@@ -27868,97 +28042,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
// &tmpsb );
//logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());
// now make the titlerec
char xdhead[2048];
// just the head of it. this is the hacky part.
XmlDoc *xd = (XmlDoc *)xdhead;
// clear it out
memset ( xdhead, 0 , 2048);
// copy stuff from THIS so the spider reply "document" has the same
// header info stuff
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
gbmemcpy ( xdhead , (char *)this , hsize );
// override spider time in case we had error to be consistent
// with the actual SpiderReply record
//xd->m_spideredTime = reply->m_spideredTime;
//xd->m_spideredTimeValid = true;
// sanity
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
// this will cause the maroon box next to the search result to
// say "STATUS" similar to "PDF" "DOC" etc.
xd->m_contentType = CT_STATUS;
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
tmp.safePrintf("<title>%s</title>",
mstrerror(m_indexCode));
// if we are a dup...
if ( m_indexCode == EDOCDUP )
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
// store log output into doc
//logIt(&tmp);
//tmp.safePrintf("\n</div>");
// the content is just the title tag above
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
xd->size_firstUrl = size_firstUrl;
// serps need site, otherwise search results core
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
SafeBuf titleRecBuf;
// this should not include ptrs that are NULL when compressing
// using its m_internalFlags1
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
return NULL;
// concat titleRec to our posdb key records
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
return NULL;
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
return NULL;
// return the right val
m_dist = savedDist;
// ok, good to go, ready to add to posdb and titledb
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
return true;
}
// returns false and sets g_errno on error

@@ -509,6 +509,7 @@ class XmlDoc {
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
bool setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t ssDocId ) ;
SafeBuf m_spiderStatusDocMetaList;
char *getIsAdult ( ) ;
int32_t **getIndCatIds ( ) ;

@@ -6618,6 +6618,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
"ch32=%010"UINT32" "
"clen=%07"INT32" "
"cs=%04d "
"ctype=%s "
"lang=%02d "
"sni=%03"INT32" "
//"cats=%"INT32" "
@@ -6642,6 +6643,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
xd->m_contentHash32,
xd->size_utf8Content,//tr.getContentLen() ,
xd->m_charset,//tr.getCharset(),
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
//nc,