Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
Conflicts: Spider.cpp
commit 18dbaf89c9
@@ -3573,6 +3573,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
     float respiderFreq = m_collectiveRespiderFrequency;
     if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;

+    // lower from 7 to 1 since we have so many collections now
+    int32_t diffbotipms = 1; // 7

     // make the gigablast regex table just "default" so it does not
     // filtering, but accepts all urls. we will add code to pass the urls
@@ -3587,10 +3589,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
     // domains it slows diffbot back-end down, so change this
     // from 100 to 7 if doing a bulk job
     if ( m_isCustomCrawl == 2 )
-        m_maxSpidersPerRule[i] = 7;
+        m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster

     m_spiderIpWaits [i] = wait;
-    m_spiderIpMaxSpiders[i] = 7; // keep it respectful
+    m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
     // ethan wants some speed
     if ( isEthan )
         m_spiderIpMaxSpiders[i] = 30;
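In plain terms the hunks above do three things: a non-positive collective respider frequency falls back to 3652.5 days, the per-rule spider count drops to 2 (7 only for bulk jobs, where m_isCustomCrawl == 2), and the per-IP cap becomes the new diffbotipms value of 1. A minimal standalone sketch of those defaults, using hypothetical names rather than the real CollectionRec members:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical restatement of the defaults set in rebuildUrlFiltersDiffbot().
    struct SpiderRuleDefaults {
        double  respiderFreqDays;
        int32_t maxSpidersPerRule;
        int32_t spiderIpMaxSpiders;
    };

    static SpiderRuleDefaults pickDefaults(double configuredRespiderFreq,
                                           bool isBulkJob, bool wantsSpeed) {
        SpiderRuleDefaults d;
        // A non-positive frequency means "unset": fall back to 3652.5 days,
        // roughly ten years, i.e. effectively never respider.
        d.respiderFreqDays = (configuredRespiderFreq <= 0.0) ? 3652.5
                                                             : configuredRespiderFreq;
        // Bulk jobs may run 7 spiders per rule; normal crawls stay at 2.
        d.maxSpidersPerRule = isBulkJob ? 7 : 2;
        // Per-IP cap lowered to 1 (diffbotipms), with a 30-spider override.
        d.spiderIpMaxSpiders = wantsSpeed ? 30 : 1;
        return d;
    }

    int main() {
        SpiderRuleDefaults d = pickDefaults(0.0, false, false);
        std::printf("respider=%.1f days, perRule=%d, perIp=%d\n",
                    d.respiderFreqDays, d.maxSpidersPerRule, d.spiderIpMaxSpiders);
        return 0;
    }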
Mem.cpp (2 changed lines)
@@ -537,7 +537,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
     if ( ! s_initialized ) {
         //m_memtablesize = m_maxMem / 6510;
         // support 1.2M ptrs for now. good for about 8GB
-        m_memtablesize = 1200*1024;//m_maxMem / 6510;
+        m_memtablesize = 3000*1024;//m_maxMem / 6510;
         //if ( m_maxMem < 8000000000 ) { char *xx=NULL;*xx=0; }
     }
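The only change here is the capacity of the allocation-tracking table: each outstanding allocation occupies one slot, so the table grows from about 1.2M to about 3M tracked pointers. A back-of-the-envelope check based on the existing comment that 1.2M pointers was "good for about 8GB", not on the actual Mem bookkeeping:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Old and new capacities of the allocation-tracking table.
        const int64_t oldSlots = 1200LL * 1024;   // ~1.2M tracked pointers
        const int64_t newSlots = 3000LL * 1024;   // ~3M tracked pointers
        // The old comment says 1.2M pointers covered roughly 8GB, so at a
        // similar average allocation size the new table covers roughly:
        const double oldCoverageGB = 8.0;
        const double newCoverageGB =
            oldCoverageGB * (double)newSlots / (double)oldSlots;
        std::printf("slots: %lld -> %lld, est. coverage: %.0f GB -> %.0f GB\n",
                    (long long)oldSlots, (long long)newSlots,
                    oldCoverageGB, newCoverageGB);
        return 0;
    }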
Msg3.cpp (2 changed lines)
@@ -785,7 +785,7 @@ bool Msg3::doneScanning ( ) {
     // if shutting down gb then limit to 20 so we can shutdown because
     // it can't shutdown until all threads are out of the queue i think
     if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
-        log("msg3: forcing retries to 0 because shutting down");
+        //log("msg3: forcing retries to 0 because shutting down");
         max = 0;
     }
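The behavior is unchanged: during shutdown the retry budget is still clamped to zero so read threads can drain, only the log line is silenced. A minimal sketch of the clamp, with a hypothetical isExiting flag standing in for g_process.m_mode == EXIT_MODE:

    #include <cstdio>

    // Hypothetical stand-in for the shutdown check in Msg3::doneScanning().
    static int clampRetries(int maxRetries, bool isExiting) {
        // While exiting, never schedule more read retries.
        if (isExiting && maxRetries < 0) return 0;
        return maxRetries;
    }

    int main() {
        std::printf("normal=%d exiting=%d\n",
                    clampRetries(-1, false), clampRetries(-1, true));
        return 0;
    }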
@@ -17851,7 +17851,8 @@ void Parms::init ( ) {
     // can use those to sort regular docs and not have spider reply
     // status docs in the serps.
     // back on 4/21/2015 seems pretty stable.
-    m->m_def = "1";
+    // but it uses disk space so turn off for now again. 6/16/2015
+    m->m_def = "0";
     m->m_page = PAGE_SPIDER;
     m->m_obj = OBJ_COLL;
     m->m_flags = PF_CLONE;
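Judging by the surrounding comments, this is the parameter that controls indexing of spider reply status docs; its page, object, and flags stay put and only the string default flips back to off. A toy sketch of this registration pattern with a simplified Parm struct and made-up constants, not the real Parms machinery:

    #include <cstdio>

    // Simplified stand-in for a Parm registered in Parms::init().
    struct Parm {
        const char *m_title;
        const char *m_def;   // default value kept as a string, like the real code
        int         m_page;
        int         m_flags;
    };

    int main() {
        const int PAGE_SPIDER = 7;   // hypothetical page id
        const int PF_CLONE    = 0x1; // hypothetical flag bit
        Parm p;
        p.m_title = "index spider replies";
        p.m_def   = "0";   // was "1"; turned off again to save disk space
        p.m_page  = PAGE_SPIDER;
        p.m_flags = PF_CLONE;
        std::printf("%s default=%s\n", p.m_title, p.m_def);
        return 0;
    }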
Rdb.cpp (11 changed lines)
@@ -1833,6 +1833,14 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
           m_rdbId == RDB_DOLEDB ||
           m_rdbId == RDB_SPIDERDB ||
           m_rdbId == RDB_REVDB ) ) {
+
+        // exception, spider status docs can be deleted from titledb
+        // if user turns off 'index spider replies' before doing
+        // the rebuild, when not rebuilding titledb.
+        if ( m_rdbId == RDB_TITLEDB &&
+             list->m_listSize == 12 )
+            goto exception;
+
         // allow banning of sites still
         //m_rdbId == RDB_TAGDB ) ) {
         log("db: How did an add come in while in repair mode?"
@@ -1840,6 +1848,9 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
         g_errno = EREPAIRING;
         return false;
     }
+
+ exception:
+
     /*
     if ( g_repair.isRepairActive() &&
          g_repair.m_fullRebuild &&
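The new early-out lets one specific kind of add through while repair mode otherwise rejects everything: a 12-byte titledb list, which is presumably a single bare key with no data attached, i.e. a delete. So the guard reads as "during repair, allow titledb deletes only", which is what turning off 'index spider replies' mid-rebuild produces. A compact sketch of that decision with hypothetical types in place of Rdb/RdbList:

    #include <cstdint>
    #include <cstdio>

    enum RdbId { RDB_TITLEDB = 1, RDB_POSDB = 2 };

    // Hypothetical stand-in for the RdbList member used above.
    struct FakeList { int32_t m_listSize; };

    // Returns true if an add may proceed while repair mode is active.
    static bool allowAddDuringRepair(RdbId rdbId, const FakeList &list) {
        // A 12-byte list is a single bare key (no data), i.e. a delete;
        // those are allowed so status docs can be removed during a rebuild.
        if (rdbId == RDB_TITLEDB && list.m_listSize == 12) return true;
        return false;
    }

    int main() {
        FakeList del  = { 12 };
        FakeList data = { 512 };
        std::printf("titledb delete: %d, titledb data: %d\n",
                    allowAddDuringRepair(RDB_TITLEDB, del),
                    allowAddDuringRepair(RDB_TITLEDB, data));
        return 0;
    }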
RdbBase.cpp (19 changed lines)
@@ -839,6 +839,22 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
     // this returns false and sets g_errno on error
     if ( ! m->generateMap ( f ) ) {
         log("db: Map generation failed.");
+        log("db: Moving .dat and .map file to trash dir");
+        SafeBuf tmp;
+        tmp.safePrintf("%s",f->getFilename());
+        // take off .dat and make it * so we can move map file
+        int32_t len = tmp.getLength();
+        char *str = tmp.getBufStart();
+        str[len-3] = '*';
+        str[len-2] = '\0';
+        SafeBuf cmd;
+        cmd.safePrintf("mv %s/%s %s/trash/",
+                       m_dir.getDir(),
+                       str,
+                       g_hostdb.m_dir);
+        log("db: %s",cmd.getBufStart() );
+        gbsystem ( cmd.getBufStart() );
+        exit(0);
         mdelete ( f , sizeof(BigFile),"RdbBase");
         delete (f);
         mdelete ( m , sizeof(RdbMap),"RdbBase");
@@ -1359,6 +1375,9 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
     if ( g_merge.m_isSuspended ) return;
     if ( g_merge2.m_isSuspended ) return;
+
+    // shutting down? do not start another merge then
+    if ( g_process.m_mode == EXIT_MODE ) return;

     // sanity checks
     if ( g_loop.m_inQuickPoll ) {
         log("rdb: cant attempt merge in quickpoll");
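The new error path builds the mv command by overwriting the "dat" extension in place (str[len-3] = '*'; str[len-2] = '\0'), so the resulting glob like posdb0001.* picks up both the .dat and the .map file before the process exits. A small sketch of the same filename surgery using std::string instead of SafeBuf; the directory names are made up, the real code uses m_dir.getDir() and g_hostdb.m_dir:

    #include <cstdio>
    #include <string>

    // Turn "posdb0001.dat" into the glob "posdb0001.*" the way the patch does,
    // by replacing the 3-char extension with a single '*'.
    static std::string datToGlob(const std::string &filename) {
        if (filename.size() < 4 || filename[filename.size() - 4] != '.')
            return filename;                  // not a ".xxx" name; leave it alone
        return filename.substr(0, filename.size() - 3) + "*";
    }

    int main() {
        std::string dataDir  = "/var/gigablast/coll0";   // hypothetical
        std::string trashDir = "/var/gigablast/trash";   // hypothetical
        std::string glob = datToGlob("posdb0001.dat");
        std::string cmd  = "mv " + dataDir + "/" + glob + " " + trashDir + "/";
        std::printf("%s\n", cmd.c_str());
        return 0;
    }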
RdbMap.cpp (11 changed lines)
@@ -323,10 +323,11 @@ bool RdbMap::verifyMap2 ( ) {
             KEYSET(lastKey,k,m_ks); continue; }
         // just bitch for now
         log(
-            "db: Key out of order in map file %s. "
+            "db: Key out of order in map file %s%s. "
             "page = %"INT32". key offset = %"INT64". Map or data file is "
-            "corrupt, but it is probably the data file.",
-            m_file.getFilename() ,
+            "corrupt, but it is probably the data file. Please "
+            "delete the map file and restart.",
+            m_file.m_dir,m_file.getFilename() ,
             i,(int64_t)m_pageSize*(int64_t)i+getOffset(i));

         //log("db: oldk.n1=%08"XINT32" n0=%016"XINT64"",
@@ -336,6 +337,7 @@ bool RdbMap::verifyMap2 ( ) {
             KEY1(lastKey,m_ks),KEY0(lastKey));
         log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
         log("db: m_numPages = %"INT32"",m_numPages);
+        exit(0);
         //char *xx=NULL;*xx=0;
         // was k too small?
         //if ( i + 1 < m_numPages && lastKey <= getKey(i+1) ) {
@@ -1371,6 +1373,9 @@ bool RdbMap::chopHead ( int32_t fileHeadSize ) {
 bool RdbMap::generateMap ( BigFile *f ) {
     reset();
     if ( g_conf.m_readOnlyMode ) return false;
+
+    log("db: Generating map for %s/%s",f->m_dir,f->getFilename());
+
     // we don't support headless datafiles right now
     if ( ! f->doesPartExist(0) ) {
         g_errno = EBADENGINEER;
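verifyMap2() walks the first key stored for each map page and expects them never to go backwards; the patch just makes the complaint more actionable (directory plus filename, and an instruction to delete the map so generateMap(), which now logs which file it is rebuilding, can regenerate it). A toy sketch of the ordering check itself over plain fixed-size byte keys, not the real RdbMap pages:

    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <vector>

    // Each "page" stores the first key on that page; keys are fixed-size byte
    // strings. The map is sane if those keys are non-decreasing.
    static int firstOutOfOrderPage(const std::vector<std::string> &pageKeys) {
        for (size_t i = 1; i < pageKeys.size(); i++)
            if (std::memcmp(pageKeys[i].data(), pageKeys[i - 1].data(),
                            pageKeys[i].size()) < 0)
                return (int)i;   // page whose key is below its predecessor's
        return -1;
    }

    int main() {
        // "abb" < "abc", so page 2 is out of order (a corrupt map or data file).
        std::vector<std::string> keys = { "aaa", "abc", "abb", "bzz" };
        int bad = firstOutOfOrderPage(keys);
        if (bad >= 0)
            std::printf("db: Key out of order in map file %s%s. page = %d\n",
                        "/var/gigablast/coll0/", "posdb0001.map", bad);
        return 0;
    }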
XmlDoc.cpp (286 changed lines)
@@ -20354,6 +20354,17 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
     sb->safePrintf("addstatusdocsize=%05"INT32" ",0);

+
+    if ( m_useSecondaryRdbs ) {
+        sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
+        sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
+        sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
+        sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
+        sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
+        if ( cr )
+            sb->safePrintf("indexspiderreplies=%i ",(int)
+                           cr->m_indexSpiderReplies);
+    }

     if ( size_imageData && m_imageDataValid ) {
         // url is in data now
         ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
@@ -21913,6 +21924,58 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
     // returning from a handler that had an error?
     if ( g_errno ) return NULL;

+    // if we are a spider status doc/titlerec and we are doing a rebuild
+    // operation, then keep it simple
+    if ( m_setFromTitleRec &&
+         m_useSecondaryRdbs &&
+         m_contentTypeValid &&
+         m_contentType == CT_STATUS ) {
+        // if not rebuilding posdb then done, list is empty since
+        // spider status docs do not contribute to linkdb, clusterdb,..
+        if ( ! m_usePosdb && ! m_useTitledb ) {
+            m_metaListValid = true;
+            return m_metaList;
+        }
+
+        /////////////
+        //
+        // if user disabled spider status docs then delete the titlerec
+        // AND the posdb index list from our dbs for this ss doc
+        //
+        /////////////
+        CollectionRec *cr = getCollRec();
+        if ( ! cr ) return NULL;
+        if ( ! cr->m_indexSpiderReplies ) {
+            int64_t uh48 = m_firstUrl.getUrlHash48();
+            // delete title rec. true = delete?
+            key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
+            // shortcut
+            SafeBuf *ssb = &m_spiderStatusDocMetaList;
+            // add to list. and we do not add the spider status
+            // doc to posdb since we deleted its titlerec.
+            ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
+            ssb->safeMemcpy ( &tkey , sizeof(key_t) );
+            m_metaList = ssb->getBufStart();
+            m_metaListSize = ssb->getLength ();
+            m_metaListValid = true;
+            return m_metaList;
+        }
+
+        // set safebuf to the json of the spider status doc
+        SafeBuf jd;
+        if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
+            return NULL;
+        // set m_spiderStatusDocMetaList from the json
+        if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
+            return NULL;
+        // TODO: support titledb rebuild as well
+        m_metaList = m_spiderStatusDocMetaList.getBufStart();
+        m_metaListSize = m_spiderStatusDocMetaList.getLength();
+        m_metaListValid = true;
+        return m_metaList;
+    }
+

     // any other indexing issue? hey! g_errno might not be set here
     //if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }

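When 'index spider replies' has been switched off, the rebuild path above emits a meta-list entry that is just a one-byte rdbId (RDB_TITLEDB) followed by a titledb key built with the delete flag, and no posdb keys at all, so the status doc disappears from both databases. A schematic sketch of that record layout; makeTitledbDeleteKey() and the 12-byte key layout here are purely illustrative, not the real g_titledb.makeKey():

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Illustrative 12-byte key, standing in for the real titledb key_t.
    struct Key96 { unsigned char b[12]; };

    // Hypothetical key builder; the real one is g_titledb.makeKey(docId, uh48, true).
    static Key96 makeTitledbDeleteKey(int64_t docId, int64_t uh48) {
        Key96 k;
        std::memset(k.b, 0, sizeof(k.b));
        std::memcpy(k.b + 0, &docId, 6);   // layout is made up for illustration
        std::memcpy(k.b + 6, &uh48, 6);
        k.b[0] &= 0xfe;                    // low bit clear = negative (delete) key
        return k;
    }

    int main() {
        std::vector<char> metaList;
        Key96 tkey = makeTitledbDeleteKey(123456789LL, 0xabcdef123456LL);
        metaList.push_back((char)1 /* RDB_TITLEDB */);            // 1-byte rdb id
        const char *p = (const char *)&tkey;
        metaList.insert(metaList.end(), p, p + sizeof(Key96));    // bare key, no data
        std::printf("meta list entry is %zu bytes\n", metaList.size());
        return 0;
    }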
@@ -22937,11 +23000,20 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
     // i guess it is safe to do this after getting the spiderreply
     SafeBuf *spiderStatusDocMetaList = NULL;
     //if ( indexReply ) {

     // get the spiderreply ready to be added to the rdbs w/ msg4
-    spiderStatusDocMetaList = getSpiderStatusDocMetaList (newsr,forDelete);
-    // block?
-    if ( ! spiderStatusDocMetaList ||
-         spiderStatusDocMetaList == (void *)-1)
+    // but if doing a rebuild operation then do not get it, we'll rebuild
+    // it since it will have its own titlerec
+    if ( ! m_useSecondaryRdbs ) {
+        spiderStatusDocMetaList =
+            getSpiderStatusDocMetaList (newsr,forDelete);
+        if ( ! spiderStatusDocMetaList ) {
+            log("build: ss doc metalist null. bad!");
+            return NULL;
+        }
+    }
+
+    if ( spiderStatusDocMetaList == (void *)-1)
         return (char *)spiderStatusDocMetaList;
     //}

@@ -24070,6 +24142,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
                    spiderStatusDocMetaList->getBufStart() ,
                    spiderStatusDocMetaList->length() );
         m_p += spiderStatusDocMetaList->length();
+        m_addedStatusDocSize = spiderStatusDocMetaList->length();
+        m_addedStatusDocSizeValid = true;
     }

     /*
@@ -27739,21 +27813,121 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
     // end the json spider status doc
     jd.safePrintf("\n}\n");

     // BEFORE ANY HASHING
     int32_t savedDist = m_dist;

+    // add the index list for it. it returns false and sets g_errno on err
+    // otherwise it sets m_spiderStatusDocMetaList
+    if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
+        return NULL;
+
+    // now make the titlerec
+    char xdhead[2048];
+    // just the head of it. this is the hacky part.
+    XmlDoc *xd = (XmlDoc *)xdhead;
+    // clear it out
+    memset ( xdhead, 0 , 2048);
+
+    // copy stuff from THIS so the spider reply "document" has the same
+    // header info stuff
+    int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
+    if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
+    gbmemcpy ( xdhead , (char *)this , hsize );
+
+    // override spider time in case we had error to be consistent
+    // with the actual SpiderReply record
+    //xd->m_spideredTime = reply->m_spideredTime;
+    //xd->m_spideredTimeValid = true;
+    // sanity
+    //if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
+
+    // this will cause the maroon box next to the search result to
+    // say "STATUS" similar to "PDF" "DOC" etc.
+    xd->m_contentType = CT_STATUS;
+
+    int32_t fullsize = &m_dummyEnd - (char *)this;
+    if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
+
+    /*
+    // the ptr_* were all zero'd out, put the ones we want to keep back in
+    SafeBuf tmp;
+    // was "Spider Status: %s" but that is unnecessary
+    tmp.safePrintf("<title>%s</title>",
+                   mstrerror(m_indexCode));
+
+    // if we are a dup...
+    if ( m_indexCode == EDOCDUP )
+        tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
+
+    if ( m_redirUrlPtr && m_redirUrlValid )
+        tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
+    */
+
+    // put stats like we log out from logIt
+    //tmp.safePrintf("<div style=max-width:800px;>\n");
+    // store log output into doc
+    //logIt(&tmp);
+    //tmp.safePrintf("\n</div>");
+
+    // the content is just the title tag above
+    // xd->ptr_utf8Content = tmp.getBufStart();
+    // xd->size_utf8Content = tmp.length()+1;
+    xd->ptr_utf8Content = jd.getBufStart();
+    xd->size_utf8Content = jd.length()+1;
+
+    // keep the same url as the doc we are the spider reply for
+    xd->ptr_firstUrl = ptr_firstUrl;
+    xd->size_firstUrl = size_firstUrl;
+
+    // serps need site, otherwise search results core
+    xd->ptr_site = ptr_site;
+    xd->size_site = size_site;
+
+    // if this is null then ip lookup failed i guess so just use
+    // the subdomain
+    if ( ! ptr_site && m_firstUrlValid ) {
+        xd->ptr_site = m_firstUrl.getHost();
+        xd->size_site = m_firstUrl.getHostLen();
+    }
+
+    // use the same uh48 of our parent
+    int64_t uh48 = m_firstUrl.getUrlHash48();
+    // then make into a titlerec but store in metalistbuf, not m_titleRec
+    SafeBuf titleRecBuf;
+    // this should not include ptrs that are NULL when compressing
+    // using its m_internalFlags1
+    if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
+        return NULL;
+
+    // concat titleRec to our posdb key records
+    if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
+        return NULL;
+    if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
+        return NULL;
+
+    // return the right val
+    m_dist = savedDist;
+
+    // ok, good to go, ready to add to posdb and titledb
+    m_spiderStatusDocMetaListValid = true;
+    return &m_spiderStatusDocMetaList;
+}
+
+
+bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {

     // the posdb table
     HashTableX tt4;
     if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
-        return NULL;
+        return false;


     Json jp2;
-    if (! jp2.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
+    if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
         g_errno = EBADJSONPARSER;
-        return NULL;
+        return false;
     }

     // BEFORE ANY HASHING
     int32_t savedDist = m_dist;
     // re-set to 0
     m_dist = 0;

@@ -27859,7 +28033,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
     */

     // store keys in safebuf then to make our own meta list
-    addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
+    addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );

     // debug this shit
     //SafeBuf tmpsb;
@@ -27868,97 +28042,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
     // &tmpsb );
     //logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());

-
-    // now make the titlerec
-    char xdhead[2048];
-    // just the head of it. this is the hacky part.
-    XmlDoc *xd = (XmlDoc *)xdhead;
-    // clear it out
-    memset ( xdhead, 0 , 2048);
-
-    // copy stuff from THIS so the spider reply "document" has the same
-    // header info stuff
-    int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
-    if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
-    gbmemcpy ( xdhead , (char *)this , hsize );
-
-    // override spider time in case we had error to be consistent
-    // with the actual SpiderReply record
-    //xd->m_spideredTime = reply->m_spideredTime;
-    //xd->m_spideredTimeValid = true;
-    // sanity
-    //if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
-
-    // this will cause the maroon box next to the search result to
-    // say "STATUS" similar to "PDF" "DOC" etc.
-    xd->m_contentType = CT_STATUS;
-
-    int32_t fullsize = &m_dummyEnd - (char *)this;
-    if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
-
-    /*
-    // the ptr_* were all zero'd out, put the ones we want to keep back in
-    SafeBuf tmp;
-    // was "Spider Status: %s" but that is unnecessary
-    tmp.safePrintf("<title>%s</title>",
-                   mstrerror(m_indexCode));
-
-    // if we are a dup...
-    if ( m_indexCode == EDOCDUP )
-        tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
-
-    if ( m_redirUrlPtr && m_redirUrlValid )
-        tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
-    */
-
-    // put stats like we log out from logIt
-    //tmp.safePrintf("<div style=max-width:800px;>\n");
-    // store log output into doc
-    //logIt(&tmp);
-    //tmp.safePrintf("\n</div>");
-
-    // the content is just the title tag above
-    // xd->ptr_utf8Content = tmp.getBufStart();
-    // xd->size_utf8Content = tmp.length()+1;
-    xd->ptr_utf8Content = jd.getBufStart();
-    xd->size_utf8Content = jd.length()+1;
-
-    // keep the same url as the doc we are the spider reply for
-    xd->ptr_firstUrl = ptr_firstUrl;
-    xd->size_firstUrl = size_firstUrl;
-
-    // serps need site, otherwise search results core
-    xd->ptr_site = ptr_site;
-    xd->size_site = size_site;
-
-    // if this is null then ip lookup failed i guess so just use
-    // the subdomain
-    if ( ! ptr_site && m_firstUrlValid ) {
-        xd->ptr_site = m_firstUrl.getHost();
-        xd->size_site = m_firstUrl.getHostLen();
-    }
-
-    // use the same uh48 of our parent
-    int64_t uh48 = m_firstUrl.getUrlHash48();
-    // then make into a titlerec but store in metalistbuf, not m_titleRec
-    SafeBuf titleRecBuf;
-    // this should not include ptrs that are NULL when compressing
-    // using its m_internalFlags1
-    if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
-        return NULL;
-
-    // concat titleRec to our posdb key records
-    if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
-        return NULL;
-    if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
-        return NULL;
-
-    // return the right val
-    m_dist = savedDist;
-
-    // ok, good to go, ready to add to posdb and titledb
-    m_spiderStatusDocMetaListValid = true;
-    return &m_spiderStatusDocMetaList;
+    return true;
 }

 // returns false and sets g_errno on error
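The block that moved here relies on a deliberate hack: only the fixed-size header of XmlDoc, everything up to ptr_firstUrl, is memcpy'd into a zeroed stack buffer, the ptr_/size_ fields are then pointed at the status doc's JSON and at the parent's url and site, and that partial object is serialized into a titlerec. A stripped-down illustration of that pattern with a small standard-layout struct instead of XmlDoc; the field names and values are invented:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Toy standard-layout "document" whose leading members form a copyable header.
    struct Doc {
        int32_t     m_docId;
        int32_t     m_contentType;   // header ends before the pointer section
        const char *ptr_content;     // pointer section starts here
        int32_t     size_content;
    };

    int main() {
        Doc parent = { 42, /*CT_HTML*/ 1, "<html>parent</html>", 20 };

        // Copy only the header (everything before the first ptr_ member),
        // the same way the patch copies up to &ptr_firstUrl.
        alignas(Doc) char buf[sizeof(Doc)];
        std::memset(buf, 0, sizeof(buf));
        size_t hsize = offsetof(Doc, ptr_content);
        std::memcpy(buf, &parent, hsize);

        Doc *statusDoc = (Doc *)buf;
        statusDoc->m_contentType = 7;                  // CT_STATUS in the real code
        statusDoc->ptr_content   = "{\"status\":0}";   // the JSON status body
        statusDoc->size_content  = 13;                 // strlen + 1, as in the patch

        std::printf("docid=%d type=%d body=%s\n",
                    statusDoc->m_docId, statusDoc->m_contentType,
                    statusDoc->ptr_content);
        return 0;
    }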
XmlDoc.h (1 changed line)
@@ -509,6 +509,7 @@ class XmlDoc {
     SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
                                           bool forDelete ) ;
     SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
+    bool setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t ssDocId ) ;
     SafeBuf m_spiderStatusDocMetaList;
     char *getIsAdult ( ) ;
     int32_t **getIndCatIds ( ) ;
main.cpp (2 changed lines)
@@ -6618,6 +6618,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
                "ch32=%010"UINT32" "
                "clen=%07"INT32" "
                "cs=%04d "
+               "ctype=%s "
                "lang=%02d "
                "sni=%03"INT32" "
                //"cats=%"INT32" "
@@ -6642,6 +6643,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
                xd->m_contentHash32,
                xd->size_utf8Content,//tr.getContentLen() ,
                xd->m_charset,//tr.getCharset(),
+               g_contentTypeStrings[xd->m_contentType],
                xd->m_langId,//tr.getLanguage(),
                (int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
                //nc,
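dumpTitledb gains a ctype=... column by indexing a string table with the stored numeric content type, which makes the dump grep-able by document type (html, pdf, and now status docs). A trimmed-down sketch of that lookup; the table contents here are illustrative, not the real g_contentTypeStrings:

    #include <cstdio>

    // Illustrative subset of a content-type string table.
    static const char *s_contentTypeStrings[] = {
        "unknown", "html", "text", "xml", "pdf", "doc", "status"
    };

    static const char *ctypeStr(int ct) {
        int n = (int)(sizeof(s_contentTypeStrings) / sizeof(s_contentTypeStrings[0]));
        return (ct >= 0 && ct < n) ? s_contentTypeStrings[ct] : "bad";
    }

    int main() {
        // One dump row, printed in the same key=value style as dumpTitledb.
        std::printf("ch32=%010u clen=%07d cs=%04d ctype=%s lang=%02d sni=%03d\n",
                    3735928559u, 18231, 106, ctypeStr(1), 2, 57);
        return 0;
    }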