Merge branch 'ia' into ia-zak

Matt 2015-05-10 09:41:55 -06:00
commit 29212bbe9c
12 changed files with 280 additions and 51 deletions

@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
@ -1939,12 +1951,13 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
@ -3472,6 +3485,70 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
@ -3913,17 +3990,20 @@ void testRegex ( ) {
rx = ".*?article[0-9]*?.html";
regex_t ucr;
int32_t err;
if ( regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) {
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
rx,errbuf);
}
logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

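The two rebuildUrlFiltersDiffbot() hunks and the testRegex() hunk above repeat the same compile-or-disable idiom, and testRegex() also fixes the error message to print the regerror() text instead of errno. A minimal standalone sketch of that idiom, assuming POSIX <regex.h>; compileOrDisable() is a hypothetical helper name, not a function in the Gigablast source:

#include <regex.h>
#include <stdio.h>

// returns true and leaves *re compiled on success; on failure it logs the
// regerror() message (instead of errno, the old testRegex bug), frees the
// partially compiled pattern and reports it as disabled, mirroring the hunks.
static bool compileOrDisable ( regex_t *re , const char *rx ) {
        int err = regcomp ( re , rx , REG_EXTENDED | REG_NEWLINE );
        if ( err == 0 ) return true;
        char errbuf[1024];
        regerror ( err , re , errbuf , sizeof(errbuf) );
        fprintf ( stderr , "coll: regcomp %s failed: %s. Ignoring.\n" , rx , errbuf );
        regfree ( re );
        return false;
}
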
@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
case EDNSREFUSED : return "DNS refused to talk";
case EDNSDEAD : return "DNS hostname does not exist";
case EDNSTIMEDOUT : return "DNS timed out";
case EDNSERROR : return "DNS lookup error";
case ECOLLTOOBIG : return "Collection is too long";
case ESTRIKEOUT : return "Retried enough times, deleting doc";
case ENOPERM : return "Permission Denied";

@ -112,6 +112,7 @@ enum {
EDNSREFUSED , //dns refused to talk to us
EDNSDEAD , //dns is dead
EDNSTIMEDOUT , //was just EUDPTIMEDOUT
EDNSERROR ,
ECOLLTOOBIG , //collection is too long
ESTRIKEOUT , //retried enough times; deleting doc & giving up
ENOPERM , //permission denied

@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//(int32_t)mem,size,h,s_n,note);
s_n++;
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
"used=%"INT64"",
size,note,(PTRTYPE)mem,m_used);
@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {
keepgoing:
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: rmMem (%"INT32"): "
"ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);

@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
int32_t minRecSize = m_minRecSizes[m_i];
// sanity check
if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
minRecSize < 0) ){
log( "minRecSize = %"INT32"", minRecSize );
char *xx=NULL; *xx=0;
}
// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
// minRecSize < 0) ){
// log( "minRecSize = %"INT32"", minRecSize );
// char *xx=NULL; *xx=0;
// }
//bool forceLocalIndexdb = true;
// if it is a no-split term, we may have to get it over the net
@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {
// like 90MB last time i checked. so it won't read more
// than that...
int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
// MDW: no, it's better to print oom than not give all the
// results, leaving users scratching their heads. besides,
// we should do docid range splitting before we go out of
// mem. we should also report the size of each termlist
// in bytes in the query info header.
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
int32_t minRecSizes = -1;
// start up the read. thread will wait in thread queue to
// launch if too many threads are out.
@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;
log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
"docIds!",
log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);
}

@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom than leave users scratching their
// heads as to why some results are not being returned.
rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
// get the jth query term

@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
// log("Msg5::readList: startKey > endKey warning");
// we no longer allow negative minRecSizes
if ( minRecSizes < 0 ) {
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
minRecSizes = 1;
char *xx = NULL; *xx = 0;
if ( g_conf.m_logDebugDb )
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
minRecSizes = 0x7fffffff;
//char *xx = NULL; *xx = 0;
}
// ensure startKey last bit clear, endKey last bit set
//if ( (startKey.n0 & 0x01) == 0x01 )

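Taken together, the Msg2, Msg3a and Msg5 hunks above agree on a sentinel convention: callers now pass -1 for minRecSizes to mean "read the whole termlist", and Msg5 treats any negative size as a 2GB cap instead of hitting the intentional crash. A small sketch of that normalization; the function name is illustrative, not from the source:

#include <stdint.h>

// illustrative only: -1 (or any negative minRecSizes) now means "no limit",
// which Msg5 interprets as a 0x7fffffff (2GB-1) read cap rather than coring.
static int32_t normalizeMinRecSizes ( int32_t minRecSizes ) {
        if ( minRecSizes < 0 ) return 0x7fffffff;
        return minRecSizes;
}
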
@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);
// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in that case, perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// only if we are a REINDEX, not a delete, because deletes
// don't need to spider/redownload the doc, so their
// distribution can stay more random
firstIp >>= 6;
firstIp &= 0xff;
}
// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) firstIp = 1;
// use a fake ip

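The reindex hunk above derives the fake firstIp from the docid's domain-hash bits (6-13) so that reindex requests for the same domain shard together. A self-contained sketch, using two made-up docids that share those bits:

#include <stdint.h>
#include <stdio.h>

// mirrors the hunk above: for a reindex (not a delete) keep only the
// domain-hash bits 6-13 of the docid so same-domain urls get the same
// fake firstIp, and therefore the same shard, for up to 256 shards.
static int32_t fakeFirstIp ( int64_t docId , bool forceDel ) {
        int32_t firstIp = (int32_t)(docId & 0x0000ffff);
        if ( ! forceDel ) { firstIp >>= 6; firstIp &= 0xff; }
        if ( firstIp == 0 ) firstIp = 1; // 0 is not a legit value
        return firstIp;
}

int main ( ) {
        // hypothetical docids that differ only in their low 6 bits
        printf ( "%d %d\n" , (int)fakeFirstIp(0x1a41,false) , (int)fakeFirstIp(0x1a7f,false) ); // same
        printf ( "%d %d\n" , (int)fakeFirstIp(0x1a41,true)  , (int)fakeFirstIp(0x1a7f,true)  ); // differ
        return 0;
}
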
@ -22598,15 +22598,25 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"to adjust how often you want things respidered."
"</td></tr>"
"<tr class=poo><td>indexage</td>"
"<td>"
"How long has it been since the url was last "
"successfully indexed? In seconds. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"
"<tr class=poo><td>urlage</td>"
"<td>"
"This is the time, in seconds, since a url was first "
"added to spiderdb to be spidered. This is "
"This uses the time, in seconds, since a url was "
"first added to spiderdb to be spidered, aka "
"its discovery date. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"
//"<tr class=poo><td>!newoutlink</td>"
//"<td>Matches if document is NOT a new outlink."
//"</td></tr>"

@ -207,9 +207,21 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
}
void SpiderReply::setKey (int32_t firstIp,
int64_t parentDocId,
// no need for parentdocid in this any more.
//int64_t parentDocId,
int64_t uh48,
bool isDel) {
// now we use a parentdocid of 1 for replies that were successful
int64_t parentDocId = 1;
// or 0 if there was an error. this way we only keep at most 2 SpiderReplies
// for each url in spiderdb. we need to keep the last successful
// spiderreply in spiderdb so
// SpiderRequest::m_lastSuccessfulSpideredTime will be valid.
// this way the reply that was successful will occur after the
// one that had an error, so we can just check the last spider reply
// when doing our scan in scanListForWinners().
if ( m_errCode ) parentDocId = 0;
m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
// set dataSize too!
m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
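
A compact restatement of the key policy introduced above: every reply for a url collapses onto one of two parentdocid values, so spiderdb retains at most two SpiderReplies per uh48 and the successful one sorts after the errored one. The helper below is an illustrative stand-in, not the real g_spiderdb.makeKey():

#include <stdint.h>

// illustrative stand-in for the policy in SpiderReply::setKey(): the
// parentdocid slot of the key is now just a success flag.
static int64_t replyParentDocId ( int32_t errCode ) {
        // 1 = last successful reply, 0 = last errored reply; a newer reply
        // with the same outcome overwrites the older one in spiderdb, so the
        // scan in scanListForWinners() only has to check the last reply.
        return errCode ? 0LL : 1LL;
}
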
@ -4565,6 +4577,13 @@ bool SpiderColl::scanListForWinners ( ) {
// assume our added time is the first time this url was added
sreq->m_discoveryTime = sreq->m_addedTime;
// record the last time we successfully indexed this doc, if any
if ( srep && ! srep->m_errCode )
sreq->m_lastSuccessfulSpideredTime =
srep->m_spideredTime;
else
sreq->m_lastSuccessfulSpideredTime = 0;
// if ( uh48 == 110582802025376LL )
// log("hey");
@ -4594,10 +4613,12 @@ bool SpiderColl::scanListForWinners ( ) {
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveryTime will be accurate.
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
if ( sreq->m_discoveryTime <
wsreq->m_discoveryTime )
wsreq->m_discoveryTime =
sreq->m_discoveryTime;
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
if ( wsreq->m_discoveryTime <
sreq->m_discoveryTime )
sreq->m_discoveryTime =
wsreq->m_discoveryTime;
}
@ -11313,6 +11334,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( *p != 'i' ) goto skipi;
if ( strncmp(p,"isinjected",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@ -11923,6 +11945,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
// non-boolean junk
skipi:
@ -12407,6 +12430,32 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
// constraint for last time url was successfully indexed
if ( *p=='i' && strncmp(p,"indexage",8) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if never successfully indexed, skip this one
if ( sreq->m_lastSuccessfulSpideredTime == 0) continue;
int32_t age;
age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
// the argument entered by user
int32_t uage = atoi(s) ;
if ( sign == SIGN_EQ && age != uage ) continue;
if ( sign == SIGN_NE && age == uage ) continue;
if ( sign == SIGN_GT && age <= uage ) continue;
if ( sign == SIGN_LT && age >= uage ) continue;
if ( sign == SIGN_GE && age < uage ) continue;
if ( sign == SIGN_LE && age > uage ) continue;
// skip over 'indexage'
p += 8;
p = strstr(s, "&&");
//if no '&&' follows then this rule is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// selector using the first time it was added to the Spiderdb
// added by Sam, May 5th 2015
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
@ -12430,6 +12479,8 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
// skip over 'urlage'
p += 6;
p = strstr(s, "&&");
//if no '&&' follows then this rule is a match
if ( ! p ) return i;

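The new indexage rule depends on the m_lastSuccessfulSpideredTime that scanListForWinners() now copies from the last successful SpiderReply. A minimal sketch of how one comparison is evaluated; the struct and function names here are illustrative, not from the source:

#include <stdint.h>

// illustrative struct standing in for SpiderRequest
struct Req { int32_t m_lastSuccessfulSpideredTime; };

// evaluates an "indexage > uage" term the way the hunk above does: a url that
// has never been successfully indexed (time == 0) never matches.
static bool indexAgeGreaterThan ( const Req *sreq , int32_t uage , int32_t nowGlobal ) {
        if ( sreq->m_lastSuccessfulSpideredTime == 0 ) return false;
        int32_t age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
        return age > uage;
}

For example, a url-filter expression such as indexage>604800 (seconds) would select urls whose last successful index is over a week old, using the comparison operators documented in the Parms.cpp hunk; urlage works the same way but measures from the url's discovery date instead.
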
@ -532,7 +532,11 @@ class SpiderRequest {
// then we increment the last 8 bits or so. see Msg22.cpp.
//int64_t m_probDocId;
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_reservedc2;
// if there is a 'successful' SpiderReply for this url then this is
// the SpiderReply::m_spideredTime of the most recent one.
int32_t m_lastSuccessfulSpideredTime;
//int32_t m_parentPubDate;
@ -955,7 +959,7 @@ class SpiderReply {
void reset() { memset ( this , 0 , sizeof(SpiderReply) ); };
void setKey ( int32_t firstIp,
int64_t parentDocId ,
//int64_t parentDocId ,
int64_t uh48 ,
bool isDel ) ;

@ -1859,7 +1859,10 @@ bool XmlDoc::set2 ( char *titleRec ,
//m_hasContactInfoValid = true;
// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) { char *xx=NULL;*xx=0; }
if ( ! ptr_site ) {
log("set4: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
@ -2534,16 +2537,50 @@ bool XmlDoc::indexDoc ( ) {
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
m_metaList2.pushChar(rd);
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
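
The indexDoc() hunk above briefly overrides m_indexCode with EDNSERROR so the generated spider status doc records why the crawl never started, then restores the real code. A generic sketch of that save/override/restore idiom; the types and names here are hypothetical:

#include <stdint.h>

// hypothetical stand-ins; only the idiom matches the hunk above
struct Doc { int32_t m_indexCode; };

static bool addStatusDocWithReason ( Doc *d , int32_t reason ,
                                     bool (*makeStatusDoc)(Doc *) ) {
        int32_t saved = d->m_indexCode;  // save the real index code
        d->m_indexCode = reason;         // e.g. EDNSERROR, so the status doc
                                         // explains why the crawl failed to start
        bool ok = makeStatusDoc ( d );   // build the gbss* status doc meta list
        d->m_indexCode = saved;          // revert before continuing
        return ok;
}
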
@ -23044,7 +23081,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
srep.m_domHash32 = m_sreq.m_domHash32;
srep.m_spideredTime = getTimeGlobal();
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = 0LL;
//int64_t parentDocId = 0LL;
srep.m_contentHash32 = 0;
// were we already in titledb before we started spidering?
// yes otherwise we would have called "goto skip9" above
@ -23054,7 +23091,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
srep.m_isIndexedINValid = false;
srep.m_errCode = EREINDEXREDIR; // indexCode
srep.m_downloadEndTime = 0;
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
srep.setKey ( srep.m_firstIp, /*parentDocId ,*/uh48 , false );
// lock of request needs to match that of reply so the
// reply, when received by Rdb.cpp which calls addSpiderReply()
// can unlock this url so it can be spidered again.
@ -25945,7 +25982,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
// set the key, m_srep.m_key
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
m_srep.setKey ( firstIp, /*parentDocId ,*/ uh48 , false );
// . did we download a page? even if indexcode is set we might have
// . if this is non-zero that means it's valid
@ -28372,7 +28409,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
setStatus ( "making spider reply meta list");
@ -28393,8 +28430,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
@ -28430,7 +28467,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
@ -28465,12 +28502,17 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
@ -28514,12 +28556,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);
// crawlbot round
if ( cr->m_isCustomCrawl )
@ -28946,6 +28991,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
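
The last few getSpiderStatusDocMetaList2() hunks converge on one pattern: each gbss* field is emitted only when its backing *Valid flag is set, and the status code/message fall back to -1/"???" when m_indexCode has not been computed yet. A stripped-down sketch of that guard pattern, with plain printf standing in for SafeBuf::safePrintf and an illustrative struct in place of XmlDoc state:

#include <stdio.h>
#include <stdint.h>

struct Status {                      // illustrative subset of XmlDoc state
        bool indexCodeValid , docIdValid , hopCountValid;
        int32_t indexCode , hopCount;
        int64_t docId;
};

static void printStatusJson ( const Status *s ) {
        if ( s->indexCodeValid )
                // (the real code also prints gbssStatusMsg from mstrerror() here)
                printf ( "\"gbssStatusCode\":%d,\n" , (int)s->indexCode );
        else {
                printf ( "\"gbssStatusCode\":-1,\n" );
                printf ( "\"gbssStatusMsg\":\"???\",\n" );
        }
        if ( s->docIdValid )
                printf ( "\"gbssDocId\":%lld,\n" , (long long)s->docId );
        if ( s->hopCountValid )
                printf ( "\"gbssHopCount\":%d,\n" , (int)s->hopCount );
}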