Merge branch 'testing'
This commit is contained in:
commit
fa7216f978
Address.cppAddress.hAutoBan.cppBigFile.cppCachedb.cppCatdb.cppClusterdb.cppCollectiondb.cppCollectiondb.hConf.cppConf.hDatedb.cppDates.cppDiskPageCache.cppErrno.cppErrno.hHashTableX.hHostdb.cppHostdb.hHttpRequest.cppHttpRequest.hHttpServer.cppImages.cppImages.hIndexdb.cppIndexdb.hJson.cppJson.hLICENSELinkdb.cppLinkdb.hMake.dependMakefileMem.cppMonitordb.cppMsg0.cppMsg0.hMsg1.cppMsg1.hMsg13.cppMsg13.hMsg17.cppMsg17.hMsg2.cppMsg2.hMsg20.cppMsg20.hMsg22.cppMsg3.cppMsg3.hMsg36.cppMsg36.hMsg37.cppMsg37.hMsg39.cppMsg39.hMsg3a.cppMsg3a.hMsg4.cppMsg4.hMsg40.cppMsg40.hMsg5.cppMsg5.hMsg51.cppMsg51.hMsg8b.cppMsg9b.cppMsge0.cppMsge0.hMsge1.cppPageAddUrl.cppPageBasic.cppPageCatdb.cppPageCrawlBot.cppPageCrawlBot.hPageDirectory.cppPageEvents.cppPageGet.cppPageHosts.cppPageIndexdb.cppPageInject.cppPageLogView.cppPageLogin.cppPageOverview.cppPageParser.cppPageParser.hPageReindex.cppPageReindex.hPageResults.cppPageResults.hPageRoot.cppPageSockets.cppPageStats.cppPageThesaurus.cppPageTitledb.cppPages.cppPages.hParms.cppParms.h
28
Address.cpp
28
Address.cpp
@ -646,7 +646,8 @@ bool Addresses::set ( Sections *sections ,
|
||||
TagRec *gr ,
|
||||
Url *url ,
|
||||
long long docId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
long domHash32 ,
|
||||
long ip ,
|
||||
//long tagPairHash ,
|
||||
@ -678,7 +679,7 @@ bool Addresses::set ( Sections *sections ,
|
||||
m_gr = gr;
|
||||
m_url = url;
|
||||
m_docId = docId;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_domHash32 = domHash32;
|
||||
m_ip = ip;
|
||||
//m_tagPairHash = tagPairHash;
|
||||
@ -1090,7 +1091,7 @@ bool Addresses::set ( Sections *sections ,
|
||||
// parsing consistency
|
||||
if ( //! m_addressReplyValid &&
|
||||
! m_msg2c->verifyAddresses ( this ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
m_domHash32 ,
|
||||
m_ip ,
|
||||
m_niceness ,
|
||||
@ -10257,7 +10258,7 @@ void Addresses::print ( SafeBuf *pbuf , long long uh64 ) {
|
||||
"be a KEY in placedb. So you generally need two "
|
||||
"places inlining the same name before that will "
|
||||
"happen.</i>");
|
||||
pbuf->safePrintf("</br>\n");
|
||||
pbuf->safePrintf("<br>\n");
|
||||
|
||||
}
|
||||
|
||||
@ -16102,7 +16103,8 @@ void Msg2c::reset() {
|
||||
// into the TitleRec for re-parsing purposes later on, so we consistently
|
||||
// re-parse
|
||||
bool Msg2c::verifyAddresses ( Addresses *aa ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
long domHash32 ,
|
||||
long ip ,
|
||||
long niceness ,
|
||||
@ -16111,7 +16113,7 @@ bool Msg2c::verifyAddresses ( Addresses *aa ,
|
||||
|
||||
m_niceness = niceness;
|
||||
m_addresses = aa;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_domHash32 = domHash32;
|
||||
m_ip = ip;
|
||||
m_callback = callback;
|
||||
@ -16255,9 +16257,11 @@ bool Msg2c::launchRequests ( ) {
|
||||
char isName = ( a->m_street->m_flags2 & PLF2_IS_NAME ) ;
|
||||
*(char *)p = isName ; p += 1;
|
||||
// collection
|
||||
long collSize = gbstrlen(m_coll) + 1;
|
||||
memcpy ( p , m_coll , collSize );
|
||||
p += collSize;
|
||||
//long collSize = gbstrlen(m_coll) + 1;
|
||||
//memcpy ( p , m_coll , collSize );
|
||||
//p += collSize;
|
||||
*(collnum_t *)p = m_collnum;
|
||||
p += sizeof(collnum_t);
|
||||
// end of it
|
||||
char *pend = requestBuf + REQBUFSIZE; // s_requestBuf + max;
|
||||
// . then the address string, semicolon separated, null terminated
|
||||
@ -16495,14 +16499,16 @@ void handleRequest2c ( UdpSlot *slot , long nicenessWTF ) {
|
||||
// save it
|
||||
st->m_niceness = niceness;
|
||||
// get coll
|
||||
char *coll = p; p += gbstrlen(p) + 1;
|
||||
//char *coll = p; p += gbstrlen(p) + 1;
|
||||
collnum_t collnum = *(collnum_t *)p;
|
||||
p += sizeof(collnum_t);
|
||||
// the address string, semicolon separated, NULL terminated
|
||||
st->m_addrStr = p; p += gbstrlen(p) + 1;
|
||||
|
||||
// . get from msg5, return if it blocked
|
||||
// . will probably not block since in the disk page cache a lot
|
||||
if ( ! st->m_msg5.getList ( RDB_PLACEDB ,
|
||||
coll ,
|
||||
collnum ,
|
||||
&st->m_list ,
|
||||
(char *)&startKey ,
|
||||
(char *)&endKey ,
|
||||
|
13
Address.h
13
Address.h
@ -483,7 +483,8 @@ class Msg2c {
|
||||
// . closest matching "site" is used as the "site" (the site url)
|
||||
// . stores the tagRec in your "tagRec"
|
||||
bool verifyAddresses ( class Addresses *aa ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum,
|
||||
long domHash32 ,
|
||||
long ip ,
|
||||
//HashTableX *avt ,
|
||||
@ -494,8 +495,8 @@ class Msg2c {
|
||||
bool launchRequests ( );
|
||||
|
||||
// some specified input
|
||||
char *m_coll;
|
||||
long m_collLen;
|
||||
//char *m_coll;
|
||||
//long m_collLen;
|
||||
collnum_t m_collnum;
|
||||
void (*m_callback ) ( void *state );
|
||||
void *m_state;
|
||||
@ -597,7 +598,8 @@ class Addresses {
|
||||
class TagRec *gr ,
|
||||
class Url *url ,
|
||||
long long docId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum,
|
||||
long domHash32 ,
|
||||
long ip ,
|
||||
//long tagPairHash ,
|
||||
@ -716,7 +718,8 @@ class Addresses {
|
||||
RdbList m_list;
|
||||
class Url *m_url;
|
||||
long long m_docId;
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
long long m_termId;
|
||||
long m_domHash32;
|
||||
long m_ip;
|
||||
|
26
AutoBan.cpp
26
AutoBan.cpp
@ -802,7 +802,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
SafeBuf sb(512 * 512,"autobbuf");
|
||||
//read in all of the possible cgi parms off the bat:
|
||||
//long user = g_pages.getUserType( s , r );
|
||||
char *username = g_users.getUsername(r);
|
||||
//char *username = g_users.getUsername(r);
|
||||
//char *pwd = r->getString ("pwd");
|
||||
|
||||
char *coll = r->getString ("c");
|
||||
@ -831,8 +831,8 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
// char *ss = sb.getBuf();
|
||||
// char *ssend = sb.getBufEnd();
|
||||
g_pages.printAdminTop ( &sb, PAGE_AUTOBAN, username,
|
||||
coll , NULL , s->m_ip );
|
||||
g_pages.printAdminTop ( &sb, s , r );
|
||||
|
||||
//sb.incrementLength(sss - ss);
|
||||
|
||||
// MDW: moved to here
|
||||
@ -859,7 +859,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
&msecs);
|
||||
sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>"
|
||||
"<center><b>Code Usage "
|
||||
"(<a href=\"/master/"
|
||||
"(<a href=\"/admin/"
|
||||
"autoban?c=%s&resetcodes=1\">reset</a> "
|
||||
"%li days %li hours %li "
|
||||
"minutes %li sec ago)"
|
||||
@ -1271,15 +1271,15 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
// "%li days %li hrs %li min ago"
|
||||
// "</center></td>"
|
||||
|
||||
"<td><center><a href=\"/master/"
|
||||
"<td><center><a href=\"/admin/"
|
||||
"autoban?c=%s&allow=%s&showAllIps=%li\">"
|
||||
"allow/</a>"
|
||||
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&deny=%s&showAllIps=%li\">"
|
||||
"deny/</a>"
|
||||
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&clear=%s&showAllIps=%li\">"
|
||||
"clear</a></center>"
|
||||
"</td>",color,
|
||||
@ -1320,22 +1320,22 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td bgcolor=#%s><center><b>Show Ips by Number of Queries"
|
||||
"</b></center></td>",
|
||||
LIGHT_BLUE);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=0\">"
|
||||
"0 Queries</a></b>"
|
||||
"</font></center></td>",
|
||||
coll);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=1\">"
|
||||
"1 Query</a></b>"
|
||||
"</font></center></td>",
|
||||
coll);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=10\">"
|
||||
"10 Queries</a></b>"
|
||||
"</font></center></td>",
|
||||
coll);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=100\">"
|
||||
"100 Queries</a></b>"
|
||||
"</font></center></td></tr>",
|
||||
@ -1469,10 +1469,10 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
m_detectVals[i].m_timesBanned);
|
||||
}
|
||||
sb.safePrintf("<td><center>"
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&allow=%s&showAllIps=%li\">"
|
||||
"allow/</a>"
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&deny=%s&showAllIps=%li\">"
|
||||
"deny</a></center>"
|
||||
"</td>",
|
||||
|
16
BigFile.cpp
16
BigFile.cpp
@ -468,6 +468,9 @@ bool BigFile::readwrite ( void *buf ,
|
||||
fstate->m_callback = callback;
|
||||
fstate->m_niceness = niceness;
|
||||
fstate->m_flags = m_flags;
|
||||
// sanity
|
||||
if ( fstate->m_bytesToGo > 150000000 )
|
||||
log("file: huge read of %lli bytes",(long long)size);
|
||||
// . set our fd's before entering the thread in case RdbMerge
|
||||
// calls our unlinkPart()
|
||||
// . it's thread-UNsafe to call getfd() from within the thread
|
||||
@ -563,10 +566,12 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// request originated through Multicast, then multicast will sleep
|
||||
// and retry. Msg3 could retry, the multicast thing should be more
|
||||
// for running out of udp slots though...
|
||||
if ( g_errno && ! doWrite && g_errno != ENOTHREADSLOTS ) {
|
||||
log (LOG_INFO,"disk: May retry later.");
|
||||
return true;
|
||||
}
|
||||
// crap, call to clone() now fails a lot since we use pthreads
|
||||
// library ... so assume that is it i guess (MDW 3/15/2014)
|
||||
//if ( g_errno && ! doWrite && g_errno != ENOTHREADSLOTS ) {
|
||||
// log (LOG_INFO,"disk: May retry later.");
|
||||
// return true;
|
||||
//}
|
||||
// otherwise, thread spawn failed, do it blocking then
|
||||
g_errno = 0;
|
||||
// if threads are manually disabled don't print these msgs because
|
||||
@ -577,7 +582,8 @@ bool BigFile::readwrite ( void *buf ,
|
||||
if ( now - s_lastTime >= 1 ) {
|
||||
s_lastTime = now;
|
||||
log (LOG_INFO,
|
||||
"disk: Doing blocking disk access. This will hurt "
|
||||
"disk: Doing blocking disk access. "
|
||||
"This will hurt "
|
||||
"performance. isWrite=%li.",(long)doWrite);
|
||||
}
|
||||
}
|
||||
|
@ -99,9 +99,10 @@ bool Cachedb::verify ( char *coll ) {
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
long minRecSizes = 64000;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
if ( ! msg5.getList ( m_rdbId,//RDB_CACHEDB ,
|
||||
coll ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
(char*)&startKey ,
|
||||
(char*)&endKey ,
|
||||
|
@ -141,7 +141,7 @@ bool Catdb::verify ( char *coll ) {
|
||||
//long minRecSizes = 64000;
|
||||
|
||||
if ( ! msg5.getList ( RDB_CATDB ,
|
||||
"",//coll ,
|
||||
0,//collnum ,
|
||||
&list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
|
@ -362,9 +362,10 @@ bool Clusterdb::verify ( char *coll ) {
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
//long minRecSizes = 64000;
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
if ( ! msg5.getList ( RDB_CLUSTERDB ,
|
||||
coll ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
@ -394,6 +395,8 @@ bool Clusterdb::verify ( char *coll ) {
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key_t k = list.getCurrentKey();
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//unsigned long groupId = getGroupId ( RDB_CLUSTERDB , &k );
|
||||
//if ( groupId == g_hostdb.m_groupId ) got++;
|
||||
|
562
Collectiondb.cpp
562
Collectiondb.cpp
@ -138,6 +138,19 @@ bool Collectiondb::loadAllCollRecs ( ) {
|
||||
if ( ! addExistingColl ( coll , collnum ) )
|
||||
return false;
|
||||
}
|
||||
// if no existing recs added... add coll.main.0 always at startup
|
||||
if ( m_numRecs == 0 ) {
|
||||
log("admin: adding main collection.");
|
||||
addNewColl ( "main",
|
||||
0 , // customCrawl ,
|
||||
NULL,
|
||||
0 ,
|
||||
true , // bool saveIt ,
|
||||
// Parms.cpp reserves this so it can be sure
|
||||
// to add the same collnum to every shard
|
||||
0 );
|
||||
}
|
||||
|
||||
// note it
|
||||
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
|
||||
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
|
||||
@ -449,10 +462,10 @@ bool Collectiondb::addNewColl ( char *coll ,
|
||||
// show the ban links in the search results. the
|
||||
// collection name is cryptographic enough to show that
|
||||
cr->m_isCustomCrawl = customCrawl;
|
||||
cr->m_diffbotOnlyProcessIfNew = true;
|
||||
cr->m_diffbotOnlyProcessIfNewUrl = true;
|
||||
// default respider to off
|
||||
cr->m_collectiveRespiderFrequency = 0.0;
|
||||
cr->m_restrictDomain = true;
|
||||
//cr->m_restrictDomain = true;
|
||||
// reset the crawl stats
|
||||
// . this will core if a host was dead and then when it came
|
||||
// back up host #0's parms.cpp told it to add a new coll
|
||||
@ -604,7 +617,7 @@ bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
|
||||
if ( r->getLong("admin",1) == 0 ) return false;
|
||||
if ( g_conf.isMasterAdmin ( s , r ) ) return true;
|
||||
@ -615,7 +628,6 @@ bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
|
||||
//return cr->hasPermission ( r , s );
|
||||
}
|
||||
|
||||
/*
|
||||
void savingCheckWrapper1 ( int fd , void *state ) {
|
||||
WaitEntry *we = (WaitEntry *)state;
|
||||
// no state?
|
||||
@ -688,6 +700,8 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
|
||||
|
||||
// if there is an outstanding disk read thread or merge thread then
|
||||
// Spider.cpp will handle the delete in the callback.
|
||||
// this is now tryToDeleteSpiderColl in Spider.cpp
|
||||
/*
|
||||
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
|
||||
|
||||
sc->m_deleteMyself = true;
|
||||
@ -701,10 +715,11 @@ void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
|
||||
// do not allow this if in repair mode
|
||||
if ( g_repairMode > 0 ) {
|
||||
if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
|
||||
log("admin: Can not delete collection while in repair mode.");
|
||||
g_errno = EBADENGINEER;
|
||||
return true;
|
||||
@ -794,7 +809,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
|
||||
//sc->reset();
|
||||
// this will put it on "death row" so it will be deleted
|
||||
// once Msg5::m_waitingForList/Merge is NULL
|
||||
deleteSpiderColl ( sc );
|
||||
tryToDeleteSpiderColl ( sc );
|
||||
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
|
||||
//delete ( sc );
|
||||
cr->m_spiderColl = NULL;
|
||||
@ -836,8 +851,8 @@ bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// get the CollectionRec for "test"
|
||||
CollectionRec *cr = getRec ( coll ); // "test" );
|
||||
// get the CollectionRec for "qatest123"
|
||||
CollectionRec *cr = getRec ( coll ); // "qatest123" );
|
||||
|
||||
// must be there. if not, we create test i guess
|
||||
if ( ! cr ) {
|
||||
@ -849,6 +864,47 @@ bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
|
||||
}
|
||||
*/
|
||||
|
||||
// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
|
||||
bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
|
||||
|
||||
// an add, make sure big enough
|
||||
long need = ((long)collnum+1)*sizeof(CollectionRec *);
|
||||
long have = m_recPtrBuf.getLength();
|
||||
long need2 = need - have;
|
||||
|
||||
// if already big enough
|
||||
if ( need2 <= 0 ) {
|
||||
m_recs [ collnum ] = NULL;
|
||||
return true;
|
||||
}
|
||||
|
||||
// . true here means to clear the new space to zeroes
|
||||
// . this shit works based on m_length not m_capacity
|
||||
if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
|
||||
log("admin: error growing rec ptr buf2.");
|
||||
return false;
|
||||
}
|
||||
|
||||
// sanity
|
||||
if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// set it
|
||||
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
|
||||
|
||||
// update length of used bytes in case we re-alloc
|
||||
m_recPtrBuf.setLength ( need );
|
||||
|
||||
// re-max
|
||||
long max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
|
||||
// sanity
|
||||
if ( collnum >= max ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// initialize slot
|
||||
m_recs [ collnum ] = NULL;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
|
||||
|
||||
@ -891,29 +947,12 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// an add, make sure big enough
|
||||
long need = ((long)collnum+1)*sizeof(CollectionRec *);
|
||||
long have = m_recPtrBuf.getLength();
|
||||
long need2 = need - have;
|
||||
// . true here means to clear the new space to zeroes
|
||||
// . this shit works based on m_length not m_capacity
|
||||
if ( need2 > 0 && ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
|
||||
log("admin: error growing rec ptr buf2.");
|
||||
// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
|
||||
if ( ! growRecPtrBuf ( collnum ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
// sanity
|
||||
if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }
|
||||
// update length of used bytes in case we re-alloc
|
||||
m_recPtrBuf.setLength ( need );
|
||||
// sanity
|
||||
if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
|
||||
// re-ref it in case it is different
|
||||
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
|
||||
// re-max
|
||||
max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
|
||||
// sanity
|
||||
if ( collnum >= max ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// add to hash table to map name to collnum_t
|
||||
long long h64 = hash64n(cr->m_coll);
|
||||
@ -946,6 +985,39 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// . moves the file "src" to "dest"
// . first tries rename(), then falls back to copying the bytes and removing
//   "src", since renaming across devices doesn't work
// . returns 0 on success, -1 on failure
// . if the fallback copy fails, the partially written "dest" is removed and
//   "src" is left untouched so no data is lost
// . if the copy succeeds but "src" can not be removed, returns -1 with both
//   files present
int mv(char* src, char* dest) {
	// fast path: a plain rename is all we need on the same device
	if ( rename( src , dest ) == 0 )
		return 0;

	// open in binary mode so the bytes are copied verbatim
	FILE *fsrc = fopen(src, "rb");
	if (fsrc == NULL)
		return -1;
	FILE *fdest = fopen(dest, "wb");
	if (fdest == NULL) {
		fclose(fsrc);
		return -1;
	}

	const int BUF_SIZE = 1024;
	char buf[BUF_SIZE];
	bool ok = true;
	for (;;) {
		size_t got = fread(buf, 1, sizeof(buf), fsrc);
		// a short write means the destination is incomplete
		if (got > 0 && fwrite(buf, 1, got, fdest) != got) {
			ok = false;
			break;
		}
		// a short read is either end-of-file or a read error
		if (got < sizeof(buf)) {
			// must be checked while the stream is still open
			if (ferror(fsrc)) ok = false;
			break;
		}
	}

	fclose(fsrc);
	// fclose() flushes buffered data, so its failure is a write failure
	if (fclose(fdest) != 0)
		ok = false;

	if (!ok) {
		// do not leave a truncated copy lying around
		remove(dest);
		return -1;
	}

	// the copy is complete, so drop the original to finish the move
	if (remove(src) != 0)
		return -1;
	return 0;
}
|
||||
|
||||
// . returns false if we need a re-call, true if we completed
|
||||
// . returns true with g_errno set on error
|
||||
bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
@ -956,8 +1028,8 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
// save parms in case we block
|
||||
//we->m_purgeSeeds = purgeSeeds;
|
||||
|
||||
// now must be "test" only for now
|
||||
//if ( strcmp(coll,"test") ) { char *xx=NULL;*xx=0; }
|
||||
// now must be "qatest123" only for now
|
||||
//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
|
||||
// no spiders can be out. they may be referencing the CollectionRec
|
||||
// in XmlDoc.cpp... quite likely.
|
||||
//if ( g_conf.m_spideringEnabled ||
|
||||
@ -968,7 +1040,7 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
//}
|
||||
|
||||
// do not allow this if in repair mode
|
||||
if ( g_repairMode > 0 ) {
|
||||
if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
|
||||
log("admin: Can not delete collection while in repair mode.");
|
||||
g_errno = EBADENGINEER;
|
||||
return true;
|
||||
@ -992,6 +1064,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
//collnum_t oldCollnum = cr->m_collnum;
|
||||
//collnum_t newCollnum = m_numRecs;
|
||||
|
||||
// in case of bulk job, be sure to save list of spots
|
||||
// copy existing list to a /tmp, where they will later be transferred back to the new folder
|
||||
char oldbulkurlsname[1036];
|
||||
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
|
||||
char newbulkurlsname[1036];
|
||||
snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
|
||||
char tmpbulkurlsname[1036];
|
||||
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
|
||||
|
||||
if (cr->m_isCustomCrawl == 2)
|
||||
mv( oldbulkurlsname , tmpbulkurlsname );
|
||||
|
||||
// reset spider info
|
||||
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
|
||||
if ( sc ) {
|
||||
@ -1004,7 +1088,7 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
//sc->reset();
|
||||
// this will put it on "death row" so it will be deleted
|
||||
// once Msg5::m_waitingForList/Merge is NULL
|
||||
deleteSpiderColl ( sc );
|
||||
tryToDeleteSpiderColl ( sc );
|
||||
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
|
||||
//delete ( sc );
|
||||
cr->m_spiderColl = NULL;
|
||||
@ -1101,14 +1185,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
// save coll.conf to new directory
|
||||
cr->save();
|
||||
|
||||
// be sure to copy back the bulk urls for bulk jobs
|
||||
if (cr->m_isCustomCrawl == 2)
|
||||
mv( tmpbulkurlsname, newbulkurlsname );
|
||||
|
||||
// and clear the robots.txt cache in case we recently spidered a
|
||||
// robots.txt, we don't want to use it, we want to use the one we
|
||||
// have in the test-parser subdir so we are consistent
|
||||
RdbCache *robots = Msg13::getHttpCacheRobots();
|
||||
RdbCache *others = Msg13::getHttpCacheOthers();
|
||||
robots->clear ( oldCollnum );
|
||||
others->clear ( oldCollnum );
|
||||
//RdbCache *robots = Msg13::getHttpCacheRobots();
|
||||
//RdbCache *others = Msg13::getHttpCacheOthers();
|
||||
// clear() was removed due to possible corruption
|
||||
//robots->clear ( oldCollnum );
|
||||
//others->clear ( oldCollnum );
|
||||
|
||||
//g_templateTable.reset();
|
||||
//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
|
||||
@ -1329,6 +1417,9 @@ collnum_t Collectiondb::reserveCollNum ( ) {
|
||||
|
||||
if ( m_numRecs < 0x7fff ) {
|
||||
collnum_t next = m_numRecs;
|
||||
// make the ptr NULL at least to accommodate the
|
||||
// loop that scans up to m_numRecs lest we core
|
||||
growRecPtrBuf ( next );
|
||||
m_numRecs++;
|
||||
return next;
|
||||
}
|
||||
@ -1458,6 +1549,9 @@ void CollectionRec::reset() {
|
||||
if ( m_hasucr ) regfree ( &m_ucr );
|
||||
if ( m_hasupr ) regfree ( &m_upr );
|
||||
|
||||
m_hasucr = false;
|
||||
m_hasupr = false;
|
||||
|
||||
// make sure we do not leave spiders "hanging" waiting for their
|
||||
// callback to be called... and it never gets called
|
||||
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
|
||||
@ -1759,31 +1853,193 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
|
||||
|
||||
long n = 0;
|
||||
|
||||
//strcpy(m_regExs [n],"default");
|
||||
/*
|
||||
m_regExs[n].set("default");
|
||||
m_regExs[n].nullTerm();
|
||||
m_numRegExs++;
|
||||
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_numRegExs2++;
|
||||
|
||||
m_spiderPriorities[n] = 0;
|
||||
m_numRegExs3++;
|
||||
|
||||
m_maxSpidersPerRule[n] = 99;
|
||||
m_numRegExs10++;
|
||||
|
||||
m_spiderIpWaits[n] = 1000;
|
||||
m_numRegExs5++;
|
||||
|
||||
m_spiderIpMaxSpiders[n] = 7;
|
||||
m_numRegExs6++;
|
||||
|
||||
//m_spidersEnabled[n] = 1;
|
||||
//m_numRegExs7++;
|
||||
|
||||
m_harvestLinks[n] = 1;
|
||||
m_numRegExs8++;
|
||||
*/
|
||||
|
||||
m_regExs[n].set("isdocidbased");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 80;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("ismedia");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
m_regExs[n].set("!insitelist");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 1; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 1; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 45;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isaddurl");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 85;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==1 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==1");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==2 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==2");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount>=3 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 20;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount>=3");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 19;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 2;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("default");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 1;
|
||||
n++;
|
||||
|
||||
|
||||
m_numRegExs = n;
|
||||
m_numRegExs2 = n;
|
||||
m_numRegExs3 = n;
|
||||
m_numRegExs10 = n;
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
|
||||
// more rules
|
||||
|
||||
|
||||
|
||||
|
||||
//m_spiderDiffbotApiNum[n] = 1;
|
||||
//m_numRegExs11++;
|
||||
@ -1859,7 +2115,9 @@ bool CollectionRec::save ( ) {
|
||||
snprintf ( tmp , 1023, "%scoll.%s.%li/localcrawlinfo.dat",
|
||||
g_hostdb.m_dir , m_coll , (long)m_collnum );
|
||||
//log("coll: saving %s",tmp);
|
||||
SafeBuf sb;
|
||||
// in case emergency save from malloc core, do not alloc
|
||||
char stack[1024];
|
||||
SafeBuf sb(stack,1024);
|
||||
//m_localCrawlInfo.print ( &sb );
|
||||
// binary now
|
||||
sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
|
||||
@ -2029,6 +2287,8 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
|
||||
}
|
||||
|
||||
bool expandRegExShortcuts ( SafeBuf *sb ) ;
|
||||
bool updateSiteListTables ( collnum_t collnum,bool addSeeds,char *siteListArg);
|
||||
void nukeDoledb ( collnum_t collnum );
|
||||
|
||||
// . anytime the url filters are updated, this function is called
|
||||
// . it is also called on load of the collection at startup
|
||||
@ -2058,6 +2318,48 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
// if collection is brand new being called from addNewColl()
|
||||
// then sc will be NULL
|
||||
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(m_collnum);
|
||||
|
||||
// . do not do this at startup
|
||||
// . this essentially resets doledb
|
||||
if ( g_doledb.m_rdb.m_initialized &&
|
||||
// somehow this is initialized before we set m_recs[m_collnum]
|
||||
// so we gotta do the two checks below...
|
||||
sc &&
|
||||
// must be a valid coll
|
||||
m_collnum < g_collectiondb.m_numRecs &&
|
||||
g_collectiondb.m_recs[m_collnum] ) {
|
||||
|
||||
|
||||
log("coll: resetting doledb for %s (%li)",m_coll,
|
||||
(long)m_collnum);
|
||||
|
||||
// clear doledb recs from tree
|
||||
//g_doledb.getRdb()->deleteAllRecs ( m_collnum );
|
||||
nukeDoledb ( m_collnum );
|
||||
|
||||
// add it back
|
||||
//if ( ! g_doledb.getRdb()->addRdbBase2 ( m_collnum ) )
|
||||
// log("coll: error re-adding doledb for %s",m_coll);
|
||||
|
||||
// just start this over...
|
||||
// . MDW left off here
|
||||
//tryToDelete ( sc );
|
||||
// maybe this is good enough
|
||||
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
|
||||
|
||||
CollectionRec *cr = sc->m_cr;
|
||||
|
||||
// . rebuild sitetable? in PageBasic.cpp.
|
||||
// . re-adds seed spider requests using msg4
|
||||
// . true = addSeeds
|
||||
updateSiteListTables ( m_collnum ,
|
||||
true ,
|
||||
cr->m_siteListBuf.getBufStart() );
|
||||
}
|
||||
|
||||
|
||||
// only for diffbot custom crawls
|
||||
if ( m_isCustomCrawl != 1 && // crawl api
|
||||
@ -2082,6 +2384,66 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
|
||||
if ( upp && ! upp[0] ) upp = NULL;
|
||||
|
||||
///////
|
||||
//
|
||||
// recompile regular expressions
|
||||
//
|
||||
///////
|
||||
|
||||
|
||||
if ( m_hasucr ) {
|
||||
regfree ( &m_ucr );
|
||||
m_hasucr = false;
|
||||
}
|
||||
|
||||
if ( m_hasupr ) {
|
||||
regfree ( &m_upr );
|
||||
m_hasupr = false;
|
||||
}
|
||||
|
||||
// copy into tmpbuf
|
||||
SafeBuf tmp;
|
||||
|
||||
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
|
||||
if ( rx && ! rx[0] ) rx = NULL;
|
||||
if ( rx ) {
|
||||
tmp.reset();
|
||||
tmp.safeStrcpy ( rx );
|
||||
expandRegExShortcuts ( &tmp );
|
||||
m_hasucr = true;
|
||||
}
|
||||
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
|
||||
REG_EXTENDED| //REG_ICASE|
|
||||
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
|
||||
// error!
|
||||
log("coll: regcomp %s failed: %s. "
|
||||
"Ignoring.",
|
||||
rx,mstrerror(errno));
|
||||
regfree ( &m_ucr );
|
||||
m_hasucr = false;
|
||||
}
|
||||
|
||||
|
||||
rx = m_diffbotUrlProcessRegEx.getBufStart();
|
||||
if ( rx && ! rx[0] ) rx = NULL;
|
||||
if ( rx ) m_hasupr = true;
|
||||
if ( rx ) {
|
||||
tmp.reset();
|
||||
tmp.safeStrcpy ( rx );
|
||||
expandRegExShortcuts ( &tmp );
|
||||
m_hasupr = true;
|
||||
}
|
||||
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
|
||||
REG_EXTENDED| // REG_ICASE|
|
||||
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
|
||||
// error!
|
||||
log("coll: regcomp %s failed: %s. "
|
||||
"Ignoring.",
|
||||
rx,mstrerror(errno));
|
||||
regfree ( &m_upr );
|
||||
m_hasupr = false;
|
||||
}
|
||||
|
||||
|
||||
// what diffbot url to use for processing
|
||||
char *api = m_diffbotApiUrl.getBufStart();
|
||||
@ -2092,6 +2454,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
// default to 250ms i guess. -1 means unset i think.
|
||||
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
|
||||
|
||||
bool isEthan = false;
|
||||
if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
|
||||
|
||||
// make the gigablast regex table just "default" so it does not
|
||||
// filter, but accepts all urls. we will add code to pass the urls
|
||||
// through m_diffbotUrlCrawlPattern alternatively. if that itself
|
||||
@ -2102,6 +2467,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
m_maxSpidersPerRule [i] = 100;
|
||||
m_spiderIpWaits [i] = wait;
|
||||
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
|
||||
// ethan wants some speed
|
||||
if ( isEthan )
|
||||
m_spiderIpMaxSpiders[i] = 30;
|
||||
//m_spidersEnabled [i] = 1;
|
||||
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
@ -2110,33 +2478,53 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
|
||||
long i = 0;
|
||||
|
||||
// 1st one! for query reindex/ query delete
|
||||
m_regExs[i].set("isdocidbased");
|
||||
m_spiderIpMaxSpiders [i] = 10;
|
||||
m_spiderPriorities [i] = 70;
|
||||
i++;
|
||||
|
||||
// 1st default url filter
|
||||
// 2nd default url filter
|
||||
m_regExs[i].set("ismedia && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
i++;
|
||||
|
||||
// 2nd default filter
|
||||
if ( m_restrictDomain ) {
|
||||
// always turn this on for now. they need to add domains they want
|
||||
// to crawl as seeds so they do not spider the web.
|
||||
// no because FTB seeds with link pages that link to another
|
||||
// domain. they just need to be sure to supply a crawl pattern
|
||||
// to avoid spidering the whole web.
|
||||
//
|
||||
// if they did not EXPLICITLY provide a url crawl pattern or
|
||||
// url crawl regex then restrict to seeds to prevent from spidering
|
||||
// the entire internet
|
||||
if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
i++;
|
||||
}
|
||||
|
||||
m_regExs[i].set("errorcount>=1 && !hastmperror");
|
||||
m_spiderPriorities [i] = 15;
|
||||
m_spiderFreqs [i] = 0.00; // 86 seconds
|
||||
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
|
||||
i++;
|
||||
|
||||
// and for docs that have errors respider once every 5 hours
|
||||
m_regExs[i].set("errorcount==1");
|
||||
m_regExs[i].set("errorcount==1 && hastmperror");
|
||||
m_spiderPriorities [i] = 40;
|
||||
m_spiderFreqs [i] = 0.001; // 86 seconds
|
||||
i++;
|
||||
|
||||
// and for docs that have errors respider once every 5 hours
|
||||
m_regExs[i].set("errorcount==2");
|
||||
m_regExs[i].set("errorcount==2 && hastmperror");
|
||||
m_spiderPriorities [i] = 40;
|
||||
m_spiderFreqs [i] = 0.1; // 2.4 hrs
|
||||
i++;
|
||||
|
||||
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
|
||||
m_regExs[i].set("errorcount>=3");
|
||||
m_regExs[i].set("errorcount>=3 && hastmperror");
|
||||
m_spiderPriorities [i] = 30;
|
||||
m_spiderFreqs [i] = 30; // 30 days
|
||||
i++;
|
||||
@ -2240,63 +2628,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
m_numRegExs8 = i;
|
||||
//m_numRegExs11 = i;
|
||||
|
||||
///////
|
||||
//
|
||||
// recompile regular expressions
|
||||
//
|
||||
///////
|
||||
|
||||
|
||||
if ( m_hasucr ) {
|
||||
regfree ( &m_ucr );
|
||||
m_hasucr = false;
|
||||
}
|
||||
|
||||
if ( m_hasupr ) {
|
||||
regfree ( &m_upr );
|
||||
m_hasupr = false;
|
||||
}
|
||||
|
||||
// copy into tmpbuf
|
||||
SafeBuf tmp;
|
||||
|
||||
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
|
||||
if ( rx && ! rx[0] ) rx = NULL;
|
||||
if ( rx ) {
|
||||
tmp.safeStrcpy ( rx );
|
||||
expandRegExShortcuts ( &tmp );
|
||||
m_hasucr = true;
|
||||
}
|
||||
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
|
||||
REG_EXTENDED| //REG_ICASE|
|
||||
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
|
||||
// error!
|
||||
log("coll: regcomp %s failed: %s. "
|
||||
"Ignoring.",
|
||||
rx,mstrerror(errno));
|
||||
regfree ( &m_ucr );
|
||||
m_hasucr = false;
|
||||
}
|
||||
|
||||
|
||||
rx = m_diffbotUrlProcessRegEx.getBufStart();
|
||||
if ( rx && ! rx[0] ) rx = NULL;
|
||||
if ( rx ) m_hasupr = true;
|
||||
if ( rx ) {
|
||||
tmp.safeStrcpy ( rx );
|
||||
expandRegExShortcuts ( &tmp );
|
||||
m_hasupr = true;
|
||||
}
|
||||
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
|
||||
REG_EXTENDED| // REG_ICASE|
|
||||
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
|
||||
// error!
|
||||
log("coll: regcomp %s failed: %s. "
|
||||
"Ignoring.",
|
||||
rx,mstrerror(errno));
|
||||
regfree ( &m_upr );
|
||||
m_hasupr = false;
|
||||
}
|
||||
//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
|
||||
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -95,7 +95,7 @@ class Collectiondb {
|
||||
|
||||
// . does this requester have root admin privileges???
|
||||
// . uses the root collection record!
|
||||
bool isAdmin ( class HttpRequest *r , class TcpSocket *s );
|
||||
//bool isAdmin ( class HttpRequest *r , class TcpSocket *s );
|
||||
|
||||
//collnum_t getNextCollnum ( collnum_t collnum );
|
||||
|
||||
@ -129,6 +129,7 @@ class Collectiondb {
|
||||
bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
|
||||
bool addRdbBasesForCollRec ( CollectionRec *cr ) ;
|
||||
|
||||
bool growRecPtrBuf ( collnum_t collnum ) ;
|
||||
bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;
|
||||
|
||||
// returns false if blocked, true otherwise.
|
||||
@ -138,7 +139,7 @@ class Collectiondb {
|
||||
//bool updateRec ( CollectionRec *newrec );
|
||||
bool deleteRecs ( class HttpRequest *r ) ;
|
||||
|
||||
void deleteSpiderColl ( class SpiderColl *sc );
|
||||
//void deleteSpiderColl ( class SpiderColl *sc );
|
||||
|
||||
// returns false if blocked, true otherwise.
|
||||
//bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
|
||||
@ -310,10 +311,10 @@ class CollectionRec {
|
||||
// . set ourselves the cgi parms in an http request
|
||||
// . unspecified cgi parms will be assigned default values
|
||||
// . returns false and sets errno on error
|
||||
bool set ( class HttpRequest *r , TcpSocket *s );
|
||||
bool set ( class HttpRequest *r , class TcpSocket *s );
|
||||
|
||||
// calls hasPermission() below
|
||||
bool hasPermission ( class HttpRequest *r , TcpSocket *s ) ;
|
||||
bool hasPermission ( class HttpRequest *r , class TcpSocket *s ) ;
|
||||
|
||||
// . does this user have permission for editing this collection?
|
||||
// . "p" is the password for this collection in question
|
||||
@ -326,7 +327,7 @@ class CollectionRec {
|
||||
// . can this ip perform a search or add url on this collection?
|
||||
// . mamma.com provides encapsulated ips of their queriers so we
|
||||
// can ban them by ip
|
||||
bool hasSearchPermission ( TcpSocket *s , long encapIp = 0 );
|
||||
bool hasSearchPermission ( class TcpSocket *s , long encapIp = 0 );
|
||||
|
||||
// how many bytes would this record occupy in raw binary format?
|
||||
//long getStoredSize () { return m_recSize; };
|
||||
@ -458,7 +459,7 @@ class CollectionRec {
|
||||
char m_enforceNewQuotas ;
|
||||
char m_doIpLookups ; // considered iff using proxy
|
||||
char m_useRobotsTxt ;
|
||||
char m_restrictDomain ; // say on same domain as seeds?
|
||||
//char m_restrictDomain ; // say on same domain as seeds?
|
||||
char m_doTuringTest ; // for addurl
|
||||
char m_applyFilterToText ; // speeds us up
|
||||
char m_allowHttps ; // read HTTPS using SSL
|
||||
@ -640,7 +641,7 @@ class CollectionRec {
|
||||
long m_hasucr:1;
|
||||
long m_hasupr:1;
|
||||
|
||||
char m_diffbotOnlyProcessIfNew;
|
||||
char m_diffbotOnlyProcessIfNewUrl;
|
||||
|
||||
//SafeBuf m_diffbotClassify;
|
||||
//char m_diffbotClassify;
|
||||
@ -678,6 +679,9 @@ class CollectionRec {
|
||||
// for storing callbacks waiting in line for freshest crawl info
|
||||
//SafeBuf m_callbackQueue;
|
||||
|
||||
// list of url patterns to be indexed.
|
||||
SafeBuf m_siteListBuf;
|
||||
char m_spiderToo;
|
||||
|
||||
// . now the url regular expressions
|
||||
// . we chain down the regular expressions
|
||||
|
99
Conf.cpp
99
Conf.cpp
@ -18,6 +18,7 @@ Conf::Conf ( ) {
|
||||
// . master admin can administer ALL collections
|
||||
// . use CollectionRec::hasPermission() to see if has permission
|
||||
// to administer one particular collection
|
||||
/*
|
||||
bool Conf::isMasterAdmin ( TcpSocket *s , HttpRequest *r ) {
|
||||
// sometimes they don't want to be admin intentionally for testing
|
||||
if ( r->getLong ( "master" , 1 ) == 0 ) return false;
|
||||
@ -64,37 +65,84 @@ bool Conf::isMasterAdmin ( TcpSocket *s , HttpRequest *r ) {
|
||||
// check admin ips
|
||||
// scan the passwords
|
||||
// MDW: no! too vulnerable to attacks!
|
||||
/*
|
||||
for ( long i = 0 ; i < m_numMasterPwds ; i++ ) {
|
||||
if ( strcmp ( m_masterPwds[i], p ) != 0 ) continue;
|
||||
// . matching one password is good enough now, default OR
|
||||
// . because just matching an IP is good enough security,
|
||||
// there is really no need for both IP AND passwd match
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
//for ( long i = 0 ; i < m_numMasterPwds ; i++ ) {
|
||||
// if ( strcmp ( m_masterPwds[i], p ) != 0 ) continue;
|
||||
// // . matching one password is good enough now, default OR
|
||||
// // . because just matching an IP is good enough security,
|
||||
// // there is really no need for both IP AND passwd match
|
||||
// return true;
|
||||
//}
|
||||
// ok, make sure they came from an acceptable IP
|
||||
if ( isAdminIp ( ip ) )
|
||||
if ( isRootIp ( ip ) )
|
||||
// they also have a matching IP, so they now have permission
|
||||
return true;
|
||||
// if no security, allow all
|
||||
// MDW: nonononono!!!!
|
||||
/*
|
||||
if ( m_numMasterPwds == 0 &&
|
||||
m_numMasterIps == 0 ) return true;
|
||||
*/
|
||||
//if ( m_numMasterPwds == 0 &&
|
||||
// m_numMasterIps == 0 ) return true;
|
||||
// if they did not match an ip or password, even if both lists
|
||||
// are empty, do not allow access... this prevents security breaches
|
||||
// by accident
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
// . is the requester an administrator of the collection?
// . there are no per-collection tokens yet, so for now this just
//   defers to the root admin check
bool Conf::isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) {
	bool isRoot = isRootAdmin ( socket , hr );
	return isRoot;
}
|
||||
|
||||
// . is user a root administrator?
|
||||
// . only need to be from root IP *OR* have password, not both
|
||||
bool Conf::isRootAdmin ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
|
||||
// totally open access?
|
||||
if ( m_numConnectIps <= 0 && m_numMasterPwds <= 0 )
|
||||
return true;
|
||||
|
||||
// coming from root gets you in
|
||||
if ( isRootIp ( socket->m_ip ) ) return true;
|
||||
|
||||
//if ( isConnectIp ( socket->m_ip ) ) return true;
|
||||
|
||||
if ( hasRootPwd ( hr ) ) return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// . does the request carry one of the configured master passwords?
// . the candidate is the first non-NULL value among: the "pwd" parm,
//   the "password" parm, the "pwd" cookie
// . always false when no master passwords are configured
bool Conf::hasRootPwd ( HttpRequest *hr ) {
	if ( m_numMasterPwds == 0 ) return false;
	// find the supplied password, trying each source in turn
	char *given = hr->getString("pwd");
	if ( ! given ) given = hr->getString("password");
	if ( ! given ) given = hr->getStringFromCookie("pwd");
	// nothing supplied, nothing to match
	if ( ! given ) return false;
	// exact comparison against every master password
	long k = 0;
	while ( k < m_numMasterPwds ) {
		// we got a match
		if ( strcmp ( m_masterPwds[k] , given ) == 0 ) return true;
		k++;
	}
	return false;
}
|
||||
|
||||
// . check this ip in the list of admin ips
|
||||
bool Conf::isAdminIp ( unsigned long ip ) {
|
||||
for ( long i = 0 ; i < m_numMasterIps ; i++ )
|
||||
if ( m_masterIps[i] == (long)ip )
|
||||
bool Conf::isRootIp ( unsigned long ip ) {
|
||||
|
||||
//if ( m_numMasterIps == 0 ) return false;
|
||||
if ( m_numConnectIps == 0 ) return false;
|
||||
|
||||
for ( long i = 0 ; i < m_numConnectIps ; i++ )
|
||||
if ( m_connectIps[i] == (long)ip )
|
||||
return true;
|
||||
|
||||
//if ( ip == atoip("10.5.0.2",8) ) return true;
|
||||
|
||||
// no match
|
||||
return false;
|
||||
}
|
||||
@ -124,8 +172,17 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
|
||||
g_parms.setToDefault ( (char *)this );
|
||||
m_save = true;
|
||||
char fname[1024];
|
||||
if ( dir ) sprintf ( fname , "%sgb.conf", dir );
|
||||
else sprintf ( fname , "./gb.conf" );
|
||||
if ( dir ) sprintf ( fname , "%slocalgb.conf", dir );
|
||||
else sprintf ( fname , "./localgb.conf" );
|
||||
File f;
|
||||
f.set ( fname );
|
||||
m_isLocal = true;
|
||||
if ( ! f.doesExist() ) {
|
||||
m_isLocal = false;
|
||||
if ( dir ) sprintf ( fname , "%sgb.conf", dir );
|
||||
else sprintf ( fname , "./gb.conf" );
|
||||
}
|
||||
|
||||
// make sure g_mem.maxMem is big enough temporarily
|
||||
if ( g_mem.m_maxMem < 10000000 ) g_mem.m_maxMem = 10000000;
|
||||
bool status = g_parms.setFromFile ( this , fname , NULL );
|
||||
@ -351,7 +408,9 @@ bool Conf::save ( ) {
|
||||
bool status = g_parms.saveToXml ( (char *)this , fname );
|
||||
if ( status ) {
|
||||
char fname2[1024];
|
||||
sprintf( fname2 , "%sgb.conf" , g_hostdb.m_dir );
|
||||
char *local = "";
|
||||
if ( m_isLocal ) local = "local";
|
||||
sprintf( fname2 , "%s%sgb.conf" , g_hostdb.m_dir , local );
|
||||
if(access(fname2, F_OK) == 0) unlink(fname2);
|
||||
if(link(fname, fname2) == 0) {
|
||||
unlink(fname);
|
||||
|
19
Conf.h
19
Conf.h
@ -24,7 +24,7 @@
|
||||
#include "Collectiondb.h"
|
||||
|
||||
#define MAX_MASTER_IPS 15
|
||||
#define MAX_MASTER_PASSWORDS 10
|
||||
#define MAX_MASTER_PASSWORDS 5
|
||||
|
||||
#define USERAGENTMAXSIZE 128
|
||||
|
||||
@ -49,9 +49,13 @@ class Conf {
|
||||
|
||||
Conf();
|
||||
|
||||
bool isMasterAdmin ( class TcpSocket *s , class HttpRequest *r );
|
||||
bool isSpamAssassin ( class TcpSocket *s , class HttpRequest *r );
|
||||
bool isAdminIp ( unsigned long ip );
|
||||
bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
|
||||
|
||||
bool isRootAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
|
||||
//bool isMasterAdmin ( class TcpSocket *s , class HttpRequest *r );
|
||||
//bool isSpamAssassin ( class TcpSocket *s , class HttpRequest *r );
|
||||
bool hasRootPwd ( HttpRequest *hr ) ;
|
||||
bool isRootIp ( unsigned long ip );
|
||||
bool isConnectIp ( unsigned long ip );
|
||||
|
||||
// loads conf parms from this file "{dir}/gb.conf"
|
||||
@ -94,6 +98,8 @@ class Conf {
|
||||
// a core dump saving them
|
||||
char m_save;
|
||||
|
||||
bool m_isLocal;
|
||||
|
||||
//director info (optional) (used iff m_isTrustedNet is false)
|
||||
//public_key m_dirPubKey; // everyone should know director's pub key
|
||||
//private_key m_dirPrivKey; // this is 0 if we don't know it
|
||||
@ -663,9 +669,10 @@ class Conf {
|
||||
|
||||
long m_numMasterPwds;
|
||||
char m_masterPwds[MAX_MASTER_PASSWORDS][PASSWORD_MAX_LEN];
|
||||
long m_numMasterIps;
|
||||
long m_masterIps[MAX_MASTER_IPS];
|
||||
//long m_numMasterIps;
|
||||
//long m_masterIps[MAX_MASTER_IPS];
|
||||
|
||||
// these are the new master ips
|
||||
long m_numConnectIps;
|
||||
long m_connectIps [ MAX_CONNECT_IPS ];
|
||||
|
||||
|
@ -145,10 +145,11 @@ bool Datedb::verify ( char *coll ) {
|
||||
key_t endKey;
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
//long minRecSizes = 64000;
|
||||
|
||||
if ( ! msg5.getList ( RDB_DATEDB ,
|
||||
coll ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
|
@ -1318,7 +1318,7 @@ sections. -- todo -- might be an alignment issue... check out later
|
||||
|
||||
// . make a whole new set of urls for pub date detection
|
||||
// . grab that sample set from buzz wiki page
|
||||
// . record the correct pub date for urls in the "test" coll and make sure
|
||||
// . record the correct pub date for urls in the "qatest123" coll and make sure
|
||||
// we get them each time, otherwise core dump!!
|
||||
// . check the date we extract with the rss feed. that is a good test too!
|
||||
// report on that accuracy in the logs and on the stats page.
|
||||
@ -2428,7 +2428,7 @@ bool Dates::setPart1 ( //char *u ,
|
||||
//if ( m_nw != words->m_numWords ) { char *xx=NULL; *xx=0; }
|
||||
|
||||
// . get the current time in utc
|
||||
// . NO! to ensure the "test" collection re-injects docs exactly
|
||||
// . NO! to ensure the "qatest123" collection re-injects docs exactly
|
||||
// the same, use the spideredTime from the doc
|
||||
// . we make sure to save this in the test subdir somehow..
|
||||
//m_now = nd->m_spideredTime; // getTimeSynced();
|
||||
@ -3283,7 +3283,7 @@ bool Dates::setPart1 ( //char *u ,
|
||||
// DF_NOTCLOCK flags from this.
|
||||
|
||||
// . current time. sync'd with host #0 who uses ntp supposedly...! :(
|
||||
// . to ensure that the "test" subdir re-injects docs exactly the
|
||||
// . to ensure that the "qatest123" subdir re-injects docs exactly the
|
||||
// same, we need to use this date now
|
||||
long now = nd->m_spideredTime;
|
||||
// how long has elapsed since we downloaded it last approx.?
|
||||
@ -3294,7 +3294,8 @@ bool Dates::setPart1 ( //char *u ,
|
||||
// might have been different than ours... actually i think our
|
||||
// spiderdate.txt file had an older date in it from a previous round!
|
||||
// so disable this when test spidering.
|
||||
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,"test"))
|
||||
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,
|
||||
"qatest123"))
|
||||
elapsed = 0;
|
||||
// is true.
|
||||
if ( elapsed < 0 ) {
|
||||
|
@ -108,7 +108,10 @@ bool DiskPageCache::init ( const char *dbname ,
|
||||
// void (*rmVfd2)(DiskPageCache*, long) ) {
|
||||
reset();
|
||||
|
||||
// fix cores while rebalancing
|
||||
// seems like we lose data when it prints "Caught add breach"
|
||||
// so let's stop using until we fix that... happens while we are
|
||||
// dumping i think and somehow the data seems to get lost that
|
||||
// we were dumping.
|
||||
//maxMem = 0;
|
||||
|
||||
m_rdbId = rdbId;
|
||||
|
@ -167,6 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
|
||||
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
|
||||
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
|
||||
case EDOCNONCANONICAL: return "Url was dup of canonical page";
|
||||
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
|
||||
}
|
||||
// if the remote error bit is clear it must be a regular errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
3
Errno.h
3
Errno.h
@ -170,6 +170,7 @@ enum {
|
||||
EFAKEFIRSTIP,
|
||||
EBADHOSTSCONF,
|
||||
EWAITINGTOSYNCHOSTSCONF,
|
||||
EDOCNONCANONICAL
|
||||
EDOCNONCANONICAL,
|
||||
ECUSTOMCRAWLMISMATCH // a crawl request was made with a name that already existed for bulk request (or the other way around)
|
||||
};
|
||||
#endif
|
||||
|
@ -21,6 +21,9 @@ class HashTableX {
|
||||
char *allocName ,
|
||||
bool useKeyMagic = false );
|
||||
|
||||
// key size is 0 if UNinitialized
|
||||
bool isInitialized ( ) { return (m_ks != 0); };
|
||||
|
||||
HashTableX ( );
|
||||
~HashTableX ( );
|
||||
void constructor ();
|
||||
@ -389,6 +392,10 @@ class HashTableX {
|
||||
long getNumSlotsUsed ( ) { return m_numSlotsUsed; };
|
||||
long getNumUsedSlots ( ) { return m_numSlotsUsed; };
|
||||
|
||||
bool isEmpty() {
|
||||
if ( m_numSlotsUsed == 0 ) return true;
|
||||
return false; };
|
||||
|
||||
// how many are there total? used and unused.
|
||||
long getNumSlots ( ) { return m_numSlots; };
|
||||
|
||||
|
@ -2315,10 +2315,10 @@ uint32_t Hostdb::getShardNum ( char rdbId,void *k ) { // ,bool split ) {
|
||||
else if ( rdbId == RDB_LINKDB || rdbId == RDB2_LINKDB2 ) {
|
||||
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
|
||||
}
|
||||
else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
|
||||
unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
|
||||
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
|
||||
}
|
||||
//else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
|
||||
// unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
|
||||
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
|
||||
//}
|
||||
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
|
||||
unsigned long long d = g_titledb.getDocId ( (key_t *)k );
|
||||
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
|
||||
|
4
Hostdb.h
4
Hostdb.h
@ -53,7 +53,7 @@ enum {
|
||||
#define PFLAG_RECOVERYMODE 0x80
|
||||
|
||||
// added slow disk reads to it, 4 bytes (was 52)
|
||||
#define MAX_PING_SIZE (44+4)
|
||||
#define MAX_PING_SIZE (44+4+4)
|
||||
|
||||
#define HT_GRUNT 0x01
|
||||
#define HT_SPARE 0x02
|
||||
@ -144,6 +144,8 @@ class Host {
|
||||
// cpu usage
|
||||
float m_cpuUsage;
|
||||
|
||||
float m_diskUsage;
|
||||
|
||||
long m_slowDiskReads;
|
||||
|
||||
// doc count
|
||||
|
@ -6,8 +6,21 @@
|
||||
HttpRequest::HttpRequest () { m_cgiBuf = NULL; m_cgiBuf2 = NULL; reset(); }
|
||||
HttpRequest::~HttpRequest() { reset(); }
|
||||
|
||||
char HttpRequest::getReplyFormat() {
|
||||
if ( m_replyFormatValid ) return m_replyFormat;
|
||||
char *fs = getString("format",NULL,NULL);
|
||||
char fmt = FORMAT_HTML;
|
||||
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
|
||||
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
|
||||
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
|
||||
m_replyFormat = fmt;
|
||||
m_replyFormatValid = true;
|
||||
return m_replyFormat;
|
||||
}
|
||||
|
||||
void HttpRequest::reset() {
|
||||
m_numFields = 0;
|
||||
m_replyFormatValid = false;
|
||||
//if ( m_cgiBuf ) mfree ( m_cgiBuf , m_cgiBufMaxLen , "HttpRequest");
|
||||
m_cgiBufLen = 0;
|
||||
m_cgiBuf = NULL;
|
||||
|
@ -27,6 +27,16 @@
|
||||
#include "Url.h" // Url class
|
||||
#include "TcpSocket.h"
|
||||
|
||||
// values for HttpRequest::m_replyFormat
|
||||
#define FORMAT_HTML 1
|
||||
#define FORMAT_XML 2
|
||||
#define FORMAT_JSON 3
|
||||
#define FORMAT_CSV 4
|
||||
#define FORMAT_TXT 5
|
||||
#define FORMAT_PROCOG 6
|
||||
|
||||
|
||||
|
||||
class HttpRequest {
|
||||
|
||||
public:
|
||||
@ -59,6 +69,11 @@ class HttpRequest {
|
||||
//return m_buf;
|
||||
};
|
||||
|
||||
// FORMAT_HTML FORMAT_JSON FORMAT_XML
|
||||
char getReplyFormat();
|
||||
bool m_replyFormatValid;
|
||||
char m_replyFormat;
|
||||
|
||||
// get the referer field of the MIME header
|
||||
char *getReferer () { return m_ref; };
|
||||
|
||||
|
@ -986,6 +986,7 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
if ( strncmp(path,"/crawlbot",9) == 0 ) n = PAGE_CRAWLBOT;
|
||||
if ( strncmp(path,"/v2/crawl",9) == 0 ) n = PAGE_CRAWLBOT;
|
||||
if ( strncmp(path,"/v2/bulk" ,8) == 0 ) n = PAGE_CRAWLBOT;
|
||||
if ( strncmp(path,"/v2/search" ,8) == 0 ) n = PAGE_RESULTS;
|
||||
|
||||
bool isProxy = g_proxy.isProxy();
|
||||
// . prevent coring
|
||||
@ -1401,6 +1402,7 @@ bool HttpServer::sendReply2 ( char *mime,
|
||||
|
||||
// . store the login/logout links after <body> tag
|
||||
// . only proxy should provide a non-null hr right now
|
||||
/*
|
||||
if ( hr ) {
|
||||
long newReplySize;
|
||||
char *newReply = g_proxy.storeLoginBar ( sendBuf,
|
||||
@ -1417,6 +1419,7 @@ bool HttpServer::sendReply2 ( char *mime,
|
||||
sendBufSize = newReplySize;
|
||||
sendBufAlloc = newReplySize;
|
||||
}
|
||||
*/
|
||||
|
||||
// . send it away
|
||||
// . this returns false if blocked, true otherwise
|
||||
@ -1900,7 +1903,7 @@ long getMsgSize ( char *buf, long bufSize, TcpSocket *s ) {
|
||||
max = 0x7fffffff; // maxOtherDocLen not available
|
||||
// if post is a /cgi/12.cgi (tagdb) allow 10 megs
|
||||
//if ( pp + 11 < ppend && strncmp ( pp ,"/cgi/12.cgi",11)==0)
|
||||
if ( pp + 11 < ppend && strncmp ( pp ,"/master/tagdb",13)==0)
|
||||
if ( pp + 12 < ppend && strncmp ( pp ,"/admin/tagdb",12)==0)
|
||||
max = 10*1024*1024;
|
||||
if ( pp + 4 < ppend && strncmp ( pp ,"/vec",4)==0)
|
||||
max = 0x7fffffff;
|
||||
|
13
Images.cpp
13
Images.cpp
@ -221,7 +221,7 @@ bool Images::getThumbnail ( char *pageSite ,
|
||||
long siteLen ,
|
||||
long long docId ,
|
||||
XmlDoc *xd ,
|
||||
char *coll ,
|
||||
collnum_t collnum,//char *coll ,
|
||||
char **statusPtr ,
|
||||
long hopCount,
|
||||
void *state ,
|
||||
@ -246,7 +246,7 @@ bool Images::getThumbnail ( char *pageSite ,
|
||||
// save these
|
||||
m_statusPtr = statusPtr;
|
||||
// save this
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_docId = docId;
|
||||
|
||||
// if no candidates, we are done, no error
|
||||
@ -280,7 +280,7 @@ bool Images::getThumbnail ( char *pageSite ,
|
||||
// store the termid
|
||||
long long termId = q.getTermId(0);
|
||||
|
||||
if ( ! m_msg36.getTermFreq ( coll ,
|
||||
if ( ! m_msg36.getTermFreq ( m_collnum ,
|
||||
0 , // maxAge
|
||||
termId ,
|
||||
this ,
|
||||
@ -340,7 +340,7 @@ bool Images::launchRequests ( ) {
|
||||
0 , // maxAge
|
||||
false , // addToCache?
|
||||
RDB_INDEXDB ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
&m_list , // RdbList ptr
|
||||
startKey ,
|
||||
endKey ,
|
||||
@ -414,6 +414,9 @@ bool Images::downloadImages () {
|
||||
mfree ( m_imgBuf , m_imgBufMaxLen , "Image" );
|
||||
m_imgBuf = NULL;
|
||||
}
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
|
||||
|
||||
// . download each leftover image
|
||||
// . stop as soon as we get one with good dimensions
|
||||
// . make a thumbnail of that one
|
||||
@ -442,7 +445,7 @@ bool Images::downloadImages () {
|
||||
r->reset();
|
||||
r->m_maxTextDocLen = 200000;
|
||||
r->m_maxOtherDocLen = 500000;
|
||||
if ( ! strcmp(m_coll,"test")) {
|
||||
if ( ! strcmp(cr->m_coll,"qatest123")) {
|
||||
r->m_useTestCache = 1;
|
||||
r->m_addToTestCache = 1;
|
||||
}
|
||||
|
4
Images.h
4
Images.h
@ -41,7 +41,7 @@ class Images {
|
||||
long siteLen ,
|
||||
long long docId ,
|
||||
class XmlDoc *xd ,
|
||||
char *coll ,
|
||||
collnum_t collnum,
|
||||
char **statusPtr ,
|
||||
long hopCount,
|
||||
void *state ,
|
||||
@ -71,7 +71,7 @@ class Images {
|
||||
bool m_stopDownloading;
|
||||
char **m_statusPtr;
|
||||
char m_statusBuf[128];
|
||||
char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
|
||||
long long m_docId;
|
||||
IndexList m_list;
|
||||
|
12
Indexdb.cpp
12
Indexdb.cpp
@ -202,9 +202,10 @@ bool Indexdb::verify ( char *coll ) {
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
//long minRecSizes = 64000;
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
if ( ! msg5.getList ( RDB_INDEXDB ,
|
||||
coll ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
@ -293,6 +294,7 @@ void Indexdb::deepVerify ( char *coll ) {
|
||||
RdbBase *rdbBase = g_indexdb.m_rdb.getBase(collnum);
|
||||
long numFiles = rdbBase->getNumFiles();
|
||||
long currentFile = 0;
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
deepLoop:
|
||||
// done after scanning all files
|
||||
@ -304,7 +306,7 @@ deepLoop:
|
||||
}
|
||||
// scan this file
|
||||
if ( ! msg5.getList ( RDB_INDEXDB ,
|
||||
coll ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
@ -389,7 +391,7 @@ key_t Indexdb::makeKey ( long long termId ,
|
||||
|
||||
// . accesses RdbMap to estimate size of the indexList for this termId
|
||||
// . returns an UPPER BOUND
|
||||
long long Indexdb::getTermFreq ( char *coll , long long termId ) {
|
||||
long long Indexdb::getTermFreq ( collnum_t collnum , long long termId ) {
|
||||
// establish the list boundary keys
|
||||
key_t startKey = makeStartKey ( termId );
|
||||
key_t endKey = makeEndKey ( termId );
|
||||
@ -403,7 +405,7 @@ long long Indexdb::getTermFreq ( char *coll , long long termId ) {
|
||||
long oldTrunc = 100000;
|
||||
// get maxKey for only the top "oldTruncLimit" docids because when
|
||||
// we increase the trunc limit we screw up our extrapolation! BIG TIME!
|
||||
maxRecs = m_rdb.getListSize(coll,startKey,endKey,&maxKey,oldTrunc )/6;
|
||||
maxRecs=m_rdb.getListSize(collnum,startKey,endKey,&maxKey,oldTrunc )/6;
|
||||
// . TRUNCATION NOW OBSOLETE
|
||||
return maxRecs;
|
||||
|
||||
@ -427,7 +429,7 @@ long long Indexdb::getTermFreq ( char *coll , long long termId ) {
|
||||
// . modify maxKey
|
||||
key_t midKey = g_indexdb.makeKey ( termId , shy , 0LL , true );
|
||||
// get # of recs that have this termId and score
|
||||
long lastChunk = m_rdb.getListSize(coll,
|
||||
long lastChunk = m_rdb.getListSize(collnum,
|
||||
midKey,endKey,&maxKey,oldTrunc)/ 6;
|
||||
// now interpolate number of uncounted docids for the score "shy"
|
||||
long remaining = (((long long)lastChunk) * lastDocId) /
|
||||
|
@ -164,7 +164,7 @@ class Indexdb {
|
||||
// . accesses RdbMap to estimate size of the indexList for this termId
|
||||
// . returns a pretty tight upper bound if indexList not truncated
|
||||
// . if truncated, it does linear interpolation (use exponential!)
|
||||
long long getTermFreq ( char *coll , long long termId ) ;
|
||||
long long getTermFreq ( collnum_t collnum , long long termId ) ;
|
||||
|
||||
//long getTruncationLimit ( ){return g_conf.m_indexdbTruncationLimit;};
|
||||
|
||||
|
20
Json.cpp
20
Json.cpp
@ -421,3 +421,23 @@ bool JsonItem::isInArray ( ) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// convert nubers and bools to strings for this one
|
||||
char *JsonItem::getValueAsString ( long *valueLen ) {
|
||||
|
||||
// strings are the same
|
||||
if ( m_type == JT_STRING ) {
|
||||
*valueLen = getValueLen();
|
||||
return getValue();
|
||||
}
|
||||
|
||||
// numbers...
|
||||
static char s_numBuf[64];
|
||||
if ( (float)m_valueLong == m_valueDouble ) {
|
||||
*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
|
||||
return s_numBuf;
|
||||
}
|
||||
|
||||
*valueLen = sprintf ( s_numBuf,"%f", m_valueDouble );
|
||||
return s_numBuf;
|
||||
}
|
||||
|
3
Json.h
3
Json.h
@ -51,6 +51,9 @@ class JsonItem {
|
||||
return (char *)this + sizeof(JsonItem);
|
||||
};
|
||||
|
||||
// convert numbers and bools to strings for this one
|
||||
char *getValueAsString ( long *valueLen ) ;
|
||||
|
||||
// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
|
||||
bool getCompoundName ( SafeBuf &nameBuf ) ;
|
||||
|
||||
|
4
LICENSE
4
LICENSE
@ -198,7 +198,3 @@ license that then you can arrange a licensing agreement with Matt Wells.
|
||||
|
||||
Likewise, the Event datamining logic is in Events.cpp and must be separately licensed
|
||||
as well.
|
||||
|
||||
And any code in between "#ifdef NEEDLICENSE" and "#endif" statements is not
|
||||
covered by this license and must be licensed separately, too. That code is
|
||||
not compiled by default and only pertains to a few isolated things.
|
||||
|
561
Linkdb.cpp
561
Linkdb.cpp
@ -14,6 +14,7 @@ void Linkdb::reset() {
|
||||
}
|
||||
|
||||
bool Linkdb::init ( ) {
|
||||
|
||||
key224_t k;
|
||||
// sanity tests
|
||||
uint32_t linkeeSiteHash32 = (uint32_t)rand();
|
||||
@ -198,9 +199,10 @@ bool Linkdb::verify ( char *coll ) {
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
long minRecSizes = 64000;
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
if ( ! msg5.getList ( RDB_LINKDB ,
|
||||
coll ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
(char*)&startKey ,
|
||||
(char*)&endKey ,
|
||||
@ -231,6 +233,8 @@ bool Linkdb::verify ( char *coll ) {
|
||||
list.skipCurrentRecord() ) {
|
||||
key224_t k;
|
||||
list.getCurrentKey((char*)&k);
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//uint32_t shardNum = getShardNum ( RDB_LINKDB , &k );
|
||||
//if ( groupId == g_hostdb.m_groupId ) got++;
|
||||
@ -393,7 +397,7 @@ key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
|
||||
//static void gotRootTitleRecWrapper25 ( void *state ) ;
|
||||
//static void gotTermFreqWrapper ( void *state ) ;
|
||||
static void gotListWrapper ( void *state );//, RdbList *list );
|
||||
static void gotListWrapper ( void *state ,RdbList *list,Msg5 *msg5);
|
||||
static bool gotLinkTextWrapper ( void *state );
|
||||
//static void sendLinkInfoReplyWrapper ( void *state );//, LinkInfo *info ) ;
|
||||
//static void gotReplyWrapper25 ( void *state , void *state2 ) ;
|
||||
@ -404,7 +408,7 @@ Msg25::Msg25() {
|
||||
// set minhopcount to unknown
|
||||
//m_minInlinkerHopCount = -1;
|
||||
m_numReplyPtrs = 0;
|
||||
m_linkInfo = NULL;
|
||||
//m_linkInfo = NULL;
|
||||
m_ownReplies = true;
|
||||
}
|
||||
|
||||
@ -423,7 +427,7 @@ void Msg25::reset() {
|
||||
//if ( m_linkInfo )
|
||||
// mfree ( m_linkInfo , m_linkInfo->getStoredSize(),"msg25s");
|
||||
// this now points into m_linkInfoBuf safebuf, just NULL it
|
||||
m_linkInfo = NULL;
|
||||
//m_linkInfo = NULL;
|
||||
|
||||
m_table.reset();
|
||||
m_ipTable.reset();
|
||||
@ -435,12 +439,417 @@ void Msg25::reset() {
|
||||
#define MODE_PAGELINKINFO 1
|
||||
#define MODE_SITELINKINFO 2
|
||||
|
||||
// . get the inlinkers to this SITE (any page on this site)
|
||||
// . use that to compute a site quality
|
||||
// . also get the inlinkers sorted by date and see how many good inlinkers
|
||||
// we had since X days ago. (each inlinker needs a pub/birth date)
|
||||
// . we got a reply back from the msg25 request
|
||||
// . reply should just be a LinkInfo class
|
||||
// . set XmlDoc::m_linkInfoBuf safebuf to that reply
|
||||
// . we store tr to that safebuf in Msg25Request::m_linkInfoBuf
|
||||
void gotMulticastReplyWrapper25 ( void *state , void *state2 ) {
|
||||
|
||||
Msg25Request *req = (Msg25Request *)state;
|
||||
|
||||
// call callback now if error is set
|
||||
if ( g_errno ) {
|
||||
req->m_callback ( req->m_state );
|
||||
return;
|
||||
}
|
||||
|
||||
Multicast *mcast = req->m_mcast;
|
||||
|
||||
long replySize;
|
||||
long replyMaxSize;
|
||||
bool freeit;
|
||||
char *reply = mcast->getBestReply (&replySize,&replyMaxSize,&freeit);
|
||||
|
||||
// . store reply in caller's linkInfoBuf i guess
|
||||
// . mcast should free the reply
|
||||
req->m_linkInfoBuf->safeMemcpy ( reply , replySize );
|
||||
|
||||
// i guess we gotta free this
|
||||
mfree ( reply , replySize , "rep25" );
|
||||
|
||||
req->m_callback ( req->m_state );
|
||||
}
|
||||
|
||||
|
||||
// . returns false if would block, true otherwise
|
||||
// . sets g_errno and returns true on launch error
|
||||
// . calls req->m_callback when ready if it would block
|
||||
bool getLinkInfo ( SafeBuf *reqBuf ,
|
||||
Multicast *mcast ,
|
||||
char *site ,
|
||||
char *url ,
|
||||
bool isSiteLinkInfo ,
|
||||
long ip ,
|
||||
long long docId ,
|
||||
collnum_t collnum ,
|
||||
char *qbuf,
|
||||
long qbufSize,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ,
|
||||
bool isInjecting ,
|
||||
SafeBuf *pbuf ,
|
||||
bool printInXml ,
|
||||
long siteNumInlinks ,
|
||||
LinkInfo *oldLinkInfo ,
|
||||
long niceness ,
|
||||
bool doLinkSpamCheck ,
|
||||
bool oneVotePerIpDom ,
|
||||
bool canBeCancelled ,
|
||||
long lastUpdateTime ,
|
||||
bool onlyNeedGoodInlinks ,
|
||||
bool getLinkerTitles ,
|
||||
long ourHostHash32 ,
|
||||
long ourDomHash32 ,
|
||||
SafeBuf *linkInfoBuf ) {
|
||||
|
||||
long siteLen = gbstrlen(site);
|
||||
long urlLen = gbstrlen(url);
|
||||
|
||||
long oldLinkSize = 0;
|
||||
if ( oldLinkInfo )
|
||||
oldLinkSize = oldLinkInfo->getSize();
|
||||
|
||||
long need = sizeof(Msg25Request) + siteLen+1 + urlLen+1 + oldLinkSize;
|
||||
|
||||
// keep it in a safebuf so caller can just add "SafeBuf m_msg25Req;"
|
||||
// to his .h file and not have to worry about freeing it.
|
||||
reqBuf->purge();
|
||||
|
||||
// clear = true. put 0 bytes in there
|
||||
if ( ! reqBuf->reserve ( need ,"m25req", true ) ) return true;
|
||||
|
||||
Msg25Request *req = (Msg25Request *)reqBuf->getBufStart();
|
||||
|
||||
req->m_linkInfoBuf = linkInfoBuf;
|
||||
|
||||
req->m_mcast = mcast;
|
||||
|
||||
req->ptr_site = site;
|
||||
req->size_site = siteLen + 1;
|
||||
|
||||
req->ptr_url = url;
|
||||
req->size_url = urlLen + 1;
|
||||
|
||||
req->ptr_oldLinkInfo = (char *)oldLinkInfo;
|
||||
if ( oldLinkInfo ) req->size_oldLinkInfo = oldLinkInfo->getSize();
|
||||
else req->size_oldLinkInfo = 0;
|
||||
|
||||
if ( isSiteLinkInfo ) req->m_mode = MODE_SITELINKINFO;
|
||||
else req->m_mode = MODE_PAGELINKINFO;
|
||||
|
||||
req->m_ip = ip;
|
||||
req->m_docId = docId;
|
||||
req->m_collnum = collnum;
|
||||
req->m_state = state;
|
||||
req->m_callback = callback;
|
||||
req->m_isInjecting = isInjecting;
|
||||
req->m_printInXml = printInXml;
|
||||
req->m_siteNumInlinks = siteNumInlinks;
|
||||
req->m_niceness = niceness;
|
||||
req->m_doLinkSpamCheck = doLinkSpamCheck;
|
||||
req->m_oneVotePerIpDom = oneVotePerIpDom;
|
||||
req->m_canBeCancelled = canBeCancelled;
|
||||
req->m_lastUpdateTime = lastUpdateTime;
|
||||
req->m_onlyNeedGoodInlinks = onlyNeedGoodInlinks;
|
||||
req->m_getLinkerTitles = getLinkerTitles;
|
||||
req->m_ourHostHash32 = ourHostHash32;
|
||||
req->m_ourDomHash32 = ourDomHash32;
|
||||
|
||||
if ( g_conf.m_logDebugLinkInfo )
|
||||
req->m_printDebugMsgs = true;
|
||||
|
||||
Url u;
|
||||
u.set ( req->ptr_url );
|
||||
|
||||
req->m_linkHash64 = (uint64_t)u.getUrlHash64();
|
||||
|
||||
|
||||
req->m_siteHash32 = 0LL;
|
||||
req->m_siteHash64 = 0LL;
|
||||
if ( req->ptr_site ) {
|
||||
// hash collection # in with it
|
||||
long long h64 = hash64n ( req->ptr_site );
|
||||
h64 = hash64 ((char *)&req->m_collnum,sizeof(collnum_t),h64);
|
||||
req->m_siteHash64 = h64;
|
||||
req->m_siteHash32 = hash32n ( req->ptr_site );
|
||||
}
|
||||
|
||||
// send to host for local linkdb lookup
|
||||
key224_t startKey ;
|
||||
//long siteHash32 = hash32n ( req->ptr_site );
|
||||
// access different parts of linkdb depending on the "mode"
|
||||
if ( req->m_mode == MODE_SITELINKINFO )
|
||||
startKey = g_linkdb.makeStartKey_uk ( req->m_siteHash32 );
|
||||
else
|
||||
startKey = g_linkdb.makeStartKey_uk (req->m_siteHash32,
|
||||
req->m_linkHash64 );
|
||||
// what group has this linkdb list?
|
||||
unsigned long shardNum = getShardNum ( RDB_LINKDB, &startKey );
|
||||
// use a biased lookup
|
||||
long numTwins = g_hostdb.getNumHostsPerShard();
|
||||
long long sectionWidth = (0xffffffff/(long long)numTwins) + 1;
|
||||
// these are 192 bit keys, top 32 bits are a hash of the url
|
||||
unsigned long x = req->m_siteHash32;//(startKey.n1 >> 32);
|
||||
long hostNum = x / sectionWidth;
|
||||
long numHosts = g_hostdb.getNumHostsPerShard();
|
||||
Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
|
||||
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
|
||||
long hostId = hosts [ hostNum ].m_hostId ;
|
||||
|
||||
// . serialize the string buffers
|
||||
// . use Msg25Request::m_buf[MAX_NEEDED]
|
||||
// . turns the ptr_* members into offsets into req->m_buf[]
|
||||
req->serialize();
|
||||
|
||||
// this should always block
|
||||
if ( ! mcast->send (
|
||||
(char *)req ,
|
||||
req->getStoredSize() ,
|
||||
0x25 ,
|
||||
false , // does multicast own request?
|
||||
shardNum ,
|
||||
false , // send to whole group?
|
||||
0 , // key is passed on startKey
|
||||
req , // state data
|
||||
NULL , // state data
|
||||
gotMulticastReplyWrapper25 ,
|
||||
1000 , // timeout in seconds (was 30)
|
||||
req->m_niceness ,
|
||||
false, // realtime ,
|
||||
hostId )) {// firstHostId ,
|
||||
log("linkdb: Failed to send multicast for %s err=%s",
|
||||
u.getUrl(),mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
|
||||
// wait for req->m_callback(req->m_state) to be called
|
||||
return false;
|
||||
}
|
||||
|
||||
HashTableX g_lineTable;
|
||||
|
||||
static void sendReplyWrapper ( void *state ) {
|
||||
|
||||
long saved = g_errno;
|
||||
|
||||
Msg25 *m25 = (Msg25 *)state;
|
||||
// the original request
|
||||
Msg25Request *mr = m25->m_req25;
|
||||
// get udp slot for sending back reply
|
||||
UdpSlot *slot2 = mr->m_udpSlot;
|
||||
// shortcut
|
||||
SafeBuf *info = m25->m_linkInfoBuf;
|
||||
// steal this buffer
|
||||
char *reply1 = info->getBufStart();
|
||||
long replySize = info->length();
|
||||
// sanity. no if collrec not found its 0!
|
||||
if ( ! saved && replySize <= 0 ) {
|
||||
saved = g_errno = EBADENGINEER;
|
||||
log("linkdb: sending back empty link text reply. did "
|
||||
"coll get deleted?");
|
||||
//char *xx=NULL;*xx=0; }
|
||||
}
|
||||
// get original request
|
||||
Msg25Request *req = (Msg25Request *)slot2->m_readBuf;
|
||||
// sanity
|
||||
if ( req->m_udpSlot != slot2 ) { char *xx=NULL;*xx=0;}
|
||||
// if in table, nuke it
|
||||
g_lineTable.removeKey ( &req->m_siteHash64 );
|
||||
|
||||
nextLink:
|
||||
|
||||
UdpSlot *udpSlot = req->m_udpSlot;
|
||||
|
||||
// update for next udpSlot
|
||||
req = req->m_next;
|
||||
|
||||
// just dup the reply for each one
|
||||
char *reply2 = (char *)mdup(reply1,replySize,"m25repd");
|
||||
|
||||
// error?
|
||||
if ( saved || ! reply2 ) {
|
||||
long err = saved;
|
||||
if ( ! err ) err = g_errno;
|
||||
if ( ! err ) { char *xx=NULL;*xx=0; }
|
||||
g_udpServer.sendErrorReply(udpSlot,err);
|
||||
}
|
||||
else {
|
||||
// send it back to requester
|
||||
g_udpServer.sendReply_ass ( reply2 ,
|
||||
replySize ,
|
||||
reply2 ,
|
||||
replySize,
|
||||
udpSlot );
|
||||
}
|
||||
|
||||
// if we had a link
|
||||
if ( req ) goto nextLink;
|
||||
|
||||
// the destructor
|
||||
mdelete ( m25 ,sizeof(Msg25),"msg25");
|
||||
delete ( m25 );
|
||||
}
|
||||
|
||||
|
||||
// . server side of msg 0x25: "slot" holds a serialized Msg25Request in its
//   read buffer
// . if a site-mode request with the same m_siteHash64 is already being
//   worked on, link this request in behind it and return. it gets its
//   reply from sendReplyWrapper() when the first one completes.
// . otherwise make a Msg25, have it compute the LinkInfo through
//   getLinkInfo2(), and reply through sendReplyWrapper()
// . "netnice" is not used in here
void handleRequest25 ( UdpSlot *slot , long netnice ) {

	Msg25Request *req = (Msg25Request *)slot->m_readBuf;

	// turn the ptr_* members back into ptrs into req->m_buf[]
	req->deserialize();

	// make sure this always NULL for our linked list logic
	req->m_next = NULL;

	// udp socket for sending back the final linkInfo in m_linkInfoBuf
	// used by sendReply()
	req->m_udpSlot = slot;

	// . set up the hashtable if our first time
	// . key is the 8-byte m_siteHash64, data is a ptr to the head request
	// . the data size must be the size of a ptr. it used to be 4 which
	//   only holds a ptr on a 32-bit build.
	if ( ! g_lineTable.isInitialized() )
		g_lineTable.set ( 8,sizeof(Msg25Request *),256,NULL,0,
				  false,MAX_NICENESS,"lht25");

	// . if already working on this same request, wait for it, don't
	//   overload server with duplicate requests
	// . hashkey is combo of collection, url, and m_mode
	// . TODO: ensure does not send duplicate "page" link info requests
	//   just "site" link info requests
	long slotNum = -1;
	bool isSiteLinkInfo = false;
	if ( req->m_mode == MODE_SITELINKINFO ) {
		slotNum = g_lineTable.getSlot ( &req->m_siteHash64 );
		isSiteLinkInfo = true;
	}

	if ( slotNum >= 0 ) {
		Msg25Request *head ;
		head = *(Msg25Request **)g_lineTable.getValueFromSlot(slotNum);
		// insert ourselves right after the head
		if ( head->m_next )
			req->m_next = head->m_next;
		head->m_next = req;
		// note it for debugging
		log("build: msg25 request waiting in line for %s slot=0x%lx",
		    req->ptr_url,(long)slot);
		// we will send a reply back for this guy when done
		// getting the reply for the head msg25request
		return;
	}

	// make a new Msg25
	Msg25 *m25;
	try { m25 = new ( Msg25 ); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// sizeof() is a size_t but %i wants an int, so cast it
		log("build: msg25: new(%i): %s",
		    (int)sizeof(Msg25),mstrerror(g_errno));
		g_udpServer.sendErrorReply ( slot , g_errno );
		return;
	}
	mnew ( m25 , sizeof(Msg25) , "Msg25" );

	if ( isSiteLinkInfo ) {
		// add the initial entry
		g_lineTable.addKey ( &req->m_siteHash64 , &req );
	}

	// point to a real safebuf here for populating with data
	m25->m_linkInfoBuf = &m25->m_realBuf;

	// set some new stuff. should probably be set in getLinkInfo2()
	// but we are trying to leave that as unaltered as possible to
	// try to reduce debugging.
	m25->m_req25 = req;

	// . this should call our callback when done
	// . returns false if it blocked
	if ( ! m25->getLinkInfo2 ( req->ptr_site ,
				   req->ptr_url ,
				   isSiteLinkInfo ,
				   req->m_ip ,
				   req->m_docId ,
				   req->m_collnum , // coll
				   NULL, // qbuf
				   0 , // qbufSize
				   m25 , // state
				   sendReplyWrapper , // CALLBACK!
				   req->m_isInjecting ,
				   req->m_printDebugMsgs ,
				   //XmlDoc *xd ,
				   req->m_printInXml ,
				   req->m_siteNumInlinks ,
				   (LinkInfo *)req->ptr_oldLinkInfo ,
				   req->m_niceness ,
				   req->m_doLinkSpamCheck ,
				   req->m_oneVotePerIpDom ,
				   req->m_canBeCancelled ,
				   req->m_lastUpdateTime ,
				   req->m_onlyNeedGoodInlinks ,
				   req->m_getLinkerTitles ,
				   req->m_ourHostHash32 ,
				   req->m_ourDomHash32 ,
				   m25->m_linkInfoBuf ) ) // SafeBuf 4 output
		return;

	// sanity. if it did not block it needs a reply or an error.
	if(m25->m_linkInfoBuf->getLength()<=0&&!g_errno){char *xx=NULL;*xx=0;}

	if ( g_errno == ETRYAGAIN ) { char *xx=NULL;*xx=0; }

	if ( g_errno )
		log("linkdb: error getting linkinfo: %s",mstrerror(g_errno));

	// it did not block... g_errno will be set on error so sendReply()
	// should in that case send an error reply.
	sendReplyWrapper ( m25 );
}
|
||||
|
||||
long Msg25Request::getStoredSize() {
|
||||
return sizeof(Msg25Request) + size_url + size_site + size_oldLinkInfo;
|
||||
}
|
||||
|
||||
// . fix the char ptrs for sending over the network
|
||||
// . use a for loop like we do in Msg20.cpp if we get too many strings
|
||||
void Msg25Request::serialize ( ) {
|
||||
|
||||
char *p = m_buf;
|
||||
|
||||
memcpy ( p , ptr_url , size_url );
|
||||
ptr_url = (char *)(p - m_buf);
|
||||
p += size_url;
|
||||
|
||||
memcpy ( p , ptr_site , size_site );
|
||||
ptr_site = (char *)(p - m_buf);
|
||||
p += size_site;
|
||||
|
||||
memcpy ( p , ptr_oldLinkInfo , size_oldLinkInfo );
|
||||
ptr_oldLinkInfo = (char *)(p - m_buf);
|
||||
p += size_oldLinkInfo;
|
||||
}
|
||||
|
||||
void Msg25Request::deserialize ( ) {
|
||||
|
||||
char *p = m_buf;
|
||||
|
||||
ptr_url = p;
|
||||
p += size_url;
|
||||
|
||||
if ( size_url == 0 ) ptr_url = NULL;
|
||||
|
||||
ptr_site = p;
|
||||
p += size_site;
|
||||
|
||||
if ( size_site == 0 ) ptr_site = NULL;
|
||||
|
||||
ptr_oldLinkInfo = p;
|
||||
p += size_oldLinkInfo;
|
||||
|
||||
if ( size_oldLinkInfo == 0 ) ptr_oldLinkInfo = NULL;
|
||||
}
|
||||
|
||||
//////
|
||||
//
|
||||
// OLD interface below here. use the stuff above now so we can send
|
||||
// the request to a single host and multiple incoming requests can
|
||||
// wait in line, and we can set network bandwidth too.
|
||||
//
|
||||
/////
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
@ -448,21 +857,23 @@ void Msg25::reset() {
|
||||
// . NOTE: make sure no input vars are on the stack in case we block
|
||||
// . reallyGetLinkInfo is set to false if caller does not want it but calls
|
||||
// us anyway for some reason forgotten...
|
||||
bool Msg25::getLinkInfo ( char *site ,
|
||||
bool Msg25::getLinkInfo2( char *site ,
|
||||
char *url ,
|
||||
// either MODE_PAGELINKINFO or MODE_SITELINKINFO
|
||||
bool isSiteLinkInfo ,
|
||||
long ip ,
|
||||
long long docId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum,
|
||||
char *qbuf ,
|
||||
long qbufSize ,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ,
|
||||
bool isInjecting ,
|
||||
SafeBuf *pbuf ,
|
||||
XmlDoc *xd ,
|
||||
//bool printInXml ,
|
||||
//SafeBuf *pbuf ,
|
||||
bool printDebugMsgs ,
|
||||
//XmlDoc *xd ,
|
||||
bool printInXml ,
|
||||
long siteNumInlinks ,
|
||||
//long sitePop ,
|
||||
LinkInfo *oldLinkInfo ,
|
||||
@ -475,19 +886,26 @@ bool Msg25::getLinkInfo ( char *site ,
|
||||
bool getLinkerTitles ,
|
||||
long ourHostHash32 ,
|
||||
long ourDomHash32 ,
|
||||
// put LinkInfo output class in here
|
||||
SafeBuf *linkInfoBuf ) {
|
||||
|
||||
// reset the ip table
|
||||
reset();
|
||||
|
||||
//long mode = MODE_PAGELINKINFO;
|
||||
//m_printInXml = printInXml;
|
||||
if ( isSiteLinkInfo ) m_mode = MODE_SITELINKINFO;
|
||||
else m_mode = MODE_PAGELINKINFO;
|
||||
m_xd = xd;
|
||||
m_printInXml = false;
|
||||
if ( m_xd ) m_printInXml = m_xd->m_printInXml;
|
||||
//m_xd = xd;
|
||||
//m_printInXml = false;
|
||||
//if ( m_xd ) m_printInXml = m_xd->m_printInXml;
|
||||
m_printInXml = printInXml;
|
||||
|
||||
if ( printDebugMsgs ) m_pbuf = &m_tmp;
|
||||
else m_pbuf = NULL;
|
||||
|
||||
// sanity check
|
||||
if ( ! coll ) { char *xx=NULL; *xx=0; }
|
||||
//if ( ! coll ) { char *xx=NULL; *xx=0; }
|
||||
m_onlyNeedGoodInlinks = onlyNeedGoodInlinks;
|
||||
m_getLinkerTitles = getLinkerTitles;
|
||||
// save safebuf ptr, where we store the link info
|
||||
@ -498,10 +916,10 @@ bool Msg25::getLinkInfo ( char *site ,
|
||||
// must have a valid ip
|
||||
//if ( ! ip || ip == -1 ) { char *xx = NULL; *xx = 0; }
|
||||
// get collection rec for our collection
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );//, collLen );
|
||||
CollectionRec *cr = g_collectiondb.getRec ( collnum );//, collLen );
|
||||
// bail if NULL
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOTFOUND;
|
||||
g_errno = ENOCOLLREC;
|
||||
log("build: No collection record found when getting "
|
||||
"link info.");
|
||||
return true;
|
||||
@ -532,7 +950,8 @@ bool Msg25::getLinkInfo ( char *site ,
|
||||
m_linkSpamLinkdb = 0;
|
||||
//m_url = url;
|
||||
m_docId = docId;
|
||||
m_coll = coll;
|
||||
//m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
//m_collLen = collLen;
|
||||
m_callback = callback;
|
||||
m_state = state;
|
||||
@ -545,7 +964,7 @@ bool Msg25::getLinkInfo ( char *site ,
|
||||
m_qbufSize = qbufSize;
|
||||
m_isInjecting = isInjecting;
|
||||
m_oldLinkInfo = oldLinkInfo;
|
||||
m_pbuf = pbuf;
|
||||
//m_pbuf = pbuf;
|
||||
m_ip = ip;
|
||||
m_top = iptop(m_ip);
|
||||
m_lastUpdateTime = lastUpdateTime;
|
||||
@ -601,6 +1020,7 @@ bool Msg25::getLinkInfo ( char *site ,
|
||||
// must have a valid ip
|
||||
if ( ! ip || ip == -1 ) { //char *xx = NULL; *xx = 0; }
|
||||
log("linkdb: no inlinks because ip is invalid");
|
||||
g_errno = EBADENGINEER;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -651,7 +1071,7 @@ bool Msg25::doReadLoop ( ) {
|
||||
long numFiles = -1;
|
||||
// NO, DON't restrict because it will mess up the hopcount.
|
||||
bool includeTree = true;
|
||||
|
||||
/*
|
||||
// what group has this linkdb list?
|
||||
//unsigned long groupId = getGroupId ( RDB_LINKDB , &startKey );
|
||||
unsigned long shardNum = getShardNum ( RDB_LINKDB, &startKey );
|
||||
@ -665,7 +1085,7 @@ bool Msg25::doReadLoop ( ) {
|
||||
Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
|
||||
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
|
||||
long hostId = hosts [ hostNum ].m_hostId ;
|
||||
|
||||
*/
|
||||
// debug log
|
||||
if ( g_conf.m_logDebugLinkInfo ) {
|
||||
char *ms = "page";
|
||||
@ -677,6 +1097,15 @@ bool Msg25::doReadLoop ( ) {
|
||||
|
||||
m_gettingList = true;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
if ( ! cr ) {
|
||||
log("linkdb: no coll for collnum %li",(long)m_collnum);
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
|
||||
//char *coll = cr->m_coll;
|
||||
|
||||
// . get the linkdb list
|
||||
// . we now get the WHOLE list so we can see how many linkers there are
|
||||
// . we need a high timeout because udp server was getting suspended
|
||||
@ -685,27 +1114,22 @@ bool Msg25::doReadLoop ( ) {
|
||||
// Now we hang indefinitely. We also fixed UdpServer to resend
|
||||
// requests after 30 seconds even though it was fully acked in case
|
||||
// the receiving host went down and is now back up.
|
||||
if ( ! m_msg0.getList ( -1 , // hostId, -1 if none
|
||||
0 , // hostId ip
|
||||
0 , // hostId port
|
||||
0 , // max cache age in seconds
|
||||
false , // addToCache?
|
||||
if ( ! m_msg5.getList (
|
||||
RDB_LINKDB ,
|
||||
m_coll ,
|
||||
cr->m_collnum ,
|
||||
&m_list ,
|
||||
(char*)&startKey,
|
||||
(char*)&endKey ,
|
||||
m_minRecSizes ,
|
||||
includeTree ,
|
||||
false , // add to cache?
|
||||
0 , // maxcacheage
|
||||
0 , // startFileNum
|
||||
numFiles ,
|
||||
this ,
|
||||
gotListWrapper ,
|
||||
m_niceness ,
|
||||
true , // error correct?
|
||||
includeTree ,
|
||||
true , // do merge
|
||||
hostId , // firstHostId
|
||||
0 , // startFileNum
|
||||
numFiles ,
|
||||
60*60*24*365 )){// timeout of one year
|
||||
true )){ // error correct?
|
||||
//log("debug: msg0 blocked this=%lx",(long)this);
|
||||
return false;
|
||||
}
|
||||
@ -725,7 +1149,7 @@ bool Msg25::doReadLoop ( ) {
|
||||
return gotList();
|
||||
}
|
||||
|
||||
void gotListWrapper ( void *state ) { // , RdbList *list ) {
|
||||
void gotListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
Msg25 *THIS = (Msg25 *) state;
|
||||
|
||||
//log("debug: entering gotlistwrapper this=%lx",(long)THIS);
|
||||
@ -964,6 +1388,13 @@ bool Msg25::sendRequests ( ) {
|
||||
if ( ourMax > MAX_MSG20_OUTSTANDING )
|
||||
ourMax = MAX_MSG20_OUTSTANDING;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
if ( ! cr ) {
|
||||
log("linkdb: collnum %li is gone",(long)m_collnum);
|
||||
return true;
|
||||
}
|
||||
//char *coll = cr->m_coll;
|
||||
|
||||
// if more than 300 sockets in use max this 1. prevent udp socket clog.
|
||||
if ( g_udpServer.m_numUsedSlots >= 300 ) ourMax = 1;
|
||||
|
||||
@ -1204,8 +1635,9 @@ bool Msg25::sendRequests ( ) {
|
||||
r-> ptr_linkee = m_site;
|
||||
r->size_linkee = gbstrlen(m_site)+1; // include \0
|
||||
}
|
||||
r-> ptr_coll = m_coll;
|
||||
r->size_coll = gbstrlen(m_coll) + 1; // include \0
|
||||
//r-> ptr_coll = coll;
|
||||
//r->size_coll = gbstrlen(coll) + 1; // include \0
|
||||
r->m_collnum = cr->m_collnum;
|
||||
r->m_docId = docId;
|
||||
r->m_expected = true; // false;
|
||||
r->m_niceness = m_niceness;
|
||||
@ -1532,6 +1964,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
mstrerror(g_errno),docId);
|
||||
// this is a special case
|
||||
if ( g_errno == ECANCELLED ||
|
||||
g_errno == ENOCOLLREC ||
|
||||
g_errno == ENOMEM ||
|
||||
g_errno == ENOSLOTS ) {
|
||||
m_errors++;
|
||||
@ -1826,7 +2259,8 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
log("linkdb: recalling round=%li for %s=%s",
|
||||
m_round,ms,m_site);
|
||||
}
|
||||
// and re-call
|
||||
// and re-call. returns true if did not block.
|
||||
// returns true with g_errno set on error.
|
||||
if ( ! doReadLoop() ) return false;
|
||||
// it did not block!! wtf? i guess it read no more or
|
||||
// launched no more requests.
|
||||
@ -1898,11 +2332,18 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
ms,m_site,m_url,m_docId);
|
||||
}
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
if ( ! cr ) {
|
||||
log("linkdb: collnum %li is gone",(long)m_collnum);
|
||||
return true;
|
||||
}
|
||||
char *coll = cr->m_coll;
|
||||
|
||||
// . this returns NULL and sets g_errno on error
|
||||
// . returns an allocated ptr to a LinkInfo class
|
||||
// . we are responsible for freeing
|
||||
// . LinkInfo::getSize() returns the allocated size
|
||||
m_linkInfo = makeLinkInfo ( m_coll ,
|
||||
makeLinkInfo ( coll ,
|
||||
m_ip ,
|
||||
m_siteNumInlinks ,
|
||||
//m_sitePop ,
|
||||
@ -1919,7 +2360,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
this ,
|
||||
m_linkInfoBuf );
|
||||
// return true with g_errno set on error
|
||||
if ( ! m_linkInfo ) {
|
||||
if ( ! m_linkInfoBuf->length() ) {
|
||||
log("build: msg25 linkinfo set: %s",mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
@ -1973,7 +2414,9 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
char *ss = "site";
|
||||
if ( m_mode == MODE_PAGELINKINFO ) ss = "page";
|
||||
|
||||
long siteRank = ::getSiteRank ( m_linkInfo->m_numGoodInlinks );
|
||||
LinkInfo *info = (LinkInfo *)m_linkInfoBuf->getBufStart();
|
||||
|
||||
long siteRank = ::getSiteRank ( info->m_numGoodInlinks );
|
||||
|
||||
if ( m_printInXml ) { // && m_xd ) {
|
||||
|
||||
@ -1983,19 +2426,24 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
"</sampleCreatedUTC>\n"
|
||||
, m_lastUpdateTime
|
||||
);
|
||||
char *u = NULL;
|
||||
if ( m_xd ) u = m_xd->ptr_firstUrl;
|
||||
//char *u = NULL;
|
||||
//if ( m_xd ) u = m_xd->ptr_firstUrl;
|
||||
// m_url should point into the Msg25Request buffer
|
||||
char *u = m_url;
|
||||
if ( u )
|
||||
m_pbuf->safePrintf("\t<url><![CDATA[%s]]></url>\n",u);
|
||||
|
||||
char *site = NULL;
|
||||
if ( m_xd ) site = m_xd->ptr_site;
|
||||
//char *site = NULL;
|
||||
//if ( m_xd ) site = m_xd->ptr_site;
|
||||
// m_site should point into the Msg25Request buffer
|
||||
char *site = m_site;
|
||||
if ( site )
|
||||
m_pbuf->safePrintf("\t<site><![CDATA[%s]]></site>\n",
|
||||
site);
|
||||
|
||||
long long d = 0LL;
|
||||
if ( m_xd ) d = m_xd->m_docId;
|
||||
//long long d = 0LL;
|
||||
//if ( m_xd ) d = m_xd->m_docId;
|
||||
long long d = m_docId;
|
||||
if ( d && d != -1LL )
|
||||
m_pbuf->safePrintf("\t<docId>%lli</docId>\n",d);
|
||||
|
||||
@ -2018,7 +2466,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
// the total # of inlinkers. we may not have
|
||||
// read all of them from disk though.
|
||||
, m_numDocIds
|
||||
, m_linkInfo->m_numGoodInlinks
|
||||
, info->m_numGoodInlinks
|
||||
, m_cblocks
|
||||
, m_uniqueIps
|
||||
);
|
||||
@ -2142,10 +2590,10 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
for ( long j = 0 ; j < MAX_ENTRY_DOCIDS ; j++ ) {
|
||||
if ( e->m_docIds[j] == -1LL ) break;
|
||||
if ( ! m_printInXml )
|
||||
m_pbuf->safePrintf ("<a href=\"/master/titledb"
|
||||
m_pbuf->safePrintf ("<a href=\"/admin/titledb"
|
||||
"?c=%s&d=%lli\">"
|
||||
"%li</a> ",
|
||||
m_coll,e->m_docIds[j],j);
|
||||
coll,e->m_docIds[j],j);
|
||||
}
|
||||
if ( ! m_printInXml )
|
||||
m_pbuf->safePrintf ( " </td></tr>\n" );
|
||||
@ -2225,7 +2673,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
(long)m_ipDupsLinkdb ,
|
||||
(long)m_docIdDupsLinkdb ,
|
||||
(long)m_linkSpamLinkdb ,
|
||||
m_linkInfo->m_numGoodInlinks
|
||||
info->m_numGoodInlinks
|
||||
// good and max
|
||||
//(long)m_linkInfo->getNumInlinks() ,
|
||||
);
|
||||
@ -2490,7 +2938,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
m_pbuf->safePrintf("<td><a href=\"/search?q=ip%%3A"
|
||||
"%s&c=%s&n=200\">%s</a></td>" // ip
|
||||
, iptoa(r->m_ip)
|
||||
, m_coll
|
||||
, coll
|
||||
, iptoa(r->m_ip)
|
||||
);
|
||||
m_pbuf->safePrintf("<td>%s</td>"
|
||||
@ -3487,7 +3935,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
|
||||
// . how many unique ips link to us?
|
||||
// . this count includes internal IPs as well
|
||||
info->m_numUniqueIps = msg25->m_uniqueIps;
|
||||
// keep things consistent for the "test" coll
|
||||
// keep things consistent for the "qatest123" coll
|
||||
info->m_reserved1 = 0;
|
||||
info->m_reserved2 = 0;
|
||||
// how many total GOOD inlinks we got. does not include internal cblock
|
||||
@ -3551,6 +3999,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
|
||||
// how many guys that we stored were internal?
|
||||
info->m_numInlinksInternal = (char)icount3;
|
||||
|
||||
linkInfoBuf->setLength ( need );
|
||||
|
||||
// sanity parse it
|
||||
//long ss = 0;
|
||||
@ -4161,7 +4610,7 @@ bool LinkInfo::print ( SafeBuf *sb , char *coll ) {
|
||||
"<tr><td colspan=2>link #%04li "
|
||||
"("
|
||||
//"baseScore=%010li, "
|
||||
"d=<a href=\"/master/titledb?c=%s&"
|
||||
"d=<a href=\"/admin/titledb?c=%s&"
|
||||
"d=%lli\">%016lli</a>, "
|
||||
"siterank=%li, "
|
||||
"hopcount=%03li "
|
||||
|
151
Linkdb.h
151
Linkdb.h
@ -35,6 +35,123 @@
|
||||
#include "DiskPageCache.h"
|
||||
#include "Titledb.h"
|
||||
|
||||
void handleRequest25 ( UdpSlot *slot , long netnice ) ;
|
||||
|
||||
// . get the inlinkers to this SITE (any page on this site)
|
||||
// . use that to compute a site quality
|
||||
// . also get the inlinkers sorted by date and see how many good inlinkers
|
||||
// we had since X days ago. (each inlinker needs a pub/birth date)
|
||||
class Msg25Request {
|
||||
public:
|
||||
// either MODE_PAGELINKINFO or MODE_SITELINKINFO
|
||||
char m_mode; // bool m_isSiteLinkInfo ;
|
||||
long m_ip ;
|
||||
long long m_docId ;
|
||||
collnum_t m_collnum ;
|
||||
bool m_isInjecting ;
|
||||
bool m_printInXml ;
|
||||
|
||||
// when we get a reply we call this
|
||||
void *m_state ;
|
||||
void (* m_callback)(void *state) ;
|
||||
|
||||
// server-side parms so it doesn't have to allocate a state
|
||||
//SafeBuf m_pbuf ;
|
||||
//SafeBuf m_linkInfoBuf ;
|
||||
|
||||
//char *coll ;
|
||||
//char *qbuf ;
|
||||
//long qbufSize ;
|
||||
//XmlDoc *xd ;
|
||||
|
||||
long m_siteNumInlinks ;
|
||||
class LinkInfo *m_oldLinkInfo ;
|
||||
long m_niceness ;
|
||||
bool m_doLinkSpamCheck ;
|
||||
bool m_oneVotePerIpDom ;
|
||||
bool m_canBeCancelled ;
|
||||
long m_lastUpdateTime ;
|
||||
bool m_onlyNeedGoodInlinks ;
|
||||
bool m_getLinkerTitles ;
|
||||
long m_ourHostHash32 ;
|
||||
long m_ourDomHash32 ;
|
||||
|
||||
// new stuff
|
||||
long m_siteHash32;
|
||||
long long m_siteHash64;
|
||||
long long m_linkHash64;
|
||||
// for linked list of these guys in g_lineTable in Linkdb.cpp
|
||||
// but only used on the server end, not client end
|
||||
class Msg25Request *m_next;
|
||||
// the multicast we use
|
||||
class Multicast *m_mcast;
|
||||
UdpSlot *m_udpSlot;
|
||||
bool m_printDebugMsgs;
|
||||
// store final LinkInfo reply in here
|
||||
SafeBuf *m_linkInfoBuf;
|
||||
|
||||
|
||||
char *ptr_site;
|
||||
char *ptr_url;
|
||||
char *ptr_oldLinkInfo;
|
||||
|
||||
long size_site;
|
||||
long size_url;
|
||||
long size_oldLinkInfo;
|
||||
|
||||
char m_buf[0];
|
||||
|
||||
long getStoredSize();
|
||||
void serialize();
|
||||
void deserialize();
|
||||
};
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets errno on error
|
||||
// . your req->m_callback will be called with the Msg25Reply
|
||||
bool getLinkInfo ( SafeBuf *reqBuf , // store msg25 request in here
|
||||
Multicast *mcast , // use this to send msg 0x25 request
|
||||
char *site ,
|
||||
char *url ,
|
||||
bool isSiteLinkInfo ,
|
||||
long ip ,
|
||||
long long docId ,
|
||||
collnum_t collnum ,
|
||||
char *qbuf ,
|
||||
long qbufSize ,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ,
|
||||
bool isInjecting ,
|
||||
SafeBuf *pbuf ,
|
||||
//class XmlDoc *xd ,
|
||||
bool printInXml ,
|
||||
long siteNumInlinks ,
|
||||
//long sitePop ,
|
||||
LinkInfo *oldLinkInfo ,
|
||||
long niceness ,
|
||||
bool doLinkSpamCheck ,
|
||||
bool oneVotePerIpDom ,
|
||||
bool canBeCancelled ,
|
||||
long lastUpdateTime ,
|
||||
bool onlyNeedGoodInlinks ,
|
||||
bool getLinkerTitles , //= false ,
|
||||
// if an inlinking document has an outlink
|
||||
// of one of these hashes then we set
|
||||
// Msg20Reply::m_hadLinkToOurDomOrHost.
|
||||
// it is used to remove an inlinker to a related
|
||||
// docid, which also links to our main seo url
|
||||
// being processed. so we do not recommend
|
||||
// such links since they already link to a page
|
||||
// on your domain or hostname. set BOTH to zero
|
||||
// to not perform this algo in handleRequest20()'s
|
||||
// call to XmlDoc::getMsg20Reply().
|
||||
long ourHostHash32 , // = 0 ,
|
||||
long ourDomHash32 , // = 0 );
|
||||
SafeBuf *myLinkInfoBuf );
|
||||
|
||||
|
||||
void handleRequest25 ( UdpSlot *slot , long netnice ) ;
|
||||
|
||||
long getSiteRank ( long sni ) ;
|
||||
|
||||
class Linkdb {
|
||||
@ -307,19 +424,22 @@ class Msg25 {
|
||||
// any link text and return true right away, really saves a bunch
|
||||
// of disk seeks when spidering small collections that don't need
|
||||
// link text/info indexing/analysis
|
||||
bool getLinkInfo ( char *site ,
|
||||
bool getLinkInfo2 (char *site ,
|
||||
char *url ,
|
||||
bool isSiteLinkInfo ,
|
||||
long ip ,
|
||||
long long docId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum,
|
||||
char *qbuf ,
|
||||
long qbufSize ,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ,
|
||||
bool isInjecting ,
|
||||
SafeBuf *pbuf ,
|
||||
class XmlDoc *xd ,
|
||||
//SafeBuf *pbuf ,
|
||||
bool printDebugMsgs , // into "Msg25::m_pbuf"
|
||||
//class XmlDoc *xd ,
|
||||
bool printInXml ,
|
||||
long siteNumInlinks ,
|
||||
//long sitePop ,
|
||||
LinkInfo *oldLinkInfo ,
|
||||
@ -363,17 +483,21 @@ class Msg25 {
|
||||
|
||||
//char getMinInlinkerHopCount () { return m_minInlinkerHopCount; };
|
||||
|
||||
// a new parm referencing the request we got over the network
|
||||
class Msg25Request * m_req25;
|
||||
|
||||
class Msg20Reply *getLoser (class Msg20Reply *r, class Msg20Reply *p);
|
||||
char *isDup (class Msg20Reply *r, class Msg20Reply *p);
|
||||
|
||||
bool addNote ( char *note , long noteLen , long long docId );
|
||||
|
||||
class LinkInfo *getLinkInfo () { return m_linkInfo; };
|
||||
//class LinkInfo *getLinkInfo () { return m_linkInfo; };
|
||||
|
||||
// m_linkInfo ptr references into here. provided by caller.
|
||||
SafeBuf *m_linkInfoBuf;
|
||||
|
||||
SafeBuf m_realBuf;
|
||||
|
||||
// private:
|
||||
// these need to be public for wrappers to call:
|
||||
bool gotTermFreq ( bool msg42Called ) ;
|
||||
@ -409,9 +533,10 @@ class Msg25 {
|
||||
bool m_onlyNeedGoodInlinks;
|
||||
bool m_getLinkerTitles;
|
||||
long long m_docId;
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
//long m_collLen;
|
||||
LinkInfo *m_linkInfo;
|
||||
//LinkInfo *m_linkInfo;
|
||||
void *m_state;
|
||||
void (* m_callback) ( void *state );
|
||||
|
||||
@ -419,7 +544,7 @@ class Msg25 {
|
||||
//long m_sitePop;
|
||||
long m_mode;
|
||||
bool m_printInXml;
|
||||
class XmlDoc *m_xd;
|
||||
//class XmlDoc *m_xd;
|
||||
|
||||
// private:
|
||||
|
||||
@ -437,7 +562,8 @@ class Msg25 {
|
||||
// . the href: IndexList's docIds are docs that link to us
|
||||
// . we now use Msg2 since it has "restrictIndexdb" support to limit
|
||||
// indexdb searches to just the root file to decrease disk seeks
|
||||
Msg0 m_msg0;
|
||||
//Msg0 m_msg0;
|
||||
Msg5 m_msg5;
|
||||
RdbList m_list;
|
||||
|
||||
class Inlink *m_k;
|
||||
@ -499,7 +625,12 @@ class Msg25 {
|
||||
// this is used for link ban checks
|
||||
//Msg18 m_msg18;
|
||||
|
||||
SafeBuf *m_pbuf;
|
||||
SafeBuf m_tmp;
|
||||
SafeBuf *m_pbuf; // will point to m_tmp if m_printDebugMsgs
|
||||
|
||||
// for holding the final linkinfo output
|
||||
//SafeBuf m_linkInfoBuf;
|
||||
|
||||
// copied from CollectionRec
|
||||
bool m_oneVotePerIpDom ;
|
||||
bool m_doLinkSpamCheck ;
|
||||
|
8644
Make.depend
8644
Make.depend
File diff suppressed because it is too large
Load Diff
19
Makefile
19
Makefile
@ -2,11 +2,11 @@ SHELL = /bin/bash
|
||||
|
||||
CC=g++
|
||||
|
||||
OBJS = Tfndb.o UdpSlot.o Rebalance.o \
|
||||
OBJS = UdpSlot.o Rebalance.o \
|
||||
Msg13.o Mime.o IndexReadInfo.o \
|
||||
PageGet.o PageHosts.o PageIndexdb.o PageLogin.o \
|
||||
PageGet.o PageHosts.o PageIndexdb.o \
|
||||
PageParser.o PageInject.o PagePerf.o PageReindex.o PageResults.o \
|
||||
PageRoot.o PageSockets.o PageStats.o \
|
||||
PageAddUrl.o PageRoot.o PageSockets.o PageStats.o \
|
||||
PageTitledb.o \
|
||||
PageAddColl.o \
|
||||
hash.o Domains.o \
|
||||
@ -57,9 +57,10 @@ OBJS = Tfndb.o UdpSlot.o Rebalance.o \
|
||||
PostQueryRerank.o Msge0.o Msge1.o \
|
||||
CountryCode.o DailyMerge.o CatRec.o Tagdb.o \
|
||||
Users.o Images.o Wiki.o Wiktionary.o Scraper.o \
|
||||
Dates.o Sections.o SiteGetter.o Syncdb.o \
|
||||
Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
|
||||
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
|
||||
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o
|
||||
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o
|
||||
|
||||
|
||||
CHECKFORMATSTRING = -D_CHECK_FORMAT_STRING_
|
||||
|
||||
@ -76,7 +77,8 @@ ifeq ("titan","$(HOST)")
|
||||
# in 2013. So it just uses clone() and does its own "threading". Unfortunately,
|
||||
# the way it works is not even possible on newer kernels because they no longer
|
||||
# allow you to override the _errno_location() function. -- matt
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DMATTWELLS -DNEEDLICENSE
|
||||
# -DMATTWELLS
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DTITAN
|
||||
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
|
||||
else
|
||||
# use -m32 to force 32-bit mode compilation.
|
||||
@ -326,8 +328,9 @@ Rdb.o:
|
||||
RdbBase.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
RdbCache.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
# RdbCache.cpp gets "corrupted" with -O2... like RdbTree.cpp
|
||||
#RdbCache.o:
|
||||
# $(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
# fast dictionary generation and spelling recommendations
|
||||
#Speller.o:
|
||||
|
44
Mem.cpp
44
Mem.cpp
@ -14,7 +14,7 @@
|
||||
|
||||
// put me back
|
||||
//#define EFENCE
|
||||
#define EFENCE_SIZE 100000
|
||||
//#define EFENCE_SIZE 50000
|
||||
|
||||
// uncomment this for EFENCE to do underflow checks instead of the
|
||||
// default overflow checks
|
||||
@ -52,7 +52,7 @@
|
||||
// there because it will hit a different PAGE, to be more sure we could
|
||||
// make UNDERPAD and OVERPAD PAGE bytes, although the overrun could still write
|
||||
// to another allocated area of memory and we can never catch it.
|
||||
#ifdef EFENCE
|
||||
#if defined(EFENCE) || defined(EFENCE_SIZE)
|
||||
#define UNDERPAD 0
|
||||
#define OVERPAD 0
|
||||
#else
|
||||
@ -68,7 +68,7 @@ extern bool g_isYippy;
|
||||
|
||||
bool freeCacheMem();
|
||||
|
||||
#ifdef EFENCE
|
||||
#if defined(EFENCE) || defined(EFENCE_SIZE)
|
||||
static void *getElecMem ( long size ) ;
|
||||
static void freeElecMem ( void *p ) ;
|
||||
#endif
|
||||
@ -254,6 +254,12 @@ void * operator new (size_t size) throw (std::bad_alloc) {
|
||||
}
|
||||
#ifdef EFENCE
|
||||
void *mem = getElecMem(size);
|
||||
#elif EFENCE_SIZE
|
||||
void *mem;
|
||||
if ( size > EFENCE_SIZE )
|
||||
mem = getElecMem(size);
|
||||
else
|
||||
mem = sysmalloc ( size );
|
||||
#else
|
||||
//void *mem = dlmalloc ( size );
|
||||
void *mem = sysmalloc ( size );
|
||||
@ -332,6 +338,12 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
|
||||
}
|
||||
#ifdef EFENCE
|
||||
void *mem = getElecMem(size);
|
||||
#elif EFENCE_SIZE
|
||||
void *mem;
|
||||
if ( size > EFENCE_SIZE )
|
||||
mem = getElecMem(size);
|
||||
else
|
||||
mem = sysmalloc ( size );
|
||||
#else
|
||||
//void *mem = dlmalloc ( size );
|
||||
void *mem = sysmalloc ( size );
|
||||
@ -445,10 +457,11 @@ bool Mem::init ( long long maxMem ) {
|
||||
if ( g_conf.m_detectMemLeaks )
|
||||
log(LOG_INIT,"mem: Memory leak checking is enabled.");
|
||||
|
||||
#ifdef EFENCE
|
||||
#if defined(EFENCE) || defined(EFENCE_SIZE)
|
||||
log(LOG_INIT,"mem: using electric fence!!!!!!!");
|
||||
#endif
|
||||
|
||||
#ifndef TITAN
|
||||
// if we can't alloc 3gb exit and retry
|
||||
long long start = gettimeofdayInMilliseconds();
|
||||
char *pools[30];
|
||||
@ -471,6 +484,7 @@ bool Mem::init ( long long maxMem ) {
|
||||
if ( took > 20 ) log("mem: took %lli ms to check memory ceiling",took);
|
||||
// return if could not alloc the full 3GB
|
||||
if ( i < 30 ) return false;
|
||||
#endif
|
||||
|
||||
// reset this, our max mem used over time ever because we don't
|
||||
// want the mem test we did above to count towards it
|
||||
@ -500,6 +514,15 @@ void Mem::addMem ( void *mem , long size , const char *note , char isnew ) {
|
||||
|
||||
//validate();
|
||||
|
||||
// Warn when m_numAllocated comes within 100 of m_memtablesize.
// s_printed must be static: as a plain local it was re-initialized to
// false on every call, so the "print once" guard never took effect and
// the log line plus the printMem() call were repeated on every
// addMem() call made while over the threshold.
if ( (long)m_numAllocated + 100 >= (long)m_memtablesize ) {
	static bool s_printed = false;
	if ( ! s_printed ) {
		log("mem: using too many slots");
		printMem();
		s_printed = true;
	}
}
|
||||
|
||||
// sanity check
|
||||
if ( g_inSigHandler ) {
|
||||
log(LOG_LOGIC,"mem: In sig handler.");
|
||||
@ -1284,7 +1307,7 @@ void *Mem::gbmalloc ( int size , const char *note ) {
|
||||
mem = getElecMem(size+UNDERPAD+OVERPAD);
|
||||
|
||||
// conditional electric fence?
|
||||
#elif EFENCE_BIG
|
||||
#elif EFENCE_SIZE
|
||||
if ( size >= EFENCE_SIZE )
|
||||
mem = getElecMem(size+0+0);
|
||||
else
|
||||
@ -1435,9 +1458,9 @@ void *Mem::gbrealloc ( void *ptr , int oldSize , int newSize ,
|
||||
|
||||
char *mem;
|
||||
|
||||
// even though size may be < 100k for EFENCE_BIG, do it this way
|
||||
// even though size may be < 100k for EFENCE_SIZE, do it this way
|
||||
// for simplicity...
|
||||
#if defined(EFENCE) || defined(EFENCE_BIG)
|
||||
#if defined(EFENCE) || defined(EFENCE_SIZE)
|
||||
mem = (char *)mmalloc ( newSize , note );
|
||||
if ( ! mem ) return NULL;
|
||||
// copy over to it
|
||||
@ -1516,21 +1539,22 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
|
||||
char *xx = NULL; *xx = 0;
|
||||
}
|
||||
|
||||
bool isnew = s_isnew[slot];
|
||||
|
||||
#ifdef EFENCE
|
||||
// this does a delayed free so do not call rmMem() just yet
|
||||
freeElecMem ((char *)ptr - UNDERPAD );
|
||||
return;
|
||||
#endif
|
||||
|
||||
#ifdef EFENCE_BIG
|
||||
#ifdef EFENCE_SIZE
|
||||
if ( size == -1 ) size = s_sizes[slot];
|
||||
if ( size >= EFENCE_SIZE ) {
|
||||
freeElecMem ((char *)ptr - 0 );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool isnew = s_isnew[slot];
|
||||
|
||||
// if this returns false it was an unbalanced free
|
||||
if ( ! rmMem ( ptr , size , note ) ) return;
|
||||
|
||||
|
@ -78,9 +78,10 @@ bool Monitordb::verify ( char *coll ) {
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
long minRecSizes = 64000;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
if ( ! msg5.getList ( RDB_MONITORDB ,
|
||||
coll ,
|
||||
cr->m_collnum,
|
||||
&list ,
|
||||
(char*)&startKey ,
|
||||
(char*)&endKey ,
|
||||
|
17
Msg0.cpp
17
Msg0.cpp
@ -103,7 +103,8 @@ bool Msg0::getList ( long long hostId , // host to ask (-1 if none)
|
||||
long maxCacheAge , // max cached age in seconds
|
||||
bool addToCache , // add net recv'd list to cache?
|
||||
char rdbId , // specifies the rdb
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
RdbList *list ,
|
||||
//key_t startKey ,
|
||||
//key_t endKey ,
|
||||
@ -143,7 +144,7 @@ bool Msg0::getList ( long long hostId , // host to ask (-1 if none)
|
||||
//if ( doIndexdbSplit )
|
||||
// logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg0.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");
|
||||
|
||||
//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
@ -189,7 +190,7 @@ bool Msg0::getList ( long long hostId , // host to ask (-1 if none)
|
||||
KEYSET(m_endKey,endKey,m_ks);
|
||||
m_minRecSizes = minRecSizes;
|
||||
m_rdbId = rdbId;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;// = coll;
|
||||
m_isRealMerge = isRealMerge;
|
||||
m_allowPageCache = allowPageCache;
|
||||
|
||||
@ -349,7 +350,7 @@ bool Msg0::getList ( long long hostId , // host to ask (-1 if none)
|
||||
*/
|
||||
QUICKPOLL(m_niceness);
|
||||
if ( ! m_msg5->getList ( rdbId,
|
||||
coll ,
|
||||
m_collnum ,
|
||||
m_list ,
|
||||
m_startKey ,
|
||||
m_endKey ,
|
||||
@ -462,7 +463,8 @@ skip:
|
||||
KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
|
||||
KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
|
||||
// collection number (collnum_t); the NULL terminated collection name is no longer sent
|
||||
strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
|
||||
//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
|
||||
*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
|
||||
m_requestSize = p - m_request;
|
||||
// ask an individual host for this list if hostId is NOT -1
|
||||
if ( m_hostId != -1 ) {
|
||||
@ -957,7 +959,8 @@ void handleRequest0 ( UdpSlot *slot , long netnice ) {
|
||||
char *startKey = p; p+=ks;
|
||||
char *endKey = p; p+=ks;
|
||||
// then the collection number (collnum_t)
|
||||
char *coll = p;
|
||||
//char *coll = p;
|
||||
collnum_t collnum = *(collnum_t *)p; p += sizeof(collnum_t);
|
||||
|
||||
|
||||
// error set from XmlDoc::cacheTermLists()?
|
||||
@ -1175,7 +1178,7 @@ void handleRequest0 ( UdpSlot *slot , long netnice ) {
|
||||
// . return if this blocks
|
||||
// . we'll call sendReply later
|
||||
if ( ! st0->m_msg5.getList ( rdbId ,
|
||||
coll ,
|
||||
collnum ,
|
||||
&st0->m_list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
|
13
Msg0.h
13
Msg0.h
@ -36,7 +36,7 @@ bool getRecFromTermListCache ( char *coll,
|
||||
*/
|
||||
|
||||
//#define MSG0_REQ_SIZE (8 + 2 * sizeof(key_t) + 16 + 5 + MAX_COLL_LEN + 1 )
|
||||
#define MSG0_REQ_SIZE (8 + 2 * MAX_KEY_BYTES + 16 + 5 + MAX_COLL_LEN + 1 + 1 )
|
||||
#define MSG0_REQ_SIZE (8 + 2 * MAX_KEY_BYTES + 16 + 5 + 4 + 1 + 1 )
|
||||
|
||||
class Msg0 {
|
||||
|
||||
@ -68,7 +68,8 @@ class Msg0 {
|
||||
long maxCacheAge , // max cached age in seconds
|
||||
bool addToCache , // add net recv'd list to cache?
|
||||
char rdbId , // specifies the rdb
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
class RdbList *list ,
|
||||
//key_t startKey ,
|
||||
//key_t endKey ,
|
||||
@ -106,7 +107,8 @@ class Msg0 {
|
||||
long maxCacheAge , // max cached age in seconds
|
||||
bool addToCache , // add net recv'd list to cache?
|
||||
char rdbId , // specifies the rdb
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
class RdbList *list ,
|
||||
key_t startKey ,
|
||||
key_t endKey ,
|
||||
@ -144,7 +146,7 @@ class Msg0 {
|
||||
maxCacheAge ,
|
||||
addToCache ,
|
||||
rdbId ,
|
||||
coll ,
|
||||
collnum ,
|
||||
list ,
|
||||
(char *)&startKey ,
|
||||
(char *)&endKey ,
|
||||
@ -256,7 +258,8 @@ class Msg0 {
|
||||
char m_endKey[MAX_KEY_BYTES];
|
||||
long m_minRecSizes ;
|
||||
char m_rdbId ;
|
||||
char *m_coll ;
|
||||
//char *m_coll ;
|
||||
collnum_t m_collnum;
|
||||
|
||||
class Msg5 *m_msg5 ;
|
||||
class Msg5 *m_msg5b;
|
||||
|
53
Msg1.cpp
53
Msg1.cpp
@ -95,7 +95,7 @@ bool Msg1::addRecord ( char *rec ,
|
||||
sizeof(key_t));
|
||||
return addList ( &m_tmpList ,
|
||||
rdbId ,
|
||||
g_collectiondb.m_recs[collnum]->m_coll ,
|
||||
collnum,//g_collectiondb.m_recs[collnum]->m_coll ,
|
||||
state ,
|
||||
callback ,
|
||||
false , // force local?
|
||||
@ -111,7 +111,7 @@ bool Msg1::addRecord ( char *rec ,
|
||||
// when the reply does come back we do NOT call the callback
|
||||
bool Msg1::addList ( RdbList *list ,
|
||||
char rdbId ,
|
||||
char *coll ,
|
||||
collnum_t collnum, // char *coll ,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ,
|
||||
bool forceLocal ,
|
||||
@ -120,7 +120,7 @@ bool Msg1::addList ( RdbList *list ,
|
||||
bool waitForReply ,
|
||||
bool *inTransit ) {
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg1.cpp.");
|
||||
if ( collnum<0 ) log(LOG_LOGIC,"net: bad collection. msg1.cpp.");
|
||||
// if list has no records in it return true
|
||||
if ( ! list || list->isEmpty() ) return true;
|
||||
// sanity check
|
||||
@ -175,7 +175,7 @@ bool Msg1::addList ( RdbList *list ,
|
||||
bool inTransit;
|
||||
bool status = Y->addList ( &Y->m_ourList ,
|
||||
rdbId ,
|
||||
coll ,
|
||||
collnum ,
|
||||
Y , // state
|
||||
returnMsg1 , // callback
|
||||
forceLocal ,
|
||||
@ -205,7 +205,7 @@ bool Msg1::addList ( RdbList *list ,
|
||||
// remember these vars
|
||||
m_list = list;
|
||||
m_rdbId = rdbId;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_state = state;
|
||||
m_callback = callback;
|
||||
m_forceLocal = forceLocal;
|
||||
@ -451,6 +451,11 @@ bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
|
||||
*/
|
||||
// if the data is being added to our group, don't send ourselves
|
||||
// a msg1, if we can add it right now
|
||||
// MDW: crap this is getting ETRYAGAIN and it isn't being tried again
|
||||
// i guess and Spider.cpp fails to add to doledb but the doleiptable
|
||||
// maintains a positive count, thereby hanging the spiders. let's
|
||||
// just always go through multicast so it will auto-retry ETRYAGAIN
|
||||
/*
|
||||
bool sendToSelf = true;
|
||||
if ( shardNum == getMyShardNum() &&
|
||||
! g_conf.m_interfaceMachine ) {
|
||||
@ -485,7 +490,8 @@ bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
|
||||
// if no error, no need to use a Msg1 UdpSlot for ourselves
|
||||
if ( ! g_errno ) sendToSelf = false;
|
||||
else {
|
||||
log("rdb: msg1 had error: %s",mstrerror(g_errno));
|
||||
log("rdb: msg1 coll=%s rdb=%s had error: %s",
|
||||
m_coll,rdb->m_dbname,mstrerror(g_errno));
|
||||
// this is messing up generate catdb's huge rdblist add
|
||||
// why did we put it in there??? from msg9b.cpp
|
||||
//return true;
|
||||
@ -497,16 +503,17 @@ bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
|
||||
g_hostdb.getNumHostsPerShard() == 1 ) return true;
|
||||
}
|
||||
skip:
|
||||
*/
|
||||
// . make an add record request to multicast to a bunch of machines
|
||||
// . this will alloc new space, returns NULL on failure
|
||||
//char *request = makeRequest ( listData, listSize, groupId ,
|
||||
//m_rdbId , &requestLen );
|
||||
long collLen = gbstrlen ( m_coll );
|
||||
//long collLen = gbstrlen ( m_coll );
|
||||
// . returns NULL and sets g_errno on error
|
||||
// . calculate total size of the record
|
||||
// . 1 byte for rdbId, 1 byte for flags,
|
||||
// then the collection number (collnum_t), then list
|
||||
long requestLen = 1 + 1 + collLen + 1 + listSize ;
|
||||
long requestLen = 1 + 1 + sizeof(collnum_t) + listSize ;
|
||||
// make the request
|
||||
char *request = (char *) mmalloc ( requestLen ,"Msg1" );
|
||||
if ( ! request ) return true;
|
||||
@ -518,16 +525,18 @@ skip:
|
||||
if ( m_injecting ) *p |= 0x80;
|
||||
p++;
|
||||
// then collection number (collnum_t)
|
||||
memcpy ( p , m_coll , collLen );
|
||||
p += collLen;
|
||||
*p++ = '\0';
|
||||
//memcpy ( p , m_coll , collLen );
|
||||
//p += collLen;
|
||||
//*p++ = '\0';
|
||||
*(collnum_t *)p = m_collnum;
|
||||
p += sizeof(collnum_t);
|
||||
// sanity check
|
||||
if ( collLen <= 0 ) {
|
||||
log(LOG_LOGIC,"net: No collection specified for list add.");
|
||||
//char *xx = NULL; *xx = 0;
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
//if ( collLen <= 0 ) {
|
||||
// log(LOG_LOGIC,"net: No collection specified for list add.");
|
||||
// //char *xx = NULL; *xx = 0;
|
||||
// g_errno = ENOCOLLREC;
|
||||
// return true;
|
||||
//}
|
||||
//if ( m_deleteRecs ) request[1] |= 0x80;
|
||||
//if ( m_overwriteRecs ) request[1] |= 0x40;
|
||||
// store the list after coll
|
||||
@ -573,7 +582,7 @@ skip:
|
||||
k , // cache key
|
||||
RDB_NONE , // bogus rdbId
|
||||
-1 , // unknown minRecSizes read size
|
||||
sendToSelf ))
|
||||
true )) // sendToSelf ))
|
||||
return false;
|
||||
|
||||
QUICKPOLL(m_niceness);
|
||||
@ -675,8 +684,10 @@ void handleRequest1 ( UdpSlot *slot , long netnice ) {
|
||||
else injecting = false;
|
||||
p++;
|
||||
// then collection
|
||||
char *coll = p;
|
||||
p += gbstrlen (p) + 1;
|
||||
//char *coll = p;
|
||||
//p += gbstrlen (p) + 1;
|
||||
collnum_t collnum = *(collnum_t *)p;
|
||||
p += sizeof(collnum_t);
|
||||
// . make a list from this data
|
||||
// . skip over the first 4 bytes which is the rdbId
|
||||
// . TODO: embed the rdbId in the msgtype or something...
|
||||
@ -694,7 +705,7 @@ void handleRequest1 ( UdpSlot *slot , long netnice ) {
|
||||
//log("msg1: handlerequest1 calling addlist niceness=%li",niceness);
|
||||
//log("msg1: handleRequest1 niceness=%li",niceness);
|
||||
// this returns false and sets g_errno on error
|
||||
rdb->addList ( coll , &list , niceness);
|
||||
rdb->addList ( collnum , &list , niceness);
|
||||
// if titledb, add tfndb recs to map the title recs
|
||||
//if ( ! g_errno && rdb == g_titledb.getRdb() && injecting )
|
||||
// updateTfndb ( coll , &list , true, 0);
|
||||
|
5
Msg1.h
5
Msg1.h
@ -59,7 +59,7 @@ class Msg1 {
|
||||
// . when deleteRecs is true, the recs in the list are really just keys
|
||||
bool addList ( RdbList *list ,
|
||||
char rdbId ,
|
||||
char *coll ,
|
||||
collnum_t collnum, // char *coll ,
|
||||
void *state ,
|
||||
void (*callback)(void *state) ,
|
||||
bool forceLocal ,
|
||||
@ -95,7 +95,8 @@ class Msg1 {
|
||||
|
||||
// rdb id to add to ( see Msg0::getRdb(char rdbId) )
|
||||
char m_rdbId;
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
|
||||
// groupId to send to (may be -1 if it's up to us to decide)
|
||||
unsigned long m_groupId;
|
||||
|
27
Msg13.cpp
27
Msg13.cpp
@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
"(compatible; MSIE 6.0; Windows 98; "
|
||||
"Win 9x 4.90)" ;
|
||||
|
||||
// for bulk jobs avoid actual downloads of the page for efficiency
|
||||
if ( r->m_isCustomCrawl == 2 ) {
|
||||
char *s =
|
||||
"HTTP/1.0 200 (OK)\r\n"
|
||||
"Content-Length: 0\r\n"
|
||||
"Connection: Close\r\n"
|
||||
"Content-Type: text/html\r\n\r\n";
|
||||
long slen = gbstrlen(s);
|
||||
long fakeBufSize = slen + 1;
|
||||
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
|
||||
gotHttpReply2 ( r ,
|
||||
fakeBuf,
|
||||
fakeBufSize, // include \0
|
||||
fakeBufSize, // allocsize
|
||||
NULL ); // tcpsock
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// download it
|
||||
if ( ! g_httpServer.getDoc ( r->m_url ,
|
||||
r->m_urlIp ,
|
||||
@ -1390,7 +1409,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
|
||||
|
||||
//
|
||||
//
|
||||
// . UTILITY FUNCTIONS for injecting into the "test" collection
|
||||
// . UTILITY FUNCTIONS for injecting into the "qatest123" collection
|
||||
// . we need to ensure that the web pages remain constant so we store them
|
||||
//
|
||||
//
|
||||
@ -1400,7 +1419,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
|
||||
// . now that we are lower level in Msg13.cpp, set "ts" not "slot"
|
||||
bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
|
||||
// sanity check
|
||||
//if ( strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
|
||||
//if ( strcmp(m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
|
||||
// hash the url into 64 bits
|
||||
long long h = hash64 ( u , gbstrlen(u) );
|
||||
// read the spider date file first
|
||||
@ -1547,7 +1566,7 @@ bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// add it to our "test" subdir
|
||||
// add it to our "qatest123" subdir
|
||||
bool addTestDoc ( long long urlHash64 , char *httpReply , long httpReplySize ,
|
||||
long err , Msg13Request *r ) {
|
||||
|
||||
@ -1796,7 +1815,7 @@ long hasGoodDates ( char *content ,
|
||||
NULL , // tag rec
|
||||
NULL , // url
|
||||
0 , // docid
|
||||
NULL , // coll
|
||||
0 , // collnum
|
||||
0 , // domhash32
|
||||
0 , // ip
|
||||
niceness ,
|
||||
|
2
Msg13.h
2
Msg13.h
@ -32,6 +32,8 @@ public:
|
||||
// if doing spider compression, compute contentHash32 of document
|
||||
// downloaded, and if it matches this then send back EDOCUNCHANGED
|
||||
long m_contentHash32;
|
||||
// copy of CollectionRec::m_customCrawl: 0, 1 for crawls, or 2 for bulks
|
||||
char m_isCustomCrawl;
|
||||
// send back error ENOGOODDATE if it does not have one. but if
|
||||
// harvestLinks is true, just send back a filtered list of links
|
||||
long m_requireGoodDate:1;
|
||||
|
28
Msg17.cpp
28
Msg17.cpp
@ -67,7 +67,8 @@ bool Msg17::getFromCache ( char cacheId,
|
||||
key_t key,
|
||||
char **recPtr,
|
||||
long *recSize,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
void *state ,
|
||||
void (*callback) (void *state) ,
|
||||
long niceness ,
|
||||
@ -107,7 +108,7 @@ bool Msg17::getFromCache ( char cacheId,
|
||||
if ( c ) {
|
||||
time_t cachedTime;
|
||||
// return true if not found in our local cache
|
||||
if ( ! c->getRecord ( coll ,
|
||||
if ( ! c->getRecord ( collnum ,
|
||||
m_key ,
|
||||
recPtr ,
|
||||
recSize ,
|
||||
@ -148,7 +149,8 @@ bool Msg17::getFromCache ( char cacheId,
|
||||
*p++ = m_cacheId;
|
||||
// the flag (0 means read request, 1 means store request)
|
||||
*p++ = 0;
|
||||
strcpy ( p , coll ); p += gbstrlen ( coll ) + 1;
|
||||
memcpy ( p , &collnum, sizeof(collnum_t)); p += sizeof(collnum_t);
|
||||
//strcpy ( p , coll ); p += gbstrlen ( coll ) + 1;
|
||||
// . send the request to the key host
|
||||
// . this returns false and sets g_errno on error
|
||||
// . now wait for 1 sec before timing out
|
||||
@ -317,13 +319,14 @@ void handleRequest17 ( UdpSlot *slot , long niceness ) {
|
||||
// then 1-byte flag (0 means read request, 1 means store request)
|
||||
char flag = *p++;
|
||||
// collection number (collnum_t) follows
|
||||
char *coll = p; p += gbstrlen ( coll ) + 1 ;
|
||||
//char *coll = p; p += gbstrlen ( coll ) + 1 ;
|
||||
collnum_t collnum = *(collnum_t *)p; p += sizeof(collnum_t);
|
||||
|
||||
RdbCache *c = &g_genericCache[(int)cacheId];
|
||||
|
||||
// if flag is 1 then it is a request to store a compressed Msg40
|
||||
if ( flag == 1 ) {
|
||||
if ( ! c->addRecord ( coll ,
|
||||
if ( ! c->addRecord ( collnum ,
|
||||
k,
|
||||
p,
|
||||
pend - p ) )
|
||||
@ -338,7 +341,7 @@ void handleRequest17 ( UdpSlot *slot , long niceness ) {
|
||||
long recSize;
|
||||
time_t cachedTime;
|
||||
// send back nothing if not in cache
|
||||
if ( ! c->getRecord ( coll ,
|
||||
if ( ! c->getRecord ( collnum ,
|
||||
k ,
|
||||
&rec ,
|
||||
&recSize ,
|
||||
@ -386,7 +389,7 @@ bool Msg17::storeInCache ( char cacheId ,
|
||||
key_t key ,
|
||||
char *recPtr ,
|
||||
long recSize ,
|
||||
char *coll ,
|
||||
collnum_t collnum, // char *coll ,
|
||||
long niceness ,
|
||||
long timeout ) {
|
||||
|
||||
@ -446,7 +449,8 @@ bool Msg17::storeInCache ( char cacheId ,
|
||||
// use "1" for a store request
|
||||
*p++ = 1;
|
||||
//char *coll = si->m_coll;
|
||||
strcpy ( p , coll ); p += gbstrlen(coll) + 1; // includes '\0'
|
||||
//strcpy ( p , coll ); p += gbstrlen(coll) + 1; // includes '\0'
|
||||
memcpy ( p ,&collnum ,sizeof(collnum_t)); p += sizeof(collnum_t);
|
||||
|
||||
QUICKPOLL(niceness);
|
||||
|
||||
@ -466,7 +470,7 @@ bool Msg17::storeInCache ( char cacheId ,
|
||||
long avail = pend - p;
|
||||
// save it
|
||||
long saved = avail;
|
||||
long clen = gbstrlen(coll);
|
||||
//long clen = gbstrlen(coll);
|
||||
// compress "tmp" into m_buf, but leave leading bytes
|
||||
// for the key
|
||||
int err = gbcompress ( (unsigned char *)p ,
|
||||
@ -479,10 +483,10 @@ bool Msg17::storeInCache ( char cacheId ,
|
||||
if ( err != Z_OK ) {
|
||||
g_errno = ECOMPRESSFAILED;
|
||||
log("query: Compression of cache cacheId=%i "
|
||||
"failed err=%li avail=%li collLen=%li "
|
||||
"failed err=%li avail=%li collnum=%li "
|
||||
"recSize=%li.",
|
||||
cacheId , (long)err ,
|
||||
saved , clen , recSize );
|
||||
saved , (long)collnum , recSize );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -506,7 +510,7 @@ bool Msg17::storeInCache ( char cacheId ,
|
||||
// if we are that host, store it ourselves right now
|
||||
if ( host->m_hostId == g_hostdb.m_hostId ) {
|
||||
RdbCache *c = &g_genericCache[(int)m_cacheId];
|
||||
if ( ! c->addRecord ( coll ,
|
||||
if ( ! c->addRecord ( collnum ,
|
||||
key ,
|
||||
cacheRec ,
|
||||
cacheRecSize ) )
|
||||
|
6
Msg17.h
6
Msg17.h
@ -53,7 +53,7 @@ class Msg17 {
|
||||
key_t key ,
|
||||
char **recPtr ,
|
||||
long *recSize ,
|
||||
char *coll,
|
||||
collnum_t collnum,
|
||||
void *state ,
|
||||
void (* callback) (void *state) ,
|
||||
long niceness,
|
||||
@ -83,7 +83,7 @@ class Msg17 {
|
||||
key_t key ,
|
||||
char *recPtr ,
|
||||
long recSize ,
|
||||
char *coll ,
|
||||
collnum_t collnum,
|
||||
long niceness ,
|
||||
long timeout );
|
||||
|
||||
@ -116,7 +116,7 @@ class Msg17 {
|
||||
// ptr to "it"
|
||||
class Msg40 *m_msg40;
|
||||
|
||||
char m_request [ 12 + 1 + MAX_COLL_LEN + 1 ]; // key + flag + coll name
|
||||
char m_request [ 12 + 1 + 8 ]; // key + flag + collnum_t
|
||||
};
|
||||
|
||||
// allow our cache to be used by PageStats.cpp to display its stats
|
||||
|
10
Msg2.cpp
10
Msg2.cpp
@ -28,7 +28,7 @@ Msg2 *g_msg2;
|
||||
// other termlists have a componentCode of -2. These are typically taken
|
||||
// from the Query.cpp class.
|
||||
bool Msg2::getLists ( long rdbId ,
|
||||
char *coll ,
|
||||
collnum_t collnum , // char *coll ,
|
||||
long maxAge ,
|
||||
bool addToCache ,
|
||||
//QueryTerm *qterms ,
|
||||
@ -53,7 +53,7 @@ bool Msg2::getLists ( long rdbId ,
|
||||
char forceParitySplit ,
|
||||
bool checkCache ) {
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg2.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"net: bad collection. msg2.");
|
||||
if ( ! minRecSizes ) {
|
||||
g_errno = EBADENGINEER;
|
||||
log(LOG_LOGIC,"net: MinRecSizes is NULL.");
|
||||
@ -82,7 +82,7 @@ bool Msg2::getLists ( long rdbId ,
|
||||
m_getComponents = false;
|
||||
m_rdbId = rdbId;
|
||||
m_addToCache = addToCache;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_restrictPosdb = restrictPosdb;
|
||||
m_forceParitySplit = forceParitySplit;
|
||||
m_checkCache = checkCache;
|
||||
@ -278,7 +278,7 @@ bool Msg2::getLists ( ) {
|
||||
// . we now always compress the list for 2x faster transmits
|
||||
if ( ! msg5->getList (
|
||||
m_rdbId , // rdbid
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
&m_lists[m_i], // listPtr
|
||||
sk2,//&m_startKeys [i*ks],
|
||||
ek2,//&m_endKeys [i*ks],
|
||||
@ -410,7 +410,7 @@ bool Msg2::getLists ( ) {
|
||||
// start up the read. thread will wait in thread queue to
|
||||
// launch if too many threads are out.
|
||||
if ( ! msg5->getList ( m_rdbId , // rdbid
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
&m_whiteLists[m_w], // listPtr
|
||||
&sk3,//&m_startKeys [i*ks],
|
||||
&ek3,//&m_endKeys [i*ks],
|
||||
|
5
Msg2.h
5
Msg2.h
@ -33,7 +33,7 @@ class Msg2 {
|
||||
// . sets errno on error
|
||||
// . "termIds/termFreqs" should NOT be on the stack in case we block
|
||||
bool getLists ( long rdbId ,
|
||||
char *coll ,
|
||||
collnum_t collnum ,//char *coll ,
|
||||
long maxAge ,
|
||||
bool addToCache ,
|
||||
//key_t *startKeys ,
|
||||
@ -123,7 +123,8 @@ class Msg2 {
|
||||
bool m_getComponents;
|
||||
char m_rdbId;
|
||||
bool m_addToCache;
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
bool m_restrictPosdb;
|
||||
long m_compoundListMaxSize;
|
||||
char m_forceParitySplit;
|
||||
|
35
Msg20.cpp
35
Msg20.cpp
@ -14,6 +14,7 @@ void Msg20::constructor () {
|
||||
m_r = NULL;
|
||||
m_inProgress = false;
|
||||
m_launched = false;
|
||||
m_ii = -1;
|
||||
reset();
|
||||
m_mcast.constructor();
|
||||
}
|
||||
@ -22,6 +23,18 @@ void Msg20::destructor () { reset(); m_mcast.destructor(); }
|
||||
|
||||
#include "Process.h"
|
||||
|
||||
void Msg20::freeReply() { // release the reply held in m_r, if any, and clear the pointer
|
||||
if ( ! m_r ) return; // no reply held, nothing to do
|
||||
// sometimes the msg20 reply carries a merged buffer from
|
||||
// msg40 that is a constructed ptr_eventSummaryLines from a
|
||||
// merge operation in msg40. calling the reply's destructor here
|
||||
// fixes the "merge20buf1" memory leak from Msg40.cpp
|
||||
m_r->destructor();
|
||||
if ( m_ownReply ) mfree ( m_r, m_replyMaxSize , "Msg20b" ); // the buffer itself is only freed when m_ownReply is set
|
||||
m_r = NULL; // cleared so a later call returns early
|
||||
|
||||
}
|
||||
|
||||
void Msg20::reset() {
|
||||
// not allowed to reset one in progress
|
||||
if ( m_inProgress ) {
|
||||
@ -33,15 +46,12 @@ void Msg20::reset() {
|
||||
m_launched = false;
|
||||
if ( m_request && m_request != m_requestBuf )
|
||||
mfree ( m_request , m_requestSize , "Msg20rb" );
|
||||
// sometimes the msg20 reply carries an merged bffer from
|
||||
// msg40 that is a constructed ptr_eventSummaryLines from a
|
||||
// merge operation in msg40. this fixes the "merge20buf1" memory
|
||||
// leak from Msg40.cpp
|
||||
if ( m_r ) m_r->destructor();
|
||||
if ( m_r && m_ownReply ) //&& (char *)m_r != m_replyBuf )
|
||||
mfree ( m_r , m_replyMaxSize , "Msg20b" );
|
||||
freeReply();
|
||||
//if ( m_r ) m_r->destructor();
|
||||
//if ( m_r && m_ownReply ) //&& (char *)m_r != m_replyBuf )
|
||||
// mfree ( m_r , m_replyMaxSize , "Msg20b" );
|
||||
//m_r = NULL; // the reply ptr
|
||||
m_request = NULL; // the request buf ptr
|
||||
m_r = NULL; // the reply ptr
|
||||
m_gotReply = false;
|
||||
m_errno = 0;
|
||||
m_requestDocId = -1LL;
|
||||
@ -268,6 +278,13 @@ void Msg20::gotReply ( UdpSlot *slot ) {
|
||||
m_inProgress = false;
|
||||
// sanity check
|
||||
if ( m_r ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
// free our serialized request buffer to save mem
|
||||
if ( m_request && m_request != m_requestBuf ) {
|
||||
mfree ( m_request , m_requestSize , "Msg20rb" );
|
||||
m_request = NULL;
|
||||
}
|
||||
|
||||
// save error so Msg40 can look at it
|
||||
if ( g_errno ) {
|
||||
m_errno = g_errno;
|
||||
@ -349,7 +366,7 @@ void handleRequest20 ( UdpSlot *slot , long netnice ) {
|
||||
if ( nb != slot->m_readBufSize ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
// sanity check, the size include the \0
|
||||
if ( req->size_coll <= 1 || *req->ptr_coll == '\0' ) {
|
||||
if ( req->m_collnum < 0 ) {
|
||||
log("query: Got empty collection in msg20 handler. FIX!");
|
||||
char *xx =NULL; *xx = 0;
|
||||
}
|
||||
|
11
Msg20.h
11
Msg20.h
@ -121,6 +121,8 @@ class Msg20Request {
|
||||
// serialized using Address::serialize(), and all the start dates
|
||||
// from now onward
|
||||
long m_eventId ;
|
||||
// we now use the numeric collection # and not the ptr_coll
|
||||
collnum_t m_collnum;
|
||||
// set this to true when you pass in m_eventIdBits...
|
||||
char m_getEventSummary ;
|
||||
char m_summaryMode ;
|
||||
@ -189,7 +191,7 @@ class Msg20Request {
|
||||
char *ptr_termFreqs ;
|
||||
char *ptr_affWeights ;
|
||||
char *ptr_linkee ; // used by Msg25 for getting link text
|
||||
char *ptr_coll ;
|
||||
//char *ptr_coll ;
|
||||
char *ptr_imgUrl ;
|
||||
char *ptr_displayMetas ;
|
||||
|
||||
@ -206,7 +208,7 @@ class Msg20Request {
|
||||
long size_termFreqs ;
|
||||
long size_affWeights ;
|
||||
long size_linkee ; // size includes terminating \0
|
||||
long size_coll ; // size includes terminating \0
|
||||
//long size_coll ; // size includes terminating \0
|
||||
long size_imgUrl ;
|
||||
long size_displayMetas ; // size includes terminating \0
|
||||
|
||||
@ -309,6 +311,7 @@ public:
|
||||
//long m_numLikers ;
|
||||
bool m_datedbDateIsEstimated;
|
||||
long m_errno ; // LinkInfo uses it for LinkTextRepl
|
||||
collnum_t m_collnum ; // collection # we came from
|
||||
char m_sumFromDmoz ; // unused
|
||||
long m_hostHash ;
|
||||
char m_noArchive ;
|
||||
@ -334,7 +337,7 @@ public:
|
||||
//long m_numCatIds ; // use size_catIds
|
||||
//long m_numIndCatIds ; // use size_indCatIds
|
||||
long m_contentLen ; // was m_docLen
|
||||
//long m_contentHash ;
|
||||
long m_contentHash32 ; // for deduping diffbot json objects streaming
|
||||
//long m_docSummaryScore ;
|
||||
//long m_inSectionScore ;
|
||||
//float m_proximityScore ;
|
||||
@ -780,9 +783,11 @@ class Msg20 {
|
||||
// so we can alloc arrays of these using mmalloc()
|
||||
void constructor ();
|
||||
void destructor ();
|
||||
void freeReply ();
|
||||
void reset ();
|
||||
|
||||
long m_hack;
|
||||
long m_ii;
|
||||
|
||||
// is the reply in progress? if msg20 has not launched a request
|
||||
// this is false. if msg20 received its reply, this is false.
|
||||
|
10
Msg22.cpp
10
Msg22.cpp
@ -320,7 +320,7 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
|
||||
// get the request
|
||||
Msg22Request *r = (Msg22Request *)slot->m_readBuf;
|
||||
// get this
|
||||
char *coll = g_collectiondb.getCollName ( r->m_collnum );
|
||||
//char *coll = g_collectiondb.getCollName ( r->m_collnum );
|
||||
|
||||
// sanity check
|
||||
long requestSize = slot->m_readBufSize;
|
||||
@ -333,10 +333,10 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *tbase;
|
||||
if ( ! (tbase=getRdbBase(RDB_TITLEDB,coll) ) ) {
|
||||
log("db: Could not get title rec in collection \"%s\" "
|
||||
if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) {
|
||||
log("db: Could not get title rec in collection # %li "
|
||||
"because rdbbase is null.",
|
||||
coll);
|
||||
(long)r->m_collnum);
|
||||
g_errno = EBADENGINEER;
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
return;
|
||||
@ -763,7 +763,7 @@ void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
// . our file range should be solid
|
||||
// . use 500 million for min recsizes to get all in range
|
||||
if ( ! st->m_msg5.getList ( RDB_TITLEDB ,
|
||||
coll ,
|
||||
r->m_collnum ,
|
||||
&st->m_tlist ,
|
||||
startKey , // startKey
|
||||
endKey , // endKey
|
||||
|
14
Msg3.cpp
14
Msg3.cpp
@ -63,7 +63,7 @@ void Msg3::reset() {
|
||||
// in Sync class can just read from titledb*.dat files that were formed
|
||||
// since the last sync point.
|
||||
bool Msg3::readList ( char rdbId ,
|
||||
char *coll ,
|
||||
collnum_t collnum ,
|
||||
//key_t startKey ,
|
||||
//key_t endKey ,
|
||||
char *startKeyArg ,
|
||||
@ -94,10 +94,10 @@ bool Msg3::readList ( char rdbId ,
|
||||
// reset m_alloc and data in all lists in case we are a re-call
|
||||
reset();
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg3.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3.");
|
||||
// remember the callback
|
||||
m_rdbId = rdbId;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_callback = callback;
|
||||
m_state = state;
|
||||
m_niceness = niceness;
|
||||
@ -136,7 +136,7 @@ bool Msg3::readList ( char rdbId ,
|
||||
long max ;
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
|
||||
|
||||
// if caller specified exactly
|
||||
/*
|
||||
@ -673,7 +673,7 @@ void doneScanningWrapper ( void *state ) {
|
||||
// if we had an error, remember it
|
||||
if ( g_errno ) {
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on err
|
||||
RdbBase *base; base=getRdbBase(THIS->m_rdbId,THIS->m_coll);
|
||||
RdbBase *base; base=getRdbBase(THIS->m_rdbId,THIS->m_collnum);
|
||||
char *dbname = "NOT FOUND";
|
||||
if ( base ) dbname = base->m_dbname;
|
||||
long tt = LOG_WARN;
|
||||
@ -783,7 +783,7 @@ bool Msg3::doneScanning ( ) {
|
||||
}
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
|
||||
|
||||
// this really slows things down because it blocks the cpu so
|
||||
// leave it out for now
|
||||
@ -964,7 +964,7 @@ bool Msg3::doneSleeping ( ) {
|
||||
g_loop.unregisterSleepCallback(this,doneSleepingWrapper3);
|
||||
// read again
|
||||
if ( ! readList ( m_rdbId ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
m_startKey ,
|
||||
m_endKeyOrig ,
|
||||
m_minRecSizesOrig ,
|
||||
|
6
Msg3.h
6
Msg3.h
@ -50,7 +50,8 @@ class Msg3 {
|
||||
// by Msg5.cpp to constrain the endKey so it can read the recs
|
||||
// from the tree using that endKey, and not waste time.
|
||||
bool readList ( char rdbId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
//key_t startKey ,
|
||||
//key_t endKey ,
|
||||
char *startKey ,
|
||||
@ -110,7 +111,8 @@ class Msg3 {
|
||||
|
||||
// the rdb we're scanning for
|
||||
char m_rdbId;
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
|
||||
// the scan classes, 1 per file, used to read from that file
|
||||
RdbScan *m_scans ; // [ MAX_RDB_FILES ];
|
||||
|
20
Msg36.cpp
20
Msg36.cpp
@ -36,7 +36,7 @@ bool Msg36::registerHandler ( ) {
|
||||
// . sets g_errno on error
|
||||
// . "termIds/termFreqs" should NOT be on the stack in case we block
|
||||
// . i based this on ../titledb/Msg23.cpp
|
||||
bool Msg36::getTermFreq ( char *coll ,
|
||||
bool Msg36::getTermFreq ( collnum_t collnum , // char *coll ,
|
||||
long maxAge ,
|
||||
long long termId ,
|
||||
void *state ,
|
||||
@ -53,7 +53,7 @@ bool Msg36::getTermFreq ( char *coll ,
|
||||
return true;
|
||||
}
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"quota: msg36: NULL collection.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"quota: msg36: bad collection.");
|
||||
// no more quotas here!
|
||||
if ( incCount || decCount ) { char *xx = NULL; *xx = 0; }
|
||||
// sanity check
|
||||
@ -117,7 +117,7 @@ bool Msg36::getTermFreq ( char *coll ,
|
||||
//unsigned long i = ((unsigned long)groupId/*key*/) % numHosts;
|
||||
// if it's us then no need to multicast to ourselves
|
||||
//if(hosts[i].m_hostId==g_hostdb.m_hostId||g_conf.m_fullSplit) {
|
||||
m_termFreq = g_posdb.getTermFreq ( coll , termId );
|
||||
m_termFreq = g_posdb.getTermFreq ( collnum , termId );
|
||||
// clear g_errno
|
||||
g_errno = 0;
|
||||
return true;
|
||||
@ -134,7 +134,8 @@ bool Msg36::getTermFreq ( char *coll ,
|
||||
if ( m_niceness ) *p |= 0x08;
|
||||
p++;
|
||||
*(long long *)p = termId ; p += sizeof(long long);
|
||||
strcpy ( p , coll ); p += gbstrlen(coll) + 1; // copy includes \0
|
||||
//strcpy ( p , coll ); p += gbstrlen(coll) + 1; // copy includes \0
|
||||
*(collnum_t *)p = collnum; p += sizeof(collnum_t);
|
||||
|
||||
long timeout = 5;
|
||||
//if ( incCount || decCount ) timeout = 9999999;
|
||||
@ -339,12 +340,13 @@ void handleRequest36 ( UdpSlot *slot , long netnice ) {
|
||||
//if ( *request & 0x04 ) decCount = true;
|
||||
if ( *request & 0x08 ) niceness = MAX_NICENESS;
|
||||
long long termId = *(long long *) (request+1) ;
|
||||
char *coll = request + 8 + 1;
|
||||
//char *coll = request + 8 + 1;
|
||||
collnum_t collnum = *(collnum_t *)(request + 8 + 1);
|
||||
|
||||
// if there is no way this termlist size exceeds exactMax, then just
|
||||
// return the approximation we got, saves on disk seeks
|
||||
if ( ! exactCount ) {//&& ! incCount && ! decCount ) { //max<exactMax){
|
||||
long long termFreq = g_posdb.getTermFreq(coll,termId);
|
||||
long long termFreq = g_posdb.getTermFreq(collnum,termId);
|
||||
// no need to malloc since we have the tmp buf
|
||||
char *reply = slot->m_tmpBuf;
|
||||
*(long long *)reply = termFreq ;
|
||||
@ -355,7 +357,7 @@ void handleRequest36 ( UdpSlot *slot , long netnice ) {
|
||||
}
|
||||
|
||||
// check our cache for this termid and collection,
|
||||
collnum_t collnum = g_collectiondb.getCollnum(coll);
|
||||
//collnum_t collnum = g_collectiondb.getCollnum(coll);
|
||||
if ( collnum < 0 ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
log("quota: msg36: collection does not exist.");
|
||||
@ -508,10 +510,10 @@ void callMsg5 ( State36 *st , key144_t startKey , key144_t endKey ) {
|
||||
// . TODO: if quota is over about 30 million docs for a particular site
|
||||
// then we will need to fix this code, cuz it only reads up to
|
||||
// 200MB (MRS) if the site: termlist
|
||||
char *coll = g_collectiondb.getCollName ( st->m_collnum );
|
||||
//char *coll = g_collectiondb.getCollName ( st->m_collnum );
|
||||
//log (LOG_WARN,"build: getting frequency from disk");
|
||||
if ( ! st->m_msg5.getList ( RDB_POSDB ,
|
||||
coll ,
|
||||
st->m_collnum ,
|
||||
&st->m_list ,
|
||||
&startKey ,
|
||||
&endKey ,
|
||||
|
2
Msg36.h
2
Msg36.h
@ -28,7 +28,7 @@ class Msg36 {
|
||||
// . sets errno on error
|
||||
// . "termFreq" should NOT be on the stack in case we block
|
||||
// . sets *termFreq to UPPER BOUND on # of records with that "termId"
|
||||
bool getTermFreq ( char *coll ,
|
||||
bool getTermFreq ( collnum_t collnum,//char *coll ,
|
||||
long maxAge ,
|
||||
long long termId ,
|
||||
void *state ,
|
||||
|
@ -9,7 +9,7 @@ static void gotTermFreqWrapper ( void *state ) ;
|
||||
// . "termIds/termFreqs" should NOT be on the stack in case we block
|
||||
// . i based this on ../titled/Msg25.cpp since it sends out multiple msgs at
|
||||
// the same time, too
|
||||
bool Msg37::getTermFreqs ( char *coll ,
|
||||
bool Msg37::getTermFreqs ( collnum_t collnum,//char *coll ,
|
||||
long maxAge ,
|
||||
long long *termIds ,
|
||||
long numTerms ,
|
||||
@ -20,7 +20,7 @@ bool Msg37::getTermFreqs ( char *coll ,
|
||||
bool exactCount ) {
|
||||
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg37.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"net: bad collection. msg37.");
|
||||
// we haven't got any responses as of yet or sent any requests
|
||||
m_callback = callback;
|
||||
m_state = state;
|
||||
@ -31,7 +31,8 @@ bool Msg37::getTermFreqs ( char *coll ,
|
||||
m_errno = 0;
|
||||
m_numTerms = numTerms;
|
||||
m_termFreqs = termFreqs;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
//m_coll = coll;
|
||||
m_maxAge = maxAge;
|
||||
m_termIds = termIds;
|
||||
// set all to 1 in case there's an error
|
||||
@ -84,7 +85,7 @@ bool Msg37::launchRequests ( ) {
|
||||
m_msg36[j].m_i = m_i;
|
||||
// . start up a Msg36 to get it
|
||||
// . this will return false if blocks
|
||||
if ( ! m_msg36[j].getTermFreq ( m_coll ,
|
||||
if ( ! m_msg36[j].getTermFreq ( m_collnum ,
|
||||
m_maxAge ,
|
||||
m_termIds[m_i] ,
|
||||
&m_msg36[j],
|
||||
|
4
Msg37.h
4
Msg37.h
@ -22,7 +22,7 @@ class Msg37 {
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets errno on error
|
||||
// . "termIds/termFreqs" should NOT be on the stack in case we block
|
||||
bool getTermFreqs ( char *coll ,
|
||||
bool getTermFreqs ( collnum_t collnum ,
|
||||
long maxAge ,
|
||||
long long *termIds ,
|
||||
long numTermIds ,
|
||||
@ -58,7 +58,7 @@ class Msg37 {
|
||||
|
||||
bool m_exactCount;
|
||||
|
||||
char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
|
||||
long m_maxAge;
|
||||
long long *m_termIds ;
|
||||
|
64
Msg39.cpp
64
Msg39.cpp
@ -151,7 +151,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
|
||||
// deserialize it before we do anything else
|
||||
long finalSize = deserializeMsg ( sizeof(Msg39Request) ,
|
||||
&m_r->size_readSizes ,
|
||||
&m_r->size_coll ,
|
||||
&m_r->size_whiteList,//coll ,
|
||||
&m_r->ptr_readSizes,
|
||||
m_r->m_buf );
|
||||
|
||||
@ -176,15 +176,17 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
if ( g_conf.m_logTimingQuery ) m_debug = true;
|
||||
|
||||
// ensure it's size is ok
|
||||
if ( m_r->size_coll <= 0 ) {
|
||||
/*
|
||||
if ( m_r->size_whiteList <= 0 ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
*/
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_r->ptr_coll );
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
|
||||
@ -541,7 +543,7 @@ bool Msg39::getLists () {
|
||||
"component=%li "
|
||||
"otermLen=%li "
|
||||
"isSynonym=%li "
|
||||
"querylangid=%li ",
|
||||
"querylangid=%li " ,
|
||||
(long)this ,
|
||||
i ,
|
||||
qt->m_term,//bb ,
|
||||
@ -567,7 +569,7 @@ bool Msg39::getLists () {
|
||||
(long)m_tmpq.m_componentCodes[i],
|
||||
(long)m_tmpq.getTermLen(i) ,
|
||||
isSynonym,
|
||||
(long)m_tmpq.m_langId); // ,tt
|
||||
(long)m_tmpq.m_langId ); // ,tt
|
||||
// put it back
|
||||
*tpc = tmp;
|
||||
if ( st ) {
|
||||
@ -614,7 +616,7 @@ bool Msg39::getLists () {
|
||||
long split = g_hostdb.m_myHost->m_shardNum;
|
||||
// call msg2
|
||||
if ( ! m_msg2.getLists ( rdbId ,
|
||||
m_r->ptr_coll ,
|
||||
m_r->m_collnum,//m_r->ptr_coll ,
|
||||
m_r->m_maxAge ,
|
||||
m_r->m_addToCache ,
|
||||
//m_tmpq.m_qterms ,
|
||||
@ -659,6 +661,7 @@ void gotListsWrapper ( void *state ) {
|
||||
Msg39 *THIS = (Msg39 *) state;
|
||||
// . hash the lists into our index table
|
||||
// . this will send back a reply or recycle and read more list data
|
||||
|
||||
if ( ! THIS->gotLists ( true ) ) return;
|
||||
|
||||
// . if he did not block and there was an errno we send reply
|
||||
@ -669,6 +672,12 @@ void gotListsWrapper ( void *state ) {
|
||||
log("msg39: sending back error reply = %s",mstrerror(g_errno));
|
||||
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
|
||||
}
|
||||
|
||||
// no, block? call the docid split loop
|
||||
//if ( numDocIdSplits <= 1 ) return;
|
||||
|
||||
// if we get the lists and processed them without blocking, repeat!
|
||||
THIS->doDocIdSplitLoop();
|
||||
}
|
||||
|
||||
// . now come here when we got the necessary index lists
|
||||
@ -677,6 +686,7 @@ void gotListsWrapper ( void *state ) {
|
||||
bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// bail on error
|
||||
if ( g_errno ) {
|
||||
hadError:
|
||||
log("msg39: Had error getting termlists: %s.",
|
||||
mstrerror(g_errno));
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
@ -694,6 +704,13 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_r->m_niceness );
|
||||
|
||||
// ensure collection not deleted from under us
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
goto hadError;
|
||||
}
|
||||
|
||||
// . set the IndexTable so it can set its score weights from the
|
||||
// termFreqs of each termId in the query
|
||||
// . this now takes into account the special termIds used for sorting
|
||||
@ -707,7 +724,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
m_debug ,
|
||||
this ,
|
||||
&m_tt ,
|
||||
m_r->ptr_coll ,
|
||||
m_r->m_collnum,//ptr_coll ,
|
||||
&m_msg2 , // m_lists ,
|
||||
//m_tmpq.m_numTerms , // m_numLists
|
||||
m_r );
|
||||
@ -743,10 +760,25 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// . now we must call this separately here, not in allocTopTree()
|
||||
// . we have to re-set the QueryTermInfos with each docid range split
|
||||
// since it will set the list ptrs from the msg2 lists
|
||||
if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
|
||||
return true;
|
||||
if ( ! m_posdbTable.setQueryTermInfo () ) return true;
|
||||
|
||||
// print query term bit numbers here
|
||||
for ( long i = 0 ;
|
||||
m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
|
||||
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
||||
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
|
||||
char *tpc = qt->m_term + qt->m_termLen;
|
||||
char tmp = *tpc;
|
||||
*tpc = '\0';
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("query: msg39: BITNUM query term #%li \"%s\" "
|
||||
"bitnum=%li ", i , qt->m_term, qt->m_bitNum );
|
||||
// put it back
|
||||
*tpc = tmp;
|
||||
logf(LOG_DEBUG,"%s",sb.getBufStart());
|
||||
}
|
||||
|
||||
|
||||
// timestamp log
|
||||
if ( m_debug ) {
|
||||
log(LOG_DEBUG,"query: msg39: [%lu] Preparing to intersect "
|
||||
@ -777,7 +809,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
|
||||
// . create the thread
|
||||
// . only one of these type of threads should be launched at a time
|
||||
if ( g_threads.call ( INTERSECT_THREAD , // threadType
|
||||
if ( ! m_debug &&
|
||||
g_threads.call ( INTERSECT_THREAD , // threadType
|
||||
m_r->m_niceness ,
|
||||
this , // top 4 bytes must be cback
|
||||
threadDoneWrapper ,
|
||||
@ -806,6 +839,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// time it
|
||||
diff = gettimeofdayInMilliseconds() - start;
|
||||
if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
|
||||
|
||||
// returns false if blocked, true otherwise
|
||||
return addedLists ();
|
||||
}
|
||||
@ -982,7 +1016,7 @@ bool Msg39::setClusterRecs ( ) {
|
||||
m_clusterLevels ,
|
||||
m_clusterRecs ,
|
||||
m_numClusterDocIds ,
|
||||
m_r->ptr_coll ,
|
||||
m_r->m_collnum ,
|
||||
0 , // maxAge
|
||||
false , // addToCache
|
||||
this ,
|
||||
@ -1095,7 +1129,7 @@ void Msg39::estimateHits ( ) {
|
||||
|
||||
// convenience ptrs. we will store the docids/scores into these arrays
|
||||
long long *topDocIds;
|
||||
float *topScores;
|
||||
double *topScores;
|
||||
key_t *topRecs;
|
||||
|
||||
// numDocIds counts docs in all tiers when using toptree.
|
||||
@ -1162,7 +1196,7 @@ void Msg39::estimateHits ( ) {
|
||||
mr.ptr_clusterRecs = NULL;
|
||||
// this is how much space to reserve
|
||||
mr.size_docIds = 8 * numDocIds; // long long
|
||||
mr.size_scores = 4 * numDocIds; // float
|
||||
mr.size_scores = sizeof(double) * numDocIds; // float
|
||||
// if not doing site clustering, we won't have these perhaps...
|
||||
if ( m_gotClusterRecs )
|
||||
mr.size_clusterRecs = sizeof(key_t) *numDocIds;
|
||||
@ -1190,7 +1224,7 @@ void Msg39::estimateHits ( ) {
|
||||
return ;
|
||||
}
|
||||
topDocIds = (long long *) mr.ptr_docIds;
|
||||
topScores = (float *) mr.ptr_scores;
|
||||
topScores = (double *) mr.ptr_scores;
|
||||
topRecs = (key_t *) mr.ptr_clusterRecs;
|
||||
}
|
||||
|
||||
@ -1224,6 +1258,8 @@ void Msg39::estimateHits ( ) {
|
||||
//add it to the reply
|
||||
topDocIds [docCount] = t->m_docId;
|
||||
topScores [docCount] = t->m_score;
|
||||
if ( m_tt.m_useIntScores )
|
||||
topScores[docCount] = (double)t->m_intScore;
|
||||
// supply clusterdb rec? only for full splits
|
||||
if ( m_gotClusterRecs )
|
||||
topRecs [docCount] = t->m_clusterRec;
|
||||
|
13
Msg39.h
13
Msg39.h
@ -49,6 +49,7 @@ class Msg39Request {
|
||||
m_useMinAlgo = false;
|
||||
m_fastIntersection = -1;
|
||||
m_stripe = 0;
|
||||
m_collnum = -1;
|
||||
m_useQueryStopWords = true;
|
||||
m_useNewAlgo = true;
|
||||
m_doMaxScoreAlgo = true;
|
||||
@ -58,12 +59,12 @@ class Msg39Request {
|
||||
ptr_readSizes = NULL;
|
||||
ptr_query = NULL; // in utf8?
|
||||
ptr_whiteList = NULL;
|
||||
ptr_coll = NULL;
|
||||
//ptr_coll = NULL;
|
||||
|
||||
size_readSizes = 0;
|
||||
size_query = 0;
|
||||
size_whiteList = 0;
|
||||
size_coll = 0;
|
||||
//size_coll = 0;
|
||||
|
||||
m_getDocIdScoringInfo = 1;
|
||||
|
||||
@ -115,6 +116,8 @@ class Msg39Request {
|
||||
char m_useMinAlgo;
|
||||
char m_fastIntersection;
|
||||
|
||||
collnum_t m_collnum;
|
||||
|
||||
long long m_minDocId;
|
||||
long long m_maxDocId;
|
||||
bool m_makeReply;
|
||||
@ -128,13 +131,13 @@ class Msg39Request {
|
||||
char *ptr_termFreqWeights;
|
||||
char *ptr_query; // in utf8?
|
||||
char *ptr_whiteList;
|
||||
char *ptr_coll;
|
||||
//char *ptr_coll;
|
||||
|
||||
long size_readSizes;
|
||||
long size_termFreqWeights;
|
||||
long size_query;
|
||||
long size_whiteList;
|
||||
long size_coll;
|
||||
//long size_coll;
|
||||
|
||||
char m_buf[0];
|
||||
};
|
||||
@ -158,7 +161,7 @@ public:
|
||||
long m_errno;
|
||||
|
||||
char *ptr_docIds ; // the results, long long
|
||||
char *ptr_scores; ; // floats
|
||||
char *ptr_scores; ; // now doubles! so we can have intScores
|
||||
char *ptr_scoreInfo ; // transparency info
|
||||
char *ptr_pairScoreBuf ; // transparency info
|
||||
char *ptr_singleScoreBuf ; // transparency info
|
||||
|
61
Msg3a.cpp
61
Msg3a.cpp
@ -20,6 +20,7 @@ void Msg3a::constructor ( ) {
|
||||
m_finalBuf = NULL;
|
||||
m_docsToGet = 0;
|
||||
m_numDocIds = 0;
|
||||
m_collnums = NULL;
|
||||
|
||||
// need to call all safebuf constructors now to set m_label
|
||||
m_rbuf2.constructor();
|
||||
@ -68,6 +69,8 @@ void Msg3a::reset ( ) {
|
||||
m_docsToGet = 0;
|
||||
m_errno = 0;
|
||||
m_numDocIds = 0;
|
||||
m_collnums = NULL;
|
||||
m_numTotalEstimatedHits = 0LL;
|
||||
}
|
||||
|
||||
Msg39Request *g_r = NULL;
|
||||
@ -139,8 +142,9 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
|
||||
m_state = state;
|
||||
|
||||
// warning. coll size includes \0
|
||||
if ( ! m_r->ptr_coll || m_r->size_coll-1 <= 0 )
|
||||
log(LOG_LOGIC,"net: NULL or bad collection. msg3a.");
|
||||
if ( ! m_r->m_collnum < 0 ) // ptr_coll || m_r->size_coll-1 <= 0 )
|
||||
log(LOG_LOGIC,"net: bad collection. msg3a. %li",
|
||||
(long)m_r->m_collnum);
|
||||
|
||||
//m_indexdbSplit = g_hostdb.m_indexSplits;
|
||||
// certain query term, like, gbdom:xyz.com, are NOT split
|
||||
@ -171,7 +175,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
|
||||
return true;
|
||||
// . set g_errno if not found and return true
|
||||
// . coll is null terminated
|
||||
CollectionRec *cr = g_collectiondb.getRec(r->ptr_coll, r->size_coll-1);
|
||||
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
|
||||
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
|
||||
|
||||
// query is truncated if had too many terms in it
|
||||
@ -201,7 +205,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
|
||||
if ( m_r->m_useSeoResultsCache ) {
|
||||
// the all important seo results cache key
|
||||
m_ckey.n0 = hash64 ( m_r->ptr_query ,m_r->size_query - 1 ,0 );
|
||||
m_ckey.n0 = hash64 ( m_r->ptr_coll,m_r->size_coll, m_ckey.n0);
|
||||
m_ckey.n0 = hash64h ( (long long)m_r->m_collnum, m_ckey.n0);
|
||||
m_ckey.n0 = hash64 ( (char *)&m_r->m_language,1 , m_ckey.n0 );
|
||||
m_ckey.n0 = hash64 ( (char *)&m_r->m_docsToGet,4, m_ckey.n0 );
|
||||
// this should be non-zero so g_hostdb.getGroupId(RDB_SERPDB)
|
||||
@ -236,7 +240,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
|
||||
0 , // maxcacheage
|
||||
false, // addtocache?
|
||||
RDB_SERPDB,//RDB_CACHEDB,
|
||||
m_r->ptr_coll,
|
||||
m_r->m_collnum,//ptr_coll,
|
||||
&m_seoCacheList,
|
||||
(char *)&startKey ,
|
||||
(char *)&endKey,
|
||||
@ -277,8 +281,8 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
m_docIds = (long long *)p;
|
||||
p += 8 * m_numDocIds;
|
||||
// scores
|
||||
m_scores = (float *)p;
|
||||
p += sizeof(float) * m_numDocIds;
|
||||
m_scores = (double *)p;
|
||||
p += sizeof(double) * m_numDocIds;
|
||||
// site hashes
|
||||
m_siteHashes26 = (long *)p;
|
||||
p += 4 * m_numDocIds;
|
||||
@ -303,10 +307,10 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
CollectionRec *cr;
|
||||
cr = g_collectiondb.getRec(m_r->ptr_coll,m_r->size_coll-1);
|
||||
//CollectionRec *cr;
|
||||
//cr = g_collectiondb.getRec(m_r->ptr_coll,m_r->size_coll-1);
|
||||
|
||||
setTermFreqWeights ( cr->m_coll,m_q,m_termFreqs , m_termFreqWeights );
|
||||
setTermFreqWeights ( m_r->m_collnum,m_q,m_termFreqs,m_termFreqWeights);
|
||||
|
||||
if ( m_debug ) {
|
||||
//long long *termIds = m_q->getTermIds();
|
||||
@ -402,7 +406,7 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
// end up copying over ourselves.
|
||||
m_rbufPtr = serializeMsg ( sizeof(Msg39Request),
|
||||
&m_r->size_readSizes,
|
||||
&m_r->size_coll,
|
||||
&m_r->size_whiteList,
|
||||
&m_r->ptr_readSizes,
|
||||
m_r,
|
||||
&m_rbufSize ,
|
||||
@ -727,20 +731,20 @@ bool Msg3a::gotAllSplitReplies ( ) {
|
||||
if ( ! m_debug ) continue;
|
||||
// cast these for printing out
|
||||
long long *docIds = (long long *)mr->ptr_docIds;
|
||||
score_t *scores = (score_t *)mr->ptr_scores;
|
||||
double *scores = (double *)mr->ptr_scores;
|
||||
// print out every docid in this split reply
|
||||
for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
|
||||
// print out score_t
|
||||
logf( LOG_DEBUG,
|
||||
"query: msg3a: [%lu] %03li) "
|
||||
"split=%li docId=%012llu domHash=0x%02lx "
|
||||
"score=%lu" ,
|
||||
"score=%f" ,
|
||||
(unsigned long)this ,
|
||||
j ,
|
||||
i ,
|
||||
docIds [j] ,
|
||||
(long)g_titledb.getDomHash8FromDocId(docIds[j]),
|
||||
(long)scores[j] );
|
||||
(float)scores[j] );
|
||||
}
|
||||
}
|
||||
|
||||
@ -772,7 +776,7 @@ bool Msg3a::gotAllSplitReplies ( ) {
|
||||
for ( long i = 0 ; i < max ; i++ )
|
||||
cr.pushLongLong(m_docIds[i] );
|
||||
for ( long i = 0 ; i < max ; i++ )
|
||||
cr.pushFloat(m_scores[i]);
|
||||
cr.pushDouble(m_scores[i]);
|
||||
for ( long i = 0 ; i < max ; i++ )
|
||||
cr.pushLong(getSiteHash26(i));
|
||||
// sanity
|
||||
@ -807,7 +811,7 @@ bool Msg3a::gotAllSplitReplies ( ) {
|
||||
// this will often block, but who cares!? it just sends a request off
|
||||
if ( ! m_msg1.addList ( &m_seoCacheList ,
|
||||
RDB_SERPDB,//RDB_CACHEDB,
|
||||
m_r->ptr_coll,
|
||||
m_r->m_collnum,//ptr_coll,
|
||||
this, // state
|
||||
gotSerpdbReplyWrapper, // callback
|
||||
false, // forcelocal?
|
||||
@ -849,7 +853,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
// . tcPtr = term count. how many required query terms does the doc
|
||||
// have? formerly called topExplicits in IndexTable2.cpp
|
||||
long long *diPtr [MAX_INDEXDB_SPLIT];
|
||||
float *rsPtr [MAX_INDEXDB_SPLIT];
|
||||
double *rsPtr [MAX_INDEXDB_SPLIT];
|
||||
key_t *ksPtr [MAX_INDEXDB_SPLIT];
|
||||
long long *diEnd [MAX_INDEXDB_SPLIT];
|
||||
for ( long j = 0; j < m_numHosts ; j++ ) {
|
||||
@ -863,7 +867,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
continue;
|
||||
}
|
||||
diPtr [j] = (long long *)mr->ptr_docIds;
|
||||
rsPtr [j] = (float *)mr->ptr_scores;
|
||||
rsPtr [j] = (double *)mr->ptr_scores;
|
||||
ksPtr [j] = (key_t *)mr->ptr_clusterRecs;
|
||||
diEnd [j] = (long long *)(mr->ptr_docIds +
|
||||
mr->m_numDocIds * 8);
|
||||
@ -919,7 +923,8 @@ bool Msg3a::mergeLists ( ) {
|
||||
|
||||
// . how much do we need to store final merged docids, etc.?
|
||||
// . docid=8 score=sizeof(double) clusterRecs=key_t scoreInfo=sizeof(DocIdScore *) clusterLevels=1
|
||||
long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1);
|
||||
long need = m_docsToGet * (8+sizeof(double)+
|
||||
sizeof(key_t)+sizeof(DocIdScore *)+1);
|
||||
// allocate it
|
||||
m_finalBuf = (char *)mmalloc ( need , "finalBuf" );
|
||||
m_finalBufSize = need;
|
||||
@ -928,7 +933,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
// hook into it
|
||||
char *p = m_finalBuf;
|
||||
m_docIds = (long long *)p; p += m_docsToGet * 8;
|
||||
m_scores = (float *)p; p += m_docsToGet * sizeof(float);
|
||||
m_scores = (double *)p; p += m_docsToGet * sizeof(double);
|
||||
m_clusterRecs = (key_t *)p; p += m_docsToGet * sizeof(key_t);
|
||||
m_clusterLevels = (char *)p; p += m_docsToGet * 1;
|
||||
m_scoreInfos = (DocIdScore **)p;p+=m_docsToGet*sizeof(DocIdScore *);
|
||||
@ -1078,7 +1083,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
|
||||
// store the score as a double (this used to be a float, i.e. rscore_t).
|
||||
// we do this to make it easier for PostQueryRerank.cpp
|
||||
m_scores [m_numDocIds]=(float)*rsPtr[maxj];
|
||||
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
|
||||
if ( m_r->m_doSiteClustering )
|
||||
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
|
||||
// clear this out
|
||||
@ -1142,7 +1147,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
long Msg3a::getStoredSize ( ) {
|
||||
// docId=8, scores=sizeof(rscore_t), clusterLevel=1 bitScores=1
|
||||
// eventIds=1
|
||||
long need = m_numDocIds * ( 8 + sizeof(rscore_t) + 1 ) +
|
||||
long need = m_numDocIds * ( 8 + sizeof(double) + 1 ) +
|
||||
4 + // m_numDocIds
|
||||
8 ; // m_numTotalEstimatedHits (estimated # of results)
|
||||
return need;
|
||||
@ -1158,8 +1163,8 @@ long Msg3a::serialize ( char *buf , char *bufEnd ) {
|
||||
// store each docid, 8 bytes each
|
||||
memcpy ( p , m_docIds , m_numDocIds * 8 ); p += m_numDocIds * 8;
|
||||
// store scores
|
||||
memcpy ( p , m_scores , m_numDocIds * sizeof(rscore_t) );
|
||||
p += m_numDocIds * sizeof(rscore_t) ;
|
||||
memcpy ( p , m_scores , m_numDocIds * sizeof(double) );
|
||||
p += m_numDocIds * sizeof(double) ;
|
||||
// store cluster levels
|
||||
memcpy ( p , m_clusterLevels , m_numDocIds ); p += m_numDocIds;
|
||||
// sanity check
|
||||
@ -1178,7 +1183,7 @@ long Msg3a::deserialize ( char *buf , char *bufEnd ) {
|
||||
// get each docid, 8 bytes each
|
||||
m_docIds = (long long *)p; p += m_numDocIds * 8;
|
||||
// get scores
|
||||
m_scores = (rscore_t *)p; p += m_numDocIds * sizeof(rscore_t) ;
|
||||
m_scores = (double *)p; p += m_numDocIds * sizeof(double) ;
|
||||
// get cluster levels
|
||||
m_clusterLevels = (char *)p; p += m_numDocIds;
|
||||
// sanity check
|
||||
@ -1214,13 +1219,13 @@ void Msg3a::printTerms ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
void setTermFreqWeights ( char *coll,
|
||||
void setTermFreqWeights ( collnum_t collnum , // char *coll,
|
||||
Query *q ,
|
||||
long long *termFreqs,
|
||||
float *termFreqWeights ) {
|
||||
|
||||
long long numDocsInColl = 0;
|
||||
RdbBase *base = getRdbBase ( RDB_CLUSTERDB , coll );
|
||||
RdbBase *base = getRdbBase ( RDB_CLUSTERDB , collnum );
|
||||
if ( base ) numDocsInColl = base->getNumGlobalRecs();
|
||||
// issue? set it to 1000 if so
|
||||
if ( numDocsInColl < 0 ) {
|
||||
@ -1232,7 +1237,7 @@ void setTermFreqWeights ( char *coll,
|
||||
long long *termIds = q->getTermIds();
|
||||
// just use rdbmap to estimate!
|
||||
for ( long i = 0 ; i < q->getNumTerms(); i++ ) {
|
||||
long long tf = g_posdb.getTermFreq ( coll ,termIds[i]);
|
||||
long long tf = g_posdb.getTermFreq ( collnum ,termIds[i]);
|
||||
if ( termFreqs ) termFreqs[i] = tf;
|
||||
float tfw = getTermFreqWeight(tf,numDocsInColl);
|
||||
termFreqWeights[i] = tfw;
|
||||
|
19
Msg3a.h
19
Msg3a.h
@ -11,7 +11,7 @@
|
||||
// 90MB for 32 nodes we got now with about 1.3B docs
|
||||
#define DEFAULT_POSDB_READSIZE 90000000
|
||||
|
||||
void setTermFreqWeights ( char *coll,
|
||||
void setTermFreqWeights ( collnum_t collnum, // char *coll,
|
||||
class Query *q ,
|
||||
long long *termFreqs,
|
||||
float *termFreqWeights ) ;
|
||||
@ -61,7 +61,7 @@ public:
|
||||
// we basically turn the scores we get from each msg39 split into
|
||||
// floats (rscore_t) and store them as floats so that PostQueryRerank
|
||||
// has an easier time
|
||||
float *getScores ( ) { return m_scores; };
|
||||
double *getScores ( ) { return m_scores; };
|
||||
long getNumDocIds ( ) { return m_numDocIds; };
|
||||
|
||||
long getSiteHash26 ( long i ) {
|
||||
@ -125,6 +125,10 @@ public:
|
||||
// this is set if IndexTable::addLists() had an error
|
||||
long m_errno;
|
||||
|
||||
// this is now in here so Msg40 can send out one Msg3a per
|
||||
// collection if it wants to search an entire token
|
||||
Msg39Request m_rrr;
|
||||
|
||||
// use msg37 to get TermFreqs
|
||||
//Msg37 m_msg37;
|
||||
long long m_termFreqs [MAX_QUERY_TERMS];
|
||||
@ -160,16 +164,25 @@ public:
|
||||
|
||||
// final merged lists go here
|
||||
long long *m_docIds ;
|
||||
float *m_scores ;
|
||||
double *m_scores ;
|
||||
class DocIdScore **m_scoreInfos ;
|
||||
//key_t *m_recs ; // clusterdb recs
|
||||
key_t *m_clusterRecs ;
|
||||
char *m_clusterLevels ;
|
||||
// this is new
|
||||
collnum_t *m_collnums;
|
||||
long m_numDocIds ;
|
||||
// the above ptrs point into this buffer
|
||||
char *m_finalBuf;
|
||||
long m_finalBufSize;
|
||||
|
||||
// when merging this list of docids into a final list keep
|
||||
// track of the cursor into m_docIds[]
|
||||
long m_cursor;
|
||||
|
||||
// what collection # are these docids from if m_collnums[] is NULL
|
||||
//collnum_t m_collnum;
|
||||
|
||||
//
|
||||
// new things for seoresults cache
|
||||
//
|
||||
|
24
Msg4.cpp
24
Msg4.cpp
@ -159,7 +159,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
// . injecting into the "test" coll flushes after each inject
|
||||
// . injecting into the "qatest123" coll flushes after each inject
|
||||
// . returns false if blocked and callback will be called
|
||||
bool flushMsg4Buffers ( void *state , void (* callback) (void *) ) {
|
||||
// if all empty, return true now
|
||||
@ -535,6 +535,8 @@ bool Msg4::addMetaList ( char *metaList ,
|
||||
m_next = NULL;
|
||||
m_shardOverride = shardOverride;
|
||||
|
||||
retry:
|
||||
|
||||
// get in line if there's a line
|
||||
if ( s_msg4Head ) {
|
||||
// add ourselves to the line
|
||||
@ -554,8 +556,21 @@ bool Msg4::addMetaList ( char *metaList ,
|
||||
// then do it
|
||||
if ( addMetaList2 ( ) ) return true;
|
||||
|
||||
// sanity check
|
||||
if ( s_msg4Head || s_msg4Tail ) { char *xx=NULL; *xx=0; }
|
||||
// . sanity check
|
||||
// . we sometimes get called with niceness 0 from possibly
|
||||
// an injection or something and from a quickpoll
|
||||
// inside addMetaList2() in which case our addMetaList2() will
|
||||
// fail, assuming s_msg4Head got set, BUT it SHOULD be OK because
|
||||
// being interrupted at the one QUICKPOLL() in addMetaList2()
|
||||
// doesn't seem like it would hurt.
|
||||
// . FURTHERMORE the multicast seems to always be called with
|
||||
// MAX_NICENESS so i'm not sure how niceness 0 will really help
|
||||
// with any of this stuff.
|
||||
//if ( s_msg4Head || s_msg4Tail ) { char *xx=NULL; *xx=0; }
|
||||
if ( s_msg4Head || s_msg4Tail ) {
|
||||
log("msg4: got unexpected head"); // :)
|
||||
goto retry;
|
||||
}
|
||||
|
||||
// . spider hang bug
|
||||
// . debug log. seems to happen a lot if not using threads..
|
||||
@ -708,6 +723,9 @@ bool Msg4::addMetaList2 ( ) {
|
||||
// flush them buffers
|
||||
//flushLocal();
|
||||
|
||||
// in case this was being used to hold the data, free it
|
||||
m_tmpBuf.purge();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
4
Msg4.h
4
Msg4.h
@ -17,6 +17,8 @@ bool addMetaList ( char *p , class UdpSlot *slot = NULL ) ;
|
||||
|
||||
bool isInMsg4LinkedList ( class Msg4 *msg4 ) ;
|
||||
|
||||
#include "SafeBuf.h"
|
||||
|
||||
class Msg4 {
|
||||
|
||||
public:
|
||||
@ -90,6 +92,8 @@ class Msg4 {
|
||||
void (*m_callback ) ( void *state );
|
||||
void *m_state;
|
||||
|
||||
SafeBuf m_tmpBuf;
|
||||
|
||||
char m_rdbId;
|
||||
char m_inUse;
|
||||
collnum_t m_collnum;
|
||||
|
52
Msg40.h
52
Msg40.h
@ -117,7 +117,7 @@ class Msg40 {
|
||||
bool computeGigabits( class TopicGroup *tg );
|
||||
SafeBuf m_gigabitBuf;
|
||||
|
||||
#ifdef NEEDLICENSE
|
||||
// nuggabits...
|
||||
bool computeFastFacts ( );
|
||||
bool addFacts ( HashTableX *queryTable,
|
||||
HashTableX *gbitTable ,
|
||||
@ -126,13 +126,14 @@ class Msg40 {
|
||||
bool debugGigabits ,
|
||||
class Msg20Reply *reply,
|
||||
SafeBuf *factBuf ) ;
|
||||
#endif
|
||||
|
||||
SafeBuf m_factBuf;
|
||||
|
||||
// keep these public since called by wrapper functions
|
||||
bool gotDocIds ( ) ;
|
||||
bool launchMsg20s ( bool recalled ) ;
|
||||
class Msg20 *getAvailMsg20();
|
||||
class Msg20 *getCompletedSummary ( long ix );
|
||||
bool getSummaries ( ) ;
|
||||
bool gotSummary ( ) ;
|
||||
bool reallocMsg20Buf ( ) ;
|
||||
@ -158,8 +159,8 @@ class Msg40 {
|
||||
// . these routines give us back our inputted parameters we saved
|
||||
char *getQuery ( ) { return m_si->m_q->getQuery(); };
|
||||
long getQueryLen ( ) { return m_si->m_q->getQueryLen(); };
|
||||
char *getColl ( ) { return m_si->m_coll2; };
|
||||
long getCollLen ( ) { return m_si->m_collLen2; };
|
||||
//char *getColl ( ) { return m_si->m_coll2; };
|
||||
//long getCollLen ( ) { return m_si->m_collLen2; };
|
||||
long getDocsWanted ( ) { return m_si->m_docsWanted; };
|
||||
long getFirstResultNum ( ) { return m_si->m_firstResultNum; };
|
||||
|
||||
@ -171,7 +172,10 @@ class Msg40 {
|
||||
long long getDocId ( long i ){return m_msg3a.m_docIds[i]; };
|
||||
long long *getDocIds( ){return m_msg3a.m_docIds; };
|
||||
float getScore ( long i ){return m_msg3a.m_scores[i]; };
|
||||
class DocIdScore *getScoreInfo(long i){return m_msg3a.m_scoreInfos[i];}
|
||||
class DocIdScore *getScoreInfo(long i){
|
||||
if ( ! m_msg3a.m_scoreInfos ) return NULL;
|
||||
return m_msg3a.m_scoreInfos[i];
|
||||
}
|
||||
//LinkInfo *getLinkInfo( long i){return m_msg20[i]->m_linkInfo; }
|
||||
bool moreResultsFollow ( ) {return m_moreToCome; };
|
||||
time_t getCachedTime ( ) {return m_cachedTime; };
|
||||
@ -202,8 +206,21 @@ class Msg40 {
|
||||
// Msg39 and all Msg20s must use the same clock timestamp
|
||||
time_t m_nowUTC;
|
||||
|
||||
long m_lastHeartbeat;
|
||||
|
||||
bool printSearchResult9 ( long ix ) ;
|
||||
HashTableX m_columnTable;
|
||||
bool printCSVHeaderRow ( class SafeBuf *sb );
|
||||
bool printJsonItemInCSV ( class State0 *st , long ix );
|
||||
long m_numCSVColumns;
|
||||
|
||||
|
||||
HashTableX m_dedupTable;
|
||||
|
||||
long m_msg3aRecallCnt;
|
||||
Msg39Request m_r;
|
||||
// this goes into msg3a now so we can send multiple msg3as out,
|
||||
// 1 per collection
|
||||
//Msg39Request m_r;
|
||||
|
||||
long m_docsToGet;
|
||||
long m_docsToGetVisible;
|
||||
@ -211,7 +228,9 @@ class Msg40 {
|
||||
// incoming parameters
|
||||
void *m_state;
|
||||
void (* m_callback ) ( void *state );
|
||||
|
||||
|
||||
long m_needFirstReplies;
|
||||
|
||||
// max outstanding msg20s
|
||||
//long m_maxOutstanding;
|
||||
|
||||
@ -237,6 +256,17 @@ class Msg40 {
|
||||
char *m_msg20StartBuf;
|
||||
long m_numToFree;
|
||||
|
||||
bool m_hadPrintError ;
|
||||
long m_numPrinted ;
|
||||
bool m_printedHeader ;
|
||||
bool m_printedTail ;
|
||||
bool m_lastChunk ;
|
||||
long m_sendsOut ;
|
||||
long m_sendsIn ;
|
||||
long m_printi ;
|
||||
long m_socketHadError;
|
||||
|
||||
|
||||
// use msg3a to get docIds
|
||||
Msg3a m_msg3a;
|
||||
|
||||
@ -307,6 +337,14 @@ class Msg40 {
|
||||
// Msg2b for generating a directory
|
||||
//Msg2b m_msg2b;
|
||||
|
||||
bool mergeDocIdsIntoBaseMsg3a();
|
||||
long m_numCollsToSearch;
|
||||
class Msg3a **m_msg3aPtrs;
|
||||
SafeBuf m_msg3aPtrBuf;
|
||||
long m_num3aRequests;
|
||||
long m_num3aReplies;
|
||||
collnum_t m_firstCollnum;
|
||||
|
||||
PostQueryRerank m_postQueryRerank;
|
||||
|
||||
HashTableT<uint64_t, uint64_t> m_urlTable;
|
||||
|
50
Msg5.cpp
50
Msg5.cpp
@ -114,7 +114,7 @@ void makeCacheKey ( char *startKey ,
|
||||
// another special meaning. it tells msg5 to tell RdbTree's getList() to
|
||||
// pre-allocate the list size by counting the recs ahead of time.
|
||||
bool Msg5::getList ( char rdbId ,
|
||||
char *coll ,
|
||||
collnum_t collnum ,
|
||||
RdbList *list ,
|
||||
//key_t startKey ,
|
||||
//key_t endKey ,
|
||||
@ -157,7 +157,7 @@ bool Msg5::getList ( char rdbId ,
|
||||
// sanity
|
||||
if ( ! list && mergeLists ) { char *xx=NULL;*xx=0; }
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg5.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"net: bad collection. msg5.");
|
||||
// MUST have this
|
||||
//if ( rdbId == RDB_TITLEDB && ! msg5b ) {
|
||||
// log(LOG_LOGIC,"net: No msg5b supplied. 1.");
|
||||
@ -202,10 +202,10 @@ bool Msg5::getList ( char rdbId ,
|
||||
//m_startTime = gettimeofdayInMilliseconds();
|
||||
// remember stuff
|
||||
m_rdbId = rdbId;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
|
||||
m_collnum = g_collectiondb.getCollnum ( coll );
|
||||
if ( m_collnum < 0 ) {
|
||||
CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
|
||||
if ( ! ttt ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
@ -237,7 +237,7 @@ bool Msg5::getList ( char rdbId ,
|
||||
m_mergeLists = mergeLists;
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
|
||||
// point to cache
|
||||
//RdbCache *cache = base->m_rdb->getCache();
|
||||
// . these 2 vars are used for error correction
|
||||
@ -487,7 +487,7 @@ bool Msg5::getList ( char rdbId ,
|
||||
// . loops until m_minRecSizes is satisfied OR m_endKey is reached
|
||||
bool Msg5::readList ( ) {
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
|
||||
readMore:
|
||||
// . reset our tree list
|
||||
// . sets fixedDataSize here in case m_includeTree is false because
|
||||
@ -525,7 +525,7 @@ bool Msg5::readList ( ) {
|
||||
if ( m_isRealMerge ) niceness = 1;
|
||||
if ( compute ) {
|
||||
m_msg3.readList ( m_rdbId ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
m_fileStartKey , // modified by gotList()
|
||||
m_endKey ,
|
||||
m_newMinRecSizes , // modified by gotList()
|
||||
@ -722,6 +722,11 @@ bool Msg5::readList ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
// limit to 20MB so we don't go OOM!
|
||||
if ( m_newMinRecSizes > 2 * m_minRecSizes &&
|
||||
m_newMinRecSizes > 20000000 )
|
||||
m_newMinRecSizes = 20000000;
|
||||
|
||||
|
||||
QUICKPOLL((m_niceness));
|
||||
// debug msg
|
||||
@ -747,7 +752,7 @@ bool Msg5::readList ( ) {
|
||||
// . if compensateForMerge is true then m_startFileNum/m_numFiles
|
||||
// will be appropriately mapped around the merge
|
||||
if ( ! m_msg3.readList ( m_rdbId ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
m_fileStartKey , // modified by gotList()
|
||||
diskEndKey ,
|
||||
m_newMinRecSizes , // modified by gotList()
|
||||
@ -794,10 +799,10 @@ void Msg5::copyAndSendBackList ( RdbList *listSrc ) {
|
||||
bool Msg5::needsRecall ( ) {
|
||||
bool logIt;
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base = getRdbBase ( m_rdbId , m_coll );
|
||||
RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
|
||||
// if collection was deleted from under us, base will be NULL
|
||||
if ( ! base && ! g_errno ) {
|
||||
log("msg5: base lost for coll %s",m_coll);
|
||||
log("msg5: base lost for collnum %li",(long)m_collnum);
|
||||
return false;
|
||||
}
|
||||
// sanity check
|
||||
@ -849,11 +854,14 @@ bool Msg5::needsRecall ( ) {
|
||||
// seems to be very common for doledb, so don't log unless extreme
|
||||
//if ( m_rdbId == RDB_DOLEDB && m_round < 15 ) logIt = false;
|
||||
if ( m_round > 100 && (m_round % 1000) != 0 ) logIt = false;
|
||||
// seems very common when doing rebalancing then merging to have
|
||||
// to do at least one round of re-reading, so note that
|
||||
if ( m_round == 0 ) logIt = false;
|
||||
if ( logIt )
|
||||
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
|
||||
"got %li) this=0x%lx round=%li.",
|
||||
"got %li) cn=%li this=0x%lx round=%li.",
|
||||
m_newMinRecSizes , base->m_dbname , m_minRecSizes,
|
||||
m_list->m_listSize, (long)this , m_round );
|
||||
m_list->m_listSize, (long)m_collnum,(long)this, m_round );
|
||||
m_round++;
|
||||
// record how many screw ups we had so we know if it hurts performance
|
||||
base->m_rdb->didReSeek ( );
|
||||
@ -1167,7 +1175,7 @@ bool Msg5::gotList2 ( ) {
|
||||
}
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
|
||||
|
||||
// if not enough lists, use a dummy list to trigger merge so tfndb
|
||||
// filter happens and we have a chance to weed out old titleRecs
|
||||
@ -1523,7 +1531,7 @@ void Msg5::repairLists_r ( ) {
|
||||
// . logging the key ranges gives us an idea of how long
|
||||
// it will take to patch the bad data
|
||||
long nn = m_msg3.m_numFileNums;
|
||||
RdbBase *base = getRdbBase ( m_rdbId , m_coll );
|
||||
RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
|
||||
if ( i < nn && base ) {
|
||||
long fn = m_msg3.m_fileNums[i];
|
||||
BigFile *bf = base->getFile ( fn );
|
||||
@ -1574,7 +1582,7 @@ void Msg5::mergeLists_r ( ) {
|
||||
if ( KEYCMP(m_prevKey,m_fileStartKey,m_ks)>=0 ) m_prevCount = 0;
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) {
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) {
|
||||
log("No collection found."); return; }
|
||||
|
||||
/*
|
||||
@ -1747,7 +1755,7 @@ bool Msg5::doneMerging ( ) {
|
||||
//m_waitingForMerge = false;
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
|
||||
|
||||
// . if there was a merge error, bitch about it
|
||||
// . Thread class should propagate g_errno when it was set in a thread
|
||||
@ -1764,8 +1772,8 @@ bool Msg5::doneMerging ( ) {
|
||||
// our first merge
|
||||
if ( m_hadCorruption ) {
|
||||
// log it here, cuz logging in thread doesn't work too well
|
||||
log("net: Encountered a corrupt list in rdb=%s coll=%s",
|
||||
base->m_dbname,m_coll);
|
||||
log("net: Encountered a corrupt list in rdb=%s collnum=%li",
|
||||
base->m_dbname,(long)m_collnum);
|
||||
// remove error condition, we removed the bad data in thread
|
||||
|
||||
m_hadCorruption = false;
|
||||
@ -1891,7 +1899,7 @@ bool Msg5::doneMerging ( ) {
|
||||
|
||||
// . for every round we get call increase by 10 percent
|
||||
// . try to fix all those negative recs in the rebalance re-run
|
||||
m_newMinRecSizes *= (1.0 + (m_round * .10));
|
||||
m_newMinRecSizes *= (long)(1.0 + (m_round * .10));
|
||||
|
||||
// wrap around?
|
||||
if ( m_newMinRecSizes < 0 || m_newMinRecSizes > 1000000000 )
|
||||
@ -2003,7 +2011,7 @@ bool Msg5::getRemoteList ( ) {
|
||||
0 , // max cached age
|
||||
false , // add to cache?
|
||||
m_rdbId , // rdbId
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
m_list ,
|
||||
m_startKey ,
|
||||
m_endKey ,
|
||||
|
10
Msg5.h
10
Msg5.h
@ -66,7 +66,8 @@ class Msg5 {
|
||||
// . if maxCacheAge is > 0, we lookup in cache first
|
||||
bool getList ( //class RdbBase *base ,
|
||||
char rdbId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
RdbList *list ,
|
||||
//key_t startKey ,
|
||||
//key_t endKey ,
|
||||
@ -99,7 +100,8 @@ class Msg5 {
|
||||
|
||||
bool getList ( //class RdbBase *base ,
|
||||
char rdbId ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
RdbList *list ,
|
||||
key_t startKey ,
|
||||
key_t endKey ,
|
||||
@ -125,7 +127,7 @@ class Msg5 {
|
||||
bool allowPageCache = true ,
|
||||
bool hitDisk = true ) {
|
||||
return getList ( rdbId ,
|
||||
coll ,
|
||||
collnum ,
|
||||
list ,
|
||||
(char *)&startKey ,
|
||||
(char *)&endKey ,
|
||||
@ -216,7 +218,7 @@ class Msg5 {
|
||||
long m_startFileNum;
|
||||
long m_minRecSizes;
|
||||
//RdbBase *m_base;
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
char m_rdbId;
|
||||
|
||||
// . cache may modify these
|
||||
|
21
Msg51.cpp
21
Msg51.cpp
@ -75,7 +75,8 @@ bool Msg51::getClusterRecs ( long long *docIds ,
|
||||
char *clusterLevels ,
|
||||
key_t *clusterRecs ,
|
||||
long numDocIds ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
long maxCacheAge ,
|
||||
bool addToCache ,
|
||||
void *state ,
|
||||
@ -87,12 +88,13 @@ bool Msg51::getClusterRecs ( long long *docIds ,
|
||||
// reset this msg
|
||||
reset();
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg51.");
|
||||
if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg51.");
|
||||
// get the collection rec
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
CollectionRec *cr = g_collectiondb.getRec ( collnum );
|
||||
// return true on error, g_errno should already be set
|
||||
if ( ! cr ) {
|
||||
log("db: msg51. Collection rec null for coll %s.", coll);
|
||||
log("db: msg51. Collection rec null for collnum %li.",
|
||||
(long)collnum);
|
||||
g_errno = EBADENGINEER;
|
||||
char *xx=NULL; *xx=0;
|
||||
return true;
|
||||
@ -102,8 +104,9 @@ bool Msg51::getClusterRecs ( long long *docIds ,
|
||||
m_addToCache = addToCache;
|
||||
m_state = state;
|
||||
m_callback = callback;
|
||||
m_coll = coll;
|
||||
m_collLen = gbstrlen(coll);
|
||||
//m_coll = coll;
|
||||
//m_collLen = gbstrlen(coll);
|
||||
m_collnum = collnum;
|
||||
// these are storage for the requester
|
||||
m_docIds = docIds;
|
||||
m_clusterLevels = clusterLevels;
|
||||
@ -186,7 +189,7 @@ bool Msg51::sendRequests ( long k ) {
|
||||
key_t ckey = (key_t)m_docIds[m_nexti];
|
||||
bool found = false;
|
||||
if ( c )
|
||||
found = c->getRecord ( m_coll ,
|
||||
found = c->getRecord ( m_collnum ,
|
||||
ckey , // cache key
|
||||
&crecPtr , // pointer to it
|
||||
&crecSize ,
|
||||
@ -292,7 +295,7 @@ bool Msg51::sendRequest ( long i ) {
|
||||
m_maxCacheAge ,
|
||||
m_addToCache ,
|
||||
RDB_CLUSTERDB ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
&m_lists[i] ,
|
||||
(char *)&startKey ,
|
||||
(char *)&endKey ,
|
||||
@ -437,7 +440,7 @@ void Msg51::gotClusterRec ( Msg0 *msg0 ) { //, RdbList *list ) {
|
||||
// . add the record to our quick cache as a long long
|
||||
// . ignore any error
|
||||
if ( s_cacheInit )
|
||||
c->addRecord ( m_coll ,
|
||||
c->addRecord ( m_collnum ,
|
||||
(key_t)docId , // docid is key
|
||||
(char *)rec ,
|
||||
sizeof(key_t) , // recSize
|
||||
|
8
Msg51.h
8
Msg51.h
@ -108,7 +108,8 @@ class Msg51 {
|
||||
char *clusterLevels ,
|
||||
key_t *clusterRecs ,
|
||||
long numDocIds ,
|
||||
char *coll ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
long maxCacheAge ,
|
||||
bool addToCache ,
|
||||
void *state ,
|
||||
@ -169,8 +170,9 @@ class Msg51 {
|
||||
long m_firstNode;
|
||||
long m_nextNode;
|
||||
|
||||
char *m_coll;
|
||||
long m_collLen;
|
||||
//char *m_coll;
|
||||
//long m_collLen;
|
||||
collnum_t m_collnum;
|
||||
|
||||
// cache info
|
||||
long m_maxCacheAge;
|
||||
|
@ -187,7 +187,7 @@ bool Msg8b::getCatRec ( Url *url ,
|
||||
0 , // max cached age in seconds (60)
|
||||
false , // add net recv'd list to cache?
|
||||
RDB_CATDB, // specifies the rdb, 1 = tagdb
|
||||
"",//NULL,//m_coll ,
|
||||
0,//collnum"",//NULL,//m_coll ,
|
||||
//&m_list ,
|
||||
m_list ,
|
||||
startKey ,
|
||||
|
@ -172,7 +172,8 @@ bool Msg9b::addCatRecs ( char *urls ,
|
||||
// . use high priority (niceness of 0)
|
||||
// . i raised niceness from 0 to 1 so multicast does not use the
|
||||
// small UdpSlot::m_tmpBuf... might have a big file...
|
||||
return m_msg1.addList ( &m_list, RDB_CATDB, coll ,
|
||||
return m_msg1.addList ( &m_list, RDB_CATDB,
|
||||
(collnum_t)0 ,
|
||||
state , callback ,
|
||||
false , // force local?
|
||||
niceness ); // niceness
|
||||
|
@ -51,7 +51,7 @@ bool Msge0::getTagRecs ( char **urlPtrs ,
|
||||
// if skipOldLinks && urlFlags[i]&LF_OLDLINK, skip it
|
||||
bool skipOldLinks ,
|
||||
TagRec *baseTagRec ,
|
||||
char *coll ,
|
||||
collnum_t collnum,
|
||||
long niceness ,
|
||||
void *state ,
|
||||
void (*callback)(void *state) ) {
|
||||
@ -65,7 +65,7 @@ bool Msge0::getTagRecs ( char **urlPtrs ,
|
||||
m_numUrls = numUrls;
|
||||
m_skipOldLinks = skipOldLinks;
|
||||
m_baseTagRec = baseTagRec;
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;
|
||||
m_niceness = niceness;
|
||||
m_state = state;
|
||||
m_callback = callback;
|
||||
@ -240,7 +240,7 @@ bool Msge0::sendMsg8a ( long i ) {
|
||||
// subsite.
|
||||
if ( ! m->getTagRec ( &m_urls[i] ,
|
||||
NULL, // sites[i] ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
// if domain is banned, we will miss that here!
|
||||
true , // skip domain lookup?
|
||||
m_niceness ,
|
||||
|
4
Msge0.h
4
Msge0.h
@ -21,7 +21,7 @@ public:
|
||||
long numUrls ,
|
||||
bool skipOldLinks ,
|
||||
class TagRec *baseTagRec ,
|
||||
char *coll ,
|
||||
collnum_t collnum,
|
||||
long niceness ,
|
||||
void *state ,
|
||||
void (*callback)(void *state) ) ;
|
||||
@ -32,7 +32,7 @@ public:
|
||||
bool sendMsg8a ( long i );
|
||||
bool doneSending ( long i );
|
||||
|
||||
char *m_coll ;
|
||||
collnum_t m_collnum;
|
||||
long m_niceness ;
|
||||
|
||||
char **m_urlPtrs;
|
||||
|
18
Msge1.cpp
18
Msge1.cpp
@ -116,7 +116,7 @@ bool Msge1::getFirstIps ( TagRec **grv ,
|
||||
if ( ! launchRequests ( 0 ) ) return false;
|
||||
|
||||
// save it? might be a page parser
|
||||
//if ( ! strcmp(m_coll,"test") ) saveTestBuf();
|
||||
//if ( ! strcmp(m_coll,"qatest123") ) saveTestBuf();
|
||||
|
||||
// none blocked, we are done
|
||||
return true;
|
||||
@ -219,7 +219,7 @@ bool Msge1::launchRequests ( long starti ) {
|
||||
|
||||
/*
|
||||
// look up in our m_testBuf.
|
||||
if ( m_coll && ! strcmp(m_coll,"test") ) {
|
||||
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
|
||||
bool found = false;
|
||||
// do we got it?
|
||||
long quickIp ; bool status = getTestIp ( p , &quickIp, &found);
|
||||
@ -300,7 +300,7 @@ bool Msge1::sendMsgC ( long i , char *host , long hlen ) {
|
||||
|
||||
|
||||
// look up in our m_testBuf.
|
||||
if ( m_coll && ! strcmp(m_coll,"test") ) {
|
||||
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
|
||||
bool found = false;
|
||||
// shortcut
|
||||
//char *p = m_urlPtrs[n];
|
||||
@ -340,7 +340,7 @@ void gotMsgCWrapper ( void *state , long ip ) {
|
||||
if ( ! THIS->launchRequests(i) ) return;
|
||||
// . save it if we should. might be a page parser
|
||||
// . mdw i uncommented this when we cored all the time
|
||||
//if ( ! strcmp(THIS->m_coll,"test")) saveTestBuf();
|
||||
//if ( ! strcmp(THIS->m_coll,"qatest123")) saveTestBuf();
|
||||
// must be all done, call the callback
|
||||
THIS->m_callback ( THIS->m_state );
|
||||
}
|
||||
@ -364,7 +364,7 @@ bool Msge1::doneSending ( long i ) {
|
||||
// n, i, m_urls[i].getUrl() ,iptoa(ip));
|
||||
|
||||
// store it?
|
||||
if ( ! strcmp(m_coll,"test") ) {
|
||||
if ( ! strcmp(m_coll,"qatest123") ) {
|
||||
// get host
|
||||
long hlen = 0;
|
||||
char *host = getHostFast ( m_urlPtrs[n] , &hlen );
|
||||
@ -511,9 +511,9 @@ static char *s_last = NULL ;
|
||||
static long s_lastLen = 0 ;
|
||||
static HashTableX s_ht;
|
||||
|
||||
// . only call this if the collection is "test"
|
||||
// . only call this if the collection is "qatest123"
|
||||
// . we try to get the ip by accessing the "./test/ips.txt" file
|
||||
// . we also ad ips we lookup to that file in the collection is "test"
|
||||
// . we also add ips we look up to that file if the collection is "qatest123"
|
||||
// . returns false and sets g_errno on error, true on success
|
||||
bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
|
||||
char *testDir ) {
|
||||
@ -533,8 +533,8 @@ bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
|
||||
// assume not found
|
||||
*found = false;
|
||||
|
||||
// . if we are the "test" collection, check for "./test/ips.txt" file
|
||||
// that gives us the ips of the given urls.
|
||||
// . if we are the "qatest123" collection, check for "./test/ips.txt"
|
||||
// file that gives us the ips of the given urls.
|
||||
// . if we end up doing some lookups we should append to that file
|
||||
if ( ! s_testBuf || s_needsReload ) {
|
||||
// assume needs reload now
|
||||
|
419
PageAddUrl.cpp
419
PageAddUrl.cpp
@ -2,107 +2,64 @@
|
||||
|
||||
#include "Pages.h"
|
||||
#include "Collectiondb.h"
|
||||
#include "HashTable.h"
|
||||
#include "Msg4.h"
|
||||
#include "TuringTest.h"
|
||||
#include "AutoBan.h"
|
||||
//#include "CollectionRec.h"
|
||||
//#include "Links.h"
|
||||
#include "Users.h"
|
||||
#include "HashTableT.h"
|
||||
#include "Spider.h"
|
||||
#include "Parms.h"
|
||||
|
||||
static bool sendReply ( void *state , bool addUrlEnabled );
|
||||
static bool canSubmit (unsigned long h, long now, long maxUrlsPerIpDom);
|
||||
|
||||
static void addedStuff ( void *state );
|
||||
|
||||
void resetPageAddUrl ( ) ;
|
||||
|
||||
class State2 {
|
||||
public:
|
||||
Url m_url;
|
||||
char *m_buf;
|
||||
long m_bufLen;
|
||||
long m_bufMaxLen;
|
||||
};
|
||||
|
||||
class State1 {
|
||||
public:
|
||||
Msg4 m_msg4;
|
||||
TcpSocket *m_socket;
|
||||
bool m_isAdmin;
|
||||
char m_coll[MAX_COLL_LEN+1];
|
||||
bool m_goodAnswer;
|
||||
bool m_doTuringTest;
|
||||
long m_ufuLen;
|
||||
char m_ufu[MAX_URL_LEN];
|
||||
|
||||
HttpRequest m_hr;
|
||||
|
||||
long m_urlLen;
|
||||
char m_url[MAX_URL_LEN];
|
||||
|
||||
char m_username[MAX_USER_SIZE];
|
||||
bool m_strip;
|
||||
bool m_spiderLinks;
|
||||
bool m_forceRespider;
|
||||
// buf filled by the links coming from google, msn, yahoo, etc
|
||||
State2 m_state2[5]; // gb, goog, yahoo, msn, ask
|
||||
|
||||
long m_numSent;
|
||||
long m_numReceived;
|
||||
//long m_raw;
|
||||
SpiderRequest m_sreq;
|
||||
};
|
||||
|
||||
// only allow up to 1 Msg10's to be in progress at a time
|
||||
static bool s_inprogress = false;
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
// . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp
|
||||
bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
|
||||
// . get fields from cgi field of the requested url
|
||||
// . get the search query
|
||||
long urlLen = 0;
|
||||
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
|
||||
// also try "url" and "urls"
|
||||
if ( ! url ) url = r->getString ( "url" , &urlLen , NULL );
|
||||
if ( ! url ) url = r->getString ( "urls" , &urlLen , NULL );
|
||||
|
||||
// see if they provided a url of a file of urls if they did not
|
||||
// provide a url to add directly
|
||||
//bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
bool isAdmin = r->getIsLocal();
|
||||
long ufuLen = 0;
|
||||
char *ufu = NULL;
|
||||
if ( isAdmin )
|
||||
// get the url of a file of urls (ufu)
|
||||
ufu = r->getString ( "ufu" , &ufuLen , NULL );
|
||||
|
||||
// can't be too long, that's obnoxious
|
||||
if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
|
||||
if ( urlLen > MAX_URL_LEN ) {
|
||||
g_errno = EBUFTOOSMALL;
|
||||
g_msg = " (error: url too long)";
|
||||
return g_httpServer.sendErrorReply(s,500,"url too long");
|
||||
}
|
||||
// get the collection
|
||||
long collLen = 0;
|
||||
char *coll = r->getString("c",&collLen);
|
||||
if ( ! coll || ! coll[0] ) {
|
||||
//coll = g_conf.m_defaultColl;
|
||||
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
|
||||
collLen = gbstrlen(coll);
|
||||
}
|
||||
|
||||
// get collection rec
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r ); // coll );
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r );
|
||||
// bitch if no collection rec found
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
g_msg = " (error: no collection)";
|
||||
return g_httpServer.sendErrorReply(s,500,"no coll rec");
|
||||
}
|
||||
// . make sure the ip is not banned
|
||||
// . we may also have an exclusive list of IPs for private collections
|
||||
if ( ! cr->hasSearchPermission ( s ) ) {
|
||||
g_errno = ENOPERM;
|
||||
g_msg = " (error: permission denied)";
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
}
|
||||
|
||||
|
||||
// make a new state
|
||||
State1 *st1 ;
|
||||
try { st1 = new (State1); }
|
||||
@ -112,9 +69,11 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
sizeof(State1),mstrerror(g_errno));
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
|
||||
mnew ( st1 , sizeof(State1) , "PageAddUrl" );
|
||||
// save socket and isAdmin
|
||||
|
||||
|
||||
st1->m_socket = s;
|
||||
st1->m_isAdmin = isAdmin;
|
||||
|
||||
st1->m_hr.copy ( r );
|
||||
|
||||
// assume no url buf yet, set below
|
||||
//st1->m_ubuf = NULL;
|
||||
@ -126,7 +85,9 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
if ( url ) {
|
||||
// normalize and add www. if it needs it
|
||||
Url uu;
|
||||
uu.set ( url , gbstrlen(url) , true );
|
||||
// do not convert xyz.com to www.xyz.com because sometimes
|
||||
// people want xyz.com exactly
|
||||
uu.set ( url , gbstrlen(url) , false ); // true );
|
||||
// remove >'s i guess and store in st1->m_url[] buffer
|
||||
st1->m_urlLen=cleanInput ( st1->m_url,
|
||||
MAX_URL_LEN,
|
||||
@ -138,63 +99,11 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
//st1->m_ubufAlloc = NULL; // do not free it!
|
||||
}
|
||||
|
||||
// save the "ufu" (url of file of urls)
|
||||
st1->m_ufu[0] = '\0';
|
||||
st1->m_ufuLen = ufuLen;
|
||||
memcpy ( st1->m_ufu , ufu , ufuLen );
|
||||
st1->m_ufu[ufuLen] = '\0';
|
||||
|
||||
st1->m_doTuringTest = cr->m_doTuringTest;
|
||||
char *username = g_users.getUsername(r);
|
||||
if(username) strcpy(st1->m_username,username);
|
||||
//st1->m_user = g_pages.getUserType ( s , r );
|
||||
st1->m_spiderLinks = true;
|
||||
st1->m_strip = true;
|
||||
//st1->m_raw = r->getLong("raw",0);
|
||||
|
||||
// init state2
|
||||
for ( long i = 0; i < 5; i++ ){
|
||||
st1->m_state2[i].m_buf = NULL;
|
||||
st1->m_state2[i].m_bufLen = 0;
|
||||
st1->m_state2[i].m_bufMaxLen = 0;
|
||||
}
|
||||
|
||||
// save the collection name in the State1 class
|
||||
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
|
||||
strncpy ( st1->m_coll , coll , collLen );
|
||||
st1->m_coll [ collLen ] = '\0';
|
||||
|
||||
// assume they answered turing test correctly
|
||||
st1->m_goodAnswer = true;
|
||||
// if addurl is turned off, just print "disabled" msg
|
||||
if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
|
||||
// can also be turned off in the collection rec
|
||||
if ( ! cr->m_addUrlEnabled ) return sendReply ( st1 , false );
|
||||
// or if in read-only mode
|
||||
if ( g_conf.m_readOnlyMode ) return sendReply ( st1 , false );
|
||||
// cannot add if another Msg10 from here is still in progress
|
||||
if ( s_inprogress ) return sendReply ( st1 , true );
|
||||
// use now as the spiderTime
|
||||
|
||||
// get ip of submitter
|
||||
//unsigned long h = ipdom ( s->m_ip );
|
||||
// . use top 2 bytes now, some isps have large blocks
|
||||
// . if this causes problems, then they can do pay for inclusion
|
||||
unsigned long h = iptop ( s->m_ip );
|
||||
long codeLen;
|
||||
char* code = r->getString("code", &codeLen);
|
||||
if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
|
||||
long uipLen = 0;
|
||||
char* uip = r->getString("uip",&uipLen);
|
||||
long hip = 0;
|
||||
//use the uip when we have a raw query to test if
|
||||
//we can submit
|
||||
if(uip) {
|
||||
hip = atoip(uip, uipLen);
|
||||
h = iptop( hip );
|
||||
}
|
||||
}
|
||||
|
||||
if ( g_conf.m_readOnlyMode ) return sendReply ( st1 , false );
|
||||
|
||||
st1->m_strip = r->getLong("strip",0);
|
||||
// Remember, for cgi, if the box is not checked, then it is not
|
||||
@ -208,36 +117,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
// . use to manually update spider times for a url
|
||||
// . however, will not remove old scheduled spider times
|
||||
// . mdw: made force on the default
|
||||
st1->m_forceRespider = r->getLong("force",1); // 0);
|
||||
|
||||
long now = getTimeGlobal();
|
||||
|
||||
// . allow 1 submit every 1 hour
|
||||
// . restrict by submitter domain ip
|
||||
if ( ! st1->m_isAdmin &&
|
||||
! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
|
||||
// return error page
|
||||
g_errno = ETOOEARLY;
|
||||
return sendReply ( st1 , true );
|
||||
}
|
||||
|
||||
|
||||
//st1->m_query = r->getString( "qts", &st1->m_queryLen );
|
||||
|
||||
|
||||
// check it, if turing test is enabled for this collection
|
||||
if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
|
||||
! g_turingTest.isHuman(r) ) {
|
||||
// log note so we know it didn't make it
|
||||
g_msg = " (error: bad answer)";
|
||||
//log("PageAddUrl:: addurl failed for %s : bad answer",
|
||||
// iptoa(s->m_ip));
|
||||
st1->m_goodAnswer = false;
|
||||
return sendReply ( st1 , true /*addUrl enabled?*/ );
|
||||
}
|
||||
|
||||
//if ( st1->m_queryLen > 0 )
|
||||
// return getPages( st1 );
|
||||
//st1->m_forceRespider = r->getLong("force",1); // 0);
|
||||
|
||||
// if no url given, just print a blank page
|
||||
if ( ! url ) return sendReply ( st1 , true );
|
||||
@ -262,7 +142,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
// now add that to spiderdb using msg4
|
||||
if ( ! m->addMetaList ( (char *)sreq ,
|
||||
sreq->getRecSize() ,
|
||||
coll ,
|
||||
cr->m_coll ,
|
||||
st1 , // state
|
||||
addedStuff ,
|
||||
MAX_NICENESS ,
|
||||
@ -283,7 +163,7 @@ void addedStuff ( void *state ) {
|
||||
|
||||
bool sendReply ( void *state , bool addUrlEnabled ) {
|
||||
// allow others to add now
|
||||
s_inprogress = false;
|
||||
//s_inprogress = false;
|
||||
// get the state properly
|
||||
State1 *st1 = (State1 *) state;
|
||||
// in order to see what sites are being added log it, then we can
|
||||
@ -292,7 +172,6 @@ bool sendReply ( void *state , bool addUrlEnabled ) {
|
||||
log(LOG_INFO,"http: add url %s (%s)",st1->m_url ,mstrerror(g_errno));
|
||||
// extract info from state
|
||||
TcpSocket *s = st1->m_socket;
|
||||
bool isAdmin = st1->m_isAdmin;
|
||||
char *url = NULL;
|
||||
if ( st1->m_urlLen ) url = st1->m_url;
|
||||
// re-null it out if just http://
|
||||
@ -315,255 +194,49 @@ bool sendReply ( void *state , bool addUrlEnabled ) {
|
||||
|
||||
char tt [ 128 ];
|
||||
tt[0] = '\0';
|
||||
if ( st1->m_coll[0] != '\0' && ! isAdmin )
|
||||
sprintf ( tt , " for %s", st1->m_coll );
|
||||
// the bg colors and style
|
||||
g_pages.printColors (&sb);
|
||||
sb.safePrintf ( "<title>Gigablast Add a Url</title>"
|
||||
"<table><tr><td valign=bottom><a href=/>"
|
||||
//"<img width=200 length=25 border=0 src=/logo2.gif></a>"
|
||||
"<img width=210 height=25 border=0 src=/logo2.gif></a>"
|
||||
" </font></td><td><font size=+1>"
|
||||
"<b>Add Url%s</td></tr></table>" , tt );
|
||||
|
||||
g_pages.printAdminTop ( &sb , st1->m_socket , &st1->m_hr );
|
||||
|
||||
// watch out for NULLs
|
||||
if ( ! url ) url = "http://";
|
||||
// blank out url if adding a url of a file of urls
|
||||
// if ( st1->m_ufu ) url = "http://";
|
||||
|
||||
// if there was an error let them know
|
||||
char msg[MAX_URL_LEN + 1024];
|
||||
char *pm = "";
|
||||
if ( g_errno ) {
|
||||
if ( g_errno == ETOOEARLY ) {
|
||||
pm = "Error. 100 urls have "
|
||||
"already been submitted by "
|
||||
"this IP address for the last 24 hours. "
|
||||
"<a href=/addurlerror.html>Explanation</a>.";
|
||||
log("addurls: Failed for user at %s: "
|
||||
"quota breeched.", iptoa(s->m_ip));
|
||||
|
||||
//rb.safePrintf("Error. %li urls have "
|
||||
// "already been submitted by "
|
||||
// "this IP address for the "
|
||||
// "last 24 hours. ",
|
||||
// cr->m_maxAddUrlsPerIpDomPerDay);
|
||||
}
|
||||
else {
|
||||
sprintf ( msg ,"Error adding url(s): <b>%s[%i]</b>",
|
||||
mstrerror(g_errno) , g_errno);
|
||||
pm = msg;
|
||||
//rb.safePrintf("Error adding url(s): %s[%i]",
|
||||
// mstrerror(g_errno) , g_errno);
|
||||
}
|
||||
sprintf ( msg ,"Error adding url(s): <b>%s[%i]</b>",
|
||||
mstrerror(g_errno) , g_errno);
|
||||
pm = msg;
|
||||
//rb.safePrintf("Error adding url(s): %s[%i]",
|
||||
// mstrerror(g_errno) , g_errno);
|
||||
}
|
||||
else {
|
||||
if ( ! addUrlEnabled ) {//g_conf.m_addUrlEnabled )
|
||||
pm = "<font color=#ff0000>"
|
||||
"Sorry, this feature is temporarily disabled. "
|
||||
"Please try again later.</font>";
|
||||
if ( st1->m_urlLen )
|
||||
log("addurls: failed for user at %s: "
|
||||
"add url is disabled. "
|
||||
"Enable add url on the "
|
||||
"Master Controls page and "
|
||||
"on the Spider Controls page for "
|
||||
"this collection.",
|
||||
iptoa(s->m_ip));
|
||||
|
||||
//rb.safePrintf("Sorry, this feature is temporarily "
|
||||
// "disabled. Please try again later.");
|
||||
}
|
||||
else if ( s_inprogress ) {
|
||||
pm = "Add url busy. Try again later.";
|
||||
log("addurls: Failed for user at %s: "
|
||||
"busy adding another.", iptoa(s->m_ip));
|
||||
//rb.safePrintf("Add url busy. Try again later.");
|
||||
|
||||
}
|
||||
// did they fail the turing test?
|
||||
else if ( ! st1->m_goodAnswer ) {
|
||||
pm = "<font color=#ff0000>"
|
||||
"Oops, you did not enter the 4 large letters "
|
||||
"you see below. Please try again.</font>";
|
||||
//rb.safePrintf("could not add the url"
|
||||
// " because the turing test"
|
||||
// " is enabled.");
|
||||
|
||||
}
|
||||
if ( url && ! st1->m_ufu[0] && url[0] && printUrl ) {
|
||||
sprintf ( msg ,"<u>%s</u> added to spider "
|
||||
"queue "
|
||||
"successfully", url );
|
||||
//rb.safePrintf("%s added to spider "
|
||||
// "queue successfully", url );
|
||||
}
|
||||
else if ( st1->m_ufu[0] ) {
|
||||
sprintf ( msg ,"urls in <u>%s</u> "
|
||||
"added to spider queue "
|
||||
"successfully", st1->m_ufu );
|
||||
|
||||
//rb.safePrintf("urls in %s added to spider "
|
||||
// "queue successfully", url );
|
||||
|
||||
}
|
||||
else {
|
||||
sprintf(msg,"Add the url you want:");
|
||||
//rb.safePrintf("Add the url you want:");
|
||||
}
|
||||
|
||||
else if ( url && printUrl && url[0] ) {
|
||||
sprintf ( msg ,"<b><u>%s</u></b> added to spider "
|
||||
"queue "
|
||||
"successfully<br><br>", url );
|
||||
//rb.safePrintf("%s added to spider "
|
||||
// "queue successfully", url );
|
||||
pm = msg;
|
||||
url = "http://";
|
||||
//else
|
||||
// pm = "Don't forget to <a href=/gigaboost.html>"
|
||||
// "Gigaboost</a> your URL.";
|
||||
}
|
||||
|
||||
// TODO: show them a list of the urls they added
|
||||
// print the addUrl page in here with a status msg
|
||||
sb.safePrintf (
|
||||
"<br><br><br><center>"
|
||||
"<b>%s</b>" // the url msg
|
||||
"<br><br>"
|
||||
"<FORM method=get action=/addurl>"
|
||||
"<input type=text name=u value=\"%s\" size=50> "
|
||||
"<input type=submit value=\"add url\" border=0><br>",pm,url);
|
||||
// if we're coming from local ip print the collection box
|
||||
if ( isAdmin )
|
||||
sb.safePrintf (
|
||||
"\n"
|
||||
|
||||
"<br><b>or specify the url of a "
|
||||
"file of urls to add:</b>"
|
||||
"<br>\n"
|
||||
"<input type=text name=ufu size=50> "
|
||||
"<input type=submit value=\"add file\" border=0><br>"
|
||||
"<br>"
|
||||
|
||||
//"<br><b>or a query to scrape from major engines:</b>"
|
||||
//"<br>\n"
|
||||
// qts = query to scrape
|
||||
//"<input type=text name=qts size=49> "
|
||||
//"<input type=submit value=\"add query\" border=0><br>"
|
||||
//"<br>"
|
||||
|
||||
"<br><b>collection to add to:</b> "
|
||||
"<input type=text name=c size=20 value=\"%s\">"
|
||||
"<br><br>\n",
|
||||
st1->m_coll );
|
||||
// otherwise hide it
|
||||
else
|
||||
sb.safePrintf ( "<input type=hidden name=c value=\"%s\">" ,
|
||||
st1->m_coll );
|
||||
|
||||
|
||||
char *ss = "";
|
||||
if ( st1->m_strip ) ss =" checked";
|
||||
sb.safePrintf ("<br>"
|
||||
"<input type=checkbox name=strip value=1%s> "
|
||||
"strip sessionids<br>", ss );
|
||||
|
||||
sb.safePrintf("<br>\n");
|
||||
|
||||
//Adding spider links box
|
||||
char *sl = "";
|
||||
if ( st1->m_spiderLinks ) sl =" checked";
|
||||
sb.safePrintf ("<input type=checkbox name=spiderLinks value=1%s> "
|
||||
"spider (harvest) links from page<br><br>\n", sl );
|
||||
g_parms.printParmTable ( &sb , st1->m_socket , &st1->m_hr );
|
||||
|
||||
if ( ! s_inprogress && addUrlEnabled && st1->m_doTuringTest ) {
|
||||
g_turingTest.printTest(&sb);
|
||||
}
|
||||
|
||||
// . print the url box, etc...
|
||||
// . assume user is always forcing their url
|
||||
// sprintf ( p ,
|
||||
// "<br><br>"
|
||||
// "<input type=checkbox name=force value=1 checked> "
|
||||
// "force respider<br>" );
|
||||
//p += gbstrlen ( p );
|
||||
/*
|
||||
sprintf ( p ,
|
||||
"<br>"
|
||||
"<a href=/?redir="
|
||||
"http://www.searchengineguide.com/submit/gigablast.html>"
|
||||
"<b>Search Engine Marketing News</b></a><br>"
|
||||
"If you would like to stay up to date with the "
|
||||
"latest articles on using search engines to market "
|
||||
"your web site, we recommend subscribing to the "
|
||||
"Search Engine Marketing weekly newsletter. Once a "
|
||||
"week, a digest of articles from the top search "
|
||||
"engine marketing experts is delivered straight to "
|
||||
"your inbox for free.<br><br>");
|
||||
p += gbstrlen(p);
|
||||
*/
|
||||
// print the final tail
|
||||
g_pages.printTail ( &sb, st1->m_isAdmin ); // local?
|
||||
g_pages.printTail ( &sb, true ); // admin?
|
||||
// clear g_errno, if any, so our reply send goes through
|
||||
g_errno = 0;
|
||||
//bool raw = st1->m_raw;
|
||||
// free the buffer
|
||||
//if ( st1->m_ubufAlloc )
|
||||
// mfree ( st1->m_ubufAlloc , st1->m_ubufAllocSize,"pau");
|
||||
//if ( st1->m_metaList )
|
||||
// mfree ( st1->m_metaList , st1->m_metaListAllocSize,"pau");
|
||||
|
||||
// nuke state
|
||||
mdelete ( st1 , sizeof(State1) , "PageAddUrl" );
|
||||
delete (st1);
|
||||
// . send this page
|
||||
// . encapsulates in html header and tail
|
||||
// . make a Mime
|
||||
// . i thought we need -2 for cacheTime, but i guess not
|
||||
//rb.safePrintf("</status>\n");
|
||||
//if(raw) return g_httpServer.sendDynamicPage (s,
|
||||
// rb.getBufStart(),
|
||||
// rb.length(),
|
||||
// -1/*cachetime*/,
|
||||
// false, // POSTREply?
|
||||
// "text/xml"// content type
|
||||
// );
|
||||
|
||||
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
|
||||
return g_httpServer.sendDynamicPage (s,
|
||||
sb.getBufStart(),
|
||||
sb.length(),
|
||||
-1/*cachetime*/);
|
||||
-1 ); // cachetime
|
||||
}
|
||||
|
||||
|
||||
// we get like 100k submissions a day!!!
|
||||
static HashTable s_htable;
|
||||
static bool s_init = false;
|
||||
static long s_lastTime = 0;
|
||||
bool canSubmit ( unsigned long h , long now , long maxAddUrlsPerIpDomPerDay ) {
|
||||
// . sometimes no limit
|
||||
// . 0 means no limit because if they don't want any submission they
|
||||
// can just turn off add url and we want to avoid excess
|
||||
// troubleshooting for why a url can't be added
|
||||
if ( maxAddUrlsPerIpDomPerDay <= 0 ) return true;
|
||||
// init the table
|
||||
if ( ! s_init ) {
|
||||
s_htable.set ( 50000 );
|
||||
s_init = true;
|
||||
}
|
||||
// clean out table every 24 hours
|
||||
if ( now - s_lastTime > 24*60*60 ) {
|
||||
s_lastTime = now;
|
||||
s_htable.clear();
|
||||
}
|
||||
// . if table almost full clean out ALL slots
|
||||
// . TODO: just clean out oldest slots
|
||||
if ( s_htable.getNumSlotsUsed() > 47000 ) s_htable.clear ();
|
||||
// . how many times has this IP domain submitted?
|
||||
// . allow 10 times per day
|
||||
long n = s_htable.getValue ( h );
|
||||
// if over 24hr limit then bail
|
||||
if ( n >= maxAddUrlsPerIpDomPerDay ) return false;
|
||||
// otherwise, inc it
|
||||
n++;
|
||||
// add to table, will replace old values
|
||||
s_htable.addKey ( h , n );
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Resets s_htable, the file-static table canSubmit() uses to count
// submissions per ip hash.
// NOTE(review): s_init is left untouched here, so canSubmit() will not call
// s_htable.set() again after a reset -- confirm that is intended.
void resetPageAddUrl ( ) {
|
||||
s_htable.reset();
|
||||
}
|
||||
|
||||
|
876
PageBasic.cpp
Normal file
876
PageBasic.cpp
Normal file
@ -0,0 +1,876 @@
|
||||
#include "SafeBuf.h"
|
||||
#include "HttpRequest.h"
|
||||
#include "SearchInput.h"
|
||||
#include "Pages.h"
|
||||
#include "Parms.h"
|
||||
#include "Spider.h"
|
||||
|
||||
//bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) ;
|
||||
|
||||
///////////
|
||||
//
|
||||
// main > Basic > Settings
|
||||
//
|
||||
///////////
|
||||
/*
|
||||
bool sendPageBasicSettings ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
|
||||
char buf [ 128000 ];
|
||||
SafeBuf sb(buf,128000);
|
||||
|
||||
// true = usedefault coll?
|
||||
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
|
||||
if ( ! cr ) {
|
||||
g_httpServer.sendErrorReply(socket,500,"invalid collection");
|
||||
return true;
|
||||
}
|
||||
|
||||
// process any incoming request
|
||||
handleSettingsRequest ( socket , hr );
|
||||
|
||||
// . print standard header
|
||||
// . this prints the <form tag as well
|
||||
g_pages.printAdminTop ( &sb , socket , hr );
|
||||
|
||||
|
||||
g_parms.printParms ( &sb , socket , hr );
|
||||
|
||||
|
||||
printSitePatternExamples ( &sb , hr );
|
||||
|
||||
// wrap up the form, print a submit button
|
||||
g_pages.printAdminBottom ( &sb );
|
||||
|
||||
|
||||
return g_httpServer.sendDynamicPage ( socket,
|
||||
sb.getBufStart() ,
|
||||
sb.length() ,
|
||||
-1 ,
|
||||
false,//POSTReply ,
|
||||
NULL , // contType
|
||||
-1 , // httpstatus
|
||||
NULL,//cookie ,
|
||||
NULL );// charset
|
||||
}
|
||||
*/
|
||||
|
||||
// One url-pattern line of the site list. updateSiteListTables() stores these
// by value in SpiderColl::m_siteListDomTable keyed by the 32-bit domain hash
// of the line's url, and getMatchingUrlPattern() reads them back.
class PatternData {
|
||||
public:
|
||||
// hash of the subdomain or domain for this line in sitelist
// (set from Url::getHostHash32(); compared against both the request's
// domain hash and host hash when matching)
|
||||
long m_thingHash32;
|
||||
// ptr to the line in CollectionRec::m_siteListBuf
// (set to the first char of the line in the list text handed to
// updateSiteListTables(); not a copy, so that text must stay allocated)
|
||||
char *m_patternStr;
|
||||
// offset of the url path in the pattern, 0 means none
// (measured from m_patternStr and includes the leading '/')
|
||||
short m_pathOff;
|
||||
// number of path bytes to compare, from the leading '/' to the end of the
// whitespace-trimmed line. only assigned when a path was found.
short m_pathLen;
|
||||
};
|
||||
|
||||
|
||||
// . Collectiondb.cpp calls this when any parm flagged with
|
||||
// PF_REBUILDURLFILTERS is updated
|
||||
// . this returns false if it blocks
|
||||
// . returns true and sets g_errno on error
|
||||
// . uses msg4 to add seeds to spiderdb if necessary
|
||||
// . only adds seeds for the shard we are on iff we are responsible for
|
||||
// the fake firstip!!!
|
||||
// . rebuilds sc->m_siteListDomTable and the positive/negative "contains:"
//   substring buffers from the newline-separated text in "siteListArg"
// . collnum: collection whose CollectionRec and SpiderColl are used
// . addSeeds: if true, every url line that is not a "site:" or "contains:"
//   directive, is assigned to this host, and whose line hash is not among
//   the lines of cr->m_siteListBuf is appended as a SpiderRequest and the
//   lot is handed to msg4 for RDB_SPIDERDB
// . siteListArg: NUL-terminated site list text. NULL is treated as empty.
//   PatternData entries keep pointers into it, so it must outlive the table.
// . returns false only if the final msg4 addMetaList() call returns false,
//   every other path (including failed table/buffer operations) returns true
bool updateSiteListTables ( collnum_t collnum ,
			    bool addSeeds ,
			    char *siteListArg ) {

	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	if ( ! cr ) return true;

	// this might make a new spidercoll...
	SpiderColl *sc = g_spiderCache.getSpiderColl ( cr->m_collnum );
	// NOTE(review): assumes getSpiderColl() can return NULL on failure,
	// confirm against Spider.cpp. bail rather than dereference it below.
	if ( ! sc ) return true;

	// sanity. if in use we should not even be here
	if ( sc->m_msg4x.m_inUse ) {
		log("basic: trying to update site list while previous "
		    "update still outstanding.");
		g_errno = EBADENGINEER;
		return true;
	}

	// hash each line of the current sitelist so we don't add
	// dup requests into spiderdb i guess...
	HashTableX dedup;
	if ( ! dedup.set ( 4,0,1024,NULL,0,false,0,"sldt") ) return true;
	// this is a safebuf PARM in Parms.cpp now HOWEVER, not really
	// because we set it here from a call to CommandUpdateSiteList()
	// because it requires all this computational crap.
	char *op = cr->m_siteListBuf.getBufStart();
	// an empty safebuf may hand back NULL, so check the ptr too
	while ( op && *op ) {
		char *lineStart = op;
		// skip to end of line marker
		for ( ; *op && *op != '\n' ; op++ ) ;
		// trim trailing whitespace exactly like the scan of
		// "siteListArg" below does, otherwise a line ending in
		// spaces or a '\r' hashes differently and never dedups
		char *lineEnd = op;
		for ( ; lineEnd > lineStart && is_wspace_a(lineEnd[-1]) ;
		      lineEnd-- );
		long lineHash32 = hash32 ( lineStart , lineEnd - lineStart );
		// for deduping
		if ( ! dedup.addKey ( &lineHash32 ) ) return true;
		// only step over a real '\n'. stepping over the NUL of a
		// final unterminated line would read past the buffer.
		if ( *op == '\n' ) op++;
	}

	// get the old sitelist Domain Hash to PatternData mapping table
	// which tells us what domains, subdomains or paths we can or
	// can not spider...
	HashTableX *dt = &sc->m_siteListDomTable;

	// reset it
	if ( ! dt->set ( 4 ,
			 sizeof(PatternData),
			 1024 ,
			 NULL ,
			 0 ,
			 true , // allow dup keys?
			 0 , // niceness - at least for now
			 "sldt" ) )
		return true;

	// clear out the old "contains:" substrings
	sc->m_posSubstringBuf.purge();
	sc->m_negSubstringBuf.purge();

	// reset flags
	sc->m_siteListHasNegatives = false;
	sc->m_siteListIsEmpty = true;

	// use this so it will be free automatically when msg4 completes!
	SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;

	// scan the list
	char *pn = siteListArg;

	// completely empty?
	if ( ! pn ) return true;

	long lineNum = 1;
	long added = 0;
	Url u;

	for ( ; *pn ; lineNum++ ) {

		// start of this line
		char *s = pn;
		// skip to end of line marker
		for ( ; *pn && *pn != '\n' ; pn++ ) ;

		char *start = s;

		// back up over spaces in case the line ended in spaces
		char *pe = pn;
		for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );

		// advance over '\n' for next line
		if ( *pn && *pn == '\n' ) pn++;

		// make hash of the (trimmed) line
		long h32 = hash32 ( s , pe - s );

		bool seedMe = true;
		bool isUrl = true;
		bool isNeg = false;
		bool isFilter = true;

	innerLoop:
		// skip ALL spaces at start of line or after a directive,
		// but never run past the trimmed end of the line
		while ( s < pe && *s == ' ' ) s++;

		// comment?
		if ( *s == '#' ) continue;

		// empty line?
		if ( *s == '\n' ) continue;

		if ( *s == '-' ) {
			sc->m_siteListHasNegatives = true;
			isNeg = true;
			s++;
		}

		// these will be manual adds and should pass url filters
		// because they have the "ismanual" directive override
		if ( strncmp(s,"seed:",5) == 0 ) {
			s += 5;
			isFilter = false;
			goto innerLoop;
		}

		if ( strncmp(s,"site:",5) == 0 ) {
			s += 5;
			seedMe = false;
			goto innerLoop;
		}

		if ( strncmp(s,"contains:",9) == 0 ) {
			s += 9;
			seedMe = false;
			isUrl = false;
			goto innerLoop;
		}

		long slen = pe - s;

		// empty line?
		if ( slen <= 0 )
			continue;

		if ( ! isUrl ) {
			// "contains:" substrings go into NUL-separated
			// string buffers, negatives and positives apart
			SafeBuf *subBuf = &sc->m_posSubstringBuf;
			if ( isNeg ) subBuf = &sc->m_negSubstringBuf;
			if ( ! subBuf->safeMemcpy(s,slen) )
				return true;
			if ( ! subBuf->pushChar('\0') )
				return true;
			continue;
		}

		u.set ( s , slen );

		// error? skip it then...
		if ( u.getHostLen() <= 0 ) {
			log("basic: error on line #%li in sitelist",lineNum);
			continue;
		}

		// is fake ip assigned to us?
		long firstIp = getFakeIpForUrl2 ( &u );

		if ( ! isAssignedToUs( firstIp ) ) continue;

		// see if in existing table for existing site list
		if ( addSeeds &&
		     // a "site:" directive means no seeding
		     // a "contains:" directive means no seeding
		     seedMe &&
		     ! dedup.isInTable ( &h32 ) ) {
			// make spider request
			SpiderRequest sreq;
			sreq.setFromAddUrl ( u.getUrl() );
			if (
			    // . add this url to spiderdb as a spiderrequest
			    // . calling msg4 will be the last thing we do
			    !spiderReqBuf->safeMemcpy(&sreq,sreq.getRecSize()))
				return true;
			// count it
			added++;
		}

		// if it is a "seed: xyz.com" thing it is seed only
		// do not use it for a filter rule
		if ( ! isFilter ) continue;

		// make the data node used for filtering urls during spidering
		PatternData pd;
		// hash of the subdomain or domain for this line in sitelist
		pd.m_thingHash32 = u.getHostHash32();
		// . ptr to the start of this line in the list we were given
		// . includes any leading '-' or directive
		pd.m_patternStr = start;
		// offset of the url path in the pattern, 0 means none.
		// zero the length too so we never store garbage in the table
		pd.m_pathOff = 0;
		pd.m_pathLen = 0;
		// scan url pattern, it should start at "s"
		char *x = s;
		// go all the way to the end, checking the bound first
		for ( ; x < pe && *x ; x++ ) {
			// skip ://
			if ( x[0] == ':' && x[1] =='/' && x[2] == '/' ) {
				x += 2;
				continue;
			}
			// stop if we hit another /, that is path start
			if ( x[0] != '/' ) continue;
			x++;
			// empty path besides the /?
			if ( x >= pe ) break;
			// sanity: Url must agree there is a real path.
			// deliberate core dump if it does not.
			if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
			// calc length from "start" of line so we can
			// jump to the path quickly for compares. inc "/"
			pd.m_pathOff = (x-1) - start;
			pd.m_pathLen = pe - (x-1);
			break;
		}

		// add to new dt
		long domHash32 = u.getDomainHash32();
		if ( ! dt->addKey ( &domHash32 , &pd ) )
			return true;

		// we have some patterns in there
		sc->m_siteListIsEmpty = false;
	}

	// go back to a high niceness
	dt->m_niceness = MAX_NICENESS;

	if ( ! addSeeds ) return true;

	log("spider: adding %li seed urls",added);

	// use spidercoll to contain this msg4 but if in use it
	// won't be able to be deleted until it comes back..
	if ( ! sc->m_msg4x.addMetaList ( spiderReqBuf ,
					 sc->m_collnum ,
					 // no need for callback since m_msg4x
					 // should set msg4::m_inUse to false
					 // when it comes back
					 NULL , // state
					 NULL , // callback
					 MAX_NICENESS ,
					 RDB_SPIDERDB
					 ) )
		return false;

	return true;
}
|
||||
|
||||
// . Spider.cpp calls this to see if a url it wants to spider is
|
||||
// in our "site list"
|
||||
// . we should return the row of the FIRST match really
|
||||
// . the url patterns all contain a domain now, so this can use the domain
|
||||
// hash to speed things up
|
||||
// . return ptr to the start of the line in case it has "tag:" i guess
|
||||
// . looks up the site list entry, if any, that lets "sreq" be spidered
// . returns NULL when the site list is flagged empty, when the url contains
//   one of the negative "contains:" substrings, or when nothing matches
// . otherwise returns the PatternData::m_patternStr of the first matching
//   domain-table entry, or else the first positive "contains:" substring
//   found in the url
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {

	// nothing but comments or blank lines in the site list
	if ( sc->m_siteListIsEmpty )
		return NULL;

	// negative "contains:" substrings are stored back to back, each
	// NUL terminated. any hit vetoes the url outright.
	char *neg    = sc->m_negSubstringBuf.getBufStart();
	char *negEnd = neg + sc->m_negSubstringBuf.getLength();
	while ( neg && neg < negEnd ) {
		if ( strstr ( sreq->m_url , neg ) ) return NULL;
		// on to the next substring
		neg += strlen(neg) + 1;
	}

	// domain hash -> PatternData table
	HashTableX *domTable = &sc->m_siteListDomTable;

	CollectionRec *cr = sc->m_cr;

	// table not built yet? build it from the collection's site list,
	// without adding any seeds
	if ( domTable->getNumSlotsUsed() == 0 && cr ) {
		updateSiteListTables ( sc->m_collnum ,
				       false , // add seeds?
				       cr->m_siteListBuf.getBufStart() );
	}

	// still nothing in it, so no url pattern can match
	if ( domTable->getNumSlotsUsed() == 0 )
		return NULL;

	// fetched lazily, only if some pattern carries a path
	char *urlPath = NULL;

	// walk every pattern filed under this url's domain hash and hand
	// back the first one that fits
	long slot = domTable->getSlot ( &sreq->m_domHash32 );
	while ( slot >= 0 ) {
		PatternData *pd =
			(PatternData *)domTable->getValueFromSlot ( slot );
		// a pattern with a path must prefix-match the url's path
		bool pathOk = true;
		if ( pd->m_pathOff ) {
			if ( ! urlPath ) urlPath = sreq->getUrlPath();
			pathOk = ( strncmp ( urlPath ,
					     pd->m_patternStr + pd->m_pathOff,
					     pd->m_pathLen ) == 0 );
		}
		// the line named either the bare domain or this exact host
		if ( pathOk &&
		     ( pd->m_thingHash32 == sreq->m_domHash32 ||
		       pd->m_thingHash32 == sreq->m_hostHash32 ) )
			return pd->m_patternStr;
		slot = domTable->getNextSlot ( slot , &sreq->m_domHash32 );
	}

	// lastly try the positive "contains:" substrings, same layout as
	// the negative ones. the first one found in the url is returned.
	char *pos    = sc->m_posSubstringBuf.getBufStart();
	char *posEnd = pos + sc->m_posSubstringBuf.length();
	while ( pos && pos < posEnd ) {
		if ( strstr ( sreq->m_url , pos ) ) return pos;
		// on to the next substring
		pos += strlen(pos) + 1;
	}

	return NULL;
}
|
||||
|
||||
// Append the "Site List Examples" help table to sb.
//
// Prints an anchor named "examples" followed by an HTML table in which each
// row pairs a sample site-list line with a description of what it does.
//
//   sb - buffer the HTML is appended to
//   hr - request, used only to look up the collection record
//
// Returns true in every case. If no collection record can be found for the
// request, nothing is printed.
bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {

	// true = useDefault?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	// cr is only needed by the disabled code below, but keep the check so
	// that we still print nothing when there is no collection
	if ( ! cr ) return true;

	/*
	// it is a safebuf parm
	char *siteList = cr->m_siteListBuf.getBufStart();
	if ( ! siteList ) siteList = "";

	SafeBuf msgBuf;
	char *status = "";
	long max = 1000000;
	if ( cr->m_siteListBuf.length() > max ) {
		msgBuf.safePrintf( "<font color=red><b>"
				   "Site list is over %li bytes large, "
				   "too many to "
				   "display on this web page. Please use the "
				   "file upload feature only for now."
				   "</b></font>"
				   , max );
		status = " disabled";
	}
	*/


	/*
	sb->safePrintf(
		       "On the command like you can issue a command like "

		       "<i>"
		       "gb addurls &lt; fileofurls.txt"
		       "</i> or "

		       "<i>"
		       "gb addfile &lt; *.html"
		       "</i> or "

		       "<i>"
		       "gb injecturls &lt; fileofurls.txt"
		       "</i> or "

		       "<i>"
		       "gb injectfile &lt; *.html"
		       "</i> or "

		       "to schedule downloads or inject content directly "
		       "into Gigablast."

		       "</td><td>"

		       "<input "
		       "size=20 "
		       "type=file "
		       "name=urls>"
		       "</td></tr>"

		       );
	*/

	// example table header. the header cell is closed properly here:
	// <center>, <td> and <tr> each get their own end tag.
	sb->safePrintf ( "<a name=examples></a>"
			 "<table %s>"
			 "<tr class=hdrow><td colspan=2>"
			 "<center><b>Site List Examples</b></center></td></tr>"
			 //"<tr bgcolor=#%s>"
			 //"<td>"
			 ,TABLE_STYLE );//, DARK_BLUE);

	// one row per example. this format string takes no arguments, so it
	// must not contain a bare '%'.
	sb->safePrintf(
		       //"*"
		       //"</td>"
		       //"<td>Spider all urls encountered. If you just submit "
		       //"this by itself, then Gigablast will initiate spidering "
		       //"automatically at dmoz.org, an internet "
		       //"directory of good sites.</td>"
		       //"</tr>"

		       "<tr>"
		       "<td>goodstuff.com</td>"
		       "<td>"
		       "Spider the url <i>goodstuff.com/</i> and spider "
		       "any links we harvest that have the domain "
		       "<i>goodstuff.com</i>"
		       "</td>"
		       "</tr>"

		       // protocol and subdomain match
		       "<tr>"
		       "<td>http://www.goodstuff.com/</td>"
		       "<td>"
		       "Spider the url "
		       "<i>http://www.goodstuff.com/</i> and spider "
		       "any links we harvest that start with "
		       "<i>http://www.goodstuff.com/</i>"
		       "</td>"
		       "</tr>"

		       "<tr>"
		       "<td>seed:www.goodstuff.com/myurl.html</td>"
		       "<td>"
		       "Spider the url <i>www.goodstuff.com/myurl.html</i>. "
		       "Add any outlinks we find into the "
		       "spider queue, but those outlinks will only be "
		       "spidered if they "
		       "match ANOTHER line in this site list."
		       "</td>"
		       "</tr>"


		       // protocol and subdomain match
		       "<tr>"
		       "<td>site:http://www.goodstuff.com/</td>"
		       "<td>"
		       "Allow any urls starting with "
		       "<i>http://www.goodstuff.com/</i> to be spidered "
		       "if encountered."
		       "</td>"
		       "</tr>"

		       // subdomain match
		       "<tr>"
		       "<td>site:www.goodstuff.com</td>"
		       "<td>"
		       "Allow any urls starting with "
		       "<i>www.goodstuff.com/</i> to be spidered "
		       "if encountered."
		       "</td>"
		       "</tr>"

		       // negative subdomain match
		       "<tr>"
		       "<td>-site:bad.goodstuff.com</td>"
		       "<td>"
		       "Do not spider any urls starting with "
		       "<i>bad.goodstuff.com/</i> "
		       "if encountered."
		       "</td>"
		       "</tr>"

		       // domain match
		       "<tr>"
		       "<td>site:goodstuff.com</td>"
		       "<td>"
		       "Allow any urls starting with "
		       "<i>goodstuff.com/</i> to be spidered "
		       "if encountered."
		       "</td>"
		       "</tr>"

		       // spider this subdir
		       "<tr>"
		       "<td><nobr>site:"
		       "http://www.goodstuff.com/goodir/anotherdir/</nobr></td>"
		       "<td>"
		       "Allow any urls starting with "
		       "<i>http://www.goodstuff.com/goodir/anotherdir/</i> "
		       "to be spidered "
		       "if encountered."
		       "</td>"
		       "</tr>"


		       // exact match

		       //"<tr>"
		       //"<td>exact:http://xyz.goodstuff.com/myurl.html</td>"
		       //"<td>"
		       //"Allow this specific url."
		       //"</td>"
		       //"</tr>"

		       /*
		       // local subdir match
		       "<tr>"
		       "<td>file://C/mydir/mysubdir/"
		       "<td>"
		       "Spider all files in the given subdirectory or lower. "
		       "</td>"
		       "</tr>"

		       "<tr>"
		       "<td>-file://C/mydir/mysubdir/baddir/"
		       "<td>"
		       "Do not spider files in this subdirectory."
		       "</td>"
		       "</tr>"
		       */

		       // connect to a device and index it as a stream
		       //"<tr>"
		       //"<td>stream:/dev/eth0"
		       //"<td>"
		       //"Connect to a device and index it as a stream. "
		       //"It will be treated like a single huge document for "
		       //"searching purposes with chunks being indexed in "
		       //"realtime. Or chunk it up into individual document "
		       //"chunks, but proximity term searching will have to "
		       //"be adjusted to compute query term distances "
		       //"inter-document."
		       //"</td>"
		       //"</tr>"

		       // positive substring match. the example pattern has to
		       // agree with the substring named in its description.
		       "<tr>"
		       "<td>contains:goodstuff</td>"
		       "<td>Spider any url containing <i>goodstuff</i>."
		       "</td>"
		       "</tr>"

		       // negative substring match
		       "<tr>"
		       "<td>-contains:badstuff</td>"
		       "<td>Do not spider any url containing <i>badstuff</i>."
		       "</td>"
		       "</tr>"

		       /*
		       "<tr>"
		       "<td>regexp:-pid=[0-9A-Z]+/</td>"
		       "<td>Url must match this regular expression. "
		       "Try to avoid using these if possible; they can slow "
		       "things down and are confusing to use."
		       "</td>"
		       "</tr>"


		       // tag match
		       "<tr><td>"
		       //"<td>tag:boots contains:boots<br>"
		       "<nobr>tag:boots site:www.westernfootwear."
		       "</nobr>com<br>"
		       "tag:boots site:www.cowboyshop.com<br>"
		       "tag:boots site:www.moreboots.com<br>"
		       "<nobr>tag:boots site:www.lotsoffootwear.com"
		       "</nobr><br>"
		       //"<td>t:boots -contains:www.cowboyshop.com/shoes/</td>"
		       "</td><td>"
		       "Advance users only. "
		       "Tag any urls matching these 4 url patterns "
		       "so we can use "
		       "the expression <i>tag:boots</i> in the "
		       "<a href=/scheduler>spider scheduler</a> and perhaps "
		       "give such urls higher spider priority."
		       "For more "
		       "precise spidering control over url subsets. "
		       "Preceed any pattern with the tagname followed by "
		       "space to tag it."
		       "</td>"
		       "</tr>"
		       */

		       "<tr>"
		       "<td># This line is a comment.</td>"
		       "<td>Empty lines and lines starting with # are "
		       "ignored."
		       "</td>"
		       "</tr>"

		       "</table>"
		       );

	return true;
}
|
||||
|
||||
|
||||
// from pagecrawlbot.cpp for printCrawlDetailsInJson()
|
||||
#include "PageCrawlBot.h"
|
||||
|
||||
///////////
|
||||
//
|
||||
// main > Basic > Status
|
||||
//
|
||||
///////////
|
||||
// Serve the "main > Basic > Status" page.
//
// The reply format is chosen by the "format" cgi parm ("html", "json" or
// "xml"); html is used when the parm is missing or unrecognized.
//
//   - json: the page body is whatever printCrawlDetailsInJson() prints.
//   - html: the admin page top followed by a table of crawl stats taken
//           from the collection's global crawl info.
//   - xml : nothing is printed by this function, so the body is empty.
//
//   socket - client socket the reply is sent on
//   hr     - the parsed http request
//
// Returns true after sending an error reply when the collection cannot be
// found; otherwise returns whatever g_httpServer.sendDynamicPage() returns.
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {

	char buf [ 128000 ];
	SafeBuf sb(buf,128000);
	sb.reset();

	// figure out the reply format, defaulting to html
	char *fs = hr->getString("format",NULL,NULL);
	char fmt = FORMAT_HTML;
	if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
	if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
	if ( fs && strcmp(fs,"xml" ) == 0 ) fmt = FORMAT_XML;

	// true = usedefault coll?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) {
		g_httpServer.sendErrorReply(socket,500,"invalid collection");
		return true;
	}

	// fmt is only ever assigned FORMAT_* values above, so it has to be
	// tested against FORMAT_JSON. FMT_JSON belongs to a different set of
	// constants and is not guaranteed to have the same value.
	if ( fmt == FORMAT_JSON ) {
		printCrawlDetailsInJson ( &sb , cr );
		return g_httpServer.sendDynamicPage (socket,
						     sb.getBufStart(),
						     sb.length(),
						     0); // cachetime
	}

	// print standard header
	if ( fmt == FORMAT_HTML )
		// this prints the <form tag as well
		g_pages.printAdminTop ( &sb , socket , hr );

	//
	// show stats
	//
	if ( fmt == FORMAT_HTML ) {

		// status code and human readable status message
		SafeBuf tmp;
		long crawlStatus = -1;
		getSpiderStatusMsg ( cr , &tmp , &crawlStatus );

		//sb.safePrintf(
		//	      "<form method=get action=/crawlbot>"
		//	      "%s"
		//	      , sb.getBufStart() // hidden input token/name/..
		//	      );

		// "has urls ready to spider", shown as Yes/No
		char *hurts = "No";
		if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
			hurts = "Yes";

		// NOTE(review): the outer <TABLE>/<TR>/<TD> and the inner
		// <table> opened here are not closed in this function.
		sb.safePrintf("<TABLE border=0>"
			      "<TR><TD valign=top>"

			      "<table border=0 cellpadding=5>"

			      "<tr>"
			      "<td><b>Crawl Status Code:</b></td>"
			      "<td>%li</td>"
			      "</tr>"

			      "<tr>"
			      "<td><b>Crawl Status Msg:</b></td>"
			      "<td>%s</td>"
			      "</tr>"

			      //"<tr>"
			      //"<td><b>Rounds Completed:</td>"
			      //"<td>%li</td>"
			      //"</tr>"

			      "<tr>"
			      "<td><b>Has Urls Ready to Spider:</b></td>"
			      "<td>%s</td>"
			      "</tr>"


			      // this will  have to be in crawlinfo too!
			      //"<tr>"
			      //"<td><b>pages indexed</b>"
			      //"<td>%lli</td>"
			      //"</tr>"

			      "<tr>"
			      "<td><b>URLs Harvested</b> "
			      "(may include dups)</td>"
			      "<td>%lli</td>"

			      "</tr>"

			      //"<tr>"
			      //"<td><b>URLs Examined</b></td>"
			      //"<td>%lli</td>"
			      //"</tr>"

			      "<tr>"
			      "<td><b>Page Crawl Attempts</b></td>"
			      "<td>%lli</td>"
			      "</tr>"

			      "<tr>"
			      "<td><b>Page Crawl Successes</b></td>"
			      "<td>%lli</td>"
			      "</tr>"
			      , crawlStatus
			      , tmp.getBufStart()
			      //, cr->m_spiderRoundNum
			      //, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
			      , hurts

			      , cr->m_globalCrawlInfo.m_urlsHarvested
			      //, cr->m_globalCrawlInfo.m_urlsConsidered

			      , cr->m_globalCrawlInfo.m_pageDownloadAttempts
			      , cr->m_globalCrawlInfo.m_pageDownloadSuccesses
			      );

	}

	//if ( fmt != FORMAT_JSON )
	//	// wrap up the form, print a submit button
	//	g_pages.printAdminBottom ( &sb );

	return g_httpServer.sendDynamicPage (socket,
					     sb.getBufStart(),
					     sb.length(),
					     0); // cachetime
}
|
||||
|
@ -28,7 +28,7 @@ public:
|
||||
// . sets g_errno on error
|
||||
bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
// are we the admin?
|
||||
bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
//bool isAdmin = g_collectiondb.hasPermission ( r , s );
|
||||
// get the collection record
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r );
|
||||
if ( ! cr ) {
|
||||
@ -38,16 +38,18 @@ bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
return g_httpServer.sendErrorReply ( s , 500 ,
|
||||
"collection does not exist");
|
||||
}
|
||||
/*
|
||||
bool isAssassin = cr->isAssassin ( s->m_ip );
|
||||
if ( isAdmin ) isAssassin = true;
|
||||
// bail if permission denied
|
||||
if ( ! isAssassin && ! cr->hasPermission ( r , s ) ) {
|
||||
log("admin: Bad collection name or password. Could not add "
|
||||
"sites to tagdb. Permission denied.");
|
||||
return sendPageLogin ( s , r ,
|
||||
return sendPagexxxx ( s , r ,
|
||||
"Collection name or "
|
||||
"password is incorrect");
|
||||
}
|
||||
*/
|
||||
// get the collection
|
||||
long collLen = 0;
|
||||
char *coll = r->getString("c", &collLen, NULL);
|
||||
@ -179,14 +181,14 @@ bool sendReply ( void *state ) {
|
||||
// print the generate Catdb link
|
||||
sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
|
||||
"<td><center>"
|
||||
"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
|
||||
"<a href=\"/admin/catdb?c=%s&gencatdb=2\">"
|
||||
"Update Catdb</a> "
|
||||
"</center></td></tr>",
|
||||
st->m_coll );
|
||||
sb.safePrintf ( "<tr class=poo>"
|
||||
"<td>Generate New Catdb from DMOZ data.</td>"
|
||||
"<td><center>"
|
||||
"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
|
||||
"<a href=\"/admin/catdb?c=%s&gencatdb=1\">"
|
||||
"Generate Catdb</a> "
|
||||
"</center></td></tr>",
|
||||
st->m_coll );
|
||||
|
522
PageCrawlBot.cpp
522
PageCrawlBot.cpp
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,8 @@
|
||||
#ifndef CRAWLBOT_H
|
||||
#define CRAWLBOT_H
|
||||
|
||||
bool printCrawlDetailsInJson ( class SafeBuf *sb , class CollectionRec *cx ) ;
|
||||
|
||||
// values for the diffbot dropdown
|
||||
/*
|
||||
#define DBA_NONE 0
|
||||
|
@ -58,15 +58,15 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
|
||||
//
|
||||
else {
|
||||
// search box
|
||||
printLogoAndSearchBox(sb,r,catId);
|
||||
printLogoAndSearchBox(&sb,r,catId);
|
||||
// radio buttons for search dmoz. no, this is printed
|
||||
// from call to printLogoAndSearchBox()
|
||||
//printDmozRadioButtons(sb,catId);
|
||||
// the dmoz breadcrumb
|
||||
printDMOZCrumb ( sb,catId,xml);
|
||||
printDMOZCrumb ( &sb,catId,xml);
|
||||
// print the subtopics in this topic. show as links above
|
||||
// the search results
|
||||
printDMOZSubTopics ( sb, catId , xml );
|
||||
printDMOZSubTopics ( &sb, catId , xml );
|
||||
// ok, for now just print the dmoz topics since our search
|
||||
// results will be empty... until populated!
|
||||
g_categories->printUrlsInTopic ( &sb , catId );
|
||||
|
@ -7610,8 +7610,8 @@ bool printAdminLinks ( SafeBuf &sb , State7 *st ) {
|
||||
// get the filename directly
|
||||
sb.safePrintf (" "
|
||||
"<font color=red><b>"
|
||||
//"<a href=\"/master/tagdb?f=%li&c=%s&u=%s\">"
|
||||
"<a href=\"/master/tagdb?"
|
||||
//"<a href=\"/admin/tagdb?f=%li&c=%s&u=%s\">"
|
||||
"<a href=\"/admin/tagdb?"
|
||||
//"tagid0=%li&"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
@ -7631,7 +7631,7 @@ bool printAdminLinks ( SafeBuf &sb , State7 *st ) {
|
||||
//long bannedTagId = getTagTypeFromStr("manualban",9);
|
||||
sb.safePrintf (" "
|
||||
"<font color=red><b>"
|
||||
"<a href=\"/master/tagdb?"
|
||||
"<a href=\"/admin/tagdb?"
|
||||
//"tagid0=%li&"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
@ -7876,7 +7876,7 @@ void printAdminEventOptions ( SafeBuf* sb,
|
||||
sb->safePrintf("Ban By Domain: ");
|
||||
|
||||
//long bannedTagId = getTagTypeFromStr("manualban",9);
|
||||
sb->safePrintf("<a href=\"/master/tagdb?"
|
||||
sb->safePrintf("<a href=\"/admin/tagdb?"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
"u=%s&c=%s\">"
|
||||
@ -8561,13 +8561,13 @@ static bool printResult ( CollectionRec *cr,
|
||||
// . if it's local, don't put the hostname/port in
|
||||
// there cuz it will mess up Global Spec's machine
|
||||
//if ( h->m_groupId == g_hostdb.m_groupId )
|
||||
sb.safePrintf(" - <a href=\"/master/titledb?c=%s&"
|
||||
sb.safePrintf(" - <a href=\"/admin/titledb?c=%s&"
|
||||
"d=%lli",coll,mr->m_docId);
|
||||
// then the [info] link to show the TitleRec
|
||||
sb.safePrintf ( "\">[info]</a>" );
|
||||
|
||||
// now the analyze link
|
||||
sb.safePrintf (" - <a href=\"/master/parser?c=%s&"
|
||||
sb.safePrintf (" - <a href=\"/admin/parser?c=%s&"
|
||||
"old=1&hc=%li&u=",
|
||||
coll,
|
||||
(long)mr->m_hopcount);
|
||||
@ -8629,7 +8629,7 @@ static bool printResult ( CollectionRec *cr,
|
||||
dbuf ,
|
||||
coll , dbuf );
|
||||
sb.safePrintf(" - "
|
||||
" <a href=\"/master/tagdb?"
|
||||
" <a href=\"/admin/tagdb?"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
"u=%s&c=%s\">"
|
||||
@ -8641,7 +8641,7 @@ static bool printResult ( CollectionRec *cr,
|
||||
memcpy ( dbuf , uu.getHost() , dlen );
|
||||
dbuf [ dlen ] = '\0';
|
||||
sb.safePrintf(" - "
|
||||
" <a href=\"/master/tagdb?"
|
||||
" <a href=\"/admin/tagdb?"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
"u=%s&c=%s\">"
|
||||
@ -17616,7 +17616,7 @@ bool gotCaptchaReply ( State9 *st9 , TcpSocket *s ) {
|
||||
if ( st9->m_isAdmin && 1 == 2) {
|
||||
SafeBuf ttt;
|
||||
ttt.safePrintf("<br>"
|
||||
"<a href=/master/parser?"
|
||||
"<a href=/admin/parser?"
|
||||
//"user=mwells&pwd=mwell62&"
|
||||
"c=%s&u=%s&content=",
|
||||
st9->m_coll,
|
||||
|
@ -120,7 +120,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
mnew ( st , sizeof(State2) , "PageGet1" );
|
||||
// save the socket and if Host: is local in the Http request Mime
|
||||
st->m_socket = s;
|
||||
st->m_isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
|
||||
st->m_isLocal = r->isLocal();
|
||||
st->m_docId = docId;
|
||||
// include header ... "this page cached by Gigablast on..."
|
||||
|
171
PageHosts.cpp
171
PageHosts.cpp
@ -23,6 +23,7 @@ static int dgramsFromSort ( const void *i1, const void *i2 );
|
||||
//static int loadAvgSort ( const void *i1, const void *i2 );
|
||||
static int memUsedSort ( const void *i1, const void *i2 );
|
||||
static int cpuUsageSort ( const void *i1, const void *i2 );
|
||||
static int diskUsageSort ( const void *i1, const void *i2 );
|
||||
|
||||
long generatePingMsg( Host *h, long long nowms, char *buffer );
|
||||
|
||||
@ -130,7 +131,7 @@ skipReplaceHost:
|
||||
if ( g_conf.m_useShotgun ) {
|
||||
colspan = "31";
|
||||
//shotcol = "<td><b>ip2</b></td>";
|
||||
sprintf ( shotcol, "<td><a href=\"/master/hosts?c=%s"
|
||||
sprintf ( shotcol, "<td><a href=\"/admin/hosts?c=%s"
|
||||
"&sort=2\">"
|
||||
"<b>ping2</b></td></a>",
|
||||
coll);
|
||||
@ -142,17 +143,17 @@ skipReplaceHost:
|
||||
"<tr><td colspan=%s><center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Hosts "
|
||||
"(<a href=\"/master/hosts?c=%s&sort=%li&reset=1\">"
|
||||
"(<a href=\"/admin/hosts?c=%s&sort=%li&reset=1\">"
|
||||
"reset)</b>"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=0\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=0\">"
|
||||
|
||||
"<b>hostId</b></td>"
|
||||
"<td><b>host ip</b></td>"
|
||||
"<td><b>shard</b></td>" // mirror group
|
||||
"<td><b>stripe</b></td>"
|
||||
"<td><b>shard</b></td>"
|
||||
"<td><b>mirror</b></td>" // mirror # within the shard
|
||||
|
||||
// i don't remember the last time i used this, so let's
|
||||
// just comment it out to save space
|
||||
@ -187,49 +188,52 @@ skipReplaceHost:
|
||||
//"<td><b>resends sent</td>"
|
||||
//"<td><b>errors recvd</td>"
|
||||
//"<td><b>ETRYAGAINS recvd</td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=3\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=3\">"
|
||||
"<b>dgrams resent</a></td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=4\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=4\">"
|
||||
"<b>errors recvd</a></td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=5\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=5\">"
|
||||
"<b>ETRY AGAINS recvd</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=6\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=6\">"
|
||||
"<b>dgrams to</a></td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=7\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=7\">"
|
||||
"<b>dgrams from</a></td>"
|
||||
|
||||
//"<td><a href=\"/master/hosts?c=%s&sort=8\">"
|
||||
//"<td><a href=\"/admin/hosts?c=%s&sort=8\">"
|
||||
//"<b>loadavg</a></td>"
|
||||
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=13\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=13\">"
|
||||
"<b>avg split time</a></td>"
|
||||
|
||||
"<td><b>splits done</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=12\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=12\">"
|
||||
"<b>status</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=15\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=15\">"
|
||||
"<b>slow reads</a></td>"
|
||||
|
||||
"<td><b>docs indexed</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=9\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=9\">"
|
||||
"<b>mem used</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=10\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=10\">"
|
||||
"<b>cpu</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=14\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=17\">"
|
||||
"<b>disk</a></td>"
|
||||
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=14\">"
|
||||
"<b>max ping1</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=11\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=11\">"
|
||||
"<b>ping1 age</a></td>"
|
||||
|
||||
//"<td><b>ip1</td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=1\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=1\">"
|
||||
"<b>ping1</a></td>"
|
||||
|
||||
"%s"// "<td><b>ip2</td>"
|
||||
@ -257,6 +261,7 @@ skipReplaceHost:
|
||||
coll,
|
||||
coll,
|
||||
coll,
|
||||
coll,
|
||||
shotcol );
|
||||
|
||||
// loop through each host we know and print it's stats
|
||||
@ -295,6 +300,7 @@ skipReplaceHost:
|
||||
case 14:gbsort ( hostSort, nh, sizeof(long), pingMaxSort ); break;
|
||||
case 15:gbsort ( hostSort, nh, sizeof(long), slowDiskSort ); break;
|
||||
case 16:gbsort ( hostSort, nh, sizeof(long), defaultSort ); break;
|
||||
case 17:gbsort ( hostSort, nh, sizeof(long), diskUsageSort ); break;
|
||||
}
|
||||
|
||||
// we are the only one that uses these flags, so set them now
|
||||
@ -379,6 +385,15 @@ skipReplaceHost:
|
||||
if ( cpu > 100.0 ) cpu = 100.0;
|
||||
if ( cpu < 0.0 ) cpu = -1.0;
|
||||
|
||||
char diskUsageMsg[64];
|
||||
sprintf(diskUsageMsg,"%.1f%%",h->m_diskUsage);
|
||||
if ( h->m_diskUsage < 0.0 )
|
||||
sprintf(diskUsageMsg,"???");
|
||||
if ( h->m_diskUsage >= 98.0 )
|
||||
sprintf(diskUsageMsg,"<font color=red><b>%.1f%%"
|
||||
"</b></font>",h->m_diskUsage);
|
||||
|
||||
|
||||
// split time, don't divide by zero!
|
||||
long splitTime = 0;
|
||||
if ( h->m_splitsDone )
|
||||
@ -437,7 +452,7 @@ skipReplaceHost:
|
||||
// print it
|
||||
sb.safePrintf (
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><a href=\"http://%s:%hi/master/hosts?"
|
||||
"<td><a href=\"http://%s:%hi/admin/hosts?"
|
||||
""
|
||||
"c=%s"
|
||||
"&sort=%li\">%li</a></td>"
|
||||
@ -494,6 +509,8 @@ skipReplaceHost:
|
||||
"<td>%s%.1f%%%s</td>"
|
||||
// cpu usage
|
||||
"<td>%.1f%%</td>"
|
||||
// disk usage
|
||||
"<td>%s</td>"
|
||||
|
||||
// ping max
|
||||
"<td>%s</td>"
|
||||
@ -547,6 +564,7 @@ skipReplaceHost:
|
||||
h->m_percentMemUsed, // float
|
||||
fontTagBack,
|
||||
cpu, // float
|
||||
diskUsageMsg,
|
||||
|
||||
// ping max
|
||||
pms,
|
||||
@ -564,6 +582,7 @@ skipReplaceHost:
|
||||
sb.safePrintf ( "</table><br>\n" );
|
||||
|
||||
|
||||
/*
|
||||
// print spare hosts table
|
||||
sb.safePrintf (
|
||||
"<table %s>"
|
||||
@ -628,7 +647,9 @@ skipReplaceHost:
|
||||
h->m_note );
|
||||
}
|
||||
sb.safePrintf ( "</table><br>" );
|
||||
*/
|
||||
|
||||
/*
|
||||
// print proxy hosts table
|
||||
sb.safePrintf (
|
||||
"<table %s>"
|
||||
@ -693,7 +714,7 @@ skipReplaceHost:
|
||||
sb.safePrintf (
|
||||
"<tr bgcolor=#%s>"
|
||||
|
||||
"<td><a href=\"http://%s:%hi/master/hosts?"
|
||||
"<td><a href=\"http://%s:%hi/admin/hosts?"
|
||||
""
|
||||
"c=%s\">"
|
||||
"%li</a></td>"
|
||||
@ -736,6 +757,7 @@ skipReplaceHost:
|
||||
h->m_note );
|
||||
}
|
||||
sb.safePrintf ( "</table><br><br>" );
|
||||
*/
|
||||
|
||||
sb.safePrintf(
|
||||
"<style>"
|
||||
@ -753,6 +775,12 @@ skipReplaceHost:
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>host ip</td>"
|
||||
"<td>The primary IP address of the host."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>shard</td>"
|
||||
"<td>"
|
||||
@ -762,26 +790,20 @@ skipReplaceHost:
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>stripe</td>"
|
||||
"<td>mirror</td>"
|
||||
"<td>"
|
||||
"Hosts with the same stripe serve the same shard "
|
||||
"of data."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>ip1</td>"
|
||||
"<td>The primary IP address of the host."
|
||||
"A shard can be mirrored multiple times for "
|
||||
"data redundancy."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>ip2</td>"
|
||||
"<td>The secondary IP address of the host."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>udp port</td>"
|
||||
"<td>The UDP port the host uses to send and recieve "
|
||||
@ -794,7 +816,6 @@ skipReplaceHost:
|
||||
"<td>The UDP port used to send and receive dns traffic with."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>http port</td>"
|
||||
@ -802,7 +823,6 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>best switch id</td>"
|
||||
"<td>The host prefers to be on this switch because it "
|
||||
@ -868,6 +888,43 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>avg split time</td>"
|
||||
"<td>Average time this host took to compute the docids "
|
||||
"for a query. Useful for guaging the slowness of a host "
|
||||
"compare to other hosts."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>splits done</td>"
|
||||
"<td>Number of queries this host completed. Used in "
|
||||
"computation of the <i>avg split time</i>."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>status</td>"
|
||||
"<td>Status flags for the host. See key below."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>slow reads</td>"
|
||||
"<td>Number of slow disk reads the host has had. "
|
||||
"When this is big compared to other hosts it is a good "
|
||||
"indicator its drives are relatively slow."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>docs indexed</td>"
|
||||
"<td>Number of documents this host has indexed over all "
|
||||
"collections. All hosts should have close to the same "
|
||||
"number in a well-sharded situation."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
//"<tr class=poo>"
|
||||
//"<td>loadavg</td>"
|
||||
//"<td>1-minute sliding-window load average from "
|
||||
@ -877,13 +934,26 @@ skipReplaceHost:
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>mem used</td>"
|
||||
"<td>percentage of memory currently used."
|
||||
"<td>Percentage of memory currently used."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>cpu usage</td>"
|
||||
"<td>percentage of cpu resources in use by the gb process."
|
||||
"<td>Percentage of cpu resources in use by the gb process."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>disk usage</td>"
|
||||
"<td>Percentage of disk in use. When this gets close to "
|
||||
"100%% you need to do something."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>max ping1</td>"
|
||||
"<td>The worst ping latency from host to host."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
@ -900,6 +970,7 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>ping2</td>"
|
||||
"<td>Ping time to this host on the seconday/shotgun "
|
||||
@ -907,6 +978,7 @@ skipReplaceHost:
|
||||
"network is not enabled in the master controls."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>M (status flag)</td>"
|
||||
@ -932,6 +1004,27 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>R (status flag)</td>"
|
||||
"<td>Indicates host is performing a rebalance operation."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>F (status flag)</td>"
|
||||
"<td>Indicates host has foreign records and requires "
|
||||
"a rebalance operation."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>x (status flag)</td>"
|
||||
"<td>Indicates host has abruptly exited due to a fatal "
|
||||
"error (cored) and "
|
||||
"restarted itself."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
|
||||
,
|
||||
TABLE_STYLE
|
||||
@ -1156,3 +1249,11 @@ int cpuUsageSort ( const void *i1, const void *i2 ) {
|
||||
if ( h1->m_cpuUsage < h2->m_cpuUsage ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int diskUsageSort ( const void *i1, const void *i2 ) {
|
||||
Host *h1 = g_hostdb.getHost ( *(long*)i1 );
|
||||
Host *h2 = g_hostdb.getHost ( *(long*)i2 );
|
||||
if ( h1->m_diskUsage > h2->m_diskUsage ) return -1;
|
||||
if ( h1->m_diskUsage < h2->m_diskUsage ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ public:
|
||||
Msg1 m_msg1;
|
||||
IndexList m_list;
|
||||
//IndexList m_list2;
|
||||
collnum_t m_collnum;
|
||||
char m_query[MAX_QUERY_LEN+1];
|
||||
long m_queryLen;
|
||||
//char m_coll[MAX_COLL_LEN+1];
|
||||
@ -84,6 +85,10 @@ bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
g_errno = ECOLLTOOBIG;
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
}
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
if ( ! cr ) {
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
}
|
||||
// make a state
|
||||
State10 *st ;
|
||||
try { st = new (State10); }
|
||||
@ -121,10 +126,11 @@ bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
//st->m_collLen = collLen;
|
||||
//st->m_coll [ collLen ] ='\0';
|
||||
st->m_coll = coll;
|
||||
st->m_collnum = cr->m_collnum;
|
||||
// save the TcpSocket
|
||||
st->m_socket = s;
|
||||
// and if the request is local/internal or not
|
||||
st->m_isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
|
||||
st->m_isLocal = r->isLocal();
|
||||
st->m_r.copy ( r );
|
||||
// . check for add/delete request
|
||||
@ -151,7 +157,7 @@ bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
// call msg1 to add/delete key
|
||||
if ( ! st->m_msg1.addList ( &st->m_keyList,
|
||||
RDB_INDEXDB,
|
||||
st->m_coll,
|
||||
st->m_collnum,
|
||||
st,
|
||||
addedKeyWrapper,
|
||||
false,
|
||||
@ -174,7 +180,7 @@ bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
// skip if nothing
|
||||
else return gotTermFreq ( st );
|
||||
// get the termfreq of this term!
|
||||
if ( ! st->m_msg36.getTermFreq ( coll ,
|
||||
if ( ! st->m_msg36.getTermFreq ( st->m_collnum ,
|
||||
0 ,
|
||||
st->m_termId,
|
||||
st ,
|
||||
@ -258,7 +264,7 @@ loop:
|
||||
0 , // max cache age
|
||||
false , // add to cache?
|
||||
rdbId , // RDB_INDEXDB , // rdbId of 2 = indexdb
|
||||
st->m_coll ,
|
||||
st->m_collnum ,
|
||||
&st->m_list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
@ -405,7 +411,7 @@ bool gotIndexList2 ( void *state , RdbList *list ) {
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base;
|
||||
if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;
|
||||
if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_collnum)))return true;
|
||||
|
||||
// print the standard header for admin pages
|
||||
pbuf->safePrintf (
|
||||
@ -529,8 +535,8 @@ bool gotIndexList2 ( void *state , RdbList *list ) {
|
||||
"<tr><td>%li.</td>"
|
||||
"<td>%s%i</td>"
|
||||
"<td>"
|
||||
//"<a href=http://%s:%hu/master/titledb?d=%llu>"
|
||||
"<a href=/master/titledb?c=%s&d=%llu>"
|
||||
//"<a href=http://%s:%hu/admin/titledb?d=%llu>"
|
||||
"<a href=/admin/titledb?c=%s&d=%llu>"
|
||||
"%llu"
|
||||
//"<td><a href=/cgi/4.cgi?d=%llu>%llu"
|
||||
"</td>"
|
||||
@ -596,8 +602,8 @@ bool gotIndexList2 ( void *state , RdbList *list ) {
|
||||
"<td>%llu</td>"
|
||||
"<td>%lu</td><td>%i</td>"
|
||||
"<td>"
|
||||
//"<a href=http://%s:%hu/master/titledb?d=%llu>"
|
||||
"<a href=/master/titledb?c=%s&d=%llu>"
|
||||
//"<a href=http://%s:%hu/admin/titledb?d=%llu>"
|
||||
"<a href=/admin/titledb?c=%s&d=%llu>"
|
||||
"%llu"
|
||||
//"<td><a href=/cgi/4.cgi?d=%llu>%llu"
|
||||
"</td></tr>\n" ,
|
||||
|
@ -55,8 +55,8 @@ bool sendPageInject ( TcpSocket *s , HttpRequest *r ) {
|
||||
strncpy(msg7->m_coll,coll,MAX_COLL_LEN);
|
||||
|
||||
// for diffbot
|
||||
if ( crawlbotAPI )
|
||||
msg7->m_hr.copy ( r );
|
||||
//if ( crawlbotAPI )
|
||||
msg7->m_hr.copy ( r );
|
||||
|
||||
// a scrape request?
|
||||
char *qts = r->getString("qts",NULL);
|
||||
@ -169,12 +169,7 @@ bool sendReply ( void *state ) {
|
||||
SafeBuf sb;
|
||||
|
||||
// print admin bar
|
||||
g_pages.printAdminTop ( &sb, // p , pend ,
|
||||
PAGE_INJECT,
|
||||
NULL, // msg7->m_username ,
|
||||
msg7->m_coll ,
|
||||
NULL , // pwd
|
||||
s->m_ip );
|
||||
g_pages.printAdminTop ( &sb, s , &msg7->m_hr );
|
||||
|
||||
// if there was an error let them know
|
||||
char msg[1024];
|
||||
@ -231,10 +226,23 @@ bool sendReply ( void *state ) {
|
||||
"indexed in real time "
|
||||
"while you wait. The browser will return the "
|
||||
"final index status code. Alternatively, "
|
||||
"use the <i>add urls</i> page "
|
||||
"to add URLs in bulk or to just add to the spider queue "
|
||||
"without having to wait for the page or pages to be "
|
||||
"actually indexed in realtime."
|
||||
"use the <a href=/admin/addurl>add url</a> page "
|
||||
"to add urls individually or in bulk "
|
||||
"without having to wait for the pages to be "
|
||||
"actually indexed in realtime. "
|
||||
|
||||
"By default, injected urls "
|
||||
"take precedence over the \"insitelist\" directive in the "
|
||||
"<a href=/admin/filters>url filters</a> "
|
||||
"so injected urls need not match the "
|
||||
"<a href=/admin/sites>spider sites</a> patterns. You can "
|
||||
"change that behavior in the <a href=/admin/filters>url "
|
||||
"filters</a> if you want. "
|
||||
"Injected urls will have a "
|
||||
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
|
||||
"The injection api is described on the "
|
||||
"<a href=/admin/api>api</a> page."
|
||||
|
||||
"</font>"
|
||||
"</td>"
|
||||
|
||||
@ -242,7 +250,15 @@ bool sendReply ( void *state ) {
|
||||
"<input type=text name=u value=\"\" size=50>"
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr class=poo><td><b>query to scrape</b></td>"
|
||||
"<tr class=poo><td><b>query to scrape</b>"
|
||||
|
||||
"<br>"
|
||||
"<font size=-2>"
|
||||
"Scrape other search engines and inject their links "
|
||||
"for this query. "
|
||||
"</font>"
|
||||
|
||||
"</td>"
|
||||
"<td>\n"
|
||||
"<input type=text name=qts value=\"\" size=50>"
|
||||
"</td></tr>\n\n"
|
||||
|
@ -49,9 +49,9 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
p->reserve2x(65535);
|
||||
|
||||
//long user = g_pages.getUserType( s , r );
|
||||
char *username = g_users.getUsername(r);
|
||||
char *pwd = r->getString ("pwd");
|
||||
char *coll = r->getString ("c");
|
||||
//char *username = g_users.getUsername(r);
|
||||
//char *pwd = r->getString ("pwd");
|
||||
//char *coll = r->getString ("c");
|
||||
long refreshRate = r->getLong("rr", 0);
|
||||
long sampleSize = r->getLong("ss", 2048);
|
||||
if(refreshRate > 0)
|
||||
@ -61,8 +61,8 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
// char *ss = p->getBuf();
|
||||
// char *ssend = p->getBufEnd();
|
||||
g_pages.printAdminTop ( p , PAGE_LOGVIEW, username,
|
||||
coll , pwd , s->m_ip );
|
||||
g_pages.printAdminTop ( p, s, r );
|
||||
|
||||
// p->incrementLength(sss - ss);
|
||||
|
||||
long nh = g_hostdb.getNumHosts();
|
||||
|
148
PageLogin.cpp
148
PageLogin.cpp
@ -1,148 +0,0 @@
|
||||
#include "gb-include.h"
|
||||
|
||||
#include "Pages.h"
|
||||
#include "Parms.h"
|
||||
#include "Users.h"
|
||||
|
||||
bool sendPageLogin ( TcpSocket *s , HttpRequest *r ) {
|
||||
return sendPageLogin ( s , r, NULL);
|
||||
}
|
||||
|
||||
bool sendPageLogin ( TcpSocket *s , HttpRequest *r , char *emsg ) {
|
||||
|
||||
// get the collection
|
||||
long collLen = 0;
|
||||
char *coll = r->getString("c",&collLen);
|
||||
if ( ! coll || ! coll[0] ) {
|
||||
//coll = g_conf.m_defaultColl;
|
||||
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
|
||||
collLen = gbstrlen(coll);
|
||||
}
|
||||
|
||||
// does collection exist? ...who cares, proxy doesn't have coll data.
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
//if ( ! cr ) emsg = "Collection does not exist.";
|
||||
|
||||
|
||||
// log off user whose username is in the cookie
|
||||
char *username = r->getStringFromCookie("username",NULL);
|
||||
char *password = r->getString("pwd",NULL);
|
||||
if ( username && !password ) g_users.logoffUser( username, s->m_ip );
|
||||
|
||||
// get username from the request
|
||||
username = NULL;
|
||||
username = r->getString("username",NULL);
|
||||
|
||||
// reset emsg if user is coming for the first time
|
||||
long page = g_pages.getDynamicPageNumber(r);
|
||||
if ( !username && !password &&
|
||||
(page == PAGE_LOGIN || page == PAGE_LOGIN2) && emsg)
|
||||
emsg ="";
|
||||
|
||||
// just make cookie same format as an http request for ez parsing
|
||||
char cookieData[2024];
|
||||
char host[1024]="";
|
||||
/*if ( cr && userType == USER_MASTER && username )
|
||||
return g_parms.sendPageGeneric ( s , r , PAGE_MASTER , cookie);
|
||||
if ( userType == USER_ADMIN && username )
|
||||
return g_parms.sendPageGeneric ( s , r , PAGE_SEARCH , cookie);
|
||||
*/
|
||||
|
||||
|
||||
// print it
|
||||
char buf [ 2*1024 ];
|
||||
char *p = buf;
|
||||
char *pend = buf + 2*1024;
|
||||
|
||||
// print colors
|
||||
p = g_pages.printColors ( p , pend );
|
||||
// start table
|
||||
sprintf ( p , "<table><tr><td>");
|
||||
p += gbstrlen ( p );
|
||||
// print logo
|
||||
p = g_pages.printLogo ( p , pend , coll );
|
||||
|
||||
// make it printable
|
||||
char *pu = g_users.getUsername(r);
|
||||
if ( ! pu ) pu = "";
|
||||
|
||||
// then Login
|
||||
if ( r->getHostLen() < 1024 )
|
||||
strncpy ( host, r->getHost(), r->getHostLen() );
|
||||
|
||||
char *cookie = NULL;
|
||||
User *user = NULL;
|
||||
if ( username && host[0] ) user = g_users.getUser(username);
|
||||
if ( user && !emsg ){
|
||||
sprintf ( cookieData , "username=%s;expires=0;"
|
||||
,username);
|
||||
|
||||
// try to the get reference Page
|
||||
long refPage = r->getLong("ref",-1);
|
||||
if ( refPage >= 0 && refPage != PAGE_LOGIN && refPage != PAGE_LOGIN2
|
||||
&& g_users.hasPermission(username,refPage)){
|
||||
WebPage *page = g_pages.getPage(refPage);
|
||||
sprintf ( p, "<meta http-equiv=\"refresh\" content=\"0;"
|
||||
"http://%s/%s?c=%s\">",
|
||||
host,page->m_filename,coll);
|
||||
}
|
||||
else{
|
||||
long pageNum = user->firstPage();
|
||||
char *path = g_pages.getPath(pageNum);
|
||||
sprintf ( p, "<meta http-equiv=\"refresh\" content=\"0;"
|
||||
"http://%s/%s?c=%s\">",
|
||||
host,path,coll);
|
||||
}
|
||||
p += gbstrlen ( p );
|
||||
cookie = cookieData;
|
||||
}
|
||||
|
||||
if ( !emsg ) emsg = "";
|
||||
sprintf ( p ,
|
||||
" "
|
||||
"</td><td><font size=+1><b>Login</b></font></td></tr>"
|
||||
"</table>"
|
||||
"<form method=post action=\"/login\" name=f>"
|
||||
"<input type=hidden name=ref value=\"%li\">"
|
||||
"<center>"
|
||||
"<br><br>"
|
||||
"<font color=ff0000><b>%s</b></font>"
|
||||
"<br><br>"
|
||||
|
||||
"<table cellpadding=2><tr><td>"
|
||||
"<b>Username</td><td>"
|
||||
"<input type=text name=username size=30 value=\"%s\">"
|
||||
"</td><td></td></tr>"
|
||||
"<tr><td>"
|
||||
|
||||
"<b>Collection</td><td>"
|
||||
"<input type=text name=c size=30 value=\"%s\">"
|
||||
"</td><td></td></tr>"
|
||||
"<tr><td>"
|
||||
"<b>Password</td><td><input type=password name=pwd size=30>"
|
||||
"</td><td>"
|
||||
"<input type=submit value=ok border=0></td>"
|
||||
"</tr></table>"
|
||||
"</center>"
|
||||
"<br><br>",
|
||||
page, emsg , pu , coll );
|
||||
p += gbstrlen ( p );
|
||||
// master test
|
||||
/*
|
||||
long user = g_pages.getUserType ( s , r );
|
||||
if ( user != USER_MASTER ) {
|
||||
sprintf ( p , "\n<input type=hidden name=master value=0>\n"
|
||||
"</form>" );
|
||||
p += gbstrlen ( p );
|
||||
}
|
||||
*/
|
||||
// print the tail
|
||||
p = g_pages.printTail ( p , pend , r->isLocal() ); // pwd
|
||||
// send the page
|
||||
return g_httpServer.sendDynamicPage ( s , buf , p - buf ,
|
||||
-1 , // cacheTime
|
||||
false , // POSTReply?
|
||||
NULL , // contentType
|
||||
-1 ,
|
||||
cookie);// Forbidden http status
|
||||
}
|
@ -1451,8 +1451,8 @@ bool sendPageOverview ( TcpSocket *s , HttpRequest *r ) {
|
||||
"You can specify different indexing and spider parameters on a per URL basis by one or more of the following methods:\n"
|
||||
"<br><br>\n"
|
||||
"<ul>\n"
|
||||
"<li>Using the <a href=\"/master/tagdb\">tagdb interface</a>, you can assign a <a href=#ruleset>ruleset</a> to a set of sites. All you do is provide Gigablast with a list of sites and the ruleset to use for those sites.\n"
|
||||
"You can enter the sites via the <a href=\"/master/tagdb\">HTML form</a> or you can provide Gigablast with a file of the sites. Each file must be limited to 1 Megabyte, but you can add hundreds of millions of sites. \n"
|
||||
"<li>Using the <a href=\"/admin/tagdb\">tagdb interface</a>, you can assign a <a href=#ruleset>ruleset</a> to a set of sites. All you do is provide Gigablast with a list of sites and the ruleset to use for those sites.\n"
|
||||
"You can enter the sites via the <a href=\"/admin/tagdb\">HTML form</a> or you can provide Gigablast with a file of the sites. Each file must be limited to 1 Megabyte, but you can add hundreds of millions of sites. \n"
|
||||
"Sites can be full URLs, hostnames, domain names or IP addresses.\n"
|
||||
"If you add a site which is just a canonical domain name with no explicit host name, like gigablast.com, then any URL with the same domain name, regardless of its host name will match that site. That is, \"hostname.gigablast.com\" will match the site \"gigablast.com\" and therefore be assigned the associated ruleset.\n"
|
||||
"Sites may also use IP addresses instead of domain names. If the least significant byte of an IP address that you submit to tagdb is 0 then any URL with the same top 3 IP bytes as that IP will be considered a match.\n"
|
||||
@ -1917,7 +1917,7 @@ bool sendPageOverview ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<br>\n"
|
||||
"After the base score is computed, it is multiplied by the number of occurences of the word or phrase in the portion of the document being indexed as specified by the index rule. This score may then be reduced if spam detection occurred and the word or phrase was deemed repetitious. Spam detection is triggered when the quality of the document is at or below the value specified in the <minQualityForSpamDetect> tag in the index rule. Finally, the score is mapped into an 8 bit value, from 1 to 255, and stored in the index."
|
||||
"<br><br>\n"
|
||||
"To see the scoring algorithm in action you can use the <b><a href=\"/master/parser\">Parser Tool</a></b>. It will show each indexed word and phrase and its associated score, as well as some attributes associated with the indexed document."
|
||||
"To see the scoring algorithm in action you can use the <b><a href=\"/admin/parser\">Parser Tool</a></b>. It will show each indexed word and phrase and its associated score, as well as some attributes associated with the indexed document."
|
||||
""
|
||||
"<br>\n"
|
||||
"<br>\n"
|
||||
|
@ -1,8 +1,8 @@
|
||||
#include "gb-include.h"
|
||||
|
||||
#include "PageParser.h"
|
||||
#include "IndexTable.h"
|
||||
#include "IndexTable2.h"
|
||||
//#include "IndexTable.h"
|
||||
//#include "IndexTable2.h"
|
||||
//#include "XmlDoc.h" // addCheckboxSpan()
|
||||
|
||||
bool g_inPageParser = false;
|
||||
@ -101,7 +101,7 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
st->m_termFreqs = termFreqs;
|
||||
st->m_termFreqWeights = termFreqWeights;
|
||||
st->m_affWeights = affWeights;
|
||||
st->m_total = (score_t)-1;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
@ -561,7 +561,7 @@ bool processLoop ( void *state ) {
|
||||
// . save the ips.txt file if we are the test coll
|
||||
// . saveTestBuf() is a function in Msge1.cpp
|
||||
CollectionRec *cr = xd->getCollRec();
|
||||
if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
|
||||
if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
|
||||
// use same dir that XmlDoc::getTestDir() would use
|
||||
saveTestBuf ( "test-page-parser" );
|
||||
// now get the meta list, in the process it will print out a
|
||||
@ -654,7 +654,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
|
||||
//st->m_termFreqs = termFreqs;
|
||||
//st->m_termFreqWeights = termFreqWeights;
|
||||
//st->m_affWeights = affWeights;
|
||||
st->m_total = (score_t)-1;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
@ -855,7 +855,7 @@ bool gotXmlDoc ( void *state ) {
|
||||
|
||||
// . save the ips.txt file if we are the test coll
|
||||
// . saveTestBuf() is a function in Msge1.cpp
|
||||
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test"))
|
||||
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123"))
|
||||
// // use same dir that XmlDoc::getTestDir() would use
|
||||
// saveTestBuf ( "test-page-parser" );
|
||||
|
||||
|
@ -80,7 +80,7 @@ public:
|
||||
long long *m_termFreqs;
|
||||
float *m_termFreqWeights;
|
||||
float *m_affWeights;
|
||||
score_t m_total;
|
||||
//score_t m_total;
|
||||
bool m_freeIt;
|
||||
bool m_blocked;
|
||||
|
||||
|
@ -29,11 +29,12 @@ static bool printInterface ( SafeBuf *sb , char *q ,//long user ,
|
||||
class State13 {
|
||||
public:
|
||||
char m_query [ MAX_QUERY_LEN + 1];
|
||||
char m_isAdmin;
|
||||
//char m_isAdmin;
|
||||
Msg1c m_msg1c;
|
||||
//Msg1d m_msg1d;
|
||||
char m_coll [ MAX_COLL_LEN + 1];
|
||||
long m_collLen;
|
||||
//char m_coll [ MAX_COLL_LEN + 1];
|
||||
//long m_collLen;
|
||||
collnum_t m_collnum;
|
||||
TcpSocket *m_socket;
|
||||
//char m_replyBuf[64*1024];
|
||||
//long m_replyBufSize;
|
||||
@ -61,7 +62,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
pwd [ len ] = '\0';
|
||||
|
||||
// are we the admin?
|
||||
bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
//bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
//long user = g_pages.getUserType ( s , r );
|
||||
char *username = g_users.getUsername ( r );
|
||||
char *errmsg = NULL;
|
||||
@ -75,6 +76,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
return g_httpServer.sendErrorReply ( s , 500 ,
|
||||
"Collection does not exist.");
|
||||
}
|
||||
/*
|
||||
bool isAssassin = cr->isAssassin ( s->m_ip );
|
||||
if ( isAdmin ) isAssassin = true;
|
||||
|
||||
@ -82,15 +84,14 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
if ( ! isAssassin && ! cr->hasPermission ( r , s ) ) {
|
||||
log("admin: Bad collection name "
|
||||
"or password. Query reindex failed. Permission denied.");
|
||||
return sendPageLogin ( s , r ,
|
||||
return sendPagexxxx ( s , r ,
|
||||
"Collection name or "
|
||||
"password is incorrect.");
|
||||
}
|
||||
|
||||
*/
|
||||
// get collection name and its length
|
||||
char *coll = cr->m_coll;
|
||||
long collLen = gbstrlen ( coll );
|
||||
|
||||
//long collLen = gbstrlen ( coll );
|
||||
|
||||
//char buf[64*1024];
|
||||
//char *p = buf;
|
||||
@ -131,7 +132,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
mnew ( st , sizeof(State13) , "PageReindex" );
|
||||
|
||||
// set stuff now
|
||||
st->m_isAdmin = isAdmin;
|
||||
//st->m_isAdmin = isAdmin;
|
||||
|
||||
|
||||
// save the query to static buffer
|
||||
@ -140,6 +141,8 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
memcpy ( st->m_query , t , len );
|
||||
st->m_query[len] = '\0';
|
||||
|
||||
st->m_collnum = cr->m_collnum;
|
||||
|
||||
// save start and end numbers
|
||||
long startNum = r->getLong ( "srn" , 0 );
|
||||
long endNum = r->getLong ( "ern" , 0 );
|
||||
@ -152,9 +155,9 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
bool updateTags = r->getLong ( "updatetags", 0 );
|
||||
|
||||
// copy collection
|
||||
memcpy ( st->m_coll , coll , collLen );
|
||||
st->m_coll [ collLen ] = '\0';
|
||||
st->m_collLen=collLen;
|
||||
//memcpy ( st->m_coll , coll , collLen );
|
||||
//st->m_coll [ collLen ] = '\0';
|
||||
//st->m_collLen=collLen;
|
||||
|
||||
// fix parms
|
||||
if ( startNum < 0 ) startNum = 0 ;
|
||||
@ -202,7 +205,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
// place holder, for holding response when we're done adding
|
||||
// all these docids to the spider queue
|
||||
st->m_placeOff = rp->length() ;
|
||||
for ( long i = 0 ; i < 100 ; i++ )
|
||||
for ( long i = 0 ; i < 200 ; i++ )
|
||||
rp->pushChar(' ');
|
||||
//memset ( rp , ' ' , 100 );
|
||||
//rp += 100;
|
||||
@ -239,7 +242,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
*/
|
||||
// let msg1d do all the work now
|
||||
if ( ! st->m_msg1c.reindexQuery ( st->m_query ,
|
||||
st->m_coll,
|
||||
st->m_collnum,
|
||||
startNum ,
|
||||
endNum ,
|
||||
(bool)forceDel ,
|
||||
@ -266,6 +269,7 @@ void doneReindexing ( void *state ) {
|
||||
g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
|
||||
mdelete ( st , sizeof(State13) , "PageTagdb" );
|
||||
delete (st);
|
||||
return;
|
||||
}
|
||||
// if no error, send the pre-generated page
|
||||
// this must be under 100 chars or it messes our reply buf up
|
||||
@ -462,12 +466,12 @@ static void addedListWrapper ( void *state ) ;
|
||||
Msg1c::Msg1c() {
|
||||
m_numDocIds = 0;
|
||||
m_numDocIdsAdded = 0;
|
||||
m_coll = NULL;
|
||||
m_collnum = -1;
|
||||
m_callback = NULL;
|
||||
}
|
||||
|
||||
bool Msg1c::reindexQuery ( char *query ,
|
||||
char *coll ,
|
||||
collnum_t collnum ,//char *coll ,
|
||||
long startNum ,
|
||||
long endNum ,
|
||||
bool forceDel ,
|
||||
@ -475,7 +479,7 @@ bool Msg1c::reindexQuery ( char *query ,
|
||||
void *state ,
|
||||
void (* callback) (void *state ) ) {
|
||||
|
||||
m_coll = coll;
|
||||
m_collnum = collnum;// = coll;
|
||||
m_startNum = startNum;
|
||||
m_endNum = endNum;
|
||||
m_forceDel = forceDel;
|
||||
@ -489,12 +493,15 @@ bool Msg1c::reindexQuery ( char *query ,
|
||||
// langunknown?
|
||||
m_qq.set2 ( query , langId , true ); // /*bool flag*/ );
|
||||
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( collnum );
|
||||
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
// reset again just in case
|
||||
m_req.reset();
|
||||
// set our Msg39Request
|
||||
m_req.ptr_coll = coll;
|
||||
m_req.size_coll = gbstrlen(coll)+1;
|
||||
//m_req.ptr_coll = coll;
|
||||
//m_req.size_coll = gbstrlen(coll)+1;
|
||||
m_req.m_collnum = m_collnum;
|
||||
m_req.m_docsToGet = endNum;
|
||||
m_req.m_niceness = 0,
|
||||
m_req.m_getDocIdScoringInfo = false;
|
||||
@ -507,6 +514,7 @@ bool Msg1c::reindexQuery ( char *query ,
|
||||
m_req.m_queryExpansion = true; // so it's like regular rslts
|
||||
// add language dropdown or take from [query reindex] link
|
||||
m_req.m_language = langId;
|
||||
//m_req.m_debug = 1;
|
||||
|
||||
// log for now
|
||||
logf(LOG_DEBUG,"reindex: qlangid=%li q=%s",langId,query);
|
||||
@ -661,7 +669,7 @@ bool Msg1c::gotList ( ) {
|
||||
|
||||
if ( ! m_msg4.addMetaList ( m_sb.getBufStart() ,
|
||||
m_sb.length() ,
|
||||
m_coll ,
|
||||
m_collnum ,
|
||||
this ,
|
||||
addedListWrapper ,
|
||||
0 , // niceness
|
||||
|
@ -13,7 +13,7 @@ public:
|
||||
Msg1c();
|
||||
|
||||
bool reindexQuery ( char *query ,
|
||||
char *coll ,
|
||||
collnum_t collnum, // char *coll ,
|
||||
long startNum ,
|
||||
long endNum ,
|
||||
bool forceDel ,
|
||||
@ -23,7 +23,8 @@ public:
|
||||
|
||||
bool gotList ( );
|
||||
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
long m_startNum;
|
||||
long m_endNum;
|
||||
bool m_forceDel;
|
||||
|
1678
PageResults.cpp
1678
PageResults.cpp
File diff suppressed because it is too large
Load Diff
@ -2,15 +2,68 @@
|
||||
#define _PAGERESULTS_H_
|
||||
|
||||
#include "SafeBuf.h"
|
||||
#include "Language.h" // MAX_FRAG_SIZE
|
||||
#include "Msg40.h"
|
||||
#include "Msg0.h"
|
||||
|
||||
bool printDmozRadioButtons ( SafeBuf &sb , long catId ) ;
|
||||
bool printLogoAndSearchBox ( SafeBuf &sb , class HttpRequest *hr, long catId );
|
||||
class State0 {
|
||||
public:
|
||||
|
||||
bool printTermPairs ( SafeBuf &sb , class Query *q , class PairScore *ps ) ;
|
||||
bool printSingleTerm ( SafeBuf &sb , class Query *q , class SingleScore *ss );
|
||||
// store results page in this safebuf
|
||||
SafeBuf m_sb;
|
||||
|
||||
// if socket closes before we get a chance to send back
|
||||
// search results, we will know by comparing this to
|
||||
// m_socket->m_numDestroys
|
||||
long m_numDestroys;
|
||||
bool m_header;
|
||||
|
||||
collnum_t m_collnum;
|
||||
Query m_q;
|
||||
SearchInput m_si;
|
||||
Msg40 m_msg40;
|
||||
TcpSocket *m_socket;
|
||||
Msg0 m_msg0;
|
||||
long long m_startTime;
|
||||
//Ads m_ads;
|
||||
bool m_gotAds;
|
||||
bool m_gotResults;
|
||||
char m_spell [MAX_FRAG_SIZE]; // spelling recommendation
|
||||
bool m_gotSpell;
|
||||
long m_errno;
|
||||
Query m_qq3;
|
||||
long m_numDocIds;
|
||||
long long m_took; // how long it took to get the results
|
||||
HttpRequest m_hr;
|
||||
bool m_printedHeaderRow;
|
||||
char m_qe[MAX_QUERY_LEN+1];
|
||||
|
||||
// for printing our search result json items in csv:
|
||||
HashTableX m_columnTable;
|
||||
long m_numCSVColumns;
|
||||
|
||||
// stuff for doing redownloads
|
||||
bool m_didRedownload;
|
||||
XmlDoc *m_xd;
|
||||
long m_oldContentHash32;
|
||||
};
|
||||
|
||||
|
||||
bool printEventAddress ( SafeBuf &sb , char *addrStr , class SearchInput *si ,
|
||||
bool printSearchResultsHeader ( class State0 *st ) ;
|
||||
bool printResult ( class State0 *st, long ix );
|
||||
bool printSearchResultsTail ( class State0 *st ) ;
|
||||
|
||||
|
||||
|
||||
|
||||
bool printDmozRadioButtons ( SafeBuf *sb , long catId ) ;
|
||||
bool printLogoAndSearchBox ( SafeBuf *sb , class HttpRequest *hr, long catId );
|
||||
|
||||
bool printTermPairs ( SafeBuf *sb , class Query *q , class PairScore *ps ) ;
|
||||
bool printSingleTerm ( SafeBuf *sb , class Query *q , class SingleScore *ss );
|
||||
|
||||
|
||||
bool printEventAddress ( SafeBuf *sb , char *addrStr , class SearchInput *si ,
|
||||
double *lat , double *lon , bool isXml ,
|
||||
// use this for printing distance if lat/lon above
|
||||
// is invalid. only for non-xml printing though.
|
||||
@ -20,10 +73,10 @@ bool printEventAddress ( SafeBuf &sb , char *addrStr , class SearchInput *si ,
|
||||
double eventGeocoderLon,
|
||||
char *eventBestPlaceName );
|
||||
|
||||
bool printDMOZCrumb ( SafeBuf &sb , long catId , bool xml ) ;
|
||||
bool printDMOZSubTopics ( SafeBuf& sb, long catId, bool inXml ) ;
|
||||
bool printDMOZCrumb ( SafeBuf *sb , long catId , bool xml ) ;
|
||||
bool printDMOZSubTopics ( SafeBuf *sb, long catId, bool inXml ) ;
|
||||
|
||||
bool printEventCountdown2 ( SafeBuf &sb ,
|
||||
bool printEventCountdown2 ( SafeBuf *sb ,
|
||||
SearchInput *si,
|
||||
long now ,
|
||||
long timeZoneOffset ,
|
||||
|
14
PageRoot.cpp
14
PageRoot.cpp
@ -78,8 +78,9 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
|
||||
//" <a href=/logout>Logout</a>"
|
||||
);
|
||||
|
||||
if ( r->isLocal() )
|
||||
sb.safePrintf(" [<a href=\"/master?\">Admin</a>]");
|
||||
//if ( r->isLocal() )
|
||||
sb.safePrintf(" [<a href=\"/admin/settings\">"
|
||||
"<font color=red>Admin</font></a>]");
|
||||
sb.safePrintf("</p></b></center></body></html>");
|
||||
return true;
|
||||
}
|
||||
@ -168,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
sb.safePrintf("</form>\n");
|
||||
sb.safePrintf("<br>\n");
|
||||
sb.safePrintf("\n");
|
||||
|
||||
// print any red boxes we might need to
|
||||
if ( printRedBox2 ( &sb , true ) )
|
||||
sb.safePrintf("<br>\n");
|
||||
|
||||
sb.safePrintf("<table cellpadding=3>\n");
|
||||
sb.safePrintf("\n");
|
||||
|
||||
@ -1285,7 +1291,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
// see if they provided a url of a file of urls if they did not
|
||||
// provide a url to add directly
|
||||
bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
bool isAdmin = g_conf.isCollAdmin ( s , r );
|
||||
long ufuLen = 0;
|
||||
char *ufu = NULL;
|
||||
if ( isAdmin )
|
||||
@ -1561,7 +1567,7 @@ void doneInjectingWrapper3 ( void *st ) {
|
||||
// allow others to add now
|
||||
s_inprogress = false;
|
||||
// get the state properly
|
||||
//State1 *st1 = (State1 *) state;
|
||||
//State1i *st1 = (State1i *) state;
|
||||
// in order to see what sites are being added log it, then we can
|
||||
// more easily remove sites from sitesearch.gigablast.com that are
|
||||
// being added but not being searched
|
||||
|
@ -537,7 +537,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
long dlen;
|
||||
char *dbuf = ::getDomFast ( hostname,&dlen,false);
|
||||
p->safePrintf(
|
||||
" <a href=\"/master/tagdb?"
|
||||
" <a href=\"/admin/tagdb?"
|
||||
"user=admin&"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
|
@ -620,7 +620,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td colspan=50>"
|
||||
"<center><b>Spider Compression Proxy Stats</b> "
|
||||
|
||||
" [<a href=\"/master/stats?reset=2\">"
|
||||
" [<a href=\"/admin/stats?reset=2\">"
|
||||
"reset</a>]</td></tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
@ -828,7 +828,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td colspan=50>"
|
||||
"<center><b>Message Stats</b> "
|
||||
|
||||
" [<a href=\"/master/stats?reset=1\">"
|
||||
" [<a href=\"/admin/stats?reset=1\">"
|
||||
"reset</a>]</td></tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
|
@ -284,8 +284,8 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">"
|
||||
"rebuild all data</a> <a href=\"/master/thesaurus?"
|
||||
"<center><b><a href=\"/admin/thesaurus?rebuild=1&%s\">"
|
||||
"rebuild all data</a> <a href=\"/admin/thesaurus?"
|
||||
"rebuild=1&full=1&%s\">(full)</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf, getBuf);
|
||||
@ -300,7 +300,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?distribute=1&%s\">"
|
||||
"<center><b><a href=\"/admin/thesaurus?distribute=1&%s\">"
|
||||
"distribute data</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -314,7 +314,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b>"
|
||||
"<a href=\"/master/thesaurus?reload=1&cast=0&%s\">"
|
||||
"<a href=\"/admin/thesaurus?reload=1&cast=0&%s\">"
|
||||
"reload data</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -328,7 +328,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b>"
|
||||
"<a href=\"/master/thesaurus?reload=1&cast=1&%s\">"
|
||||
"<a href=\"/admin/thesaurus?reload=1&cast=1&%s\">"
|
||||
"reload data (all hosts)</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -342,7 +342,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%%>"
|
||||
"<form action=\"/master/thesaurus>\">"
|
||||
"<form action=\"/admin/thesaurus>\">"
|
||||
"<input type=text name=synonym size=20>"
|
||||
"<input type=submit value=Submit>"
|
||||
"%s"
|
||||
@ -365,7 +365,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?cancel=1&%s\">"
|
||||
"<center><b><a href=\"/admin/thesaurus?cancel=1&%s\">"
|
||||
"cancel running rebuild</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -380,8 +380,8 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">"
|
||||
"rebuild affinity</a> <a href=\"/master/thesaurus?"
|
||||
"<center><b><a href=\"/admin/thesaurus?rebuildaff=1&%s\">"
|
||||
"rebuild affinity</a> <a href=\"/admin/thesaurus?"
|
||||
"rebuildaff=1&full=1&%s\">(full)</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf, getBuf);
|
||||
@ -405,7 +405,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"character, optionally followed by another pipe and a type "
|
||||
"designation; any badly formatted lines will be silently "
|
||||
"ignored</font><br>\n"
|
||||
"<form action=\"/master/thesaurus\" method=post>"
|
||||
"<form action=\"/admin/thesaurus\" method=post>"
|
||||
"<textarea name=\"manualadd\" rows=20 cols=80>");
|
||||
|
||||
if (manualAdd && manualAddLen) {
|
||||
@ -434,7 +434,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"that these pairs will only work if the thesaurus otherwise "
|
||||
"has an entry for them, so add them to the manual add file "
|
||||
"above if need be</font><br>\n"
|
||||
"<form action=\"/master/thesaurus\" method=post>"
|
||||
"<form action=\"/admin/thesaurus\" method=post>"
|
||||
"<textarea name=\"affinityadd\" rows=20 cols=80>");
|
||||
|
||||
if (affinityAdd && affinityAddLen) {
|
||||
|
@ -58,7 +58,7 @@ bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
|
||||
// copy it
|
||||
st->m_r.copy ( r );
|
||||
// remember if http request is internal/local or not
|
||||
st->m_isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
|
||||
st->m_isLocal = r->isLocal();
|
||||
st->m_docId = docId;
|
||||
// password, too
|
||||
|
111
Pages.h
111
Pages.h
@ -5,6 +5,9 @@
|
||||
#ifndef _PAGES_H_
|
||||
#define _PAGES_H_
|
||||
|
||||
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ;
|
||||
bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;
|
||||
|
||||
// for PageEvents.cpp and Accessdb.cpp
|
||||
//#define RESULTSWIDTHSTR "550px"
|
||||
|
||||
@ -26,13 +29,18 @@ extern char *g_msg;
|
||||
// . declare all dynamic functions here
|
||||
// . these are all defined in Page*.cpp files
|
||||
// . these are called to send a dynamic page
|
||||
bool sendPageBasicSettings ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageBasicStatus ( TcpSocket *s , HttpRequest *r );
|
||||
//bool sendPageBasicDiffbot ( TcpSocket *s , HttpRequest *r );
|
||||
|
||||
|
||||
|
||||
bool sendPageRoot ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie );
|
||||
bool sendPageResults ( TcpSocket *s , HttpRequest *r );
|
||||
//bool sendPageEvents ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageGet ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageLogin ( TcpSocket *s , HttpRequest *r , char *emsg );
|
||||
bool sendPageLogin ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageStats ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageHosts ( TcpSocket *s , HttpRequest *r );
|
||||
@ -156,35 +164,6 @@ class Pages {
|
||||
char* bodyJavascript = "" );
|
||||
|
||||
|
||||
char *printAdminTop ( char *p ,
|
||||
char *pend ,
|
||||
TcpSocket *s ,
|
||||
HttpRequest *r ,
|
||||
char *qs = NULL,
|
||||
char* bodyJavascript = "" ) ;
|
||||
|
||||
bool printAdminTop ( SafeBuf *sb ,
|
||||
long page ,
|
||||
//long user ,
|
||||
char *username,
|
||||
char *coll ,
|
||||
char *pwd ,
|
||||
long fromIp ,
|
||||
char *qs = NULL,
|
||||
char* bodyJavascript = "" );
|
||||
|
||||
char *printAdminTop ( char *p ,
|
||||
char *pend ,
|
||||
long page ,
|
||||
//long user ,
|
||||
char *username,
|
||||
char *coll ,
|
||||
char *pwd ,
|
||||
long fromIp ,
|
||||
char *qs = "",
|
||||
char* bodyJavascript = "" ) ;
|
||||
|
||||
|
||||
bool printAdminTop2 ( SafeBuf *sb ,
|
||||
TcpSocket *s ,
|
||||
HttpRequest *r ,
|
||||
@ -206,32 +185,33 @@ class Pages {
|
||||
void printFormTop( SafeBuf *sb, HttpRequest *r );
|
||||
void printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r );
|
||||
|
||||
char *printAdminBottom ( char *p, char *pend, HttpRequest *r );
|
||||
char *printAdminBottom ( char *p, char *pend);
|
||||
//char *printAdminBottom ( char *p, char *pend, HttpRequest *r );
|
||||
//char *printAdminBottom ( char *p, char *pend);
|
||||
bool printAdminBottom ( SafeBuf *sb, HttpRequest *r );
|
||||
bool printAdminBottom ( SafeBuf *sb);
|
||||
bool printAdminBottom2 ( SafeBuf *sb, HttpRequest *r );
|
||||
bool printAdminBottom2 ( SafeBuf *sb);
|
||||
bool printTail ( SafeBuf* sb,
|
||||
bool isLocal );
|
||||
bool printSubmit ( SafeBuf *sb ) ;
|
||||
//long user ,
|
||||
//char *username,
|
||||
//char *pwd );
|
||||
char *printTail ( char *p ,
|
||||
char *pend ,
|
||||
bool isLocal );
|
||||
//char *printTail ( char *p ,
|
||||
// char *pend ,
|
||||
// bool isLocal );
|
||||
//long user ,
|
||||
//char *username,
|
||||
//char *pwd ) ;
|
||||
bool printColors ( SafeBuf *sb , char* bodyJavascript = "" ) ;
|
||||
char *printColors ( char *p , char *pend ,
|
||||
char* bodyJavascript = "");
|
||||
//char *printColors ( char *p , char *pend ,
|
||||
// char* bodyJavascript = "");
|
||||
|
||||
char *printColors2 ( char *p , char *pend ) ;
|
||||
//char *printColors2 ( char *p , char *pend ) ;
|
||||
bool printColors3 ( SafeBuf *sb ) ;
|
||||
char *printFocus ( char *p , char *pend ) ;
|
||||
//char *printFocus ( char *p , char *pend ) ;
|
||||
bool printLogo ( SafeBuf *sb, char *coll ) ;
|
||||
char *printLogo ( char *p , char *pend , char *coll ) ;
|
||||
//char *printLogo ( char *p , char *pend , char *coll ) ;
|
||||
bool printHostLinks ( SafeBuf *sb ,
|
||||
long page ,
|
||||
char *username ,
|
||||
@ -240,7 +220,7 @@ class Pages {
|
||||
char *pwd ,
|
||||
long fromIp ,
|
||||
char *qs = NULL ) ;
|
||||
|
||||
/*
|
||||
char *printHostLinks ( char *p ,
|
||||
char *pend ,
|
||||
long page ,
|
||||
@ -248,14 +228,12 @@ class Pages {
|
||||
char *pwd ,
|
||||
long fromIp ,
|
||||
char *qs = NULL ) ;
|
||||
*/
|
||||
bool printAdminLinks ( SafeBuf *sb,
|
||||
long page ,
|
||||
//long user ,
|
||||
char *username,
|
||||
char *coll ,
|
||||
char *pwd ,
|
||||
bool top ) ;
|
||||
|
||||
bool isBasic );
|
||||
/*
|
||||
char *printAdminLinks ( char *p ,
|
||||
char *pend ,
|
||||
long page ,
|
||||
@ -264,6 +242,7 @@ class Pages {
|
||||
char *coll ,
|
||||
char *pwd ,
|
||||
bool top ) ;
|
||||
*/
|
||||
bool printCollectionNavBar ( SafeBuf *sb ,
|
||||
long page ,
|
||||
//long user ,
|
||||
@ -271,7 +250,7 @@ class Pages {
|
||||
char *coll ,
|
||||
char *pwd ,
|
||||
char *qs );
|
||||
|
||||
/*
|
||||
char *printCollectionNavBar ( char *p ,
|
||||
char *pend ,
|
||||
long page ,
|
||||
@ -280,7 +259,7 @@ class Pages {
|
||||
char *coll ,
|
||||
char *pwd ,
|
||||
char *qs = NULL );
|
||||
|
||||
*/
|
||||
/*
|
||||
bool printRulesetDropDown ( SafeBuf *sb ,
|
||||
long user ,
|
||||
@ -321,25 +300,43 @@ enum {
|
||||
PAGE_DIRECTORY ,
|
||||
PAGE_REPORTSPAM ,
|
||||
//PAGE_WORDVECTOR ,
|
||||
|
||||
|
||||
// basic controls page /admin/basic
|
||||
PAGE_BASIC_SETTINGS ,
|
||||
PAGE_BASIC_STATUS ,
|
||||
//PAGE_BASIC_SEARCH , // TODO
|
||||
//PAGE_BASIC_DIFFBOT , // TODO
|
||||
PAGE_BASIC_SECURITY ,
|
||||
PAGE_BASIC_SEARCH ,
|
||||
|
||||
// master admin pages
|
||||
PAGE_MASTER ,
|
||||
PAGE_SEARCH ,
|
||||
PAGE_SPIDER ,
|
||||
PAGE_LOG ,
|
||||
PAGE_SECURITY ,
|
||||
PAGE_ADDCOLL ,
|
||||
PAGE_DELCOLL ,
|
||||
PAGE_REPAIR ,
|
||||
PAGE_SITES , // site filters
|
||||
PAGE_FILTERS ,
|
||||
PAGE_INJECT ,
|
||||
PAGE_ADDURL2 ,
|
||||
PAGE_REINDEX ,
|
||||
|
||||
PAGE_HOSTS ,
|
||||
PAGE_STATS , // 10
|
||||
PAGE_STATSDB ,
|
||||
PAGE_PERF ,
|
||||
PAGE_SOCKETS ,
|
||||
PAGE_LOG ,
|
||||
|
||||
PAGE_LOGVIEW ,
|
||||
// PAGE_SYNC ,
|
||||
PAGE_SECURITY ,
|
||||
PAGE_ADDCOLL ,
|
||||
PAGE_DELCOLL ,
|
||||
PAGE_AUTOBAN , // 20
|
||||
//PAGE_SPIDERLOCKS ,
|
||||
PAGE_PROFILER ,
|
||||
PAGE_THREADS ,
|
||||
PAGE_REPAIR ,
|
||||
|
||||
// PAGE_THESAURUS ,
|
||||
|
||||
// . non master-admin pages (collection controls)
|
||||
@ -352,15 +349,9 @@ enum {
|
||||
PAGE_TITLEDB ,
|
||||
//PAGE_STATSDB ,
|
||||
|
||||
PAGE_SEARCH ,
|
||||
PAGE_SPIDER ,
|
||||
PAGE_CRAWLBOT , // 35
|
||||
PAGE_SPIDERDB ,
|
||||
//PAGE_PRIORITIES , // priority queue controls
|
||||
PAGE_FILTERS ,
|
||||
PAGE_INJECT ,
|
||||
PAGE_ADDURL2 ,
|
||||
PAGE_REINDEX ,
|
||||
//PAGE_KEYWORDS ,
|
||||
PAGE_SEO ,
|
||||
PAGE_ACCESS , //40
|
||||
|
44
Parms.h
44
Parms.h
@ -24,7 +24,9 @@ enum {
|
||||
enum {
|
||||
OBJ_CONF = 1 ,
|
||||
OBJ_COLL ,
|
||||
OBJ_SI }; // SearchInput class
|
||||
OBJ_SI , // SearchInput class
|
||||
OBJ_NONE
|
||||
};
|
||||
|
||||
enum {
|
||||
TYPE_BOOL = 1 ,
|
||||
@ -56,8 +58,8 @@ enum {
|
||||
TYPE_MONOM2 ,
|
||||
TYPE_LONG_CONST ,
|
||||
TYPE_SITERULE , // 29
|
||||
TYPE_SAFEBUF
|
||||
//TYPE_DIFFBOT_DROPDOWN
|
||||
TYPE_SAFEBUF ,
|
||||
TYPE_FILEUPLOADBUTTON
|
||||
};
|
||||
|
||||
//forward decls to make compiler happy:
|
||||
@ -95,9 +97,10 @@ class Page {
|
||||
#define PF_NOSYNC 0x40
|
||||
#define PF_DIFFBOT 0x80
|
||||
|
||||
#define PF_HIDDEN 0x0100
|
||||
#define PF_NOSAVE 0x0200
|
||||
|
||||
#define PF_HIDDEN 0x0100
|
||||
#define PF_NOSAVE 0x0200
|
||||
#define PF_DUP 0x0400
|
||||
#define PF_TEXTAREA 0x0800
|
||||
|
||||
class Parm {
|
||||
public:
|
||||
@ -197,29 +200,22 @@ class Parms {
|
||||
|
||||
void init();
|
||||
|
||||
bool sendPageGeneric ( class TcpSocket *s, class HttpRequest *r,
|
||||
long page , char *cookie = NULL ,
|
||||
// Diffbot.cpp uses this to print the
|
||||
// url filters into
|
||||
SafeBuf *pageBuf = NULL ,
|
||||
// used by diffbot.cpp
|
||||
char *collOverride = NULL ,
|
||||
bool isJSON = false ) ;
|
||||
|
||||
bool sendPageGeneric2 ( class TcpSocket *s , class HttpRequest *r ,
|
||||
long page , char *coll , char *pwd ) ;
|
||||
bool sendPageGeneric ( class TcpSocket *s, class HttpRequest *r );
|
||||
|
||||
bool printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r );
|
||||
|
||||
//char *printParms (char *p, char *pend, TcpSocket *s, HttpRequest *r);
|
||||
bool printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r );
|
||||
|
||||
//char *printParms (char *p,char *pend,long page,char *username,
|
||||
// void *THIS, char *coll , char *pwd ,
|
||||
// long nc , long pd ) ;
|
||||
bool printParms (SafeBuf* sb, long page,char *username,void *THIS,
|
||||
char *coll , char *pwd , long nc , long pd ,
|
||||
bool isCrawlbot = false ,
|
||||
bool isJSON = false );
|
||||
bool printParms2 (SafeBuf* sb,
|
||||
long page,
|
||||
CollectionRec *cr,
|
||||
long nc ,
|
||||
long pd ,
|
||||
bool isCrawlbot ,
|
||||
bool isJSON,
|
||||
TcpSocket *sock
|
||||
);
|
||||
|
||||
/*
|
||||
char *printParm ( char *p ,
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user