Merge branch 'diffbot' into diffbot-testing
This commit is contained in:
commit
6e23d37e47
AutoBan.cppBigFile.cppClusterdb.cppDiskPageCache.cppHttpServer.cppLinkdb.cppMsg40.cppMsg5.cppPageCatdb.cppPageEvents.cppPageHosts.cppPageIndexdb.cppPageOverview.cppPageSockets.cppPageStats.cppPageThesaurus.cppParms.cppPosdb.cppProcess.cppProcess.hRdbBase.cppRdbBase.hRdbList.cppRebalance.cppRebalance.hRepair.cppSpider.cppTagdb.cppTest.cppThreads.cppThreads.hTitledb.cpp
coll.main.0
gb.confmain.cpp
20
AutoBan.cpp
20
AutoBan.cpp
@ -859,7 +859,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
&msecs);
|
||||
sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>"
|
||||
"<center><b>Code Usage "
|
||||
"(<a href=\"/master/"
|
||||
"(<a href=\"/admin/"
|
||||
"autoban?c=%s&resetcodes=1\">reset</a> "
|
||||
"%li days %li hours %li "
|
||||
"minutes %li sec ago)"
|
||||
@ -1271,15 +1271,15 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
// "%li days %li hrs %li min ago"
|
||||
// "</center></td>"
|
||||
|
||||
"<td><center><a href=\"/master/"
|
||||
"<td><center><a href=\"/admin/"
|
||||
"autoban?c=%s&allow=%s&showAllIps=%li\">"
|
||||
"allow/</a>"
|
||||
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&deny=%s&showAllIps=%li\">"
|
||||
"deny/</a>"
|
||||
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&clear=%s&showAllIps=%li\">"
|
||||
"clear</a></center>"
|
||||
"</td>",color,
|
||||
@ -1320,22 +1320,22 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td bgcolor=#%s><center><b>Show Ips by Number of Queries"
|
||||
"</b></center></td>",
|
||||
LIGHT_BLUE);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=0\">"
|
||||
"0 Queries</a></b>"
|
||||
"</font></center></td>",
|
||||
coll);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=1\">"
|
||||
"1 Query</a></b>"
|
||||
"</font></center></td>",
|
||||
coll);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=10\">"
|
||||
"10 Queries</a></b>"
|
||||
"</font></center></td>",
|
||||
coll);
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
|
||||
sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
|
||||
"autoban?c=%s&showAllIps=100\">"
|
||||
"100 Queries</a></b>"
|
||||
"</font></center></td></tr>",
|
||||
@ -1469,10 +1469,10 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
m_detectVals[i].m_timesBanned);
|
||||
}
|
||||
sb.safePrintf("<td><center>"
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&allow=%s&showAllIps=%li\">"
|
||||
"allow/</a>"
|
||||
"<a href=\"/master/"
|
||||
"<a href=\"/admin/"
|
||||
"autoban?c=%s&deny=%s&showAllIps=%li\">"
|
||||
"deny</a></center>"
|
||||
"</td>",
|
||||
|
16
BigFile.cpp
16
BigFile.cpp
@ -468,6 +468,9 @@ bool BigFile::readwrite ( void *buf ,
|
||||
fstate->m_callback = callback;
|
||||
fstate->m_niceness = niceness;
|
||||
fstate->m_flags = m_flags;
|
||||
// sanity
|
||||
if ( fstate->m_bytesToGo > 150000000 )
|
||||
log("file: huge read of %lli bytes",(long long)size);
|
||||
// . set our fd's before entering the thread in case RdbMerge
|
||||
// calls our unlinkPart()
|
||||
// . it's thread-UNsafe to call getfd() from within the thread
|
||||
@ -563,10 +566,12 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// request originated through Multicast, then multicast will sleep
|
||||
// and retry. Msg3 could retry, the multicast thing should be more
|
||||
// for running out of udp slots though...
|
||||
if ( g_errno && ! doWrite && g_errno != ENOTHREADSLOTS ) {
|
||||
log (LOG_INFO,"disk: May retry later.");
|
||||
return true;
|
||||
}
|
||||
// crap, call to clone() now fails a lot since we use pthreads
|
||||
// library ... so assume that is it i guess (MDW 3/15/2014)
|
||||
//if ( g_errno && ! doWrite && g_errno != ENOTHREADSLOTS ) {
|
||||
// log (LOG_INFO,"disk: May retry later.");
|
||||
// return true;
|
||||
//}
|
||||
// otherwise, thread spawn failed, do it blocking then
|
||||
g_errno = 0;
|
||||
// if threads are manually disabled don't print these msgs because
|
||||
@ -577,7 +582,8 @@ bool BigFile::readwrite ( void *buf ,
|
||||
if ( now - s_lastTime >= 1 ) {
|
||||
s_lastTime = now;
|
||||
log (LOG_INFO,
|
||||
"disk: Doing blocking disk access. This will hurt "
|
||||
"disk: Doing blocking disk access. "
|
||||
"This will hurt "
|
||||
"performance. isWrite=%li.",(long)doWrite);
|
||||
}
|
||||
}
|
||||
|
@ -395,6 +395,8 @@ bool Clusterdb::verify ( char *coll ) {
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key_t k = list.getCurrentKey();
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//unsigned long groupId = getGroupId ( RDB_CLUSTERDB , &k );
|
||||
//if ( groupId == g_hostdb.m_groupId ) got++;
|
||||
|
@ -108,7 +108,10 @@ bool DiskPageCache::init ( const char *dbname ,
|
||||
// void (*rmVfd2)(DiskPageCache*, long) ) {
|
||||
reset();
|
||||
|
||||
// fix cores while rebalancing
|
||||
// seems like we lose data when it prints "Caught add breach"
|
||||
// so let's stop using until we fix that... happens while we are
|
||||
// dumping i think and somehow the data seems to get lost that
|
||||
// we were dumping.
|
||||
//maxMem = 0;
|
||||
|
||||
m_rdbId = rdbId;
|
||||
|
@ -1903,7 +1903,7 @@ long getMsgSize ( char *buf, long bufSize, TcpSocket *s ) {
|
||||
max = 0x7fffffff; // maxOtherDocLen not available
|
||||
// if post is a /cgi/12.cgi (tagdb) allow 10 megs
|
||||
//if ( pp + 11 < ppend && strncmp ( pp ,"/cgi/12.cgi",11)==0)
|
||||
if ( pp + 11 < ppend && strncmp ( pp ,"/master/tagdb",13)==0)
|
||||
if ( pp + 12 < ppend && strncmp ( pp ,"/admin/tagdb",12)==0)
|
||||
max = 10*1024*1024;
|
||||
if ( pp + 4 < ppend && strncmp ( pp ,"/vec",4)==0)
|
||||
max = 0x7fffffff;
|
||||
|
@ -233,6 +233,8 @@ bool Linkdb::verify ( char *coll ) {
|
||||
list.skipCurrentRecord() ) {
|
||||
key224_t k;
|
||||
list.getCurrentKey((char*)&k);
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//uint32_t shardNum = getShardNum ( RDB_LINKDB , &k );
|
||||
//if ( groupId == g_hostdb.m_groupId ) got++;
|
||||
@ -2588,7 +2590,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
for ( long j = 0 ; j < MAX_ENTRY_DOCIDS ; j++ ) {
|
||||
if ( e->m_docIds[j] == -1LL ) break;
|
||||
if ( ! m_printInXml )
|
||||
m_pbuf->safePrintf ("<a href=\"/master/titledb"
|
||||
m_pbuf->safePrintf ("<a href=\"/admin/titledb"
|
||||
"?c=%s&d=%lli\">"
|
||||
"%li</a> ",
|
||||
coll,e->m_docIds[j],j);
|
||||
@ -4608,7 +4610,7 @@ bool LinkInfo::print ( SafeBuf *sb , char *coll ) {
|
||||
"<tr><td colspan=2>link #%04li "
|
||||
"("
|
||||
//"baseScore=%010li, "
|
||||
"d=<a href=\"/master/titledb?c=%s&"
|
||||
"d=<a href=\"/admin/titledb?c=%s&"
|
||||
"d=%lli\">%016lli</a>, "
|
||||
"siterank=%li, "
|
||||
"hopcount=%03li "
|
||||
|
@ -2156,7 +2156,9 @@ bool Msg40::gotSummary ( ) {
|
||||
m_docsToGet , m_msg3aRecallCnt);
|
||||
|
||||
// if we do not have enough visible, try to get more
|
||||
if ( visible < m_docsToGetVisible && m_msg3a.m_moreDocIdsAvail ) {
|
||||
if ( visible < m_docsToGetVisible && m_msg3a.m_moreDocIdsAvail &&
|
||||
// doesn't work on multi-coll just yet, it cores
|
||||
m_numCollsToSearch == 1 ) {
|
||||
// can it cover us?
|
||||
long need = m_msg3a.m_docsToGet + 20;
|
||||
// note it
|
||||
|
8
Msg5.cpp
8
Msg5.cpp
@ -722,6 +722,11 @@ bool Msg5::readList ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
// limit to 20MB so we don't go OOM!
|
||||
if ( m_newMinRecSizes > 2 * m_minRecSizes &&
|
||||
m_newMinRecSizes > 20000000 )
|
||||
m_newMinRecSizes = 20000000;
|
||||
|
||||
|
||||
QUICKPOLL((m_niceness));
|
||||
// debug msg
|
||||
@ -849,6 +854,9 @@ bool Msg5::needsRecall ( ) {
|
||||
// seems to be very common for doledb, so don't log unless extreme
|
||||
//if ( m_rdbId == RDB_DOLEDB && m_round < 15 ) logIt = false;
|
||||
if ( m_round > 100 && (m_round % 1000) != 0 ) logIt = false;
|
||||
// seems very common when doing rebalancing then merging to have
|
||||
// to do at least one round of re-reading, so note that
|
||||
if ( m_round == 0 ) logIt = false;
|
||||
if ( logIt )
|
||||
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
|
||||
"got %li) this=0x%lx round=%li.",
|
||||
|
@ -181,14 +181,14 @@ bool sendReply ( void *state ) {
|
||||
// print the generate Catdb link
|
||||
sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
|
||||
"<td><center>"
|
||||
"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
|
||||
"<a href=\"/admin/catdb?c=%s&gencatdb=2\">"
|
||||
"Update Catdb</a> "
|
||||
"</center></td></tr>",
|
||||
st->m_coll );
|
||||
sb.safePrintf ( "<tr class=poo>"
|
||||
"<td>Generate New Catdb from DMOZ data.</td>"
|
||||
"<td><center>"
|
||||
"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
|
||||
"<a href=\"/admin/catdb?c=%s&gencatdb=1\">"
|
||||
"Generate Catdb</a> "
|
||||
"</center></td></tr>",
|
||||
st->m_coll );
|
||||
|
@ -7610,8 +7610,8 @@ bool printAdminLinks ( SafeBuf &sb , State7 *st ) {
|
||||
// get the filename directly
|
||||
sb.safePrintf (" "
|
||||
"<font color=red><b>"
|
||||
//"<a href=\"/master/tagdb?f=%li&c=%s&u=%s\">"
|
||||
"<a href=\"/master/tagdb?"
|
||||
//"<a href=\"/admin/tagdb?f=%li&c=%s&u=%s\">"
|
||||
"<a href=\"/admin/tagdb?"
|
||||
//"tagid0=%li&"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
@ -7631,7 +7631,7 @@ bool printAdminLinks ( SafeBuf &sb , State7 *st ) {
|
||||
//long bannedTagId = getTagTypeFromStr("manualban",9);
|
||||
sb.safePrintf (" "
|
||||
"<font color=red><b>"
|
||||
"<a href=\"/master/tagdb?"
|
||||
"<a href=\"/admin/tagdb?"
|
||||
//"tagid0=%li&"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
@ -7876,7 +7876,7 @@ void printAdminEventOptions ( SafeBuf* sb,
|
||||
sb->safePrintf("Ban By Domain: ");
|
||||
|
||||
//long bannedTagId = getTagTypeFromStr("manualban",9);
|
||||
sb->safePrintf("<a href=\"/master/tagdb?"
|
||||
sb->safePrintf("<a href=\"/admin/tagdb?"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
"u=%s&c=%s\">"
|
||||
@ -8561,13 +8561,13 @@ static bool printResult ( CollectionRec *cr,
|
||||
// . if it's local, don't put the hostname/port in
|
||||
// there cuz it will mess up Global Spec's machine
|
||||
//if ( h->m_groupId == g_hostdb.m_groupId )
|
||||
sb.safePrintf(" - <a href=\"/master/titledb?c=%s&"
|
||||
sb.safePrintf(" - <a href=\"/admin/titledb?c=%s&"
|
||||
"d=%lli",coll,mr->m_docId);
|
||||
// then the [info] link to show the TitleRec
|
||||
sb.safePrintf ( "\">[info]</a>" );
|
||||
|
||||
// now the analyze link
|
||||
sb.safePrintf (" - <a href=\"/master/parser?c=%s&"
|
||||
sb.safePrintf (" - <a href=\"/admin/parser?c=%s&"
|
||||
"old=1&hc=%li&u=",
|
||||
coll,
|
||||
(long)mr->m_hopcount);
|
||||
@ -8629,7 +8629,7 @@ static bool printResult ( CollectionRec *cr,
|
||||
dbuf ,
|
||||
coll , dbuf );
|
||||
sb.safePrintf(" - "
|
||||
" <a href=\"/master/tagdb?"
|
||||
" <a href=\"/admin/tagdb?"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
"u=%s&c=%s\">"
|
||||
@ -8641,7 +8641,7 @@ static bool printResult ( CollectionRec *cr,
|
||||
memcpy ( dbuf , uu.getHost() , dlen );
|
||||
dbuf [ dlen ] = '\0';
|
||||
sb.safePrintf(" - "
|
||||
" <a href=\"/master/tagdb?"
|
||||
" <a href=\"/admin/tagdb?"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
"u=%s&c=%s\">"
|
||||
@ -17616,7 +17616,7 @@ bool gotCaptchaReply ( State9 *st9 , TcpSocket *s ) {
|
||||
if ( st9->m_isAdmin && 1 == 2) {
|
||||
SafeBuf ttt;
|
||||
ttt.safePrintf("<br>"
|
||||
"<a href=/master/parser?"
|
||||
"<a href=/admin/parser?"
|
||||
//"user=mwells&pwd=mwell62&"
|
||||
"c=%s&u=%s&content=",
|
||||
st9->m_coll,
|
||||
|
@ -131,7 +131,7 @@ skipReplaceHost:
|
||||
if ( g_conf.m_useShotgun ) {
|
||||
colspan = "31";
|
||||
//shotcol = "<td><b>ip2</b></td>";
|
||||
sprintf ( shotcol, "<td><a href=\"/master/hosts?c=%s"
|
||||
sprintf ( shotcol, "<td><a href=\"/admin/hosts?c=%s"
|
||||
"&sort=2\">"
|
||||
"<b>ping2</b></td></a>",
|
||||
coll);
|
||||
@ -143,12 +143,12 @@ skipReplaceHost:
|
||||
"<tr><td colspan=%s><center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Hosts "
|
||||
"(<a href=\"/master/hosts?c=%s&sort=%li&reset=1\">"
|
||||
"(<a href=\"/admin/hosts?c=%s&sort=%li&reset=1\">"
|
||||
"reset)</b>"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=0\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=0\">"
|
||||
|
||||
"<b>hostId</b></td>"
|
||||
"<td><b>host ip</b></td>"
|
||||
@ -188,52 +188,52 @@ skipReplaceHost:
|
||||
//"<td><b>resends sent</td>"
|
||||
//"<td><b>errors recvd</td>"
|
||||
//"<td><b>ETRYAGAINS recvd</td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=3\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=3\">"
|
||||
"<b>dgrams resent</a></td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=4\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=4\">"
|
||||
"<b>errors recvd</a></td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=5\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=5\">"
|
||||
"<b>ETRY AGAINS recvd</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=6\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=6\">"
|
||||
"<b>dgrams to</a></td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=7\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=7\">"
|
||||
"<b>dgrams from</a></td>"
|
||||
|
||||
//"<td><a href=\"/master/hosts?c=%s&sort=8\">"
|
||||
//"<td><a href=\"/admin/hosts?c=%s&sort=8\">"
|
||||
//"<b>loadavg</a></td>"
|
||||
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=13\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=13\">"
|
||||
"<b>avg split time</a></td>"
|
||||
|
||||
"<td><b>splits done</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=12\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=12\">"
|
||||
"<b>status</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=15\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=15\">"
|
||||
"<b>slow reads</a></td>"
|
||||
|
||||
"<td><b>docs indexed</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=9\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=9\">"
|
||||
"<b>mem used</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=10\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=10\">"
|
||||
"<b>cpu</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=17\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=17\">"
|
||||
"<b>disk</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=14\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=14\">"
|
||||
"<b>max ping1</a></td>"
|
||||
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=11\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=11\">"
|
||||
"<b>ping1 age</a></td>"
|
||||
|
||||
//"<td><b>ip1</td>"
|
||||
"<td><a href=\"/master/hosts?c=%s&sort=1\">"
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=1\">"
|
||||
"<b>ping1</a></td>"
|
||||
|
||||
"%s"// "<td><b>ip2</td>"
|
||||
@ -452,7 +452,7 @@ skipReplaceHost:
|
||||
// print it
|
||||
sb.safePrintf (
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><a href=\"http://%s:%hi/master/hosts?"
|
||||
"<td><a href=\"http://%s:%hi/admin/hosts?"
|
||||
""
|
||||
"c=%s"
|
||||
"&sort=%li\">%li</a></td>"
|
||||
@ -711,7 +711,7 @@ skipReplaceHost:
|
||||
sb.safePrintf (
|
||||
"<tr bgcolor=#%s>"
|
||||
|
||||
"<td><a href=\"http://%s:%hi/master/hosts?"
|
||||
"<td><a href=\"http://%s:%hi/admin/hosts?"
|
||||
""
|
||||
"c=%s\">"
|
||||
"%li</a></td>"
|
||||
|
@ -535,8 +535,8 @@ bool gotIndexList2 ( void *state , RdbList *list ) {
|
||||
"<tr><td>%li.</td>"
|
||||
"<td>%s%i</td>"
|
||||
"<td>"
|
||||
//"<a href=http://%s:%hu/master/titledb?d=%llu>"
|
||||
"<a href=/master/titledb?c=%s&d=%llu>"
|
||||
//"<a href=http://%s:%hu/admin/titledb?d=%llu>"
|
||||
"<a href=/admin/titledb?c=%s&d=%llu>"
|
||||
"%llu"
|
||||
//"<td><a href=/cgi/4.cgi?d=%llu>%llu"
|
||||
"</td>"
|
||||
@ -602,8 +602,8 @@ bool gotIndexList2 ( void *state , RdbList *list ) {
|
||||
"<td>%llu</td>"
|
||||
"<td>%lu</td><td>%i</td>"
|
||||
"<td>"
|
||||
//"<a href=http://%s:%hu/master/titledb?d=%llu>"
|
||||
"<a href=/master/titledb?c=%s&d=%llu>"
|
||||
//"<a href=http://%s:%hu/admin/titledb?d=%llu>"
|
||||
"<a href=/admin/titledb?c=%s&d=%llu>"
|
||||
"%llu"
|
||||
//"<td><a href=/cgi/4.cgi?d=%llu>%llu"
|
||||
"</td></tr>\n" ,
|
||||
|
@ -1451,8 +1451,8 @@ bool sendPageOverview ( TcpSocket *s , HttpRequest *r ) {
|
||||
"You can specify different indexing and spider parameters on a per URL basis by one or more of the following methods:\n"
|
||||
"<br><br>\n"
|
||||
"<ul>\n"
|
||||
"<li>Using the <a href=\"/master/tagdb\">tagdb interface</a>, you can assign a <a href=#ruleset>ruleset</a> to a set of sites. All you do is provide Gigablast with a list of sites and the ruleset to use for those sites.\n"
|
||||
"You can enter the sites via the <a href=\"/master/tagdb\">HTML form</a> or you can provide Gigablast with a file of the sites. Each file must be limited to 1 Megabyte, but you can add hundreds of millions of sites. \n"
|
||||
"<li>Using the <a href=\"/admin/tagdb\">tagdb interface</a>, you can assign a <a href=#ruleset>ruleset</a> to a set of sites. All you do is provide Gigablast with a list of sites and the ruleset to use for those sites.\n"
|
||||
"You can enter the sites via the <a href=\"/admin/tagdb\">HTML form</a> or you can provide Gigablast with a file of the sites. Each file must be limited to 1 Megabyte, but you can add hundreds of millions of sites. \n"
|
||||
"Sites can be full URLs, hostnames, domain names or IP addresses.\n"
|
||||
"If you add a site which is just a canonical domain name with no explicit host name, like gigablast.com, then any URL with the same domain name, regardless of its host name will match that site. That is, \"hostname.gigablast.com\" will match the site \"gigablast.com\" and therefore be assigned the associated ruleset.\n"
|
||||
"Sites may also use IP addresses instead of domain names. If the least significant byte of an IP address that you submit to tagdb is 0 then any URL with the same top 3 IP bytes as that IP will be considered a match.\n"
|
||||
@ -1917,7 +1917,7 @@ bool sendPageOverview ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<br>\n"
|
||||
"After the base score is computed, it is multiplied by the number of occurences of the word or phrase in the portion of the document being indexed as specified by the index rule. This score may then be reduced if spam detection occurred and the word or phrase was deemed repetitious. Spam detection is triggered when the quality of the document is at or below the value specified in the <minQualityForSpamDetect> tag in the index rule. Finally, the score is mapped into an 8 bit value, from 1 to 255, and stored in the index."
|
||||
"<br><br>\n"
|
||||
"To see the scoring algorithm in action you can use the <b><a href=\"/master/parser\">Parser Tool</a></b>. It will show each indexed word and phrase and its associated score, as well as some attributes associated with the indexed document."
|
||||
"To see the scoring algorithm in action you can use the <b><a href=\"/admin/parser\">Parser Tool</a></b>. It will show each indexed word and phrase and its associated score, as well as some attributes associated with the indexed document."
|
||||
""
|
||||
"<br>\n"
|
||||
"<br>\n"
|
||||
|
@ -537,7 +537,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
long dlen;
|
||||
char *dbuf = ::getDomFast ( hostname,&dlen,false);
|
||||
p->safePrintf(
|
||||
" <a href=\"/master/tagdb?"
|
||||
" <a href=\"/admin/tagdb?"
|
||||
"user=admin&"
|
||||
"tagtype0=manualban&"
|
||||
"tagdata0=1&"
|
||||
|
@ -620,7 +620,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td colspan=50>"
|
||||
"<center><b>Spider Compression Proxy Stats</b> "
|
||||
|
||||
" [<a href=\"/master/stats?reset=2\">"
|
||||
" [<a href=\"/admin/stats?reset=2\">"
|
||||
"reset</a>]</td></tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
@ -828,7 +828,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td colspan=50>"
|
||||
"<center><b>Message Stats</b> "
|
||||
|
||||
" [<a href=\"/master/stats?reset=1\">"
|
||||
" [<a href=\"/admin/stats?reset=1\">"
|
||||
"reset</a>]</td></tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
|
@ -284,8 +284,8 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">"
|
||||
"rebuild all data</a> <a href=\"/master/thesaurus?"
|
||||
"<center><b><a href=\"/admin/thesaurus?rebuild=1&%s\">"
|
||||
"rebuild all data</a> <a href=\"/admin/thesaurus?"
|
||||
"rebuild=1&full=1&%s\">(full)</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf, getBuf);
|
||||
@ -300,7 +300,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?distribute=1&%s\">"
|
||||
"<center><b><a href=\"/admin/thesaurus?distribute=1&%s\">"
|
||||
"distribute data</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -314,7 +314,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b>"
|
||||
"<a href=\"/master/thesaurus?reload=1&cast=0&%s\">"
|
||||
"<a href=\"/admin/thesaurus?reload=1&cast=0&%s\">"
|
||||
"reload data</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -328,7 +328,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b>"
|
||||
"<a href=\"/master/thesaurus?reload=1&cast=1&%s\">"
|
||||
"<a href=\"/admin/thesaurus?reload=1&cast=1&%s\">"
|
||||
"reload data (all hosts)</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -342,7 +342,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%%>"
|
||||
"<form action=\"/master/thesaurus>\">"
|
||||
"<form action=\"/admin/thesaurus>\">"
|
||||
"<input type=text name=synonym size=20>"
|
||||
"<input type=submit value=Submit>"
|
||||
"%s"
|
||||
@ -365,7 +365,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?cancel=1&%s\">"
|
||||
"<center><b><a href=\"/admin/thesaurus?cancel=1&%s\">"
|
||||
"cancel running rebuild</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf);
|
||||
@ -380,8 +380,8 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td width=12%% bgcolor=#0000ff>"
|
||||
"<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">"
|
||||
"rebuild affinity</a> <a href=\"/master/thesaurus?"
|
||||
"<center><b><a href=\"/admin/thesaurus?rebuildaff=1&%s\">"
|
||||
"rebuild affinity</a> <a href=\"/admin/thesaurus?"
|
||||
"rebuildaff=1&full=1&%s\">(full)</a></b></center>"
|
||||
"</td>"
|
||||
"</tr>\n", getBuf, getBuf);
|
||||
@ -405,7 +405,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"character, optionally followed by another pipe and a type "
|
||||
"designation; any badly formatted lines will be silently "
|
||||
"ignored</font><br>\n"
|
||||
"<form action=\"/master/thesaurus\" method=post>"
|
||||
"<form action=\"/admin/thesaurus\" method=post>"
|
||||
"<textarea name=\"manualadd\" rows=20 cols=80>");
|
||||
|
||||
if (manualAdd && manualAddLen) {
|
||||
@ -434,7 +434,7 @@ bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
|
||||
"that these pairs will only work if the thesaurus otherwise "
|
||||
"has an entry for them, so add them to the manual add file "
|
||||
"above if need be</font><br>\n"
|
||||
"<form action=\"/master/thesaurus\" method=post>"
|
||||
"<form action=\"/admin/thesaurus\" method=post>"
|
||||
"<textarea name=\"affinityadd\" rows=20 cols=80>");
|
||||
|
||||
if (affinityAdd && affinityAddLen) {
|
||||
|
15
Parms.cpp
15
Parms.cpp
@ -16794,14 +16794,15 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
|
||||
if ( strncmp(path,"/crawlbot",9) == 0 ) customCrawl = 1;
|
||||
if ( strncmp(path,"/v2/crawl",9) == 0 ) customCrawl = 1;
|
||||
if ( strncmp(path,"/v2/bulk" ,8) == 0 ) customCrawl = 2;
|
||||
if (cr) {
|
||||
// throw error if collection record custom crawl type doesn't equal the crawl type of current request
|
||||
if (customCrawl != cr->m_isCustomCrawl) {
|
||||
g_errno = ECUSTOMCRAWLMISMATCH;
|
||||
return false;
|
||||
|
||||
// throw error if collection record custom crawl type doesn't equal
|
||||
// the crawl type of current request
|
||||
if (cr && customCrawl && customCrawl != cr->m_isCustomCrawl ) {
|
||||
g_errno = ECUSTOMCRAWLMISMATCH;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
bool hasAddCrawl = hr->hasField("addCrawl");
|
||||
|
||||
bool hasAddCrawl = hr->hasField("addCrawl");
|
||||
bool hasAddBulk = hr->hasField("addBulk");
|
||||
bool hasAddColl = hr->hasField("addColl");
|
||||
// sometimes they try to delete a collection that is not there so do
|
||||
|
@ -289,6 +289,8 @@ bool Posdb::verify ( char *coll ) {
|
||||
list.skipCurrentRecord() ) {
|
||||
key144_t k;
|
||||
list.getCurrentKey(&k);
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
|
||||
//unsigned long groupId = getGroupId ( RDB_POSDB , &k );
|
||||
|
19
Process.cpp
19
Process.cpp
@ -403,6 +403,7 @@ Process::Process ( ) {
|
||||
bool Process::init ( ) {
|
||||
// -1 means unknown
|
||||
m_diskUsage = -1.0;
|
||||
m_diskAvail = -1LL;
|
||||
// we do not know if the fans are turned off or on
|
||||
m_currentFanState = -1;
|
||||
m_threadOut = false;
|
||||
@ -877,12 +878,14 @@ void hdtempDoneWrapper ( void *state , ThreadEntry *t ) {
|
||||
|
||||
|
||||
// set Process::m_diskUsage
|
||||
float getDiskUsage ( ) {
|
||||
float getDiskUsage ( long long *diskAvail ) {
|
||||
|
||||
// first get disk usage now
|
||||
char cmd[10048];
|
||||
char *out = "/tmp/diskusage";
|
||||
snprintf(cmd,10000,"df -ka %s | tail -1 | awk '{print $5}' > %s",
|
||||
char out[1024];
|
||||
sprintf(out,"%sdiskusage",g_hostdb.m_dir);
|
||||
snprintf(cmd,10000,"df -ka %s | tail -1 | "
|
||||
"awk '{print $4\" \"$5}' > %s",
|
||||
g_hostdb.m_dir,
|
||||
out);
|
||||
int err = system ( cmd );
|
||||
@ -897,7 +900,7 @@ float getDiskUsage ( ) {
|
||||
}
|
||||
|
||||
// read in temperatures from file
|
||||
int fd = open ( "/tmp/diskusage" , O_RDONLY );
|
||||
int fd = open ( out , O_RDONLY );
|
||||
if ( fd < 0 ) {
|
||||
//m_errno = errno;
|
||||
log("build: Could not open %s for reading: %s.",
|
||||
@ -917,17 +920,19 @@ float getDiskUsage ( ) {
|
||||
close ( fd );
|
||||
|
||||
float usage;
|
||||
sscanf(buf,"%f",&usage);
|
||||
long long avail;
|
||||
sscanf(buf,"%lli %f",&avail,&usage);
|
||||
// it is in KB so make it into bytes
|
||||
if ( diskAvail ) *diskAvail = avail * 1000LL;
|
||||
return usage;
|
||||
}
|
||||
|
||||
|
||||
// . sets m_errno on error
|
||||
// . taken from Msg16.cpp
|
||||
void *hdtempStartWrapper_r ( void *state , ThreadEntry *t ) {
|
||||
|
||||
// run the df -ka cmd
|
||||
g_process.m_diskUsage = getDiskUsage();
|
||||
g_process.m_diskUsage = getDiskUsage( &g_process.m_diskAvail );
|
||||
|
||||
|
||||
// ignore temps now. ssds don't have it
|
||||
|
@ -93,6 +93,7 @@ class Process {
|
||||
long m_currentFanState;
|
||||
long m_desiredFanState;
|
||||
float m_diskUsage;
|
||||
long long m_diskAvail;
|
||||
};
|
||||
|
||||
extern Process g_process;
|
||||
|
107
RdbBase.cpp
107
RdbBase.cpp
@ -933,13 +933,16 @@ bool RdbBase::incorporateMerge ( ) {
|
||||
long b = m_mergeStartFileNum + m_numFilesToMerge;
|
||||
// shouldn't be called if no files merged
|
||||
if ( a == b ) {
|
||||
// unless resuming after a merge completed and we exited
|
||||
// but forgot to finish renaming the final file!!!!
|
||||
log("merge: renaming final file");
|
||||
// decrement this count
|
||||
if ( m_isMerging ) m_rdb->m_numMergesOut--;
|
||||
// exit merge mode
|
||||
m_isMerging = false;
|
||||
// return the merge token, no need for a callback
|
||||
g_msg35.releaseToken ( );
|
||||
return true;
|
||||
//return true;
|
||||
}
|
||||
// file #x is the merge file
|
||||
long x = a - 1;
|
||||
@ -1033,6 +1036,9 @@ bool RdbBase::incorporateMerge ( ) {
|
||||
|
||||
// on success unlink the files we merged and free them
|
||||
for ( long i = a ; i < b ; i++ ) {
|
||||
// incase we are starting with just the
|
||||
// linkdb0001.003.dat file and not the stuff we merged
|
||||
if ( ! m_files[i] ) continue;
|
||||
// debug msg
|
||||
log(LOG_INFO,"merge: Unlinking merged file %s (#%li).",
|
||||
m_files[i]->getFilename(),i);
|
||||
@ -1413,6 +1419,15 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
|
||||
m_minToMerge = 4;
|
||||
if ( cr && m_rdb == g_tagdb.getRdb() )
|
||||
m_minToMerge = 2;//cr->m_tagdbMinFilesToMerge;
|
||||
|
||||
// if we are reblancing this coll then keep merges tight so all
|
||||
// the negative recs annihilate with the positive recs to free
|
||||
// up disk space since we could be short on disk space.
|
||||
//if ( g_rebalance.m_isScanning &&
|
||||
// // if might have moved on if not able to merge because
|
||||
// // another was merging... so do this anyway...
|
||||
// g_rebalance.m_collnum == m_collnum )
|
||||
// m_minToMerge = 2;
|
||||
|
||||
|
||||
// secondary rdbs are used for rebuilding, so keep their limits high
|
||||
@ -1467,6 +1482,13 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
|
||||
m_dbname);
|
||||
g_numUrgentMerges++;
|
||||
}
|
||||
|
||||
|
||||
// tfndb has his own merge class since titledb merges write tfndb recs
|
||||
RdbMerge *m = &g_merge;
|
||||
if ( m->isMerging() )
|
||||
return;
|
||||
|
||||
// if we are tfndb and someone else is merging, do not merge unless
|
||||
// we have 3 or more files
|
||||
long minToMerge = m_minToMerge;
|
||||
@ -1486,6 +1508,31 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
|
||||
resuming = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// what percent of recs in the collections' rdb are negative?
|
||||
// the rdbmaps hold this info
|
||||
float percentNegativeRecs = getPercentNegativeRecsOnDisk ( );
|
||||
// 1. if disk space is tight and >20% negative recs, force it
|
||||
if ( g_process.m_diskAvail >= 0 &&
|
||||
g_process.m_diskAvail < 10000000000LL && // 10GB
|
||||
percentNegativeRecs > .20 ) {
|
||||
m_nextMergeForced = true;
|
||||
forceMergeAll = true;
|
||||
log("rdb: hit negative rec concentration of %.01f for "
|
||||
"collnum %li on db %s when diskAvail=%lli bytes",
|
||||
percentNegativeRecs,(long)m_collnum,m_rdb->m_dbname,
|
||||
g_process.m_diskAvail);
|
||||
}
|
||||
// 2. if >40% negative recs force it
|
||||
if ( percentNegativeRecs > .40 ) {
|
||||
m_nextMergeForced = true;
|
||||
forceMergeAll = true;
|
||||
log("rdb: hit negative rec concentration of %.01f for "
|
||||
"collnum %li on db %s",
|
||||
percentNegativeRecs,(long)m_collnum,m_rdb->m_dbname);
|
||||
}
|
||||
|
||||
|
||||
// . don't merge if we don't have the min # of files
|
||||
// . but skip this check if there is a merge to be resumed from b4
|
||||
if ( ! resuming && ! forceMergeAll && numFiles < minToMerge ) return;
|
||||
@ -1719,11 +1766,49 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
"original %li files.",mm,n);
|
||||
// how many files to merge?
|
||||
n = mm;
|
||||
// allow a single file to continue merging if the other
|
||||
// file got merged out already
|
||||
if ( mm > 0 ) overide = true;
|
||||
|
||||
// if we've already merged and already unlinked, then the
|
||||
// process exited, now we restart with just the final
|
||||
// merged file and we need to do the rename
|
||||
if ( mm == 0 ) {
|
||||
m_isMerging = false;
|
||||
// make a fake file before us that we were merging
|
||||
// since it got nuked on disk
|
||||
//incorporateMerge();
|
||||
char fbuf[256];
|
||||
sprintf(fbuf,"%s%04li.dat",m_dbname,mergeFileId-1);
|
||||
if ( m_isTitledb )
|
||||
sprintf(fbuf,"%s%04li-%03li.dat",
|
||||
m_dbname,mergeFileId-1,id2);
|
||||
log("merge: renaming final merged file %s",fbuf);
|
||||
m_files[j]->rename(fbuf);
|
||||
sprintf(fbuf,"%s%04li.map",m_dbname,mergeFileId-1);
|
||||
//File *mf = m_maps[j]->getFile();
|
||||
m_maps[j]->rename(fbuf);
|
||||
log("merge: renaming final merged file %s",fbuf);
|
||||
return;
|
||||
}
|
||||
|
||||
// resume the merging
|
||||
goto startMerge;
|
||||
}
|
||||
|
||||
minToMerge = m_minToMerge;
|
||||
|
||||
|
||||
// if we are rebalancing this coll then keep merges tight so all
|
||||
// the negative recs annihilate with the positive recs to free
|
||||
// up disk space since we could be short on disk space.
|
||||
//if ( g_rebalance.m_isScanning &&
|
||||
// // if might have moved on if not able to merge because
|
||||
// // another was merging... so do this anyway...
|
||||
// g_rebalance.m_collnum == m_collnum )
|
||||
// minToMerge = 2;
|
||||
|
||||
|
||||
//if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
|
||||
// minToMerge = 3;
|
||||
|
||||
@ -1772,6 +1857,9 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
//smini = -1;
|
||||
// but if we are forcing then merge ALL, except one being dumped
|
||||
if ( m_nextMergeForced ) n = numFiles;
|
||||
// or if doing rebalancing, merge them all. tight merge
|
||||
//if ( g_rebalance.m_isScanning && g_rebalance.m_collnum == m_collnum)
|
||||
// n = numFiles;
|
||||
//else if ( m_isTitledb ) {
|
||||
// RdbBase *base = g_tfndb.getRdb()->m_bases[m_collnum];
|
||||
// tfndbSize = base->getDiskSpaceUsed();
|
||||
@ -2305,6 +2393,10 @@ bool RdbBase::verifyFileSharding ( ) {
|
||||
list.skipCurrentRecord() ) {
|
||||
//key144_t k;
|
||||
list.getCurrentKey(k);
|
||||
|
||||
// skip negative keys
|
||||
if ( (k[0] & 0x01) == 0x00 ) continue;
|
||||
|
||||
count++;
|
||||
//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
|
||||
//unsigned long groupId = getGroupId ( RDB_POSDB , &k );
|
||||
@ -2349,4 +2441,15 @@ bool RdbBase::verifyFileSharding ( ) {
|
||||
//return true;
|
||||
}
|
||||
|
||||
|
||||
float RdbBase::getPercentNegativeRecsOnDisk ( ) {
|
||||
// scan the maps
|
||||
long long numPos = 0LL;
|
||||
long long numNeg = 0LL;
|
||||
for ( long i = 0 ; i < m_numFiles ; i++ ) {
|
||||
numPos += m_maps[i]->getNumPositiveRecs();
|
||||
numNeg += m_maps[i]->getNumNegativeRecs();
|
||||
}
|
||||
long long total = numPos + numNeg;
|
||||
float percent = (float)numNeg / (float)total;
|
||||
return percent;
|
||||
}
|
||||
|
@ -168,6 +168,8 @@ class RdbBase {
|
||||
|
||||
//RdbMem *getRdbMem () { return &m_mem; };
|
||||
|
||||
float getPercentNegativeRecsOnDisk ( ) ;
|
||||
|
||||
// how much mem is alloced for our maps?
|
||||
long long getMapMemAlloced ();
|
||||
|
||||
|
@ -3039,7 +3039,7 @@ bool RdbList::posdbMerge_r ( RdbList **lists ,
|
||||
if ( maxPtr > m_alloc + m_allocSize ) maxPtr = m_alloc + m_allocSize;
|
||||
|
||||
// debug note
|
||||
if ( m_listSize )
|
||||
if ( m_listSize && g_conf.m_logDebugBuild )
|
||||
log(LOG_LOGIC,"db: storing recs in a non-empty list for merge"
|
||||
" probably from recall from negative key loss");
|
||||
|
||||
|
@ -21,8 +21,9 @@
|
||||
Rebalance g_rebalance;
|
||||
|
||||
Rebalance::Rebalance ( ) {
|
||||
m_registered = false;
|
||||
m_allowSave = false;
|
||||
m_inRebalanceLoop = false;
|
||||
//m_inRebalanceLoop = false;
|
||||
m_numForeignRecs = 0;
|
||||
m_rebalanceCount = 0LL;
|
||||
m_scannedCount = 0LL;
|
||||
@ -225,6 +226,15 @@ void Rebalance::scanLoop ( ) {
|
||||
m_lastRdb = rdb;
|
||||
// reset key cursor as well!!!
|
||||
KEYMIN ( m_nextKey , MAX_KEY_BYTES );
|
||||
|
||||
// This logic now in RdbBase.cpp.
|
||||
// let's keep posdb and titledb tight-merged so
|
||||
// we do not run out of disk space because we
|
||||
// will be dumping tons of negative recs
|
||||
//RdbBase *base = rdb->getBase(m_collnum);
|
||||
//base->m_savedMin = base->m_minFilesToMerge;
|
||||
//base->m_minFilesToMerge = 2;
|
||||
|
||||
}
|
||||
// percent update?
|
||||
long percent = (unsigned char)m_nextKey[rdb->m_ks-1];
|
||||
@ -245,6 +255,12 @@ void Rebalance::scanLoop ( ) {
|
||||
m_rebalanceCount = 0;
|
||||
m_scannedCount = 0;
|
||||
m_lastPercent = -1;
|
||||
|
||||
// This logic now in RdbBase.cpp.
|
||||
// go back to normal merge threshold
|
||||
//RdbBase *base = rdb->getBase(m_collnum);
|
||||
//base->m_minFilesToMerge = base->m_savedMin;
|
||||
|
||||
}
|
||||
// reset it for next colls
|
||||
m_rdbNum = 0;
|
||||
@ -310,6 +326,11 @@ static void gotListWrapper ( void *state , RdbList *list, Msg5 *msg5 ) {
|
||||
g_rebalance.scanLoop();
|
||||
}
|
||||
|
||||
// . sleep callback that re-enters the rebalance scan loop
// . neither "fd" nor "state" is used
void sleepWrapper ( int fd , void *state ) {
	// we backed off last time because a merge was running, so retry now
	g_rebalance.scanLoop ( );
}
|
||||
|
||||
bool Rebalance::scanRdb ( ) {
|
||||
|
||||
// get collrec i guess
|
||||
@ -317,6 +338,40 @@ bool Rebalance::scanRdb ( ) {
|
||||
|
||||
Rdb *rdb = g_process.m_rdbs[m_rdbNum];
|
||||
|
||||
// unregister it if it was registered
|
||||
if ( m_registered ) {
|
||||
g_loop.unregisterSleepCallback ( NULL,sleepWrapper );
|
||||
m_registered = false;
|
||||
}
|
||||
|
||||
if ( g_process.m_mode == EXIT_MODE ) return false;
|
||||
|
||||
// . if this rdb is merging wait until merge is done
|
||||
// . we will be dumping out a lot of negative recs and if we are
|
||||
// short on disk space we need to merge them in immediately with
|
||||
// all our data so that they annihilate quickly with the positive
|
||||
// keys in there to free up more disk
|
||||
RdbBase *base = rdb->getBase ( m_collnum );
|
||||
// base is NULL for like monitordb...
|
||||
if ( base && base->isMerging() ) {
|
||||
log("rebal: waiting for merge on %s for coll #%li to complete",
|
||||
rdb->m_dbname,(long)m_collnum);
|
||||
g_loop.registerSleepCallback ( 1000,NULL,sleepWrapper,1);
|
||||
m_registered = true;
|
||||
// we blocked, return false
|
||||
return false;
|
||||
}
|
||||
// or really if any merging is going on wait for it to save disk space
|
||||
if ( rdb->isMerging() ) {
|
||||
log("rebal: waiting for merge on %s for coll ??? to complete",
|
||||
rdb->m_dbname);
|
||||
g_loop.registerSleepCallback ( 1000,NULL,sleepWrapper,1);
|
||||
m_registered = true;
|
||||
// we blocked, return false
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// skip empty collrecs, unless like statsdb or something
|
||||
//if ( ! cr && ! rdb->m_isCollectionLess ) return true;
|
||||
|
||||
|
@ -23,7 +23,7 @@ class Rebalance {
|
||||
bool gotList ( ) ;
|
||||
bool saveRebalanceFile ( ) ;
|
||||
|
||||
bool m_inRebalanceLoop;
|
||||
//bool m_inRebalanceLoop;
|
||||
long m_numForeignRecs;
|
||||
long long m_rebalanceCount;
|
||||
long long m_scannedCount;
|
||||
@ -43,6 +43,7 @@ class Rebalance {
|
||||
long m_blocked;
|
||||
bool m_allowSave;
|
||||
|
||||
bool m_registered;
|
||||
RdbList m_list;
|
||||
SafeBuf m_posMetaList;
|
||||
SafeBuf m_negMetaList;
|
||||
|
@ -2289,7 +2289,7 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td width=50%%><b>host ID with min repair mode"
|
||||
"</b></td>"
|
||||
"<td><a href=\"http://%s:%hu/master/repair\">"
|
||||
"<td><a href=\"http://%s:%hu/admin/repair\">"
|
||||
"%li</a></td></tr>\n"
|
||||
|
||||
"<tr bgcolor=#%s><td><b>old collection</b></td>"
|
||||
|
@ -4124,6 +4124,14 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
continue;
|
||||
if ( sreq->m_hopCount < m_tailHopCount )
|
||||
goto gotNewWinner;
|
||||
// if hopcounts tied prefer the unindexed doc
|
||||
// i don't think we need this b/c spidertimems
|
||||
// for new docs should be less than old docs...
|
||||
// TODO: verify that
|
||||
//if ( sreq->m_isIndexed && ! m_tailIsIndexed )
|
||||
// continue;
|
||||
//if ( ! sreq->m_isIndexed && m_tailIsIndexed )
|
||||
// goto gotNewWinner;
|
||||
// if tied, use actual times. assuming both<nowGlobalMS
|
||||
if ( spiderTimeMS > m_tailTimeMS )
|
||||
continue;
|
||||
|
@ -1921,6 +1921,8 @@ bool Tagdb::verify ( char *coll ) {
|
||||
//key128_t k = list.getCurrentKey();
|
||||
key128_t k;
|
||||
list.getCurrentKey ( &k );
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
// see if it is the "old" school tagdb rec
|
||||
//char *data = list.getCurrentData();
|
||||
|
2
Test.cpp
2
Test.cpp
@ -586,7 +586,7 @@ void Test::stopIt ( ) {
|
||||
// link to page parser
|
||||
char ubuf[2000];
|
||||
urlEncode(ubuf,2000,u,gbstrlen(u),true);
|
||||
tmp.safePrintf(" <a href=\"/master/parser?c=test&"
|
||||
tmp.safePrintf(" <a href=\"/admin/parser?c=test&"
|
||||
"u=%s\">parser</a> ",ubuf);
|
||||
//tmp.safePrintf(" (%llu)",h);
|
||||
tmp.safePrintf("<br>\n");
|
||||
|
23
Threads.cpp
23
Threads.cpp
@ -464,7 +464,12 @@ bool Threads::call ( char type ,
|
||||
// . try to launch as many threads as we can
|
||||
// . this sets g_errno on error
|
||||
// . if it has an error, just ignore it, our thread is queued
|
||||
m_threadQueues[i].launchThread ( t ) ;
|
||||
m_threadQueues[i].launchThread2 ( NULL );
|
||||
//if ( ! m_threadQueues[i].launchThread2 ( t ) && g_errno ) {
|
||||
// log("thread: failed thread launch: %s",mstrerror(g_errno));
|
||||
// return false;
|
||||
//}
|
||||
|
||||
// return false if there was an error launching the thread
|
||||
//if ( g_errno ) return false;
|
||||
// clear g_errno
|
||||
@ -512,7 +517,7 @@ long Threads::launchThreads ( ) {
|
||||
// clear g_errno
|
||||
g_errno = 0;
|
||||
// launch as many threads as we can from queue #i
|
||||
while ( m_threadQueues[i].launchThread ( ) ) numLaunched++;
|
||||
while ( m_threadQueues[i].launchThread2(NULL) ) numLaunched++;
|
||||
// continue if no g_errno set
|
||||
if ( ! g_errno ) continue;
|
||||
// otherwise bitch about it
|
||||
@ -1596,7 +1601,7 @@ long Threads::getNumActiveHighPriorityThreads() {
|
||||
// . sets g_errno on error
|
||||
// . don't launch a low priority thread if a high priority thread is running
|
||||
// . i.e. don't launch a high niceness thread if a low niceness is running
|
||||
bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
bool ThreadQueue::launchThread2 ( ThreadEntry *te ) {
|
||||
// debug msg
|
||||
//log("trying to launch for type=%li",(long)m_threadType);
|
||||
// clean up any threads that have exited
|
||||
@ -2151,13 +2156,23 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
mfree ( fs->m_allocBuf , fs->m_allocSize , "ThreadReadBuf" );
|
||||
fs->m_buf = NULL;
|
||||
}
|
||||
|
||||
// i'm not sure return value matters at this point? the thread
|
||||
// is queued and hopefully will launch at some point
|
||||
return false;
|
||||
|
||||
// if this is the direct thread request do not call callback, just
|
||||
// return false
|
||||
// return false, otherwise we get into an unexpected loop thingy
|
||||
if ( t == te )
|
||||
return log("thread: Returning false.");
|
||||
// do it blocking
|
||||
log("thread: Calling without thread. This will crash many times. "
|
||||
"Please fix it.");
|
||||
// return false so caller will re-do without thread!
|
||||
// so BigFile::readwrite() will retry without thread and we won't
|
||||
// get into a weird loop thingy
|
||||
if ( te ) return false;
|
||||
|
||||
// unsigned long long profilerStart,profilerEnd;
|
||||
// unsigned long long statStart,statEnd;
|
||||
|
||||
|
@ -139,7 +139,7 @@ class ThreadQueue {
|
||||
|
||||
// . launch a thread from our queue
|
||||
// . returns false and sets errno on error
|
||||
bool launchThread ( ThreadEntry *te = NULL );
|
||||
bool launchThread2 ( ThreadEntry *te );
|
||||
|
||||
void print ( ) ;
|
||||
|
||||
|
@ -182,6 +182,8 @@ bool Titledb::verify ( char *coll ) {
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key_t k = list.getCurrentKey();
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//unsigned long groupId = getGroupId ( RDB_TITLEDB , &k );
|
||||
//if ( groupId == g_hostdb.m_groupId ) got++;
|
||||
|
@ -12,7 +12,7 @@
|
||||
# must be represented as <, >, " and # respectively.
|
||||
|
||||
# Controls just the spiders for this collection.
|
||||
<spideringEnabled>1</>
|
||||
<spideringEnabled>0</>
|
||||
|
||||
# What is the maximum number of web pages the spider is allowed to download
|
||||
# simultaneously PER HOST for THIS collection?
|
||||
@ -289,7 +289,6 @@
|
||||
# spidered, or if it has already been indexed, it will be deleted when it is
|
||||
# respidered.<br><br>
|
||||
<filterExpression><![CDATA[isdocidbased]]></>
|
||||
<filterExpression><![CDATA[!insitelist && !ismanualadd]]></>
|
||||
<filterExpression><![CDATA[ismedia]]></>
|
||||
<filterExpression><![CDATA[errorcount>=3 && hastmperror]]></>
|
||||
<filterExpression><![CDATA[errorcount>=1 && hastmperror]]></>
|
||||
@ -307,7 +306,6 @@
|
||||
<filterExpression><![CDATA[isnew]]></>
|
||||
<filterExpression><![CDATA[default]]></>
|
||||
<harvestLinks>1</>
|
||||
<harvestLinks>0</>
|
||||
<harvestLinks>1</>
|
||||
<harvestLinks>1</>
|
||||
<harvestLinks>1</>
|
||||
@ -325,7 +323,6 @@
|
||||
<harvestLinks>1</>
|
||||
<harvestLinks>1</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>30.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>1.000000</>
|
||||
<filterFrequency>1.000000</>
|
||||
@ -346,7 +343,6 @@
|
||||
# Do not allow more than this many outstanding spiders for all urls in this
|
||||
# priority.
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>0</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>1</>
|
||||
@ -366,7 +362,6 @@
|
||||
|
||||
# Allow this many spiders per IP.
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>7</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
@ -402,10 +397,8 @@
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<filterPriority>80</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>3</>
|
||||
<filterPriority>45</>
|
||||
<filterPriority>85</>
|
||||
|
2
gb.conf
2
gb.conf
@ -51,7 +51,7 @@
|
||||
<readOnlyMode>0</>
|
||||
|
||||
# Controls all spidering for all collections
|
||||
<spideringEnabled>1</>
|
||||
<spideringEnabled>0</>
|
||||
|
||||
# What is the maximum number of web pages the spider is allowed to download
|
||||
# simultaneously for ALL collections PER HOST?
|
||||
|
19
main.cpp
19
main.cpp
@ -4298,6 +4298,7 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
"%slocalhosts.conf "
|
||||
//"%shosts2.conf "
|
||||
"%sgb.conf "
|
||||
"%slocalgb.conf "
|
||||
"%stmpgb "
|
||||
//"%scollections.dat "
|
||||
"%sgb.pem "
|
||||
@ -4351,6 +4352,7 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
dir,
|
||||
dir,
|
||||
dir,
|
||||
dir,
|
||||
|
||||
dir,
|
||||
dir,
|
||||
@ -5756,11 +5758,12 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
lastKey.n1,lastKey.n0,
|
||||
k.n1,k.n0);
|
||||
lastKey = k;
|
||||
long shard = g_hostdb.getShardNum ( RDB_TITLEDB , &k );
|
||||
// print deletes
|
||||
if ( (k.n0 & 0x01) == 0) {
|
||||
fprintf(stdout,"n1=%08lx n0=%016llx docId=%012lli "
|
||||
"(del)\n",
|
||||
k.n1 , k.n0 , docId );
|
||||
"shard=%li (del)\n",
|
||||
k.n1 , k.n0 , docId , shard );
|
||||
continue;
|
||||
}
|
||||
// free the mem
|
||||
@ -5830,6 +5833,7 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
"redir=%s "
|
||||
"url=%s "
|
||||
"firstdup=1 "
|
||||
"shard=%li "
|
||||
"\n",
|
||||
k.n1 , k.n0 ,
|
||||
//rec[0] ,
|
||||
@ -5852,7 +5856,8 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
//ms,
|
||||
(long)xd->m_hopCount,
|
||||
ru,
|
||||
u->getUrl() );
|
||||
u->getUrl() ,
|
||||
shard );
|
||||
prevId = docId;
|
||||
count = 0;
|
||||
continue;
|
||||
@ -5952,6 +5957,7 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
"version=%02li "
|
||||
//"maxLinkTextWeight=%06lu%% "
|
||||
"hc=%li "
|
||||
"shard=%li "
|
||||
//"diffbot=%li "
|
||||
"redir=%s "
|
||||
"url=%s\n",
|
||||
@ -5975,6 +5981,7 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
(long)xd->m_version,
|
||||
//ms,
|
||||
(long)xd->m_hopCount,
|
||||
shard,
|
||||
//(long)xd->m_isDiffbotJSONObject,
|
||||
ru,
|
||||
u->getUrl() );
|
||||
@ -14438,6 +14445,8 @@ bool checkDataParity ( ) {
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key_t k = list.getCurrentKey();
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
|
||||
uint32_t shardNum = getShardNum ( RDB_INDEXDB, &k );
|
||||
@ -14485,6 +14494,8 @@ bool checkDataParity ( ) {
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key_t k = list.getCurrentKey();
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
uint32_t shardNum = getShardNum ( RDB_TITLEDB , &k );
|
||||
//long groupId = k.n1 & g_hostdb.m_groupMask;
|
||||
@ -14527,6 +14538,8 @@ bool checkDataParity ( ) {
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key_t k = list.getCurrentKey();
|
||||
// skip negative keys
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
count++;
|
||||
// verify the group
|
||||
uint32_t shardNum = getShardNum ( RDB_TFNDB , &k );
|
||||
|
Loading…
Reference in New Issue
Block a user