Merge branch 'master' into dev-encoding

This commit is contained in:
Ai Lin Chia
2017-06-21 11:49:33 +02:00
18 changed files with 644 additions and 353 deletions

@ -324,6 +324,8 @@ bool Collectiondb::addNewColl ( const char *coll,
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
cr->setNeedsSave();
//
// BEGIN NEW CODE
//
@ -488,6 +490,8 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) {
log(LOG_INFO,"db: deleting coll \"%s\" (%" PRId32")",coll,
(int32_t)cr->m_collnum);
cr->setNeedsSave();
// CAUTION: tree might be in the middle of saving
// we deal with this in Process.cpp now

2
File.h

@ -17,7 +17,7 @@
#ifndef GB_FILE_H
#define GB_FILE_H
#define MAX_FILENAME_LEN 128
#define MAX_FILENAME_LEN 256
#include <pthread.h>

@ -140,7 +140,7 @@ bool HighFrequencyTermShortcuts::load()
buffer = new_buffer;
//All the entries are full 18-byte entries in all their glory
//But PosdbTable::intersectLists10_r() doesn't like that and fails in
//But PosdbTable::intersectLists() doesn't like that and fails in
//a "sanity check" due to unhealthy knowledge of not only the
//posdb format but also the workings and algorithms.
//So we have to compress the non-entries to 12 byte.

@ -617,7 +617,7 @@ createFile:
}
// and working dir
if ( wdirlen > 127 ) {
if ( wdirlen > 255 ) {
g_errno = EBADENGINEER;
log(LOG_WARN, "conf: Host working dir too long in %s line %" PRId32".", filename, line);
return false;
@ -656,9 +656,9 @@ createFile:
// add slash if none there
if ( wdir[wdirlen-1] !='/' ) wdir[wdirlen++] = '/';
// don't breach Host::m_dir[128] buffer
if ( wdirlen >= 128 ) {
log(LOG_WARN, "conf: working dir %s is too long, >= 128 chars.", wdir);
// don't breach Host::m_dir[256] buffer
if ( wdirlen >= 256 ) {
log(LOG_WARN, "conf: working dir %s is too long, >= 256 chars.", wdir);
return false;
}

@ -95,9 +95,9 @@ public:
// we now include the working dir in the hosts.conf file
// so main.cpp can do gb --install and gb --allstart
char m_dir[128];
char m_mergeDir[128];
char m_mergeLockDir[128];
char m_dir[256];
char m_mergeDir[256];
char m_mergeLockDir[256];
char m_hostname[16];

101
Jenkinsfile vendored Normal file

@ -0,0 +1,101 @@
#!/usr/bin/env groovy
// Declarative Jenkins pipeline: checks out the search engine and the pywebtest
// system-test suite side by side, builds the engine, runs unit and system
// tests in parallel, publishes the test reports and notifies Slack.
pipeline {
agent any
options {
// The 'Checkout' stage below performs the checkouts explicitly.
skipDefaultCheckout()
}
environment {
// Workspace-relative directories the two repositories are checked out into.
GB_DIR = 'open-source-search-engine'
PYWEBTEST_DIR = 'pywebtest'
// NOTE(review): presumably read by the googletest binaries so they emit the
// test_detail.xml report collected in the 'Test' post step - confirm.
GTEST_OUTPUT = 'xml'
}
stages {
stage('Checkout') {
steps {
// Both repositories are fetched concurrently.
parallel (
'open-source-search-engine': {
// Re-use the job's own SCM settings (branches, extensions, remotes) and add
// recursive submodule checkout, a target sub-directory and a pre-checkout clean.
checkout([
$class: 'GitSCM',
branches: scm.branches,
doGenerateSubmoduleConfigurations: false,
extensions: scm.extensions +
[[$class: 'SubmoduleOption',
disableSubmodules: false,
parentCredentials: false,
recursiveSubmodules: true,
reference: '',
trackingSubmodules: false]] +
[[$class: 'RelativeTargetDirectory',
relativeTargetDir: "${env.GB_DIR}"]] +
[[$class: 'CleanBeforeCheckout']],
userRemoteConfigs: scm.userRemoteConfigs
])
},
'pywebtest': {
// The system-test suite always comes from the master branch of its own repository.
checkout([
$class: 'GitSCM',
branches: [[name: '*/master']],
doGenerateSubmoduleConfigurations: false,
extensions: [[$class: 'RelativeTargetDirectory',
relativeTargetDir: "${env.PYWEBTEST_DIR}"]] +
[[$class: 'CleanBeforeCheckout']],
userRemoteConfigs: [[url: 'https://github.com/privacore/pywebtest.git']]
])
}
)
}
}
stage('Build') {
steps {
sh "cd ${env.GB_DIR} && make -j8"
}
}
stage('Test') {
steps {
// Unit tests and system tests run concurrently.
parallel(
'unit test': {
sh "cd ${env.GB_DIR} && make -j8 unittest"
},
'system test': {
sh "cd ${env.PYWEBTEST_DIR} && ./run_all_testcases.py"
}
)
}
post {
// Publish both reports regardless of the test outcome; an unstableThreshold
// of '0' is configured for failed tests in each publisher.
always {
step([$class: 'XUnitPublisher',
thresholds: [[$class: 'FailedThreshold', unstableThreshold: '0']],
tools: [[$class: 'GoogleTestType', pattern: '**/test_detail.xml']]])
step([$class: 'XUnitPublisher',
thresholds: [[$class: 'FailedThreshold', unstableThreshold: '0']],
tools: [[$class: 'JUnitType', pattern: "${env.PYWEBTEST_DIR}/output.xml"]]])
}
}
}
}
post {
// Slack notifications: a "back to normal" message when the result changed to
// SUCCESS, and a message on every failed or unstable build.
changed {
script {
if (currentBuild.result == "SUCCESS") {
slackSend color: 'good', message: "${env.JOB_NAME} - #${env.BUILD_NUMBER} Back to normal (<${env.BUILD_URL}|Open>)"
}
}
}
failure {
slackSend color: 'danger', message: "${env.JOB_NAME} - #${env.BUILD_NUMBER} Failure (<${env.BUILD_URL}|Open>)"
}
unstable {
slackSend color: 'warning', message: "${env.JOB_NAME} - #${env.BUILD_NUMBER} Unstable (<${env.BUILD_URL}|Open>)"
}
}
}

@ -377,6 +377,9 @@ cleandb:
rm -rf coll.main.?
rm -f *-saved.dat spiderproxystats.dat addsinprogress.dat robots.txt.cache dns.cache
.PHONY: cleantest
cleantest: cleandb
rm -f fatal_error
# shortcuts
.PHONY: release-safe

@ -486,7 +486,7 @@ void Msg39::controlLoop ( ) {
m_numTotalHits += m_posdbTable.getTotalHits();
// minus the shit we filtered out because of gbminint/gbmaxint/
// gbmin/gbmax/gbsortby/gbrevsortby/gbsortbyint/gbrevsortbyint
m_numTotalHits -= m_posdbTable.m_filtered;
m_numTotalHits -= m_posdbTable.getFilteredCount();
chunksSearched++;
}
@ -766,7 +766,6 @@ void Msg39::intersectLists(const DocumentIndexChecker &documentIndexChecker) {
// if msg2 had ALL empty lists we can cut it short
//todo: check if msg2 lists are all null or empty. If so then bail out
//previously: if ( m_toptree.getNumNodes() == 0 ) { //isj: shouldn't this call getNumUsedNodes() ?
//estimateHitsAndSendReply ( );
// do not re do it if doing docid range splitting
@ -810,7 +809,7 @@ void Msg39::intersectLists(const DocumentIndexChecker &documentIndexChecker) {
m_msg39req->m_niceness) ) {
jobState.wait_for_finish();
} else
m_posdbTable.intersectLists10_r();
m_posdbTable.intersectLists();
// time it
@ -835,7 +834,7 @@ void Msg39::intersectListsThreadFunction ( void *state ) {
// . this returns false and sets g_errno on error
// . Msg2 always compresses the lists so be aware that the termId
// has been discarded
that->m_posdbTable.intersectLists10_r ( );
that->m_posdbTable.intersectLists();
// . exit the thread
// . threadDoneWrapper will be called by g_loop when he gets the

@ -44,8 +44,8 @@ bool printCrawlDetails2 (SafeBuf *sb , CollectionRec *cx , char format ) {
"\t\"statusCode\":%" PRId32",\n"
"\t\"statusMsg\":\"%s\",\n"
, crawlStatus, tmp.getBufStart());
sb->safePrintf("\t\"currentTime\":%" PRIu32",\n", (uint32_t)getTimeGlobal() );
sb->safePrintf("\t\"currentTimeUTC\":%" PRIu32"\n", (uint32_t)getTimeGlobal() );
sb->safePrintf("\t\"processStartTime\":%" PRId64",\n", (g_process.m_processStartTime / 1000));
sb->safePrintf("\t\"currentTime\":%" PRIu32"\n", (uint32_t)getTimeGlobal() );
sb->safePrintf("\t}\n");
sb->safePrintf("}\n");
}

@ -153,6 +153,9 @@ static bool printList ( State11 *st ) {
RdbList *list = &st->m_list;
// row count
int32_t j = 0;
char format = st->m_r.getReplyFormat();
// put it in there
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// stop if we got enough
@ -183,9 +186,17 @@ static bool printList ( State11 *st ) {
}
// get the spider rec, encapsed in the data of the doledb rec
SpiderRequest *sreq = (SpiderRequest *)rec;
// print it into sbTable
if ( ! sreq->printToTable ( sbTable,"ready",NULL,j))
return false;
if (format == FORMAT_JSON) {
if (!sreq->printToJSON(sbTable, "ready", NULL, j)) {
return false;
}
} else {
// print it into sbTable
if (!sreq->printToTable(sbTable, "ready", NULL, j)) {
return false;
}
}
// count row
j++;
}
@ -200,9 +211,9 @@ static bool printList ( State11 *st ) {
st->m_startKey = Doledb::makeFirstKey2 (st->m_priority);
st->m_endKey = Doledb::makeLastKey2 (st->m_priority);
// if we printed something, print a blank line after it
if ( st->m_count > 0 )
sbTable->safePrintf("<tr><td colspan=30>..."
"</td></tr>\n");
if ( st->m_count > 0 && format == FORMAT_HTML) {
sbTable->safePrintf("<tr><td colspan=30>...</td></tr>\n");
}
// reset for each priority
st->m_count = 0;
}
@ -211,141 +222,83 @@ static bool printList ( State11 *st ) {
return true;
}
static bool sendPage(State11 *st) {
// generate a query string to pass to host bar
char qs[64]; sprintf ( qs , "&n=%" PRId32, st->m_numRecs );
// store the page in here!
SafeBuf sb;
if( !sb.reserve ( 64*1024 ) ) {
logError("Could not reserve needed mem, bailing!");
return false;
}
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r , qs );
// get spider coll
collnum_t collnum = g_collectiondb.getCollnum ( st->m_coll );
// and coll rec
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length() );
}
static bool generatePageHTML(CollectionRec *cr, SafeBuf *sb, const SafeBuf *doledbbuf) {
// print reason why spiders are not active for this collection
int32_t tmp2;
SafeBuf mb;
if ( cr ) getSpiderStatusMsg ( cr , &mb , &tmp2 );
getSpiderStatusMsg ( cr , &mb , &tmp2 );
if ( mb.length() && tmp2 != SP_INITIALIZING )
sb.safePrintf(//"<center>"
"<table cellpadding=5 "
//"style=\""
//"border:2px solid black;"
"max-width:600px\" "
"border=0"
">"
"<tr>"
//"<td bgcolor=#ff6666>"
"<td>"
"For collection <i>%s</i>: "
"<b><font color=red>%s</font></b>"
"</td>"
"</tr>"
"</table>\n"
, cr->m_coll
, mb.getBufStart() );
sb->safePrintf("<table cellpadding=5 style=\"max-width:600px\" border=0>"
"<tr>"
"<td>"
"For collection <i>%s</i>: "
"<b><font color=red>%s</font></b>"
"</td>"
"</tr>"
"</table>\n"
, cr->m_coll
, mb.getBufStart() );
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
//"<center>"
"<b>Currently Spidering on This Host</b>"
" (%" PRId32" spiders)"
//" (%" PRId32" locks)"
//"</center>"
"</td></tr>\n"
, TABLE_STYLE
, g_spiderLoop.getNumSpidersOut()
//, g_spiderLoop.m_lockTable.m_numSlotsUsed
sb->safePrintf("<table %s>\n"
"<tr><td colspan=50>"
"<b>Currently Spidering on This Host</b>(%" PRId32" spiders)"
"</td></tr>\n"
, TABLE_STYLE
, g_spiderLoop.getNumSpidersOut()
);
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb , true ) ) return false;
if (!SpiderRequest::printTableHeader(sb, true)) {
return false;
}
// count # of spiders out
int32_t j = 0;
// first print the spider recs we are spidering
for ( int32_t i = 0 ; i < (int32_t)MAX_SPIDERS ; i++ ) {
for (int32_t i = 0; i < (int32_t)MAX_SPIDERS; i++) {
// get it
XmlDoc *xd = g_spiderLoop.m_docs[i];
// skip if empty
if ( ! xd ) continue;
if (!xd) continue;
// sanity check
if ( ! xd->m_sreqValid ) { g_process.shutdownAbort(true); }
if (!xd->m_sreqValid) { g_process.shutdownAbort(true); }
// grab it
SpiderRequest *oldsr = &xd->m_sreq;
// get status
const char *status = xd->m_statusMsg;
// show that
if ( ! oldsr->printToTable ( &sb , status,xd,j) ) return false;
if (!oldsr->printToTable(sb, status, xd, j)) return false;
// inc count
j++;
}
// now print the injections as well!
XmlDoc *xd = getInjectHead ( ) ;
for ( ; xd ; xd = xd->m_nextInject ) {
XmlDoc *xd = getInjectHead();
for (; xd; xd = xd->m_nextInject) {
// how does this happen?
if ( ! xd->m_sreqValid ) continue;
if (!xd->m_sreqValid) continue;
// grab it
SpiderRequest *oldsr = &xd->m_sreq;
// get status
SafeBuf xb;
xb.safePrintf("[<font color=red><b>injecting</b></font>] %s",
xd->m_statusMsg);
xb.safePrintf("[<font color=red><b>injecting</b></font>] %s", xd->m_statusMsg);
char *status = xb.getBufStart();
// show that
if ( ! oldsr->printToTable ( &sb , status,xd,j) ) return false;
if (!oldsr->printToTable(sb, status, xd, j)) return false;
// inc count
j++;
}
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
sb->safePrintf("</table>\n");
sb->safePrintf("<br>\n");
// then spider collection
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//
// spiderdb rec stats, from scanning spiderdb
//
// if not there, forget about it
if ( sc ) sc->printStats ( sb );
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
// done if no sc
if ( ! sc ) {
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length() );
return true;
}
/////
@ -354,20 +307,18 @@ static bool sendPage(State11 *st) {
//
/////
int32_t ns = 0;
if ( sc ) ns = sc->getDoledbIpTableCount();
int32_t ns = sc->getDoledbIpTableCount();
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>URLs Ready to Spider for collection "
"<font color=red><b>%s</b>"
"</font>"
" (%" PRId32" ips in doleiptable)"
,
TABLE_STYLE,
st->m_coll ,
ns );
sb->safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>URLs Ready to Spider for collection "
"<font color=red><b>%s</b>"
"</font>"
" (%" PRId32" ips in doleiptable)",
TABLE_STYLE,
cr->m_coll ,
ns );
// print time format: 7/23/1971 10:45:32
time_t nowUTC = getTimeGlobal();
@ -376,19 +327,16 @@ static bool sendPage(State11 *st) {
struct tm tm_buf;
timeStruct = gmtime_r(&nowUTC,&tm_buf);
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb.safePrintf("</b>" // (current time = %s = %" PRIu32") "
"</td></tr>\n"
//,time,nowUTC
);
sb->safePrintf("</b></td></tr>\n");
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb ,false ) ) return false;
if (!SpiderRequest::printTableHeader(sb, false)) return false;
// then the doledb spider recs
char *bs = st->m_safeBuf.getBufStart();
if ( bs && ! sb.safePrintf("%s",bs) ) return false;
const char *bs = doledbbuf->getBufStart();
if (bs && !sb->safePrintf("%s", bs)) return false;
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
sb->safePrintf ( "</table>\n" );
sb->safePrintf ( "<br>\n" );
@ -399,30 +347,26 @@ static bool sendPage(State11 *st) {
// each row is an ip. print the next url to spider for that ip.
//
/////////////////
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>IPs Waiting for Selection Scan for collection "
"<font color=red><b>%s</b>"
"</font>"
,
TABLE_STYLE,
st->m_coll );
sb->safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>IPs Waiting for Selection Scan for collection "
"<font color=red><b>%s</b>"
"</font>",
TABLE_STYLE,
cr->m_coll );
// print time format: 7/23/1971 10:45:32
int64_t timems = gettimeofdayInMilliseconds();
sb.safePrintf("</b> (current time = %" PRIu64")(totalcount=%" PRId32")"
"(waittablecount=%" PRId32")",
timems,
sc->m_waitingTree.getNumUsedNodes(),
sc->getWaitingTableCount());
sb->safePrintf("</b> (current time = %" PRIu64")(totalcount=%" PRId32")(waittablecount=%" PRId32")",
timems, sc->m_waitingTree.getNumUsedNodes(), sc->getWaitingTableCount());
char ipbuf[16];
sb.safePrintf("(spiderdb scanning ip %s)", iptoa(sc->getScanningIp(),ipbuf));
sb->safePrintf("(spiderdb scanning ip %s)", iptoa(sc->getScanningIp(),ipbuf));
sb.safePrintf("</td></tr>\n");
sb.safePrintf("<tr bgcolor=#%s>",DARK_BLUE);
sb.safePrintf("<td><b>spidertime (MS)</b></td>\n");
sb.safePrintf("<td><b>firstip</b></td>\n");
sb.safePrintf("</tr>\n");
sb->safePrintf("</td></tr>\n");
sb->safePrintf("<tr bgcolor=#%s>",DARK_BLUE);
sb->safePrintf("<td><b>spidertime (MS)</b></td>\n");
sb->safePrintf("<td><b>firstip</b></td>\n");
sb->safePrintf("</tr>\n");
// then the waiting tree
int32_t count = 0;
@ -443,34 +387,241 @@ static bool sendPage(State11 *st) {
const char *note = "";
// get the rest of the data
sb.safePrintf("<tr bgcolor=#%s>"
"<td>%" PRId64"%s</td>"
"<td>%s</td>"
"</tr>\n",
LIGHT_BLUE,
(int64_t)spiderTimeMS,
note,
iptoa(firstIp,ipbuf));
sb->safePrintf("<tr bgcolor=#%s>"
"<td>%" PRId64"%s</td>"
"<td>%s</td>"
"</tr>\n",
LIGHT_BLUE,
(int64_t)spiderTimeMS,
note,
iptoa(firstIp,ipbuf));
// stop after 20
if (++count == 20) break;
}
}
// ...
if ( count )
sb.safePrintf("<tr bgcolor=#%s>"
sb->safePrintf("<tr bgcolor=#%s>"
"<td colspan=10>...</td></tr>\n",
LIGHT_BLUE);
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
sb->safePrintf ( "</table>\n" );
sb->safePrintf ( "<br>\n" );
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),sb.length() );
return true;
}
/*
 * Renders the spider queue status of one collection as JSON into sb.
 *
 * cr        - collection to report on (dereferenced unconditionally)
 * sb        - output buffer the JSON is appended to
 * doledbbuf - pre-rendered doledb entries; inserted verbatim into "doleIPs"
 *
 * Returns false as soon as one of the checked print calls fails (sb may then
 * hold a partial document), true otherwise.
 *
 * Output shape; everything after "spiders" is only emitted when the
 * collection has a SpiderColl:
 * {
 *   "response": {
 *     "statusCode": 0,
 *     "statusMsg": "Job is initializing.",
 *     "spiderCount": 0,
 *     "spiders": [ ... ],
 *     "doleIPCount": 0,
 *     "doleIPs": [ ... ],
 *     "waitingTreeCount": 0,
 *     "waitingTrees": [ { "spiderTime": 0, "firstIp": "1.2.3.4" }, ... ]
 *   }
 * }
 */
static bool generatePageJSON(CollectionRec *cr, SafeBuf *sb, const SafeBuf *doledbbuf) {
	sb->safePrintf("{\n\"response\": {\n");

	int32_t crawlStatus;
	SafeBuf crawlMsg;
	getSpiderStatusMsg ( cr , &crawlMsg , &crawlStatus );

	// never hand a NULL pointer to %s if no status message was produced
	// NOTE(review): the message is emitted without JSON escaping - confirm
	// that getSpiderStatusMsg() never produces quotes or backslashes.
	const char *statusMsg = crawlMsg.getBufStart();
	if (!statusMsg) {
		statusMsg = "";
	}

	sb->safePrintf("\t\"statusCode\": %" PRId32",\n", crawlStatus);
	sb->safePrintf("\t\"statusMsg\": \"%s\",\n", statusMsg);
	sb->safePrintf("\t\"spiderCount\": %" PRId32",\n", g_spiderLoop.getNumSpidersOut());
	sb->safePrintf("\t\"spiders\": [\n");

	// running row number handed to printToJSON()
	int32_t j = 0;

	// first print the spider recs we are spidering
	for (int32_t i = 0; i < (int32_t)MAX_SPIDERS; i++) {
		XmlDoc *xd = g_spiderLoop.m_docs[i];
		if (!xd) {
			continue;
		}

		// sanity check
		if (!xd->m_sreqValid) {
			g_process.shutdownAbort(true);
		}

		// grab it
		SpiderRequest *oldsr = &xd->m_sreq;
		if (!oldsr->printToJSON(sb, xd->m_statusMsg, xd, j)) {
			return false;
		}

		j++;
	}

	// now print the injections as well!
	XmlDoc *xd = getInjectHead();
	for (; xd; xd = xd->m_nextInject) {
		// how does this happen?
		if (!xd->m_sreqValid) {
			continue;
		}

		SpiderRequest *oldsr = &xd->m_sreq;

		// get status
		SafeBuf xb;
		xb.safePrintf("injecting - %s", xd->m_statusMsg);

		// the buffer can be empty if the print above failed; fall back to
		// a fixed label instead of passing NULL on
		const char *status = xb.getBufStart();
		if (!status) {
			status = "injecting";
		}

		// show that
		if (!oldsr->printToJSON(sb, status, xd, j)) {
			return false;
		}

		// inc count
		j++;
	}

	// end the "spiders" array
	sb->safePrintf("\t]\n");

	// then spider collection
	SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);

	// done if no sc
	if (!sc) {
		sb->safePrintf("}\n}\n");
		return true;
	}

	// separator between "spiders" and the SpiderColl-specific fields
	sb->safePrintf("\t,\n");

	/////
	//
	// READY TO SPIDER table
	//
	/////
	sb->safePrintf("\t\"doleIPCount\": %" PRId32",\n", sc->getDoledbIpTableCount());
	sb->safePrintf("\t\"doleIPs\": [\n");

	// then the doledb spider recs
	const char *bs = doledbbuf->getBufStart();
	if (bs && !sb->safePrintf("%s", bs)) {
		return false;
	}

	sb->safePrintf("\t],\n");

	/////////////////
	//
	// PRINT WAITING TREE
	//
	// each row is an ip. print the next url to spider for that ip.
	//
	/////////////////
	sb->safePrintf("\t\"waitingTreeCount\": %" PRId32",\n", sc->m_waitingTree.getNumUsedNodes());
	sb->safePrintf("\t\"waitingTrees\": [\n");

	// then the waiting tree
	char ipbuf[16];
	int32_t count = 0;
	{
		// hold the tree lock while walking it with the *_unlocked accessors
		ScopedLock sl(sc->m_waitingTree.getLock());
		for (int32_t node = sc->m_waitingTree.getFirstNode_unlocked(); node >= 0; node = sc->m_waitingTree.getNextNode_unlocked(node)) {
			// get key
			const key96_t *key = reinterpret_cast<const key96_t *>(sc->m_waitingTree.getKey_unlocked(node));
			// low 32 bits of n0 hold the ip
			int32_t firstIp = (key->n0) & 0xffffffff;
			// spider time: n1 is the high part, the upper 32 bits of n0 the low part
			uint64_t spiderTimeMS = key->n1;
			// shift up
			spiderTimeMS <<= 32;
			// or in
			spiderTimeMS |= (key->n0 >> 32);

			// comma between array elements, not before the first one
			if (count != 0) {
				sb->safePrintf("\t\t,\n");
			}

			sb->safePrintf("\t\t{\n");
			sb->safePrintf("\t\t\t\"spiderTime\": %" PRIu64",\n", spiderTimeMS);
			sb->safePrintf("\t\t\t\"firstIp\": \"%s\"\n", iptoa(firstIp,ipbuf));
			sb->safePrintf("\t\t}\n");

			// stop after 20
			if (++count == 20) break;
		}
	}

	sb->safePrintf("\t]\n");
	sb->safePrintf("}\n}\n");

	return true;
}
// Builds the spider queue page for the request held in st (HTML or JSON,
// depending on the requested reply format) and sends it on st->m_socket.
//
// Takes ownership of st: the state is released on every path, including the
// failure paths, and the socket always receives a reply (an error reply when
// the page could not be generated).
//
// Returns the result of the HttpServer send call.
static bool sendPage(State11 *st) {
	// generate a query string to pass to host bar
	char qs[64];
	snprintf(qs, sizeof(qs), "&n=%" PRId32, st->m_numRecs);

	// store the page in here!
	SafeBuf sb;

	bool ok = sb.reserve(64 * 1024);
	if (!ok) {
		logError("Could not reserve needed mem, bailing!");
	}

	// false when the collection no longer exists; we then send only what
	// has been printed so far, using the default content type
	bool collectionFound = false;
	const char *contentType = NULL;

	if (ok) {
		char format = st->m_r.getReplyFormat();
		if (format == FORMAT_HTML) {
			g_pages.printAdminTop(&sb, st->m_socket, &st->m_r, qs);
		}

		// get spider coll
		collnum_t collnum = g_collectiondb.getCollnum ( st->m_coll );
		// and coll rec
		CollectionRec *cr = g_collectiondb.getRec ( collnum );

		if (cr) {
			collectionFound = true;

			switch (format) {
				case FORMAT_JSON:
					ok = generatePageJSON(cr, &sb, &st->m_safeBuf);
					contentType = "application/json";
					break;
				case FORMAT_HTML:
				default:
					ok = generatePageHTML(cr, &sb, &st->m_safeBuf);
					contentType = NULL;
					break;
			}

			if (!ok) {
				logError("Could not generate spider queue page");
			}
		}
	}

	// get the socket
	TcpSocket *s = st->m_socket;

	// then we can nuke the state; previously it leaked on the failure paths
	mdelete ( st , sizeof(State11) , "PageSpiderdb" );
	delete (st);

	if (!ok) {
		// never leave the request unanswered
		return g_httpServer.sendErrorReply(s, 500, "Could not generate spider queue page");
	}

	// erase g_errno for sending
	g_errno = 0;

	// now encapsulate it in html head/tail and send it off
	if (!collectionFound) {
		return g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length());
	}

	return g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length(), -1, false, contentType);
}

@ -94,12 +94,12 @@ void PosdbTable::reset() {
m_siteRankMultiplier = 0.0;
m_addListsTime = 0;
m_t2 = 0;
m_qpos = NULL;
m_wikiPhraseIds = NULL;
m_quotedStartIds = NULL;
m_freqWeights = NULL;
m_bflags = NULL;
m_qtermNums = NULL;
m_qpos.clear();
m_wikiPhraseIds.clear();
m_quotedStartIds.clear();
m_freqWeights.clear();
m_bflags.clear();
m_qtermNums.clear();
m_bestMinTermPairWindowScore = 0.0;
m_bestMinTermPairWindowPtrs = NULL;
m_msg2 = NULL;
@ -2672,7 +2672,7 @@ bool PosdbTable::advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qti
// false - docid does not meet minimum score requirement
// true - docid can potentially be a top scoring docid
//
bool PosdbTable::prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore) {
bool PosdbTable::prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf, float minWinningScore) {
unsigned char ringBuf[RINGBUFSIZE+10];
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
@ -2843,7 +2843,7 @@ bool PosdbTable::prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf
}
// query distance
qdist = qpos[m_minTermListIdx] - qpos[i];
qdist = m_qpos[m_minTermListIdx] - m_qpos[i];
// compute it
float maxScore2 = getMaxPossibleScore(&qtibuf[i],
bestDist,
@ -2885,7 +2885,7 @@ void PosdbTable::mergeTermSubListsForDocId(QueryTermInfo *qtibuf, char *miniMerg
// all posdb keys for this docid should fit in here, the
// mini merge buf:
char *mptr = miniMergeBuf;
miniMergeBufEnd -= 1000; //fragile hack but no worse than the original code
char *miniMergeBufSafeEnd = miniMergeBufEnd - 1000; //fragile hack but no worse than the original code
char *lastMptr = NULL;
// Merge each set of sublists, like we merge a term's list with
@ -3098,7 +3098,7 @@ void PosdbTable::mergeTermSubListsForDocId(QueryTermInfo *qtibuf, char *miniMerg
}
} // mink != -1
//log("skipping ks=%" PRId32,(int32_t)ks);
} while( !currTermDone && mptr < miniMergeBufEnd ); // merge more ...
} while( !currTermDone && mptr < miniMergeBufSafeEnd ); // merge more ...
// wrap it up here since done merging
miniMergedListEnd[j] = mptr;
@ -3107,6 +3107,7 @@ void PosdbTable::mergeTermSubListsForDocId(QueryTermInfo *qtibuf, char *miniMerg
// breach?
if ( mptr > miniMergeBufEnd ) {
log(LOG_ERROR,"%s:%s:%d: miniMergeBuf=%p miniMergeBufEnd=%p mptr=%p", __FILE__, __func__, __LINE__, miniMergeBuf, miniMergeBufEnd, mptr);
gbshutdownAbort(true);
}
@ -3234,7 +3235,7 @@ void PosdbTable::createNonBodyTermPairScoreMatrix(const char **miniMergedListSta
// store in matrix for "sub out" algo below
// when doing sliding window
scoreMatrix[i*m_nqt+j] = wts;
scoreMatrix[i*m_numQueryTermInfos+j] = wts;
}
}
logTrace(g_conf.m_logTracePosdb, "END");
@ -3247,6 +3248,8 @@ void PosdbTable::createNonBodyTermPairScoreMatrix(const char **miniMergedListSta
//
float PosdbTable::getMinSingleTermScoreSum(const char **miniMergedListStart, const char **miniMergedListEnd, const char **highestScoringNonBodyPos, DocIdScore *pdcs) {
float minSingleScore = 999999999.0;
bool mergedListFound = false;
bool allSpecialTerms = true;
bool scoredTerm = false;
logTrace(g_conf.m_logTracePosdb, "BEGIN");
@ -3259,16 +3262,21 @@ float PosdbTable::getMinSingleTermScoreSum(const char **miniMergedListStart, con
//
// This should be highly negative if singles[i] has a '-'
// termsign...
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
if ( ! miniMergedListStart[i] ) {
continue;
}
mergedListFound = true;
// skip if to the left of a pipe operator
if( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) {
continue;
}
allSpecialTerms = false;
// sometimes there is no wordpos subtermlist for this docid
// because it just has the bigram, like "streetlight" and not
// the word "light" by itself for the query 'street light'
@ -3301,7 +3309,12 @@ float PosdbTable::getMinSingleTermScoreSum(const char **miniMergedListStart, con
}
}
if( !scoredTerm ) {
if( !mergedListFound || (!scoredTerm && !allSpecialTerms) ) {
// Fix default value if no single terms were scored, and all terms are not special (e.g. numbers).
// This returns -1 for documents matching bigrams only, and not single terms. Can happen when searching
// for "bridget jones" and a document has the text "bridgetjon es" as the only match (bigram).
//
// If terms are numbers, do NOT return -1, otherwise gbsortbyint queries do not work.
minSingleScore = -1;
}
@ -3323,6 +3336,8 @@ float PosdbTable::getMinSingleTermScoreSum(const char **miniMergedListStart, con
void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **highestScoringNonBodyPos, float *scoreMatrix) {
int32_t qdist = 0;
float minTermPairScoreInWindow = 999999999.0;
bool mergedListFound = false;
bool allSpecialTerms = true;
bool scoredTerms = false;
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
@ -3331,16 +3346,17 @@ void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **hi
// is the term whose position got advanced in the sliding window.
for ( int32_t i = 0 ; i < m_numQueryTermInfos; i++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) {
continue;
}
allSpecialTerms = false;
// skip empty list
if( !ptrs[i] ) {
continue;
}
mergedListFound = true;
//if ( ptrs[i] ) wpi = ptrs[i];
// if term does not occur in body, sub-in the best term
@ -3351,10 +3367,10 @@ void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **hi
// loop over other terms
for(int32_t j = i + 1; j < m_numQueryTermInfos; j++) {
// skip if to the left of a pipe operator
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) {
continue;
}
// skip empty list
if( !ptrs[j] ) {
@ -3425,8 +3441,8 @@ void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **hi
max *= m_freqWeights[i] * m_freqWeights[j];
// use score from scoreMatrix if bigger
if ( scoreMatrix[m_nqt*i+j] > max ) {
max = scoreMatrix[m_nqt*i+j];
if ( scoreMatrix[i*m_numQueryTermInfos+j] > max ) {
max = scoreMatrix[i*m_numQueryTermInfos+j];
}
@ -3475,8 +3491,13 @@ void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **hi
}
}
if( !mergedListFound || (!scoredTerms && !allSpecialTerms) ) {
// Similar fix as in getMinSingleTermScoreSum, but should not happen in this function ...
minTermPairScoreInWindow = -1;
}
// Our best minimum score better than current best minimum score?
if ( minTermPairScoreInWindow <= m_bestMinTermPairWindowScore || !scoredTerms ) {
if ( minTermPairScoreInWindow <= m_bestMinTermPairWindowScore ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return;
}
@ -3575,6 +3596,7 @@ float PosdbTable::getMinTermPairScoreSlidingWindow(const char **miniMergedListSt
// if no terms in body, no need to do sliding window
bool doneSliding = allNull ? true : false;
logTrace(g_conf.m_logTracePosdb, "Run sliding window algo? %s", !doneSliding?"yes":"no, no matches found in body");
while( !doneSliding ) {
//
@ -3708,7 +3730,6 @@ float PosdbTable::getMinTermPairScoreSlidingWindow(const char **miniMergedListSt
if ( ! miniMergedListStart[j] ) {
continue;
}
// . this limits its scoring to the winning sliding window
// as far as the in-body terms are concerned
// . it will do sub-outs using the score matrix
@ -3737,6 +3758,16 @@ float PosdbTable::getMinTermPairScoreSlidingWindow(const char **miniMergedListSt
// Thin wrapper around intersectLists_real(): a std::bad_alloc thrown while
// intersecting is logged and turned into g_errno = ENOMEM (unless g_errno
// already holds an error) instead of propagating to the caller.
void PosdbTable::intersectLists() {
	try {
		intersectLists_real();
	} catch (const std::bad_alloc &) {
		log(LOG_ERROR,"posdb: caught std::bad_alloc - out of memory");
		// keep an error that was recorded before the exception
		if (!g_errno) {
			g_errno = ENOMEM;
		}
	}
}
// . compare the output of this to intersectLists9_r()
@ -3744,7 +3775,7 @@ float PosdbTable::getMinTermPairScoreSlidingWindow(const char **miniMergedListSt
// . IDEAS:
// we could also note that if a term was not in the title or
// inlink text it could never beat the 10th score.
void PosdbTable::intersectLists10_r ( ) {
void PosdbTable::intersectLists_real() {
logTrace(g_conf.m_logTracePosdb, "BEGIN. numTerms: %" PRId32, m_q->m_numTerms);
if(!allocateTopTree()) {
@ -3752,6 +3783,10 @@ void PosdbTable::intersectLists10_r ( ) {
g_errno = ENOMEM;
return;
}
if(m_topTree->getNumNodes()==0) {
logTrace(g_conf.m_logTracePosdb, "END. toptree has zero size");
return;
}
if(!allocateScoringInfo()) {
logTrace(g_conf.m_logTracePosdb, "END. could not allocate scoring info");
@ -3794,45 +3829,20 @@ void PosdbTable::intersectLists10_r ( ) {
//
// TRANSFORM QueryTermInfo::m_* vars into old style arrays
//
// MUST MATCH allocation in allocTopScoringDocIdsData
//
int32_t nqt = m_q->m_numTerms;
int32_t need = 0;
need += 4 * nqt; // wikiPhraseIds
need += 4 * nqt; // quotedStartIds
need += 4 * nqt; // qpos
need += 4 * nqt; // qtermNums
need += sizeof(float ) * nqt; // freqWeights
need += sizeof(char *) * nqt; // miniMergedListStart
need += sizeof(char *) * nqt; // miniMergedListEnd
need += sizeof(char *) * nqt; // highestScoringNonBodyPos
need += sizeof(char *) * nqt; // winnerStack
need += sizeof(char *) * nqt; // xpos
need += sizeof(char ) * nqt; // bflags
need += sizeof(float ) * nqt * nqt; // scoreMatrix
SmallBuf<1024> workingStorageBuf("stkbuf1");
if(!workingStorageBuf.reserve(need)) {
g_errno = ENOMEM;
return;
}
char *pp = workingStorageBuf.getBufStart();
int32_t *wikiPhraseIds = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo
int32_t *quotedStartIds = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo
int32_t *qpos = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo
int32_t *qtermNums = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo
float *freqWeights = (float *)pp; pp += sizeof(float) * nqt; // from QueryTermInfo
const char **miniMergedListStart = (const char **)pp; pp += sizeof(const char *) * nqt;
const char **miniMergedListEnd = (const char **)pp; pp += sizeof(const char *) * nqt;
const char **highestScoringNonBodyPos = (const char **)pp; pp += sizeof(const char *) * nqt;
const char **winnerStack = (const char **)pp; pp += sizeof(const char *) * nqt;
const char **xpos = (const char **)pp; pp += sizeof(const char *) * nqt;
char *bflags = (char *)pp; pp += sizeof(char) * nqt;
float *scoreMatrix = (float *)pp; pp += sizeof(float) *nqt*nqt;
if ( pp > workingStorageBuf.getBufEnd() )
gbshutdownAbort(true);
m_wikiPhraseIds.resize(m_numQueryTermInfos);
m_quotedStartIds.resize(m_numQueryTermInfos);
m_qpos.resize(m_numQueryTermInfos);
m_qtermNums.resize(m_numQueryTermInfos);
m_freqWeights.resize(m_numQueryTermInfos);
m_bflags.resize(m_numQueryTermInfos);
std::vector<const char *> miniMergedListStart(m_numQueryTermInfos);
std::vector<const char *> miniMergedListEnd(m_numQueryTermInfos);
std::vector<const char *> highestScoringNonBodyPos(m_numQueryTermInfos);
std::vector<const char *> winnerStack(m_numQueryTermInfos);
std::vector<const char *> xpos(m_numQueryTermInfos);
std::vector<float> scoreMatrix(m_numQueryTermInfos*m_numQueryTermInfos);
int64_t lastTime = gettimeofdayInMilliseconds();
int64_t now;
int64_t took;
@ -3846,23 +3856,15 @@ void PosdbTable::intersectLists10_r ( ) {
// get it
QueryTermInfo *qti = &qtibuf[i];
// set it
wikiPhraseIds [i] = qti->m_wikiPhraseId;
quotedStartIds[i] = qti->m_quotedStartId;
m_wikiPhraseIds [i] = qti->m_wikiPhraseId;
m_quotedStartIds[i] = qti->m_quotedStartId;
// query term position
qpos [i] = qti->m_qpos;
qtermNums [i] = qti->m_qtermNum;
freqWeights [i] = qti->m_termFreqWeight;
m_qpos [i] = qti->m_qpos;
m_qtermNums [i] = qti->m_qtermNum;
m_freqWeights [i] = qti->m_termFreqWeight;
}
// for findMinTermPairScoreInWindow() function
m_freqWeights = freqWeights;
m_qtermNums = qtermNums;
m_bflags = bflags;
//////////
//
// OLD MAIN INTERSECTION LOGIC
@ -4094,7 +4096,7 @@ void PosdbTable::intersectLists10_r ( ) {
if ( minWinningScore >= 0.0 && m_sortByTermNum < 0 && m_sortByTermNumInt < 0 ) {
if( !prefilterMaxPossibleScoreByDistance(qtibuf, qpos, minWinningScore*completeScoreMultiplier) ) {
if( !prefilterMaxPossibleScoreByDistance(qtibuf, minWinningScore*completeScoreMultiplier) ) {
docIdPtr += 6;
prefiltBestDistMaxPossScoreFail++;
skipToNextDocId = true;
@ -4139,7 +4141,7 @@ void PosdbTable::intersectLists10_r ( ) {
//## the miniMerged* pointers point into..
//##
mergeTermSubListsForDocId(qtibuf, miniMergeBuf, miniMergeBuf+sizeof(miniMergeBuf), miniMergedListStart, miniMergedListEnd, &highestInlinkSiteRank);
mergeTermSubListsForDocId(qtibuf, miniMergeBuf, miniMergeBuf+sizeof(miniMergeBuf), &(miniMergedListStart[0]), &(miniMergedListEnd[0]), &highestInlinkSiteRank);
// clear the counts on this DocIdScore class for this new docid
pdcs = NULL;
@ -4154,22 +4156,19 @@ void PosdbTable::intersectLists10_r ( ) {
if ( !m_q->m_isBoolean ) {
// Used by the various scoring functions called below
m_qpos = qpos;
m_wikiPhraseIds = wikiPhraseIds;
m_quotedStartIds = quotedStartIds;
m_bestMinTermPairWindowScore = -2.0;
//#
//# NON-BODY TERM PAIR SCORING LOOP
//#
createNonBodyTermPairScoreMatrix(miniMergedListStart, miniMergedListEnd, scoreMatrix);
createNonBodyTermPairScoreMatrix(&(miniMergedListStart[0]), &(miniMergedListEnd[0]), &(scoreMatrix[0]));
//#
//# SINGLE TERM SCORE LOOP
//#
minSingleScore = getMinSingleTermScoreSum(miniMergedListStart, miniMergedListEnd, highestScoringNonBodyPos, pdcs);
minSingleScore = getMinSingleTermScoreSum(&(miniMergedListStart[0]), &(miniMergedListEnd[0]), &(highestScoringNonBodyPos[0]), pdcs);
logTrace(g_conf.m_logTracePosdb, "minSingleScore=%f before multiplication for docId %" PRIu64 "", minSingleScore, m_docId);
minSingleScore *= completeScoreMultiplier;
@ -4201,7 +4200,7 @@ void PosdbTable::intersectLists10_r ( ) {
// term positions set ("window") that has the highest minimum score. These
// pointers are used when determining the minimum term pair score returned
// by the function.
float minPairScore = getMinTermPairScoreSlidingWindow(miniMergedListStart, miniMergedListEnd, highestScoringNonBodyPos, winnerStack, xpos, scoreMatrix, pdcs);
float minPairScore = getMinTermPairScoreSlidingWindow(&(miniMergedListStart[0]), &(miniMergedListEnd[0]), &(highestScoringNonBodyPos[0]), &(winnerStack[0]), &(xpos[0]), &(scoreMatrix[0]), pdcs);
logTrace(g_conf.m_logTracePosdb, "minPairScore=%f before multiplication for docId %" PRIu64 "", minPairScore, m_docId);
minPairScore *= completeScoreMultiplier;
@ -4481,6 +4480,14 @@ void PosdbTable::intersectLists10_r ( ) {
m_t1 = t1;
m_t2 = now;
//opportunistic cleanup (memory release)
m_wikiPhraseIds.clear();
m_quotedStartIds.clear();
m_qpos.clear();
m_qtermNums.clear();
m_freqWeights.clear();
m_bflags.clear();
logTrace(g_conf.m_logTracePosdb, "END. Took %" PRId64" msec", m_addListsTime);
}
@ -4781,11 +4788,10 @@ bool PosdbTable::allocateTopTree() {
if(list && !list->isEmpty()) {
if(m_debug) {
log(LOG_INFO, "toptree: adding listsize %" PRId32" to nn2", list->getListSize());
// each new docid in this termlist will compress
// the 6 byte termid out, so reduce by 6.
nn2 += list->getListSize() / (sizeof(posdbkey_t)-6);
}
// each new docid in this termlist will compress
// the 6 byte termid out, so reduce by 6.
nn2 += list->getListSize() / (sizeof(posdbkey_t)-6);
}
}

@ -3,6 +3,7 @@
#include "RdbList.h"
#include "HashTableX.h"
#include <vector>
float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight ( unsigned char densityRank );
@ -115,7 +116,7 @@ class PosdbTable {
void logDebugScoreInfo(int32_t loglevel);
void removeScoreInfoForDeletedDocIds();
bool advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf);
bool prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore);
bool prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf, float minWinningScore);
void mergeTermSubListsForDocId(QueryTermInfo *qtibuf, char *miniMergeBuf, char *miniMergeBufEnd, const char **miniMergedList, const char **miniMergedEnd, int *highestInlinkSiteRank);
void createNonBodyTermPairScoreMatrix(const char **miniMergedList, const char **miniMergedEnd, float *scoreMatrix);
@ -136,12 +137,12 @@ private:
TopTree *m_topTree;
//used during intersection, part of working area
int32_t *m_wikiPhraseIds;
int32_t *m_quotedStartIds;
int32_t *m_qpos;
int32_t *m_qtermNums;
float *m_freqWeights;
char *m_bflags;
std::vector<int32_t> m_wikiPhraseIds;
std::vector<int32_t> m_quotedStartIds;
std::vector<int32_t> m_qpos;
std::vector<int32_t> m_qtermNums;
std::vector<float> m_freqWeights;
std::vector<char> m_bflags;
//used during intersection, simple variables
float m_bestMinTermPairWindowScore; //Best minimum score in a "sliding window"
const char **m_bestMinTermPairWindowPtrs; //Position pointers of best minimum score
@ -201,9 +202,10 @@ private:
bool allocateScoringInfo();
bool setQueryTermInfo();
void intersectLists_real();
public:
// the new intersection/scoring algo
void intersectLists10_r ( );
void intersectLists();
void delNonMatchingDocIdsFromSubLists();
@ -222,7 +224,9 @@ public:
const QueryTermInfo *qtm ) ;
int64_t getTotalHits() const { return m_docIdVoteBuf.length() / 6; }
int32_t getFilteredCount() const { return m_filtered; }
private:
// stuff set in setQueryTermInf() function:
SafeBuf m_qiBuf;
int32_t m_numQueryTermInfos;

@ -158,6 +158,18 @@ bool Robots::parseUserAgent( const char *field, int32_t fieldLen, bool *isUserAg
m_userAgentFound = true;
*isUserAgentPtr = true;
} else {
// try substring match
const char *match = strncasestr(value, m_userAgent, valueLen, m_userAgentLen);
if (match && value < match - 1) {
// we should only match prefix and not suffix (eg: testbot will match testbot-test and not atestbot)
char c = *(match - 1);
if (!isalnum(c)) {
m_userAgentFound = true;
*isUserAgentPtr = true;
}
}
}
}

@ -220,15 +220,76 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
return sb->length();
}
/*
* {
* "elapsedMS" : 0,
* "url": "http://example.com/",
* "status": "getting web page",
* "priority": 15,
* "ufn": 3,
* "firstIp": "127.0.0.1",
* "errCount": 0,
* "urlHash48": 123456789
* "siteInLinks": 0,
* "hops": 0,
* "addedTime: 14000000,
* "pageNumInLinks: 1,
* "parentDocId": 123456789,
* }
*/
int32_t SpiderRequest::printToJSON(SafeBuf *sb, const char *status, XmlDoc *xd, int32_t row) {
if (row != 0) {
sb->safePrintf("\t\t,\n");
}
sb->safePrintf("\t\t{\n");
int32_t SpiderRequest::printToTable ( SafeBuf *sb , const char *status ,
XmlDoc *xd , int32_t row ) {
int64_t elapsedMS = 0;
if (xd) {
elapsedMS = gettimeofdayInMilliseconds() - xd->m_startTime;
}
sb->safePrintf("<tr bgcolor=#%s>\n",LIGHT_BLUE);
sb->safePrintf("\t\t\t\"elapsedMS\": %" PRId64",\n", elapsedMS);
sb->safePrintf("\t\t\t\"url\": \"%s\",\n", m_url);
sb->safePrintf("\t\t\t\"status\": \"%s\",\n", status);
sb->safePrintf("\t\t\t\"priority\": %hhd,\n", m_priority);
sb->safePrintf("\t\t\t\"ufn\": %" PRId16",\n", m_ufn);
char ipbuf[16];
sb->safePrintf("\t\t\t\"firstIp\": \"%s\",\n", iptoa(m_firstIp,ipbuf));
sb->safePrintf("\t\t\t\"errCount\": %hhd,\n", m_errCount);
sb->safePrintf("\t\t\t\"urlHash48\": %" PRId64",\n", getUrlHash48());
sb->safePrintf("\t\t\t\"siteInLinks\": %" PRId32",\n", m_siteNumInlinks);
sb->safePrintf("\t\t\t\"hops\": %" PRId16",\n", m_hopCount);
sb->safePrintf("\t\t\t\"addedTime\": %" PRIu32",\n", m_addedTime);
sb->safePrintf("\t\t\t\"pageNumInLinks\": %" PRIu8",\n", m_pageNumInlinks);
sb->safePrintf("\t\t\t\"parentDocId\": %" PRId64"\n", getParentDocId());
/// @todo ALC add flags to json response
// if ( m_isAddUrl ) sb->safePrintf("ISADDURL ");
// if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX ");
// if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER ");
// if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID ");
// if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT ");
// if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT ");
// if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
// if ( m_isInjecting ) sb->safePrintf("ISINJECTING ");
// if ( m_forceDelete ) sb->safePrintf("FORCEDELETE ");
// if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
sb->safePrintf("\t\t}\n");
return sb->length();
}
int32_t SpiderRequest::printToTable(SafeBuf *sb, const char *status, XmlDoc *xd, int32_t row) {
// show elapsed time
if ( xd ) {
if (xd) {
int64_t now = gettimeofdayInMilliseconds();
int64_t elapsed = now - xd->m_startTime;
sb->safePrintf(" <td>%" PRId32"</td>\n",row);
@ -317,7 +378,7 @@ int32_t SpiderRequest::printTableHeader ( SafeBuf *sb , bool currentlySpidering)
sb->safePrintf(" <td><b>siteInlinks</b></td>\n");
sb->safePrintf(" <td><b>hops</b></td>\n");
sb->safePrintf(" <td><b>addedTime</b></td>\n");
sb->safePrintf(" <td><b>parentIp</b></td>\n");
sb->safePrintf(" <td><b>pageNumInLinks</b></td>\n");
sb->safePrintf(" <td><b>parentDocId</b></td>\n");
sb->safePrintf(" <td><b>flags</b></td>\n");
sb->safePrintf("</tr>\n");
@ -2839,21 +2900,6 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
return msg->safePrintf("Job is initializing.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
"Change your url filters, try "
"adding new urls, or wait for "
"existing urls to be respidered.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Job round completed.");
}
if ( ! g_conf.m_spideringEnabled ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "

@ -29,16 +29,16 @@ class SpiderColl;
// . values for CollectionRec::m_spiderStatus
// . reasons why crawl is not happening
#define SP_INITIALIZING 0
//#define SP_UNUSED 1
//#define SP_UNUSED 2
//#define SP_UNUSED 3
#define SP_ROUNDDONE 4 // spider round is done
#define SP_NOURLS 5 // initializing
#define SP_PAUSED 6 // user paused spider
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
#define SP_INITIALIZING 0
//#define SP_UNUSED_1 1
//#define SP_UNUSED_2 2
//#define SP_UNUSED_3 3
//#define SP_UNUSED_4 4
//#define SP_UNUSED_5 5
#define SP_PAUSED 6 // user paused spider
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
//#define SP_UNUSED_9 9
bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
@ -679,6 +679,7 @@ public:
int32_t print( class SafeBuf *sb );
int32_t printToTable( SafeBuf *sb, const char *status, class XmlDoc *xd, int32_t row ) ;
int32_t printToJSON( SafeBuf *sb, const char *status, class XmlDoc *xd, int32_t row ) ;
static int32_t printTableHeader ( SafeBuf *sb, bool currentlSpidering ) ;

@ -44,7 +44,6 @@ CollectionRec *SpiderColl::getCollectionRec ( ) {
SpiderColl::SpiderColl(CollectionRec *cr) {
m_overflowList = NULL;
m_lastOverflowFirstIp = 0;
m_lastPrinted = 0;
m_deleteMyself = false;
m_isLoading = false;
m_gettingList1 = false;
@ -81,7 +80,6 @@ SpiderColl::SpiderColl(CollectionRec *cr) {
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_lastPrinted = 0;
m_collnum = -1;
m_countingPagesIndexed = false;
m_lastReqUh48a = 0;
@ -1050,28 +1048,6 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
}
}
uint64_t SpiderColl::getNextSpiderTimeFromWaitingTree ( ) {
ScopedLock sl(m_waitingTree.getLock());
// if nothing to scan, bail
if (m_waitingTree.isEmpty_unlocked() ) return 0LL;
// the key
key96_t mink; mink.setMin();
// set node from wait tree key. this way we can resume from a prev key
int32_t node = m_waitingTree.getNextNode_unlocked(0, (char *)&mink);
// if empty, stop
if ( node < 0 ) return 0LL;
// get the key
const key96_t *wk = reinterpret_cast<const key96_t*>(m_waitingTree.getKey_unlocked(node));
// time from that
uint64_t spiderTimeMS = (wk->n1);
spiderTimeMS <<= 32;
spiderTimeMS |= ((wk->n0) >> 32);
// stop if need to wait for this one
return spiderTimeMS;
}
void SpiderColl::gotSpiderdbWaitingTreeListWrapper(void *state, RdbList *list, Msg5 *msg5) {
SpiderColl *THIS = (SpiderColl *)state;
@ -2888,7 +2864,7 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
log(LOG_DEBUG,"spider: removed2 time=%" PRId64" ip=%s from "
"waiting tree. nn=%" PRId32".",
timestamp64, iptoa(firstIp,ipbuf),
m_waitingTree.getNumUsedNodes());
m_waitingTree.getNumUsedNodes_unlocked());
removeFromWaitingTable(firstIp);
return true;
@ -3374,12 +3350,6 @@ void SpiderColl::setPriority(int32_t pri) {
m_msg5StartKey = m_nextDoledbKey;
}
bool SpiderColl::printStats ( SafeBuf &sb ) {
return true;
}
bool SpiderColl::tryToDeleteSpiderColl ( SpiderColl *sc , const char *msg ) {
// if not being deleted return false
if ( ! sc->m_deleteMyself ) return false;

@ -67,7 +67,6 @@ public:
key96_t m_nextKeys[MAX_SPIDER_PRIORITIES];
int64_t m_lastPrintCount;
int64_t m_lastPrinted;
// used by SpiderLoop.cpp
int32_t m_spidersOut;
@ -89,7 +88,6 @@ public:
bool printWaitingTree ( ) ;
bool addToWaitingTree(int32_t firstIp);
uint64_t getNextSpiderTimeFromWaitingTree ( ) ;
void populateDoledbFromWaitingTree ( );
void populateWaitingTreeFromSpiderdb ( bool reentry ) ;
@ -119,13 +117,9 @@ public:
key96_t m_nextDoledbKey;
int32_t m_pri2;
bool gettingSpiderdbList() const { return m_gettingList1; }
// how many outstanding spiders a priority has
int32_t m_outstandingSpiders[MAX_SPIDER_PRIORITIES];
bool printStats ( SafeBuf &sb ) ;
bool isFirstIpInOverflowList ( int32_t firstIp ) ;
private:

@ -65,7 +65,7 @@ TEST_F(RdbListTest, MergeTestPosdbEmptyAll) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(0, final1.getListSize());
@ -96,7 +96,7 @@ TEST_F(RdbListTest, MergeTestPosdbEmptyOne) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(list1.getListSize(), final1.getListSize());
@ -115,7 +115,7 @@ TEST_F(RdbListTest, MergeTestPosdbEmptyOne) {
RdbList final2;
final2.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final2.prepareForMerge(lists2, lists2_size, -1);
final2.merge_r(lists2, lists2_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, true);
final2.merge_r(lists2, lists2_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(list1.getListSize(), final2.getListSize());
@ -156,7 +156,7 @@ TEST_F(RdbListTest, MergeTestPosdbVerifyListOrder) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(0, final1.getListSize());
@ -171,7 +171,7 @@ TEST_F(RdbListTest, MergeTestPosdbVerifyListOrder) {
RdbList final2;
final2.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final2.prepareForMerge(lists2, lists2_size, -1);
final2.merge_r(lists2, lists2_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, true);
final2.merge_r(lists2, lists2_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(list1.getListSize(), final2.getListSize());
@ -213,7 +213,7 @@ TEST_F(RdbListTest, MergeTestPosdbVerifyRemoveNegRecords) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, true, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(0, final1.getListSize());
@ -222,7 +222,7 @@ TEST_F(RdbListTest, MergeTestPosdbVerifyRemoveNegRecords) {
RdbList final2;
final2.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final2.prepareForMerge(lists1, lists1_size, -1);
final2.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, 0, 0, true);
final2.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, 0, 0, false);
// verify merged list
EXPECT_EQ(list2.getListSize(), final2.getListSize());
@ -262,7 +262,7 @@ TEST_F(RdbListTest, MergeTestTitledb) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Titledb::getFixedDataSize(), true, Titledb::getUseHalfKeys(), Titledb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, startKey, endKey, -1, false, RDB_TITLEDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, startKey, endKey, -1, false, RDB_TITLEDB, 0, 0, false);
// verify merged list
int i = 1;
@ -310,7 +310,7 @@ TEST_F(RdbListTest, MergeTestTitledbDelEndKey) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Titledb::getFixedDataSize(), true, Titledb::getUseHalfKeys(), Titledb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, startKey, endKey, -1, false, RDB_TITLEDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, startKey, endKey, -1, false, RDB_TITLEDB, 0, 0, false);
// verify merged list
int i = 1;
@ -358,7 +358,7 @@ TEST_F(RdbListTest, MergeTestTitledbDoubleDelEndKey) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Titledb::getFixedDataSize(), true, Titledb::getUseHalfKeys(), Titledb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, startKey, endKey, -1, false, RDB_TITLEDB, 0, 0, true);
final1.merge_r(lists1, lists1_size, startKey, endKey, -1, false, RDB_TITLEDB, 0, 0, false);
// verify merged list
int i = 1;
@ -454,7 +454,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbSingleDocSpiderSpiderSpider) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
EXPECT_EQ(list3.getListSize(), final1.getListSize());
for (list3.resetListPtr(), final1.resetListPtr(); !final1.isExhausted(); list3.skipCurrentRecord(), final1.skipCurrentRecord()) {
@ -507,7 +507,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbSingleDocSpiderSpiderDelete) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
EXPECT_EQ(list3.getListSize(), final1.getListSize());
for (list3.resetListPtr(), final1.resetListPtr(); !final1.isExhausted(); list3.skipCurrentRecord(), final1.skipCurrentRecord()) {
@ -560,7 +560,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbSingleDocSpiderDeleteSpider) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
EXPECT_EQ(list3.getListSize(), final1.getListSize());
for (list3.resetListPtr(), final1.resetListPtr(); !final1.isExhausted(); list3.skipCurrentRecord(), final1.skipCurrentRecord()) {
@ -617,7 +617,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbSingleDocMergeStartSecondFile) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 1, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 1, false);
EXPECT_EQ(list3.getListSize(), final1.getListSize());
for (list3.resetListPtr(), final1.resetListPtr(); !final1.isExhausted(); list3.skipCurrentRecord(), final1.skipCurrentRecord()) {
@ -683,7 +683,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbMultiDocS1S2N1S2S1N2) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
EXPECT_EQ(list2.getListSize() + list3.getListSize(), final1.getListSize());
@ -752,7 +752,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbMultiDocS1N2N1S2S1N2) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
EXPECT_EQ(list2.getListSize() + list3.getListSize(), final1.getListSize());
@ -823,7 +823,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbMultiDocS1S2D1S2S1N2) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
// first record from list2 is not in output list
list2.resetListPtr();
@ -898,7 +898,7 @@ TEST_F(RdbListNoMergeTest, MergeTestPosdbMultiDocS1S2D1S2S1D2) {
RdbList final1;
final1.set(nullptr, 0, nullptr, 0, Posdb::getFixedDataSize(), true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
final1.prepareForMerge(lists1, lists1_size, -1);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, true);
final1.merge_r(lists1, lists1_size, KEYMIN(), KEYMAX(), -1, false, RDB_POSDB, collNum, 0, false);
EXPECT_EQ(list3.getListSize(), final1.getListSize());
for (list3.resetListPtr(), final1.resetListPtr(); !final1.isExhausted(); list3.skipCurrentRecord(), final1.skipCurrentRecord()) {