added isfakeip url filter expression to help

speed up bulk jobs
This commit is contained in:
Matt Wells
2015-06-17 13:59:13 -07:00
parent 43130f3a8d
commit b8049aae58
3 changed files with 38 additions and 4 deletions

@ -3594,8 +3594,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;
// if ( isEthan )
// m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] = respiderFreq;
//m_spiderDiffbotApiUrl[i].purge();
@ -3618,6 +3618,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_forceDelete [i] = 1;
i++;
// De-prioritize fakefirstip urls so we don't give the impression that our
// spiders are slow. For example, if someone adds a bulk job with 100,000
// urls, we would otherwise sit there looking up their ips and adding a real
// spider request (if it falls onto the same shard) before we actually
// do any real spidering. So keep the priority here low.
m_regExs[i].set("isfakeip");
m_maxSpidersPerRule [i] = 7;
m_spiderIpMaxSpiders [i] = 7;
m_spiderPriorities [i] = 20;
m_spiderIpWaits [i] = 0;
i++;
// hopcount filter if asked for
if( m_diffbotMaxHops >= 0 ) {

@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
KEYSET(lastKey,k,m_ks); continue; }
// just bitch for now
log(
"db: Key out of order in map file %s%s. "
"page = %"INT32". key offset = %"INT64". Map or data file is "
"db: Key out of order in map file %s/%s. "
"page = %"INT32". key offset = %"INT64". "
"Map or data file is "
"corrupt, but it is probably the data file. Please "
"delete the map file and restart.",
m_file.m_dir,m_file.getFilename() ,
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
KEY1(lastKey,m_ks),KEY0(lastKey));
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
log("db: m_numPages = %"INT32"",m_numPages);
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
m_file.m_dir,
m_file.getFilename(),
g_hostdb.m_dir);
log("db: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
exit(0);
//char *xx=NULL;*xx=0;
// was k too small?

@ -11636,6 +11636,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
p += 8;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;