mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-12 02:26:07 -04:00
Added `isfakeip` URL filter expression to help
speed up bulk jobs.
This commit is contained in:
@ -3594,8 +3594,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_spiderIpWaits [i] = wait;
|
||||
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
|
||||
// ethan wants some speed
|
||||
if ( isEthan )
|
||||
m_spiderIpMaxSpiders[i] = 30;
|
||||
// if ( isEthan )
|
||||
// m_spiderIpMaxSpiders[i] = 30;
|
||||
//m_spidersEnabled [i] = 1;
|
||||
m_spiderFreqs [i] = respiderFreq;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
@ -3618,6 +3618,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
|
||||
// de-prioritize fakefirstip urls so we don't give the impression our
|
||||
// spiders are slow. e.g. if someone adds a bulk job with 100,000 urls
|
||||
// we would otherwise sit there looking up their ips and adding a real
|
||||
// spider request (if it falls onto the same shard) before we actually
|
||||
// do any real spidering. so keep the priority here low.
|
||||
m_regExs[i].set("isfakeip");
|
||||
m_maxSpidersPerRule [i] = 7;
|
||||
m_spiderIpMaxSpiders [i] = 7;
|
||||
m_spiderPriorities [i] = 20;
|
||||
m_spiderIpWaits [i] = 0;
|
||||
i++;
|
||||
|
||||
// hopcount filter if asked for
|
||||
if( m_diffbotMaxHops >= 0 ) {
|
||||
|
||||
|
14
RdbMap.cpp
14
RdbMap.cpp
@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
|
||||
KEYSET(lastKey,k,m_ks); continue; }
|
||||
// just bitch for now
|
||||
log(
|
||||
"db: Key out of order in map file %s%s. "
|
||||
"page = %"INT32". key offset = %"INT64". Map or data file is "
|
||||
"db: Key out of order in map file %s/%s. "
|
||||
"page = %"INT32". key offset = %"INT64". "
|
||||
"Map or data file is "
|
||||
"corrupt, but it is probably the data file. Please "
|
||||
"delete the map file and restart.",
|
||||
m_file.m_dir,m_file.getFilename() ,
|
||||
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
|
||||
KEY1(lastKey,m_ks),KEY0(lastKey));
|
||||
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
|
||||
log("db: m_numPages = %"INT32"",m_numPages);
|
||||
|
||||
SafeBuf cmd;
|
||||
cmd.safePrintf("mv %s/%s %s/trash/",
|
||||
m_file.m_dir,
|
||||
m_file.getFilename(),
|
||||
g_hostdb.m_dir);
|
||||
log("db: %s",cmd.getBufStart() );
|
||||
gbsystem ( cmd.getBufStart() );
|
||||
|
||||
exit(0);
|
||||
//char *xx=NULL;*xx=0;
|
||||
// was k too small?
|
||||
|
12
Spider.cpp
12
Spider.cpp
@ -11636,6 +11636,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// if no match continue
|
||||
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
|
||||
p += 8;
|
||||
p = strstr(p, "&&");
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
|
Reference in New Issue
Block a user