mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
better warc injection load balancing
This commit is contained in:
1
Hostdb.h
1
Hostdb.h
@ -211,6 +211,7 @@ class Host {
|
||||
int64_t m_lastPing;
|
||||
|
||||
char m_tmpBuf[4];
|
||||
int16_t m_tmpCount;
|
||||
|
||||
// . first time we sent an unanswered ping request to this host
|
||||
// . used so we can determine when to send an email alert
|
||||
|
@ -124,6 +124,53 @@ Host *getHostToHandleInjection ( char *url ) {
|
||||
Host *group = g_hostdb.getShard ( shardNum );
|
||||
int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
|
||||
Host *host = &group[hostNum];
|
||||
|
||||
bool isWarcInjection = false;
|
||||
int32_t ulen = gbstrlen(url);
|
||||
if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
|
||||
isWarcInjection = true;
|
||||
if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
|
||||
isWarcInjection = true;
|
||||
|
||||
if ( ! isWarcInjection ) return host;
|
||||
|
||||
// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
|
||||
// a msg7 injection request for each doc in the warc/arc file
|
||||
// so let's do load balancing differently for them so one host
|
||||
// doesn't end up doing a bunch of wget/gunzips on warc files
|
||||
// thereby bottlenecking the cluster. get the first hostid that
|
||||
// we have not sent a msg7 injection request to that is still out
|
||||
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
||||
Host *h = g_hostdb.getHost(i);
|
||||
h->m_tmpCount = 0;
|
||||
}
|
||||
for ( UdpSlot *slot = g_udpServer.m_head2 ;
|
||||
slot ;
|
||||
slot = slot->m_next2 ) {
|
||||
// skip if not injection request
|
||||
if ( slot->m_msgType != 0x07 ) continue;
|
||||
//if ( ! slot->m_weInitiated ) continue;
|
||||
// if we did not initiate the injection request, i.e. if
|
||||
// it is to us, skip it
|
||||
if ( ! slot->m_callback ) continue;
|
||||
// who is it from?
|
||||
int32_t hostId = slot->m_hostId;
|
||||
if ( hostId < 0 ) continue;
|
||||
Host *h = g_hostdb.getHost ( hostId );
|
||||
if ( ! h ) continue;
|
||||
h->m_tmpCount++;
|
||||
}
|
||||
int32_t min = 999999;
|
||||
Host *minh = NULL;
|
||||
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
|
||||
Host *h = g_hostdb.getHost(i);
|
||||
if ( h->m_tmpCount == 0 ) return h;
|
||||
if ( h->m_tmpCount >= min ) continue;
|
||||
min = h->m_tmpCount;
|
||||
minh = h;
|
||||
}
|
||||
if ( minh ) return minh;
|
||||
// how can this happen?
|
||||
return host;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user