better warc injection load balancing

This commit is contained in:
Matt
2015-09-15 15:04:26 -06:00
parent aada7d5e51
commit 32d7f5cb97
2 changed files with 48 additions and 0 deletions

@ -211,6 +211,7 @@ class Host {
int64_t m_lastPing;
char m_tmpBuf[4];
int16_t m_tmpCount;
// . first time we sent an unanswered ping request to this host
// . used so we can determine when to send an email alert

@ -124,6 +124,53 @@ Host *getHostToHandleInjection ( char *url ) {
Host *group = g_hostdb.getShard ( shardNum );
int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
Host *host = &group[hostNum];
bool isWarcInjection = false;
int32_t ulen = gbstrlen(url);
if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
isWarcInjection = true;
if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
isWarcInjection = true;
if ( ! isWarcInjection ) return host;
// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
// a msg7 injection request for each doc in the warc/arc file
// so let's do load balancing differently for them so one host
// doesn't end up doing a bunch of wget/gunzips on warc files
// thereby bottlenecking the cluster. get the first hostid that
// we have not sent a msg7 injection request to that is still out
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
h->m_tmpCount = 0;
}
for ( UdpSlot *slot = g_udpServer.m_head2 ;
slot ;
slot = slot->m_next2 ) {
// skip if not injection request
if ( slot->m_msgType != 0x07 ) continue;
//if ( ! slot->m_weInitiated ) continue;
// if we did not initiate the injection request, i.e. if
// it is to us, skip it
if ( ! slot->m_callback ) continue;
// who is it from?
int32_t hostId = slot->m_hostId;
if ( hostId < 0 ) continue;
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) continue;
h->m_tmpCount++;
}
int32_t min = 999999;
Host *minh = NULL;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
if ( h->m_tmpCount == 0 ) return h;
if ( h->m_tmpCount >= min ) continue;
min = h->m_tmpCount;
minh = h;
}
if ( minh ) return minh;
// how can this happen?
return host;
}