Fix load balance of msg22s to use the udp slots in pinginfo.

Fix SIGCHLD interrupting popen: when pdftohtml segfaulted,
popen would hang forever.
Fix another bug where the content length in the HTTP header was off by one.
This commit is contained in:
Zak Betz
2015-11-03 11:51:19 -07:00
parent ff6caf79a2
commit baa817b51d
6 changed files with 63 additions and 22 deletions

@ -1649,9 +1649,11 @@ Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum ) {
for(int32_t i = 0; i < m_numHostsPerShard; i++) {
Host *hh = &shard[i];
if(isDead(hh)) continue;
if(hh->m_numOutstandingRequests > minOutstandingRequests) continue;
// log("host %"INT32 " numOutstanding is %"INT32, hh->m_hostId,
// hh->m_pingInfo.m_udpSlotsInUseIncoming);
if(hh->m_pingInfo.m_udpSlotsInUseIncoming > minOutstandingRequests) continue;
minOutstandingRequests = hh->m_numOutstandingRequests;
minOutstandingRequests = hh->m_pingInfo.m_udpSlotsInUseIncoming;
minOutstandingRequestsIndex = i;
}
if(minOutstandingRequestsIndex == -1) return shard;

@ -2708,6 +2708,32 @@ void Loop::enableTimer() {
}
// Runs cmd through popen() in read mode ("r") and returns the stream that
// popen() hands back. The result is returned unchecked, so it is NULL when
// popen() fails and the caller must test for that before using it.
//
// The loop timer is disabled and all signals are blocked for the duration
// of the popen() call, then both are restored before returning.
FILE* gbpopen(char* cmd) {
	// Block everything from interrupting this system call because
	// if there is an alarm or a child thread crashes (pdftohtml)
	// then this will hang forever.
	// We should actually write our own popen so that we do
	// fork, close all fds in the child, then exec.
	// These child processes can hold open the http server and
	// prevent a new gb from running even after it has died.
	g_loop.disableTimer();
	sigset_t oldSigs;
	sigset_t sigs;
	sigfillset ( &sigs );
	// Track whether the mask was really changed. If blocking fails,
	// oldSigs is never filled in and restoring from it would install
	// an indeterminate signal mask.
	bool blocked = true;
	if ( sigprocmask ( SIG_BLOCK , &sigs, &oldSigs ) < 0 ) {
		log("build: had error blocking signals for popen");
		blocked = false;
	}
	FILE* fh = popen(cmd, "r");
	// only put the old mask back if we actually saved one
	if ( blocked && sigprocmask ( SIG_SETMASK , &oldSigs, NULL ) < 0 ) {
		log("build: had error unblocking signals for popen");
	}
	g_loop.enableTimer();
	return fh;
}
//calling with a 0 niceness will turn off the timer interrupt

4
Loop.h

@ -18,7 +18,9 @@
#define QUERYPRIORITYWEIGHT 16
#define QUICKPOLL_INTERVAL 10
int gbsystem(char *cmd ) ;
int gbsystem(char *cmd);
FILE* gbpopen(char* cmd);
#define sleep(a) { char *xx=NULL;*xx=0; }
//#define sleep(a) logf(LOG_INFO,"sleep: sleep");

@ -607,6 +607,11 @@ loop:
// debug msg
//log("Multicast:: no hosts left to send to");
g_errno = ENOHOSTS; return false; }
// log("build: msg %x sent to host %"INT32 " first hostId is %"INT32
// " oustanding msgs %"INT32,
// m_msgType, i, firstHostId, m_hostPtrs[i]->m_numOutstandingRequests);
// . send to this guy, if we haven't yet
// . returns false and sets g_errno on error
// . if it returns true, we sent ok, so we should return true

@ -2803,6 +2803,7 @@ bool Msg8a::launchGetRequests ( ) {
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNum ( m_rdbId , &startKey );//, true );
int32_t firstHostId = g_hostdb.getLeastLoadedInShard ( shardNum )->m_hostId;
Host *group = g_hostdb.getShard ( shardNum );
//int32_t numTwins = g_hostdb.getNumHostsPerShard();
@ -2837,7 +2838,7 @@ bool Msg8a::launchGetRequests ( ) {
true , // error correction?
true , // include tree?
true , // doMerge?
-1 , // firstHostId
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles
3600*24*365 );// timeout

@ -1301,23 +1301,29 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// we also index "amp" which is bad.
m_content = utf8Content;
if ( m_mimeValid && m_mime.m_contentLen > 0) {
m_contentLen = m_mime.m_contentLen;
// Hack to prevent core, FIXME... the content length in
// the header is wrong???
if(payloadLen != -1) {
m_contentLen = payloadLen;
} else {
m_contentLen = m_mime.m_contentLen;
}
} else {
m_contentLen = gbstrlen(utf8Content);
}
if(payloadLen != -1 && m_contentLen > payloadLen) {
// When injecting a doc from a warc sometimes the doc is truncated
// and the content len http header is wrong, so we make sure that
// we don't have trailing garbage off the end of the doc by doing
// m_contentLen = min(m_mime.contentLen, payloadLen)
m_contentLen = payloadLen;
if(m_contentLen > 0 && m_content[m_contentLen-1] == '\0') {
m_contentLen--;
}
}
log("build:payloadlen %"INT32 " contentLen %"INT32 " headerlen %"INT64,
payloadLen, m_contentLen, m_mime.getContent() - utf8ContentArg);
// if(payloadLen != -1 && m_contentLen > payloadLen) {
// // When injecting a doc from a warc sometimes the doc is truncated
// // and the content len http header is wrong, so we make sure that
// // we don't have trailing garbage off the end of the doc by doing
// // m_contentLen = min(m_mime.contentLen, payloadLen)
// m_contentLen = payloadLen;
// if(m_contentLen > 0 && m_content[m_contentLen-1] == '\0') {
// m_contentLen--;
// }
// }
// log("build:payloadlen %"INT32 " contentLen %"INT32 " headerlen %"INT64,
// payloadLen, m_contentLen, m_mime.getContent() - utf8ContentArg);
m_contentValid = true;
@ -1333,7 +1339,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
//m_utf8ContentValid = true;
m_contentInjected = true;
m_wasContentInjected = true;
m_wasContentInjected = true;
m_contentType = contentType;
m_contentTypeValid = true;
// use this ip as well for now to avoid ip lookup
@ -19689,9 +19695,8 @@ FILE *XmlDoc::getUtf8ContentInFile () {
log("build: wget: %s",cmd );
g_loop.disableTimer();
FILE* fh = popen(cmd, "r");
g_loop.enableTimer();
FILE* fh = gbpopen(cmd);
int fd = fileno(fh);
int flags = fcntl(fd, F_GETFL, 0);
if(fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
@ -29866,7 +29871,7 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
bool XmlDoc::hashMetaData ( HashTableX *tt ) {
if ( ! ptr_metadata ) return true;
if ( ! ptr_metadata || !ptr_metadata[0] ) return true;
Json jp;