Move document indexing from main thread to a job

XmlDoc::indexDoc/indexDoc2/getMetaList() was being called from a callback in the main thread. Move that work to a thread. Due to the multiple async calls done by getMetaList() there will be some overhead from this, but at least the main thread won't be clogged when encountering 5MB+ documents anymore.
This commit is contained in:
Ivan Skytte Jørgensen
2017-03-14 15:23:15 +01:00
parent abfa828dbd
commit 4af955e174
5 changed files with 39 additions and 20 deletions

@ -409,6 +409,7 @@ bool JobScheduler_impl::submit(thread_type_t thread_type, JobEntry &e)
case thread_type_spider_write: job_queue = &cpu_job_queue; break;
case thread_type_spider_filter: job_queue = &external_job_queue; break;
case thread_type_spider_query: job_queue = &cpu_job_queue; break;
case thread_type_spider_index: job_queue = &cpu_job_queue; break;
case thread_type_merge_filter: job_queue = &merge_job_queue; break;
case thread_type_replicate_write: job_queue = &cpu_job_queue; break;
case thread_type_replicate_read: job_queue = &cpu_job_queue; break;

@ -32,6 +32,7 @@ enum thread_type_t {
thread_type_spider_write,
thread_type_spider_filter, //pdf2html/doc2html/...
thread_type_spider_query, //?
thread_type_spider_index,
thread_type_merge_filter,
thread_type_replicate_write,
thread_type_replicate_read,

@ -18,6 +18,7 @@ static const char *thread_type_name(thread_type_t tt) {
case thread_type_spider_write: return "spider-write";
case thread_type_spider_filter: return "spider-filter";
case thread_type_spider_query: return "spider-query";
case thread_type_spider_index: return "spider-index";
case thread_type_merge_filter: return "merge-filter";
case thread_type_replicate_write: return "replicate-write";
case thread_type_replicate_read: return "replicate-read";

@ -91,6 +91,7 @@ XmlDoc::XmlDoc() {
void *pend = &m_VALIDEND;
memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p
m_msg22Request.m_inUse = 0;
m_indexedDoc = false;
m_msg4Waiting = false;
m_msg4Launched = false;
m_dupTrPtr = NULL;
@ -148,6 +149,7 @@ void XmlDoc::reset ( ) {
m_loaded = false;
m_indexedDoc = false;
m_msg4Launched = false;
//m_downloadAttempted = false;
@ -1228,6 +1230,31 @@ void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) {
static void indexDoc3(void *state) {
XmlDoc *that = reinterpret_cast<XmlDoc*>(state);
logTrace( g_conf.m_logTraceXmlDoc, "Calling XmlDoc::indexDoc" );
// return if it blocked
if (!that->indexDoc()) {
logTrace(g_conf.m_logTraceXmlDoc, "END, indexDoc blocked");
return;
}
// otherwise, all done, call the caller callback
that->m_indexedDoc = true;
logTrace(g_conf.m_logTraceXmlDoc, "END");
}
static void indexedDoc3(void *state, job_exit_t exit_type) {
XmlDoc *that = reinterpret_cast<XmlDoc*>(state);
if(that->m_indexedDoc) {
logTrace(g_conf.m_logTraceXmlDoc, "Calling callback");
that->callCallback();
}
}
static void indexDocWrapper ( void *state ) {
logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );
@ -1237,31 +1264,18 @@ static void indexDocWrapper ( void *state ) {
// note it
THIS->setStatus ( "in index doc wrapper" );
logTrace( g_conf.m_logTraceXmlDoc, "Calling XmlDoc::indexDoc" );
// return if it blocked
if ( ! THIS->indexDoc( ) )
{
logTrace( g_conf.m_logTraceXmlDoc, "END, indexDoc blocked" );
//shovel this off to a thread
if(g_jobScheduler.submit(&indexDoc3,indexedDoc3,THIS,thread_type_spider_index,THIS->m_niceness)) {
//excellent
logTrace( g_conf.m_logTraceXmlDoc, "END, queued for thread" );
return;
}
// otherwise, all done, call the caller callback
// g_statsdb.addStat ( MAX_NICENESS,
// "docs_indexed",
// 20,
// 21,
// );
logTrace( g_conf.m_logTraceXmlDoc, "Calling callback" );
THIS->callCallback();
logTrace( g_conf.m_logTraceXmlDoc, "END" );
//threads not available (or oom or similar)
indexDoc3(THIS);
}
// . the highest level function in here
// . user is requesting to inject this url
// . returns false if blocked and your callback will be called when done

@ -1116,6 +1116,8 @@ public:
bool m_freed;
bool m_indexedDoc; //indexDoc() performed completely
bool m_msg4Waiting;
bool m_msg4Launched;