From 22aa13e34d533ce1973e2b4db8678ca6013e0efc Mon Sep 17 00:00:00 2001 From: Matt Wells <mwells@trinity.(none)> Date: Sat, 18 Jan 2014 11:09:30 -0800 Subject: [PATCH] do not set indexcode to EFAKEFIRSTIP for INJECTED urls, just added urls. fix add url page to not always use 'main' collection. added reset/restart cmds to spider page. --- PageRoot.cpp | 20 ++++++++++++++++---- Parms.cpp | 38 ++++++++++++++++++++------------------ XmlDoc.cpp | 8 +++++++- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/PageRoot.cpp b/PageRoot.cpp index aba21301..e19a102e 100644 --- a/PageRoot.cpp +++ b/PageRoot.cpp @@ -391,8 +391,15 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) { sb.safePrintf("<br><br>\n"); sb.safePrintf("<form method=get action=/addurl name=f>\n"); + //CollectionRec *cr = g_collectiondb.getRec ( "main" ); - //sb.safePrintf("<input type=hidden name=c value=\"%s\">",cr->m_coll); + + // the collection we want to add the url to + char *coll = r->getString("c"); + if ( ! coll ) coll = ""; + if ( coll ) + sb.safePrintf("<input type=hidden name=c value=\"%s\">",coll); + sb.safePrintf("<input name=u type=text size=60 value=\""); if ( url ) { SafeBuf tmp; @@ -453,11 +460,12 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) { unsigned long long rand64 = gettimeofdayInMillisecondsLocal(); // msg7 needs an explicit collection for /addurl for injecting // in PageInject.cpp. it does not use defaults for safety. - sb.safePrintf("&id=%lu&c=main&rand=%llu';\n" + sb.safePrintf("&id=%lu&c=%s&rand=%llu';\n" "client.open('GET', url );\n" "client.send();\n" "</script>\n" , h32 + , coll , rand64 ); sb.safePrintf("</div>\n"); @@ -1552,6 +1560,8 @@ void doneInjectingWrapper3 ( void *st ) { //CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll ); // collection name + char *coll = st1->m_coll; + if ( ! coll ) coll = ""; //char tt [ 128 ]; //tt[0] = '\0'; @@ -1658,8 +1668,10 @@ void doneInjectingWrapper3 ( void *st ) { unsigned long rand32 = rand(); // in the mime to 0 seconds! sb.safePrintf("<b>Url successfully added. " - "<a href=/search?rand=%lu&q=url%%3A", - rand32); + "<a href=/search?rand=%lu&" + "c=%s&q=url%%3A", + rand32, + coll); sb.urlEncode(url); sb.safePrintf(">Check it</a> or " "<a href=http://www.gigablast.com/seo?u="); diff --git a/Parms.cpp b/Parms.cpp index 013ee45a..30fdc001 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -9271,24 +9271,6 @@ void Parms::init ( ) { m->m_cast = 1; m++; - m->m_title = "reset collection"; - m->m_desc = "reset collection"; - m->m_cgi = "reset"; - m->m_type = TYPE_CMD; - m->m_page = PAGE_NONE; - m->m_func2 = CommandResetColl; - m->m_cast = 1; - m++; - - m->m_title = "restart collection"; - m->m_desc = "restart collection"; - m->m_cgi = "restart"; - m->m_type = TYPE_CMD; - m->m_page = PAGE_NONE; - m->m_func2 = CommandRestartColl; - m->m_cast = 1; - m++; - m->m_title = "in sync"; m->m_desc = "signify in sync with host 0"; m->m_cgi = "insync"; @@ -9321,6 +9303,26 @@ void Parms::init ( ) { m->m_def = "1"; m++; + m->m_title = "reset collection"; + m->m_desc = "Remove all documents from the collection and turn " + "spiders off."; + m->m_cgi = "reset"; + m->m_type = TYPE_CMD; + m->m_page = PAGE_SPIDER; + m->m_func2 = CommandResetColl; + m->m_cast = 1; + m++; + + m->m_title = "restart collection"; + m->m_desc = "Remove all documents from the collection and start " + "spidering over again."; + m->m_cgi = "restart"; + m->m_type = TYPE_CMD; + m->m_page = PAGE_SPIDER; + m->m_func2 = CommandRestartColl; + m->m_cast = 1; + m++; + /* m->m_title = "new spidering enabled"; m->m_desc = "When enabled the spider adds NEW " diff --git a/XmlDoc.cpp b/XmlDoc.cpp index aaa2f70b..20fbd5da 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -2106,7 +2106,13 @@ bool XmlDoc::indexDoc2 ( ) { // do this before we increment pageDownloadAttempts below so that // john's smoke tests, which use those counts, are not affected - if ( m_oldsrValid && m_oldsr.m_fakeFirstIp && + if ( m_oldsrValid && + m_oldsr.m_fakeFirstIp && + // only do for add url, not for injects. injects expect + // the doc to be indexed while the browser waits. add url + // is really just adding the spider request and returning + // to the browser without delay. + ! m_oldsr.m_isInjecting && // diffbot requests are ok though! ! strstr(m_oldsr.m_url,"-diffbotxyz") ) { m_indexCodeValid = true;