code checkpoint

mwells 2013-10-14 17:19:30 -06:00
parent a562c65627
commit c19310cb7e

@@ -1919,6 +1919,48 @@ static class HelpItem s_his[] = {
{NULL,NULL}
};
// get the input string from the httprequest or the json post
char *getInputString ( char *string , HttpRequest *hr , Json *JS ) {
// try to get it from http request
char *val = hr->getString(string);
// if token in json post, use that
if ( ! val ) {
JsonItem *ji = JS->getItem(string);
if ( ji ) val = ji->getValue();
}
return val;
}
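// a minimal usage sketch (hypothetical variable names/values): the cgi
// parm wins, and the json post is only consulted when the cgi parm is
// absent:
//
//   char *tok = getInputString ( "token" , hr , &JS );
//   if ( ! tok ) log("crawlbot: no token in cgi or json post");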
char *getSpecifiedCollName ( HttpRequest *hr , Json *JS ) {
char *token = getInputString("token",hr,JS);
if ( ! token ) {
log("crawlbot: no token supplied");
return NULL;
}
if ( gbstrlen(token) > 32 ) {
log("crawlbot: token is over 32 chars.");
return NULL;
}
// get collection name from "name" in json or "&name=" in cgi
char *name = getInputString("name",hr,JS);
if ( ! name ) {
log("crawlbot: no name supplied");
return NULL;
}
if ( gbstrlen(name) > 30 ) {
log("crawlbot: collection name is over 32 chars.");
return NULL;
}
// make the collection name so it includes the token
static char s_collName[MAX_COLL_LEN+1];
// sanity
if ( MAX_COLL_LEN < 64 ) { char *xx=NULL;*xx=0; }
// make a compound name for collection of token and name
sprintf(s_collName,"%s-%s",token,name);
// return the compound name (points into a static buffer)
return s_collName;
}
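// quick illustration (hypothetical values): token "abc123" and name
// "news" compound into collName "abc123-news". with a 32-char token,
// the dash and a 30-char name the result is at most 63 chars, so it
// always fits when MAX_COLL_LEN >= 64, which the sanity check enforces.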
// . when we receive the request from john we call broadcastRequest() from
// Pages.cpp. then msg28 sends this request with a &cast=0 appended to it
@@ -1927,12 +1969,19 @@ static class HelpItem s_his[] = {
// . so if no &cast is present we are the original!!!
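// a rough sketch of that flow (hypothetical hosts and parms):
//   browser -> host0 : GET /crawlbot?token=abc123         (no &cast, original)
//   host0   -> hostN : GET /crawlbot?token=abc123&cast=0  (secondary copies)
// only the original (cast defaults to 1 below) broadcasts and sends
// the final reply; the cast=0 copies just apply the request and return.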
bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// Msg28::massConfig() puts a &cast=0 on the secondary requests sent to
// each host in the network
// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
// requests to every host in the network unless a cast=0 is
// explicitly given
// . Msg28::massConfig() puts a &cast=0 on the secondary requests
// sent to each host in the network
long cast = hr->getLong("cast",1);
// if no token... they need to login or signup
char *token = getTokenFromHttpRequest ( hr );
// httpserver/httprequest should not try to decode post if
// it's application/json.
char *json = hr->getPOST();
Json JS;
if ( json ) JS.parseJsonStringIntoJsonItems ( json );
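// e.g. a hypothetical application/json post body that would parse here:
//   { "token":"abc123", "name":"news", "deleteCrawl":1 }
// once parsed into JS, getInputString() above can fall back to these
// values when the matching cgi parm was not supplied.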
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
@@ -1942,15 +1991,90 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// give john a json api
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
// if we got json as input, give it as output
if ( JS.getFirstItem() ) fmt = FMT_JSON;
// token is always required. get from json or html form input
char *token = getInputString ( "token" , hr , &JS );
if ( ! token && ( cast == 0 || fmt == FMT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
if ( ! token ) {
// print token form if html
SafeBuf sb;
sb.safePrintf("In order to use crawlbot you must "
"first LOGIN:"
"<form action=/crawlbot method=get>"
"<br>"
"<input type=text name=token size=50>"
"<input type=submit name=submit value=OK>"
"</form>"
"<br>"
"<b>- OR -</b>"
"<br> SIGN UP"
"<form action=/crawlbot method=get>"
"Name: <input type=text name=name size=50>"
"<br>"
"Email: <input type=text name=email size=50>"
"<br>"
"<input type=submit name=submit value=OK>"
"</form>"
"</body>"
"</html>");
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}
// . they must supply the token AND the NAME of the crawl
// . we create the collection name like %s-%s,token,name
char *collName = getSpecifiedCollName ( hr , &JS );
// return if they gave no name and we couldn't make an official
// collection name from the provided input
if ( ! collName ) {
log("crawlbot: no crawl name specified.");
char *msg = "invalid or missing \"name\"";
return sendErrorReply2 (socket,fmt,msg);
}
// if they did not specify the token/name of an existing collection
// then cr will be NULL and we'll add it below
CollectionRec *cr = g_collectiondb.getRec(collName);
// if no token... they need to login or signup
//char *token = getTokenFromHttpRequest ( hr );
// get coll name if any
char *c = hr->getString("c");
if ( ! c ) c = hr->getString("id");
//char *c = hr->getString("c");
//if ( ! c ) c = hr->getString("id");
// get some other parms provided optionally
char *addColl = hr->getString("addcoll");
char *delColl = hr->getString("delcoll");
char *resetColl = hr->getString("resetcoll");
//char *addColl = hr->getString("addcoll");
bool delColl = hr->hasParm("deleteCrawl");
bool resetColl = hr->hasParm("resetCrawl");
// try json
//if ( JS.getInputString("addNewCrawl") ) addColl = collName;
if ( JS.getInputString("deleteCrawl") ) delColl = true;
if ( JS.getInputString("resetCrawl") ) resetColl = true;
if ( delColl && ! cr ) {
log("crawlbot: no collection found to delete.");
char *msg = "Could not find crawl to delete.";
return sendErrorReply2 (socket,fmt,msg);
}
if ( resetColl && ! cr ) {
log("crawlbot: no collection found to reset.");
char *msg = "Could not find crawl to reset.";
return sendErrorReply2 (socket,fmt,msg);
}
// . if this is a cast=0 request it is received by all hosts in the
// network
@@ -1963,11 +2087,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// hopefully it will still be set
// . but we should take care of add/del/reset coll here.
if ( cast == 0 ) {
// each host should return right away if token not given
if ( ! token ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
// . we can't sync these operations on a dead host when it
// comes back up yet. we can only sync parms, not collection
// adds/deletes/resets
@@ -1988,98 +2107,47 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// return sendErrorReply2(socket,fmt,msg);
// }
//}
if ( delColl ) {
// delete collection name
g_collectiondb.deleteRec ( delColl , true );
g_collectiondb.deleteRec ( collName , true );
// all done
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
CollectionRec *cr = NULL;
if ( addColl ) {
// name of new collection is the "c" parm
cr = addNewDiffbotColl ( addColl , hr );
}
else if ( resetColl ) {
cr = g_collectiondb.getRec ( resetColl );
g_collectiondb.resetColl ( resetColl );
if ( resetColl ) {
//cr = g_collectiondb.getRec ( resetColl );
g_collectiondb.resetColl ( cr->m_coll );
// if reset from crawlbot api page then enable spiders
// to avoid user confusion
if ( cr ) cr->m_spideringEnabled = 1;
}
// get it from the "c" parm otherwise. just for display
// or modifying parms.
else
cr = g_collectiondb.getRec ( c );
// add a new collection by default
if ( ! cr ) {
cr = addNewDiffbotColl ( collName , hr );
// problem?
if ( ! cr ) {
// send back error
char *msg = "Error. No collection identified.";
char *msg = "Collection add failed");
// log it
log("crawlbot: %s",msg);
// make sure this returns in json if required
return sendErrorReply2(socket,fmt,msg);
}
/*
// alias must be unique!
char *alias = hr->getString("alias");
if ( alias && ! isAliasUnique(cr,token,alias) ) {
char *msg = "alias is not unqiue";
return sendErrorReply2 (socket,fmt,msg);
}
if ( alias ) {
cr->m_collectionNameAlias.set(alias);
cr->m_collectionNameAlias.nullTerm();
}
*/
// update the url filters for now since that is complicated
// supply "cr" directly since "c" may not be in the http
// request if addcoll=xxxxxx (just created a new rec)
long page = PAGE_FILTERS;
WebPage *pg = g_pages.getPage ( page ) ;
g_parms.setFromRequest ( hr , socket , pg->m_function, cr );
//
// set other diffbot parms for this collection
//
long long maxToCrawl = hr->getLongLong("maxtocrawl",-1LL);
long long maxToProcess = hr->getLongLong("maxtoprocess",-1LL);
if ( maxToCrawl != -1 ) {
cr->m_diffbotMaxToCrawl = maxToCrawl;
cr->m_needsSave = 1;
}
if ( maxToProcess != -1 ) {
cr->m_diffbotMaxToProcess = maxToProcess;
cr->m_needsSave = 1;
}
char *email = hr->getString("notifyemail",NULL,NULL);
if ( email ) {
cr->m_notifyEmail.set(email);
cr->m_notifyEmail.nullTerm();
}
char *url = hr->getString("notifyurl",NULL,NULL);
if ( url ) {
cr->m_notifyUrl.set(url);
cr->m_notifyUrl.nullTerm();
}
long pause = hr->getLong("pause",-1);
if ( pause == 0 ) cr->m_spideringEnabled = 1;
if ( pause == 1 ) cr->m_spideringEnabled = 0;
long urt = hr->getLong("urt",-1);
if ( urt != -1 ) cr->m_useRobotsTxt = urt;
char *ppp = hr->getString("pageprocesspattern",NULL);
if ( ppp ) {
cr->m_diffbotPageProcessPattern.set(ppp);
cr->m_diffbotPageProcessPattern.nullTerm();
}
// this will set the collection parms from json
setSpiderParmsFromJSONPost ( socket , hr , cr );
// also support the good 'ole html form interface
setSpiderParmsFromHtmlRequest ( socket , hr , cr );
// this is a cast, so just return simple response
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
/////////
//
// after all hosts have replied to the request, we finally send the
// request here, with no &cast=0 appended to it. so this is where we
// send the final reply back to the browser
//
/////////
// print help
long help = hr->getLong("help",0);
@@ -2112,88 +2180,25 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
0); // cachetime
}
//
// after all hosts have replied to the request, we finally send the
// request here, with no &cast=0 appended to it. so this is where we
// send the final reply back to the sender
//
if ( ! token ) {
// send back json error msg
if ( fmt == FMT_JSON ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
// print token form if html
SafeBuf sb;
sb.safePrintf("In order to use crawlbot you must "
"first LOGIN:"
"<form action=/crawlbot method=get>"
"<br>"
"<input type=text name=token size=50>"
"<input type=submit name=submit value=OK>"
"</form>"
"<br>"
"<b>- OR -</b>"
"<br> SIGN UP"
"<form action=/crawlbot method=get>"
"Name: <input type=text name=name size=50>"
"<br>"
"Email: <input type=text name=email size=50>"
"<br>"
"<input type=submit name=submit value=OK>"
"</form>"
"</body>"
"</html>");
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}
// get collection name if any was specified
char *coll = hr->getString("c",NULL,NULL);
if ( ! coll ) coll = hr->getString("id",NULL,NULL);
if ( ! coll ) coll = addColl;
if ( ! coll ) coll = resetColl;
//if ( ! coll ) coll = delColl;
// collectionrec must be non-null at this point. i.e. we added it
if ( ! cr )
return sendErrorReply2(socket,fmt,"no collection found");
char *urlData = hr->getString("urldata",NULL,NULL);
char *injectUrl = hr->getString("injecturl",NULL,NULL);
// we need a name!!
if ( ( injectUrl || urlData ) && ! coll )
return sendErrorReply2(socket,fmt,"no coll name specified");
//
// use a default collname if it was not specified and we are not
// doing an inject or url upload
//
for ( long i = 0 ; ! coll && i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
if ( strcmp ( cx->m_diffbotToken.getBufStart(),token) )
continue;
// got it
coll = cx->m_coll;
}
// and rec
CollectionRec *cr = NULL;
if ( coll ) cr = g_collectiondb.getRec ( coll );
if ( ! cr && delColl )
return sendReply2 (socket,fmt,"OK");
if ( ! cr )
return sendErrorReply2(socket,fmt,"no collection found");
//for ( long i = 0 ; ! coll && i < g_collectiondb.m_numRecs ; i++ ) {
// CollectionRec *cx = g_collectiondb.m_recs[i];
// if ( ! cx ) continue;
// if ( strcmp ( cx->m_diffbotToken.getBufStart(),token) )
// continue;
// // got it
// coll = cx->m_coll;
//}
// make a new state
StateCD *st;
@@ -3275,15 +3280,23 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
*/
}
CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
char *token = getTokenFromHttpRequest ( hr );
CollectionRec *addNewDiffbotColl ( char *myName , char *token ) {
//char *token = getTokenFromHttpRequest ( hr );
if ( ! token ) {
log("crawlbot: need token to add new coll");
return NULL;
}
if ( gbstrlen(myName) + 1 + gbstrlen(token) > MAX_COLL_LEN ) {
log("crawlbot: token or collection name too long");
return NULL;
}
// make the new collection name
char addColl[MAX_COLL_LEN+1];
sprintf(addColl,"%s-%s",token,myName);
// this saves it to disk!
if ( ! g_collectiondb.addRec ( addColl ,
NULL , // copy from
@@ -3301,6 +3314,8 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
// did an alloc fail?
if ( ! cr ) { char *xx=NULL;*xx=0; }
log("crawlbot: added new collection \"%s\"", addColl );
// normalize the seed url
//Url norm;
//norm.set ( seed );
@@ -3348,6 +3363,7 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_diffbotOnlyProcessIfNew = true;
// just the basics on these for now
resetUrlFilters ( cr );
// default respider to off
@@ -3366,71 +3382,9 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_replies = 0;
cr->m_requests = 0;
// support current web page api i guess for test crawls
//cr->m_isDiffbotTestCrawl = false;
//char *strange = hr->getString("href",NULL);
//if ( strange && strcmp ( strange,"/dev/crawl#testCrawl" ) == 0 )
// cr->m_isDiffbotTestCrawl = true;
/*
///////
//
// extra diffbot ARTICLE parms
//
///////
// . ppl mostly use meta, html and tags.
// . dropping support for dontStripAds. mike is ok with that.
// . use for jsonp requests. needed for cross-domain ajax.
//char *callback = hr->getString("callback",NULL);
// a download timeout
//long timeout = hr->getLong("timeout",5000);
// "xml" or "json"
char *format = hr->getString("format",NULL,"json");
// save that
cr->m_diffbotFormat.safeStrcpy(format);
*/
// return all content from page? for frontpage api.
// TODO: can we put "all" into "fields="?
//bool all = hr->hasField("all");
/*
/////////
//
// specify diffbot fields to return in the json output
//
/////////
// point to the safebuf that holds the fields the user wants to
// extract from each url. comma separated list of supported diffbot
// fields like "meta","tags", ...
SafeBuf *f = &cr->m_diffbotFields;
// transcribe provided fields if any
char *fields = hr->getString("fields",NULL);
// appends those to our field buf
if ( fields ) f->safeStrcpy(fields);
// if something there push a comma in case we add more below
if ( f->length() ) f->pushChar(',');
// return contents of the page's meta tags? twitter card metadata, ..
if ( hr->hasField("meta" ) ) f->safeStrcpy("meta,");
if ( hr->hasField("html" ) ) f->safeStrcpy("html,");
if ( hr->hasField("tags" ) ) f->safeStrcpy("tags,");
if ( hr->hasField("comments") ) f->safeStrcpy("comments,");
if ( hr->hasField("summary" ) ) f->safeStrcpy("summary,");
if ( hr->hasField("all" ) ) f->safeStrcpy("all,");
// if we added crap to "fields" safebuf remove trailing comma
f->removeLastChar(',');
*/
// set some defaults. max spiders for all priorities in this collection
cr->m_maxNumSpiders = 10;
//cr->m_spiderPriorities [1] = -1; // filtered? or banned?
//cr->m_maxSpidersPerRule [1] = 10;
//cr->m_spiderIpWaits [1] = 500; // 500 ms for now
cr->m_needsSave = 1;
// start the spiders!
@@ -3540,7 +3494,9 @@ bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) {
// json can be provided via get or post but content type must be
// url-encoded so we can test with a simple html form page.
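// e.g. a hypothetical url-encoded test request for this entry point:
//   GET /crawlbot?json={"token":"abc123","name":"news"}
// (the json value being url-escaped by the html form or browser)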
bool setSpiderParmsFromJSONPost ( TcpSocket *socket , HttpRequest *hr ) {
bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
HttpRequest *hr ,
CollectionRec *cr ) {
// get the json
char *json = hr->getString("json");
@@ -3558,45 +3514,6 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket , HttpRequest *hr ) {
return sendReply2 ( socket, FMT_JSON,
"Error with JSON parser.");
// get collection
JsonItem *ji;
ji = JP.getItem("name");
if ( ! ji )
return sendReply2 ( socket, FMT_JSON,
"No \"name\" parm given in JSON.");
char *name = ji->getValue();
long nameLen = gbstrlen(name);
if ( nameLen > 32 )
return sendReply2 ( socket, FMT_JSON,
"\"name\" value is over 32 bytes long.");
ji = JP.getItem("token");
if ( ! ji )
return sendReply2 ( socket, FMT_JSON,
"No \"token\" parm given in JSON.");
char *token = ji->getValue();
long tokenLen = gbstrlen(token);
if ( tokenLen > 32 )
return sendReply2 ( socket, FMT_JSON,
"\"token\" value is over 32 bytes long.");
// . create new collection? default is false if not there
// . TODO: support "true" in json parser
//bool addNewColl = JP.getValueAsBool("addNew",false);
// combine name with token to get collection name
char coll[256];
sprintf(coll,"%s-%s",token,name);
// get that
CollectionRec *cr = g_collectiondb.getRec ( coll );
// if does not exist, create it
if ( ! cr ) cr = addNewDiffbotColl ( coll , hr );
// error adding it?
if ( ! cr )
return sendReply2 ( socket,FMT_JSON,
@@ -3807,3 +3724,46 @@ bool resetUrlFilters ( CollectionRec *cr ) {
return true;
}
bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
HttpRequest *hr ,
CollectionRec *cr ) {
// update the url filters for now since that is complicated
// supply "cr" directly since "c" may not be in the http
// request if addcoll=xxxxxx (just created a new rec)
long page = PAGE_FILTERS;
WebPage *pg = g_pages.getPage ( page ) ;
g_parms.setFromRequest ( hr , socket , pg->m_function, cr );
//
// set other diffbot parms for this collection
//
long long maxToCrawl = hr->getLongLong("maxtocrawl",-1LL);
long long maxToProcess = hr->getLongLong("maxtoprocess",-1LL);
if ( maxToCrawl != -1 ) {
cr->m_diffbotMaxToCrawl = maxToCrawl;
cr->m_needsSave = 1;
}
if ( maxToProcess != -1 ) {
cr->m_diffbotMaxToProcess = maxToProcess;
cr->m_needsSave = 1;
}
char *email = hr->getString("notifyemail",NULL,NULL);
if ( email ) {
cr->m_notifyEmail.set(email);
cr->m_notifyEmail.nullTerm();
}
char *url = hr->getString("notifyurl",NULL,NULL);
if ( url ) {
cr->m_notifyUrl.set(url);
cr->m_notifyUrl.nullTerm();
}
long pause = hr->getLong("pause",-1);
if ( pause == 0 ) cr->m_spideringEnabled = 1;
if ( pause == 1 ) cr->m_spideringEnabled = 0;
long urt = hr->getLong("urt",-1);
if ( urt != -1 ) cr->m_useRobotsTxt = urt;
char *ppp = hr->getString("pageprocesspattern",NULL);
if ( ppp ) {
cr->m_diffbotPageProcessPattern.set(ppp);
cr->m_diffbotPageProcessPattern.nullTerm();
}
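// e.g. a hypothetical html-form request exercising the parms above:
//   GET /crawlbot?token=abc123&name=news&maxtocrawl=1000&pause=1&urt=0
// where pause=1 maps to m_spideringEnabled = 0 and urt=0 turns off
// robots.txt obedience for the crawl.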