/v2/bulk API updates.

This commit is contained in:
Matt Wells 2013-11-11 15:52:04 -08:00
parent 7248641bc4
commit ad61e9ea5a
2 changed files with 93 additions and 50 deletions

@@ -40,7 +40,8 @@ bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
//CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) ;
//CollectionRec *getCollRecFromCrawlId ( char *crawlId );
//void printCrawlStatsWrapper ( void *state ) ;
CollectionRec *addNewDiffbotColl ( char *addColl , char *token,char *name ) ;
CollectionRec *addNewDiffbotColl ( char *addColl , char *token,char *name ,
class HttpRequest *hr ) ;
//bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) ;
bool resetUrlFilters ( CollectionRec *cr ) ;
@@ -2065,10 +2066,10 @@ static class HelpItem s_his[] = {
{"crawlDelay","Wait this many seconds between crawling urls from the "
"same IP address. Can be a floating point number."},
{"deleteCrawl","Same as delete."},
{"resetCrawl","Same as delete."},
{"pauseCrawl","Same as pause."},
{"repeatCrawl","Same as repeat."},
//{"deleteCrawl","Same as delete."},
//{"resetCrawl","Same as delete."},
//{"pauseCrawl","Same as pause."},
//{"repeatCrawl","Same as repeat."},
{"seeds","Whitespace separated list of URLs used to seed the crawl. "
"Will only follow outlinks on the same domain of seed URLs."
@@ -2076,6 +2077,8 @@ static class HelpItem s_his[] = {
{"spots",
"Whitespace separated list of URLs to add to the crawl. "
"Outlinks will not be followed." },
{"urls",
"Same as spots."},
//{"spiderLinks","Use 1 or 0 to spider the links or NOT spider "
// "the links, respectively, from "
// "the provided seed or addUrls parameters. "
@@ -2083,20 +2086,24 @@ static class HelpItem s_his[] = {
{"maxToCrawl", "Specify max pages to successfully download."},
//{"maxToDownload", "Specify max pages to successfully download."},
{"maxToProcess", "Specify max pages to successfully process through "
"diffbot."},
{"maxCrawlRounds", "Specify maximum number of crawl rounds. Use "
{"maxRounds", "Specify maximum number of crawl rounds. Use "
"-1 to indicate no max."},
{"onlyProcessIfNew", "Specify 1 to avoid re-processing pages "
"that have already been processed once before."},
{"notifyEmail","Send email alert to this email when crawl hits "
"the maxtocrawl or maxtoprocess limit, or when the crawl completes."},
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"notifyWebhook","Fetch this URL when crawl hits "
"the maxtocrawl or maxtoprocess limit, or when the crawl completes."},
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"obeyRobots","Obey robots.txt files?"},
{"restrictDomain","Restrict crawled urls to domains of seeds?"},
{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
{"pageProcessPattern","List of || separated strings. If the page "
"contains any of these then we send it to diffbot for processing. "
"If this is empty we send all pages to diffbot for processing."},
@@ -2386,7 +2393,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
if ( cast == 0 ) {
// add a new collection by default
if ( ! cr && name && name[0] )
cr = addNewDiffbotColl ( collName , token , name );
cr = addNewDiffbotColl ( collName , token , name, hr );
// also support the good 'ole html form interface
if ( cr ) setSpiderParmsFromHtmlRequest ( socket , hr , cr );
// . we can't sync these operations on a dead host when it
@@ -2508,7 +2515,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
if ( seeds )
log("crawlbot: adding seeds=\"%s\"",seeds);
///////
//
// handle file of urls upload. can be HUGE!
@@ -2697,33 +2703,39 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
char *crawlTypeStr = "crawl";
//char *nomen = "crawl";
if ( cx->m_isCustomCrawl == 2 ) {
crawlTypeStr = "bulk";
//nomen = "job";
}
sb.safePrintf("\n\n{"
"\"name\":\"%s\",\n"
"\"type\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":{"
"\"jobStatus\":{" // nomen = jobStatus / crawlStatus
"\"status\":%li,"
"\"message\":\"%s\"},\n"
"\"sentCrawlDoneNotification\":%li,\n"
"\"sentJobDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
"\"urlsHarvested\":%lli,\n"
//"\"urlsExamined\":%lli,\n"
"\"pageCrawlAttempts\":%lli,\n"
"\"pageCrawlSuccesses\":%lli,\n"
"\"pageDownloadAttempts\":%lli,\n"
"\"pageDownloadSuccesses\":%lli,\n"
"\"pageProcessAttempts\":%lli,\n"
"\"pageProcessSuccesses\":%lli,\n"
// settable parms
"\"maxToCrawl\":%lli,\n"
"\"maxToProcess\":%lli,\n"
"\"maxCrawlRounds\":%li,\n"
"\"obeyRobots\":%li,\n"
"\"restrictDomain\":%li,\n"
"\"repeatCrawl\":%f,\n"
"\"maxRounds\":%li,\n"
"\"repeat\":%f,\n"
"\"crawlDelay\":%f,\n"
"\"onlyProcessIfNew\":%li,\n"
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
, crawlTypeStr
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
@@ -2738,23 +2750,37 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
, cx->m_globalCrawlInfo.m_pageProcessAttempts
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_maxCrawlRounds
, (long)cx->m_useRobotsTxt
, (long)cx->m_restrictDomain
, cx->m_collectiveRespiderFrequency
, cx->m_collectiveCrawlDelay
, (long)cx->m_diffbotOnlyProcessIfNew
);
sb.safePrintf("\"seeds\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
sb.safePrintf("\",\n");
sb.safePrintf("\"crawlRoundsCompleted\":%li,\n",
// if not a "bulk" injection, show crawl stats
if ( cx->m_isCustomCrawl != 2 ) {
sb.safePrintf(
// settable parms
"\"maxToCrawl\":%lli,\n"
"\"maxToProcess\":%lli,\n"
"\"obeyRobots\":%li,\n"
"\"restrictDomain\":%li,\n"
"\"onlyProcessIfNew\":%li,\n"
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_useRobotsTxt
, (long)cx->m_restrictDomain
, (long)cx->m_diffbotOnlyProcessIfNew
);
sb.safePrintf("\"seeds\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
sb.safePrintf("\",\n");
}
sb.safePrintf("\"roundsCompleted\":%li,\n",
cx->m_spiderRoundNum);
sb.safePrintf("\"crawlRoundStartTime\":%lu,\n",
sb.safePrintf("\"roundStartTime\":%lu,\n",
cx->m_spiderRoundStartTime);
sb.safePrintf("\"currentTime\":%lu,\n",
@@ -2980,8 +3006,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
//////
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
if ( fmt == FMT_JSON )
sb.safePrintf("\"crawls\":[");//\"collections\":");
sb.safePrintf("\"jobs\":[");//\"collections\":");
long summary = hr->getLong("summary",0);
// enter summary mode for json
@@ -2994,8 +3022,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td><b>Objects Found</b></td>"
"<td><b>URLs Harvested</b></td>"
"<td><b>URLs Examined</b></td>"
"<td><b>Page Crawl Attempts</b></td>"
"<td><b>Page Crawl Successes</b></td>"
"<td><b>Page Download Attempts</b></td>"
"<td><b>Page Download Successes</b></td>"
"<td><b>Page Process Attempts</b></td>"
"<td><b>Page Process Successes</b></td>"
"</tr>"
@@ -3504,7 +3532,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<tr>"
"<td><b>Repeat Crawl:</b> "
"</td><td>"
"<input type=text name=repeatCrawl "
"<input type=text name=repeat "
"size=10 value=\"%f\"> "
"<input type=submit name=submit value=OK>"
" days"
@@ -3558,9 +3586,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</tr>"
"<tr>"
"<td><b>Max Crawl Rounds:</b>"
"<td><b>Max Rounds:</b>"
"</td><td>"
"<input type=text name=maxCrawlRounds "
"<input type=text name=maxRounds "
"size=9 value=%li> "
"<input type=submit name=submit value=OK>"
"</td>"
@@ -3941,7 +3969,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
*/
}
CollectionRec *addNewDiffbotColl ( char *collName, char *token, char *name ) {
CollectionRec *addNewDiffbotColl ( char *collName, char *token, char *name ,
HttpRequest *hr ) {
//char *token = getTokenFromHttpRequest ( hr );
//if ( ! token ) {
@@ -4022,7 +4051,16 @@ CollectionRec *addNewDiffbotColl ( char *collName, char *token, char *name ) {
// show the ban links in the search results. the collection name
// is cryptographic enough to show that
cr->m_isCustomCrawl = true;
cr->m_isCustomCrawl = 1;//true;
// John wants to print out "type":"bulk" or "type":"crawl"
char *filename = hr->getFilename();
long flen = hr->getFilenameLen();
if ( filename && flen >= 5 &&
( strncmp(filename+flen-4,"bulk",4)==0 ||
strncmp(filename+flen-5,"bulk/",5)==0 ) )
cr->m_isCustomCrawl = 2;
cr->m_diffbotOnlyProcessIfNew = true;
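The new code decides the job type from the request path: a path ending in "bulk" or "bulk/" (e.g. /v2/bulk) marks the collection as a bulk job (m_isCustomCrawl = 2), anything else stays a regular crawl (1). A standalone sketch of that test, with a hypothetical helper name, spelled out for clarity:

    #include <string.h>
    // Hypothetical helper restating the suffix test above (not in the commit):
    // returns 2 for a bulk job ("type":"bulk"), 1 for a regular crawl ("type":"crawl").
    static long jobTypeFromPath ( const char *filename , long flen ) {
            if ( filename && flen >= 5 &&
                 ( strncmp ( filename + flen - 4 , "bulk"  , 4 ) == 0 ||
                   strncmp ( filename + flen - 5 , "bulk/" , 5 ) == 0 ) )
                    return 2;
            return 1;
    }
    // jobTypeFromPath ( "/v2/bulk"  , 8 ) -> 2
    // jobTypeFromPath ( "/v2/bulk/" , 9 ) -> 2
    // jobTypeFromPath ( "/v2/crawl" , 9 ) -> 1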
@@ -4541,6 +4579,8 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
// set other diffbot parms for this collection
//
long maxToCrawl = hr->getLongLong("maxToCrawl",-1LL);
if ( maxToCrawl == -1 )
maxToCrawl = hr->getLongLong("maxToDownload",-1LL);
if ( maxToCrawl != -1 ) {
cr->m_maxToCrawl = maxToCrawl;
cr->m_needsSave = 1;
@@ -4552,6 +4592,8 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
}
// -1 means no max, so use -2 as default here
long maxCrawlRounds = hr->getLongLong("maxCrawlRounds",-2LL);
if ( maxCrawlRounds == -2 )
maxCrawlRounds = hr->getLongLong("maxRounds",-2LL);
if ( maxCrawlRounds != -2 ) {
cr->m_maxCrawlRounds = maxCrawlRounds;
cr->m_needsSave = 1;
@@ -4599,8 +4641,9 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
cr->m_diffbotPageProcessPattern.set(ppp);
cr->m_needsSave = 1;
}
float respider = hr->getFloat("repeatCrawl",-1.0);
float respider = hr->getFloat("repeatJob",-1.0);
if ( respider == -1.0 ) respider = hr->getFloat("repeat",-1.0);
if ( respider == -1.0 ) respider = hr->getFloat("repeatCrawl",-1.0);
if ( respider >= 0.0 ) {
// if not 0, then change this by the delta
if ( cr->m_spiderRoundStartTime ) {
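These three hunks apply the same backward-compatibility idiom: read the parameter under one name, and if the not-found sentinel comes back, retry under the alias (maxToCrawl/maxToDownload, maxCrawlRounds/maxRounds, repeatJob/repeat/repeatCrawl). A minimal sketch of the idiom as a hypothetical helper, using the HttpRequest::getLongLong accessor seen above:

    // Hypothetical helper, not in the commit: look a parameter up under a
    // primary name, then fall back to an alias if it was absent.
    static long long getLongLongAliased ( HttpRequest *hr ,
                                          char *primary ,
                                          char *alias ,
                                          long long notFound ) {
            long long v = hr->getLongLong ( primary , notFound );
            if ( v == notFound ) v = hr->getLongLong ( alias , notFound );
            return v;
    }
    // e.g. getLongLongAliased ( hr , "maxCrawlRounds" , "maxRounds" , -2LL );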

@@ -10231,19 +10231,19 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Crawl has reached maxToCrawl "
return msg->safePrintf ( "Job has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Crawl has reached maxToProcess "
return msg->safePrintf ( "Job has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Crawl has reached maxCrawlRounds "
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
}
@@ -10260,7 +10260,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( ! cx->m_spideringEnabled ) {
*status = SP_PAUSED;
return msg->safePrintf("Crawl paused.");
return msg->safePrintf("Job paused.");
}
if ( ! g_conf.m_spideringEnabled ) {
@@ -10280,7 +10280,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Crawl is initializing.");
return msg->safePrintf("Job is initializing.");
}
// if we sent an email simply because no urls
@@ -10288,16 +10288,16 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
*status = SP_COMPLETED;
return msg->safePrintf("Crawl has completed and no "
return msg->safePrintf("Job has completed and no "
"repeatCrawl is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Crawl round completed.");
return msg->safePrintf ( "Job round completed.");
}
// otherwise in progress?
*status = SP_INPROGRESS;
return msg->safePrintf("Crawl is in progress.");
return msg->safePrintf("Job is in progress.");
}
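For reference, a minimal hypothetical caller sketch showing how the renamed messages surface, using the SafeBuf and log() facilities seen elsewhere in this diff:

    // Hypothetical usage, not part of the commit: fetch the human-readable
    // job status for a collection record and log it.
    SafeBuf msg;
    long status = 0;
    getSpiderStatusMsg ( cx , &msg , &status );
    // logs the numeric status code plus a message such as "Job is in progress."
    log ( "crawlbot: job status %li: %s" , status , msg.getBufStart() );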