universal api updates

2013-09-13 18:10:03 -07:00
parent d982997b0c
commit 7ecffec40f
3 changed files with 64 additions and 86 deletions
--- a/CollectionRec.h
+++ b/CollectionRec.h
@ -71,7 +71,7 @@


 // how many counts are in CrawlInfo below????
-#define NUMCRAWLSTATS 7
+#define NUMCRAWLSTATS 8

 // used by diffbot to control spidering per collection
 class CrawlInfo {
@ -83,6 +83,7 @@ class CrawlInfo {
 	long long m_pageDownloadSuccesses; // 5
 	long long m_pageProcessAttempts;   // 6
 	long long m_pageProcessSuccesses;  // 7
+	long long m_urlsHarvested;         // 8

 	long m_lastUpdateTime;

--- a/Diffbot.cpp
+++ b/Diffbot.cpp
@ -1582,8 +1582,54 @@ bool printCrawlBotPage ( TcpSocket *s , HttpRequest *hr ) {
 			      "<tr>"
 			      "<td><b>Objects Indexed</b></td>"
 			      "<td>%lli</td>"
+			      // object download links (token/id values left empty)
+			      "<td><b>Download Objects:</b> "
+			      "<a href=/crawlbot/downloadobjects?token=&id=&"
+			      "format=json>"
+			      "json</a>"
+			      "&nbsp; "
+			      "<a href=/crawlbot/downloadobjects?"
+			      "token=&id=&"
+			      "format=xml>"
+			      "xml</a>"
+			      " &nbsp; "
+			      "<b>Search Objects:</b> "
+			      "<input type=text name=q size=50>"
+			      "</td>"
+
 			      "</tr>"

+			      "<tr>"
+			      "<td><b>URLs Harvested</b></td>"
+			      "<td>%lli</td>"
+			      // url download links (token/id values left empty)
+			      "<td><b>Download Urls:</b> "
+			      "<a href=/crawlbot/downloadurls?token=&id=&"
+			      "format=json>"
+			      "json</a>"
+			      " &nbsp; "
+			      "<a href=/crawlbot/downloadurls?"
+			      "token=&id=&"
+			      "format=xml>"
+			      "xml</a>"
+			      "&nbsp; "
+			      "<a href=/crawlbot/downloadurls?"
+			      "token=&id=&"
+			      "format=csv>"
+			      "csv</a>"
+			      //
+			      " &nbsp; "
+			      "<b>Add Url: </b> "
+			      "<input type=text name=addurl size=50>"
+			      " &nbsp; &nbsp; <input type=checkbox "
+			      "name=spiderlinks "
+			      "checked>"
+			      " <i>crawl links on this page?</i>"
+			      "</td>"
+			      
+			      "</tr>"
+			      
+
 			      "<tr>"
 			      "<td><b>URLs Considered</b></td>"
 			      "<td>%lli</td>"
@ -1597,6 +1643,10 @@ bool printCrawlBotPage ( TcpSocket *s , HttpRequest *hr ) {
 			      "<tr>"
 			      "<td><b>Page Download Successes</b></td>"
 			      "<td>%lli</td>"
+			      //
+			      "<td><b>Max:</b> "
+			      "<input type=text name=maxToCrawl "
+			      "size=9 value=%lli>"
 			      "</tr>"

 			      "<tr>"
@ -1607,18 +1657,29 @@ bool printCrawlBotPage ( TcpSocket *s , HttpRequest *hr ) {
 			      "<tr>"
 			      "<td><b>Page Process Successes</b></td>"
 			      "<td>%lli</td>"
+			      //
+			      "<td><b>Max:</b> "
+			      "<input type=text name=maxToProcess "
+			      "size=9 value=%lli>"
 			      "</tr>"
+
 			      
 			      "</table>"
 			      "<br>"

 			      , cr->m_globalCrawlInfo.m_objectsAdded -
 			        cr->m_globalCrawlInfo.m_objectsDeleted
+			      , cr->m_globalCrawlInfo.m_urlsHarvested
 			      , cr->m_globalCrawlInfo.m_urlsConsidered
+
 			      , cr->m_globalCrawlInfo.m_pageDownloadAttempts
 			      , cr->m_globalCrawlInfo.m_pageDownloadSuccesses
+			      , cr->m_diffbotMaxToCrawl 
+
 			      , cr->m_globalCrawlInfo.m_pageProcessAttempts
 			      , cr->m_globalCrawlInfo.m_pageProcessSuccesses
+			      , cr->m_diffbotMaxToProcess
+
 			      );
 	}

@ -1666,62 +1727,6 @@ bool printCrawlBotPage ( TcpSocket *s , HttpRequest *hr ) {
 	sb.safePrintf ( "<br>\n" );


-	
-	// 
-	// downloads
-	//
-	sb.safePrintf("<table cellpadding=5>"
-
-		      "<tr>"
-
-		      "<td><a href=/crawlbot/downloadurls?c=%s&format=csv>"
-		      "download urls (csv)"
-		      "</td>"
-
-		      "<td><a href=/crawlbot/downloadurls?c=%s&format=csv>"
-		      "download urls (json)"
-		      "</td>"
-
-		      "<td><a href=/crawlbot/downloadurls?c=%s&format=csv>"
-		      "download urls (xml)"
-		      "</td>"
-
-		      "<td><a href=/crawlbot/downloadobjects?c=%s&"
-		      "format=json>"
-		      "download objects (json)"
-		      "</td>"
-
-		      "<td><a href=/crawlbot/downloadobjects?c=%s&"
-		      "format=xml>"
-		      "download objects (xml)"
-		      "</td>"
-
-		      "<table>\n"
-		      
-		      , cr->m_coll
-		      , cr->m_coll
-		      , cr->m_coll
-		      , cr->m_coll
-		      , cr->m_coll
-		      );
-
-
-	//
-	// search
-	//
-	sb.safePrintf("<br>"
-		      "<table cellpadding=5>"
-		      "<tr>"
-		      "<td>"
-		      "Search this crawl <input type=text name=q size=50>"
-		      "</td>"
-		      "<td>"
-		      "<input type=submit name=submit value=Search>"
-		      "</td>"
-		      "</tr>"
-		      "</table>");
-
-
 	//
 	// add search box to your site
 	//
@ -1740,34 +1745,6 @@ bool printCrawlBotPage ( TcpSocket *s , HttpRequest *hr ) {
 	// show input boxes
 	//

-	sb.safePrintf("<br>"
-		      "<table cellpadding=5>"
-		      "<tr>"
-		      "<td><b>Add Url</b></td>"
-		      "<td><input type=text name=addurl size=50>"
-		      " &nbsp; &nbsp; <input type=checkbox name=spiderlinks "
-		      "checked>"
-		      " <i>crawl links on this page?</i>"
-		      "</tr>"
-
-
-		      "<tr>"
-		      "<td><b>Max Pages to Crawl</b></td>"
-		      "<td><input type=text name=maxToCrawl "
-		      "size=9 value=%lli>"
-		      "</tr>"
-
-		      "<tr>"
-		      "<td><b>Max Pages to Process</b></td>"
-		      "<td><input type=text name=maxToProcess "
-		      "size=9 value=%lli>"
-		      "</tr>"
-
-
-		      "</table>"
-		      , cr->m_diffbotMaxToCrawl 
-		      , cr->m_diffbotMaxToProcess
-		      );

 	sb.safePrintf("<br>"
 		      "<table cellpadding=5>"
--- a/Spider.cpp
+++ b/Spider.cpp
@ -8030,8 +8030,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
 	// add the LOCAL stats we got from the remote into the GLOBAL stats
 	if ( slot ) {
 		CrawlInfo *stats = (CrawlInfo *)(slot->m_readBuf);
-		long long *ss = (long long *)&stats;
 		long long *gs = (long long *)&cr->m_globalCrawlInfo;
+		long long *ss = (long long *)stats; // walk the reply buffer, not the address of the local pointer
 		for ( long i = 0 ; i < NUMCRAWLSTATS ; i++ ) {
 			*gs = *gs + *ss;
 			gs++;