use -diffbotxyz%li as a more unique appendage.

show token on crawlbot page.
2013-09-18 17:05:41 -07:00
parent 29f5c5d644
commit 022caeec04
2 changed files with 13 additions and 23 deletions
--- a/Diffbot.cpp
+++ b/Diffbot.cpp
@ -1203,20 +1203,8 @@ void StateCD::printTitledbList ( RdbList *list , SafeBuf *sb , char *format ) {
 		}

 		// skip if not a diffbot json url
-		char *url = xd.m_firstUrl.m_url;
-		long ulen = gbstrlen(url);
-		char *p = url + ulen - 1;
-		// must be digit like <url>-diffbot-%li
-		// if no digit at end it is not a diffbot json obj url
-		if ( ! is_digit ( *p ) ) 
-			continue;
-		// back up over digits
-		while ( p > url && is_digit(*p) ) p--;
-		// then "-diffbot-" (see XmlDoc.cpp for this appendage)
-		if ( p - 9 < url ) 
-			continue;
-		if ( strncmp(p-8,"-diffbot-",9) ) 
-			continue;
+		if ( ! xd.m_isDiffbotJSONObject ) continue;
+
 		// get the json content
 		char *json = xd.ptr_utf8Content;
 		
@ -1517,11 +1505,6 @@ char *getNewCollName ( ) { // char *token , long tokenLen ) {
 	crawlId64 <<= 32;
 	crawlId64 |= r2;

-	// the name of the new collection we are creating for this crawl
-	// will be <tokenId>-<crawlId>. if it is a "test" crawl as
-	// specified as an option in the diffbot crawlbot api page,
-	// then make it <tokenId>-<crawlId>-test. Test crawls do not index,
-	// they only crawl.
 	static char s_collBuf[MAX_COLL_LEN+1];

 	//long tokenLen = gbstrlen(token);
@ -1903,6 +1886,11 @@ bool printCrawlBotPage ( TcpSocket *s ,

 		sb.safePrintf(
 			      //
+			      "<tr>"
+			      "<td><b>Token:</td>"
+			      "<td>%s</td>"
+			      "</tr>"
+
 			      "<tr>"
 			      "<td><b>Download Objects:</b> "
 			      "</td><td>"
@ -1998,6 +1986,8 @@ bool printCrawlBotPage ( TcpSocket *s ,

 			      "</form>"

+			      , cr->m_diffbotToken.getBufStart()
+
 			      , cr->m_coll
 			      , cr->m_coll
 			      //, cr->m_coll
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -16879,7 +16879,7 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
 // . returns ptr to status
 // . diffbot uses this to remove the indexed json pages associated with
 //   a url. each json object is basically its own url. a json object
-//   url is the parent page's url with a -diffbot-%li appended to it
+//   url is the parent page's url with a -diffbotxyz%li appended to it
 //   where %li is the object # starting at 0 and incrementing from there.
 // . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
 long *XmlDoc::nukeJSONObjects ( ) {
@ -16911,8 +16911,8 @@ long *XmlDoc::nukeJSONObjects ( ) {
 			// make the fake url for this json object for indexing
 			SafeBuf fakeUrl;
 			fakeUrl.set ( m_firstUrl.getUrl() );
-			// append -diffbot-0 etc. for fake url
-			fakeUrl.safePrintf("-diffbot-%li",m_joc);
+			// append -diffbotxyz0 etc. for fake url
+			fakeUrl.safePrintf("-diffbotxyz%li",m_joc);
 			// set url of new xmldoc
 			if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
 					    m_coll ,
@ -17695,7 +17695,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 			SafeBuf fakeUrl;
 			fakeUrl.set ( m_firstUrl.getUrl() );
 			// append -diffbot-0 etc. for fake url
-			fakeUrl.safePrintf("-diffbot-%li",
+			fakeUrl.safePrintf("-diffbotxyz%li",
 					   (long)m_diffbotJSONCount);
 			m_diffbotJSONCount++;
 			// this can go on the stack since set4() copies it