Use -diffbotxyz%li as a more distinctive suffix for diffbot JSON object urls.

Show the token on the crawlbot page.
This commit is contained in:
Matt Wells
2013-09-18 17:05:41 -07:00
parent 29f5c5d644
commit 022caeec04
2 changed files with 13 additions and 23 deletions

@ -1203,20 +1203,8 @@ void StateCD::printTitledbList ( RdbList *list , SafeBuf *sb , char *format ) {
}
// skip if not a diffbot json url
char *url = xd.m_firstUrl.m_url;
long ulen = gbstrlen(url);
char *p = url + ulen - 1;
// must be digit like <url>-diffbot-%li
// if no digit at end it is not a diffbot json obj url
if ( ! is_digit ( *p ) )
continue;
// back up over digits
while ( p > url && is_digit(*p) ) p--;
// then "-diffbot-" (see XmlDoc.cpp for this appendage)
if ( p - 9 < url )
continue;
if ( strncmp(p-8,"-diffbot-",9) )
continue;
if ( ! xd.m_isDiffbotJSONObject ) continue;
// get the json content
char *json = xd.ptr_utf8Content;
@ -1517,11 +1505,6 @@ char *getNewCollName ( ) { // char *token , long tokenLen ) {
crawlId64 <<= 32;
crawlId64 |= r2;
// the name of the new collection we are creating for this crawl
// will be <tokenId>-<crawlId>. if it is a "test" crawl as
// specified as an option in the diffbot crawlbot api page,
// then make it <tokenId>-<crawlId>-test. Test crawls do not index,
// they only crawl.
static char s_collBuf[MAX_COLL_LEN+1];
//long tokenLen = gbstrlen(token);
@ -1903,6 +1886,11 @@ bool printCrawlBotPage ( TcpSocket *s ,
sb.safePrintf(
//
"<tr>"
"<td><b>Token:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Download Objects:</b> "
"</td><td>"
@ -1998,6 +1986,8 @@ bool printCrawlBotPage ( TcpSocket *s ,
"</form>"
, cr->m_diffbotToken.getBufStart()
, cr->m_coll
, cr->m_coll
//, cr->m_coll

@ -16879,7 +16879,7 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
// a url. each json object is basically its own url. a json object
// url is the parent page's url with a -diffbot-%li appended to it
// url is the parent page's url with a -diffbotxyz%li appended to it
// where %li is the object # starting at 0 and incrementing from there.
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
long *XmlDoc::nukeJSONObjects ( ) {
@ -16911,8 +16911,8 @@ long *XmlDoc::nukeJSONObjects ( ) {
// make the fake url for this json object for indexing
SafeBuf fakeUrl;
fakeUrl.set ( m_firstUrl.getUrl() );
// append -diffbot-0 etc. for fake url
fakeUrl.safePrintf("-diffbot-%li",m_joc);
// append -diffbotxyz0 etc. for fake url
fakeUrl.safePrintf("-diffbotxyz%li",m_joc);
// set url of new xmldoc
if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
m_coll ,
@ -17695,7 +17695,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
SafeBuf fakeUrl;
fakeUrl.set ( m_firstUrl.getUrl() );
// append -diffbot-0 etc. for fake url
fakeUrl.safePrintf("-diffbot-%li",
fakeUrl.safePrintf("-diffbotxyz%li",
(long)m_diffbotJSONCount);
m_diffbotJSONCount++;
// this can go on the stack since set4() copies it