use -diffbotxyz%li as a more unique appendage.
show token on crawlbot page.
This commit is contained in:
28
Diffbot.cpp
28
Diffbot.cpp
@ -1203,20 +1203,8 @@ void StateCD::printTitledbList ( RdbList *list , SafeBuf *sb , char *format ) {
|
||||
}
|
||||
|
||||
// skip if not a diffbot json url
|
||||
char *url = xd.m_firstUrl.m_url;
|
||||
long ulen = gbstrlen(url);
|
||||
char *p = url + ulen - 1;
|
||||
// must be digit like <url>-diffbot-%li
|
||||
// if no digit at end it is not a diffbot json obj url
|
||||
if ( ! is_digit ( *p ) )
|
||||
continue;
|
||||
// back up over digits
|
||||
while ( p > url && is_digit(*p) ) p--;
|
||||
// then "-diffbot-" (see XmlDoc.cpp for this appendage)
|
||||
if ( p - 9 < url )
|
||||
continue;
|
||||
if ( strncmp(p-8,"-diffbot-",9) )
|
||||
continue;
|
||||
if ( ! xd.m_isDiffbotJSONObject ) continue;
|
||||
|
||||
// get the json content
|
||||
char *json = xd.ptr_utf8Content;
|
||||
|
||||
@ -1517,11 +1505,6 @@ char *getNewCollName ( ) { // char *token , long tokenLen ) {
|
||||
crawlId64 <<= 32;
|
||||
crawlId64 |= r2;
|
||||
|
||||
// the name of the new collection we are creating for this crawl
|
||||
// will be <tokenId>-<crawlId>. if it is a "test" crawl as
|
||||
// specified as an option in the diffbot crawlbot api page,
|
||||
// then make it <tokenId>-<crawlId>-test. Test crawls do not index,
|
||||
// they only crawl.
|
||||
static char s_collBuf[MAX_COLL_LEN+1];
|
||||
|
||||
//long tokenLen = gbstrlen(token);
|
||||
@ -1903,6 +1886,11 @@ bool printCrawlBotPage ( TcpSocket *s ,
|
||||
|
||||
sb.safePrintf(
|
||||
//
|
||||
"<tr>"
|
||||
"<td><b>Token:</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Download Objects:</b> "
|
||||
"</td><td>"
|
||||
@ -1998,6 +1986,8 @@ bool printCrawlBotPage ( TcpSocket *s ,
|
||||
|
||||
"</form>"
|
||||
|
||||
, cr->m_diffbotToken.getBufStart()
|
||||
|
||||
, cr->m_coll
|
||||
, cr->m_coll
|
||||
//, cr->m_coll
|
||||
|
@ -16879,7 +16879,7 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
|
||||
// . returns ptr to status
|
||||
// . diffbot uses this to remove the indexed json pages associated with
|
||||
// a url. each json object is basically its own url. a json object
|
||||
// url is the parent page's url with a -diffbot-%li appended to it
|
||||
// url is the parent page's url with a -diffbotxyz%li appended to it
|
||||
// where %li is the object # starting at 0 and incrementing from there.
|
||||
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
|
||||
long *XmlDoc::nukeJSONObjects ( ) {
|
||||
@ -16911,8 +16911,8 @@ long *XmlDoc::nukeJSONObjects ( ) {
|
||||
// make the fake url for this json object for indexing
|
||||
SafeBuf fakeUrl;
|
||||
fakeUrl.set ( m_firstUrl.getUrl() );
|
||||
// append -diffbot-0 etc. for fake url
|
||||
fakeUrl.safePrintf("-diffbot-%li",m_joc);
|
||||
// append -diffbotxyz0 etc. for fake url
|
||||
fakeUrl.safePrintf("-diffbotxyz%li",m_joc);
|
||||
// set url of new xmldoc
|
||||
if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
|
||||
m_coll ,
|
||||
@ -17695,7 +17695,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
SafeBuf fakeUrl;
|
||||
fakeUrl.set ( m_firstUrl.getUrl() );
|
||||
// append -diffbotxyz0 etc. for fake url
|
||||
fakeUrl.safePrintf("-diffbot-%li",
|
||||
fakeUrl.safePrintf("-diffbotxyz%li",
|
||||
(long)m_diffbotJSONCount);
|
||||
m_diffbotJSONCount++;
|
||||
// this can go on the stack since set4() copies it
|
||||
|
Reference in New Issue
Block a user