added url download support

commit 8d5e1cb547
parent 41cdfcef96
Author: Matt Wells
Date: 2014-01-20 23:17:04 -08:00
4 changed files with 64 additions and 34 deletions

@@ -925,6 +925,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
return sendBackDump ( s , r );
// "GET /download/mycoll_urls.csv"
if ( strncmp ( path , "/download/", 10 ) == 0 )
return sendBackDump ( s , r );
// . is it a diffbot api request, like "GET /api/*"
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
//if ( strncmp ( path , "/api/" , 5 ) == 0 )
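For context, the new route is a plain prefix match: any path beginning with "/download/" is handed to the same dump code as the existing "/v2/bulk/download/" bulk route. A minimal standalone sketch of that dispatch (routeDownload() is an illustrative stand-in for the sendBackDump() call, not the real HttpServer API):

    #include <cstring>
    #include <cstdio>

    // Illustrative stand-in for the dispatch above: returns true when
    // the path should be served by the dump handler (sendBackDump()
    // in the real code).
    static bool routeDownload ( const char *path ) {
            if ( strncmp ( path , "/v2/bulk/download/" , 18 ) == 0 ) return true;
            // the new route: "GET /download/mycoll_urls.csv" etc.
            if ( strncmp ( path , "/download/" , 10 ) == 0 ) return true;
            return false;
    }

    int main ( ) {
            printf ( "%d\n" , routeDownload ( "/download/mycoll_urls.csv" ) ); // 1
            printf ( "%d\n" , routeDownload ( "/admin/settings" ) );           // 0
            return 0;
    }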

@@ -160,6 +160,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
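The suffix chain above is effectively a lookup table: the filename suffix in the requested path selects both the database to dump and the output format. A compact sketch of that mapping, with pared-down stand-ins for the real RDB_*/FMT_* constants (the _urls.csv branch is implied by the context lines at the top of the hunk, not shown in it):

    #include <cstring>

    // Pared-down stand-ins for the real constants in the source tree.
    enum { RDB_SPIDERDB = 1 , RDB_TITLEDB = 2 };
    enum { FMT_CSV = 1 , FMT_TXT = 2 };

    // Sketch of the strstr() suffix chain in sendBackDump(): the
    // suffix picks both the rdb to dump and the serialization format.
    static bool pickDump ( const char *path , long *rdbId , long *fmt ) {
            if ( strstr ( path , "_urls.csv"  ) ) { *rdbId = RDB_SPIDERDB; *fmt = FMT_CSV; return true; }
            if ( strstr ( path , "_urls.txt"  ) ) { *rdbId = RDB_SPIDERDB; *fmt = FMT_TXT; return true; }
            if ( strstr ( path , "_pages.txt" ) ) { *rdbId = RDB_TITLEDB ; *fmt = FMT_TXT; return true; }
            return false;
    }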
@@ -518,6 +522,8 @@ bool StateCD::sendList ( ) {
// (long)m_printedEndingBracket);
bool lastChunk = false;
if ( ! m_someoneNeedsMore )
lastChunk = true;
// if nobody needs to read more...
if ( m_rdbId == RDB_TITLEDB &&
@@ -528,7 +534,6 @@ bool StateCD::sendList ( ) {
// end array of json objects. might be empty!
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
lastChunk = true;
}
TcpServer *tcp = &g_httpServer.m_tcp;
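The lastChunk flag now defaults to false and is raised either when no caller needs more data or when the closing JSON bracket has been written. Assuming the flag ultimately gates the terminating frame of HTTP chunked transfer encoding (an inference, not something these hunks show), the framing looks like this:

    #include <cstdio>
    #include <cstring>

    // Illustration of chunked-transfer framing, assuming that is what
    // lastChunk gates downstream: each piece of the dump is sent as
    // "<hex length>\r\n<data>\r\n", and the stream is only terminated
    // by the zero-length chunk once nobody needs to read more.
    static void writeChunk ( const char *data , bool lastChunk ) {
            printf ( "%zx\r\n%s\r\n" , strlen ( data ) , data );
            if ( lastChunk )
                    printf ( "0\r\n\r\n" ); // final frame; client stops waiting
    }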
@@ -715,7 +720,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
m_isFirstTime = false;
sb->safePrintf("\"Url\","
"\"Entry Method\","
"\"Processed?\","
);
if ( cr->m_isCustomCrawl )
sb->safePrintf("\"Processed?\",");
sb->safePrintf(
"\"Add Time\","
"\"Last Crawled\","
"\"Last Status\","
@@ -747,12 +755,15 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// but default to csv
else {
sb->safePrintf("\"%s\",\"%s\","
"%li,%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
, sreq->m_url
, as
, (long)isProcessed
);
if ( cr->m_isCustomCrawl )
sb->safePrintf("%li,",(long)isProcessed);
sb->safePrintf(
"%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
// when was it first added to spiderdb?
, sreq->m_addedTime
// last time spidered, 0 if none
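The two printSpiderdbList() hunks above make the same change in two places, and they have to: the "Processed?" column must appear in the CSV header if and only if it appears in every row, so both are now gated on cr->m_isCustomCrawl. A minimal sketch of that invariant, with CollectionRec pared down to the one field used:

    #include <cstdio>

    // Pared-down stand-in for the real CollectionRec class.
    struct CollectionRec { bool m_isCustomCrawl; };

    // Header and rows gate the "Processed?" column on the same flag,
    // so the CSV column counts always line up.
    static void printHeader ( CollectionRec *cr ) {
            printf ( "\"Url\",\"Entry Method\"," );
            if ( cr->m_isCustomCrawl ) printf ( "\"Processed?\"," );
            printf ( "\"Add Time\",\"Last Crawled\",\"Last Status\"\n" );
    }

    static void printRow ( CollectionRec *cr , const char *url , const char *as ,
                           long isProcessed , unsigned long addedTime ,
                           unsigned long lastCrawled , const char *status ) {
            printf ( "\"%s\",\"%s\"," , url , as );
            if ( cr->m_isCustomCrawl ) printf ( "%li," , isProcessed );
            printf ( "%lu,%lu,\"%s\"\n" , addedTime , lastCrawled , status );
    }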

@@ -1847,41 +1847,55 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
if ( i == PAGE_SEARCHBOX ) continue;
if ( i == PAGE_TITLEDB ) continue;
// print "url download" before "inject url"
// GET /mycollname_urls.csv
if ( i == PAGE_INJECT ) {
sb->safePrintf (
"<b>"
"<a style=text-decoration:none; "
"href=\"/download/%s_urls.txt\">"
"url download"
"</a>"
"</b>"
" &nbsp; \n",
coll );
}
if ( cr && ! cr->m_isCustomCrawl && i == PAGE_CRAWLBOT )
continue;
// print it out
if ( i == PAGE_LOGIN || i == PAGE_LOGIN2 )
sb->safePrintf(
"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?"
//"user=%s&pwd=%s&"
"c=%s%s\">%s</a>"
"</span>"
" &nbsp; \n",s_pages[i].m_filename,
//username,pwd,
coll,
buf,s_pages[i].m_name);
//"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?"
//"user=%s&pwd=%s&"
"c=%s%s\">%s</a>"
//"</span>"
" &nbsp; \n",s_pages[i].m_filename,
//username,pwd,
coll,
buf,s_pages[i].m_name);
else if ( page == i )
sb->safePrintf(
"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?c=%s%s\"><b>"
"<font color=red>%s</font></b></a>"
"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
//"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?c=%s%s\"><b>"
"<font color=red>%s</font></b></a>"
//"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
else
sb->safePrintf(
"<span style=\"white-space:nowrap\">"
"<b>"
"<a style=text-decoration:none; "
"href=\"/%s?c=%s%s\">%s</a>"
"</b>"
"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
//"<span style=\"white-space:nowrap\">"
"<b>"
"<a style=text-decoration:none; "
"href=\"/%s?c=%s%s\">%s</a>"
"</b>"
//"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
// print <br> after the last master admin control
/*
if ( i == PAGE_DELCOLL && user == USER_MASTER ) {
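The menu link and the download handler form a round trip: printAdminLinks() embeds the collection name as /download/<coll>_urls.txt, and the handler recovers it from the path before dumping spiderdb. getCollFromPath() below is a hypothetical sketch of that recovery; the real parsing lives in sendBackDump():

    #include <cstdio>
    #include <cstring>
    #include <string>

    // Hypothetical sketch of recovering the collection name that
    // printAdminLinks() embedded in the href; the real parsing is
    // done inside sendBackDump().
    static std::string getCollFromPath ( const char *path ) {
            const char *p = path;
            if ( strncmp ( p , "/download/" , 10 ) == 0 ) p += 10;
            const char *end = strstr ( p , "_urls" );
            if ( ! end ) end = strstr ( p , "_pages" );
            if ( ! end ) return "";
            return std::string ( p , end - p );
    }

    int main ( ) {
            // prints "mycoll"
            printf ( "%s\n" , getCollFromPath ( "/download/mycoll_urls.txt" ).c_str() );
            return 0;
    }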

@@ -11378,11 +11378,12 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
"repeat is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE &&
! cx->m_isCustomCrawl ) {
if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider.");
"available to spider. "
"Change your url filters or try "
"adding new urls.");
}
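The reworked guard only shows the round-done hint for regular crawls; custom (Diffbot) crawls fall through to other status handling. A pared-down sketch of the branch (the SP_ROUNDDONE value and types here are illustrative, not the real declarations):

    #include <cstdio>

    enum { SP_ROUNDDONE = 1 }; // illustrative value, not the real enum
    struct CollectionRec { long m_spiderStatus; bool m_isCustomCrawl; };

    // Returns true when the round-done message applies: regular
    // crawls only, with the new hint about url filters appended.
    static bool roundDoneMsg ( CollectionRec *cx , long *status ) {
            if ( cx->m_spiderStatus != SP_ROUNDDONE || cx->m_isCustomCrawl )
                    return false;
            *status = SP_ROUNDDONE;
            printf ( "Nothing currently available to spider. "
                     "Change your url filters or try adding new urls.\n" );
            return true;
    }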