forked from Mirrors/privacore-open-source-search-engine
added url download support
@@ -925,6 +925,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
	     strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
		return sendBackDump ( s , r );

	// "GET /download/mycoll_urls.csv"
	if ( strncmp ( path , "/download/", 10 ) == 0 )
		return sendBackDump ( s , r );

	// . is it a diffbot api request, like "GET /api/*"
	// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
	//if ( strncmp ( path , "/api/" , 5 ) == 0 )
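The routing here is purely prefix-based: any request path starting with "/download/" (or the bulk form "/v2/bulk/download/") is handed straight to sendBackDump() before normal page dispatch, so "GET /download/mycoll_urls.csv" fetches the dump for the collection named in the path. Below is a minimal standalone sketch of that check, assuming a hypothetical isDownloadPath() helper that is not part of the patch:

	// Sketch only: mirrors the strncmp prefix tests above, outside the server.
	#include <cstring>
	#include <cstdio>

	// Hypothetical helper, not in the patch: true if the path should be
	// routed to sendBackDump().
	static bool isDownloadPath ( const char *path ) {
		return strncmp ( path , "/download/"         , 10 ) == 0 ||
		       strncmp ( path , "/v2/bulk/download/" , 18 ) == 0;
	}

	int main ( ) {
		printf ( "%d\n" , isDownloadPath ( "/download/mycoll_urls.csv" ) ); // 1
		printf ( "%d\n" , isDownloadPath ( "/search?q=gigablast" )       ); // 0
		return 0;
	}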
@@ -160,6 +160,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
		rdbId = RDB_SPIDERDB;
		fmt = FMT_CSV;
	}
	else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
		rdbId = RDB_SPIDERDB;
		fmt = FMT_TXT;
	}
	else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
		rdbId = RDB_TITLEDB;
		fmt = FMT_TXT;
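Inside sendBackDump() the file-name suffix selects both the record source and the output format. Together with the "_urls.csv" branch implied by the context lines above and the "GET /download/mycoll_urls.csv" comment, the mapping is: _urls.csv gives RDB_SPIDERDB as CSV, _urls.txt gives RDB_SPIDERDB as plain text, and _pages.txt gives RDB_TITLEDB as plain text. A self-contained sketch of that dispatch, using stand-in enums instead of the real RDB_*/FMT_* constants:

	// Sketch only: the suffix-to-(source,format) dispatch, with stand-in enums.
	#include <cstring>
	#include <cstdio>

	enum Rdb { SPIDERDB, TITLEDB, UNKNOWN_RDB };   // stand-ins for RDB_SPIDERDB etc.
	enum Fmt { CSV, TXT, UNKNOWN_FMT };            // stand-ins for FMT_CSV / FMT_TXT

	static void pickDump ( const char *path , Rdb *rdbId , Fmt *fmt ) {
		*rdbId = UNKNOWN_RDB;
		*fmt   = UNKNOWN_FMT;
		if      ( strstr ( path , "_urls.csv"  ) ) { *rdbId = SPIDERDB; *fmt = CSV; }
		else if ( strstr ( path , "_urls.txt"  ) ) { *rdbId = SPIDERDB; *fmt = TXT; }
		else if ( strstr ( path , "_pages.txt" ) ) { *rdbId = TITLEDB ; *fmt = TXT; }
	}

	int main ( ) {
		Rdb r; Fmt f;
		pickDump ( "/download/mycoll_urls.txt" , &r , &f );
		printf ( "rdb=%d fmt=%d\n" , (int)r , (int)f ); // rdb=0 (SPIDERDB) fmt=1 (TXT)
		return 0;
	}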
@@ -518,6 +522,8 @@ bool StateCD::sendList ( ) {
	//	(long)m_printedEndingBracket);

	bool lastChunk = false;
	if ( ! m_someoneNeedsMore )
		lastChunk = true;

	// if nobody needs to read more...
	if ( m_rdbId == RDB_TITLEDB &&
@@ -528,7 +534,6 @@ bool StateCD::sendList ( ) {
		// end array of json objects. might be empty!
		sb.safePrintf("\n]\n");
		//log("adding ]. len=%li",sb.length());
		lastChunk = true;
	}

	TcpServer *tcp = &g_httpServer.m_tcp;
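The lastChunk flag controls how the streamed reply is terminated: list data is sent out piece by piece, and only when no reader needs more (or the titledb JSON dump is finished) does the code append the closing "]" of the JSON array and mark the chunk as final. A self-contained sketch of the same pattern, with std::string standing in for SafeBuf and a fixed item list standing in for the RdbList:

	// Sketch only: stream a JSON array in chunks, closing it on the last one.
	#include <string>
	#include <vector>
	#include <cstdio>

	int main ( ) {
		std::vector<std::string> items = { "{\"url\":\"a\"}", "{\"url\":\"b\"}" };
		std::string sb = "[";                            // stands in for SafeBuf
		for ( size_t i = 0 ; i < items.size() ; i++ ) {
			sb += "\n" + items[i];
			if ( i + 1 < items.size() ) sb += ",";
			bool lastChunk = ( i + 1 == items.size() ); // like !m_someoneNeedsMore
			if ( lastChunk )
				sb += "\n]\n";                      // end array of json objects
			// the real code hands sb to the TcpServer as one HTTP chunk here
			printf ( "chunk %zu (last=%d):\n%s\n" , i , (int)lastChunk , sb.c_str() );
			sb.clear();
		}
		return 0;
	}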
@@ -715,7 +720,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
		m_isFirstTime = false;
		sb->safePrintf("\"Url\","
			"\"Entry Method\","
			"\"Processed?\","
			);
		if ( cr->m_isCustomCrawl )
			sb->safePrintf("\"Processed?\",");
		sb->safePrintf(
			"\"Add Time\","
			"\"Last Crawled\","
			"\"Last Status\","
@@ -747,12 +755,15 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
	// but default to csv
	else {
		sb->safePrintf("\"%s\",\"%s\","
			"%li,%lu,%lu,\"%s\",\"%s\",\""
			//",%s"
			//"\n"
			, sreq->m_url
			, as
			, (long)isProcessed
			);
		if ( cr->m_isCustomCrawl )
			sb->safePrintf("%li,",(long)isProcessed);
		sb->safePrintf(
			"%lu,%lu,\"%s\",\"%s\",\""
			//",%s"
			//"\n"
			// when was it first added to spiderdb?
			, sreq->m_addedTime
			// last time spidered, 0 if none
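Note that the optional "Processed?" column now has to be emitted in two places under the same cr->m_isCustomCrawl test: once in the header row (the -715,7 hunk above) and once per data row here, otherwise the CSV columns drift out of alignment. A reduced sketch of that pairing, with printf standing in for SafeBuf and made-up field names:

	// Sketch only: header and rows must gate the optional column identically.
	#include <cstdio>

	struct Rec { const char *url; long processed; long addedTime; };

	static void printCsv ( const Rec *recs , int n , bool isCustomCrawl ) {
		// header row
		printf ( "\"Url\"," );
		if ( isCustomCrawl ) printf ( "\"Processed?\"," );
		printf ( "\"Add Time\"\n" );
		// data rows: same condition, same column position
		for ( int i = 0 ; i < n ; i++ ) {
			printf ( "\"%s\"," , recs[i].url );
			if ( isCustomCrawl ) printf ( "%li," , recs[i].processed );
			printf ( "%li\n" , recs[i].addedTime );
		}
	}

	int main ( ) {
		Rec recs[] = { { "http://example.com/" , 1 , 1400000000L } };
		printCsv ( recs , 1 , true  );
		printCsv ( recs , 1 , false );
		return 0;
	}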
Pages.cpp
@@ -1847,41 +1847,55 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
		if ( i == PAGE_SEARCHBOX ) continue;
		if ( i == PAGE_TITLEDB ) continue;

		// print "url download" before "inject url"
		// GET /mycollname_urls.csv
		if ( i == PAGE_INJECT ) {
			sb->safePrintf (
				"<b>"
				"<a style=text-decoration:none; "
				"href=\"/download/%s_urls.txt\">"
				"url download"
				"</a>"
				"</b>"
				" \n",
				coll );
		}

		if ( cr && ! cr->m_isCustomCrawl && i == PAGE_CRAWLBOT )
			continue;

		// print it out
		if ( i == PAGE_LOGIN || i == PAGE_LOGIN2 )
			sb->safePrintf(
				"<span style=\"white-space:nowrap\">"
				"<a href=\"/%s?"
				//"user=%s&pwd=%s&"
				"c=%s%s\">%s</a>"
				"</span>"
				" \n",s_pages[i].m_filename,
				//username,pwd,
				coll,
				buf,s_pages[i].m_name);
				//"<span style=\"white-space:nowrap\">"
				"<a href=\"/%s?"
				//"user=%s&pwd=%s&"
				"c=%s%s\">%s</a>"
				//"</span>"
				" \n",s_pages[i].m_filename,
				//username,pwd,
				coll,
				buf,s_pages[i].m_name);
		else if ( page == i )
			sb->safePrintf(
				"<span style=\"white-space:nowrap\">"
				"<a href=\"/%s?c=%s%s\"><b>"
				"<font color=red>%s</font></b></a>"
				"</span>"
				" \n",s_pages[i].m_filename,
				coll,
				buf,s_pages[i].m_name);
				//"<span style=\"white-space:nowrap\">"
				"<a href=\"/%s?c=%s%s\"><b>"
				"<font color=red>%s</font></b></a>"
				//"</span>"
				" \n",s_pages[i].m_filename,
				coll,
				buf,s_pages[i].m_name);
		else
			sb->safePrintf(
				"<span style=\"white-space:nowrap\">"
				"<b>"
				"<a style=text-decoration:none; "
				"href=\"/%s?c=%s%s\">%s</a>"
				"</b>"
				"</span>"
				" \n",s_pages[i].m_filename,
				coll,
				buf,s_pages[i].m_name);
				//"<span style=\"white-space:nowrap\">"
				"<b>"
				"<a style=text-decoration:none; "
				"href=\"/%s?c=%s%s\">%s</a>"
				"</b>"
				//"</span>"
				" \n",s_pages[i].m_filename,
				coll,
				buf,s_pages[i].m_name);
		// print <br> after the last master admin control
		/*
		if ( i == PAGE_DELCOLL && user == USER_MASTER ) {
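For illustration, with a hypothetical collection named "test", the PAGE_INJECT branch above writes roughly <b><a style=text-decoration:none; href="/download/test_urls.txt">url download</a></b> into the admin navigation, which is the link that drives the new download path handled in HttpServer::sendReply(). A reduced sketch of just that safePrintf, using printf in place of SafeBuf:

	// Sketch only: the "url download" admin link for a hypothetical collection.
	#include <cstdio>

	int main ( ) {
		const char *coll = "test";   // hypothetical collection name
		printf ( "<b>"
		         "<a style=text-decoration:none; "
		         "href=\"/download/%s_urls.txt\">"
		         "url download"
		         "</a>"
		         "</b>"
		         " \n" , coll );
		return 0;
	}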
@@ -11378,11 +11378,12 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
					 "repeat is scheduled.");
	}

	if ( cx->m_spiderStatus == SP_ROUNDDONE &&
	     ! cx->m_isCustomCrawl ) {
	if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
		*status = SP_ROUNDDONE;
		return msg->safePrintf ( "Nothing currently "
					 "available to spider.");
					 "available to spider. "
					 "Change your url filters or try "
					 "adding new urls.");
	}