// Diffbot API implementation
//
// WHAT APIs are here?
//
// . 1. the CrawlBot API to start a crawl
// . 2. the injection API to directly process a provided URL
// . 3. the Cache API so phantomjs can quickly check the cache for files
//      and quickly add files to the cache
//
// Related pages:
//
// * http://diffbot.com/dev/docs/  (Crawlbot API tab, and others)
// * http://diffbot.com/dev/crawl/
#include "Errno.h"
#include "PageCrawlBot.h"
#include "TcpServer.h"
#include "HttpRequest.h"
#include "HttpServer.h"
#include "Pages.h"      // g_msg
#include "XmlDoc.h"     // for checkRegex()
#include "PageInject.h" // Msg7
#include "Repair.h"
#include "Parms.h"

// so user can specify the format of the reply/output
//#define FMT_HTML 1
//#define FMT_XML  2
//#define FMT_JSON 3
//#define FMT_CSV  4
//#define FMT_TXT  5

void doneSendingWrapper ( void *state , TcpSocket *sock );

bool sendBackDump ( TcpSocket *s , HttpRequest *hr );

CollectionRec *addNewDiffbotColl ( char *addColl , char *token , char *name ,
				   class HttpRequest *hr );

bool resetUrlFilters ( CollectionRec *cr );

bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
				     HttpRequest *hr ,
				     CollectionRec *cr );

////////////////
//
// SUPPORT FOR DOWNLOADING AN RDB DUMP
//
// We ask each shard for a chunk of Spiderdb records. If a full chunk was
// returned then we repeat. Every time we get a chunk from each shard we
// print the Spiderdb records out into a SafeBuf and transmit it to the
// user. Once that buffer has been transmitted we ask the shards for
// another chunk's worth of spider records.
//
////////////////
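//
// The streaming flow below is roughly (a simplified sketch; see
// readAndSendLoop(), readDataFromRdb() and sendList() for the real logic):
//
//   while ( someone needs more ) {
//       readDataFromRdb()    - msg0 request to one live host per shard,
//                              starting at that shard's saved start key
//       gotListWrapper7()    - wait until every shard has replied
//       sendList()           - print the lists into a SafeBuf, advance each
//                              shard's start key past the last key printed,
//                              and sendChunk() the buffer to the browser
//       doneSendingWrapper() - TcpServer callback, loop back around
//   }
//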
// use this as a state while dumping out spiderdb for a collection
class StateCD {
public:
	StateCD ( ) { m_needsMime = true; };
	void sendBackDump2 ( );
	bool readDataFromRdb ( );
	bool sendList ( );
	void printSpiderdbList ( RdbList *list , SafeBuf *sb ,
				 char **lastKeyPtr );
	void printTitledbList ( RdbList *list , SafeBuf *sb ,
				char **lastKeyPtr );
	bool printJsonItemInCsv ( char *json , SafeBuf *sb );

	int64_t m_lastUh48;
	int32_t m_lastFirstIp;
	int64_t m_prevReplyUh48;
	int32_t m_prevReplyFirstIp;
	int32_t m_prevReplyError;
	time_t  m_prevReplyDownloadTime;
	char m_fmt;
	Msg4 m_msg4;
	HttpRequest m_hr;
	Msg7 m_msg7;

	int32_t m_dumpRound;
	int64_t m_accumulated;

	WaitEntry m_waitEntry;

	bool m_isFirstTime;

	bool m_printedFirstBracket;
	bool m_printedEndingBracket;
	bool m_printedItem;

	bool m_needHeaderRow;

	SafeBuf m_seedBank;
	SafeBuf m_listBuf;

	bool m_needsMime;
	char m_rdbId;
	bool m_downloadJSON;
	collnum_t m_collnum;
	int32_t m_numRequests;
	int32_t m_numReplies;
	int32_t m_minRecSizes;
	bool m_someoneNeedsMore;
	TcpSocket *m_socket;
	Msg0     m_msg0s             [ MAX_HOSTS ];
	key128_t m_spiderdbStartKeys [ MAX_HOSTS ];
	key_t    m_titledbStartKeys  [ MAX_HOSTS ];
	RdbList  m_lists             [ MAX_HOSTS ];
	bool     m_needMore          [ MAX_HOSTS ];
};
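
// Note: the per-shard "cursor" arrays above (m_msg0s, m_spiderdbStartKeys,
// m_titledbStartKeys, m_lists, m_needMore) are indexed by shard number and
// sized MAX_HOSTS, but only g_hostdb.m_numShards entries are actually used.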

// . basically dump out spiderdb
// . returns urls in csv format in reply to a
//   "GET /api/download/%s_data.json"
//   "GET /api/download/%s_data.xml"
//   "GET /api/download/%s_urls.csv"
//   "GET /api/download/%s_pages.txt"
//   where %s is the collection name
// . the ordering of the urls is not specified, so whatever order they are
//   in spiderdb will do
// . the gui that lists the urls as they are spidered in real time when you
//   do a test crawl will just have to call this repeatedly. it shouldn't
//   be too slow because of disk caching, and, most likely, the spider
//   requests will all be in spiderdb's rdbtree anyhow
// . because we are distributed we have to send a msg0 request to each
//   shard/group asking for all the spider urls. dan says 30MB is typical
//   for a csv file, so for now we will just try to do a single spiderdb
//   request.
bool sendBackDump ( TcpSocket *sock , HttpRequest *hr ) {

	char *path = hr->getPath();
	int32_t pathLen = hr->getPathLen();
	char *pathEnd = path + pathLen;
	char *str = strstr ( path , "/download/" );
	if ( ! str ) {
		char *msg = "bad download request";
		log("crawlbot: %s",msg);
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}

	// when downloading csv the socket can close because we can take
	// minutes before we send over the first byte, so try to keep it open
	//int parm = 1;
	//if(setsockopt(sock->m_sd,SOL_TCP,SO_KEEPALIVE,&parm,sizeof(int))<0){
	//	log("crawlbot: setsockopt: %s",mstrerror(errno));
	//	errno = 0;
	//}

	//int32_t pathLen = hr->getPathLen();

	char rdbId = RDB_NONE;
	bool downloadJSON = false;
	int32_t fmt;
	char *xx;
	int32_t dt = CT_JSON;
	if ( ( xx = strstr ( path , "_data.json" ) ) ) {
		rdbId = RDB_TITLEDB;
		fmt = FORMAT_JSON;
		downloadJSON = true;
		dt = CT_JSON;
	}
	else if ( ( xx = strstr ( path , "_html.json" ) ) ) {
		rdbId = RDB_TITLEDB;
		fmt = FORMAT_JSON;
		downloadJSON = true;
		dt = CT_HTML;
	}
	else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
		rdbId = RDB_TITLEDB;
		downloadJSON = true;
		fmt = FORMAT_CSV;
	}
	else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
		rdbId = RDB_SPIDERDB;
		fmt = FORMAT_CSV;
	}
	else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
		rdbId = RDB_SPIDERDB;
		fmt = FORMAT_TXT;
	}
	else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
		rdbId = RDB_TITLEDB;
		fmt = FORMAT_TXT;
	}
	// sanity, must be one of the download calls above
	if ( rdbId == RDB_NONE ) {
		char *msg;
		msg = "usage: downloadurls, downloadpages, downloaddata";
		log("crawlbot: %s",msg);
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}
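
	// Summary of the path-suffix dispatch above:
	//
	//   _data.json  -> titledb  , json  (diffbot json objects)
	//   _html.json  -> titledb  , json  (html docs)
	//   _data.csv   -> titledb  , csv
	//   _urls.csv   -> spiderdb , csv
	//   _urls.txt   -> spiderdb , txt
	//   _pages.txt  -> titledb  , txt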

	char *coll = str + 10;
	if ( coll >= pathEnd ) {
		char *msg = "bad download request2";
		log("crawlbot: %s",msg);
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}

	// get coll
	char *collEnd = xx;
	//CollectionRec *cr = getCollRecFromHttpRequest ( hr );
	CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll );
	if ( ! cr ) {
		char *msg = "token or id (crawlid) invalid";
		log("crawlbot: invalid token or crawlid to dump");
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}

	// . if doing download of csv, make it search results now!
	// . make an httprequest on the stack and call it
	if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) {
		char tmp2[5000];
		SafeBuf sb2 (tmp2,5000);
		int32_t dr = 1;
		// do not dedup bulk jobs
		if ( cr->m_isCustomCrawl == 2 ) dr = 0;
		// do not dedup for crawls either, it is too confusing!!!!
		// people wonder where the results are!
		dr = 0;
		sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&"
			       // dedup. since stream=1 and pss=0 below
			       // this will dedup on page content hash only
			       // which is super fast.
			       "dr=%"INT32"&"
			       "c=%s&n=1000000&"
			       // stream it now
			       "stream=1&"
			       // no summary similarity dedup, only exact
			       // doc content hash. otherwise too slow!!
			       "pss=0&"
			       // no gigabits
			       "dsrt=0&"
			       // do not compute summary. 0 lines.
			       "ns=0&"
			       "q=gbsortby%%3Agbspiderdate&"
			       "prepend=type%%3Ajson"
			       "\r\n\r\n"
			       , dr
			       , cr->m_coll
			       );
		log("crawlbot: %s",sb2.getBufStart());
		HttpRequest hr2;
		hr2.set ( sb2.getBufStart() , sb2.length() , sock );
		return sendPageResults ( sock , &hr2 );
	}

	// . if doing download of json, make it search results now!
	// . make an httprequest on the stack and call it
	if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB && dt == CT_HTML ) {
		char tmp2[5000];
		SafeBuf sb2 (tmp2,5000);
		int32_t dr = 1;
		// do not dedup bulk jobs
		if ( cr->m_isCustomCrawl == 2 ) dr = 0;
		// do not dedup for crawls either, it is too confusing!!!!
		// people wonder where the results are!
		dr = 0;
		sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
			       // dedup. since stream=1 and pss=0 below
			       // this will dedup on page content hash only
			       // which is super fast.
			       "dr=%"INT32"&"
			       "c=%s&n=1000000&"
			       // we can stream this because unlike csv it
			       // has no header row that needs to be
			       // computed from all results.
			       "stream=1&"
			       // no summary similarity dedup, only exact
			       // doc content hash. otherwise too slow!!
			       "pss=0&"
			       // no gigabits
			       "dsrt=0&"
			       // do not compute summary. 0 lines.
			       "ns=0&"
			       //"q=gbsortby%%3Agbspiderdate&"
			       //"prepend=type%%3A%s"
			       "q=type%%3Ahtml"
			       "\r\n\r\n"
			       , dr
			       , cr->m_coll
			       );
		log("crawlbot: %s",sb2.getBufStart());
		HttpRequest hr2;
		hr2.set ( sb2.getBufStart() , sb2.length() , sock );
		return sendPageResults ( sock , &hr2 );
	}
	if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) {
		char tmp2[5000];
		SafeBuf sb2 (tmp2,5000);
		int32_t dr = 1;
		// do not dedup bulk jobs
		if ( cr->m_isCustomCrawl == 2 ) dr = 0;
		// do not dedup for crawls either, it is too confusing!!!!
		// people wonder where the results are!
		dr = 0;
		sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
			       // dedup. since stream=1 and pss=0 below
			       // this will dedup on page content hash only
			       // which is super fast.
			       "dr=%"INT32"&"
			       "c=%s&n=1000000&"
			       // we can stream this because unlike csv it
			       // has no header row that needs to be
			       // computed from all results.
			       "stream=1&"
			       // no summary similarity dedup, only exact
			       // doc content hash. otherwise too slow!!
			       "pss=0&"
			       // no gigabits
			       "dsrt=0&"
			       // do not compute summary. 0 lines.
			       "ns=0&"
			       "q=gbsortby%%3Agbspiderdate&"
			       "prepend=type%%3Ajson"
			       "\r\n\r\n"
			       , dr
			       , cr->m_coll
			       );
		log("crawlbot: %s",sb2.getBufStart());
		HttpRequest hr2;
		hr2.set ( sb2.getBufStart() , sb2.length() , sock );
		return sendPageResults ( sock , &hr2 );
	}

	// . now the urls.csv is also a query on gbss files
	// . make an httprequest on the stack and call it
	// . only do this for version 3,
	//   i.e. GET /v3/crawl/download/token-collectionname_urls.csv
	if ( fmt == FORMAT_CSV &&
	     rdbId == RDB_SPIDERDB &&
	     path[0] == '/' &&
	     path[1] == 'v' &&
	     path[2] == '3' ) {
		char tmp2[5000];
		SafeBuf sb2 (tmp2,5000);
		// never dedup
		int32_t dr = 0;
		// do not dedup for crawls either, it is too confusing!!!!
		// people wonder where the results are!
		dr = 0;
		sb2.safePrintf("GET /search?"
			       // this is not necessary
			       //"icc=1&"
			       "format=csv&"
			       // no site clustering
			       "sc=0&"
			       // never dedup.
			       "dr=0&"
			       "c=%s&"
			       "n=10000000&"
			       // stream it now
			       // can't stream until we fix headers to be
			       // printed in Msg40.cpp. so gbssUrl->Url etc.
			       // mdw: ok, should work now
			       "stream=1&"
			       //"stream=0&"
			       // no summary similarity dedup, only exact
			       // doc content hash. otherwise too slow!!
			       "pss=0&"
			       // no gigabits
			       "dsrt=0&"
			       // do not compute summary. 0 lines.
			       //"ns=0&"
			       "q=gbrevsortbyint%%3AgbssSpiderTime+"
			       "gbssIsDiffbotObject%%3A0"
			       "&"
			       //"prepend=type%%3Ajson"
			       "\r\n\r\n"
			       , cr->m_coll
			       );
		log("crawlbot: %s",sb2.getBufStart());
		HttpRequest hr2;
		hr2.set ( sb2.getBufStart() , sb2.length() , sock );
		return sendPageResults ( sock , &hr2 );
	}

	//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
	//	rdbId = RDB_SPIDERDB;
	//if ( strncmp ( path ,"/crawlbot/downloadpages",23 ) == 0 )
	//	rdbId = RDB_TITLEDB;
	//if ( strncmp ( path ,"/crawlbot/downloaddata",22 ) == 0 ) {
	//	downloadJSON = true;
	//	rdbId = RDB_TITLEDB;
	//}

	StateCD *st;
	try { st = new (StateCD); }
	catch ( ... ) {
		return g_httpServer.sendErrorReply(sock,500,
						   mstrerror(g_errno));
	}
	mnew ( st , sizeof(StateCD) , "statecd" );

	// initialize the new state
	st->m_rdbId = rdbId;
	st->m_downloadJSON = downloadJSON;
	st->m_socket = sock;
	// the name of the collection whose spiderdb we read from
	st->m_collnum = cr->m_collnum;

	st->m_fmt = fmt;
	st->m_isFirstTime = true;

	st->m_printedFirstBracket  = false;
	st->m_printedItem          = false;
	st->m_printedEndingBracket = false;

	// for csv...
	st->m_needHeaderRow = true;

	st->m_lastUh48 = 0LL;
	st->m_lastFirstIp = 0;
	st->m_prevReplyUh48 = 0LL;
	st->m_prevReplyFirstIp = 0;
	st->m_prevReplyError = 0;
	st->m_prevReplyDownloadTime = 0LL;

	st->m_dumpRound = 0;
	st->m_accumulated = 0LL;

	// debug
	//log("mnew1: st=%"XINT32"",(int32_t)st);

	// begin the possibly segmented process of sending spiderdb back
	// to the user's browser
	st->sendBackDump2();
	// i don't think this return value matters at all since httpserver.cpp
	// does not look at it when it calls sendReply()
	return true;
}

// . all wrappers call this
// . returns false if it would block, true otherwise
bool readAndSendLoop ( StateCD *st , bool readFirst ) {

 subloop:

	// if we had a broken pipe on the sendChunk() call then hopefully
	// this will kick in...
	if ( g_errno ) {
		log("crawlbot: readAndSendLoop: %s",mstrerror(g_errno));
		readFirst = true;
		st->m_someoneNeedsMore = false;
	}

	// wait if some are outstanding. how can this happen?
	if ( st->m_numRequests > st->m_numReplies ) {
		log("crawlbot: only got %"INT32" of %"INT32" replies. "
		    "waiting for all to come back in.",
		    st->m_numReplies,st->m_numRequests);
		return false;
	}

	// are we all done? we still have to call sendList() to
	// set the socket's streamingMode to false to close things up
	if ( readFirst && ! st->m_someoneNeedsMore ) {
		log("crawlbot: done sending for download request");
		mdelete ( st , sizeof(StateCD) , "stcd" );
		delete st;
		return true;
	}

	// begin reading from each shard and sending the spiderdb records
	// over the network. return if that blocked
	if ( readFirst && ! st->readDataFromRdb() ) return false;

	// did the user delete their collection midstream on us?
	if ( g_errno ) {
		log("crawlbot: read shard data had error: %s",
		    mstrerror(g_errno));
		goto subloop;
	}

	// send it to the browser socket. returns false if it blocks.
	if ( ! st->sendList() ) return false;

	// read again i guess
	readFirst = true;
	// hey, it did not block... tcpserver caches writes...
	goto subloop;
}

void StateCD::sendBackDump2 ( ) {
	m_numRequests = 0;
	m_numReplies  = 0;
	// how much to read from each shard's rdb at a time
	//m_minRecSizes = 9999999;
	// 100k to be more fluid
	m_minRecSizes = 99999;
	// we stop reading from all shards when this becomes false
	m_someoneNeedsMore = true;
	// initialize the spiderdb startkey "cursor" for each shard's spiderdb
	for ( int32_t i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
		m_needMore[i] = true;
		KEYMIN ( (char *)&m_spiderdbStartKeys[i] , sizeof(key128_t) );
		KEYMIN ( (char *)&m_titledbStartKeys[i]  , sizeof(key_t)    );
	}
	// begin reading from the shards and transmitting back on m_socket
	readAndSendLoop ( this , true );
}

static void gotListWrapper7 ( void *state ) {
	// get the Crawler dump State
	StateCD *st = (StateCD *)state;
	// inc it up here
	st->m_numReplies++;
	// wait for all
	if ( st->m_numReplies < st->m_numRequests ) return;
	// read and send loop
	readAndSendLoop ( st , false );
}

bool StateCD::readDataFromRdb ( ) {

	// set end key to max key. we are limiting using m_minRecSizes for this
	key128_t ek; KEYMAX ( (char *)&ek , sizeof(key128_t) );
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// collection got nuked?
	if ( ! cr ) {
		log("crawlbot: readdatafromrdb: coll %"INT32" got nuked",
		    (int32_t)m_collnum);
		g_errno = ENOCOLLREC;
		return true;
	}

	// top:
	// launch one request to each shard
	for ( int32_t i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
		// reset each one
		m_lists[i].freeList();
		// if last list was exhausted don't bother
		if ( ! m_needMore[i] ) continue;
		// count it
		m_numRequests++;
		// this is the least nice. crawls will yield to it mostly.
		int32_t niceness = 0;
		// point to the right startkey
		char *sk;
		if ( m_rdbId == RDB_SPIDERDB )
			sk = (char *)&m_spiderdbStartKeys[i];
		else
			sk = (char *)&m_titledbStartKeys[i];
		// get host
		Host *h = g_hostdb.getLiveHostInShard ( i );
		// show it
		int32_t ks = getKeySizeFromRdbId ( m_rdbId );
		log("dump: asking host #%"INT32" for list sk=%s",
		    h->m_hostId,KEYSTR(sk,ks));
		// msg0 uses multicast in case one of the hosts in a shard is
		// dead or dies during this call.
		if ( ! m_msg0s[i].getList ( h->m_hostId , // use multicast
					    h->m_ip   ,
					    h->m_port ,
					    0         , // maxcacheage
					    false     , // addtocache?
					    m_rdbId   ,
					    cr->m_collnum ,
					    &m_lists[i] ,
					    sk ,
					    (char *)&ek ,
					    // get at most about
					    // "minRecSizes" worth of spiderdb
					    // records
					    m_minRecSizes ,
					    this ,
					    gotListWrapper7 ,
					    niceness ) ) {
			log("crawlbot: blocked getting list from shard");
			// continue if it blocked
			continue;
		}

		log("crawlbot: did not block getting list from shard err=%s",
		    mstrerror(g_errno));
		// we got a reply back right away...
		m_numReplies++;
	}
	// all done? return if still waiting on more msg0s to get their data
	if ( m_numReplies < m_numRequests ) return false;
	// i guess it did not block. empty single shard? no, it must have been
	// an error because sendList() would have sent back on the tcp
	// socket and blocked and returned false if there was no send error
	return true;
}

bool StateCD::sendList ( ) {
	// get the Crawler dump State
	// inc it
	//m_numReplies++;
	// show it
	log("crawlbot: got list from shard. req=%"INT32" rep=%"INT32"",
	    m_numRequests,m_numReplies);
	// return if still awaiting more replies
	if ( m_numReplies < m_numRequests ) return false;

	SafeBuf sb;
	//sb.setLabel("dbotdmp");

	char *ct = "text/csv";
	if ( m_fmt == FORMAT_JSON )
		ct = "application/json";
	if ( m_fmt == FORMAT_XML )
		ct = "text/xml";
	if ( m_fmt == FORMAT_TXT )
		ct = "text/plain";
	if ( m_fmt == FORMAT_CSV )
		ct = "text/csv";

	// . if we haven't yet sent an http mime back to the user
	//   then do so here. the content-length will not be in there
	//   because we might have to call for more spiderdb data
	if ( m_needsMime ) {
		m_needsMime = false;
		HttpMime mime;
		mime.makeMime ( -1    , // total content-length is unknown!
				0     , // do not cache (cacheTime)
				0     , // lastModified
				0     , // offset
				-1    , // bytesToSend
				NULL  , // ext
				false , // POSTReply
				ct    , // "text/csv", // contenttype
				"utf-8" , // charset
				-1    , // httpstatus
				NULL  ); //cookie
		sb.safeMemcpy ( mime.getMime() , mime.getMimeLen() );
	}

	//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) {
		sb.safePrintf("[\n");
		m_printedFirstBracket = true;
	}

	// these are csv files not xls
	//if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) {
	//	sb.safePrintf("sep=,\n");
	//	m_printedFirstBracket = true;
	//}

	// we set this to true below if any one shard has more spiderdb
	// records left to read
	m_someoneNeedsMore = false;
	//
	// got all replies... create the HTTP reply and send it back
	//
	for ( int32_t i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
		if ( ! m_needMore[i] ) continue;
		// get the list from that group
		RdbList *list = &m_lists[i];
		// should we try to read more?
		m_needMore[i] = false;
		// report it
		log("dump: got list of %"INT32" bytes from host #%"INT32" "
		    "round #%"INT32"",
		    list->getListSize(),i,m_dumpRound);
		if ( list->isEmpty() ) {
			list->freeList();
			continue;
		}
		// get the format
		//char *format = cr->m_diffbotFormat.getBufStart();
		//if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
		//char *format = NULL;

		// this cores because msg0 does not transmit lastkey
		//char *ek = list->getLastKey();
		char *lastKeyPtr = NULL;

		// now print the list out into "sb"
		if ( m_rdbId == RDB_SPIDERDB ) {
			// print SPIDERDB list into "sb"
			printSpiderdbList ( list , &sb , &lastKeyPtr );
			// update spiderdb startkey for this shard
			KEYSET ( (char *)&m_spiderdbStartKeys[i] , lastKeyPtr ,
				 sizeof(key128_t) );
			// advance by 1
			m_spiderdbStartKeys[i] += 1;
		}
		else if ( m_rdbId == RDB_TITLEDB ) {
			// print TITLEDB list into "sb"
			printTitledbList ( list , &sb , &lastKeyPtr );
			// update titledb startkey for this shard
			KEYSET ( (char *)&m_titledbStartKeys[i] , lastKeyPtr ,
				 sizeof(key_t) );
			// advance by 1
			m_titledbStartKeys[i] += 1;
		}
		else { char *xx=NULL;*xx=0; }

		// figure out why we do not get the full list????
		//if ( list->m_listSize >= 0 ) { // m_minRecSizes ) {
		m_needMore[i] = true;
		m_someoneNeedsMore = true;
		//}
		// save mem
		list->freeList();
	}

	m_dumpRound++;

	//log("rdbid=%"INT32" fmt=%"INT32" some=%"INT32" printed=%"INT32"",
	//    (int32_t)m_rdbId,(int32_t)m_fmt,(int32_t)m_someoneNeedsMore,
	//    (int32_t)m_printedEndingBracket);

	m_socket->m_streamingMode = true;

	// if nobody needs to read more...
	if ( ! m_someoneNeedsMore && ! m_printedEndingBracket ) {
		// use this for printing out urls.csv as well...
		m_printedEndingBracket = true;
		// end the array of json objects. might be empty!
		if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON )
			sb.safePrintf("\n]\n");
		//log("adding ]. len=%"INT32"",sb.length());
		// i'd like to exit streaming mode here. i fixed tcpserver.cpp
		// so if we are called from makecallback() there it won't
		// call destroysocket if we WERE in streamingMode just yet
		m_socket->m_streamingMode = false;
	}

	TcpServer *tcp = &g_httpServer.m_tcp;
	// . transmit the chunk in sb
	// . steals the allocated buffer from sb and stores it in the
	//   TcpSocket::m_sendBuf, which it frees when the socket is
	//   ultimately destroyed or we call sendChunk() again.
	// . when TcpServer is done transmitting, it does not close the
	//   socket but rather calls doneSendingWrapper() which can call
	//   this function again to send another chunk
	if ( ! tcp->sendChunk ( m_socket ,
				&sb ,
				this ,
				doneSendingWrapper ) )
		return false;
	// we are done sending this chunk. i guess the tcp write was cached
	// in the network card buffer or something
	return true;
}

// TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf
void doneSendingWrapper ( void *state , TcpSocket *sock ) {

	StateCD *st = (StateCD *)state;

	// error on socket?
	//if ( g_errno ) st->m_socketError = g_errno;

	//TcpSocket *socket = st->m_socket;

	st->m_accumulated += sock->m_totalSent;

	log("crawlbot: done sending on socket %"INT32"/%"INT32" "
	    "[%"INT64"] bytes",
	    sock->m_totalSent,
	    sock->m_sendBufUsed,
	    st->m_accumulated);

	readAndSendLoop ( st , true );

	return;
}

void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ,
				  char **lastKeyPtr ) {
	// declare these up here
	SpiderRequest *sreq = NULL;
	SpiderReply   *srep = NULL;
	int32_t badCount = 0;

	int32_t nowGlobalMS = gettimeofdayInMillisecondsGlobal();
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	uint32_t lastSpidered = 0;

	// parse through it
	for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
		// this record is either a SpiderRequest or SpiderReply
		char *rec = list->getCurrentRec();
		// save it
		*lastKeyPtr = rec;
		// we encounter the spiderreplies first then the
		// spiderrequests for the same url
		if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
			srep = (SpiderReply *)rec;
			if ( sreq ) lastSpidered = 0;
			sreq = NULL;
			if ( lastSpidered == 0 )
				lastSpidered = srep->m_spideredTime;
			else if ( srep->m_spideredTime > lastSpidered )
				lastSpidered = srep->m_spideredTime;
			m_prevReplyUh48    = srep->getUrlHash48();
			m_prevReplyFirstIp = srep->m_firstIp;
			// 0 means indexed successfully. not sure if
			// this includes http status codes like 404 etc.
			// i don't think it includes those types of errors!
			m_prevReplyError        = srep->m_errCode;
			m_prevReplyDownloadTime = srep->m_spideredTime;
			continue;
		}
		// ok, we got a spider request
		sreq = (SpiderRequest *)rec;

		if ( sreq->isCorrupt(-1) ) {
			log("spider: encountered a corrupt spider req "
			    "when dumping cn=%"INT32". skipping.",
			    (int32_t)-1);//cr->m_collnum);
			continue;
		}

		// sanity check
		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() ) {
			badCount++;
			//log("diffbot: had a spider reply with no "
			//    "corresponding spider request for uh48=%"INT64""
			//    , srep->getUrlHash48());
			//char *xx=NULL;*xx=0;
		}
		// print the url if not yet printed
		int64_t uh48    = sreq->getUrlHash48();
		int32_t firstIp = sreq->m_firstIp;
		bool printIt = false;
		// there can be multiple spiderrequests for the same url!
		if ( m_lastUh48 != uh48 ) printIt = true;
		// sometimes the same url has different firstips now that
		// we have the EFAKEFIRSTIP spider error to avoid spidering
		// seeds twice...
		if ( m_lastFirstIp != firstIp ) printIt = true;
		if ( ! printIt ) continue;
		m_lastUh48    = uh48;
		m_lastFirstIp = firstIp;

		// make sure the spiderreply is for the same url!
		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
			srep = NULL;
		if ( ! srep )
			lastSpidered = 0;

		bool isProcessed = false;
		if ( srep ) isProcessed = srep->m_sentToDiffbotThisTime;

		if ( srep && srep->m_hadDiffbotError )
			isProcessed = false;

		// debug point
		//if ( strstr(sreq->m_url,"chief") )
		//	log("hey");

		// 1 means spidered, 0 means not spidered, -1 means error
		int32_t status = 1;
		// if unspidered, then we don't match the prev reply
		// so set "status" to 0 to indicate it hasn't been
		// downloaded yet.
		if ( m_lastUh48    != m_prevReplyUh48    ) status = 0;
		if ( m_lastFirstIp != m_prevReplyFirstIp ) status = 0;
		// if it matches, perhaps an error spidering it?
		if ( status && m_prevReplyError ) status = -1;

		// use the time it was added to spiderdb if the url
		// was not spidered
		time_t time = sreq->m_addedTime;
		// if it was spidered, successfully or got an error,
		// then use the time it was spidered
		if ( status ) time = m_prevReplyDownloadTime;

		char *msg = "Successfully Downloaded";//Crawled";
		if ( status ==  0 ) msg = "Not downloaded";//Unexamined";
		if ( status == -1 ) {
			msg = mstrerror(m_prevReplyError);
			// do not print "Fake First Ip"...
			if ( m_prevReplyError == EFAKEFIRSTIP )
				msg = "Initial crawl request";
			// if the initial crawl request got a reply then that
			// means the spiderrequest was added under the correct
			// firstip... so skip it. i am assuming that the
			// correct spiderrequest got added ok here...
			if ( m_prevReplyError == EFAKEFIRSTIP )
				continue;
		}

		if ( srep && srep->m_hadDiffbotError )
			msg = "Diffbot processing error";

		// indicate the specific diffbot error if we have it
		if ( srep &&
		     srep->m_hadDiffbotError &&
		     srep->m_errCode &&
		     // stick with "diffbot processing error" for these...
		     srep->m_errCode != EDIFFBOTINTERNALERROR )
			msg = mstrerror(srep->m_errCode);

		// matching url filter, print out the expression
		int32_t ufn;
		ufn = ::getUrlFilterNum ( sreq ,
					  srep ,
					  nowGlobalMS ,
					  false ,
					  MAX_NICENESS ,
					  cr ,
					  false , // isoutlink?
					  NULL ,
					  -1 ); // langIdArg
		char *expression = NULL;
		int32_t priority = -4;
		// sanity check
		if ( ufn >= 0 ) {
			expression = cr->m_regExs[ufn].getBufStart();
			priority   = cr->m_spiderPriorities[ufn];
		}
		if ( ! expression ) {
			expression = "error. matches no expression!";
			priority   = -4;
		}
		// when spidering rounds we use the
		// lastspidertime>={roundstart} --> spiders disabled rule
		// so that we do not spider a url twice in the same round
		if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
		     cr->m_regExs[ufn].length() &&
		     // we set this to 0 instead of using the checkbox
		     strstr(cr->m_regExs[ufn].getBufStart(),"round") ) {
		     //cr->m_maxSpidersPerRule[ufn] <= 0 ) {
			priority = -5;
		}

		char *as = "discovered";
		if ( sreq &&
		     ( sreq->m_isInjecting ||
		       sreq->m_isAddUrl ) ) {
			as = "manually added";
		}
		// print column headers?
		if ( m_isFirstTime ) {
			m_isFirstTime = false;
			sb->safePrintf("\"Url\","
				       "\"Entry Method\","
				       );
			if ( cr->m_isCustomCrawl )
				sb->safePrintf("\"Processed?\",");
			sb->safePrintf(
				       "\"Add Time\","
				       "\"Last Crawled\","
				       "\"Last Status\","
				       "\"Matching Expression\","
				       "\"Matching Action\"\n");
		}

		// "csv" is the default if json is not specified
		if ( m_fmt == FORMAT_JSON )
			sb->safePrintf("[{"
				       "{\"url\":"
				       "\"%s\"},"
				       "{\"time\":"
				       "\"%"UINT32"\"},"
				       "{\"status\":"
				       "\"%"INT32"\"},"
				       "{\"statusMsg\":"
				       "\"%s\"}"
				       "}]\n"
				       , sreq->m_url
				       // when was it first added to spiderdb?
				       , sreq->m_addedTime
				       , status
				       , msg
				       );
		// but default to csv
		else {
			if ( cr && cr->m_isCustomCrawl == 1 && sreq &&
			     ! sreq->m_isAddUrl && ! sreq->m_isInjecting ) {
				if ( cr->m_diffbotUrlCrawlPattern.m_length == 0
				  && cr->m_diffbotUrlProcessPattern.m_length
				     == 0 ) {
					// If a crawl and there are no
					// urlCrawlPattern or urlCrawlRegEx
					// values, only return URLs from the
					// seed domain
					if ( sreq && ! sreq->m_sameDom )
						continue;
				} else {
					// TODO: if we get here, we have a
					// crawl with a custom urlCrawlPattern
					// and/or a custom urlProcessPattern.
					// We have to check if the current url
					// matches the pattern
				}
			}
			sb->safePrintf("\"%s\",\"%s\","
				       , sreq->m_url
				       , as
				       );
			if ( cr->m_isCustomCrawl )
				sb->safePrintf("%"INT32",",
					       (int32_t)isProcessed);
			sb->safePrintf(
				       "%"UINT32",%"UINT32",\"%s\",\"%s\",\""
				       //",%s"
				       //"\n"
				       // when was it first added to spiderdb?
				       , sreq->m_addedTime
				       // last time spidered, 0 if none
				       , lastSpidered
				       //, status
				       , msg
				       // the url filter expression it matches
				       , expression
				       // the priority
				       //, priorityMsg
				       //, iptoa(sreq->m_firstIp)
				       );
			// print the priority
			//if ( priority == SPIDER_PRIORITY_FILTERED )
			// we just turn off the spiders now
			if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 )
				sb->safePrintf("url ignored");
			//else if ( priority == SPIDER_PRIORITY_BANNED )
			//	sb->safePrintf("url banned");
			else if ( priority == -4 )
				sb->safePrintf("error");
			else if ( priority == -5 )
				sb->safePrintf("will spider next round");
			else
				sb->safePrintf("%"INT32"",priority);
			sb->safePrintf("\""
				       "\n");
		}
	}

	if ( ! badCount ) return;
	log("diffbot: had a spider reply with no "
	    "corresponding spider request %"INT32" times",badCount);
}
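
// For reference, the csv produced above looks roughly like the following.
// The header row comes from the code above; the data row is a hypothetical
// example, not real output (and custom crawls insert a "Processed?" column):
//
//   "Url","Entry Method","Add Time","Last Crawled","Last Status",
//       "Matching Expression","Matching Action"
//   "http://example.com/","manually added",1378233159,1378233585,
//       "Successfully Downloaded","default","50"
//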

void StateCD::printTitledbList ( RdbList *list , SafeBuf *sb ,
				 char **lastKeyPtr ) {

	XmlDoc xd;
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// save it
	*lastKeyPtr = NULL;

	// parse through it
	for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
		// this record is a titledb record
		char *rec = list->getCurrentRec();
		// skip if negative
		if ( (rec[0] & 0x01) == 0x00 ) continue;
		// set it
		*lastKeyPtr = rec;
		// reset first since set2() can't call reset()
		xd.reset();
		// uncompress it
		if ( ! xd.set2 ( rec ,
				 0 , // maxSize unused
				 cr->m_coll ,
				 NULL , // ppbuf
				 0 , // niceness
				 NULL ) ) { // spiderRequest
			log("diffbot: error setting titlerec in dump");
			continue;
		}
		// must be of type json to be a diffbot json object
		if ( m_downloadJSON && xd.m_contentType != CT_JSON ) continue;
		// or if downloading web pages...
		if ( ! m_downloadJSON ) {
			// skip if json object content type
			if ( xd.m_contentType == CT_JSON ) continue;
			// . just print the cached page
			// . size should include the \0
			sb->safeStrcpy ( xd.m_firstUrl.m_url );
			// then \n
			sb->pushChar('\n');
			// then page content
			sb->safeStrcpy ( xd.ptr_utf8Content );
			// null term just in case
			//sb->nullTerm();
			// separate pages with \0 i guess
			sb->pushChar('\0');
			// \n
			sb->pushChar('\n');
			continue;
		}

		// skip if not a diffbot json url
		if ( ! xd.m_isDiffbotJSONObject ) continue;

		// get the json content
		char *json = xd.ptr_utf8Content;
		// empty?
		if ( xd.size_utf8Content <= 1 )
			continue;

		// if not json, just print the json item out in csv
		// moved into PageResults.cpp...
		//if ( m_fmt == FORMAT_CSV ) {
		//	printJsonItemInCsv ( json , sb );
		//	continue;
		//}

		// just print that out. encode \n's and \r's back to \\n \\r
		// and backslash to a \\ ...
		// but if they originally had a \u<backslash> encoding and
		// we made it into utf8, do not put that back into the \u
		// encoding because it is not necessary.

		// print in json
		if ( m_printedItem )
			sb->safePrintf("\n,\n");
		m_printedItem = true;

		//if ( ! sb->safeStrcpyPrettyJSON ( json ) )
		//	log("diffbot: error printing json in dump");
		sb->safeStrcpy ( json );
		sb->nullTerm();

		// separate each JSON object with \n i guess
		//sb->pushChar('\n');
	}
}

/*
2013-09-13 16:22:07 -07:00
////////////////
//
// SUPPORT FOR GET /api/crawls and /api/activecrawls
//
// Just scan each collection record whose collection name includes the
// provided "token" of the user. then print out the stats of just
//
////////////////
// example output for http://live.diffbot.com/api/crawls?token=matt
// [{"id":"c421f09d-7c31-4131-9da2-21e35d8130a9","finish":1378233585887,"matched":274,"status":"Stopped","start":1378233159848,"token":"matt","parameterMap":{"token":"matt","seed":"www.techcrunch.com","api":"article"},"crawled":274}]
// example output from activecrawls?id=....
// {"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":null,"matched":27,"status":"Crawling","start":1378322184332,"token":"matt","parameterMap":{"token":"matt","seed":"www.alleyinsider.com","api":"article"},"crawled":34}
// NOTE: it does not seem to include active crawls! bad!! like if you lost
// the crawlid...
// "cr" is NULL if showing all crawls!
bool showAllCrawls ( TcpSocket * s , HttpRequest * hr ) {
2014-11-10 14:45:11 -08:00
int32_t tokenLen = 0 ;
2013-09-13 16:22:07 -07:00
char * token = hr - > getString ( " token " , & tokenLen ) ;
// token MUST be there because this function's caller checked for it
if ( ! token ) { char * xx = NULL ; * xx = 0 ; }
// store the crawl stats as html into "sb"
SafeBuf sb ;
// scan the collection recs
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
2013-09-13 16:22:07 -07:00
// get it
CollectionRec * cr = g_collectiondb . m_recs [ i ] ;
// skip if empty
if ( ! cr ) continue ;
// get name
char * coll = cr - > m_coll ;
2014-11-10 14:45:11 -08:00
//int32_t collLen = cr->m_collLen;
2013-09-13 16:22:07 -07:00
// skip if first 16 or whatever characters does not match
// the user token because the name of a collection is
// <TOKEN>-<CRAWLID>
if ( coll [ 0 ] ! = token [ 0 ] ) continue ;
if ( coll [ 1 ] ! = token [ 1 ] ) continue ;
if ( coll [ 2 ] ! = token [ 2 ] ) continue ;
// scan the rest
bool match = true ;
2014-11-10 14:45:11 -08:00
for ( int32_t i = 3 ; coll [ i ] & & token [ i ] ; i + + ) {
2013-09-13 16:22:07 -07:00
// the name of a collection is <TOKEN>-<CRAWLID>
// so if we hit the hyphen we are done
if ( coll [ i ] = = ' - ' ) break ;
if ( coll [ i ] ! = token [ i ] ) { match = false ; break ; }
}
if ( ! match ) continue ;
// we got a match, print them out
printCrawlStats ( & sb , cr ) ;
}
// and send back now
return g_httpServer . sendDynamicPage ( s , sb . getBufStart ( ) ,
sb . length ( ) ,
2013-09-17 10:25:54 -07:00
- 1 ) ; // cachetime
2013-09-13 16:22:07 -07:00
}
2013-09-17 10:25:54 -07:00
*/
2013-09-13 16:22:07 -07:00
2013-10-15 14:08:55 -07:00
/*
2013-09-17 15:32:28 -07:00
char * getTokenFromHttpRequest ( HttpRequest * hr ) {
2013-09-17 11:27:31 -07:00
// provided directly?
2013-09-17 15:32:28 -07:00
char * token = hr - > getString ( " token " , NULL , NULL ) ;
2013-09-17 11:27:31 -07:00
if ( token ) return token ;
// extract token from coll?
char * c = hr - > getString ( " c " , NULL , NULL ) ;
2013-09-26 15:32:11 -06:00
// try new "id" approach
if ( ! c ) c = hr - > getString ( " id " , NULL , NULL ) ;
2013-09-17 11:27:31 -07:00
if ( ! c ) return NULL ;
2013-09-17 15:32:28 -07:00
CollectionRec * cr = g_collectiondb . getRec ( c ) ;
if ( ! cr ) return NULL ;
if ( cr - > m_diffbotToken . length ( ) < = 0 ) return NULL ;
token = cr - > m_diffbotToken . getBufStart ( ) ;
return token ;
2013-09-17 11:27:31 -07:00
}
2013-09-17 10:25:54 -07:00
CollectionRec * getCollRecFromHttpRequest ( HttpRequest * hr ) {
2013-09-17 11:27:31 -07:00
// if we have the collection name explicitly, get the coll rec then
2013-09-17 10:25:54 -07:00
char * c = hr - > getString ( " c " , NULL , NULL ) ;
2013-09-26 15:32:11 -06:00
// try new "id" approach
if ( ! c ) c = hr - > getString ( " id " , NULL , NULL ) ;
2013-09-17 10:25:54 -07:00
if ( c ) return g_collectiondb . getRec ( c ) ;
// no matches
return NULL ;
}
2013-10-15 14:08:55 -07:00
*/
2013-09-17 10:25:54 -07:00
/*
2013-09-13 16:22:07 -07:00
// doesn't have to be fast, so just do a scan
CollectionRec * getCollRecFromCrawlId ( char * crawlId ) {
2014-11-10 14:45:11 -08:00
int32_t idLen = gbstrlen ( crawlId ) ;
2013-09-13 16:22:07 -07:00
// scan collection names
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
2013-09-13 16:22:07 -07:00
// get it
CollectionRec * cr = g_collectiondb . m_recs [ i ] ;
// skip if empty
if ( ! cr ) continue ;
// get name
char * coll = cr - > m_coll ;
2014-11-10 14:45:11 -08:00
int32_t collLen = cr - > m_collLen ;
2013-09-13 16:22:07 -07:00
if ( collLen < 16 ) continue ;
// skip if first 16 or whatever characters does not match
// the user token because the name of a collection is
// <TOKEN>-<CRAWLID>
if ( coll [ collLen - 1 ] ! = crawlId [ idLen - 1 ] ) continue ;
if ( coll [ collLen - 2 ] ! = crawlId [ idLen - 2 ] ) continue ;
if ( coll [ collLen - 3 ] ! = crawlId [ idLen - 3 ] ) continue ;
if ( ! strstr ( coll , crawlId ) ) continue ;
return cr ;
}
return NULL ;
}
void printCrawlStatsWrapper ( void * state ) {
StateXX * sxx = ( StateXX * ) state ;
// get collection rec
CollectionRec * cr = g_collectiondb . getRec ( sxx - > m_collnum ) ;
// print out the crawl
SafeBuf sb ;
printCrawlStats ( & sb , cr ) ;
// save before nuking state
TcpSocket * sock = sxx - > m_socket ;
// nuke the state
mdelete ( sxx , sizeof ( StateXX ) , " stxx " ) ;
2014-01-17 18:28:17 -08:00
delete sxx ;
2013-09-13 16:22:07 -07:00
// and send back now
g_httpServer . sendDynamicPage ( sock ,
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-09-17 10:25:54 -07:00
- 1 ) ; // cachetime
2013-09-13 16:22:07 -07:00
}
void printCrawlStats ( SafeBuf * sb , CollectionRec * cr ) {
// if we are the first, print a '[' to start a json thingy
if ( sb - > length ( ) = = 0 )
sb - > pushChar ( ' [ ' ) ;
// otherwise, remove the previous ']' since we are not the last
else {
char * p = sb - > getBufStart ( ) ;
2014-11-10 14:45:11 -08:00
int32_t plen = sb - > length ( ) ;
2013-09-13 16:22:07 -07:00
if ( p [ plen - 1 ] = = ' [ ' )
sb - > incrementLength ( - 1 ) ;
}
sb - > safePrintf ( " { "
" \" id \" : \" "
) ;
// get the token from coll name
char * token = cr - > m_coll ;
// and the length, up to the hyphen that separates it from crawl id
2014-11-10 14:45:11 -08:00
int32_t tokenLen = 0 ;
2013-09-13 16:22:07 -07:00
for ( ; token [ tokenLen ] & & token [ tokenLen ] ! = ' - ' ; tokenLen + + ) ;
// now crawl id
char * crawlId = token + tokenLen ;
// skip hyphen
if ( crawlId [ 0 ] = = ' - ' ) crawlId + + ;
// print crawl id out
sb - > safeStrcpy ( crawlId ) ;
// end its quote
sb - > safeStrcpy ( " \" , " ) ;
// now the time the crawl finished.
if ( cr - > m_spideringEnabled )
sb - > safePrintf ( " \" finish \" :null, " ) ;
else
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" finish \" :% " INT64 " , " , cr - > m_diffbotCrawlEndTime ) ;
2013-09-13 16:22:07 -07:00
// how many urls we handoff to diffbot api. that implies successful
// download and that it matches the url crawl pattern and
// url process pattern and content regular expression pattern.
//
// NOTE: pageProcessAttempts can be higher than m_pageDownloadAttempts
// when we call getMetaList() on an *old* (in titledb) xmldoc,
// where we just get the cached content from titledb to avoid a
// download, but we still call getDiffbotReply(). perhaps reconstruct
// the diffbot reply from XmlDoc::m_diffbotJSONCount
//
// "processed" here corresponds to the "maxProcessed" cgi parm
// specified when instantiating the crawl parms for the first time.
//
// likewise "crawled" corresponds to "maxCrawled"
//
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" processedAttempts \" :% " INT64 " , " ,
2013-09-13 16:22:07 -07:00
cr - > m_globalCrawlInfo . m_pageProcessAttempts ) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" processed \" :% " INT64 " , " ,
2013-09-13 16:22:07 -07:00
cr - > m_globalCrawlInfo . m_pageProcessSuccesses ) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" crawlAttempts \" :% " INT64 " , " ,
2013-09-13 16:22:07 -07:00
cr - > m_globalCrawlInfo . m_pageDownloadAttempts ) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" crawled \" :% " INT64 " , " ,
2013-09-13 16:22:07 -07:00
cr - > m_globalCrawlInfo . m_pageDownloadSuccesses ) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" urlsConsidered \" :% " INT64 " , " ,
2013-09-13 16:22:07 -07:00
cr - > m_globalCrawlInfo . m_urlsConsidered ) ;
// how many spiders outstanding for this coll right now?
SpiderColl * sc = g_spiderCache . getSpiderColl ( cr - > m_collnum ) ;
2014-11-10 14:45:11 -08:00
int32_t spidersOut = sc - > getTotalOutstandingSpiders ( ) ;
2013-09-13 16:22:07 -07:00
// . status of the crawl: "Stopped" or "Active"?
// . TODO: check with dan to see if Active is correct and
// ShuttingDown is allowable
if ( cr - > m_spideringEnabled )
sb - > safePrintf ( " \" status \" : \" Active \" , " ) ;
else if ( spidersOut )
sb - > safePrintf ( " \" status \" : \" ShuttingDown \" , " ) ;
else
sb - > safePrintf ( " \" status \" : \" Stopped \" , " ) ;
// spider crawl start time
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" start \" :% " INT64 " , " , cr - > m_diffbotCrawlStartTime ) ;
2013-09-13 16:22:07 -07:00
// the token
sb - > safePrintf ( " \" token \" : \" " ) ;
sb - > safeMemcpy ( token , tokenLen ) ;
sb - > safePrintf ( " \" , " ) ;
//
// BEGIN parameter map
//
// the token again
sb - > safePrintf ( " { " ) ;
sb - > safePrintf ( " \" token \" : \" " ) ;
sb - > safeMemcpy ( token , tokenLen ) ;
sb - > safePrintf ( " \" , " ) ;
// the seed url
sb - > safePrintf ( " \" seed \" : \" %s \" , " , cr - > m_diffbotSeed . getBufStart ( ) ) ;
// the api
sb - > safePrintf ( " \" api \" : \" %s \" , " , cr - > m_diffbotApi . getBufStart ( ) ) ;
sb - > safePrintf ( " }, " ) ;
//
// END parameter map
//
// crawl count. counts non-errors. successful downloads.
2014-11-10 14:45:11 -08:00
//sb->safePrintf("\"crawled\":%"INT64"",
2013-09-13 16:22:07 -07:00
// cr->m_globalCrawlInfo.m_pageCrawlAttempts);
sb - > safePrintf ( " } " ) ;
// assume we are the last json object in the array
sb - > pushChar ( ' ] ' ) ;
}
2013-09-17 10:25:54 -07:00
*/

////////////////
//
// **** THE CRAWLBOT CONTROL PANEL *****
//
// . Based on the http://diffbot.com/dev/crawl/ page.
// . go to /dev/crawl to see this!
//
////////////////

/*
2013-09-13 16:22:07 -07:00
// generate a random collection name
2014-11-10 14:45:11 -08:00
char * getNewCollName ( ) { // char *token , int32_t tokenLen ) {
2013-09-13 16:22:07 -07:00
// let's create a new crawl id. dan was making it 32 characters
// with 4 hyphens in it for a total of 36 bytes, but since
// MAX_COLL_LEN, the maximum length of a collection name, is just
// 64 bytes, and the token is already 32, let's limit to 16 bytes
// for the crawlerid. so if we print that out in hex, 16 hex chars
// 0xffffffff 0xffffffff is 64 bits. so let's make a random 64-bit
// value here.
2014-11-10 14:45:11 -08:00
uint32_t r1 = rand ( ) ;
uint32_t r2 = rand ( ) ;
2014-10-30 13:30:39 -06:00
uint64_t crawlId64 = ( uint64_t ) r1 ;
2013-09-13 16:22:07 -07:00
crawlId64 < < = 32 ;
crawlId64 | = r2 ;
static char s_collBuf [ MAX_COLL_LEN + 1 ] ;
2014-11-10 14:45:11 -08:00
//int32_t tokenLen = gbstrlen(token);
2013-09-13 16:22:07 -07:00
// include a +5 for "-test"
// include 16 for crawlid (16 char hex #)
2013-09-17 15:32:28 -07:00
//if ( tokenLen + 16 + 5>= MAX_COLL_LEN ) { char *xx=NULL;*xx=0;}
2014-11-17 18:24:38 -08:00
// ensure the crawlid is the full 16 characters long so we
2013-09-13 16:22:07 -07:00
// can quickly extricate the crawlid from the collection name
2015-01-13 12:25:42 -07:00
//gbmemcpy ( s_collBuf, token, tokenLen );
2014-11-10 14:45:11 -08:00
//sprintf(s_collBuf + tokenLen ,"-%016"XINT64"",crawlId64);
sprintf ( s_collBuf , " %016 " XINT64 " " , crawlId64 ) ;
2013-09-13 16:22:07 -07:00
return s_collBuf ;
}
2013-09-27 10:49:24 -06:00
*/

//////////////////////////////////////////
//
// MAIN API STUFF I GUESS
//
//////////////////////////////////////////

bool sendReply2 ( TcpSocket * socket , int32_t fmt , char * msg ) {
2013-09-27 10:49:24 -06:00
// log it
log ( " crawlbot: %s " , msg ) ;
2013-10-15 12:40:56 -07:00
char * ct = " text/html " ;
2013-09-27 10:49:24 -06:00
// send this back to browser
SafeBuf sb ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON ) {
2013-10-22 12:25:37 -07:00
sb . safePrintf ( " { \n \" response \" : \" success \" , \n "
" \" message \" : \" %s \" \n } \n "
2013-09-27 10:49:24 -06:00
, msg ) ;
2013-10-15 12:40:56 -07:00
ct = " application/json " ;
}
2013-09-27 10:49:24 -06:00
else
sb . safePrintf ( " <html><body> "
" success: %s "
" </body></html> "
, msg ) ;
//return g_httpServer.sendErrorReply(socket,500,sb.getBufStart());
return g_httpServer . sendDynamicPage ( socket ,
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-10-15 12:40:56 -07:00
0 , // cachetime
false , // POST reply?
ct ) ;
2013-09-27 10:49:24 -06:00
}
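// Illustrative reply bodies (the message text is a placeholder): with
// format=json the browser gets {"response":"success","message":"..."},
// otherwise it gets a minimal <html><body>success: ...</body></html> page.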
2014-11-10 14:45:11 -08:00
bool sendErrorReply2 ( TcpSocket * socket , int32_t fmt , char * msg ) {
2013-09-13 16:22:07 -07:00
2013-09-25 15:37:20 -06:00
// log it
2013-12-18 17:20:53 -08:00
log ( " crawlbot: sending back 500 http status '%s' " , msg ) ;
2013-09-13 16:22:07 -07:00
2013-10-15 12:40:56 -07:00
char * ct = " text/html " ;
2013-09-25 15:37:20 -06:00
// send this back to browser
2013-09-13 16:22:07 -07:00
SafeBuf sb ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON ) {
2013-10-22 12:25:37 -07:00
sb . safePrintf ( " { \" error \" : \" %s \" } \n "
2013-09-25 15:37:20 -06:00
, msg ) ;
2013-10-15 12:40:56 -07:00
ct = " application/json " ;
}
2013-09-25 15:37:20 -06:00
else
sb . safePrintf ( " <html><body> "
" failed: %s "
" </body></html> "
, msg ) ;
2013-10-18 15:21:00 -07:00
// log it
2013-12-18 17:20:53 -08:00
//log("crawlbot: %s",msg );
2013-10-18 15:21:00 -07:00
2013-09-25 17:12:01 -06:00
//return g_httpServer.sendErrorReply(socket,500,sb.getBufStart());
return g_httpServer . sendDynamicPage ( socket ,
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-10-15 12:40:56 -07:00
0 , // cachetime
false , // POST reply?
2013-12-18 17:20:53 -08:00
ct ,
500 ) ; // error! not 200...
2013-09-25 15:37:20 -06:00
}
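// Illustrative error reply (the message text is a placeholder): with
// format=json the browser gets HTTP status 500 with body {"error":"..."},
// otherwise a minimal html failure page.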
2013-10-01 15:14:39 -06:00
bool printCrawlBotPage2 ( class TcpSocket * s ,
class HttpRequest * hr ,
char fmt ,
class SafeBuf * injectionResponse ,
class SafeBuf * urlUploadResponse ,
collnum_t collnum ) ;
2013-09-25 15:37:20 -06:00
void addedUrlsToSpiderdbWrapper ( void * state ) {
StateCD * st = ( StateCD * ) state ;
SafeBuf rr ;
2013-10-29 15:26:32 -07:00
rr . safePrintf ( " Successfully added urls for spidering. " ) ;
2013-09-25 15:37:20 -06:00
printCrawlBotPage2 ( st - > m_socket ,
& st - > m_hr ,
st - > m_fmt ,
NULL ,
2013-09-27 10:49:24 -06:00
& rr ,
st - > m_collnum ) ;
2013-09-25 15:37:20 -06:00
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2014-11-10 14:45:11 -08:00
//log("mdel2: st=%"XINT32"",(int32_t)st);
2013-09-25 15:37:20 -06:00
}
2013-10-18 15:21:00 -07:00
/*
2013-09-25 15:37:20 -06:00
void injectedUrlWrapper ( void * state ) {
StateCD * st = ( StateCD * ) state ;
Msg7 * msg7 = & st - > m_msg7 ;
// the doc we injected...
XmlDoc * xd = & msg7 - > m_xd ;
// make a status msg for the url
SafeBuf sb ;
2013-10-16 14:03:14 -07:00
SafeBuf js ; // for json reply
2013-09-25 15:37:20 -06:00
if ( xd - > m_indexCode = = 0 ) {
sb . safePrintf ( " <b><font color=black> "
" Successfully added " ) ;
2013-10-16 14:03:14 -07:00
js . safePrintf ( " Seed Successful. " ) ;
2013-09-25 15:37:20 -06:00
}
else if ( xd - > m_indexCode = = EDOCFILTERED ) {
sb . safePrintf ( " <b><font color=red> "
" Error: <i>%s</i> by matching "
2014-11-10 14:45:11 -08:00
" url filter #% " INT32 " "
2013-09-25 15:37:20 -06:00
" when adding "
, mstrerror ( xd - > m_indexCode )
2013-10-16 14:03:14 -07:00
// divide by 2 because we add a
// "manualadd &&" rule with every url filter
// that the client adds
, ( xd - > m_urlFilterNum - 2 ) / 2
2013-09-25 15:37:20 -06:00
) ;
2014-11-10 14:45:11 -08:00
js . safePrintf ( " Seed URL filtered by URL filter #% " INT32 " "
2013-10-16 14:03:14 -07:00
, ( xd - > m_urlFilterNum - 2 ) / 2 ) ;
2013-09-25 15:37:20 -06:00
}
else {
sb . safePrintf ( " <b><font color=red> "
" Error: <i>%s</i> when adding "
, mstrerror ( xd - > m_indexCode ) ) ;
2013-10-16 14:03:14 -07:00
js . safePrintf ( " Error adding seed url: %s "
, mstrerror ( xd - > m_indexCode ) ) ;
2013-09-25 15:37:20 -06:00
}
sb . safeTruncateEllipsis ( xd - > m_firstUrl . getUrl ( ) , 60 ) ;
if ( xd - > m_indexCode = = 0 ) {
2013-10-16 14:03:14 -07:00
if ( xd - > m_numOutlinksAddedValid ) {
2014-11-10 14:45:11 -08:00
sb . safePrintf ( " (added % " INT32 " outlinks) "
, ( int32_t ) xd - > m_numOutlinksAdded ) ;
js . safePrintf ( " Added % " INT32 " outlinks from same domain. "
" % " INT32 " outlinks were filtered. "
, ( int32_t ) xd - > m_numOutlinksAddedFromSameDomain
, ( int32_t ) xd - > m_numOutlinksFiltered
2013-10-16 14:03:14 -07:00
) ;
}
else {
2013-09-25 15:37:20 -06:00
sb . safePrintf ( " (added 0 outlinks) " ) ;
2013-10-16 14:03:14 -07:00
js . safePrintf ( " Added 0 outlinks from same domain. "
" 0 links were filtered. " ) ;
}
2013-09-25 15:37:20 -06:00
}
sb . safePrintf ( " </font></b> " ) ;
sb . nullTerm ( ) ;
2013-10-16 14:03:14 -07:00
js . nullTerm ( ) ;
// send back the html or json response?
SafeBuf * response = & sb ;
2014-04-05 18:09:04 -07:00
if ( st - > m_fmt = = FORMAT_JSON ) response = & js ;
2013-10-16 14:03:14 -07:00
2013-09-25 15:37:20 -06:00
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
printCrawlBotPage2 ( st - > m_socket ,
& st - > m_hr ,
st - > m_fmt ,
2013-10-16 14:03:14 -07:00
response ,
2013-09-27 10:49:24 -06:00
NULL ,
st - > m_collnum ) ;
2013-09-25 15:37:20 -06:00
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-09-25 15:37:20 -06:00
}
2013-10-18 15:21:00 -07:00
*/
2013-09-25 15:37:20 -06:00
class HelpItem {
public :
char * m_parm ;
char * m_desc ;
} ;
2013-10-22 21:27:21 -07:00
2013-09-25 15:37:20 -06:00
static class HelpItem s_his [ ] = {
2013-10-15 12:22:59 -06:00
{ " format " , " Use &format=html to show HTML output. Default is JSON. " } ,
2013-10-14 18:19:59 -06:00
{ " token " , " Required for all operations below. " } ,
{ " name " , " Name of the crawl. If missing will just show "
" all crawls owned by the given token. " } ,
2013-10-22 18:55:19 -07:00
{ " delete=1 " , " Deletes the crawl. " } ,
2013-11-14 13:16:08 -08:00
{ " reset=1 " , " Resets the crawl. Removes all seeds. " } ,
{ " restart=1 " , " Restarts the crawl. Keeps the seeds. " } ,
2013-10-14 18:19:59 -06:00
2013-10-22 18:55:19 -07:00
{ " pause " ,
2013-10-15 12:31:02 -06:00
" Specify 1 or 0 to pause or resume the crawl respectively. " } ,
2013-10-22 18:55:19 -07:00
{ " repeat " , " Specify number of days as floating point to "
2013-10-15 11:17:44 -07:00
" recrawl the pages. Set to 0.0 to NOT repeat the crawl. " } ,
2013-10-15 12:31:02 -06:00
2013-10-28 21:20:44 -07:00
{ " crawlDelay " , " Wait this many seconds between crawling urls from the "
" same IP address. Can be a floating point number. " } ,
2013-10-24 19:05:57 -07:00
2013-11-11 15:52:04 -08:00
//{"deleteCrawl","Same as delete."},
//{"resetCrawl","Same as delete."},
//{"pauseCrawl","Same as pause."},
//{"repeatCrawl","Same as repeat."},
2013-10-15 12:31:02 -06:00
2013-10-18 11:53:14 -07:00
{ " seeds " , " Whitespace separated list of URLs used to seed the crawl. "
" Will only follow outlinks on the same domain of seed URLs. "
2013-10-16 16:27:24 -07:00
} ,
2013-10-18 11:53:14 -07:00
{ " spots " ,
" Whitespace separated list of URLs to add to the crawl. "
" Outlinks will not be followed. " } ,
2013-11-11 15:52:04 -08:00
{ " urls " ,
" Same as spots. " } ,
2013-10-18 11:53:14 -07:00
//{"spiderLinks","Use 1 or 0 to spider the links or NOT spider "
// "the links, respectively, from "
// "the provided seed or addUrls parameters. "
// "The default is 1."},
2013-10-14 18:19:59 -06:00
2013-10-15 12:31:02 -06:00
2013-10-14 18:19:59 -06:00
{ " maxToCrawl " , " Specify max pages to successfully download. " } ,
2013-11-11 15:52:04 -08:00
//{"maxToDownload", "Specify max pages to successfully download."},
2013-10-14 18:19:59 -06:00
{ " maxToProcess " , " Specify max pages to successfully process through "
" diffbot. " } ,
2013-11-11 15:52:04 -08:00
{ " maxRounds " , " Specify maximum number of crawl rounds. Use "
2013-10-23 11:40:30 -07:00
" -1 to indicate no max. " } ,
2013-11-04 13:57:44 -08:00
{ " onlyProcessIfNew " , " Specify 1 to avoid re-processing pages "
" that have already been processed once before. " } ,
2013-10-14 18:19:59 -06:00
{ " notifyEmail " , " Send email alert to this email when crawl hits "
2013-11-11 15:52:04 -08:00
" the maxtocrawl or maxtoprocess limit, or when the crawl "
" completes. " } ,
2013-10-30 13:39:10 -07:00
{ " notifyWebhook " , " Fetch this URL when crawl hits "
2013-11-11 15:52:04 -08:00
" the maxtocrawl or maxtoprocess limit, or when the crawl "
" completes. " } ,
2013-10-14 18:19:59 -06:00
{ " obeyRobots " , " Obey robots.txt files? " } ,
2014-02-27 19:53:17 -08:00
//{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
2013-11-20 16:41:28 -08:00
{ " urlCrawlPattern " , " List of || separated strings. If the url "
" contains any of these then we crawl the url, otherwise, we do not. "
" An empty pattern matches all urls. " } ,
{ " urlProcessPattern " , " List of || separated strings. If the url "
" contains any of these then we send url to diffbot for processing. "
" An empty pattern matches all urls. " } ,
2013-10-14 18:19:59 -06:00
{ " pageProcessPattern " , " List of || separated strings. If the page "
2013-10-08 17:08:58 -07:00
" contains any of these then we send it to diffbot for processing. "
2013-11-20 16:41:28 -08:00
" An empty pattern matches all pages. " } ,
2013-10-14 18:19:59 -06:00
2013-12-03 16:23:05 -08:00
{ " urlCrawlRegEx " , " Regular expression that the url must match "
" in order to be crawled. If present then the urlCrawlPattern will "
" be ignored. "
" An empty regular expression matches all urls. " } ,
{ " urlProcessRegEx " , " Regular expression that the url must match "
" in order to be processed. "
" If present then the urlProcessPattern will "
" be ignored. "
" An empty regular expression matches all urls. " } ,
2013-12-16 14:10:39 -08:00
{ " apiUrl " , " Diffbot api url to use. We automatically append "
" token and url to it. " } ,
2013-12-03 16:23:05 -08:00
2013-12-16 14:10:39 -08:00
//{"expression","A pattern to match in a URL. List up to 100 "
// "expression/action pairs in the HTTP request. "
// "Example expressions:"},
//{"action","Take the appropriate action when preceeding pattern is "
// "matched. Specify multiple expression/action pairs to build a "
// "table of filters. Each URL being spidered will take the given "
// "action of the first expression it matches. Example actions:"},
2013-10-14 18:19:59 -06:00
2013-09-25 17:51:43 -06:00
2013-09-25 15:37:20 -06:00
{ NULL , NULL }
} ;
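// Illustrative request combining some of the parameters above (the token,
// crawl name and seed URL are hypothetical):
//
//   GET /crawlbot?token=<token>&name=mycrawl&format=json
//       &seeds=http://www.example.com/&maxToCrawl=1000
//
// Pass &help=1 to have sendPageCrawlbot() below print this table.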
2013-10-14 18:19:59 -06:00
/*
2013-10-14 17:19:30 -06:00
// get the input string from the httprequest or the json post
char * getInputString ( char * string , HttpRequest * hr , Json * JS ) {
// try to get it from http request
char * val = hr - > getString ( string ) ;
// if token in json post, use that
if ( ! val ) {
JsonItem * ji = JS . getItem ( string ) ;
if ( ji ) val = ji - > getValue ( ) ;
}
return val ;
}
2013-10-14 18:19:59 -06:00
*/
2013-10-30 13:12:46 -07:00
void collOpDoneWrapper ( void * state ) {
StateCD * st = ( StateCD * ) state ;
TcpSocket * socket = st - > m_socket ;
2013-11-10 16:28:00 -08:00
log ( " crawlbot: done with blocked op. " ) ;
2013-10-30 13:12:46 -07:00
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2014-11-10 14:45:11 -08:00
//log("mdel3: st=%"XINT32"",(int32_t)st);
2013-10-30 13:12:46 -07:00
g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
}
2013-09-26 22:41:05 -06:00
// . when we receive the request from john we call broadcastRequest() from
// Pages.cpp. then msg28 sends this request with a &cast=0 appended to it
// to every host in the network. then when msg28 gets back replies from all
// those hosts it calls sendPageCrawlbot() here but without a &cast=0
// . so if no &cast is present we are the original!!!
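// Illustrative flow (hypothetical values): the browser sends
//   GET /crawlbot?token=tok&name=job&pause=1
// and every host in the network then receives
//   GET /crawlbot?token=tok&name=job&pause=1&cast=0
// so only the original, cast-less request builds the reply for the browser.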
bool sendPageCrawlbot ( TcpSocket * socket , HttpRequest * hr ) {
2013-09-25 15:37:20 -06:00
2013-10-15 12:22:59 -06:00
// print help
2014-11-10 14:45:11 -08:00
int32_t help = hr - > getLong ( " help " , 0 ) ;
2013-10-15 12:22:59 -06:00
if ( help ) {
SafeBuf sb ;
sb . safePrintf ( " <html> "
" <title>Crawlbot API</title> "
" <h1>Crawlbot API</h1> "
" <b>Use the parameters below on the "
" <a href= \" /crawlbot \" >/crawlbot</a> page. "
" </b><br><br> "
" <table> "
) ;
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < 1000 ; i + + ) {
2013-10-15 12:22:59 -06:00
HelpItem * h = & s_his [ i ] ;
if ( ! h - > m_parm ) break ;
sb . safePrintf ( " <tr> "
" <td>%s</td> "
" <td>%s</td> "
" </tr> "
, h - > m_parm
, h - > m_desc
) ;
}
sb . safePrintf ( " </table> "
" </html> " ) ;
return g_httpServer . sendDynamicPage ( socket ,
sb . getBufStart ( ) ,
sb . length ( ) ,
0 ) ; // cachetime
}
2013-10-14 17:19:30 -06:00
// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
// requests to every host in the network unless a cast=0 is
// explicitly given
// . Msg28::massConfig() puts a &cast=0 on the secondary requests
// sent to each host in the network
2014-11-10 14:45:11 -08:00
//int32_t cast = hr->getLong("cast",1);
2013-09-26 22:41:05 -06:00
2013-10-14 17:19:30 -06:00
// httpserver/httprequest should not try to decode post if
// it's application/json.
2013-10-14 18:19:59 -06:00
//char *json = hr->getPOST();
//Json JS;
//if ( json ) JS.parseJsonStringIntoJsonItems ( json );
2013-09-25 15:37:20 -06:00
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
2014-04-05 18:09:04 -07:00
char fmt = FORMAT_JSON ;
2014-01-22 23:40:38 -08:00
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
char * token = hr - > getString ( " token " ) ;
char * name = hr - > getString ( " name " ) ;
// . try getting token-name from ?c=
// . the name of the collection is encoded as <token>-<crawlname>
char * c = hr - > getString ( " c " ) ;
char tmp [ MAX_COLL_LEN + 100 ] ;
if ( ! token & & c ) {
strncpy ( tmp , c , MAX_COLL_LEN ) ;
token = tmp ;
name = strstr ( tmp , " - " ) ;
if ( name ) {
* name = ' \0 ' ;
name + + ;
}
// change default formatting to html
2014-04-05 18:09:04 -07:00
fmt = FORMAT_HTML ;
2014-01-22 23:40:38 -08:00
}
2015-07-13 18:08:25 -07:00
if ( token ) {
for ( int32_t i = 0 ; i < gbstrlen ( token ) ; i + + ) {
token [ i ] = tolower ( token [ i ] ) ;
}
}
2014-01-22 23:40:38 -08:00
2013-09-25 15:37:20 -06:00
char * fs = hr - > getString ( " format " , NULL , NULL ) ;
// give john a json api
2014-04-05 18:09:04 -07:00
if ( fs & & strcmp ( fs , " html " ) = = 0 ) fmt = FORMAT_HTML ;
if ( fs & & strcmp ( fs , " json " ) = = 0 ) fmt = FORMAT_JSON ;
if ( fs & & strcmp ( fs , " xml " ) = = 0 ) fmt = FORMAT_XML ;
2013-10-14 17:19:30 -06:00
// if we got json as input, give it as output
2014-04-05 18:09:04 -07:00
//if ( JS.getFirstItem() ) fmt = FORMAT_JSON;
2013-10-14 17:19:30 -06:00
2014-01-22 23:40:38 -08:00
2013-10-14 17:19:30 -06:00
2014-04-05 18:09:04 -07:00
if ( ! token & & fmt = = FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
2013-10-14 17:19:30 -06:00
char * msg = " invalid token " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
if ( ! token ) {
// print token form if html
SafeBuf sb ;
sb . safePrintf ( " In order to use crawlbot you must "
" first LOGIN: "
" <form action=/crawlbot method=get> "
" <br> "
" <input type=text name=token size=50> "
" <input type=submit name=submit value=OK> "
" </form> "
" <br> "
" <b>- OR -</b> "
" <br> SIGN UP "
" <form action=/crawlbot method=get> "
" Name: <input type=text name=name size=50> "
" <br> "
" Email: <input type=text name=email size=50> "
" <br> "
" <input type=submit name=submit value=OK> "
" </form> "
" </body> "
" </html> " ) ;
return g_httpServer . sendDynamicPage ( socket ,
sb . getBufStart ( ) ,
sb . length ( ) ,
0 ) ; // cachetime
}
2013-10-15 12:22:59 -06:00
if ( gbstrlen ( token ) > 32 ) {
2013-10-18 15:21:00 -07:00
//log("crawlbot: token is over 32 chars");
2013-10-15 12:22:59 -06:00
char * msg = " crawlbot: token is over 32 chars " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-18 11:53:14 -07:00
char * seeds = hr - > getString ( " seeds " ) ;
char * spots = hr - > getString ( " spots " ) ;
2013-10-14 17:19:30 -06:00
2013-10-15 11:54:54 -06:00
// the mere presence of these fields triggers the operation
2013-12-16 14:35:27 -08:00
//bool delColl = hr->hasField("deleteCrawl");
//bool resetColl = hr->hasField("resetCrawl");
2013-10-15 11:54:54 -06:00
2013-10-22 18:51:09 -07:00
// /v2/bulk api support:
if ( ! spots ) spots = hr - > getString ( " urls " ) ;
2013-12-17 11:17:33 -08:00
if ( spots & & ! spots [ 0 ] ) spots = NULL ;
if ( seeds & & ! seeds [ 0 ] ) seeds = NULL ;
2013-12-16 14:35:27 -08:00
//if ( ! delColl ) delColl = hr->hasField("delete");
//if ( ! resetColl ) resetColl = hr->hasField("reset");
2013-10-22 18:51:09 -07:00
2013-12-17 11:17:33 -08:00
bool restartColl = hr - > hasField ( " restart " ) ;
2013-10-22 18:51:09 -07:00
2013-10-15 12:40:56 -07:00
//if ( delColl && ! && cast == 0 ) {
// log("crawlbot: no collection found to delete.");
// char *msg = "Could not find crawl to delete.";
// return sendErrorReply2 (socket,fmt,msg);
//}
2013-10-16 14:13:28 -07:00
// just send back a list of all the collections after the delete
2014-04-05 18:09:04 -07:00
//if ( delColl && cast && fmt == FORMAT_JSON ) {
2013-10-16 14:13:28 -07:00
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
2013-10-15 12:40:56 -07:00
// default name to next available collection crawl name in the
// case of a delete operation...
2013-12-16 14:35:27 -08:00
char * msg = NULL ;
if ( hr - > hasField ( " delete " ) ) msg = " deleted " ;
2013-12-17 11:17:33 -08:00
// need to re-add urls for a restart
//if ( hr->hasField("restart") ) msg = "restarted";
2013-12-16 14:35:27 -08:00
if ( hr - > hasField ( " reset " ) ) msg = " reset " ;
if ( msg ) { // delColl && cast ) {
2013-10-15 12:40:56 -07:00
// this was deleted... so is invalid now
name = NULL ;
2014-11-17 18:13:36 -08:00
// no longer a delete function, we need to set "name" below
2013-12-16 14:35:27 -08:00
//delColl = false;//NULL;
2013-11-07 09:55:47 -08:00
// john wants just a brief success reply
2013-12-16 14:35:27 -08:00
SafeBuf tmp ;
tmp . safePrintf ( " { \" response \" : \" Successfully %s job. \" } " ,
msg ) ;
char * reply = tmp . getBufStart ( ) ;
2014-10-31 13:36:07 -07:00
if ( ! reply ) {
if ( ! g_errno ) g_errno = ENOMEM ;
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
}
2013-11-07 09:55:47 -08:00
return g_httpServer . sendDynamicPage ( socket ,
reply ,
gbstrlen ( reply ) ,
0 , // cacheTime
false , // POSTReply?
" application/json "
) ;
2013-10-15 12:40:56 -07:00
}
2013-10-15 11:54:54 -06:00
// if name is missing default to name of first existing
// collection for this token.
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) { // cast
2013-10-15 11:54:54 -06:00
if ( name ) break ;
// do not do this if doing an
// injection (seed) or add url or del coll or reset coll !!
2013-10-18 11:53:14 -07:00
if ( seeds ) break ;
if ( spots ) break ;
2013-12-16 14:35:27 -08:00
//if ( delColl ) break;
//if ( resetColl ) break;
2013-12-17 11:17:33 -08:00
if ( restartColl ) break ;
2013-10-15 11:54:54 -06:00
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
// deleted collections leave a NULL slot
if ( ! cx ) continue ;
// skip if token does not match
if ( strcmp ( cx - > m_diffbotToken . getBufStart ( ) , token ) )
continue ;
// got it
name = cx - > m_diffbotCrawlName . getBufStart ( ) ;
break ;
}
if ( ! name ) {
2014-01-06 13:55:58 -08:00
// if the token is valid
char * ct = " application/json " ;
char * msg = " {} \n " ;
return g_httpServer . sendDynamicPage ( socket ,
msg ,
gbstrlen ( msg ) ,
- 1 , // cachetime
false ,
ct ,
200 ) ; // http status
2013-10-18 15:21:00 -07:00
//log("crawlbot: no crawl name given");
2014-01-06 13:55:58 -08:00
//char *msg = "invalid or missing name";
//return sendErrorReply2 (socket,fmt,msg);
2013-10-14 17:19:30 -06:00
}
2013-10-15 11:54:54 -06:00
if ( gbstrlen ( name ) > 30 ) {
2013-10-18 15:21:00 -07:00
//log("crawlbot: name is over 30 chars");
2013-10-15 11:54:54 -06:00
char * msg = " crawlbot: name is over 30 chars " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
// make the collection name so it includes the token and crawl name
char collName [ MAX_COLL_LEN + 1 ] ;
// sanity
if ( MAX_COLL_LEN < 64 ) { char * xx = NULL ; * xx = 0 ; }
// make a compound name for collection of token and name
sprintf ( collName , " %s-%s " , token , name ) ;
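// e.g. a (hypothetical) token "0123456789abcdef0123456789abcdef" and crawl
// name "news" give the collection name
// "0123456789abcdef0123456789abcdef-news"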
2013-10-14 17:19:30 -06:00
// if they did not specify the token/name of an existing collection
// then cr will be NULL and we'll add it below
CollectionRec * cr = g_collectiondb . getRec ( collName ) ;
2013-12-16 14:35:27 -08:00
// bail if the collection rec does not exist
if ( ! cr ) {
2014-07-01 16:57:25 -07:00
log ( " crawlbot: missing coll rec for coll %s " , collName ) ;
2014-11-05 09:36:42 -08:00
//char *msg = "invalid or missing collection rec";
char * msg = " Could not create job because missing seeds or "
" urls. " ;
2013-12-16 14:35:27 -08:00
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-14 17:19:30 -06:00
// if no token... they need to login or signup
//char *token = getTokenFromHttpRequest ( hr );
2013-09-13 16:22:07 -07:00
2013-09-27 10:49:24 -06:00
// get coll name if any
2013-10-14 17:19:30 -06:00
//char *c = hr->getString("c");
//if ( ! c ) c = hr->getString("id");
2013-09-13 16:22:07 -07:00
2013-09-27 10:49:24 -06:00
// get some other parms provided optionally
2013-10-14 17:19:30 -06:00
//char *addColl = hr->getString("addcoll");
2013-10-14 18:19:59 -06:00
2013-10-14 17:19:30 -06:00
// try json
//if ( JS.getInputString("addNewCrawl") ) addColl = collName;
2013-10-14 18:19:59 -06:00
//if ( JS.getInputString("deleteCrawl") ) delColl = true;
//if ( JS.getInputString("resetCrawl") ) resetColl = true;
2013-10-14 17:19:30 -06:00
2013-12-16 14:35:27 -08:00
//if ( resetColl && ! cr ) {
// //log("crawlbot: no collection found to reset.");
// char *msg = "Could not find crawl to reset.";
// return sendErrorReply2 (socket,fmt,msg);
//}
2013-09-26 22:41:05 -06:00
2013-12-16 14:35:27 -08:00
//if ( restartColl && ! cr ) {
// char *msg = "Could not find crawl to restart.";
// return sendErrorReply2 (socket,fmt,msg);
//}
2013-11-14 13:16:08 -08:00
2013-10-30 13:12:46 -07:00
// make a new state
StateCD * st ;
try { st = new ( StateCD ) ; }
catch ( . . . ) {
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
}
mnew ( st , sizeof ( StateCD ) , " statecd " ) ;
2013-11-10 16:28:00 -08:00
// debug
2014-11-10 14:45:11 -08:00
//log("mnew2: st=%"XINT32"",(int32_t)st);
2013-11-10 16:28:00 -08:00
2013-10-30 13:12:46 -07:00
// copy the request, socket and format into our state
st - > m_hr . copy ( hr ) ;
st - > m_socket = socket ;
st - > m_fmt = fmt ;
if ( cr ) st - > m_collnum = cr - > m_collnum ;
else st - > m_collnum = - 1 ;
2013-11-14 13:16:08 -08:00
// save seeds
2013-12-17 11:17:33 -08:00
if ( cr & & restartColl ) { // && cast ) {
// bail on OOM saving seeds
if ( ! st - > m_seedBank . safeMemcpy ( & cr - > m_diffbotSeeds ) | |
2014-03-11 17:02:24 -07:00
! st - > m_seedBank . pushChar ( ' \0 ' ) ) {
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
delete st ;
2013-12-17 11:17:33 -08:00
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
2014-03-11 17:02:24 -07:00
}
2013-12-17 11:17:33 -08:00
}
2013-12-03 16:23:05 -08:00
2016-05-11 14:03:45 -06:00
// If we can't compile the provided regexes, return an error.
// We now do this before broadcasting the parameter-change request
// to the network.
2013-12-03 16:23:05 -08:00
if ( cr ) {
char * rx1 = hr - > getString ( " urlCrawlRegEx " , NULL ) ;
if ( rx1 & & ! rx1 [ 0 ] ) rx1 = NULL ;
char * rx2 = hr - > getString ( " urlProcessRegEx " , NULL ) ;
if ( rx2 & & ! rx2 [ 0 ] ) rx2 = NULL ;
// this will store the compiled regular expression into ucr
regex_t re1 ;
regex_t re2 ;
2014-11-10 14:45:11 -08:00
int32_t status1 = 0 ;
int32_t status2 = 0 ;
2013-12-03 16:23:05 -08:00
if ( rx1 )
status1 = regcomp ( & re1 , rx1 ,
REG_EXTENDED | REG_ICASE |
REG_NEWLINE | REG_NOSUB ) ;
if ( rx2 )
status2 = regcomp ( & re2 , rx2 ,
REG_EXTENDED | REG_ICASE |
REG_NEWLINE | REG_NOSUB ) ;
if ( rx1 ) regfree ( & re1 ) ;
if ( rx2 ) regfree ( & re2 ) ;
SafeBuf em ;
if ( status1 ) {
log ( " xmldoc: regcomp %s failed. " , rx1 ) ;
2016-03-28 17:26:40 -06:00
em . safePrintf ( " Invalid regular expression: %s " , rx1 ) ;
2013-12-03 16:23:05 -08:00
}
else if ( status2 ) {
log ( " xmldoc: regcomp %s failed. " , rx2 ) ;
2016-03-28 17:26:40 -06:00
em . safePrintf ( " Invalid regular expression: %s " , rx2 ) ;
2013-12-03 16:23:05 -08:00
}
if ( status1 | | status2 ) {
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-12-03 16:23:05 -08:00
char * msg = em . getBufStart ( ) ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
}
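// Example (hypothetical) values that would pass the check above:
//   urlCrawlRegEx   = ^https?://(www\.)?example\.com/
//   urlProcessRegEx = \.html$
// Anything regcomp() rejects, such as an unbalanced "(", is reported back
// to the caller here instead of being broadcast to the other hosts.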
2013-11-14 13:16:08 -08:00
2013-09-27 10:49:24 -06:00
// . if this is a cast=0 request it is received by all hosts in the
// network
2013-09-26 22:41:05 -06:00
// . this code is the only code run by EVERY host in the network
// . the other code is just run once by the receiving host
// . so we gotta create a coll rec on each host etc.
// . no need to update collectionrec parms here since Pages.cpp calls
// g_parms.setFromRequest() for us before calling this function,
2013-09-27 10:49:24 -06:00
// pg->m_function(). even though maxtocrawl is on "PAGE_NONE"
// hopefully it will still be set
2013-09-26 22:41:05 -06:00
// . but we should take care of add/del/reset coll here.
2013-12-10 13:09:55 -08:00
// . i guess this will be handled by the new parm syncing logic
// which deals with add/del coll requests
/*
2013-09-26 22:41:05 -06:00
if ( cast = = 0 ) {
2013-10-30 13:12:46 -07:00
// add a new collection by default
if ( ! cr & & name & & name [ 0 ] )
2013-11-11 15:52:04 -08:00
cr = addNewDiffbotColl ( collName , token , name , hr ) ;
2013-10-30 13:12:46 -07:00
// also support the good 'ole html form interface
if ( cr ) setSpiderParmsFromHtmlRequest ( socket , hr , cr ) ;
2013-09-26 22:41:05 -06:00
// . we can't sync these operations on a dead host when it
// comes back up yet. we can only sync parms, not collection
// adds/deletes/resets
2013-09-27 10:49:24 -06:00
// . TODO: make new collections just a list of rdb records,
// then they can leverage the msg4 and addsinprogress.dat
// functionality we have for getting dead hosts back up to
// sync. Call it Colldb.
2013-10-02 12:50:11 -06:00
// . PROBLEM: when just starting up seems like hasDeadHost()
// is returning true because it has not yet received its
// first ping reply
//if ( addColl || delColl || resetColl ) {
// // if any host in network is dead, do not do this
// if ( g_hostdb.hasDeadHost() ) {
// char *msg = "A host in the network is dead.";
// // log it
// log("crawlbot: %s",msg);
// // make sure this returns in json if required
// return sendErrorReply2(socket,fmt,msg);
// }
//}
2013-10-30 13:12:46 -07:00
2013-11-10 22:11:13 -08:00
// problem?
if ( ! cr ) {
// send back error
char * msg = " Collection add failed " ;
if ( delColl ) msg = " No such collection " ;
if ( resetColl ) msg = " No such collection " ;
2013-11-14 13:16:08 -08:00
if ( restartColl ) msg = " No such collection " ;
2013-11-10 22:11:13 -08:00
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-11-10 22:11:13 -08:00
// log it
2013-11-11 09:58:14 -08:00
log ( " crawlbot: cr is null. %s " , msg ) ;
2013-11-10 22:11:13 -08:00
// make sure this returns in json if required
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-30 13:12:46 -07:00
// set this up
WaitEntry * we = & st - > m_waitEntry ;
we - > m_state = st ;
we - > m_callback = collOpDoneWrapper ;
2013-11-10 22:11:13 -08:00
// this won't work, collname is on the stack!
//we->m_coll = collName;
we - > m_coll = cr - > m_coll ;
2013-10-30 13:12:46 -07:00
2013-09-26 22:41:05 -06:00
if ( delColl ) {
2013-11-11 09:58:14 -08:00
// note it
log ( " crawlbot: deleting coll " ) ;
2013-09-27 10:49:24 -06:00
// delete collection name
2013-10-30 13:12:46 -07:00
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
if ( ! g_collectiondb . deleteRec ( collName , we ) )
return false ;
2013-11-10 16:28:00 -08:00
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-09-27 10:49:24 -06:00
// all done
return g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
2013-09-26 22:41:05 -06:00
}
2013-09-27 10:49:24 -06:00
2013-11-14 13:16:08 -08:00
if ( resetColl | | restartColl ) {
2013-11-11 09:58:14 -08:00
// note it
2013-11-14 14:02:56 -08:00
log ( " crawlbot: resetting/restarting coll " ) ;
2013-10-14 17:19:30 -06:00
//cr = g_collectiondb.getRec ( resetColl );
2013-10-30 13:12:46 -07:00
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
2013-11-14 14:02:56 -08:00
bool purgeSeeds = true ;
if ( restartColl ) purgeSeeds = false ;
if ( ! g_collectiondb . resetColl ( collName ,
we ,
purgeSeeds ) )
2013-10-30 13:12:46 -07:00
return false ;
2013-10-15 12:40:56 -07:00
// it is a NEW ptr now!
cr = g_collectiondb . getRec ( collName ) ;
2013-09-26 22:41:05 -06:00
// if reset from crawlbot api page then enable spiders
// to avoid user confusion
if ( cr ) cr - > m_spideringEnabled = 1 ;
2013-11-10 16:28:00 -08:00
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-10-30 13:12:46 -07:00
// all done
return g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
2013-09-26 22:41:05 -06:00
}
2013-11-10 16:28:00 -08:00
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-10-14 17:19:30 -06:00
// this will set the collection parms from json
2013-10-14 18:19:59 -06:00
//setSpiderParmsFromJSONPost ( socket , hr , cr , &JS );
2013-09-26 22:41:05 -06:00
// this is a cast, so just return simple response
return g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
}
2013-12-10 13:09:55 -08:00
*/
2013-09-26 22:41:05 -06:00
2013-10-14 17:19:30 -06:00
/////////
//
// after all hosts have replied to the request, we finally send the
// request here, with no &cast=0 appended to it. so this is where we
// send the final reply back to the browser
//
/////////
2013-09-27 11:39:23 -06:00
2013-12-16 14:35:27 -08:00
/*
2013-10-24 17:59:15 -07:00
// in case collection was just added above... try this!!
cr = g_collectiondb . getRec ( collName ) ;
2013-09-27 11:39:23 -06:00
2013-10-14 17:19:30 -06:00
// collectionrec must be non-null at this point. i.e. we added it
2013-10-24 17:59:15 -07:00
if ( ! cr ) {
2013-10-30 10:00:46 -07:00
char * msg = " Crawl name was not found. " ;
if ( name & & name [ 0 ] )
msg = " Failed to add crawl. Crawl name is illegal. " ;
2013-11-11 09:58:14 -08:00
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-10-25 14:54:24 -07:00
//log("crawlbot: no collection found. need to add a crawl");
2013-10-30 10:00:46 -07:00
return sendErrorReply2 ( socket , fmt , msg ) ;
2013-10-24 17:59:15 -07:00
}
2013-09-27 11:39:23 -06:00
2013-10-18 11:53:14 -07:00
//char *spots = hr->getString("spots",NULL,NULL);
//char *seeds = hr->getString("seeds",NULL,NULL);
2013-12-17 11:17:33 -08:00
*/
2013-09-27 10:49:24 -06:00
2013-11-14 13:16:08 -08:00
// check seed bank now too for restarting a crawl
if ( st - > m_seedBank . length ( ) & & ! seeds )
seeds = st - > m_seedBank . getBufStart ( ) ;
2014-02-25 11:09:07 -08:00
char * coll = " NONE " ;
if ( cr ) coll = cr - > m_coll ;
2013-10-24 17:59:15 -07:00
if ( seeds )
2014-11-10 14:45:11 -08:00
log ( " crawlbot: adding seeds= \" %s \" coll=%s (% " INT32 " ) " ,
seeds , coll , ( int32_t ) st - > m_collnum ) ;
2014-02-25 11:09:07 -08:00
2014-03-10 15:22:43 -07:00
char bulkurlsfile [ 1024 ] ;
2014-06-25 17:46:28 -07:00
// when a collection is restarted the collnum changes to avoid
// adding any records destined for that collnum that might be on
// the wire. so just put these in the root dir
snprintf ( bulkurlsfile , 1024 , " %sbulkurls-%s.txt " ,
2014-11-10 14:45:11 -08:00
g_hostdb . m_dir , coll ) ; //, (int32_t)st->m_collnum );
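// e.g. with a (hypothetical) working dir of "/var/gigablast/" and coll
// "tok-news" this becomes "/var/gigablast/bulkurls-tok-news.txt"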
2014-03-11 18:52:14 -07:00
if ( spots & & cr & & cr - > m_isCustomCrawl = = 2 ) {
2014-11-10 14:45:11 -08:00
int32_t spotsLen = ( int32_t ) gbstrlen ( spots ) ;
log ( " crawlbot: got spots (len=% " INT32 " ) to add coll=%s (% " INT32 " ) " ,
spotsLen , coll , ( int32_t ) st - > m_collnum ) ;
2014-03-10 15:22:43 -07:00
FILE * f = fopen ( bulkurlsfile , " w " ) ;
2014-03-10 13:50:30 -07:00
if ( f ! = NULL ) {
2014-03-10 15:22:43 -07:00
// urls are space separated.
2014-05-14 14:40:46 -07:00
// as of 5/14/2014, it appears that spots is space-separated for some URLs (the first two)
// and newline-separated for the remainder. Make a copy that's space separated so that restarting bulk jobs works.
// Alternatives:
// 1) just write one character to disk at a time, replacing newlines with spaces
// 2) just output what you have, and then when you read in, replace newlines with spaces
// 3) probably the best option: change newlines to spaces earlier in the pipeline
char * spotsCopy = ( char * ) mmalloc ( spotsLen + 1 , " create a temporary copy of spots that we're about to delete " ) ;
for ( int i = 0 ; i < spotsLen ; i + + ) {
char c = spots [ i ] ;
if ( c = = ' \n ' )
c = ' ' ;
spotsCopy [ i ] = c ;
}
spotsCopy [ spotsLen ] = ' \0 ' ;
fprintf ( f , " %s " , spotsCopy ) ;
2014-03-10 13:50:30 -07:00
fclose ( f ) ;
2014-11-17 18:13:36 -08:00
mfree ( spotsCopy , spotsLen + 1 , " no longer need copy " ) ;
2014-03-10 13:50:30 -07:00
}
}
2013-10-24 17:59:15 -07:00
2014-06-04 09:36:26 -07:00
// if restart flag is on and the file with bulk urls exists,
// get spots from there
SafeBuf bb ;
if ( ! spots & & restartColl & & cr & & cr - > m_isCustomCrawl = = 2 ) {
bb . load ( bulkurlsfile ) ;
bb . nullTerm ( ) ;
spots = bb . getBufStart ( ) ;
2014-11-10 14:45:11 -08:00
log ( " crawlbot: restarting bulk job file=%s bufsize=% " INT32 " for %s " ,
2014-06-25 17:46:28 -07:00
bulkurlsfile , bb . length ( ) , cr - > m_coll ) ;
2014-06-04 09:36:26 -07:00
}
/*
2014-03-10 15:22:43 -07:00
FILE * f = fopen ( bulkurlsfile , " r " ) ;
if ( f ! = NULL ) {
fseek ( f , 0 , SEEK_END ) ;
2014-11-10 14:45:11 -08:00
int32_t size = ftell ( f ) ;
2014-03-10 15:22:43 -07:00
fseek ( f , 0 , SEEK_SET ) ;
char * bulkurls = ( char * ) mmalloc ( size , " reading in bulk urls " ) ;
2014-03-11 17:02:24 -07:00
if ( ! bulkurls ) {
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
delete st ;
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
}
2014-03-10 15:22:43 -07:00
fgets ( bulkurls , size , f ) ;
spots = bulkurls ;
2014-03-11 17:02:24 -07:00
fclose ( f ) ;
2014-03-10 15:22:43 -07:00
}
}
2014-06-04 09:36:26 -07:00
*/
2014-03-10 15:22:43 -07:00
2013-09-25 15:37:20 -06:00
///////
//
// handle file of urls upload. can be HUGE!
//
///////
2013-10-18 11:53:14 -07:00
if ( spots | | seeds ) {
2014-02-16 15:18:50 -08:00
// error
if ( g_repair . isRepairActive ( ) & &
g_repair . m_collnum = = st - > m_collnum ) {
log ( " crawlbot: repair active. can't add seeds "
" or spots while repairing collection. " ) ;
g_errno = EREPAIRING ;
// free the state before bailing, like the other error paths below
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
delete st ;
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
}
2013-10-14 18:19:59 -06:00
// . avoid spidering links for these urls? i would say
// . default is to NOT spider the links...
// . support camel case and all lower case
2014-11-10 14:45:11 -08:00
//int32_t spiderLinks = hr->getLong("spiderLinks",1);
2013-10-18 11:53:14 -07:00
//spiderLinks = hr->getLong("spiderlinks",spiderLinks);
//bool spiderLinks = false;
2013-09-25 15:37:20 -06:00
// make a list of spider requests from these urls
2013-12-18 15:57:10 -08:00
//SafeBuf listBuf;
2013-09-25 16:04:16 -06:00
// on failure this sets g_errno; we record it in "status" below
2013-10-18 11:53:14 -07:00
bool status = true ;
if ( ! getSpiderRequestMetaList ( seeds ,
2013-12-18 15:57:10 -08:00
& st - > m_listBuf ,
2013-10-21 17:35:14 -07:00
true , // spiderLinks?
cr ) )
2013-10-18 11:53:14 -07:00
status = false ;
// do not spider links for spots
if ( ! getSpiderRequestMetaList ( spots ,
2013-12-18 15:57:10 -08:00
& st - > m_listBuf ,
2013-10-21 17:35:14 -07:00
false , // spiderLinks?
NULL ) )
2013-10-18 11:53:14 -07:00
status = false ;
2013-09-25 16:04:16 -06:00
// empty?
2014-11-10 14:45:11 -08:00
int32_t size = st - > m_listBuf . length ( ) ;
2013-09-25 16:04:16 -06:00
// error?
2013-11-11 09:58:14 -08:00
if ( ! status ) {
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-09-25 17:12:01 -06:00
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
2013-11-11 09:58:14 -08:00
}
2013-09-25 16:04:16 -06:00
// if the list is empty
2013-11-11 09:58:14 -08:00
if ( ! size ) {
// nuke it
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2013-09-25 17:12:01 -06:00
return sendErrorReply2 ( socket , fmt , " no urls found " ) ;
2013-11-11 09:58:14 -08:00
}
2013-09-25 15:37:20 -06:00
// add to spiderdb
2013-12-18 15:57:10 -08:00
if ( ! st - > m_msg4 . addMetaList ( st - > m_listBuf . getBufStart ( ) ,
st - > m_listBuf . length ( ) ,
2013-09-25 15:37:20 -06:00
cr - > m_coll ,
st ,
addedUrlsToSpiderdbWrapper ,
0 // niceness
) )
// blocked!
return false ;
// did not block, print page!
addedUrlsToSpiderdbWrapper ( st ) ;
return true ;
}
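// Note: addMetaList() follows the usual blocking convention here: a false
// return means it blocked and will call addedUrlsToSpiderdbWrapper() later;
// a true return means it completed inline and we call the wrapper ourselves.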
/////////
//
// handle direct injection of a url. looks at "spiderlinks=1" parm
// and all the other parms in Msg7::inject() in PageInject.cpp.
//
//////////
2013-10-18 11:53:14 -07:00
/*
2013-09-25 15:37:20 -06:00
if ( injectUrl ) {
// a valid collection is required
if ( ! cr )
2013-09-25 17:12:01 -06:00
return sendErrorReply2 ( socket , fmt ,
" invalid collection " ) ;
2013-09-25 15:37:20 -06:00
// begin the injection
if ( ! st - > m_msg7 . inject ( st - > m_socket ,
& st - > m_hr ,
st ,
2013-10-15 12:22:59 -06:00
injectedUrlWrapper ,
2013-10-15 16:31:59 -07:00
1 , // spiderLinks default is on
collName ) ) // coll override
2013-09-25 15:37:20 -06:00
// if blocked, return now
return false ;
// otherwise send back reply
injectedUrlWrapper ( st ) ;
return true ;
}
2013-10-18 11:53:14 -07:00
*/
2013-09-25 15:37:20 -06:00
2013-09-27 10:49:24 -06:00
// we no longer need the state here; print the page and free it
2013-09-26 22:41:05 -06:00
////////////
2013-09-25 15:37:20 -06:00
//
// print the html or json page of all the data
//
2013-09-27 10:49:24 -06:00
printCrawlBotPage2 ( socket , hr , fmt , NULL , NULL , cr - > m_collnum ) ;
// get rid of that state
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2014-01-17 18:28:17 -08:00
delete st ;
2014-11-10 14:45:11 -08:00
//log("mdel4: st=%"XINT32"",(int32_t)st);
2013-09-27 10:49:24 -06:00
return true ;
2013-09-25 15:37:20 -06:00
}
2013-12-16 14:35:27 -08:00
/*
2014-11-10 14:45:11 -08:00
bool printUrlFilters ( SafeBuf & sb , CollectionRec * cr , int32_t fmt ) {
2013-10-16 12:12:22 -07:00
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON )
2013-10-16 12:19:25 -07:00
sb . safePrintf ( " \" urlFilters \" :[ " ) ;
2013-10-15 12:45:23 -06:00
2013-10-21 12:04:08 -07:00
// skip first filters that are:
// 0. ismedia->ignore and
// 1. !isonsamedomain->ignore
// 2. lastspidertime or !isindexed
// 3. errorcount rule
// 4. errorcount rule
2014-11-10 14:45:11 -08:00
int32_t istart = 5 ;
2013-10-17 18:59:00 -07:00
// if respidering then we added an extra filter
// lastspidertime>={roundstart} --> FILTERED
2013-10-21 18:05:45 -07:00
//if ( cr->m_collectiveRespiderFrequency > 0.0 )
// istart++;
2013-10-17 18:59:00 -07:00
2014-11-10 14:45:11 -08:00
for ( int32_t i = istart ; i < cr - > m_numRegExs ; i + + ) {
2013-10-15 12:45:23 -06:00
//sb.safePrintf
2013-10-15 11:50:57 -07:00
char * expression = cr - > m_regExs [ i ] . getBufStart ( ) ;
// do not allow nulls
if ( ! expression ) expression = " " ;
// skip spaces
if ( * expression & & is_wspace_a ( * expression ) ) expression + + ;
if ( strcmp ( expression , " default " ) = = 0 ) expression = " * " ;
char * action = cr - > m_spiderDiffbotApiUrl [ i ] . getBufStart ( ) ;
// do not allow nulls
if ( ! action ) action = " " ;
// skip spaces
if ( * action & & is_wspace_a ( * action ) ) action + + ;
// if no diffbot api url specified, do not process
if ( ! * action ) action = " doNotProcess " ;
// if filtered from crawling, do not even spider
2014-11-10 14:45:11 -08:00
int32_t priority = cr - > m_spiderPriorities [ i ] ;
2013-10-15 11:50:57 -07:00
if ( priority = = SPIDER_PRIORITY_FILTERED ) // -3
2013-10-21 18:05:45 -07:00
action = " doNotCrawl " ;
2013-10-16 12:12:22 -07:00
// we add this supplemental expression/action for every
// one the user adds in order to give manually added
// urls higher spider priority, so skip it
if ( strncmp ( expression , " ismanualadd && " , 15 ) = = 0 )
continue ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-10-16 12:12:22 -07:00
sb . safePrintf ( " <tr> "
" <td>Expression "
" <input type=text "
" name=expression size=30 "
" value= \" %s \" > "
" </td><td> "
" Action "
" <input type=text name=action size=50 "
" value= \" %s \" > "
" </td> "
" </tr> \n "
, expression
, action
) ;
continue ;
}
2013-10-15 11:50:57 -07:00
// show it
sb . safePrintf ( " { \" expression \" : \" %s \" , " , expression ) ;
sb . safePrintf ( " \" action \" : \" %s \" } " , action ) ;
// more follow?
2013-10-16 12:12:22 -07:00
sb . pushChar ( ' , ' ) ;
2013-10-15 11:50:57 -07:00
sb . pushChar ( ' \n ' ) ;
2013-10-15 12:45:23 -06:00
}
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON ) {
2013-10-16 12:12:22 -07:00
// remove trailing comma
sb . removeLastChar ( ' \n ' ) ;
sb . removeLastChar ( ' , ' ) ;
2013-10-16 12:19:25 -07:00
sb . safePrintf ( " ] \n " ) ;
2013-10-16 12:12:22 -07:00
}
2013-10-15 12:45:23 -06:00
return true ;
}
2013-12-16 14:35:27 -08:00
*/
2013-10-15 12:45:23 -06:00
2014-05-28 10:41:32 -07:00
bool printCrawlDetailsInJson ( SafeBuf * sb , CollectionRec * cx ) {
return printCrawlDetailsInJson ( sb , cx , HTTP_REQUEST_DEFAULT_REQUEST_VERSION ) ;
}
2014-05-27 20:11:12 -07:00
bool printCrawlDetailsInJson ( SafeBuf * sb , CollectionRec * cx , int version ) {
2013-11-07 13:59:43 -08:00
SafeBuf tmp ;
2014-11-10 14:45:11 -08:00
int32_t crawlStatus = - 1 ;
2013-11-07 13:59:43 -08:00
getSpiderStatusMsg ( cx , & tmp , & crawlStatus ) ;
CrawlInfo * ci = & cx - > m_localCrawlInfo ;
2014-11-10 14:45:11 -08:00
int32_t sentAlert = ( int32_t ) ci - > m_sentCrawlDoneAlert ;
2013-11-07 13:59:43 -08:00
if ( sentAlert ) sentAlert = 1 ;
2013-11-11 15:52:04 -08:00
char * crawlTypeStr = " crawl " ;
//char *nomen = "crawl";
if ( cx - > m_isCustomCrawl = = 2 ) {
crawlTypeStr = " bulk " ;
//nomen = "job";
}
2014-06-03 20:30:10 -07:00
// don't print completed time if spidering is going on
2014-11-10 14:45:11 -08:00
uint32_t completed = cx - > m_diffbotCrawlEndTime ;
2014-06-03 20:30:10 -07:00
// if not yet done, make this zero
if ( crawlStatus = = SP_INITIALIZING ) completed = 0 ;
if ( crawlStatus = = SP_NOURLS ) completed = 0 ;
//if ( crawlStatus == SP_PAUSED ) completed = 0;
//if ( crawlStatus == SP_ADMIN_PAUSED ) completed = 0;
if ( crawlStatus = = SP_INPROGRESS ) completed = 0 ;
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \n \n { "
2013-11-07 13:59:43 -08:00
" \" name \" : \" %s \" , \n "
2013-11-11 15:52:04 -08:00
" \" type \" : \" %s \" , \n "
2014-04-25 14:12:18 -07:00
2014-11-10 14:45:11 -08:00
" \" jobCreationTimeUTC \" :% " INT32 " , \n "
" \" jobCompletionTimeUTC \" :% " INT32 " , \n "
2014-04-25 14:12:18 -07:00
2013-11-07 13:59:43 -08:00
//"\"alias\":\"%s\",\n"
2014-11-10 14:45:11 -08:00
//"\"crawlingEnabled\":%"INT32",\n"
2013-11-11 15:52:04 -08:00
" \" jobStatus \" :{ " // nomen = jobStatus / crawlStatus
2014-11-10 14:45:11 -08:00
" \" status \" :% " INT32 " , "
2013-11-07 13:59:43 -08:00
" \" message \" : \" %s \" }, \n "
2014-11-10 14:45:11 -08:00
" \" sentJobDoneNotification \" :% " INT32 " , \n "
//"\"crawlingPaused\":%"INT32",\n"
" \" objectsFound \" :% " INT64 " , \n "
" \" urlsHarvested \" :% " INT64 " , \n "
//"\"urlsExamined\":%"INT64",\n"
" \" pageCrawlAttempts \" :% " INT64 " , \n "
" \" pageCrawlSuccesses \" :% " INT64 " , \n "
" \" pageCrawlSuccessesThisRound \" :% " INT64 " , \n "
" \" pageProcessAttempts \" :% " INT64 " , \n "
" \" pageProcessSuccesses \" :% " INT64 " , \n "
" \" pageProcessSuccessesThisRound \" :% " INT64 " , \n "
" \" maxRounds \" :% " INT32 " , \n "
2013-11-11 15:52:04 -08:00
" \" repeat \" :%f, \n "
2013-11-07 13:59:43 -08:00
" \" crawlDelay \" :%f, \n "
2013-11-11 15:52:04 -08:00
2013-11-07 13:59:43 -08:00
//,cx->m_coll
, cx - > m_diffbotCrawlName . getBufStart ( )
2013-11-11 15:52:04 -08:00
, crawlTypeStr
2014-04-25 14:12:18 -07:00
, cx - > m_diffbotCrawlStartTime
// this is 0 if not over yet
2014-06-03 20:30:10 -07:00
, completed
2014-04-25 14:12:18 -07:00
2013-11-07 13:59:43 -08:00
//, alias
2014-11-10 14:45:11 -08:00
//, (int32_t)cx->m_spideringEnabled
2013-11-07 13:59:43 -08:00
, crawlStatus
, tmp . getBufStart ( )
, sentAlert
2014-11-10 14:45:11 -08:00
//, (int32_t)paused
2013-11-07 13:59:43 -08:00
, cx - > m_globalCrawlInfo . m_objectsAdded -
cx - > m_globalCrawlInfo . m_objectsDeleted
, cx - > m_globalCrawlInfo . m_urlsHarvested
//,cx->m_globalCrawlInfo.m_urlsConsidered
, cx - > m_globalCrawlInfo . m_pageDownloadAttempts
, cx - > m_globalCrawlInfo . m_pageDownloadSuccesses
2014-01-23 13:23:09 -08:00
, cx - > m_globalCrawlInfo . m_pageDownloadSuccessesThisRound
2013-11-07 13:59:43 -08:00
, cx - > m_globalCrawlInfo . m_pageProcessAttempts
, cx - > m_globalCrawlInfo . m_pageProcessSuccesses
2014-01-23 13:23:09 -08:00
, cx - > m_globalCrawlInfo . m_pageProcessSuccessesThisRound
2013-11-11 15:52:04 -08:00
2014-11-10 14:45:11 -08:00
, ( int32_t ) cx - > m_maxCrawlRounds
2013-11-07 13:59:43 -08:00
, cx - > m_collectiveRespiderFrequency
, cx - > m_collectiveCrawlDelay
) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" obeyRobots \" :% " INT32 " , \n "
, ( int32_t ) cx - > m_useRobotsTxt ) ;
2014-02-05 15:45:39 -08:00
2013-11-11 15:52:04 -08:00
// if not a "bulk" injection, show crawl stats
if ( cx - > m_isCustomCrawl ! = 2 ) {
2014-02-09 15:09:48 -07:00
sb - > safePrintf (
2013-11-11 15:52:04 -08:00
// settable parms
2014-11-10 14:45:11 -08:00
" \" maxToCrawl \" :% " INT64 " , \n "
" \" maxToProcess \" :% " INT64 " , \n "
//"\"restrictDomain\":%"INT32",\n"
" \" onlyProcessIfNew \" :% " INT32 " , \n "
2013-11-11 15:52:04 -08:00
, cx - > m_maxToCrawl
, cx - > m_maxToProcess
2014-11-10 14:45:11 -08:00
//, (int32_t)cx->m_restrictDomain
, ( int32_t ) cx - > m_diffbotOnlyProcessIfNewUrl
2013-11-11 15:52:04 -08:00
) ;
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" seeds \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotSeeds . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-11-11 15:52:04 -08:00
}
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" roundsCompleted \" :% " INT32 " , \n " ,
2013-11-07 13:59:43 -08:00
cx - > m_spiderRoundNum ) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" roundStartTime \" :% " UINT32 " , \n " ,
2013-11-07 13:59:43 -08:00
cx - > m_spiderRoundStartTime ) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \" currentTime \" :% " UINT32 " , \n " ,
( uint32_t ) getTimeGlobal ( ) ) ;
sb - > safePrintf ( " \" currentTimeUTC \" :% " UINT32 " , \n " ,
( uint32_t ) getTimeGlobal ( ) ) ;
2013-11-07 13:59:43 -08:00
2013-11-20 16:41:28 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" apiUrl \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotApiUrl . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-11-20 16:41:28 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" urlCrawlPattern \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotUrlCrawlPattern . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-11-20 16:41:28 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" urlProcessPattern \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotUrlProcessPattern . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-11-20 16:41:28 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" pageProcessPattern \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotPageProcessPattern . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-11-07 13:59:43 -08:00
2013-12-03 16:23:05 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" urlCrawlRegEx \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotUrlCrawlRegEx . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-12-03 16:23:05 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" urlProcessRegEx \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_diffbotUrlProcessRegEx . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-12-03 16:23:05 -08:00
2014-11-20 16:53:07 -08:00
sb - > safePrintf ( " \" maxHops \" :% " INT32 " , \n " ,
( int32_t ) cx - > m_diffbotMaxHops ) ;
2013-12-03 16:23:05 -08:00
2013-11-07 14:07:38 -08:00
char * token = cx - > m_diffbotToken . getBufStart ( ) ;
char * name = cx - > m_diffbotCrawlName . getBufStart ( ) ;
2013-11-13 14:30:51 -08:00
char * mt = " crawl " ;
if ( cx - > m_isCustomCrawl = = 2 ) mt = " bulk " ;
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" downloadJson \" : "
2014-05-27 20:11:12 -07:00
" \" http://api.diffbot.com/v%d/%s/download/ "
2013-11-07 14:07:38 -08:00
" %s-%s_data.json \" , \n "
2014-05-27 20:11:12 -07:00
, version
2013-11-13 14:30:51 -08:00
, mt
2013-11-07 14:07:38 -08:00
, token
, name
) ;
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" downloadUrls \" : "
2014-05-27 20:11:12 -07:00
" \" http://api.diffbot.com/v%d/%s/download/ "
2013-11-07 14:07:38 -08:00
" %s-%s_urls.csv \" , \n "
2014-05-27 20:11:12 -07:00
, version
2013-11-13 14:30:51 -08:00
, mt
2013-11-07 14:07:38 -08:00
, token
, name
) ;
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" notifyEmail \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_notifyEmail . getBufStart ( ) ) ;
sb - > safePrintf ( " \" , \n " ) ;
2013-11-07 13:59:43 -08:00
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " \" notifyWebhook \" : \" " ) ;
sb - > safeUtf8ToJSON ( cx - > m_notifyUrl . getBufStart ( ) ) ;
sb - > safePrintf ( " \" \n " ) ;
//sb->safePrintf("\",\n");
2013-11-26 09:17:38 -08:00
2013-11-07 13:59:43 -08:00
/////
//
// show url filters table. kinda hacky!!
//
/////
/*
g_parms . sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
NULL ,
& sb ,
cr - > m_coll , // coll override
true // isJSON?
) ;
*/
2014-04-05 18:09:04 -07:00
//printUrlFilters ( sb , cx , FORMAT_JSON );
2013-11-07 13:59:43 -08:00
// end that collection rec
2014-02-09 15:09:48 -07:00
sb - > safePrintf ( " } \n " ) ;
2013-11-07 13:59:43 -08:00
return true ;
}
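// Abridged illustration of the object printed above (all values are
// placeholders):
//   {"name":"...","type":"crawl","jobStatus":{"status":...,"message":"..."},
//    "pageCrawlAttempts":...,"seeds":"...","downloadJson":"...",
//    "notifyWebhook":"..."}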
2014-07-13 09:35:44 -07:00
bool printCrawlDetails2 ( SafeBuf * sb , CollectionRec * cx , char format ) {
2014-07-11 08:00:30 -07:00
SafeBuf tmp ;
2014-11-10 14:45:11 -08:00
int32_t crawlStatus = - 1 ;
2014-07-11 08:00:30 -07:00
getSpiderStatusMsg ( cx , & tmp , & crawlStatus ) ;
CrawlInfo * ci = & cx - > m_localCrawlInfo ;
2014-11-10 14:45:11 -08:00
int32_t sentAlert = ( int32_t ) ci - > m_sentCrawlDoneAlert ;
2014-07-11 08:00:30 -07:00
if ( sentAlert ) sentAlert = 1 ;
// don't print completed time if spidering is going on
2014-11-10 14:45:11 -08:00
uint32_t completed = cx - > m_diffbotCrawlEndTime ; // time_t
2014-07-11 08:00:30 -07:00
// if not yet done, make this zero
if ( crawlStatus = = SP_INITIALIZING ) completed = 0 ;
if ( crawlStatus = = SP_NOURLS ) completed = 0 ;
//if ( crawlStatus == SP_PAUSED ) completed = 0;
//if ( crawlStatus == SP_ADMIN_PAUSED ) completed = 0;
if ( crawlStatus = = SP_INPROGRESS ) completed = 0 ;
2014-07-13 09:35:44 -07:00
if ( format = = FORMAT_JSON ) {
sb - > safePrintf ( " { "
" \" response:{ \n "
2014-11-10 14:45:11 -08:00
" \t \" statusCode \" :% " INT32 " , \n "
2014-07-13 09:35:44 -07:00
" \t \" statusMsg \" : \" %s \" , \n "
2014-11-10 14:45:11 -08:00
" \t \" jobCreationTimeUTC \" :% " INT32 " , \n "
" \t \" jobCompletionTimeUTC \" :% " INT32 " , \n "
" \t \" sentJobDoneNotification \" :% " INT32 " , \n "
" \t \" urlsHarvested \" :% " INT64 " , \n "
" \t \" pageCrawlAttempts \" :% " INT64 " , \n "
" \t \" pageCrawlSuccesses \" :% " INT64 " , \n "
2014-07-13 09:35:44 -07:00
, crawlStatus
, tmp . getBufStart ( )
, cx - > m_diffbotCrawlStartTime
, completed
, sentAlert
, cx - > m_globalCrawlInfo . m_urlsHarvested
, cx - > m_globalCrawlInfo . m_pageDownloadAttempts
, cx - > m_globalCrawlInfo . m_pageDownloadSuccesses
) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \t \" currentTime \" :% " UINT32 " , \n " ,
( uint32_t ) getTimeGlobal ( ) ) ;
sb - > safePrintf ( " \t \" currentTimeUTC \" :% " UINT32 " , \n " ,
( uint32_t ) getTimeGlobal ( ) ) ;
2014-07-13 09:35:44 -07:00
sb - > safePrintf ( " \t } \n " ) ;
sb - > safePrintf ( " } \n " ) ;
}
2014-07-11 08:00:30 -07:00
2014-07-13 09:35:44 -07:00
if ( format = = FORMAT_XML ) {
sb - > safePrintf ( " <response> \n "
2014-11-10 14:45:11 -08:00
" \t <statusCode>% " INT32 " </statusCode> \n "
2014-07-13 09:35:44 -07:00
, crawlStatus
) ;
sb - > safePrintf (
" \t <statusMsg><![CDATA[%s]]></statusMsg> \n "
2014-11-10 14:45:11 -08:00
" \t <jobCreationTimeUTC>% " INT32 " "
2014-07-13 09:35:44 -07:00
" </jobCreationTimeUTC> \n "
, ( char * ) tmp . getBufStart ( )
2014-11-10 14:45:11 -08:00
, ( int32_t ) cx - > m_diffbotCrawlStartTime
2014-07-13 09:35:44 -07:00
) ;
sb - > safePrintf (
2014-11-10 14:45:11 -08:00
" \t <jobCompletionTimeUTC>% " INT32 " "
2014-07-13 09:35:44 -07:00
" </jobCompletionTimeUTC> \n "
2014-07-11 08:00:30 -07:00
2014-11-10 14:45:11 -08:00
" \t <sentJobDoneNotification>% " INT32 " "
2014-07-13 09:35:44 -07:00
" </sentJobDoneNotification> \n "
2014-07-11 08:00:30 -07:00
2014-11-10 14:45:11 -08:00
" \t <urlsHarvested>% " INT64 " </urlsHarvested> \n "
2014-07-11 08:00:30 -07:00
2014-11-10 14:45:11 -08:00
" \t <pageCrawlAttempts>% " INT64 " "
2014-07-13 09:35:44 -07:00
" </pageCrawlAttempts> \n "
2014-07-11 08:00:30 -07:00
2014-11-10 14:45:11 -08:00
" \t <pageCrawlSuccesses>% " INT64 " "
2014-07-13 09:35:44 -07:00
" </pageCrawlSuccesses> \n "
2014-07-11 08:00:30 -07:00
2014-07-13 09:35:44 -07:00
, completed
, sentAlert
, cx - > m_globalCrawlInfo . m_urlsHarvested
, cx - > m_globalCrawlInfo . m_pageDownloadAttempts
, cx - > m_globalCrawlInfo . m_pageDownloadSuccesses
) ;
2014-11-10 14:45:11 -08:00
sb - > safePrintf ( " \t <currentTime>% " UINT32 " </currentTime> \n " ,
( uint32_t ) getTimeGlobal ( ) ) ;
sb - > safePrintf ( " \t <currentTimeUTC>% " UINT32 " </currentTimeUTC> \n " ,
( uint32_t ) getTimeGlobal ( ) ) ;
2014-07-13 09:35:44 -07:00
sb - > safePrintf ( " </response> \n " ) ;
}
2014-07-11 08:00:30 -07:00
return true ;
}
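// The XML branch above mirrors the JSON branch field for field, wrapping
// everything in a single <response>...</response> element.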
2013-09-25 15:37:20 -06:00
bool printCrawlBotPage2 ( TcpSocket * socket ,
HttpRequest * hr ,
char fmt , // format
SafeBuf * injectionResponse ,
2013-09-27 10:49:24 -06:00
SafeBuf * urlUploadResponse ,
collnum_t collnum ) {
2013-09-25 15:37:20 -06:00
2013-10-16 14:03:14 -07:00
2013-09-25 15:37:20 -06:00
// store output into here
SafeBuf sb ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML )
2013-09-25 15:37:20 -06:00
sb . safePrintf (
" <html> "
" <title>Crawlbot - "
" Web Data Extraction and Search Made "
" Easy</title> "
" <body> "
) ;
2013-09-27 10:49:24 -06:00
CollectionRec * cr = g_collectiondb . m_recs [ collnum ] ;
2013-09-17 10:43:23 -07:00
2013-10-18 15:21:00 -07:00
// was coll deleted while adding urls to spiderdb?
if ( ! cr ) {
g_errno = EBADREQUEST ;
char * msg = " invalid crawl. crawl was deleted. " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-15 14:08:55 -07:00
char * token = cr - > m_diffbotToken . getBufStart ( ) ;
char * name = cr - > m_diffbotCrawlName . getBufStart ( ) ;
// this is useful
SafeBuf hb ;
hb . safePrintf ( " <input type=hidden name=name value= \" %s \" > "
" <input type=hidden name=token value= \" %s \" > "
" <input type=hidden name=format value= \" html \" > "
, name
, token ) ;
hb . nullTerm ( ) ;
// and this
SafeBuf lb ;
lb . safePrintf ( " name= " ) ;
lb . urlEncode ( name ) ;
lb . safePrintf ( " &token= " ) ;
lb . urlEncode ( token ) ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) lb . safePrintf ( " &format=html " ) ;
2013-10-15 14:08:55 -07:00
lb . nullTerm ( ) ;
2013-09-17 10:43:23 -07:00
// set this to current collection. if only token was provided
// then it will return the first collection owned by token.
// if token has no collections it will be NULL.
2013-09-27 10:49:24 -06:00
//if ( ! cr )
// cr = getCollRecFromHttpRequest ( hr );
//if ( ! cr ) {
// char *msg = "failed to add new collection";
// g_msg = " (error: crawlbot failed to allocate crawl)";
// return sendErrorReply2 ( socket , fmt , msg );
//}
2013-09-17 10:43:23 -07:00
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-09-25 15:37:20 -06:00
sb . safePrintf ( " <table border=0> "
" <tr><td> "
" <b><font size=+2> "
" <a href=/crawlbot?token=%s> "
" Crawlbot</a></font></b> "
" <br> "
" <font size=-1> "
" Crawl, Datamine and Index the Web "
" </font> "
" </td></tr> "
" </table> "
, token
) ;
sb . safePrintf ( " <center><br> " ) ;
2013-09-25 18:00:16 -06:00
// first print help
2013-09-25 19:41:20 -06:00
sb . safePrintf ( " [ <a href=/crawlbot?help=1> "
" api help</a> ] "
// json output
2013-11-04 11:05:10 -08:00
" [ <a href= \" /crawlbot?token=%s&format=json& "
" name=%s \" > "
2013-09-25 19:41:20 -06:00
" json output "
" </a> ] "
2013-11-04 11:05:10 -08:00
, token
, name ) ;
2013-09-27 10:49:24 -06:00
// random coll name to add
2014-11-10 14:45:11 -08:00
uint32_t r1 = rand ( ) ;
uint32_t r2 = rand ( ) ;
2014-10-30 13:30:39 -06:00
uint64_t rand64 = ( uint64_t ) r1 ;
2013-09-27 10:49:24 -06:00
rand64 < < = 32 ;
rand64 | = r2 ;
2013-12-16 15:39:24 -08:00
char newCollName [ MAX_COLL_LEN + 1 ] ;
2014-11-10 14:45:11 -08:00
snprintf ( newCollName , MAX_COLL_LEN , " %s-%016 " XINT64 " " ,
2013-12-16 15:39:24 -08:00
token , rand64 ) ;
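// so newCollName ends up looking something like "<token>-0123456789abcdef":
// the token plus a random 64-bit hex suffix, which keeps auto-created
// crawl names unique per token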
2013-09-25 15:37:20 -06:00
// first print "add new collection"
2014-11-10 14:45:11 -08:00
sb . safePrintf ( " [ <a href=/crawlbot?name=%016 " XINT64 " &token=%s& "
2013-12-16 16:04:43 -08:00
" format=html&addCrawl=%s> "
2013-10-15 16:57:34 -07:00
" add new crawl "
2013-09-25 15:37:20 -06:00
" </a> ] "
2013-10-15 12:40:56 -07:00
" [ <a href=/crawlbot?token=%s> "
2013-10-15 16:57:34 -07:00
" show all crawls "
2013-09-25 15:37:20 -06:00
" </a> ] "
2013-09-27 10:49:24 -06:00
, rand64
2013-09-25 15:37:20 -06:00
, token
2013-12-16 15:39:24 -08:00
, newCollName
2013-09-25 15:37:20 -06:00
, token
) ;
}
2013-09-13 16:22:07 -07:00
2013-09-25 15:37:20 -06:00
bool firstOne = true ;
2013-09-17 15:32:28 -07:00
2013-09-13 16:22:07 -07:00
//
2013-09-17 10:43:23 -07:00
// print list of collections controlled by this token
2013-09-13 16:22:07 -07:00
//
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; fmt = = FORMAT_HTML & & i < g_collectiondb . m_numRecs ; i + + ) {
2013-09-13 16:22:07 -07:00
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
if ( ! cx ) continue ;
2013-09-17 15:32:28 -07:00
// get its token if any
char * ct = cx - > m_diffbotToken . getBufStart ( ) ;
if ( ! ct ) continue ;
// skip if token does not match
if ( strcmp ( ct , token ) )
continue ;
2013-09-13 16:22:07 -07:00
// highlight the tab if it is what we selected
bool highlight = false ;
2013-09-17 10:43:23 -07:00
if ( cx = = cr ) highlight = true ;
2013-09-13 16:22:07 -07:00
char * style = " " ;
if ( highlight ) {
style = " style=text-decoration:none; " ;
sb . safePrintf ( " <b><font color=red> " ) ;
}
// print the crawl name (collection name minus the <TOKEN>- prefix) and its collnum
2013-10-15 12:40:56 -07:00
sb . safePrintf ( " <a %shref=/crawlbot?token= " , style ) ;
sb . urlEncode ( token ) ;
sb . safePrintf ( " &name= " ) ;
sb . urlEncode ( cx - > m_diffbotCrawlName . getBufStart ( ) ) ;
sb . safePrintf ( " &format=html> "
2014-11-10 14:45:11 -08:00
" %s (% " INT32 " ) "
2013-09-13 16:22:07 -07:00
" </a> "
2013-10-15 11:54:54 -06:00
, cx - > m_diffbotCrawlName . getBufStart ( )
2014-11-10 14:45:11 -08:00
, ( int32_t ) cx - > m_collnum
2013-09-13 16:22:07 -07:00
) ;
if ( highlight )
sb . safePrintf ( " </font></b> " ) ;
}
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML )
2013-09-25 15:37:20 -06:00
sb . safePrintf ( " </center><br/> " ) ;
2013-09-13 16:22:07 -07:00
2013-11-04 11:29:22 -08:00
// the ROOT JSON [
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON )
2013-10-16 17:17:28 -07:00
sb . safePrintf ( " { \n " ) ;
2013-10-16 14:03:14 -07:00
2013-10-29 15:26:32 -07:00
// injection is currently not in use, so this is an artifact:
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON & & injectionResponse )
2013-11-13 14:30:51 -08:00
sb . safePrintf ( " \" response \" : \" %s \" , \n \n "
2013-10-16 14:03:14 -07:00
, injectionResponse - > getBufStart ( ) ) ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON & & urlUploadResponse )
2013-11-13 14:30:51 -08:00
sb . safePrintf ( " \" response \" : \" %s \" , \n \n "
2013-10-16 14:03:14 -07:00
, urlUploadResponse - > getBufStart ( ) ) ;
2013-09-25 12:57:07 -06:00
//////
//
// print collection summary page
//
//////
2013-11-11 15:52:04 -08:00
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON )
2013-11-11 15:52:04 -08:00
sb . safePrintf ( " \" jobs \" :[ " ) ; //\"collections\":");
2013-09-25 15:37:20 -06:00
2014-11-10 14:45:11 -08:00
int32_t summary = hr - > getLong ( " summary " , 0 ) ;
2013-09-25 16:59:31 -06:00
// enter summary mode for json
2014-04-05 18:09:04 -07:00
if ( fmt ! = FORMAT_HTML ) summary = 1 ;
2013-09-25 12:57:07 -06:00
// start the table
2014-04-05 18:09:04 -07:00
if ( summary & & fmt = = FORMAT_HTML ) {
2013-09-25 12:57:07 -06:00
sb . safePrintf ( " <table border=1 cellpadding=5> "
" <tr> "
" <td><b>Collection</b></td> "
" <td><b>Objects Found</b></td> "
" <td><b>URLs Harvested</b></td> "
" <td><b>URLs Examined</b></td> "
2013-11-11 15:52:04 -08:00
" <td><b>Page Download Attempts</b></td> "
" <td><b>Page Download Successes</b></td> "
2014-01-23 13:23:09 -08:00
" <td><b>Page Download Successes This Round "
" </b></td> "
2013-09-25 12:57:07 -06:00
" <td><b>Page Process Attempts</b></td> "
" <td><b>Page Process Successes</b></td> "
2014-01-23 13:23:09 -08:00
" <td><b>Page Process Successes This Round "
" </b></td> "
2013-09-25 12:57:07 -06:00
" </tr> "
) ;
}
2013-10-16 14:03:14 -07:00
2013-11-04 11:05:10 -08:00
char * name3 = hr - > getString ( " name " ) ;
2013-09-25 12:57:07 -06:00
// scan each coll and get its stats
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; summary & & i < g_collectiondb . m_numRecs ; i + + ) {
2013-09-25 12:57:07 -06:00
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
if ( ! cx ) continue ;
2014-11-17 18:24:38 -08:00
// must belong to us
2013-09-25 12:57:07 -06:00
if ( strcmp ( cx - > m_diffbotToken . getBufStart ( ) , token ) )
continue ;
2013-09-25 15:37:20 -06:00
2013-11-04 11:05:10 -08:00
// just print out single crawl info for json
2014-04-05 18:09:04 -07:00
if ( fmt ! = FORMAT_HTML & & cx ! = cr & & name3 )
2013-11-04 11:05:10 -08:00
continue ;
2013-09-25 15:37:20 -06:00
// if json, print each collectionrec
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON ) {
2013-09-25 15:37:20 -06:00
if ( ! firstOne )
sb . safePrintf ( " , \n \t " ) ;
firstOne = false ;
2013-09-28 14:17:43 -06:00
//char *alias = "";
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
2014-11-10 14:45:11 -08:00
//int32_t paused = 1;
2013-10-29 13:16:01 -07:00
2013-10-21 18:32:57 -07:00
//if ( cx->m_spideringEnabled ) paused = 0;
2014-07-11 08:00:30 -07:00
if ( cx - > m_isCustomCrawl )
printCrawlDetailsInJson ( & sb , cx ,
getVersionFromRequest ( hr ) ) ;
else
2014-07-13 09:35:44 -07:00
printCrawlDetails2 ( & sb , cx , FORMAT_JSON ) ;
2014-07-11 08:00:30 -07:00
2013-09-25 15:37:20 -06:00
// print the next one out
continue ;
}
2013-09-25 12:57:07 -06:00
// print in table
sb . safePrintf ( " <tr> "
" <td>%s</td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
" <td>% " INT64 " </td> "
//"<td>%"INT64"</td>"
" <td>% " INT64 " </td> "
" <td>% " INT64 " </td> "
" <td>% " INT64 " </td> "
" <td>% " INT64 " </td> "
" <td>% " INT64 " </td> "
" <td>% " INT64 " </td> "
2013-09-25 12:57:07 -06:00
" </tr> "
, cx - > m_coll
, cx - > m_globalCrawlInfo . m_objectsAdded -
cx - > m_globalCrawlInfo . m_objectsDeleted
, cx - > m_globalCrawlInfo . m_urlsHarvested
2013-10-28 22:38:15 -07:00
//, cx->m_globalCrawlInfo.m_urlsConsidered
2013-09-25 12:57:07 -06:00
, cx - > m_globalCrawlInfo . m_pageDownloadAttempts
, cx - > m_globalCrawlInfo . m_pageDownloadSuccesses
2014-01-23 13:23:09 -08:00
, cx - > m_globalCrawlInfo . m_pageDownloadSuccessesThisRound
2013-09-25 12:57:07 -06:00
, cx - > m_globalCrawlInfo . m_pageProcessAttempts
, cx - > m_globalCrawlInfo . m_pageProcessSuccesses
2014-01-23 13:23:09 -08:00
, cx - > m_globalCrawlInfo . m_pageProcessSuccessesThisRound
2013-09-25 12:57:07 -06:00
) ;
}
2014-04-05 18:09:04 -07:00
if ( summary & & fmt = = FORMAT_HTML ) {
2013-09-25 12:57:07 -06:00
sb . safePrintf ( " </table></html> " ) ;
2013-09-25 15:37:20 -06:00
return g_httpServer . sendDynamicPage ( socket ,
2013-09-25 12:57:07 -06:00
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-09-27 10:49:24 -06:00
0 ) ; // cachetime
2013-09-25 12:57:07 -06:00
}
2013-09-25 15:37:20 -06:00
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON )
2013-09-25 17:12:01 -06:00
// end the array of collection objects
2013-10-16 17:17:28 -07:00
sb . safePrintf ( " \n ] \n " ) ;
2013-09-25 15:37:20 -06:00
2013-09-25 12:57:07 -06:00
///////
//
// end print collection summary page
//
///////
2013-09-25 15:37:20 -06:00
2013-09-16 15:33:45 -07:00
//
// show urls being crawled (ajax) (from Spider.cpp)
//
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-09-25 15:37:20 -06:00
sb . safePrintf ( " <table width=100%% cellpadding=5 "
" style=border-width:1px;border-style:solid; "
" border-color:black;> "
//"bgcolor=#%s>\n"
" <tr><td colspan=50> " // bgcolor=#%s>"
2014-11-10 14:45:11 -08:00
" <b>Last 10 URLs</b> (% " INT32 " spiders active) "
2013-09-25 15:37:20 -06:00
//,LIGHT_BLUE
//,DARK_BLUE
2014-11-10 14:45:11 -08:00
, ( int32_t ) g_spiderLoop . m_numSpidersOut ) ;
2013-10-15 12:40:56 -07:00
char * str = " <font color=green>Resume Crawl</font> " ;
2014-11-10 14:45:11 -08:00
int32_t pval = 0 ;
2013-10-15 12:40:56 -07:00
if ( cr - > m_spideringEnabled ) {
str = " <font color=red>Pause Crawl</font> " ;
pval = 1 ;
}
sb . safePrintf ( " "
2013-10-15 14:08:55 -07:00
" <a href=/crawlbot?%s "
2014-11-10 14:45:11 -08:00
" &pauseCrawl=% " INT32 " ><b>%s</b></a> "
2013-10-15 14:08:55 -07:00
, lb . getBufStart ( ) // has &name=&token= encoded
2013-10-15 12:40:56 -07:00
, pval
, str
) ;
2013-09-25 15:37:20 -06:00
sb . safePrintf ( " </td></tr> \n " ) ;
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest : : printTableHeaderSimple ( & sb , true ) )
2013-09-16 15:33:45 -07:00
return false ;
2014-11-10 14:45:11 -08:00
// shortcut
2013-09-25 15:37:20 -06:00
XmlDoc * * docs = g_spiderLoop . m_docs ;
2014-01-19 10:44:19 -08:00
// row count
2014-11-10 14:45:11 -08:00
int32_t j = 0 ;
2013-09-25 15:37:20 -06:00
// first print the spider recs we are spidering
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < ( int32_t ) MAX_SPIDERS ; i + + ) {
2013-09-25 15:37:20 -06:00
// get it
XmlDoc * xd = docs [ i ] ;
// skip if empty
if ( ! xd ) continue ;
// sanity check
2014-01-18 21:19:26 -08:00
if ( ! xd - > m_sreqValid ) { char * xx = NULL ; * xx = 0 ; }
2013-09-25 15:37:20 -06:00
// skip if not our coll rec!
2013-10-18 15:21:00 -07:00
//if ( xd->m_cr != cr ) continue;
if ( xd - > m_collnum ! = cr - > m_collnum ) continue ;
2013-09-25 15:37:20 -06:00
// grab it
2014-01-18 21:19:26 -08:00
SpiderRequest * oldsr = & xd - > m_sreq ;
2013-09-25 15:37:20 -06:00
// get status
char * status = xd - > m_statusMsg ;
// show that
2014-01-19 10:44:19 -08:00
if ( ! oldsr - > printToTableSimple ( & sb , status , xd , j ) )
2013-09-25 15:37:20 -06:00
return false ;
2014-01-19 10:44:19 -08:00
j + + ;
2013-09-25 15:37:20 -06:00
}
2013-09-16 15:33:45 -07:00
2013-09-25 15:37:20 -06:00
// end the table
sb . safePrintf ( " </table> \n " ) ;
sb . safePrintf ( " <br> \n " ) ;
2013-09-16 15:33:45 -07:00
2013-09-25 15:37:20 -06:00
} // end html format
2013-09-16 15:33:45 -07:00
2013-10-16 12:12:22 -07:00
// this is for making sure the search results are not cached
2014-11-10 14:45:11 -08:00
uint32_t r1 = rand ( ) ;
uint32_t r2 = rand ( ) ;
2014-10-30 13:30:39 -06:00
uint64_t rand64 = ( uint64_t ) r1 ;
2013-10-16 12:12:22 -07:00
rand64 < < = 32 ;
rand64 | = r2 ;
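// note: this rand64 is only appended as &rand=... to the search links
// below so each click yields a distinct url and bypasses any cached
// results page; two rand() calls are combined because a single rand()
// is typically limited to 31 random bits.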
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-10-16 12:12:22 -07:00
sb . safePrintf ( " <br> "
" <table border=0 cellpadding=5> "
// OBJECT search input box
" <form method=get action=/search> "
" <tr> "
" <td> "
" <b>Search Objects:</b> "
" </td><td> "
" <input type=text name=q size=50> "
2013-11-07 09:40:31 -08:00
// site clustering off
" <input type=hidden name=sc value=0> "
// dup removal off
" <input type=hidden name=dr value=0> "
2013-10-16 12:12:22 -07:00
" <input type=hidden name=c value= \" %s \" > "
2014-11-10 14:45:11 -08:00
" <input type=hidden name=rand value=% " INT64 " > "
2013-10-21 19:06:13 -07:00
// bypass ajax, searchbox, logo, etc.
" <input type=hidden name=id value=12345> "
2013-10-16 12:12:22 -07:00
// restrict search to json objects
" <input type=hidden name=prepend "
" value= \" type:json | \" > "
" "
" <input type=submit name=submit value=OK> "
" </tr> "
" </form> "
// PAGE search input box
" <form method=get action=/search> "
" <tr> "
" <td> "
" <b>Search Pages:</b> "
" </td><td> "
" <input type=text name=q size=50> "
2013-11-07 09:40:31 -08:00
// site clustering off
" <input type=hidden name=sc value=0> "
// dup removal off
" <input type=hidden name=dr value=0> "
2013-10-16 12:12:22 -07:00
" <input type=hidden name=c value= \" %s \" > "
2014-11-10 14:45:11 -08:00
" <input type=hidden name=rand value=% " INT64 " > "
2013-10-21 19:06:13 -07:00
// bypass ajax, searchbox, logo, etc.
" <input type=hidden name=id value=12345> "
2013-10-16 12:12:22 -07:00
// restrict search to NON json objects
" <input type=hidden "
" name=prepend value= \" -type:json | \" > "
" "
" <input type=submit name=submit value=OK> "
" </tr> "
" </form> "
// add url input box
" <form method=get action=/crawlbot> "
" <tr> "
" <td> "
2013-10-18 11:53:14 -07:00
" <b>Add Seed Urls: </b> "
2013-10-16 12:12:22 -07:00
" </td><td> "
2013-10-18 11:53:14 -07:00
" <input type=text name=seeds size=50> "
2013-10-16 12:12:22 -07:00
" %s " // hidden tags
" "
" <input type=submit name=submit value=OK> "
2013-10-18 11:53:14 -07:00
//" <input type=checkbox "
//"name=spiderLinks value=1 "
//"checked>"
//" <i>crawl links on this page?</i>"
2013-10-16 12:12:22 -07:00
, cr - > m_coll
, rand64
, cr - > m_coll
, rand64
, hb . getBufStart ( ) // hidden tags
) ;
}
2014-04-05 18:09:04 -07:00
if ( injectionResponse & & fmt = = FORMAT_HTML )
2013-10-16 12:12:22 -07:00
sb . safePrintf ( " <br><font size=-1>%s</font> \n "
, injectionResponse - > getBufStart ( )
) ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML )
2013-10-16 12:12:22 -07:00
sb . safePrintf ( //"<input type=hidden name=c value=\"%s\">"
2013-10-18 11:53:14 -07:00
//"<input type=hidden name=crawlbotapi value=1>"
2013-10-16 12:12:22 -07:00
" </td> "
" </tr> "
2013-10-18 11:53:14 -07:00
//"</form>"
2013-10-16 12:12:22 -07:00
" <tr> "
2013-10-18 11:53:14 -07:00
" <td><b>Add Spot URLs:</b></td> "
2013-10-16 12:12:22 -07:00
" <td> "
// this page will call
// printCrawlbotPage2(uploadResponse) to display it
2013-10-18 11:53:14 -07:00
//"<form method=post action=/crawlbot>"
//"<input type=file name=spots size=40>"
2013-10-29 16:37:14 -07:00
" <input type=text name=spots size=50> "
2013-10-18 11:53:14 -07:00
" <input type=submit name=submit value=OK> "
" %s " // hidden tags
//" <input type=checkbox "
//"name=spiderLinks value=1 "
//"checked>"
//" <i>crawl links on those pages?</i>"
2013-10-16 12:12:22 -07:00
" </form> "
" </td> "
" </tr> "
" </table> "
" <br> "
//, cr->m_coll
2013-10-18 11:53:14 -07:00
, hb . getBufStart ( )
2013-10-16 12:12:22 -07:00
) ;
2013-09-13 16:22:07 -07:00
//
// show stats
//
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-11-04 16:35:58 -08:00
2014-01-23 21:22:39 -08:00
char * seedStr = cr - > m_diffbotSeeds . getBufStart ( ) ;
if ( ! seedStr ) seedStr = " " ;
2013-11-04 16:35:58 -08:00
SafeBuf tmp ;
2014-11-10 14:45:11 -08:00
int32_t crawlStatus = - 1 ;
2013-11-04 16:35:58 -08:00
getSpiderStatusMsg ( cr , & tmp , & crawlStatus ) ;
CrawlInfo * ci = & cr - > m_localCrawlInfo ;
2014-11-10 14:45:11 -08:00
int32_t sentAlert = ( int32_t ) ci - > m_sentCrawlDoneAlert ;
2013-11-04 16:35:58 -08:00
if ( sentAlert ) sentAlert = 1 ;
2013-10-16 12:12:22 -07:00
sb . safePrintf (
2013-09-16 11:42:04 -07:00
" <form method=get action=/crawlbot> "
2013-10-15 14:08:55 -07:00
" %s "
, hb . getBufStart ( ) // hidden input token/name/..
2013-09-17 10:25:54 -07:00
) ;
sb . safePrintf ( " <TABLE border=0> "
2013-09-16 15:18:55 -07:00
" <TR><TD valign=top> "
2013-09-13 16:22:07 -07:00
" <table border=0 cellpadding=5> "
2013-10-16 16:27:24 -07:00
//
" <tr> "
" <td><b>Crawl Name:</td> "
" <td>%s</td> "
" </tr> "
2013-12-16 17:21:59 -08:00
" <tr> "
" <td><b>Crawl Type:</td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT32 " </td> "
2013-12-16 17:21:59 -08:00
" </tr> "
2013-10-16 16:27:24 -07:00
//"<tr>"
//"<td><b>Collection Alias:</td>"
//"<td>%s%s</td>"
//"</tr>"
" <tr> "
" <td><b>Token:</td> "
" <td>%s</td> "
" </tr> "
2014-01-23 21:22:39 -08:00
" <tr> "
" <td><b>Seeds:</td> "
" <td>%s</td> "
" </tr> "
2013-11-04 16:35:58 -08:00
" <tr> "
" <td><b>Crawl Status:</td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT32 " </td> "
2013-11-04 16:35:58 -08:00
" </tr> "
" <tr> "
" <td><b>Crawl Status Msg:</td> "
" <td>%s</td> "
" </tr> "
2014-05-21 13:55:01 -07:00
" <tr> "
" <td><b>Crawl Start Time:</td> "
2014-11-10 14:45:11 -08:00
" <td>% " UINT32 " </td> "
2014-05-21 13:55:01 -07:00
" </tr> "
" <tr> "
2015-09-09 16:56:01 -07:00
" <td><b>Last Crawl Completion Time:</td> "
2014-11-10 14:45:11 -08:00
" <td>% " UINT32 " </td> "
2014-05-21 13:55:01 -07:00
" </tr> "
2013-11-04 16:35:58 -08:00
" <tr> "
" <td><b>Rounds Completed:</td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT32 " </td> "
2013-11-04 16:35:58 -08:00
" </tr> "
2014-01-25 14:49:55 -08:00
" <tr> "
" <td><b>Has Urls Ready to Spider:</td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT32 " </td> "
2014-01-25 14:49:55 -08:00
" </tr> "
2015-09-09 16:56:01 -07:00
, cr - > m_diffbotCrawlName . getBufStart ( )
, ( int32_t ) cr - > m_isCustomCrawl
, cr - > m_diffbotToken . getBufStart ( )
, seedStr
, crawlStatus
, tmp . getBufStart ( )
, cr - > m_diffbotCrawlStartTime
// this is 0 if not over yet
, cr - > m_diffbotCrawlEndTime
, cr - > m_spiderRoundNum
, cr - > m_globalCrawlInfo . m_hasUrlsReadyToSpider
) ;
// show crawlinfo crap
CrawlInfo * cis = ( CrawlInfo * ) cr - > m_crawlInfoBuf . getBufStart ( ) ;
sb . safePrintf ( " <tr><td><b>Ready Hosts</b></td><td> " ) ;
for ( int32_t i = 0 ; i < g_hostdb . getNumHosts ( ) ; i + + ) {
CrawlInfo * ci = & cis [ i ] ;
2016-02-08 14:10:04 -08:00
if ( ! ci ) continue ;
2015-09-09 16:56:01 -07:00
if ( ! ci - > m_hasUrlsReadyToSpider ) continue ;
Host * h = g_hostdb . getHost ( i ) ;
if ( ! h ) continue ;
sb . safePrintf ( " <a href=http://%s:%i/crawlbot?c=%s> "
" %i</a> "
, iptoa ( h - > m_ip )
, ( int ) h - > m_httpPort
, cr - > m_coll
, ( int ) i
) ;
}
sb . safePrintf ( " </tr> \n " ) ;
sb . safePrintf (
2013-10-16 16:27:24 -07:00
2013-09-13 16:22:07 -07:00
// this will have to be in crawlinfo too!
//"<tr>"
//"<td><b>pages indexed</b>"
2014-11-10 14:45:11 -08:00
//"<td>%"INT64"</td>"
2013-09-13 16:22:07 -07:00
//"</tr>"
2013-09-13 17:34:39 -07:00
" <tr> "
2013-09-16 11:22:07 -07:00
" <td><b>Objects Found</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2013-09-16 15:18:55 -07:00
" </tr> "
" <tr> "
2013-09-30 14:25:33 -06:00
" <td><b>URLs Harvested</b> (inc. dups)</td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2013-09-16 15:18:55 -07:00
" </tr> "
2013-10-28 22:38:15 -07:00
//"<tr>"
//"<td><b>URLs Examined</b></td>"
2014-11-10 14:45:11 -08:00
//"<td>%"INT64"</td>"
2013-10-28 22:38:15 -07:00
//"</tr>"
2013-09-16 15:18:55 -07:00
" <tr> "
2013-10-30 16:14:30 -07:00
" <td><b>Page Crawl Attempts</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2013-09-16 15:18:55 -07:00
" </tr> "
" <tr> "
2013-10-30 16:14:30 -07:00
" <td><b>Page Crawl Successes</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2013-09-16 15:18:55 -07:00
" </tr> "
2014-01-23 13:23:09 -08:00
" <tr> "
" <td><b>Page Crawl Successes This Round</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2014-01-23 13:23:09 -08:00
" </tr> "
2013-09-16 15:18:55 -07:00
" <tr> "
" <td><b>Page Process Attempts</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2013-09-16 15:18:55 -07:00
" </tr> "
" <tr> "
" <td><b>Page Process Successes</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2013-09-16 15:18:55 -07:00
" </tr> "
2014-01-23 13:23:09 -08:00
" <tr> "
" <td><b>Page Process Successes This Round</b></td> "
2014-11-10 14:45:11 -08:00
" <td>% " INT64 " </td> "
2014-01-23 13:23:09 -08:00
" </tr> "
2013-09-16 15:18:55 -07:00
2013-09-17 10:25:54 -07:00
, cr - > m_globalCrawlInfo . m_objectsAdded -
cr - > m_globalCrawlInfo . m_objectsDeleted
, cr - > m_globalCrawlInfo . m_urlsHarvested
2013-10-28 22:38:15 -07:00
//, cr->m_globalCrawlInfo.m_urlsConsidered
2013-09-17 10:25:54 -07:00
, cr - > m_globalCrawlInfo . m_pageDownloadAttempts
, cr - > m_globalCrawlInfo . m_pageDownloadSuccesses
2014-01-23 13:23:09 -08:00
, cr - > m_globalCrawlInfo . m_pageDownloadSuccessesThisRound
2013-09-17 10:25:54 -07:00
, cr - > m_globalCrawlInfo . m_pageProcessAttempts
, cr - > m_globalCrawlInfo . m_pageProcessSuccesses
2014-01-23 13:23:09 -08:00
, cr - > m_globalCrawlInfo . m_pageProcessSuccessesThisRound
2013-09-17 10:25:54 -07:00
) ;
2013-09-16 15:18:55 -07:00
2013-12-03 16:23:05 -08:00
2014-11-10 14:45:11 -08:00
uint32_t now = ( uint32_t ) getTimeGlobalNoCore ( ) ;
2014-02-06 14:25:44 -08:00
2013-12-03 16:23:05 -08:00
sb . safePrintf ( " <tr> "
" <td><b>Download Objects:</b> "
" </td><td> "
" <a href=/crawlbot/download/%s_data.csv> "
" csv</a> "
2014-02-06 14:25:44 -08:00
2013-12-03 16:23:05 -08:00
" "
2014-02-06 14:25:44 -08:00
2013-12-03 16:23:05 -08:00
" <a href=/crawlbot/download/%s_data.json> "
2014-02-06 14:25:44 -08:00
" json full dump</a> "
" "
, cr - > m_coll
, cr - > m_coll
) ;
sb . safePrintf (
// newest json on top of results
" <a href=/search?icc=1&format=json&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10000000&rand=% " UINT64 " &scores=0&id=1& "
2014-02-06 14:25:44 -08:00
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson "
" > "
" json full search (newest on top)</a> "
" "
// newest json on top of results, last 30 seconds only
2014-02-06 18:56:38 -08:00
" <a href=/search?icc=1&format=json& "
// disable site clustering
" sc=0& "
2014-06-17 18:50:42 -07:00
// doNOTdupcontentremoval:
" dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10000000&rand=% " UINT64 " &scores=0&id=1& "
2014-02-06 14:25:44 -08:00
" stream=1& " // stream results back as we get them
" q= "
2014-02-06 14:41:33 -08:00
// put NEWEST on top
2014-02-06 20:57:54 -08:00
" gbsortbyint%%3Agbspiderdate+ "
2014-02-06 14:25:44 -08:00
// min spider date = now - 30 seconds
2014-11-10 14:45:11 -08:00
" gbminint%%3Agbspiderdate%%3A% " INT32 " & "
2014-02-06 18:56:38 -08:00
//"debug=1"
" prepend=type%%3Ajson "
2014-02-06 14:25:44 -08:00
" > "
2014-02-06 14:41:33 -08:00
" json search (last 30 seconds)</a> "
2014-02-06 14:25:44 -08:00
2013-12-03 16:23:05 -08:00
" </td> "
" </tr> "
2014-02-06 14:25:44 -08:00
// json search with gbsortby:gbspiderdate
, cr - > m_coll
, rand64
2013-12-03 16:23:05 -08:00
2014-02-06 14:25:44 -08:00
// json search with gbmin:gbspiderdate
, cr - > m_coll
, rand64
2014-02-06 14:41:33 -08:00
, now - 30 // last 30 seconds
2014-02-06 14:25:44 -08:00
) ;
sb . safePrintf (
2013-12-03 16:23:05 -08:00
" <tr> "
" <td><b>Download Products:</b> "
" </td><td> "
// make it search.csv so excel opens it
" <a href=/search.csv?icc=1&format=csv&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10000000&rand=% " UINT64 " &scores=0&id=1& "
2013-12-03 16:23:05 -08:00
" q=gbrevsortby%%3Aproduct.offerPrice& "
" prepend=type%%3Ajson "
//"+type%%3Aproduct%%7C"
" > "
" csv</a> "
" "
" <a href=/search?icc=1&format=html&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10000000&rand=% " UINT64 " &scores=0&id=1& "
2013-12-03 16:23:05 -08:00
" q=gbrevsortby%%3Aproduct.offerPrice& "
" prepend=type%%3Ajson "
" > "
" html</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Download Urls:</b> "
" </td><td> "
" <a href=/crawlbot/download/%s_urls.csv> "
" csv</a> "
2015-08-29 14:28:01 -07:00
" <a href=/v3/crawl/download/%s_urls.csv> "
" new csv format</a> "
2015-12-15 16:22:53 -08:00
" <a href=/search?q=gbsortby "
" int%%3AgbssSpiderTime&n=50&c=%s> "
" last 50 download attempts</a> "
2015-08-29 14:28:01 -07:00
2013-12-03 16:23:05 -08:00
" </td> "
" </tr> "
" <tr> "
" <td><b>Latest Objects:</b> "
" </td><td> "
" <a href=/search.csv?icc=1&format=csv&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10&rand=% " UINT64 " &scores=0&id=1& "
2013-12-03 16:23:05 -08:00
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson "
" > "
" csv</a> "
" "
" <a href=/search?icc=1&format=html&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10rand=% " UINT64 " &scores=0&id=1& "
2013-12-03 16:23:05 -08:00
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson "
" > "
" html</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Latest Products:</b> "
" </td><td> "
" <a href=/search.csv?icc=1&format=csv&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10&rand=% " UINT64 " &scores=0&id=1& "
2013-12-03 16:23:05 -08:00
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson+type%%3Aproduct "
" > "
" csv</a> "
" "
" <a href=/search?icc=1&format=html&sc=0&dr=0& "
2014-11-10 14:45:11 -08:00
" c=%s&n=10&rand=% " UINT64 " &scores=0&id=1& "
2013-12-03 16:23:05 -08:00
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson+type%%3Aproduct "
" > "
" html</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Download Pages:</b> "
" </td><td> "
" <a href=/crawlbot/download/%s_pages.txt> "
" txt</a> "
//
" </td> "
" </tr> "
" </table> "
" </TD> "
2014-02-06 14:25:44 -08:00
// download products html
2013-12-03 16:23:05 -08:00
, cr - > m_coll
, rand64
, cr - > m_coll
, rand64
//, cr->m_coll
//, cr->m_coll
//, cr->m_coll
2015-08-29 14:28:01 -07:00
// urls.csv old
, cr - > m_coll
// urls.csv new format v3
2013-12-03 16:23:05 -08:00
, cr - > m_coll
2015-12-15 16:22:53 -08:00
// last 50 downloaded urls
, cr - > m_coll
2013-12-03 16:23:05 -08:00
// latest objects in html
, cr - > m_coll
, rand64
// latest objects in csv
, cr - > m_coll
, rand64
// latest products in html
, cr - > m_coll
, rand64
// latest products in csv
, cr - > m_coll
, rand64
// download pages
, cr - > m_coll
) ;
2013-09-17 15:32:28 -07:00
// spacer column
sb . safePrintf ( " <TD> "
2013-09-16 15:18:55 -07:00
" "
" "
" </TD> "
2013-09-17 15:32:28 -07:00
) ;
2013-09-16 15:18:55 -07:00
2013-09-17 15:32:28 -07:00
// what diffbot api to use?
2013-09-18 11:24:16 -07:00
/*
2013-09-17 15:32:28 -07:00
char * api = cr - > m_diffbotApi . getBufStart ( ) ;
2013-09-17 16:30:57 -07:00
char * s [ 10 ] ;
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < 10 ; i + + ) s [ i ] = " " ;
2013-09-17 16:38:56 -07:00
if ( api & & strcmp ( api , " all " ) = = 0 ) s [ 0 ] = " selected " ;
if ( api & & strcmp ( api , " article " ) = = 0 ) s [ 1 ] = " selected " ;
if ( api & & strcmp ( api , " product " ) = = 0 ) s [ 2 ] = " selected " ;
if ( api & & strcmp ( api , " image " ) = = 0 ) s [ 3 ] = " selected " ;
if ( api & & strcmp ( api , " frontpage " ) = = 0 ) s [ 4 ] = " selected " ;
if ( api & & strcmp ( api , " none " ) = = 0 ) s [ 5 ] = " selected " ;
if ( ! api | | ! api [ 0 ] ) s [ 5 ] = " selected " ;
2013-09-18 11:24:16 -07:00
*/
2013-09-17 15:32:28 -07:00
sb . safePrintf ( " <TD valign=top> "
2013-09-16 15:18:55 -07:00
" <table cellpadding=5 border=0> "
2013-09-18 11:24:16 -07:00
/*
2013-09-16 15:18:55 -07:00
" <tr> "
2013-09-17 15:32:28 -07:00
" <td> "
" Diffbot API "
" </td><td> "
" <select name=diffbotapi> "
" <option value=all%s>All</option> "
" <option value=article%s>Article</option> "
" <option value=product%s>Product</option> "
" <option value=image%s>Image</option> "
" <option value=frontpage%s>FrontPage</option> "
2013-09-17 16:30:57 -07:00
" <option value=none%s>None</option> "
2013-09-17 15:32:28 -07:00
" </select> "
" </td> "
2013-09-17 15:59:50 -07:00
" </tr> "
2013-09-17 15:32:28 -07:00
, s [ 0 ]
, s [ 1 ]
, s [ 2 ]
, s [ 3 ]
, s [ 4 ]
2013-09-17 16:30:57 -07:00
, s [ 5 ]
2013-09-18 11:24:16 -07:00
*/
2013-09-17 15:32:28 -07:00
) ;
2013-09-28 14:17:43 -06:00
//char *alias = "";
//if ( cr->m_collectionNameAlias.length() > 0 )
// alias = cr->m_collectionNameAlias.getBufStart();
//char *aliasResponse = "";
//if ( alias && ! isAliasUnique(cr,token,alias) )
// aliasResponse = "<br><font size=1 color=red>"
// "Alias not unique</font>";
2013-09-26 14:50:34 -06:00
2013-09-27 12:17:22 -06:00
char * urtYes = " checked " ;
char * urtNo = " " ;
if ( ! cr - > m_useRobotsTxt ) {
urtYes = " " ;
urtNo = " checked " ;
}
2014-02-27 19:53:17 -08:00
/*
2013-10-29 09:31:57 -07:00
char * rdomYes = " checked " ;
char * rdomNo = " " ;
if ( ! cr - > m_restrictDomain ) {
rdomYes = " " ;
rdomNo = " checked " ;
}
2014-02-27 19:53:17 -08:00
*/
2013-10-29 09:31:57 -07:00
2013-10-25 11:14:56 -07:00
char * isNewYes = " " ;
char * isNewNo = " checked " ;
2014-02-10 16:23:39 -08:00
if ( cr - > m_diffbotOnlyProcessIfNewUrl ) {
2013-10-25 11:14:56 -07:00
isNewYes = " checked " ;
isNewNo = " " ;
}
2013-11-20 16:41:28 -08:00
char * api = cr - > m_diffbotApiUrl . getBufStart ( ) ;
if ( ! api ) api = " " ;
SafeBuf apiUrl ;
apiUrl . htmlEncode ( api , gbstrlen ( api ) , true , 0 ) ;
apiUrl . nullTerm ( ) ;
char * px1 = cr - > m_diffbotUrlCrawlPattern . getBufStart ( ) ;
if ( ! px1 ) px1 = " " ;
SafeBuf ppp1 ;
ppp1 . htmlEncode ( px1 , gbstrlen ( px1 ) , true , 0 ) ;
ppp1 . nullTerm ( ) ;
char * px2 = cr - > m_diffbotUrlProcessPattern . getBufStart ( ) ;
if ( ! px2 ) px2 = " " ;
SafeBuf ppp2 ;
ppp2 . htmlEncode ( px2 , gbstrlen ( px2 ) , true , 0 ) ;
ppp2 . nullTerm ( ) ;
char * px3 = cr - > m_diffbotPageProcessPattern . getBufStart ( ) ;
if ( ! px3 ) px3 = " " ;
SafeBuf ppp3 ;
ppp3 . htmlEncode ( px3 , gbstrlen ( px3 ) , true , 0 ) ;
ppp3 . nullTerm ( ) ;
2013-12-03 16:23:05 -08:00
char * rx1 = cr - > m_diffbotUrlCrawlRegEx . getBufStart ( ) ;
if ( ! rx1 ) rx1 = " " ;
SafeBuf rrr1 ;
rrr1 . htmlEncode ( rx1 , gbstrlen ( rx1 ) , true , 0 ) ;
char * rx2 = cr - > m_diffbotUrlProcessRegEx . getBufStart ( ) ;
if ( ! rx2 ) rx2 = " " ;
SafeBuf rrr2 ;
rrr2 . htmlEncode ( rx2 , gbstrlen ( rx2 ) , true , 0 ) ;
2013-10-09 15:24:35 -06:00
char * notifEmail = cr - > m_notifyEmail . getBufStart ( ) ;
char * notifUrl = cr - > m_notifyUrl . getBufStart ( ) ;
if ( ! notifEmail ) notifEmail = " " ;
if ( ! notifUrl ) notifUrl = " " ;
2013-09-27 12:17:22 -06:00
2013-09-17 15:32:28 -07:00
sb . safePrintf (
2013-09-16 15:18:55 -07:00
2013-09-13 18:10:03 -07:00
//
2013-09-16 15:18:55 -07:00
//
2013-10-15 16:57:34 -07:00
" <tr> "
" <td><b>Repeat Crawl:</b> "
" </td><td> "
2013-11-11 15:52:04 -08:00
" <input type=text name=repeat "
2013-10-21 15:06:23 -07:00
" size=10 value= \" %f \" > "
2013-10-15 16:57:34 -07:00
" <input type=submit name=submit value=OK> "
" days "
" </td> "
" </tr> "
2013-11-20 16:41:28 -08:00
" <tr> "
" <td><b>Diffbot API Url:</b> "
" </td><td> "
" <input type=text name=apiUrl "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Url Crawl Pattern:</b> "
" </td><td> "
" <input type=text name=urlCrawlPattern "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Url Process Pattern:</b> "
" </td><td> "
" <input type=text name=urlProcessPattern "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-10-08 17:08:58 -07:00
" <tr> "
" <td><b>Page Process Pattern:</b> "
" </td><td> "
2013-10-14 18:19:59 -06:00
" <input type=text name=pageProcessPattern "
2013-10-08 17:08:58 -07:00
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-12-03 16:23:05 -08:00
" <tr> "
" <td><b>Url Crawl RegEx:</b> "
" </td><td> "
" <input type=text name=urlCrawlRegEx "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Url Process RegEx:</b> "
" </td><td> "
" <input type=text name=urlProcessRegEx "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-10-25 11:14:56 -07:00
" <tr> "
2014-10-31 16:34:31 -07:00
" <td><b>Max hopcount to seeds:</b> "
" </td><td> "
2014-11-04 16:05:20 -08:00
" <input type=text name=maxHops "
2014-11-20 16:53:07 -08:00
" size=9 value=% " INT32 " > "
2014-10-31 16:34:31 -07:00
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Only Process If New:</b> "
2013-10-25 11:14:56 -07:00
" </td><td> "
2013-11-04 13:57:44 -08:00
" <input type=radio name=onlyProcessIfNew "
2013-10-25 11:14:56 -07:00
" value=1%s> yes "
2013-11-04 13:57:44 -08:00
" <input type=radio name=onlyProcessIfNew "
2013-10-25 11:14:56 -07:00
" value=0%s> no "
" </td> "
" </tr> "
" <tr> "
2013-10-28 21:20:44 -07:00
" <td><b>Crawl Delay (seconds):</b> "
2013-10-25 11:14:56 -07:00
" </td><td> "
2013-10-28 21:20:44 -07:00
" <input type=text name=crawlDelay "
" size=9 value=%f> "
2013-10-25 11:14:56 -07:00
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-09-16 15:18:55 -07:00
" <tr> "
2013-10-30 16:14:30 -07:00
" <td><b>Max Page Crawl Successes:</b> "
2013-09-16 15:18:55 -07:00
" </td><td> "
2013-10-14 18:19:59 -06:00
" <input type=text name=maxToCrawl "
2014-11-10 14:45:11 -08:00
" size=9 value=% " INT64 " > "
2013-09-16 11:42:04 -07:00
" <input type=submit name=submit value=OK> "
2013-09-16 15:18:55 -07:00
" </td> "
2013-09-13 16:22:07 -07:00
" </tr> "
2013-10-01 17:30:06 -06:00
2013-09-13 17:34:39 -07:00
" <tr> "
2013-09-16 15:18:55 -07:00
" <td><b>Max Page Process Successes:</b> "
" </td><td> "
2013-10-14 18:19:59 -06:00
" <input type=text name=maxToProcess "
2014-11-10 14:45:11 -08:00
" size=9 value=% " INT64 " > "
2013-09-16 11:42:04 -07:00
" <input type=submit name=submit value=OK> "
2013-09-16 15:18:55 -07:00
" </td> "
2013-09-13 17:34:39 -07:00
" </tr> "
2013-09-13 18:10:03 -07:00
2013-10-23 11:40:30 -07:00
" <tr> "
2013-11-11 15:52:04 -08:00
" <td><b>Max Rounds:</b> "
2013-10-23 11:40:30 -07:00
" </td><td> "
2013-11-11 15:52:04 -08:00
" <input type=text name=maxRounds "
2014-11-10 14:45:11 -08:00
" size=9 value=% " INT32 " > "
2013-10-23 11:40:30 -07:00
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-10-01 17:30:06 -06:00
" <tr> "
" <td><b>Notification Email:</b> "
" </td><td> "
2013-10-14 18:19:59 -06:00
" <input type=text name=notifyEmail "
2013-10-01 17:30:06 -06:00
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Notification URL:</b> "
" </td><td> "
2013-10-30 13:39:10 -07:00
" <input type=text name=notifyWebhook "
2013-10-01 17:30:06 -06:00
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-09-16 15:18:55 -07:00
" <tr><td> "
2013-10-29 16:37:14 -07:00
" <b>Use Robots.txt when crawling?</b> "
2013-09-16 15:18:55 -07:00
" </td><td> "
2013-10-15 11:54:54 -06:00
" <input type=radio name=obeyRobots "
2013-09-27 12:17:22 -06:00
" value=1%s> yes "
2013-10-15 11:54:54 -06:00
" <input type=radio name=obeyRobots "
2013-09-27 12:17:22 -06:00
" value=0%s> no "
2013-09-16 15:18:55 -07:00
" </td> "
" </tr> "
2014-02-27 19:53:17 -08:00
//"<tr><td>"
//"<b>Restrict domain to seeds?</b> "
//"</td><td>"
//"<input type=radio name=restrictDomain "
//"value=1%s> yes "
//"<input type=radio name=restrictDomain "
//"value=0%s> no "
//"</td>"
//"</tr>"
2013-10-29 09:31:57 -07:00
2013-09-27 12:17:22 -06:00
//"<tr><td>"
//"Use spider proxies on AWS? "
//"</td><td>"
//"<input type=checkbox name=usefloaters checked>
//"</td>"
//"</tr>"
2013-09-16 15:18:55 -07:00
2013-09-13 16:22:07 -07:00
" </table> "
2013-09-16 15:18:55 -07:00
" </TD> "
" </TR> "
" </TABLE> "
2013-09-26 14:50:34 -06:00
2013-10-15 16:57:34 -07:00
, cr - > m_collectiveRespiderFrequency
2013-11-20 16:41:28 -08:00
, apiUrl . getBufStart ( )
, ppp1 . getBufStart ( )
, ppp2 . getBufStart ( )
, ppp3 . getBufStart ( )
2013-10-08 17:08:58 -07:00
2013-12-03 16:23:05 -08:00
, rrr1 . getBufStart ( )
, rrr2 . getBufStart ( )
2014-10-31 16:34:31 -07:00
2014-11-04 16:05:20 -08:00
, cr - > m_diffbotMaxHops
2013-12-03 16:23:05 -08:00
2013-10-25 11:14:56 -07:00
, isNewYes
, isNewNo
2013-10-28 21:20:44 -07:00
, cr - > m_collectiveCrawlDelay
2013-10-25 11:14:56 -07:00
2013-10-23 11:40:30 -07:00
, cr - > m_maxToCrawl
, cr - > m_maxToProcess
2014-11-10 14:45:11 -08:00
, ( int32_t ) cr - > m_maxCrawlRounds
2013-09-13 18:10:03 -07:00
2013-10-09 15:24:35 -06:00
, notifEmail
, notifUrl
2013-10-01 17:30:06 -06:00
2013-09-27 12:17:22 -06:00
, urtYes
, urtNo
2013-10-29 09:31:57 -07:00
2014-02-27 19:53:17 -08:00
//, rdomYes
//, rdomNo
2013-10-29 09:31:57 -07:00
2013-09-13 16:22:07 -07:00
) ;
}
2013-09-16 10:16:49 -07:00
2013-09-13 16:22:07 -07:00
// xml or json does not show the input boxes
2014-04-05 18:09:04 -07:00
//if ( format != FORMAT_HTML )
2013-09-16 15:33:45 -07:00
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
// -1 ); // cachetime
2013-09-13 17:34:39 -07:00
2013-09-16 14:29:01 -07:00
//
// print url filters. use "multimedia" to handle jpg etc.
//
2013-09-16 15:00:43 -07:00
// use "notindexable" for images/movies/css etc.
// add a "process" column to send to diffbot...
//
//
2013-09-18 12:38:05 -07:00
2013-10-16 12:12:22 -07:00
/*
2013-09-18 12:38:05 -07:00
char * s1 = " Show " ;
char * s2 = " none " ;
2013-09-18 13:50:55 -07:00
if ( hr - > getLongFromCookie ( " showtable " , 0 ) ) {
2013-09-18 12:38:05 -07:00
s1 = " Hide " ;
s2 = " " ;
}
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML )
2013-09-25 15:37:20 -06:00
sb . safePrintf (
" <a onclick= "
" \" "
" var e = document.getElementById('filters'); "
" var m = document.getElementById('msg'); "
" if ( e.style.display == 'none' ){ "
" e.style.display = ''; "
" m.innerHTML='Hide URL Filters Table'; "
" document.cookie = 'showtable=1;'; "
" } "
" else { "
" e.style.display = 'none'; "
" m.innerHTML='Show URL Filters Table'; "
" document.cookie = 'showtable=0;'; "
" } "
" \" "
" "
" style= "
" cursor:hand; "
" cursor:pointer; "
" color:blue;> "
" <u><b> "
" <div id=msg> "
" %s URL Filters Table "
" </div> "
" </b></u> "
" </a> "
" <div id=filters style=display:%s;> "
" <form method=get action=/crawlbot> "
" <input type=hidden name=c value= \" %s \" > "
" <input type=hidden name=showtable value=1> "
, s1
, s2
, cr - > m_coll
) ;
2013-09-16 15:33:45 -07:00
2013-09-16 16:27:48 -07:00
//
// print url filters. HACKy...
//
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML )
2013-09-25 15:37:20 -06:00
g_parms . sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
NULL ,
& sb ,
cr - > m_coll , // coll override
false ) ; // isJSON?
2013-09-16 16:27:48 -07:00
//
// end HACKy hack
//
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML )
2013-09-25 15:37:20 -06:00
sb . safePrintf (
" </form> "
" </div> "
" <br> "
" <br> "
) ;
2013-10-16 12:12:22 -07:00
*/
2013-09-16 14:29:01 -07:00
2013-09-13 16:22:07 -07:00
//
// add search box to your site
//
2013-09-16 16:27:48 -07:00
/*
2013-09-13 16:22:07 -07:00
sb . safePrintf ( " <br> "
" <table> "
" <tr> "
" <td><a onclick=unhide();> "
" Add this search box to your site "
" </a> "
" </td> "
" </tr> "
" </table> " ) ;
2013-09-16 16:27:48 -07:00
*/
2013-09-13 16:22:07 -07:00
//
2013-10-16 12:12:22 -07:00
// show simpler url filters table
2013-09-13 16:22:07 -07:00
//
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-11-22 17:37:42 -08:00
/*
2013-10-16 12:12:22 -07:00
sb . safePrintf ( " <table> "
2013-10-16 14:13:28 -07:00
" <tr><td colspan=2> "
" <b>URL Filters</b> "
" </td></tr> \n "
2013-10-16 12:12:22 -07:00
) ;
// true means its html input
printUrlFilters ( sb , cr , fmt ) ;
// for adding new rule
sb . safePrintf ( " <tr> "
" <td>Expression "
" <input type=text name=expression size=30 "
" value= \" \" > "
" </td><td> "
" Action <input type=text name=action size=50 "
" value= \" \" > "
2013-10-16 14:13:28 -07:00
" "
" <input type=submit name=submit value=OK> "
2013-10-16 12:12:22 -07:00
" </td> "
" </tr> \n "
) ;
//sb.safePrintf("<tr><td colspan=2><font size=-1><i>U
sb . safePrintf ( " </table> \n " ) ;
2013-11-22 17:37:42 -08:00
*/
2013-10-16 12:12:22 -07:00
//
// END THE BIG FORM
//
sb . safePrintf ( " </form> " ) ;
}
2013-09-13 16:22:07 -07:00
2013-10-16 12:12:22 -07:00
//
// show reset and delete crawl buttons
//
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_HTML ) {
2013-09-25 15:37:20 -06:00
sb . safePrintf (
" <table cellpadding=5> "
" <tr> "
2013-09-17 12:21:09 -07:00
2013-09-25 15:37:20 -06:00
" <td> "
2013-09-17 11:27:31 -07:00
2013-09-17 12:21:09 -07:00
2013-09-25 15:37:20 -06:00
// reset collection form
" <form method=get action=/crawlbot> "
2013-10-15 14:08:55 -07:00
" %s " // hidden tags
, hb . getBufStart ( )
2013-09-25 15:37:20 -06:00
) ;
2013-10-15 12:40:56 -07:00
sb . safePrintf (
2013-09-25 15:37:20 -06:00
2013-12-17 10:53:12 -08:00
" <input type=hidden name=reset value=1> "
2013-09-25 15:37:20 -06:00
// also show it in the display, so set "c"
" <input type=submit name=button value= \" "
" Reset this collection \" > "
" </form> "
// end reset collection form
" </td> "
2013-09-17 12:21:09 -07:00
2013-09-25 15:37:20 -06:00
" <td> "
2013-09-17 12:21:09 -07:00
2013-09-25 15:37:20 -06:00
// delete collection form
" <form method=get action=/crawlbot> "
2013-10-15 14:08:55 -07:00
" %s "
2014-11-10 14:45:11 -08:00
//, (int32_t)cr->m_collnum
2013-10-15 14:08:55 -07:00
, hb . getBufStart ( )
2013-09-25 15:37:20 -06:00
) ;
2013-09-17 12:21:09 -07:00
2013-10-15 12:40:56 -07:00
sb . safePrintf (
2013-12-17 10:53:12 -08:00
" <input type=hidden name=delete value=1> "
2013-09-25 15:37:20 -06:00
" <input type=submit name=button value= \" "
" Delete this collection \" > "
" </form> "
// end delete collection form
" </td> "
2013-09-13 16:22:07 -07:00
2013-11-14 14:07:45 -08:00
// restart collection form
" <td> "
" <form method=get action=/crawlbot> "
" %s "
2013-12-17 10:53:12 -08:00
" <input type=hidden name=restart value=1> "
2013-11-14 14:07:45 -08:00
" <input type=submit name=button value= \" "
" Restart this collection \" > "
" </form> "
" </td> "
2014-07-28 13:26:17 -07:00
// restart spider round form
" <td> "
" <form method=get action=/crawlbot> "
" %s "
" <input type=hidden name=roundStart value=1> "
" <input type=submit name=button value= \" "
" Restart spider round \" > "
" </form> "
" </td> "
2013-09-25 15:37:20 -06:00
" </tr> "
" </table> "
2013-11-14 14:07:45 -08:00
2014-11-10 14:45:11 -08:00
//, (int32_t)cr->m_collnum
2013-11-14 14:07:45 -08:00
, hb . getBufStart ( )
2014-07-28 13:26:17 -07:00
, hb . getBufStart ( )
2014-11-10 14:45:11 -08:00
//, (int32_t)cr->m_collnum
2013-09-25 15:37:20 -06:00
) ;
}
2013-09-13 16:22:07 -07:00
2013-10-16 12:12:22 -07:00
2013-10-16 17:17:28 -07:00
// the ROOT JSON }
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON )
2013-10-16 17:17:28 -07:00
sb . safePrintf ( " } \n " ) ;
2013-10-16 12:12:22 -07:00
2013-10-15 11:50:57 -07:00
char * ct = " text/html " ;
2014-04-05 18:09:04 -07:00
if ( fmt = = FORMAT_JSON ) ct = " application/json " ;
if ( fmt = = FORMAT_XML ) ct = " text/xml " ;
if ( fmt = = FORMAT_CSV ) ct = " text/csv " ;
2013-10-15 11:50:57 -07:00
2013-09-25 15:37:20 -06:00
// this could be in html json or xml
return g_httpServer . sendDynamicPage ( socket ,
2013-09-13 16:22:07 -07:00
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-10-15 11:50:57 -07:00
- 1 , // cachetime
false ,
ct ) ;
2013-09-13 16:22:07 -07:00
/*
" <h1>API for Diffbot</h1> "
" <form action=/api/diffbot> "
" <input type=text name=url size=100> "
" <input type=submit name=inject value= \" Inject \" > "
" </form> "
" <br> "
" <h1>API for Crawlbot</h1> "
// "<form id=\"addCrawl\" onSubmit=\"addCrawlFromForm(); return false;\">"
" <form action=/api/startcrawl method=get> "
" <div class= \" control-group well \" > "
" <div id= \" apiSelection \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" Token: <input type=text name=token><br><br> "
" API: <input type=text name=api> <i>(article, product)</i><br><br> "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >Page-type</label></div> "
" <div class= \" input-append span7 \" > "
" <select id= \" apiSelect \" name= \" api \" class= \" span2 \" value= \" sds \" > "
" <option value= \" \" disabled= \" disabled \" selected= \" selected \" >Select pages to process and extract</option> "
" <option class= \" automatic \" value= \" article \" >Article</option> "
" <option class= \" automatic \" value= \" frontpage \" >Frontpage</option> "
" <option class= \" automatic \" value= \" image \" >Image</option> "
" <option class= \" automatic \" value= \" product \" >Product</option> "
" </select> "
" <span id= \" formError-apiSelect \" class= \" formError \" >Page-type is required</span> "
" <span class= \" inputNote \" >API calls will be made using your current token.</span> "
" </div> "
" </div> "
" </div> "
" <div id= \" apiQueryString \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >API Querystring</label></div> "
" <div class= \" input-prepend span7 \" > "
" <span class= \" add-on \" >?</span><input class= \" span6 search-input \" name= \" apiQueryString \" size= \" 16 \" type= \" text \" placeholder= \" Enter a querystring to specify Diffbot API parameters \" > "
" </div> "
" </div> "
" </div> "
" <hr> "
" <div id= \" seedUrl \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >Seed URL</label></div> "
" <div class= \" input-append span7 \" > "
" <input class= \" span6 search-input \" name= \" seed \" size= \" 16 \" type= \" text \" placeholder= \" Enter a seed URL \" > "
" <span id= \" formError-seedUrl \" class= \" formError \" ><br>Seed URL is required</span> "
" </div> "
" </div> "
" </div> "
" <hr> "
" <div id= \" headerRow \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" ><strong>Crawl Filters</strong></label></div> "
" </div> "
" </div> "
" <div id= \" urlCrawlPattern \" class= \" titleColumn \" > "
" <div class= \" regex-edit row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >URL Regex</label></div> "
" <div class= \" input-append span7 \" > "
" <input class= \" span6 \" name= \" urlCrawlPattern \" size= \" 16 \" type= \" text \" placeholder= \" Only crawl pages whose URLs match this regex \" value= \" \" > "
" <span class= \" inputNote \" >Diffbot uses <a href= \" http://www.regular-expressions.info/refflavors.html \" target= \" _blank \" >Java regex syntax</a>. Be sure to escape your characters.</span> "
" </div> "
" </div> "
" </div> "
" <div id= \" maxCrawled \" class= \" titleColumn \" > "
" <div class= \" regex-edit row \" ><div class= \" span2 \" ><label class= \" on-default-hide \" >Max Pages Crawled</label></div> <div class= \" input-append span7 \" > <input class= \" span1 \" name= \" maxCrawled \" size= \" \" type= \" text \" value= \" \" > </div> </div> </div> <div id= \" headerRow \" class= \" titleColumn \" > <div class= \" row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" ><strong>Processing Filters</strong></label></div> </div> </div> <div id= \" classify \" class= \" titleColumn \" > <div class= \" row \" > <div class= \" span2 \" id= \" smartProcessLabel \" ><label class= \" on-default-hide \" >Smart Processing</label></div> <div class= \" span7 \" ><label class= \" checkbox \" ><input id= \" smartProcessing \" type= \" checkbox \" name= \" classify \" ><span id= \" smartProcessAutomatic \" >Only process pages that match the selected page-type. Uses <a href= \" /our-apis/classifier \" >Page Classifier API</a>.</span><span id= \" smartProcessCustom \" >Smart Processing only operates with Diffbot <a href= \" /products/automatic \" >Automatic APIs.</a></span></label></div> </div> </div> <div id= \" urlProcessPattern \" class= \" titleColumn \" > <div class= \" regex-edit row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" >URL Regex</label></div> <div class= \" input-append span7 \" > <input class= \" span6 \" name= \" urlProcessPattern \" size= \" 16 \" type= \" text \" placeholder= \" Only process pages whose URLs match this regex \" value= \" \" > </div> </div> </div> <div id= \" pageProcessPattern \" class= \" titleColumn \" > <div class= \" regex-edit row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" >Page-Content Regex</label></div> <div class= \" input-append span7 \" > <input class= \" span6 \" name= \" pageProcessPattern \" size= \" 16 \" type= \" text \" placeholder= \" Only process pages whose content contains a match to this regex \" value= \" \" > </div> </div> </div> <div id= \" maxMatches \" class= \" titleColumn \" > <div class= \" regex-edit row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" >Max Pages Processed</label></div> <div class= \" input-append span7 \" > <input class= \" span1 \" name= \" maxProcessed \" size= \" 16 \" type= \" text \" value= \" \" > </div> </div> </div> <hr> <div class= \" controls row \" > <div class= \" span2 \" > </div> <div class= \" span7 \" id= \" startCrawlButtons \" > <button id= \" testButton \" class= \" btn \" type= \" button \" onclick= \" testcrawl(formToData());clicky.log('/dev/crawl#testCrawl','Test Crawl'); \" >Test</button> "
" <!--<button id= \" submitButton \" class= \" btn btn-info \" type= \" button \" onclick= \" addCrawlFromForm() \" >Start Crawl</button>--> "
" <input type=submit name=start value= \" Start Crawl \" > "
" </div> </div> </div> <div id= \" hiddenTestDiv \" style= \" display: none; \" ></div> </form> </div><!-- end Crawler tab --> " ) ;
*/
}
2013-10-21 17:35:14 -07:00
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
2014-11-10 14:45:11 -08:00
int32_t isInSeedBuf ( CollectionRec * cr , char * url , int len ) {
2013-10-21 17:35:14 -07:00
HashTableX * ht = & cr - > m_seedHashTable ;
// if table is empty, populate it
if ( ht - > m_numSlotsUsed < = 0 ) {
// initialize the hash table
if ( ! ht - > set ( 8 , 0 , 1024 , NULL , 0 , false , 1 , " seedtbl " ) )
return - 1 ;
// populate it from list of seed urls
char * p = cr - > m_diffbotSeeds . getBufStart ( ) ;
for ( ; p & & * p ; ) {
// get url
char * purl = p ;
// advance to next
for ( ; * p & & ! is_wspace_a ( * p ) ; p + + ) ;
// mark the end of the url
char * end = p ;
// skip possible white space. might be \0.
if ( * p ) p + + ;
// hash it
2014-10-30 13:36:39 -06:00
int64_t h64 = hash64 ( purl , end - purl ) ;
2013-10-21 17:35:14 -07:00
if ( ! ht - > addKey ( & h64 ) ) return - 1 ;
}
}
// is this url in the hash table?
2014-10-30 13:36:39 -06:00
int64_t u64 = hash64 ( url , len ) ;
2013-10-21 17:35:14 -07:00
if ( ht - > isInTable ( & u64 ) ) return 1 ;
// add it to hashtable
if ( ! ht - > addKey ( & u64 ) ) return - 1 ;
// WAS not in table
return 0 ;
}
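// Illustrative only (the real caller is getSpiderRequestMetaList() below):
// how the 0/1/-1 contract above is meant to be used when appending seeds:
//
//   int32_t status = isInSeedBuf ( cr , url , len );
//   if ( status == -1 ) { /* hash table error, g_errno is set */ }
//   if ( status ==  1 ) { /* duplicate seed, skip it            */ }
//   if ( status ==  0 ) { /* new seed, append to m_diffbotSeeds */ }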
2013-09-25 16:04:16 -06:00
// just use "fakeips" based on the hash of each url hostname/subdomain
// so we don't waste time doing ip lookups.
2013-09-25 17:51:43 -06:00
bool getSpiderRequestMetaList ( char * doc ,
SafeBuf * listBuf ,
2013-10-21 17:35:14 -07:00
bool spiderLinks ,
CollectionRec * cr ) {
2013-10-18 11:53:14 -07:00
if ( ! doc ) return true ;
2013-09-25 16:04:16 -06:00
// . scan the list of urls
// . assume separated by white space \n \t or space
char * p = doc ;
2014-11-10 14:45:11 -08:00
uint32_t now = ( uint32_t ) getTimeGlobal ( ) ;
2013-09-25 16:04:16 -06:00
// a big loop
while ( true ) {
// skip white space (\0 is not a whitespace)
for ( ; is_wspace_a ( * p ) ; p + + ) ;
// all done?
if ( ! * p ) break ;
// save it
char * saved = p ;
// advance to next white space
for ( ; ! is_wspace_a ( * p ) & & * p ; p + + ) ;
// set end
char * end = p ;
// get that url
Url url ;
url . set ( saved , end - saved ) ;
// if not legit skip
if ( url . getUrlLen ( ) < = 0 ) continue ;
// need this
2014-10-30 13:36:39 -06:00
int64_t probDocId = g_titledb . getProbableDocId ( & url ) ;
2013-09-25 16:04:16 -06:00
// make it
SpiderRequest sreq ;
sreq . reset ( ) ;
sreq . m_firstIp = url . getHostHash32 ( ) ; // fakeip!
2014-01-17 21:01:43 -08:00
// avoid ips of 0 or -1
if ( sreq . m_firstIp = = 0 | | sreq . m_firstIp = = - 1 )
sreq . m_firstIp = 1 ;
2013-09-25 16:04:16 -06:00
sreq . m_hostHash32 = url . getHostHash32 ( ) ;
sreq . m_domHash32 = url . getDomainHash32 ( ) ;
sreq . m_siteHash32 = url . getHostHash32 ( ) ;
2015-03-19 16:17:36 -06:00
//sreq.m_probDocId = probDocId;
2013-09-25 16:04:16 -06:00
sreq . m_hopCount = 0 ; // we're a seed
sreq . m_hopCountValid = true ;
sreq . m_addedTime = now ;
sreq . m_isNewOutlink = 1 ;
sreq . m_isWWWSubdomain = url . isSimpleSubdomain ( ) ;
// treat seed urls as being on same domain and hostname
sreq . m_sameDom = 1 ;
sreq . m_sameHost = 1 ;
sreq . m_sameSite = 1 ;
2013-09-25 17:51:43 -06:00
sreq . m_fakeFirstIp = 1 ;
sreq . m_isAddUrl = 1 ;
// spider links?
if ( ! spiderLinks )
sreq . m_avoidSpiderLinks = 1 ;
2013-09-25 16:04:16 -06:00
// save the url!
strcpy ( sreq . m_url , url . getUrl ( ) ) ;
// finally, we can set the key. isDel = false
sreq . setKey ( sreq . m_firstIp , probDocId , false ) ;
2013-12-18 17:20:53 -08:00
2014-11-10 14:45:11 -08:00
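// make sure listBuf can hold the rdbid byte plus this SpiderRequest
// (plus some slack); grow in ~100KB steps so we do not call reserve()
// once per seed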
int32_t oldBufSize = listBuf - > getCapacity ( ) ;
int32_t need = listBuf - > getLength ( ) + 100 + sreq . getRecSize ( ) ;
int32_t newBufSize = 0 ;
2014-02-25 13:53:41 -08:00
if ( need > oldBufSize ) newBufSize = oldBufSize + 100000 ;
if ( newBufSize & & ! listBuf - > reserve ( newBufSize ) )
2013-12-18 17:20:53 -08:00
// return false with g_errno set
return false ;
2013-09-25 16:04:16 -06:00
// store rdbid first
if ( ! listBuf - > pushChar ( RDB_SPIDERDB ) )
// return false with g_errno set
return false ;
// store it
if ( ! listBuf - > safeMemcpy ( & sreq , sreq . getRecSize ( ) ) )
// return false with g_errno set
return false ;
2013-10-21 17:35:14 -07:00
if ( ! cr ) continue ;
// do not add dups into m_diffbotSeeds safebuf
2014-11-10 14:45:11 -08:00
int32_t status = isInSeedBuf ( cr , saved , end - saved ) ;
2013-10-21 17:35:14 -07:00
// error?
if ( status = = - 1 ) {
log ( " crawlbot: error adding seed to table: %s " ,
mstrerror ( g_errno ) ) ;
return true ;
}
// already in buf
if ( status = = 1 ) continue ;
// add url into m_diffbotSeeds, \n separated list
if ( cr - > m_diffbotSeeds . length ( ) )
// use a space instead of \n so the seed list reads better in the
// json output
cr - > m_diffbotSeeds . pushChar ( ' ' ) ; // \n
cr - > m_diffbotSeeds . safeMemcpy ( url . getUrl ( ) , url . getUrlLen ( ) ) ;
cr - > m_diffbotSeeds . nullTerm ( ) ;
2013-09-25 16:04:16 -06:00
}
// all done
return true ;
}
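// A minimal standalone sketch (hypothetical helper, not called anywhere):
// the "fake ip" trick above is just "hash the hostname to 32 bits, then
// avoid the reserved values 0 and -1", so seed urls can be keyed by a
// first ip without doing a real dns lookup.
static int32_t fakeFirstIpForHost ( const char *host ) {
	// any stable 32-bit hash of the hostname works; this one is FNV-1a
	uint32_t h = 2166136261u;
	for ( const char *p = host ; p && *p ; p++ ) {
		h ^= (unsigned char)*p;
		h *= 16777619u;
	}
	int32_t ip = (int32_t)h;
	// 0 and -1 mean "no ip" elsewhere, so remap them like the code above
	if ( ip == 0 || ip == -1 ) ip = 1;
	return ip;
}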
2013-09-28 14:17:43 -06:00
/*
2013-09-26 14:50:34 -06:00
bool isAliasUnique ( CollectionRec * cr , char * token , char * alias ) {
// scan all collections
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
2013-09-26 14:50:34 -06:00
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
if ( ! cx ) continue ;
2014-11-17 18:24:38 -08:00
// must belong to us
2013-09-26 14:50:34 -06:00
if ( strcmp ( cx - > m_diffbotToken . getBufStart ( ) , token ) )
continue ;
// skip if collection we are putting alias on
if ( cx = = cr ) continue ;
// does it match?
if ( cx - > m_collectionNameAlias . length ( ) < = 0 ) continue ;
// return false if it matches! not unique
if ( strcmp ( cx - > m_collectionNameAlias . getBufStart ( ) ,
alias ) = = 0 )
return false ;
}
return true ;
}
2013-09-28 14:17:43 -06:00
*/
2013-10-11 16:14:26 -06:00
// json can be provided via get or post but content type must be
// url-encoded so we can test with a simple html form page.
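// e.g. (illustrative request only, values made up) the disabled parser
// below would accept something like
//   POST /crawlbot HTTP/1.1
//   Content-Type: application/x-www-form-urlencoded
//
//   json=%7B%22seed%22%3A%22http%3A%2F%2Fexample.com%2F%22%2C%22maxToCrawl%22%3A1000%7D
// which url-decodes to {"seed":"http://example.com/","maxToCrawl":1000}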
2013-10-14 18:19:59 -06:00
/*
bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
				  HttpRequest *hr ,
				  CollectionRec *cr ) {

	// get the json
	char *json = hr->getString("json");
	if ( ! json )
		return sendReply2 ( socket ,
				    FORMAT_JSON ,
				    "No &json= provided in request." );

	Json JP;
	bool status = JP.parseJsonStringIntoJsonItems ( json );

	// wtf?
	if ( ! status )
		return sendReply2 ( socket , FORMAT_JSON ,
				    "Error with JSON parser." );

	// error adding it?
	if ( ! cr )
		return sendReply2 ( socket , FORMAT_JSON ,
				    "Failed to create new collection." );

	// (JsonItem type assumed from Json.h; the original left ji undeclared)
	JsonItem *ji = JP.getFirstItem();
	char *seed = NULL;

	// traverse the json
	for ( ; ji ; ji = ji->m_next ) {
		// just get STRINGS or NUMS
		if ( ji->m_type != JT_STRING && ji->m_type != JT_NUMBER )
			continue;
		// check name
		char *name = ji->m_name;
		char *val  = ji->getValue();
		if ( strcmp(name,"seed") == 0 )
			seed = val;
		if ( strcmp(name,"email") == 0 )
			cr->m_notifyEmail.set(val);
		if ( strcmp(name,"webhook") == 0 )
			cr->m_notifyUrl.set(val);
		if ( strcmp(name,"frequency") == 0 )
			cr->m_collectiveRespiderFrequency = atof(val);
		if ( strcmp(name,"maxToCrawl") == 0 )
			cr->m_maxToCrawl = atoll(val);
		if ( strcmp(name,"maxToProcess") == 0 )
			cr->m_maxToProcess = atoll(val);
		if ( strcmp(name,"pageProcessPattern") == 0 )
			cr->m_diffbotPageProcessPattern.set(val);
		if ( strcmp(name,"obeyRobots") == 0 ) {
			if ( val[0] == 't' || val[0] == 'T' || val[0] == 1 )
				cr->m_useRobotsTxt = true;
			else
				cr->m_useRobotsTxt = false;
		}
		if ( strcmp(name,"onlyProcessNew") == 0 ) {
			if ( val[0] == 't' || val[0] == 'T' || val[0] == 1 )
				cr->m_diffbotOnlyProcessIfNew = true;
			else
				cr->m_diffbotOnlyProcessIfNew = false;
		}
		if ( strcmp(name,"pauseCrawl") == 0 ) {
			if ( val[0] == 't' || val[0] == 'T' || val[0] == 1 )
				cr->m_spideringEnabled = 0;
			else
				cr->m_spideringEnabled = 1;
		}
	}

	// set the collective respider frequency in case just that was passed
	for ( int32_t i = 0 ; i < MAX_FILTERS ; i++ )
		cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;

	// if url filters were not specified, we are done
	if ( ! JP.getItem("urlFilters") )
		return true;
	// reset the url filters here to the default set.
	// we will append the client's filters below them.
	resetUrlFilters ( cr );

	char *expression = NULL;
	char *action     = NULL;
	// start over at the top
	ji = JP.getFirstItem();
	// "urlFilters": [
	//   {
	//     "value": "*",  // MDW - this matches all urls! ("default")
	//     "action": "http://www.diffbot.com/api/analyze?mode=auto"
	//   }
	//   {
	//     "value": "company",
	//     "action": "http://www.diffbot.com/api/article?tags&meta"
	//   }
	//   {
	//     "value": "^http://www",
	//     "action": "doNotProcess"
	//   }
	//   {
	//     "value": "$.html && category",
	//     "action": "doNotCrawl"
	//   }
	//   {
	//     "value": "!$.html && $.php",
	//     "action": "doNotCrawl"
	//   }
	// ]
	// how many filters do we have so far?
	int32_t nf = cr->m_numRegExs;
	for ( ; ji ; ji = ji->m_next ) {
		// just get STRINGS only
		if ( ji->m_type != JT_STRING ) continue;
		// must be strings right now
		char *name  = ji->m_name;
		char *value = ji->getValue();
		if ( strcmp(name,"value") == 0 )
			expression = value;
		if ( strcmp(name,"action") == 0 )
			action = ji->getValue();
		// need both
		if ( ! action     ) continue;
		if ( ! expression ) continue;
		// they use "*" instead of "default" so put that back
		if ( expression[0] == '*' )
			expression = "default";
		// deal with it
		cr->m_regExs[1].set ( expression );
		cr->m_numRegExs++;
		int32_t priority = 50;
		// default diffbot api call:
		char *api = NULL;
		if ( strcasecmp(action,"donotcrawl") == 0 )
			priority = SPIDER_PRIORITY_FILTERED;
		//if ( strcasecmp(action,"donotprocess") == 0 )
		//	api = NULL;
		// a new diffbot url?
		if ( strcasecmp(action,"http") == 0 )
			api = action;
		// add the new filter
		cr->m_regExs[nf].set ( expression );
		cr->m_spiderPriorities[nf] = priority;
		cr->m_spiderDiffbotApiUrl[nf].set ( api );
		nf++;
		// add a mirror of that filter, but for manually added urls,
		// i.e. injected or added via add url
		if ( priority < 0 ) continue;
		// make the priority higher!
		cr->m_regExs[nf].safePrintf("ismanualadd && %s",expression);
		cr->m_spiderPriorities[nf] = 70;
		cr->m_spiderDiffbotApiUrl[nf].set ( api ); // appends \0
		nf++;
		// NULL out again
		action     = NULL;
		expression = NULL;
		if ( nf < MAX_FILTERS ) continue;
		log("crawlbot: too many url filters!");
		break;
	}
	// update the counts
	cr->m_numRegExs   = nf;
	cr->m_numRegExs2  = nf;
	cr->m_numRegExs3  = nf;
	cr->m_numRegExs10 = nf;
	cr->m_numRegExs5  = nf;
	cr->m_numRegExs6  = nf;
	cr->m_numRegExs7  = nf;
	cr->m_numRegExs11 = nf;
	// set the collective respider frequency on every filter
	for ( int32_t i = 0 ; i < nf ; i++ )
		cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;
	return true;
}
*/
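// Illustrative sketch (not compiled into the build): the commented-out
// function above maps flat JSON name/value pairs onto collection settings.
// The standalone version below shows the same mapping technique using only
// std types; ExampleCrawlSettings, exampleParseBool() and applyCrawlParm()
// are hypothetical names, not part of this codebase.
/*
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>

struct ExampleCrawlSettings {
	std::string notifyEmail;
	std::string notifyWebhook;
	double      respiderFrequencyDays = 0.0;
	long long   maxToCrawl            = -1;
	long long   maxToProcess          = -1;
	bool        useRobotsTxt          = true;
	bool        onlyProcessIfNew      = false;
	bool        spideringEnabled      = true;
};

// loosely interpret "t"/"T"/non-zero numbers as true (a simplified variant
// of the character check used above)
static bool exampleParseBool ( const char *val ) {
	return val[0] == 't' || val[0] == 'T' || atoi(val) != 0;
}

// apply one already-parsed name/value pair to the settings struct
static void applyCrawlParm ( ExampleCrawlSettings &s ,
			     const char *name , const char *val ) {
	if ( strcmp(name,"email"       ) == 0 ) s.notifyEmail   = val;
	if ( strcmp(name,"webhook"     ) == 0 ) s.notifyWebhook = val;
	if ( strcmp(name,"frequency"   ) == 0 ) s.respiderFrequencyDays = atof(val);
	if ( strcmp(name,"maxToCrawl"  ) == 0 ) s.maxToCrawl   = atoll(val);
	if ( strcmp(name,"maxToProcess") == 0 ) s.maxToProcess = atoll(val);
	if ( strcmp(name,"obeyRobots"    ) == 0 ) s.useRobotsTxt     = exampleParseBool(val);
	if ( strcmp(name,"onlyProcessNew") == 0 ) s.onlyProcessIfNew = exampleParseBool(val);
	// note the inversion: pauseCrawl=true means spidering is disabled
	if ( strcmp(name,"pauseCrawl"    ) == 0 ) s.spideringEnabled = ! exampleParseBool(val);
}

// usage: feed it every string/number item pulled out of the posted json
static ExampleCrawlSettings exampleSettingsFromParms (
	const std::map<std::string,std::string> &parms ) {
	ExampleCrawlSettings s;
	for ( const auto &p : parms )
		applyCrawlParm ( s , p.first.c_str() , p.second.c_str() );
	return s;
}
*/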
/*
THIS IS NOW AUTOMATIC from the new Parms.cpp broadcast logic.

bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
				     HttpRequest *hr ,
				     CollectionRec *cr ) {
	// update the url filters for now since that is complicated.
	// supply "cr" directly since "c" may not be in the http
	// request if addcoll=xxxxxx (we just created a new rec)
	//int32_t page = PAGE_FILTERS;
	//WebPage *pg = g_pages.getPage ( page );
	//g_parms.setFromRequest ( hr , socket , pg->m_function , cr );

	bool rebuild = false;

	//
	// set other diffbot parms for this collection
	//
	int32_t maxToCrawl = hr->getLongLong("maxToCrawl",-1LL);
	if ( maxToCrawl == -1 )
		maxToCrawl = hr->getLongLong("maxToDownload",-1LL);
	if ( maxToCrawl != -1 ) {
		cr->m_maxToCrawl = maxToCrawl;
		cr->m_needsSave  = 1;
	}

	int32_t maxToProcess = hr->getLongLong("maxToProcess",-1LL);
	if ( maxToProcess != -1 ) {
		cr->m_maxToProcess = maxToProcess;
		cr->m_needsSave    = 1;
	}

	// -1 means no max, so use -2 as the default here
	int32_t maxCrawlRounds = hr->getLongLong("maxCrawlRounds",-2LL);
	if ( maxCrawlRounds == -2 )
		maxCrawlRounds = hr->getLongLong("maxRounds",-2LL);
	if ( maxCrawlRounds != -2 ) {
		cr->m_maxCrawlRounds = maxCrawlRounds;
		cr->m_needsSave      = 1;
	}

	char *email = hr->getString("notifyEmail",NULL,NULL);
	if ( email ) {
		cr->m_notifyEmail.set ( email );
		cr->m_needsSave = 1;
	}

	char *url = hr->getString("notifyWebHook",NULL,NULL);
	if ( ! url ) url = hr->getString("notifyWebhook",NULL,NULL);
	if ( url ) {
		// assume the url is invalid, purge it
		cr->m_notifyUrl.purge();
		// normalize
		Url norm;
		norm.set ( url );
		if ( norm.getDomainLen() > 0 &&
		     norm.getHostLen  () > 0 )
			// set the safebuf to it. will \0 terminate it.
			cr->m_notifyUrl.set ( norm.getUrl() );
		// save the collection rec
		cr->m_needsSave = 1;
	}

	int32_t pause = hr->getLong("pauseCrawl",-1);
	// /v2/bulk api support
	if ( pause == -1 ) pause = hr->getLong("pause",-1);
	if ( pause == 0 ) { cr->m_needsSave = 1; cr->m_spideringEnabled = 1; }
	if ( pause == 1 ) { cr->m_needsSave = 1; cr->m_spideringEnabled = 0; }

	int32_t obeyRobots = hr->getLong("obeyRobots",-1);
	if ( obeyRobots == -1 ) obeyRobots = hr->getLong("robots",-1);
	if ( obeyRobots != -1 ) {
		cr->m_useRobotsTxt = obeyRobots;
		cr->m_needsSave    = 1;
	}

	int32_t restrictDomain = hr->getLong("restrictDomain",-1);
	if ( restrictDomain != -1 ) {
		cr->m_restrictDomain = restrictDomain;
		cr->m_needsSave      = 1;
		rebuild = true;
	}

	char *api = hr->getString("apiUrl",NULL);
	if ( api ) {
		cr->m_diffbotApiUrl.set ( api );
		cr->m_needsSave = 1;
	}

	char *ppp1 = hr->getString("urlCrawlPattern",NULL);
	if ( ppp1 ) {
		cr->m_diffbotUrlCrawlPattern.set ( ppp1 );
		cr->m_needsSave = 1;
		rebuild = true;
	}

	char *ppp2 = hr->getString("urlProcessPattern",NULL);
	if ( ppp2 ) {
		cr->m_diffbotUrlProcessPattern.set ( ppp2 );
		cr->m_needsSave = 1;
	}

	char *ppp3 = hr->getString("pageProcessPattern",NULL);
	if ( ppp3 ) {
		cr->m_diffbotPageProcessPattern.set ( ppp3 );
		cr->m_needsSave = 1;
	}
	// reg ex support
	char *rx1 = hr->getString("urlCrawlRegEx",NULL);
	// clear what we had
	if ( rx1 && cr->m_hasucr ) {
		regfree ( &cr->m_ucr );
		cr->m_hasucr = false;
		cr->m_diffbotUrlCrawlRegEx.purge();
		cr->m_needsSave = 1;
		rebuild = true;
	}
	// add a new one if not blank
	if ( rx1 && rx1[0] ) {
		cr->m_diffbotUrlCrawlRegEx.set ( rx1 );
		cr->m_needsSave = 1;
		// this will store the compiled regular expression into ucr
		if ( regcomp ( &cr->m_ucr ,
			       // the regular expression to compile
			       rx1 ,
			       // some flags
			       REG_EXTENDED | REG_ICASE |
			       REG_NEWLINE  | REG_NOSUB ) ) {
			regfree ( &cr->m_ucr );
			// should never fail!
			return log("xmldoc: regcomp %s failed: %s. "
				   "Ignoring.",
				   rx1 , mstrerror(errno) );
		}
		cr->m_hasucr = true;
	}

	char *rx2 = hr->getString("urlProcessRegEx",NULL);
	// clear what we had
	if ( rx2 && cr->m_hasupr ) {
		regfree ( &cr->m_upr );
		cr->m_hasupr = false;
		cr->m_diffbotUrlProcessRegEx.purge();
		cr->m_needsSave = 1;
	}
	// add a new one if not blank
	if ( rx2 && rx2[0] ) {
		cr->m_diffbotUrlProcessRegEx.set ( rx2 );
		cr->m_needsSave = 1;
		// this will store the compiled regular expression into upr
		if ( regcomp ( &cr->m_upr ,
			       // the regular expression to compile
			       rx2 ,
			       // some flags
			       REG_EXTENDED | REG_ICASE |
			       REG_NEWLINE  | REG_NOSUB ) ) {
			regfree ( &cr->m_upr );
			// error!
			return log("xmldoc: regcomp %s failed: %s. "
				   "Ignoring.",
				   rx2 , mstrerror(errno) );
		}
		cr->m_hasupr = true;
	}

	float respider = hr->getFloat("repeatJob",-1.0);
	if ( respider == -1.0 ) respider = hr->getFloat("repeat"     ,-1.0);
	if ( respider == -1.0 ) respider = hr->getFloat("repeatCrawl",-1.0);
	if ( respider >= 0.0 ) {
		// if not 0, then shift the round start time by the delta
		if ( cr->m_spiderRoundStartTime ) {
			// convert from days into seconds
			float rfOld = cr->m_collectiveRespiderFrequency;
			float rfNew = respider;
			// 86400 seconds in a day
			int32_t secondsOld = (int32_t)(rfOld * 86400);
			int32_t secondsNew = (int32_t)(rfNew * 86400);
			// remove the old one...
			cr->m_spiderRoundStartTime -= secondsOld;
			// ...and add in the new one
			cr->m_spiderRoundStartTime += secondsNew;
		}
		// if 0, that means NO recrawling
		if ( respider == 0.0 ) {
			cr->m_spiderRoundStartTime = 0; //getTimeGlobal();
		}
		cr->m_collectiveRespiderFrequency = respider;
		cr->m_needsSave = 1;
	}
	float delay = hr->getFloat("crawlDelay",-1.0);
	//int32_t crawlWait = hr->getLong("wait",-1);
	if ( delay >= 0.0 ) {
		rebuild = true;
		cr->m_collectiveCrawlDelay = delay;
	}

	int32_t onlyProcessNew = hr->getLong("onlyProcessIfNew",-1);
	if ( onlyProcessNew != -1 ) {
		cr->m_diffbotOnlyProcessIfNew = onlyProcessNew;
		cr->m_needsSave = 1;
	}

	// set collective respider
	//for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
	//	if ( cr->m_collectiveRespiderFrequency == 0.0 )
	//		cr->m_spiderFreqs[i] = 0.000;
	//	else
	//		cr->m_spiderFreqs[i] = 0.001;
	//	//cr->m_collectiveRespiderFrequency;
	//}

	char *path = hr->getPath();
	bool isBulkApi = false;
	if ( path && strncmp ( path , "/v2/bulk" , 8 ) == 0 ) isBulkApi = true;

	// were any url filters specified? if not, don't reset them
	//if ( ! hr->hasField("action") )
	//	return true;

	// reset the url filters here to the default set.
	// we will append the client's filters below them.
	resetUrlFilters ( cr );

	// if it was not recrawling and we made it start, we have to
	// repopulate the waiting tree because most entries will need to be
	// re-added! really, anytime we change the url filters we have to
	// repopulate the waiting tree.
	SpiderColl *sc = cr->m_spiderColl;
	if ( sc && rebuild ) {
		// this is causing a bulk job not to complete because
		// jenkins keeps checking it every 10 seconds
		sc->m_waitingTreeNeedsRebuild = true;
	}

	return true;
	// NOTE: everything below follows the return above and is unreachable;
	// it was left in for reference.

	// "urlFilters": [
	//   {
	//     "value": "*",  // MDW - this matches all urls! ("default")
	//     "action": "http://www.diffbot.com/api/analyze?mode=auto"
	//   }
	//   {
	//     "value": "company",
	//     "action": "http://www.diffbot.com/api/article?tags&meta"
	//   }
	//   {
	//     "value": "^http://www",
	//     "action": "doNotProcess"
	//   }
	//   {
	//     "value": "$.html && category",
	//     "action": "doNotCrawl"
	//   }
	//   {
	//     "value": "!$.html && $.php",
	//     "action": "doNotCrawl"
	//   }
	// ]
	char *expression = NULL;
	char *action     = NULL;
	// how many filters do we have so far?
	int32_t nf = cr->m_numRegExs;

	// delete the 3rd default filter because we re-add it below
	// at the bottom of the list.
	if ( nf >= 3 ) nf--;
	bool addedDefault = false;

	// loop over the cgi parms
	for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) {
		// get cgi parm name
		char *field = hr->getField ( i );
		//int32_t flen = hr->getFieldLen ( i );
		if ( strcmp(field,"expression") == 0 )
			expression = hr->getValue ( i );
		if ( strcmp(field,"action") == 0 )
			action = hr->getValue ( i );
		// need both
		if ( ! action ) continue;
		// no! the /v2/bulk api just has a single action
		if ( isBulkApi ) expression = "*";
		// action before expression???? set action to NULL then?
		if ( ! expression ) continue;
		//else continue;// { action = NULL; continue; }
		// skip whitespace
		while ( is_wspace_a ( *expression ) ) expression++;
		while ( is_wspace_a ( *action     ) ) action++;
		// skip if expression is empty
		if ( ! expression[0] ) {
			action = NULL; expression = NULL; continue; }
		// they use "*" instead of "default" so put that back
		if ( expression[0] == '*' ) {
			expression = "default";
			addedDefault = true;
		}
		// deal with it
		int32_t priority = 50;
		// default diffbot api call:
		//char *api = NULL;
		if ( strcasecmp(action,"donotcrawl") == 0 )
			priority = SPIDER_PRIORITY_FILTERED;
		//if ( strcasecmp(action,"donotprocess") == 0 )
		//	api = NULL;
		// a new diffbot url?
		//if ( strncasecmp(action,"http",4) == 0 )
		//	api = action;

		// add a mirror of that filter, but for manually added urls,
		// i.e. injected or added via add url
		if ( priority >= 0 ) {
			// purge because it might have been the last "default"
			// filter that we did nf-- on above.
			cr->m_regExs[nf].purge();
			// make the priority higher!
			cr->m_regExs[nf].safePrintf("ismanualadd && %s",
						    expression);
			cr->m_spiderPriorities[nf] = 70;
			cr->m_spiderDiffbotApiUrl[nf].set ( action ); // appends \0
			cr->m_spiderFreqs[nf] =
				cr->m_collectiveRespiderFrequency;
			nf++;
		}

		// add the new filter
		cr->m_regExs[nf].set ( expression );
		cr->m_spiderPriorities[nf] = priority;
		cr->m_spiderDiffbotApiUrl[nf].set ( action );
		cr->m_spiderFreqs[nf] = cr->m_collectiveRespiderFrequency;
		nf++;
		// NULL out again
		action     = NULL;
		expression = NULL;
		if ( nf < MAX_FILTERS ) continue;
		log("crawlbot: too many url filters!");
		break;
	}

	// if no '*' line was provided, add it here
	if ( ! addedDefault ) {
		cr->m_regExs[nf].set ( "default" );
		cr->m_spiderPriorities[nf] = 50;
		cr->m_spiderDiffbotApiUrl[nf].set ( NULL );
		cr->m_spiderFreqs[nf] = cr->m_collectiveRespiderFrequency;
		nf++;
	}

	// update the counts
	cr->m_numRegExs   = nf;
	cr->m_numRegExs2  = nf;
	cr->m_numRegExs3  = nf;
	cr->m_numRegExs10 = nf;
	cr->m_numRegExs5  = nf;
	cr->m_numRegExs6  = nf;
	cr->m_numRegExs7  = nf;
	cr->m_numRegExs11 = nf;
	// set collective respider
	//for ( int32_t i = 0 ; i < nf ; i++ )
	//	cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;
	return true;
}
*/
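// Illustrative sketch (not compiled into the build): the commented-out code
// above compiles the urlCrawlRegEx with POSIX regcomp() and the flags
// REG_EXTENDED | REG_ICASE | REG_NEWLINE | REG_NOSUB. The standalone helper
// below shows that same compile-then-match pattern; exampleUrlMatchesRegex()
// is a hypothetical name, not part of this codebase.
/*
#include <regex.h>

// returns true if the url matches the extended, case-insensitive regex.
// with REG_NOSUB we only care about match/no-match, so regexec() gets no
// capture array.
static bool exampleUrlMatchesRegex ( const char *regexStr , const char *url ) {
	regex_t re;
	if ( regcomp ( &re , regexStr ,
		       REG_EXTENDED | REG_ICASE | REG_NEWLINE | REG_NOSUB ) )
		// treat a bad pattern as "no match", similar to the code
		// above which logs the error and ignores the regex
		return false;
	int rc = regexec ( &re , url , 0 , NULL , 0 );
	regfree ( &re );
	return rc == 0;
}

// usage: exampleUrlMatchesRegex("^https?://www\\.","http://www.example.com/")
// returns true.
*/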
///////////
//
// SUPPORT for getting the last 100 spidered urls
//
// . sends request to each node
// . each node returns top 100 after scanning spiderdb (cache for speed)
// . master node gets top 100 of the top 100s
// . sends pretty html or json back to socket
// . then user can see why their crawl isn't working
// . also since we are scanning spiderdb indicate how many urls are
// ignored because they match "ismedia" or "!isonsamedomain" etc. so
// show each url filter expression then show how many urls matched that.
// when doing this make the spiderReply null, b/c the purpose is to see
// what urls
// . BUT url may never be attempted because it matches "ismedia" so that kind
// of thing might have to be indicated on the spiderdb dump above, not here.
//
//////////
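// Illustrative sketch (not compiled into the build): the scheme described
// above has each node return its own top 100 and the master keep the top 100
// of the combined set. The standalone helper below shows that merge step with
// std types only; ExampleUrlEntry, its ordering (most recently spidered
// first) and mergeTopN() are hypothetical, not part of this codebase.
/*
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct ExampleUrlEntry {
	std::string url;
	int64_t     spideredTime; // e.g. unix timestamp of the last download
};

// combine the per-node top-N lists and keep the global top N,
// most recently spidered first
static std::vector<ExampleUrlEntry> mergeTopN (
	const std::vector< std::vector<ExampleUrlEntry> > &perNodeLists ,
	size_t n ) {
	std::vector<ExampleUrlEntry> all;
	for ( const auto &list : perNodeLists )
		all.insert ( all.end() , list.begin() , list.end() );
	size_t keep = std::min ( n , all.size() );
	// partial_sort is enough: only the first "keep" entries need ordering
	std::partial_sort ( all.begin() , all.begin() + keep , all.end() ,
			    []( const ExampleUrlEntry &a ,
				const ExampleUrlEntry &b ) {
				    return a.spideredTime > b.spideredTime;
			    } );
	all.resize ( keep );
	return all;
}
*/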
//bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {