checkpoint. moved warc and arc looping into xmldoc.
now will move any container doc from pageinject into xmldoc. simplifies pageinject.cpp a lot. and sets up a framework for dealing with container docs.
This commit is contained in:
parent
d3c071e4c0
commit
0ca27638bc
372
PageInject.cpp
372
PageInject.cpp
@ -349,8 +349,6 @@ void Msg7::reset() {
|
||||
m_injectCount = 0;
|
||||
m_start = NULL;
|
||||
m_sbuf.reset();
|
||||
m_isWarc = false;
|
||||
m_isArc = false;
|
||||
m_isDoneInjecting = false;
|
||||
}
|
||||
|
||||
@ -391,10 +389,6 @@ void injectLoopWrapper9 ( void *state ) {
|
||||
if ( delim && ! delim[0] ) delim = NULL;
|
||||
bool loopIt = false;
|
||||
if ( delim ) loopIt = true;
|
||||
// by default warc and arc files consist of many subdocuments
|
||||
// that have to be indexed individually as well
|
||||
if ( msg7->m_isWarc ) loopIt = true;
|
||||
if ( msg7->m_isArc ) loopIt = true;
|
||||
|
||||
if ( loopIt ) { // && msg7->m_start ) {
|
||||
// do another injection. returns false if it blocks
|
||||
@ -526,22 +520,6 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
|
||||
sendReply ( slot );
|
||||
}
|
||||
|
||||
void gotWarcContentWrapper ( void *state , TcpSocket *ts ) {
|
||||
Msg7 *THIS = (Msg7 *)state;
|
||||
// set content to that
|
||||
GigablastRequest *gr = &THIS->m_gr;
|
||||
gr->m_contentBuf.setBuf (ts->m_readBuf,
|
||||
ts->m_readBufSize ,
|
||||
ts->m_readOffset ,
|
||||
true , // ownBuf?
|
||||
0 ); // encoding
|
||||
// just ref it
|
||||
gr->m_content = ts->m_readBuf;
|
||||
// so tcpserver.cpp doesn't free the ward/arc file
|
||||
ts->m_readBuf = NULL;
|
||||
// continue with injection loop
|
||||
injectLoopWrapper9 ( THIS );
|
||||
}
|
||||
|
||||
// . returns false if blocked and callback will be called, true otherwise
|
||||
// . sets g_errno on error
|
||||
@ -593,66 +571,6 @@ bool Msg7::inject ( void *state ,
|
||||
// get the normalized url
|
||||
u.set ( gr->m_url );
|
||||
|
||||
char *ustr = u.getUrl();
|
||||
int32_t ulen = u.getUrlLen();
|
||||
char *uend = ustr + ulen;
|
||||
|
||||
m_isWarc = false;
|
||||
m_isArc = false;
|
||||
|
||||
if ( ulen>8 && strncmp(uend-8,".warc.gz",8)==0 )
|
||||
m_isWarc = true;
|
||||
if ( ulen>8 && strncmp(uend-5,".warc" ,5)==0 )
|
||||
m_isWarc = true;
|
||||
|
||||
if ( ulen>8 && strncmp(uend-7,".arc.gz",7)==0 )
|
||||
m_isArc = true;
|
||||
if ( ulen>8 && strncmp(uend-4,".arc" ,4)==0 )
|
||||
m_isArc = true;
|
||||
|
||||
// if warc/arc download it and make gr->m_content reference it...
|
||||
// we won't handle redirects though.
|
||||
if ( ! content && ( m_isWarc || m_isArc) ) {
|
||||
// download the warc/arc url
|
||||
if ( ! g_httpServer.getDoc ( ustr ,
|
||||
0 , // urlip
|
||||
0 , // offset
|
||||
-1 ,
|
||||
0,//r->m_ifModifiedSince ,
|
||||
this , // state
|
||||
gotWarcContentWrapper ,// callback
|
||||
30*1000 , // 30 sec timeout
|
||||
0 , // r->m_proxyIp ,
|
||||
0 , // r->m_proxyPort ,
|
||||
-1,//r->m_maxTextDocLen ,
|
||||
-1,//r->m_maxOtherDocLen ,
|
||||
NULL,//agent ,
|
||||
DEFAULT_HTTP_PROTO , // "HTTP/1.0"
|
||||
false , // doPost?
|
||||
NULL , // cookie
|
||||
NULL , // additionalHeader
|
||||
NULL , // our own mime!
|
||||
NULL , // postContent
|
||||
NULL))//proxyUsernamePwdAuth ) )
|
||||
// return false if blocked
|
||||
return false;
|
||||
// error?
|
||||
log("inject: %s",mstrerror(g_errno));
|
||||
}
|
||||
|
||||
if ( m_firstTime && ( m_isWarc || m_isArc ) ) {
|
||||
// skip over the first http mime header, it is not
|
||||
// part of the warc file per se.
|
||||
content = strstr(content,"\r\n\r\n");
|
||||
if ( ! content ) {
|
||||
log("inject: no mime received from webserver");
|
||||
return true;
|
||||
}
|
||||
// skip over that to point to start of actual warc
|
||||
// file content
|
||||
content += 4;
|
||||
}
|
||||
|
||||
if ( m_firstTime ) {
|
||||
m_firstTime = false;
|
||||
m_start = content;
|
||||
@ -668,10 +586,6 @@ bool Msg7::inject ( void *state ,
|
||||
char *delim = gr->m_contentDelim;
|
||||
if ( delim && ! delim[0] ) delim = NULL;
|
||||
|
||||
// delim is sill for warc/arcs so ignore it
|
||||
if ( m_isWarc || m_isArc ) delim = NULL;
|
||||
|
||||
|
||||
// if doing delimeterized injects, hitting a \0 is the end of the road
|
||||
if ( delim && m_fixMe && ! m_saved ) {
|
||||
m_isDoneInjecting = true;
|
||||
@ -702,290 +616,6 @@ bool Msg7::inject ( void *state ,
|
||||
m_start = start + gbstrlen(start);
|
||||
}
|
||||
|
||||
// WARC files are mime delimeted. the http reply, which
|
||||
// contains a mime, as a mime a level above that whose
|
||||
// content-length: field includes the original http reply mime
|
||||
// as part of its content.
|
||||
if ( m_isWarc ) { // gr->m_containerContentType == CT_WARC ) {
|
||||
// no setting delim for this!
|
||||
if ( delim ) { char *xx=NULL;*xx=0; }
|
||||
// should have the url as well
|
||||
char *mm = strstr(start,"Content-Length:");
|
||||
char *mmend = NULL;
|
||||
if ( mm ) mmend = strstr (mm,"\n");
|
||||
if ( ! mm || ! mmend ) {
|
||||
log("inject: warc: all done");
|
||||
// XmlDoc.cpp checks for this to stop calling us
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
char c = *mmend;
|
||||
*mmend = '\0';
|
||||
int64_t recordSize = atoll ( mm + 15 );
|
||||
*mmend = c;
|
||||
|
||||
// end of mime header
|
||||
char *hend = strstr ( mm, "\r\n\r\n");
|
||||
if ( ! hend ) {
|
||||
log("inject: warc: could no mime header end.");
|
||||
return true;
|
||||
}
|
||||
|
||||
// tmp \0 that for these strstr() calls
|
||||
c = *hend;
|
||||
*hend = '\0';
|
||||
|
||||
char *warcUrl = strstr(start,"WARC-Target-URI:");
|
||||
char *warcType = strstr(start,"WARC-Type:");
|
||||
char *warcDate = strstr(start,"WARC-Date:");
|
||||
char *warcIp = strstr(start,"WARC-IP-Address:");
|
||||
|
||||
// advance
|
||||
if ( warcUrl ) warcUrl += 16;
|
||||
if ( warcType ) warcType += 10;
|
||||
if ( warcDate ) warcDate += 10;
|
||||
if ( warcIp ) warcIp += 17;
|
||||
|
||||
// restore
|
||||
*hend = c;
|
||||
|
||||
// skip the \r\n\r\n
|
||||
hend += 4;
|
||||
|
||||
// adjust start to point to start of the content really
|
||||
start = hend;
|
||||
|
||||
// and over record
|
||||
m_start = hend + recordSize;
|
||||
advanced = true;
|
||||
|
||||
if ( ! warcType ) {
|
||||
log("inject: warc: could not find rec type");
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( is_wspace_a(*warcType) ) warcType++;
|
||||
if ( is_wspace_a(*warcType) ) warcType++;
|
||||
|
||||
// WARC-Type:
|
||||
// do not index this record as a doc if it is not a
|
||||
// "WARC-Type: response" record.
|
||||
if ( strncmp(warcType,"response",8) != 0 )
|
||||
return true;
|
||||
|
||||
// skip this rec if url-less
|
||||
if ( ! warcUrl ) {
|
||||
log("inject: warc: could not find rec url");
|
||||
return true;
|
||||
}
|
||||
if ( ! warcDate ) {
|
||||
log("inject: warc: could not find rec date");
|
||||
return true;
|
||||
}
|
||||
|
||||
// skip spaces on all
|
||||
if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
|
||||
if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
|
||||
if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
|
||||
if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
|
||||
if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
|
||||
if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
|
||||
|
||||
// url must start with http:// or https://
|
||||
// it's probably like WARC-Target-URI: dns:www.xyz.com
|
||||
// so it is a dns response
|
||||
if ( strncmp(warcUrl,"http://" ,7) != 0 &&
|
||||
strncmp(warcUrl,"https://",8) != 0 )
|
||||
return true;
|
||||
|
||||
gr->m_injectDocIp = 0;
|
||||
|
||||
// get the record IP address from the warc header if there
|
||||
if ( warcIp ) {
|
||||
// get end of ip
|
||||
char *warcIpEnd = warcIp;
|
||||
// skip digits and periods
|
||||
while ( ! is_wspace_a(*warcIpEnd) ) warcIpEnd++;
|
||||
// we now have the ip address for doing ip: searches
|
||||
// this func is in ip.h
|
||||
gr->m_injectDocIp = atoip ( warcIp, warcIpEnd-warcIp );
|
||||
}
|
||||
|
||||
// convert date to timestamp
|
||||
int64_t warcTime = 0;
|
||||
if ( warcDate ) warcTime = atotime ( warcDate );
|
||||
gr->m_firstIndexed = warcTime;
|
||||
gr->m_lastSpidered = warcTime;
|
||||
// does this work?
|
||||
gr->m_hopCount = -1;
|
||||
gr->m_diffbotReply = 0;
|
||||
gr->m_newOnly = 0;
|
||||
// end of the url
|
||||
char *warcUrlEnd = warcUrl;
|
||||
for ( ; *warcUrlEnd && ! is_wspace_a(*warcUrlEnd) ;
|
||||
warcUrlEnd++ );
|
||||
// set it to that
|
||||
m_injectUrlBuf.reset();
|
||||
// by default append a -<ch64> to the provided url
|
||||
int32_t warcUrlLen = warcUrlEnd - warcUrl;
|
||||
m_injectUrlBuf.safeMemcpy(warcUrl,warcUrlLen);
|
||||
m_injectUrlBuf.nullTerm();
|
||||
// skip if robots.txt
|
||||
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
|
||||
m_injectUrlBuf.getLength() ) )
|
||||
return true;
|
||||
// all warc records have the http mime
|
||||
gr->m_hasMime = true;
|
||||
char *recMime = hend;
|
||||
// and find the next \r\n\r\n
|
||||
char *recMimeEnd = strstr ( recMime , "\r\n\r\n" );
|
||||
if ( ! recMimeEnd ) {
|
||||
log("inject: warc: no http mime.");
|
||||
return true;
|
||||
}
|
||||
// gotta include the \r\n\r\n in the mime length here
|
||||
recMimeEnd += 4;
|
||||
// should be a mime that starts with GET or POST
|
||||
HttpMime mime;
|
||||
if ( ! mime.set ( recMime, recMimeEnd - recMime , NULL ) ) {
|
||||
log("inject: warc: mime set failed ");
|
||||
return true;
|
||||
}
|
||||
// check content type. if bad advance to next rec.
|
||||
int ct = mime.getContentType();
|
||||
if ( ct != CT_HTML &&
|
||||
ct != CT_TEXT &&
|
||||
ct != CT_XML &&
|
||||
ct != CT_JSON )
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// ARC files have a url on one line and the length on the next line
|
||||
if ( m_isArc ) {
|
||||
// no setting delim for this!
|
||||
if ( delim ) { char *xx=NULL;*xx=0; }
|
||||
// should have the url as well
|
||||
char *arcHeader = strstr(start,"\nhttp");
|
||||
//char *mmend = NULL;
|
||||
//if ( mm ) mmend = strstr (mm,"\n");
|
||||
if ( ! arcHeader ) { // || ! mmend ) {
|
||||
log("inject: arc: all done");
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
// find end of url
|
||||
char *arcHeaderEnd = strstr (arcHeader+1,"\n");
|
||||
if ( ! arcHeaderEnd ) {
|
||||
log("inject: arc: no header end. all done");
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
// term it
|
||||
*arcHeaderEnd = '\0';
|
||||
char *arcContent = arcHeaderEnd + 1;
|
||||
|
||||
// parse arc header line
|
||||
char *arcUrl = arcHeader + 1;
|
||||
char *hp = arcUrl;
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 1.");
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0';
|
||||
m_injectUrlBuf.reset();
|
||||
m_injectUrlBuf.safeStrcpy(arcUrl);
|
||||
m_injectUrlBuf.nullTerm();
|
||||
|
||||
|
||||
char *ipStr = hp;
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 2.");
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0';
|
||||
gr->m_injectDocIp = atoip(ipStr);
|
||||
|
||||
char *timeStr = hp;
|
||||
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 3.");
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0'; // null term timeStr
|
||||
char *arcConType = hp;
|
||||
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 4.");
|
||||
m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0'; // null term arcContentType
|
||||
|
||||
char *arcContentLenStr = hp;
|
||||
|
||||
// get arc content len
|
||||
int64_t arcContentLen = atoll(arcContentLenStr);
|
||||
char *arcContentEnd = arcContent + arcContentLen;
|
||||
//uint64_t recSize = (arcContentEnd - realStart);
|
||||
|
||||
// convert to timestamp
|
||||
int64_t arcTime = 0;
|
||||
// this time structure, once filled, will help yield a time_t
|
||||
struct tm t;
|
||||
// DAY OF MONTH
|
||||
t.tm_mday = atol2 ( timeStr + 6 , 2 );
|
||||
// MONTH
|
||||
t.tm_mon = atol2 ( timeStr + 4 , 2 );
|
||||
// YEAR
|
||||
// # of years since 1900
|
||||
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
|
||||
// TIME
|
||||
t.tm_hour = atol2 ( timeStr + 8 , 2 );
|
||||
t.tm_min = atol2 ( timeStr + 10 , 2 );
|
||||
t.tm_sec = atol2 ( timeStr + 12 , 2 );
|
||||
// unknown if we're in daylight savings time
|
||||
t.tm_isdst = -1;
|
||||
// translate using mktime
|
||||
arcTime = timegm ( &t );
|
||||
|
||||
gr->m_firstIndexed = arcTime;
|
||||
gr->m_lastSpidered = arcTime;
|
||||
|
||||
|
||||
start = arcContent;
|
||||
|
||||
// assume "start" has the http mime
|
||||
gr->m_hasMime = true;
|
||||
|
||||
// advance to next rec BEFORE we return true below
|
||||
m_start = arcContentEnd;
|
||||
advanced = true;
|
||||
|
||||
// arcConType needs to indexable
|
||||
int32_t ct = getContentTypeFromStr ( arcConType );
|
||||
if ( ct != CT_HTML &&
|
||||
ct != CT_TEXT &&
|
||||
ct != CT_XML &&
|
||||
ct != CT_JSON ) {
|
||||
// read another arc record
|
||||
return true;
|
||||
}
|
||||
|
||||
// skip if robots.txt
|
||||
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
|
||||
m_injectUrlBuf.getLength() ) )
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
// for injecting "start" set this to \0
|
||||
if ( advanced ) { // m_start ) {
|
||||
@ -997,7 +627,7 @@ bool Msg7::inject ( void *state ,
|
||||
m_fixMe = true;
|
||||
}
|
||||
|
||||
if ( ! delim && ! m_isWarc && ! m_isArc )
|
||||
if ( ! delim )
|
||||
// this is the url of the injected content
|
||||
m_injectUrlBuf.safeStrcpy ( gr->m_url );
|
||||
|
||||
|
@ -45,9 +45,6 @@ public:
|
||||
|
||||
//int32_t m_crawlbotAPI;
|
||||
|
||||
bool m_isWarc;
|
||||
bool m_isArc;
|
||||
|
||||
class ImportState *m_importState;
|
||||
|
||||
//void constructor();
|
||||
|
55
Url.cpp
55
Url.cpp
@ -1441,22 +1441,24 @@ bool Url::isBadExtension ( int32_t version ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Url::isCompressedArcOrWarc ( ) {
|
||||
bool Url::isWarc ( ) {
|
||||
|
||||
// hack to allow for .gz if it is .warc.gz or .arc.gz
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
m_extension[1] == 'z' &&
|
||||
m_ulen > 10 &&
|
||||
m_extension[-1] == '.' &&
|
||||
m_extension[-2] == 'c' &&
|
||||
m_extension[-3] == 'r' &&
|
||||
m_extension[-4] == 'a' &&
|
||||
m_extension[-5] == '.' ) {
|
||||
// m_isArc = true;
|
||||
// m_isArcValid = true;
|
||||
// if ( ulen>8 && strncmp(uend-8,".warc.gz",8)==0 )
|
||||
// m_isWarc = true;
|
||||
// if ( ulen>8 && strncmp(uend-5,".warc" ,5)==0 )
|
||||
// m_isWarc = true;
|
||||
|
||||
// if ( ulen>8 && strncmp(uend-7,".arc.gz",7)==0 )
|
||||
// m_isArc = true;
|
||||
// if ( ulen>8 && strncmp(uend-4,".arc" ,4)==0 )
|
||||
// m_isArc = true;
|
||||
|
||||
if ( m_elen == 4 &&
|
||||
m_extenion[0] == 'w' &&
|
||||
m_extenion[1] == 'a' &&
|
||||
m_extenion[2] == 'r' &&
|
||||
m_extenion[3] == 'c' )
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
@ -1474,7 +1476,32 @@ bool Url::isCompressedArcOrWarc ( ) {
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Url::isArc ( ) {
|
||||
|
||||
if ( m_elen == 3 &&
|
||||
m_extenion[0] == 'a' &&
|
||||
m_extenion[1] == 'r' &&
|
||||
m_extenion[2] == 'c' )
|
||||
return true;
|
||||
|
||||
// hack to allow for .gz if it is .warc.gz or .arc.gz
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
m_extension[1] == 'z' &&
|
||||
m_ulen > 10 &&
|
||||
m_extension[-1] == '.' &&
|
||||
m_extension[-2] == 'c' &&
|
||||
m_extension[-3] == 'r' &&
|
||||
m_extension[-4] == 'a' &&
|
||||
m_extension[-5] == '.' ) {
|
||||
// m_isArc = true;
|
||||
// m_isArcValid = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// see Url.h for a description of this.
|
||||
|
5
Url.h
5
Url.h
@ -92,7 +92,10 @@ public:
|
||||
bool isBadExtension(int32_t xxx);
|
||||
bool isSet() { return m_ulen != 0; }
|
||||
|
||||
bool isCompressedArcOrWarc ( ) ;
|
||||
// is this url a warc or arc url? i.e. ends in .warc or .arc or
|
||||
// .warc.gz or .arc.gz?
|
||||
bool isWarc ( );
|
||||
bool isArc ( );
|
||||
|
||||
// does it end in .xml, .rdb or .rss, etc. kinda thing
|
||||
//bool isRSSFormat ( ) ;
|
||||
|
484
XmlDoc.cpp
484
XmlDoc.cpp
@ -114,6 +114,8 @@ XmlDoc::XmlDoc() {
|
||||
m_freed = false;
|
||||
m_contentInjected = false;
|
||||
m_wasContentInjected = false;
|
||||
m_msg7 = NULL;
|
||||
m_warcContentPtr = NULL;
|
||||
//m_coll = NULL;
|
||||
m_ubuf = NULL;
|
||||
m_pbuf = NULL;
|
||||
@ -192,6 +194,13 @@ class XmlDoc *g_xd;
|
||||
|
||||
void XmlDoc::reset ( ) {
|
||||
|
||||
if ( m_msg7 ) {
|
||||
mdelete ( m_msg7 , sizeof(Msg7) , "xdmsg7" );
|
||||
delete ( m_msg7 );
|
||||
m_msg7 = NULL;
|
||||
}
|
||||
m_warcContentPtr = NULL;
|
||||
|
||||
m_redirUrl.reset();
|
||||
|
||||
m_ipStartTime = 0;
|
||||
@ -2709,6 +2718,32 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
}
|
||||
|
||||
|
||||
// handle docs that consist of subdocs that need to be injected
|
||||
// or indexed individually.
|
||||
if ( m_firstUrlValid && m_firstUrl.isWarc() ) {
|
||||
// this returns false if it would block and callback will be called
|
||||
if ( ! indexWarc () )
|
||||
return false;
|
||||
// all done! no need to add the parent doc.
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( m_firstUrlValid && m_firstUrl.isArc() ) {
|
||||
// this returns false if it would block and callback will be called
|
||||
if ( ! indexArc () )
|
||||
return false;
|
||||
// all done! no need to add the parent doc.
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( m_isContainerDoc ) {
|
||||
// m_delimeter should be set!
|
||||
if ( ! indexContainerDoc () )
|
||||
return false;
|
||||
// all done! no need to add the parent doc.
|
||||
return true;
|
||||
}
|
||||
|
||||
// . now get the meta list from it to add
|
||||
// . returns NULL and sets g_errno on error
|
||||
char *metaList = getMetaList ( );
|
||||
@ -2958,6 +2993,447 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
*/
|
||||
}
|
||||
|
||||
// returns false if would block, true otherwise. returns true and sets g_errno on err
|
||||
bool XmlDoc::indexArc ( ) {
|
||||
|
||||
int8_t *hc = getHopCount();
|
||||
if ( ! hc ) return true; // error?
|
||||
if ( hc == (void *)-1 ) return false;
|
||||
// first download
|
||||
char **arcContent = getUtf8Content();
|
||||
// return true with g_errno set on error
|
||||
if ( ! arcContent ) {
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
return true;
|
||||
}
|
||||
// would block? return false then
|
||||
if ( arcContent == (void *)-1 )
|
||||
return false;
|
||||
|
||||
// need this. it is almost 1MB in size, so alloc it
|
||||
if ( ! m_msg7 ) {
|
||||
try { m_msg7 = new ( Msg7 ); }
|
||||
catch ( ... ) {
|
||||
g_errno = ENOMEM;
|
||||
return true;
|
||||
}
|
||||
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
|
||||
}
|
||||
|
||||
// inject input parms:
|
||||
GigablastRequest *gr = &m_msg7->m_gr;
|
||||
// the cursor for scanning the subdocs
|
||||
if ( ! m_arcContentPtr ) {
|
||||
|
||||
// init the content cursor to point to the first subdoc
|
||||
m_arcContentPtr = arcContent;
|
||||
|
||||
// init the input parms
|
||||
memset ( gr , 0 , sizeof(GigablastRequest) );
|
||||
// reset it
|
||||
gr->m_spiderLinks = false;
|
||||
gr->m_injectLinks = false;
|
||||
// what happens if coll gets nuked from under us? use collnum
|
||||
gr->m_coll = cr->m_coll;
|
||||
gr->m_hopCount = *hc + 1;
|
||||
// if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
||||
gr->m_collnum = m_collnum;
|
||||
// will this work on a content delimeterized doc?
|
||||
gr->m_deleteUrl = m_deleteFromIndex;
|
||||
// each subdoc will have a mime since it is an arc
|
||||
gr->m_hasMime = true;
|
||||
|
||||
}
|
||||
|
||||
subdocLoop:
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// we had \0 terminated the end of the previous record, so put back
|
||||
if ( m_savedChar && ! *m_arcContentPtr ) *m_arcContentPtr = m_savedChar;
|
||||
|
||||
// . should have the url as well.
|
||||
// . the url, ip etc. are on a single \n terminated line for an arc!
|
||||
char *arcHeader = strstr(m_arcContentPtr,"\nhttp");
|
||||
if ( ! arcHeader ) {
|
||||
log("inject: arc: all done");
|
||||
return true;
|
||||
}
|
||||
// find end of url
|
||||
char *arcHeaderEnd = strstr (arcHeader+1,"\n");
|
||||
if ( ! arcHeaderEnd ) {
|
||||
log("inject: arc: no header end. all done");
|
||||
return true;
|
||||
}
|
||||
// term it
|
||||
*arcHeaderEnd = '\0';
|
||||
|
||||
char *arcRecord = arcHeaderEnd + 1;
|
||||
|
||||
|
||||
// parse arc header line
|
||||
char *arcUrl = arcHeader + 1;
|
||||
char *hp = arcUrl;
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 1.");
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0';
|
||||
m_injectUrlBuf.reset();
|
||||
m_injectUrlBuf.safeStrcpy(arcUrl);
|
||||
m_injectUrlBuf.nullTerm();
|
||||
|
||||
|
||||
char *ipStr = hp;
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 2.");
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0';
|
||||
gr->m_injectDocIp = atoip(ipStr);
|
||||
|
||||
char *timeStr = hp;
|
||||
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 3.");
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0'; // null term timeStr
|
||||
char *arcConType = hp;
|
||||
|
||||
for ( ; *hp && *hp != ' ' ; hp++ );
|
||||
if ( ! *hp ) {
|
||||
log("inject: bad arc header 4.");
|
||||
return true;
|
||||
}
|
||||
*hp++ = '\0'; // null term arcContentType
|
||||
|
||||
char *arcRecLenStr = hp;
|
||||
|
||||
// get arc content len
|
||||
int64_t arcRecLen = atoll(arcContentLenStr);
|
||||
char *arcRecEnd = arcContent + arcContentLen;
|
||||
|
||||
// we could also use m_contentFile if it was on disk
|
||||
gr->m_content = m_arcContentPtr;
|
||||
|
||||
// advance for loop
|
||||
m_arcContentPtr = arcRecEnd;
|
||||
|
||||
// null term this record
|
||||
m_savedChar = *m_arcContentPtr; *m_arcContentPtr = '\0';
|
||||
|
||||
|
||||
// convert to timestamp
|
||||
int64_t arcTime = 0;
|
||||
// this time structure, once filled, will help yield a time_t
|
||||
struct tm t;
|
||||
// DAY OF MONTH
|
||||
t.tm_mday = atol2 ( timeStr + 6 , 2 );
|
||||
// MONTH
|
||||
t.tm_mon = atol2 ( timeStr + 4 , 2 );
|
||||
// YEAR
|
||||
// # of years since 1900
|
||||
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
|
||||
// TIME
|
||||
t.tm_hour = atol2 ( timeStr + 8 , 2 );
|
||||
t.tm_min = atol2 ( timeStr + 10 , 2 );
|
||||
t.tm_sec = atol2 ( timeStr + 12 , 2 );
|
||||
// unknown if we're in daylight savings time
|
||||
t.tm_isdst = -1;
|
||||
// translate using mktime
|
||||
arcTime = timegm ( &t );
|
||||
|
||||
gr->m_firstIndexed = arcTime;
|
||||
gr->m_lastSpidered = arcTime;
|
||||
|
||||
// assume "start" has the http mime
|
||||
gr->m_hasMime = true;
|
||||
|
||||
// arcConType needs to indexable
|
||||
int32_t ct = getContentTypeFromStr ( arcConType );
|
||||
if ( ct != CT_HTML &&
|
||||
ct != CT_TEXT &&
|
||||
ct != CT_XML &&
|
||||
ct != CT_JSON ) {
|
||||
// read another arc record
|
||||
goto subdocLoop;
|
||||
}
|
||||
|
||||
// skip if robots.txt
|
||||
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
|
||||
m_injectUrlBuf.getLength() ) )
|
||||
goto subdocLoop;
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// TODO: set these based on the date in the warc mime!!
|
||||
//gr->m_firstIndexed = ;
|
||||
//gr->m_lastSpidered = ;
|
||||
// then process. this will scan over each delimeted
|
||||
// doc in the arc/warc file and inject each one individually.
|
||||
if ( ! m_msg7->inject ( m_masterCallback , m_masterState ) )
|
||||
// it would block, callback will be called later
|
||||
return false;
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
log("build: index arc error %s",mstrerror(g_errno));
|
||||
return NULL;
|
||||
}
|
||||
// loop it up
|
||||
goto subdocLoop;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// returns false if would block, true otherwise. returns true and sets g_errno on err
|
||||
bool XmlDoc::indexWarc ( ) {
|
||||
|
||||
int8_t *hc = getHopCount();
|
||||
if ( ! hc ) return true; // error?
|
||||
if ( hc == (void *)-1 ) return false;
|
||||
// first download
|
||||
char **warcContent = getUtf8Content();
|
||||
// return true with g_errno set on error
|
||||
if ( ! warcContent ) {
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
return true;
|
||||
}
|
||||
// would block? return false then
|
||||
if ( warcContent == (void *)-1 )
|
||||
return false;
|
||||
|
||||
// need this. it is almost 1MB in size, so alloc it
|
||||
if ( ! m_msg7 ) {
|
||||
try { m_msg7 = new ( Msg7 ); }
|
||||
catch ( ... ) {
|
||||
g_errno = ENOMEM;
|
||||
return true;
|
||||
}
|
||||
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
|
||||
}
|
||||
|
||||
// inject input parms:
|
||||
GigablastRequest *gr = &m_msg7->m_gr;
|
||||
// the cursor for scanning the subdocs
|
||||
if ( ! m_warcContentPtr ) {
|
||||
|
||||
// init the content cursor to point to the first subdoc
|
||||
m_warcContentPtr = warcContent;
|
||||
//m_warcContentEnd = warcContent + size_utf8Content;
|
||||
|
||||
// init the input parms
|
||||
memset ( gr , 0 , sizeof(GigablastRequest) );
|
||||
// reset it
|
||||
gr->m_spiderLinks = false;
|
||||
gr->m_injectLinks = false;
|
||||
// what happens if coll gets nuked from under us? use collnum
|
||||
gr->m_coll = cr->m_coll;
|
||||
gr->m_hopCount = *hc + 1;
|
||||
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
||||
gr->m_collnum = m_collnum;
|
||||
// will this work on a content delimeterized doc?
|
||||
gr->m_deleteUrl = m_deleteFromIndex;
|
||||
// each subdoc will have a mime since it is a warc
|
||||
gr->m_hasMime = true;
|
||||
|
||||
}
|
||||
|
||||
subdocLoop:
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// we had \0 terminated the end of the previous record, so put back
|
||||
if ( m_savedChar && ! *m_warcContentPtr ) *m_warcContentPtr = m_savedChar;
|
||||
|
||||
// find size of this warc record. parse this warc record
|
||||
char *mm = strstr(m_warcContentPtr,"Content-Length:");
|
||||
char *mmend = NULL;
|
||||
if ( mm ) mmend = strstr (mm,"\n");
|
||||
if ( ! mm || ! mmend ) {
|
||||
log("build: warc: all done");
|
||||
// XmlDoc.cpp checks for this to stop calling us
|
||||
//m_isDoneInjecting = true;
|
||||
return true;
|
||||
}
|
||||
// set 'recordSize' to the content-length
|
||||
char c = *mmend;
|
||||
*mmend = '\0';
|
||||
int64_t recordSize = atoll ( mm + 15 );
|
||||
*mmend = c;
|
||||
// set 'hend' to the end of this mime header
|
||||
char *warcMimeEnd = strstr ( mm, "\r\n\r\n");
|
||||
if ( ! warcMimeEnd ) {
|
||||
log("build: warc: could no mime header end. stopping.");
|
||||
// this is critical, so we gotta stop
|
||||
return true;
|
||||
}
|
||||
// tmp \0 that for these strstr() calls in the mime so if they miss
|
||||
// we don't scan a 1GB warc for them
|
||||
c = *warcMimeEnd;
|
||||
*warcMimeEnd = '\0';
|
||||
|
||||
char *warcUrl = strstr(m_warcContentPtr,"WARC-Target-URI:");
|
||||
char *warcType = strstr(m_warcContentPtr,"WARC-Type:");
|
||||
char *warcDate = strstr(m_warcContentPtr,"WARC-Date:");
|
||||
char *warcIp = strstr(m_warcContentPtr,"WARC-IP-Address:");
|
||||
|
||||
// advance
|
||||
if ( warcUrl ) warcUrl += 16;
|
||||
if ( warcType ) warcType += 10;
|
||||
if ( warcDate ) warcDate += 10;
|
||||
if ( warcIp ) warcIp += 17;
|
||||
|
||||
// restore
|
||||
*warcMimeEnd = c;
|
||||
|
||||
// skip the \r\n\r\n at the end of this subdoc's http mime
|
||||
warcMimeEnd += 4;
|
||||
|
||||
// we could also use m_contentFile if it was on disk
|
||||
gr->m_content = m_warcContentPtr;
|
||||
|
||||
// and point to the mime of the NEXT subdoc
|
||||
// before we call goto subdocLoop anywhere.
|
||||
m_warcContentPtr = warcMimeEnd + recordSize;
|
||||
|
||||
// null term end of this subdoc before injecting
|
||||
m_savedChar = *m_warcContentPtr;
|
||||
*m_warcContentPtr = '\0';
|
||||
|
||||
if ( ! warcType ) {
|
||||
log("inject: warc: could not find rec type");
|
||||
goto subdocLoop;
|
||||
}
|
||||
|
||||
if ( is_wspace_a(*warcType) ) warcType++;
|
||||
if ( is_wspace_a(*warcType) ) warcType++;
|
||||
|
||||
// WARC-Type:
|
||||
// do not index this record as a doc if it is not a
|
||||
// "WARC-Type: response" record.
|
||||
if ( strncmp(warcType,"response",8) != 0 )
|
||||
goto subdocLoop;
|
||||
|
||||
// skip this rec if url-less
|
||||
if ( ! warcUrl ) {
|
||||
log("inject: warc: could not find rec url");
|
||||
goto subdocLoop;
|
||||
}
|
||||
if ( ! warcDate ) {
|
||||
log("inject: warc: could not find rec date");
|
||||
goto subdocLoop;
|
||||
}
|
||||
|
||||
// skip spaces on all
|
||||
if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
|
||||
if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
|
||||
if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
|
||||
if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
|
||||
if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
|
||||
if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
|
||||
|
||||
// url must start with http:// or https://
|
||||
// it's probably like WARC-Target-URI: dns:www.xyz.com
|
||||
// so it is a dns response
|
||||
if ( strncmp(warcUrl,"http://" ,7) != 0 &&
|
||||
strncmp(warcUrl,"https://",8) != 0 )
|
||||
goto subdocLoop;
|
||||
|
||||
gr->m_injectDocIp = 0;
|
||||
|
||||
// get the record IP address from the warc header if there
|
||||
if ( warcIp ) {
|
||||
// get end of ip
|
||||
char *warcIpEnd = warcIp;
|
||||
// skip digits and periods
|
||||
while ( ! is_wspace_a(*warcIpEnd) ) warcIpEnd++;
|
||||
// we now have the ip address for doing ip: searches
|
||||
// this func is in ip.h
|
||||
gr->m_injectDocIp = atoip ( warcIp, warcIpEnd-warcIp );
|
||||
}
|
||||
|
||||
// convert date to timestamp
|
||||
int64_t warcTime = 0;
|
||||
if ( warcDate ) warcTime = atotime ( warcDate );
|
||||
gr->m_firstIndexed = warcTime;
|
||||
gr->m_lastSpidered = warcTime;
|
||||
// does this work?
|
||||
gr->m_hopCount = -1;
|
||||
gr->m_diffbotReply = 0;
|
||||
gr->m_newOnly = 0;
|
||||
// end of the url
|
||||
char *warcUrlEnd = warcUrl;
|
||||
for ( ; *warcUrlEnd && ! is_wspace_a(*warcUrlEnd) ;
|
||||
warcUrlEnd++ );
|
||||
// set it to that
|
||||
m_injectUrlBuf.reset();
|
||||
// by default append a -<ch64> to the provided url
|
||||
int32_t warcUrlLen = warcUrlEnd - warcUrl;
|
||||
m_injectUrlBuf.safeMemcpy(warcUrl,warcUrlLen);
|
||||
m_injectUrlBuf.nullTerm();
|
||||
// skip if robots.txt
|
||||
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
|
||||
m_injectUrlBuf.getLength() ) )
|
||||
goto subdocLoop;
|
||||
// all warc records have the http mime
|
||||
gr->m_hasMime = true;
|
||||
|
||||
// point to subdoc's mime again, not the warc mime for it, which is above it
|
||||
char *recMime = warcMimeEnd;
|
||||
// and find the next \r\n\r\n
|
||||
char *recMimeEnd = strstr ( recMime , "\r\n\r\n" );
|
||||
if ( ! recMimeEnd ) {
|
||||
log("inject: warc: no http mime.");
|
||||
goto subdocLoop;
|
||||
}
|
||||
// gotta include the \r\n\r\n in the mime length here otherwise mime.set fail
|
||||
recMimeEnd += 4;
|
||||
// should be a mime that starts with GET or POST
|
||||
HttpMime mime;
|
||||
if ( ! mime.set ( recMime, recMimeEnd - recMime , NULL ) ) {
|
||||
log("inject: warc: mime set failed ");
|
||||
goto subdocLoop;
|
||||
}
|
||||
// check content type. if bad advance to next rec.
|
||||
int ct = mime.getContentType();
|
||||
if ( ct != CT_HTML &&
|
||||
ct != CT_TEXT &&
|
||||
ct != CT_XML &&
|
||||
ct != CT_JSON )
|
||||
goto subdocLoop;
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// TODO: set these based on the date in the warc mime!!
|
||||
//gr->m_firstIndexed = ;
|
||||
//gr->m_lastSpidered = ;
|
||||
// then process. this will scan over each delimeted
|
||||
// doc in the arc/warc file and inject each one individually.
|
||||
if ( ! m_msg7->inject ( m_masterCallback , m_masterState ) )
|
||||
// it would block, callback will be called later
|
||||
return false;
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
log("build: index warc error %s",mstrerror(g_errno));
|
||||
return NULL;
|
||||
}
|
||||
// loop it up
|
||||
goto subdocLoop;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void getTitleRecBufWrapper ( void *state ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
// make sure has not been freed from under us!
|
||||
@ -10138,6 +10614,9 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// let's just parse out the meta tag by hand
|
||||
bool checkMeta = true;
|
||||
if ( isRobotsTxt ) checkMeta = false;
|
||||
// if we are a doc that consists of a sequence of sub-docs that
|
||||
// we are indexing/injecting then don't do this check.
|
||||
if ( isContainerDoc() ) checkMeta = false;
|
||||
if ( checkMeta ) {
|
||||
Url **mrup = getMetaRedirUrl();
|
||||
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
|
||||
@ -17926,6 +18405,11 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
|
||||
|
||||
bool skip = m_skipIframeExpansion;
|
||||
|
||||
// if we are a warc, arc or doc that consists of a sequence of
|
||||
// sub-docs that we are indexing/injecting then skip iframe expansion
|
||||
if ( isContainerDoc() )
|
||||
skip = true;
|
||||
|
||||
// or if this is set to true
|
||||
if ( skip ) {
|
||||
m_expandedUtf8Content = m_rawUtf8Content;
|
||||
|
5
XmlDoc.h
5
XmlDoc.h
@ -499,6 +499,8 @@ class XmlDoc {
|
||||
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
|
||||
bool indexDoc ( );
|
||||
bool indexDoc2 ( );
|
||||
bool indexArc ( ) ;
|
||||
bool indexWarc ( ) ;
|
||||
key_t *getTitleRecKey() ;
|
||||
//char *getSkipIndexing ( );
|
||||
char *prepareToMakeTitleRec ( ) ;
|
||||
@ -1048,6 +1050,9 @@ class XmlDoc {
|
||||
SafeBuf m_zbuf;
|
||||
SafeBuf m_kbuf;
|
||||
|
||||
class Msg7 *m_msg7;
|
||||
char *m_warcContentPtr;
|
||||
|
||||
// . same thing, a little more complicated
|
||||
// . these classes are only set on demand
|
||||
Xml m_xml;
|
||||
|
Loading…
x
Reference in New Issue
Block a user