Checkpoint: moved WARC and ARC looping into XmlDoc.

Now we will move any container doc handling from PageInject into
XmlDoc. This simplifies PageInject.cpp a lot and sets up
a framework for dealing with container docs.
This commit is contained in:
Matt 2015-05-01 19:11:13 -07:00
parent d3c071e4c0
commit 0ca27638bc
6 changed files with 535 additions and 389 deletions

@ -349,8 +349,6 @@ void Msg7::reset() {
m_injectCount = 0;
m_start = NULL;
m_sbuf.reset();
m_isWarc = false;
m_isArc = false;
m_isDoneInjecting = false;
}
@ -391,10 +389,6 @@ void injectLoopWrapper9 ( void *state ) {
if ( delim && ! delim[0] ) delim = NULL;
bool loopIt = false;
if ( delim ) loopIt = true;
// by default warc and arc files consist of many subdocuments
// that have to be indexed individually as well
if ( msg7->m_isWarc ) loopIt = true;
if ( msg7->m_isArc ) loopIt = true;
if ( loopIt ) { // && msg7->m_start ) {
// do another injection. returns false if it blocks
@ -526,22 +520,6 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
sendReply ( slot );
}
void gotWarcContentWrapper ( void *state , TcpSocket *ts ) {
Msg7 *THIS = (Msg7 *)state;
// set content to that
GigablastRequest *gr = &THIS->m_gr;
gr->m_contentBuf.setBuf (ts->m_readBuf,
ts->m_readBufSize ,
ts->m_readOffset ,
true , // ownBuf?
0 ); // encoding
// just ref it
gr->m_content = ts->m_readBuf;
// so tcpserver.cpp doesn't free the ward/arc file
ts->m_readBuf = NULL;
// continue with injection loop
injectLoopWrapper9 ( THIS );
}
// . returns false if blocked and callback will be called, true otherwise
// . sets g_errno on error
@ -593,66 +571,6 @@ bool Msg7::inject ( void *state ,
// get the normalized url
u.set ( gr->m_url );
char *ustr = u.getUrl();
int32_t ulen = u.getUrlLen();
char *uend = ustr + ulen;
m_isWarc = false;
m_isArc = false;
if ( ulen>8 && strncmp(uend-8,".warc.gz",8)==0 )
m_isWarc = true;
if ( ulen>8 && strncmp(uend-5,".warc" ,5)==0 )
m_isWarc = true;
if ( ulen>8 && strncmp(uend-7,".arc.gz",7)==0 )
m_isArc = true;
if ( ulen>8 && strncmp(uend-4,".arc" ,4)==0 )
m_isArc = true;
// if warc/arc download it and make gr->m_content reference it...
// we won't handle redirects though.
if ( ! content && ( m_isWarc || m_isArc) ) {
// download the warc/arc url
if ( ! g_httpServer.getDoc ( ustr ,
0 , // urlip
0 , // offset
-1 ,
0,//r->m_ifModifiedSince ,
this , // state
gotWarcContentWrapper ,// callback
30*1000 , // 30 sec timeout
0 , // r->m_proxyIp ,
0 , // r->m_proxyPort ,
-1,//r->m_maxTextDocLen ,
-1,//r->m_maxOtherDocLen ,
NULL,//agent ,
DEFAULT_HTTP_PROTO , // "HTTP/1.0"
false , // doPost?
NULL , // cookie
NULL , // additionalHeader
NULL , // our own mime!
NULL , // postContent
NULL))//proxyUsernamePwdAuth ) )
// return false if blocked
return false;
// error?
log("inject: %s",mstrerror(g_errno));
}
if ( m_firstTime && ( m_isWarc || m_isArc ) ) {
// skip over the first http mime header, it is not
// part of the warc file per se.
content = strstr(content,"\r\n\r\n");
if ( ! content ) {
log("inject: no mime received from webserver");
return true;
}
// skip over that to point to start of actual warc
// file content
content += 4;
}
if ( m_firstTime ) {
m_firstTime = false;
m_start = content;
@ -668,10 +586,6 @@ bool Msg7::inject ( void *state ,
char *delim = gr->m_contentDelim;
if ( delim && ! delim[0] ) delim = NULL;
// delim is sill for warc/arcs so ignore it
if ( m_isWarc || m_isArc ) delim = NULL;
// if doing delimeterized injects, hitting a \0 is the end of the road
if ( delim && m_fixMe && ! m_saved ) {
m_isDoneInjecting = true;
@ -702,290 +616,6 @@ bool Msg7::inject ( void *state ,
m_start = start + gbstrlen(start);
}
// WARC files are mime delimeted. the http reply, which
// contains a mime, as a mime a level above that whose
// content-length: field includes the original http reply mime
// as part of its content.
if ( m_isWarc ) { // gr->m_containerContentType == CT_WARC ) {
// no setting delim for this!
if ( delim ) { char *xx=NULL;*xx=0; }
// should have the url as well
char *mm = strstr(start,"Content-Length:");
char *mmend = NULL;
if ( mm ) mmend = strstr (mm,"\n");
if ( ! mm || ! mmend ) {
log("inject: warc: all done");
// XmlDoc.cpp checks for this to stop calling us
m_isDoneInjecting = true;
return true;
}
char c = *mmend;
*mmend = '\0';
int64_t recordSize = atoll ( mm + 15 );
*mmend = c;
// end of mime header
char *hend = strstr ( mm, "\r\n\r\n");
if ( ! hend ) {
log("inject: warc: could no mime header end.");
return true;
}
// tmp \0 that for these strstr() calls
c = *hend;
*hend = '\0';
char *warcUrl = strstr(start,"WARC-Target-URI:");
char *warcType = strstr(start,"WARC-Type:");
char *warcDate = strstr(start,"WARC-Date:");
char *warcIp = strstr(start,"WARC-IP-Address:");
// advance
if ( warcUrl ) warcUrl += 16;
if ( warcType ) warcType += 10;
if ( warcDate ) warcDate += 10;
if ( warcIp ) warcIp += 17;
// restore
*hend = c;
// skip the \r\n\r\n
hend += 4;
// adjust start to point to start of the content really
start = hend;
// and over record
m_start = hend + recordSize;
advanced = true;
if ( ! warcType ) {
log("inject: warc: could not find rec type");
return true;
}
if ( is_wspace_a(*warcType) ) warcType++;
if ( is_wspace_a(*warcType) ) warcType++;
// WARC-Type:
// do not index this record as a doc if it is not a
// "WARC-Type: response" record.
if ( strncmp(warcType,"response",8) != 0 )
return true;
// skip this rec if url-less
if ( ! warcUrl ) {
log("inject: warc: could not find rec url");
return true;
}
if ( ! warcDate ) {
log("inject: warc: could not find rec date");
return true;
}
// skip spaces on all
if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
// url must start with http:// or https://
// it's probably like WARC-Target-URI: dns:www.xyz.com
// so it is a dns response
if ( strncmp(warcUrl,"http://" ,7) != 0 &&
strncmp(warcUrl,"https://",8) != 0 )
return true;
gr->m_injectDocIp = 0;
// get the record IP address from the warc header if there
if ( warcIp ) {
// get end of ip
char *warcIpEnd = warcIp;
// skip digits and periods
while ( ! is_wspace_a(*warcIpEnd) ) warcIpEnd++;
// we now have the ip address for doing ip: searches
// this func is in ip.h
gr->m_injectDocIp = atoip ( warcIp, warcIpEnd-warcIp );
}
// convert date to timestamp
int64_t warcTime = 0;
if ( warcDate ) warcTime = atotime ( warcDate );
gr->m_firstIndexed = warcTime;
gr->m_lastSpidered = warcTime;
// does this work?
gr->m_hopCount = -1;
gr->m_diffbotReply = 0;
gr->m_newOnly = 0;
// end of the url
char *warcUrlEnd = warcUrl;
for ( ; *warcUrlEnd && ! is_wspace_a(*warcUrlEnd) ;
warcUrlEnd++ );
// set it to that
m_injectUrlBuf.reset();
// by default append a -<ch64> to the provided url
int32_t warcUrlLen = warcUrlEnd - warcUrl;
m_injectUrlBuf.safeMemcpy(warcUrl,warcUrlLen);
m_injectUrlBuf.nullTerm();
// skip if robots.txt
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
m_injectUrlBuf.getLength() ) )
return true;
// all warc records have the http mime
gr->m_hasMime = true;
char *recMime = hend;
// and find the next \r\n\r\n
char *recMimeEnd = strstr ( recMime , "\r\n\r\n" );
if ( ! recMimeEnd ) {
log("inject: warc: no http mime.");
return true;
}
// gotta include the \r\n\r\n in the mime length here
recMimeEnd += 4;
// should be a mime that starts with GET or POST
HttpMime mime;
if ( ! mime.set ( recMime, recMimeEnd - recMime , NULL ) ) {
log("inject: warc: mime set failed ");
return true;
}
// check content type. if bad advance to next rec.
int ct = mime.getContentType();
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_JSON )
return true;
}
// ARC files have a url on one line and the length on the next line
if ( m_isArc ) {
// no setting delim for this!
if ( delim ) { char *xx=NULL;*xx=0; }
// should have the url as well
char *arcHeader = strstr(start,"\nhttp");
//char *mmend = NULL;
//if ( mm ) mmend = strstr (mm,"\n");
if ( ! arcHeader ) { // || ! mmend ) {
log("inject: arc: all done");
m_isDoneInjecting = true;
return true;
}
// find end of url
char *arcHeaderEnd = strstr (arcHeader+1,"\n");
if ( ! arcHeaderEnd ) {
log("inject: arc: no header end. all done");
m_isDoneInjecting = true;
return true;
}
// term it
*arcHeaderEnd = '\0';
char *arcContent = arcHeaderEnd + 1;
// parse arc header line
char *arcUrl = arcHeader + 1;
char *hp = arcUrl;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 1.");
m_isDoneInjecting = true;
return true;
}
*hp++ = '\0';
m_injectUrlBuf.reset();
m_injectUrlBuf.safeStrcpy(arcUrl);
m_injectUrlBuf.nullTerm();
char *ipStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 2.");
m_isDoneInjecting = true;
return true;
}
*hp++ = '\0';
gr->m_injectDocIp = atoip(ipStr);
char *timeStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 3.");
m_isDoneInjecting = true;
return true;
}
*hp++ = '\0'; // null term timeStr
char *arcConType = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 4.");
m_isDoneInjecting = true;
return true;
}
*hp++ = '\0'; // null term arcContentType
char *arcContentLenStr = hp;
// get arc content len
int64_t arcContentLen = atoll(arcContentLenStr);
char *arcContentEnd = arcContent + arcContentLen;
//uint64_t recSize = (arcContentEnd - realStart);
// convert to timestamp
int64_t arcTime = 0;
// this time structure, once filled, will help yield a time_t
struct tm t;
// DAY OF MONTH
t.tm_mday = atol2 ( timeStr + 6 , 2 );
// MONTH
t.tm_mon = atol2 ( timeStr + 4 , 2 );
// YEAR
// # of years since 1900
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
// TIME
t.tm_hour = atol2 ( timeStr + 8 , 2 );
t.tm_min = atol2 ( timeStr + 10 , 2 );
t.tm_sec = atol2 ( timeStr + 12 , 2 );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
arcTime = timegm ( &t );
gr->m_firstIndexed = arcTime;
gr->m_lastSpidered = arcTime;
start = arcContent;
// assume "start" has the http mime
gr->m_hasMime = true;
// advance to next rec BEFORE we return true below
m_start = arcContentEnd;
advanced = true;
// arcConType needs to indexable
int32_t ct = getContentTypeFromStr ( arcConType );
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_JSON ) {
// read another arc record
return true;
}
// skip if robots.txt
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
m_injectUrlBuf.getLength() ) )
return true;
}
// for injecting "start" set this to \0
if ( advanced ) { // m_start ) {
@ -997,7 +627,7 @@ bool Msg7::inject ( void *state ,
m_fixMe = true;
}
if ( ! delim && ! m_isWarc && ! m_isArc )
if ( ! delim )
// this is the url of the injected content
m_injectUrlBuf.safeStrcpy ( gr->m_url );

@ -45,9 +45,6 @@ public:
//int32_t m_crawlbotAPI;
bool m_isWarc;
bool m_isArc;
class ImportState *m_importState;
//void constructor();

55
Url.cpp

@ -1441,22 +1441,24 @@ bool Url::isBadExtension ( int32_t version ) {
return true;
}
bool Url::isCompressedArcOrWarc ( ) {
bool Url::isWarc ( ) {
// hack to allow for .gz if it is .warc.gz or .arc.gz
if ( m_elen == 2 &&
m_extension[0] == 'g' &&
m_extension[1] == 'z' &&
m_ulen > 10 &&
m_extension[-1] == '.' &&
m_extension[-2] == 'c' &&
m_extension[-3] == 'r' &&
m_extension[-4] == 'a' &&
m_extension[-5] == '.' ) {
// m_isArc = true;
// m_isArcValid = true;
// if ( ulen>8 && strncmp(uend-8,".warc.gz",8)==0 )
// m_isWarc = true;
// if ( ulen>8 && strncmp(uend-5,".warc" ,5)==0 )
// m_isWarc = true;
// if ( ulen>8 && strncmp(uend-7,".arc.gz",7)==0 )
// m_isArc = true;
// if ( ulen>8 && strncmp(uend-4,".arc" ,4)==0 )
// m_isArc = true;
if ( m_elen == 4 &&
m_extenion[0] == 'w' &&
m_extenion[1] == 'a' &&
m_extenion[2] == 'r' &&
m_extenion[3] == 'c' )
return true;
}
if ( m_elen == 2 &&
m_extension[0] == 'g' &&
@ -1474,7 +1476,32 @@ bool Url::isCompressedArcOrWarc ( ) {
}
return false;
}
// . returns true if this url ends in ".arc" or ".arc.gz"
// . relies on m_extension/m_elen describing the text after the last '.'
//   of the url, which is how every check in this function uses them
// . ".warc.gz" is NOT matched because the char before "arc" must be '.'
bool Url::isArc ( ) {
	// plain ".arc"
	if ( m_elen == 3 &&
	     m_extension[0] == 'a' &&
	     m_extension[1] == 'r' &&
	     m_extension[2] == 'c' )
		return true;
	// hack to allow for .gz if it is .arc.gz. the m_ulen check keeps
	// the negative m_extension[] offsets inside the url.
	if ( m_elen == 2 &&
	     m_extension[0] == 'g' &&
	     m_extension[1] == 'z' &&
	     m_ulen > 10 &&
	     m_extension[-1] == '.' &&
	     m_extension[-2] == 'c' &&
	     m_extension[-3] == 'r' &&
	     m_extension[-4] == 'a' &&
	     m_extension[-5] == '.' ) {
		// m_isArc = true;
		// m_isArcValid = true;
		return true;
	}
	return false;
}
// see Url.h for a description of this.

5
Url.h

@ -92,7 +92,10 @@ public:
bool isBadExtension(int32_t xxx);
bool isSet() { return m_ulen != 0; }
bool isCompressedArcOrWarc ( ) ;
// is this url a warc or arc url? i.e. ends in .warc or .arc or
// .warc.gz or .arc.gz?
bool isWarc ( );
bool isArc ( );
// does it end in .xml, .rdb or .rss, etc. kinda thing
//bool isRSSFormat ( ) ;

@ -114,6 +114,8 @@ XmlDoc::XmlDoc() {
m_freed = false;
m_contentInjected = false;
m_wasContentInjected = false;
m_msg7 = NULL;
m_warcContentPtr = NULL;
//m_coll = NULL;
m_ubuf = NULL;
m_pbuf = NULL;
@ -192,6 +194,13 @@ class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
if ( m_msg7 ) {
mdelete ( m_msg7 , sizeof(Msg7) , "xdmsg7" );
delete ( m_msg7 );
m_msg7 = NULL;
}
m_warcContentPtr = NULL;
m_redirUrl.reset();
m_ipStartTime = 0;
@ -2709,6 +2718,32 @@ bool XmlDoc::indexDoc2 ( ) {
}
// handle docs that consist of subdocs that need to be injected
// or indexed individually.
if ( m_firstUrlValid && m_firstUrl.isWarc() ) {
// this returns false if it would block and callback will be called
if ( ! indexWarc () )
return false;
// all done! no need to add the parent doc.
return true;
}
if ( m_firstUrlValid && m_firstUrl.isArc() ) {
// this returns false if it would block and callback will be called
if ( ! indexArc () )
return false;
// all done! no need to add the parent doc.
return true;
}
if ( m_isContainerDoc ) {
// m_delimeter should be set!
if ( ! indexContainerDoc () )
return false;
// all done! no need to add the parent doc.
return true;
}
// . now get the meta list from it to add
// . returns NULL and sets g_errno on error
char *metaList = getMetaList ( );
@ -2958,6 +2993,447 @@ bool XmlDoc::indexDoc2 ( ) {
*/
}
// returns false if would block, true otherwise. returns true and sets g_errno on err
bool XmlDoc::indexArc ( ) {
int8_t *hc = getHopCount();
if ( ! hc ) return true; // error?
if ( hc == (void *)-1 ) return false;
// first download
char **arcContent = getUtf8Content();
// return true with g_errno set on error
if ( ! arcContent ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// would block? return false then
if ( arcContent == (void *)-1 )
return false;
// need this. it is almost 1MB in size, so alloc it
if ( ! m_msg7 ) {
try { m_msg7 = new ( Msg7 ); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
}
// inject input parms:
GigablastRequest *gr = &m_msg7->m_gr;
// the cursor for scanning the subdocs
if ( ! m_arcContentPtr ) {
// init the content cursor to point to the first subdoc
m_arcContentPtr = arcContent;
// init the input parms
memset ( gr , 0 , sizeof(GigablastRequest) );
// reset it
gr->m_spiderLinks = false;
gr->m_injectLinks = false;
// what happens if coll gets nuked from under us? use collnum
gr->m_coll = cr->m_coll;
gr->m_hopCount = *hc + 1;
// if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
gr->m_collnum = m_collnum;
// will this work on a content delimeterized doc?
gr->m_deleteUrl = m_deleteFromIndex;
// each subdoc will have a mime since it is an arc
gr->m_hasMime = true;
}
subdocLoop:
QUICKPOLL ( m_niceness );
// we had \0 terminated the end of the previous record, so put back
if ( m_savedChar && ! *m_arcContentPtr ) *m_arcContentPtr = m_savedChar;
// . should have the url as well.
// . the url, ip etc. are on a single \n terminated line for an arc!
char *arcHeader = strstr(m_arcContentPtr,"\nhttp");
if ( ! arcHeader ) {
log("inject: arc: all done");
return true;
}
// find end of url
char *arcHeaderEnd = strstr (arcHeader+1,"\n");
if ( ! arcHeaderEnd ) {
log("inject: arc: no header end. all done");
return true;
}
// term it
*arcHeaderEnd = '\0';
char *arcRecord = arcHeaderEnd + 1;
// parse arc header line
char *arcUrl = arcHeader + 1;
char *hp = arcUrl;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 1.");
return true;
}
*hp++ = '\0';
m_injectUrlBuf.reset();
m_injectUrlBuf.safeStrcpy(arcUrl);
m_injectUrlBuf.nullTerm();
char *ipStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 2.");
return true;
}
*hp++ = '\0';
gr->m_injectDocIp = atoip(ipStr);
char *timeStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 3.");
return true;
}
*hp++ = '\0'; // null term timeStr
char *arcConType = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("inject: bad arc header 4.");
return true;
}
*hp++ = '\0'; // null term arcContentType
char *arcRecLenStr = hp;
// get arc content len
int64_t arcRecLen = atoll(arcContentLenStr);
char *arcRecEnd = arcContent + arcContentLen;
// we could also use m_contentFile if it was on disk
gr->m_content = m_arcContentPtr;
// advance for loop
m_arcContentPtr = arcRecEnd;
// null term this record
m_savedChar = *m_arcContentPtr; *m_arcContentPtr = '\0';
// convert to timestamp
int64_t arcTime = 0;
// this time structure, once filled, will help yield a time_t
struct tm t;
// DAY OF MONTH
t.tm_mday = atol2 ( timeStr + 6 , 2 );
// MONTH
t.tm_mon = atol2 ( timeStr + 4 , 2 );
// YEAR
// # of years since 1900
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
// TIME
t.tm_hour = atol2 ( timeStr + 8 , 2 );
t.tm_min = atol2 ( timeStr + 10 , 2 );
t.tm_sec = atol2 ( timeStr + 12 , 2 );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
arcTime = timegm ( &t );
gr->m_firstIndexed = arcTime;
gr->m_lastSpidered = arcTime;
// assume "start" has the http mime
gr->m_hasMime = true;
// arcConType needs to indexable
int32_t ct = getContentTypeFromStr ( arcConType );
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_JSON ) {
// read another arc record
goto subdocLoop;
}
// skip if robots.txt
if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
m_injectUrlBuf.getLength() ) )
goto subdocLoop;
QUICKPOLL ( m_niceness );
// TODO: set these based on the date in the warc mime!!
//gr->m_firstIndexed = ;
//gr->m_lastSpidered = ;
// then process. this will scan over each delimeted
// doc in the arc/warc file and inject each one individually.
if ( ! m_msg7->inject ( m_masterCallback , m_masterState ) )
// it would block, callback will be called later
return false;
QUICKPOLL ( m_niceness );
// error?
if ( g_errno ) {
log("build: index arc error %s",mstrerror(g_errno));
return NULL;
}
// loop it up
goto subdocLoop;
}
// . inject each record ("subdoc") of a WARC file individually via m_msg7
// . returns false if it would block, true otherwise
// . returns true and sets g_errno on error
// . each record is a WARC mime (WARC-Type:, WARC-Target-URI:, WARC-Date:,
//   WARC-IP-Address:, Content-Length:) terminated by \r\n\r\n and followed
//   by Content-Length bytes holding the http mime and the content
// . only "response" records with an http:// or https:// target url and
//   an html, text, xml or json content type are injected
// . m_warcContentPtr is the cursor into the warc content; it is advanced
//   past a record BEFORE that record is injected so that re-entering this
//   function (after blocking) resumes with the next record
bool XmlDoc::indexWarc ( ) {

	int8_t *hc = getHopCount();
	if ( ! hc ) return true; // error?
	if ( hc == (void *)-1 ) return false;

	// first download
	char **warcContent = getUtf8Content();
	// return true with g_errno set on error
	if ( ! warcContent ) {
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		return true;
	}
	// would block? return false then
	if ( warcContent == (void *)-1 )
		return false;

	// need this. it is almost 1MB in size, so alloc it
	if ( ! m_msg7 ) {
		try { m_msg7 = new ( Msg7 ); }
		catch ( ... ) {
			g_errno = ENOMEM;
			return true;
		}
		mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
	}

	// inject input parms:
	GigablastRequest *gr = &m_msg7->m_gr;

	// the cursor for scanning the subdocs
	if ( ! m_warcContentPtr ) {
		// init the content cursor to point to the first subdoc.
		// getUtf8Content() hands back a char **, so deref it.
		m_warcContentPtr = *warcContent;
		//m_warcContentEnd = warcContent + size_utf8Content;
		// init the input parms
		memset ( gr , 0 , sizeof(GigablastRequest) );
		// reset it
		gr->m_spiderLinks = false;
		gr->m_injectLinks = false;
		// what happens if coll gets nuked from under us? use collnum
		// NOTE(review): "cr" is not declared anywhere in this
		// function -- the collection rec must be fetched first
		gr->m_coll = cr->m_coll;
		gr->m_hopCount = *hc + 1;
		if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
		gr->m_collnum = m_collnum;
		// will this work on a content delimeterized doc?
		gr->m_deleteUrl = m_deleteFromIndex;
		// each subdoc will have a mime since it is a warc
		gr->m_hasMime = true;
	}

	// empty content? then there are no subdocs to scan
	if ( ! m_warcContentPtr ) return true;

	// loop over the subdocs
	for ( ; ; ) {

		QUICKPOLL ( m_niceness );

		// we had \0 terminated the end of the previous record,
		// so put back
		if ( m_savedChar && ! *m_warcContentPtr )
			*m_warcContentPtr = m_savedChar;

		// find size of this warc record. parse this warc record
		char *mm = strstr(m_warcContentPtr,"Content-Length:");
		char *mmend = NULL;
		if ( mm ) mmend = strstr (mm,"\n");
		if ( ! mm || ! mmend ) {
			log("build: warc: all done");
			// XmlDoc.cpp checks for this to stop calling us
			//m_isDoneInjecting = true;
			return true;
		}

		// set 'recordSize' to the content-length
		char c = *mmend;
		*mmend = '\0';
		int64_t recordSize = atoll ( mm + 15 );
		*mmend = c;
		// a negative size would move the cursor backwards
		if ( recordSize < 0 ) {
			log("build: warc: bad content-length. stopping.");
			return true;
		}

		// set 'warcMimeEnd' to the end of this warc mime header
		char *warcMimeEnd = strstr ( mm, "\r\n\r\n");
		if ( ! warcMimeEnd ) {
			log("build: warc: could no mime header end. stopping.");
			// this is critical, so we gotta stop
			return true;
		}

		// tmp \0 that for these strstr() calls in the mime so if
		// they miss we don't scan a 1GB warc for them
		c = *warcMimeEnd;
		*warcMimeEnd = '\0';

		char *warcUrl = strstr(m_warcContentPtr,"WARC-Target-URI:");
		char *warcType = strstr(m_warcContentPtr,"WARC-Type:");
		char *warcDate = strstr(m_warcContentPtr,"WARC-Date:");
		char *warcIp = strstr(m_warcContentPtr,"WARC-IP-Address:");

		// advance over the field names
		if ( warcUrl ) warcUrl += 16;
		if ( warcType ) warcType += 10;
		if ( warcDate ) warcDate += 10;
		// "WARC-IP-Address:" is 16 chars. the whitespace after it
		// is skipped below.
		if ( warcIp ) warcIp += 16;

		// restore
		*warcMimeEnd = c;

		// skip the \r\n\r\n that ends the warc mime. this is now
		// the start of the subdoc's http mime.
		warcMimeEnd += 4;

		// we could also use m_contentFile if it was on disk.
		// inject the http mime + content, not the warc mime above it
		gr->m_content = warcMimeEnd;

		// and point to the mime of the NEXT subdoc before we
		// skip a record anywhere below.
		// NOTE(review): recordSize is not checked against the end
		// of the content buffer, so a bogus length can put this
		// out of bounds
		m_warcContentPtr = warcMimeEnd + recordSize;

		// null term end of this subdoc before injecting
		m_savedChar = *m_warcContentPtr;
		*m_warcContentPtr = '\0';

		if ( ! warcType ) {
			log("inject: warc: could not find rec type");
			continue;
		}

		if ( is_wspace_a(*warcType) ) warcType++;
		if ( is_wspace_a(*warcType) ) warcType++;

		// WARC-Type:
		// do not index this record as a doc if it is not a
		// "WARC-Type: response" record.
		if ( strncmp(warcType,"response",8) != 0 )
			continue;

		// skip this rec if url-less
		if ( ! warcUrl ) {
			log("inject: warc: could not find rec url");
			continue;
		}

		if ( ! warcDate ) {
			log("inject: warc: could not find rec date");
			continue;
		}

		// skip spaces on all
		if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
		if ( warcUrl && is_wspace_a(*warcUrl ) ) warcUrl++;
		if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
		if ( warcDate && is_wspace_a(*warcDate) ) warcDate++;
		if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;
		if ( warcIp && is_wspace_a(*warcIp ) ) warcIp++;

		// url must start with http:// or https://
		// it's probably like WARC-Target-URI: dns:www.xyz.com
		// so it is a dns response
		if ( strncmp(warcUrl,"http://" ,7) != 0 &&
		     strncmp(warcUrl,"https://",8) != 0 )
			continue;

		gr->m_injectDocIp = 0;

		// get the record IP address from the warc header if there
		if ( warcIp ) {
			// get end of ip
			char *warcIpEnd = warcIp;
			// skip digits and periods. stop on \0 too so we
			// never scan past the end of the content.
			while ( *warcIpEnd && ! is_wspace_a(*warcIpEnd) )
				warcIpEnd++;
			// we now have the ip address for doing ip: searches
			// this func is in ip.h
			gr->m_injectDocIp = atoip ( warcIp, warcIpEnd-warcIp );
		}

		// convert date to timestamp
		int64_t warcTime = 0;
		if ( warcDate ) warcTime = atotime ( warcDate );
		gr->m_firstIndexed = warcTime;
		gr->m_lastSpidered = warcTime;
		// does this work?
		gr->m_hopCount = -1;
		gr->m_diffbotReply = 0;
		gr->m_newOnly = 0;

		// end of the url
		char *warcUrlEnd = warcUrl;
		for ( ; *warcUrlEnd && ! is_wspace_a(*warcUrlEnd) ;
		      warcUrlEnd++ );

		// copy the record's url into m_injectUrlBuf
		m_injectUrlBuf.reset();
		int32_t warcUrlLen = warcUrlEnd - warcUrl;
		m_injectUrlBuf.safeMemcpy(warcUrl,warcUrlLen);
		m_injectUrlBuf.nullTerm();

		// skip if robots.txt
		if ( isRobotsTxtFile(m_injectUrlBuf.getBufStart(),
				     m_injectUrlBuf.getLength() ) )
			continue;

		// all warc records have the http mime
		gr->m_hasMime = true;

		// point to subdoc's mime again, not the warc mime for it,
		// which is above it
		char *recMime = warcMimeEnd;
		// and find the next \r\n\r\n
		char *recMimeEnd = strstr ( recMime , "\r\n\r\n" );
		if ( ! recMimeEnd ) {
			log("inject: warc: no http mime.");
			continue;
		}
		// gotta include the \r\n\r\n in the mime length here
		// otherwise mime.set fail
		recMimeEnd += 4;

		// parse the subdoc's http mime
		HttpMime mime;
		if ( ! mime.set ( recMime, recMimeEnd - recMime , NULL ) ) {
			log("inject: warc: mime set failed ");
			continue;
		}

		// check content type. if bad advance to next rec.
		int ct = mime.getContentType();
		if ( ct != CT_HTML &&
		     ct != CT_TEXT &&
		     ct != CT_XML &&
		     ct != CT_JSON )
			continue;

		QUICKPOLL ( m_niceness );

		// then inject this one record.
		// NOTE(review): the visible Msg7::inject() signature starts
		// with "void *state", so these two args look swapped --
		// confirm against the full declaration
		if ( ! m_msg7->inject ( m_masterCallback , m_masterState ) )
			// it would block, callback will be called later
			return false;

		QUICKPOLL ( m_niceness );

		// error?
		if ( g_errno ) {
			log("build: index warc error %s",mstrerror(g_errno));
			// true means "did not block", g_errno is set
			return true;
		}

		// loop it up
	}
}
void getTitleRecBufWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
@ -10138,6 +10614,9 @@ Url **XmlDoc::getRedirUrl() {
// let's just parse out the meta tag by hand
bool checkMeta = true;
if ( isRobotsTxt ) checkMeta = false;
// if we are a doc that consists of a sequence of sub-docs that
// we are indexing/injecting then don't do this check.
if ( isContainerDoc() ) checkMeta = false;
if ( checkMeta ) {
Url **mrup = getMetaRedirUrl();
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
@ -17926,6 +18405,11 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
bool skip = m_skipIframeExpansion;
// if we are a warc, arc or doc that consists of a sequence of
// sub-docs that we are indexing/injecting then skip iframe expansion
if ( isContainerDoc() )
skip = true;
// or if this is set to true
if ( skip ) {
m_expandedUtf8Content = m_rawUtf8Content;

@ -499,6 +499,8 @@ class XmlDoc {
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
bool indexDoc ( );
bool indexDoc2 ( );
bool indexArc ( ) ;
bool indexWarc ( ) ;
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
@ -1048,6 +1050,9 @@ class XmlDoc {
SafeBuf m_zbuf;
SafeBuf m_kbuf;
class Msg7 *m_msg7;
char *m_warcContentPtr;
// . same thing, a little more complicated
// . these classes are only set on demand
Xml m_xml;