checkpoint

This commit is contained in:
Matt
2015-05-03 12:55:19 -07:00
parent 91d9179e46
commit a07c94f85d
2 changed files with 155 additions and 136 deletions

@ -2786,7 +2786,7 @@ bool XmlDoc::indexDoc2 ( ) {
if ( m_firstUrlValid && m_firstUrl.isWarc() ) {
// this returns false if it would block and callback will be
// called
if ( ! indexWarc () )
if ( ! indexWarcOrArc ( CT_WARC ) )
return false;
logIt();
// all done! no need to add the parent doc.
@ -2796,7 +2796,7 @@ bool XmlDoc::indexDoc2 ( ) {
if ( m_firstUrlValid && m_firstUrl.isArc() ) {
// this returns false if it would block and callback will be
// called
if ( ! indexArc () )
if ( ! indexWarcOrArc ( CT_ARC ) )
return false;
logIt();
// all done! no need to add the parent doc.
@ -3446,7 +3446,8 @@ void doneInjectingWarcRec ( void *state ) {
// . returns false if would block, true otherwise.
// . returns true and sets g_errno on err
// . injectwarc
bool XmlDoc::indexWarc ( ) {
// . ctype is CT_WARC or CT_ARC respectively
bool XmlDoc::indexWarcOrArc ( char ctype ) {
int8_t *hc = getHopCount();
if ( ! hc ) return true; // error?
@ -3566,137 +3567,158 @@ bool XmlDoc::indexWarc ( ) {
goto readMore;
}
// find "WARC/1.0" or whatever
char *whp = m_fptr;
// we do terminate last warc rec with \0 so be aware of that...
int32_t maxCount = 10;
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++ );
// none?
if ( ! *whp ) {
log("build: could not find WARC/1 header start for file=%s",
file->getFilename());
// we don't really need this and since we force the http
// reply to end in \0 before calling inject2() on it it
// gets messed up
goto warcDone;
}
char *warcHeader = whp;
// find end of warc mime HEADER not the content
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
if ( ! warcHeaderEnd ) {
log("build: could not find end of WARC header for file=%s.",
file->getFilename());
goto warcDone;
}
// \0 term for strstrs below
*warcHeaderEnd = '\0';
//warcHeaderEnd += 4;
char *warcLen = strstr(warcHeader,"Content-Length:");
char *warcUrl = strstr(warcHeader,"WARC-Target-URI:");
char *warcType = strstr(warcHeader,"WARC-Type:");
char *warcDate = strstr(warcHeader,"WARC-Date:");
char *warcIp = strstr(warcHeader,"WARC-IP-Address:");
char *warcCon = strstr(warcHeader,"Content-Type:");
// advance
if ( warcLen ) warcLen += 15;
if ( warcUrl ) warcUrl += 16;
if ( warcType ) warcType += 10;
if ( warcIp ) warcIp += 17;
if ( warcCon ) warcCon += 13;
if ( warcDate ) warcDate += 10;
// skip initial spaces
for ( ; warcUrl && is_wspace_a(*warcUrl ) ; warcUrl ++ );
for ( ; warcLen && is_wspace_a(*warcLen ) ; warcLen ++ );
for ( ; warcType && is_wspace_a(*warcType) ; warcType++ );
for ( ; warcDate && is_wspace_a(*warcDate) ; warcDate++ );
for ( ; warcIp && is_wspace_a(*warcIp ) ; warcIp ++ );
for ( ; warcCon && is_wspace_a(*warcCon ) ; warcCon ++ );
// get Content-Length: of WARC header for its content
if ( ! warcLen ) {
// this is a critical stop.
log("build: could not find WARC Content-Length:");
goto warcDone;
}
//
// advance m_fptr to point to the next warc record in case we
// end up calling 'goto loop' below
//
char *warcContent = warcHeaderEnd + 4;
int64_t warcContentLen = atoll(warcLen);
char *warcContentEnd = warcContent + warcContentLen;
uint64_t recSize = (warcContentEnd - realStart);
// point to the next warc record
m_fptr += recSize;
int64_t recTime = 0;
char *recIp = NULL;
char *recUrl = NULL;
char *recContent = NULL;
int64_t recContentLen = 0;
// what we skip over
uint64_t recSize = 0;
uint64_t oldOff = m_fileOff;
// advance the file offset to the next record as well
m_fileOff += recSize;
if ( ctype == CT_WARC ) {
// find "WARC/1.0" or whatever
char *whp = m_fptr;
// we do terminate last warc rec with \0 so be aware of that...
int32_t maxCount = 10;
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++);
// none?
if ( ! *whp ) {
log("build: could not find WARC/1 header start for "
"file=%s", file->getFilename());
// we don't really need this and since we force the
// http reply to end in \0 before calling inject2() on
// it it gets messed up
goto warcDone;
}
char *warcHeader = whp;
// find end of warc mime HEADER not the content
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
if ( ! warcHeaderEnd ) {
log("build: could not find end of WARC header for "
"file=%s.",
file->getFilename());
goto warcDone;
}
// \0 term for strstrs below
*warcHeaderEnd = '\0';
//warcHeaderEnd += 4;
char *warcLen = strstr(warcHeader,"Content-Length:");
char *warcUrl = strstr(warcHeader,"WARC-Target-URI:");
char *warcType = strstr(warcHeader,"WARC-Type:");
char *warcDate = strstr(warcHeader,"WARC-Date:");
char *warcIp = strstr(warcHeader,"WARC-IP-Address:");
char *warcCon = strstr(warcHeader,"Content-Type:");
// advance
if ( warcLen ) warcLen += 15;
if ( warcUrl ) warcUrl += 16;
if ( warcType ) warcType += 10;
if ( warcIp ) warcIp += 17;
if ( warcCon ) warcCon += 13;
if ( warcDate ) warcDate += 10;
// skip initial spaces
for ( ; warcUrl && is_wspace_a(*warcUrl ) ; warcUrl ++ );
for ( ; warcLen && is_wspace_a(*warcLen ) ; warcLen ++ );
for ( ; warcType && is_wspace_a(*warcType) ; warcType++ );
for ( ; warcDate && is_wspace_a(*warcDate) ; warcDate++ );
for ( ; warcIp && is_wspace_a(*warcIp ) ; warcIp ++ );
for ( ; warcCon && is_wspace_a(*warcCon ) ; warcCon ++ );
// get Content-Length: of WARC header for its content
if ( ! warcLen ) {
// this is a critical stop.
log("build: could not find WARC Content-Length:");
goto warcDone;
}
//
// advance m_fptr to point to the next warc record in case we
// end up calling 'goto loop' below
//
recContent = warcHeaderEnd + 4;
recContentLen = atoll(warcLen);
char *warcContentEnd = recContent + recContentLen;
recSize = (warcContentEnd - realStart);
recUrl = warcUrl;
// point to the next warc record
m_fptr += recSize;
// advance the file offset to the next record as well
m_fileOff += recSize;
// get WARC-Type:
// revisit (if url was already done before)
// request (making a GET or DNS request)
// response (response to a GET or dns request)
// warcinfo (crawling parameters, robots: obey, etc)
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
if ( ! warcType ) {
log("build: could not find WARC-Type:");
goto loop;
// get WARC-Type:
// revisit (if url was already done before)
// request (making a GET or DNS request)
// response (response to a GET or dns request)
// warcinfo (crawling parameters, robots: obey, etc)
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
if ( ! warcType ) {
log("build: could not find WARC-Type:");
goto loop;
}
// get Content-Type:
// application/warc-fields (fetch time, hops from seed)
// application/http; msgtype=request (the GET request)
// application/http; msgtype=response (the GET reply)
if ( ! warcCon ) {
log("build: could not find Content-Type:");
goto loop;
}
if ( ! warcUrl ) {
// no URI?
goto loop;
}
// if WARC-Type: is not response, skip it. so if it
// is a revisit then skip it i guess.
if ( strncmp ( warcType,"response", 8 ) != 0 ) {
// read another warc record
goto loop;
}
// warcConType needs to be
// application/http; msgtype=response
if ( strncmp(warcCon,"application/http; msgtype=response",34)){
// read another warc record
goto loop;
}
recTime = 0;
if ( warcDate ) recTime = atotime ( warcDate );
recIp = warcIp;
// END WARC SPECIFIC PARSING
}
// get Content-Type:
// application/warc-fields (fetch time, hops from seed)
// application/http; msgtype=request (the GET request)
// application/http; msgtype=response (the GET reply)
if ( ! warcCon ) {
log("build: could not find Content-Type:");
goto loop;
}
if ( ! warcUrl ) {
// no URI?
goto loop;
}
// if WARC-Type: is not response, skip it. so if it
// is a revisit then skip it i guess.
if ( strncmp ( warcType,"response", 8 ) != 0 ) {
// read another warc record
goto loop;
}
// warcConType needs to be
// application/http; msgtype=response
if ( strncmp(warcCon,"application/http; msgtype=response", 34) ) {
// read another warc record
goto loop;
}
// must be http not dns:
// url must start with http:// or https://
// it's probably like WARC-Target-URI: dns:www.xyz.com
// so it is a dns response
if ( strncmp(warcUrl,"http://" ,7) != 0 &&
strncmp(warcUrl,"https://",8) != 0 )
if ( strncmp(recUrl,"http://" ,7) != 0 &&
strncmp(recUrl,"https://",8) != 0 )
goto loop;
// get length of it, null term it
char *warcUrlEnd = warcUrl;
for ( ; *warcUrlEnd && ! is_wspace_a(*warcUrlEnd) ; warcUrlEnd++ );
int32_t warcUrlLen = warcUrlEnd - warcUrl;
*warcUrlEnd = '\0';
char *recUrlEnd = recUrl;
for ( ; *recUrlEnd && ! is_wspace_a(*recUrlEnd) ; recUrlEnd++ );
int32_t recUrlLen = recUrlEnd - recUrl;
*recUrlEnd = '\0';
// skip if robots.txt
if ( isRobotsTxtFile( warcUrl , warcUrlLen ) )
if ( isRobotsTxtFile( recUrl , recUrlLen ) )
goto loop;
// how can there be no more to read?
@ -3720,8 +3742,8 @@ bool XmlDoc::indexWarc ( ) {
goto readMore;
}
char *httpReply = warcContent;
int64_t httpReplySize = warcContentLen;
char *httpReply = recContent;
int64_t httpReplySize = recContentLen;
// should be a mime that starts with GET or POST
HttpMime m;
@ -3732,11 +3754,11 @@ bool XmlDoc::indexWarc ( ) {
}
// check content type
int ct = m.getContentType();
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_JSON )
int ct2 = m.getContentType();
if ( ct2 != CT_HTML &&
ct2 != CT_TEXT &&
ct2 != CT_XML &&
ct2 != CT_JSON )
goto loop;
@ -3791,11 +3813,8 @@ bool XmlDoc::indexWarc ( ) {
//
// set 'timestamp' for injection
//
int64_t warcTime = 0;
if ( warcDate ) warcTime = atotime ( warcDate );
gr->m_firstIndexed = warcTime;
gr->m_lastSpidered = warcTime;
gr->m_firstIndexed = recTime;
gr->m_lastSpidered = recTime;
//
@ -3803,14 +3822,14 @@ bool XmlDoc::indexWarc ( ) {
//
gr->m_injectDocIp = 0;
// get the record IP address from the warc header if there
if ( warcIp ) {
if ( recIp ) {
// get end of ip
char *warcIpEnd = warcIp;
char *ipEnd = recIp;
// skip digits and periods
while ( ! is_wspace_a(*warcIpEnd) ) warcIpEnd++;
while ( ! is_wspace_a(*ipEnd) ) ipEnd++;
// we now have the ip address for doing ip: searches
// this func is in ip.h
gr->m_injectDocIp = atoip ( warcIp, warcIpEnd-warcIp );
gr->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
}
// we end up repopulating m_fileBuf to read the next warc sometimes
@ -3838,11 +3857,11 @@ bool XmlDoc::indexWarc ( ) {
gr->m_newOnly = 0;
// all warc records have the http mime
gr->m_hasMime = true;
gr->m_url = warcUrl;
gr->m_url = recUrl;
// log it
log("build: warc: injecting WARC url %s",warcUrl);
log("build: warc: injecting WARC/ARC url %s",recUrl);
QUICKPOLL ( m_niceness );

@ -506,7 +506,7 @@ class XmlDoc {
bool isContainerDoc ( );
bool indexContainerDoc ( );
bool indexArc ( ) ;
bool indexWarc ( ) ;
bool indexWarcOrArc ( char ct ) ;
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;