Merge branch 'ia' into ia-zak

Matt committed on 2015-05-05 23:46:16 -07:00
22 changed files with 2418 additions and 588 deletions

@ -486,6 +486,7 @@ class CollectionRec {
char m_detectCustomErrorPages ;
char m_useSimplifiedRedirects ;
char m_useIfModifiedSince ;
char m_useTimeAxis ;
char m_buildVecFromCont ;
int32_t m_maxPercentSimilarPublishDate;
char m_useSimilarityPublishDate;

@ -45,6 +45,7 @@ void HttpMime::reset ( ) {
m_locationFieldLen = 0;
m_contentEncodingPos = NULL;
m_contentLengthPos = NULL;
m_contentTypePos = NULL;
}
// . returns false if could not get a valid mime
@ -67,7 +68,12 @@ bool HttpMime::set ( char *buf , int32_t bufLen , Url *url ) {
// . return false if we had no mime boundary
// . but set m_bufLen to 0 so getMimeLen() will return 0 instead of -1
// thus avoiding a potential buffer overflow
if ( m_bufLen < 0 ) { m_bufLen = 0; m_boundaryLen = 0; return false; }
if ( m_bufLen < 0 ) {
m_bufLen = 0;
m_boundaryLen = 0;
log("mime: no rnrn boundary detected");
return false;
}
// set this
m_content = buf + m_bufLen;
// . parse out m_status, m_contentLen, m_lastModifiedData, contentType
@ -157,8 +163,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
time_t now = time(NULL);
if (m_lastModifiedDate > now) m_lastModifiedDate = now;
}
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) {
m_contentType = getContentTypePrivate ( p + 13 );
char *s = p + 13;
while ( *s == ' ' || *s == '\t' ) s++;
m_contentTypePos = s;
}
else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
m_cookie = p + 11;
if ( m_cookie[0] == ' ' ) m_cookie++;
@ -533,6 +543,8 @@ int32_t getContentTypeFromStr ( char *s ) {
else if (!strcasecmp(s,"application/vnd.ms-powerpoint")) ct = CT_PPT;
else if (!strcasecmp(s,"application/mspowerpoint") ) ct = CT_PPT;
else if (!strcasecmp(s,"application/postscript" ) ) ct = CT_PS;
else if (!strcasecmp(s,"application/warc" ) ) ct = CT_WARC;
else if (!strcasecmp(s,"application/arc" ) ) ct = CT_ARC;
else if (!strcasecmp(s,"image/gif" ) ) ct = CT_GIF;
else if (!strcasecmp(s,"image/jpeg" ) ) ct = CT_JPG;
else if (!strcasecmp(s,"image/png" ) ) ct = CT_PNG;
@ -540,6 +552,7 @@ int32_t getContentTypeFromStr ( char *s ) {
else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE;
else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS;
else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
else if (!strcasecmp(s,"application/x-gzip" ) ) ct = CT_GZ;
else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS;
else if (!strcasecmp(s,"text/x-js" ) ) ct = CT_JS;
else if (!strcasecmp(s,"text/js" ) ) ct = CT_JS;
@ -626,6 +639,17 @@ void resetHttpMime ( ) {
s_mimeTable.reset();
}
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
// unlike getContentTypeFromExtension() below, return NULL
// (unknown) rather than assuming text/html when no extension is given
if ( ! ext || ! ext[0] ) return NULL;
if ( elen <= 0 ) return NULL;
// get hash for table look up
int32_t key = hash32 ( ext , elen );
char **pp = (char **)s_mimeTable.getValue ( &key );
if ( ! pp ) return NULL;
return *pp;
}
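
A rough standalone equivalent of that lookup, with std::unordered_map standing in for the hash32()-keyed s_mimeTable; the behavior that matters is returning NULL, not text/html, for an unknown or missing extension, which is what lets the .gz content-type rewrite in HttpServer.cpp fall through harmlessly:

    #include <string>
    #include <unordered_map>
    #include <cstdio>

    // sketch only: gb hashes the raw extension bytes with hash32();
    // a std::string key is the simplest stand-in here
    static const char *extToContentType ( const char *ext , int len ) {
            static const std::unordered_map<std::string,const char *> tbl = {
                    { "warc" , "application/warc" },
                    { "arc"  , "application/arc"  },
                    { "html" , "text/html"        },
            };
            if ( ! ext || len <= 0 ) return NULL;
            auto it = tbl.find ( std::string ( ext , len ) );
            if ( it == tbl.end() ) return NULL; // unknown -> NULL
            return it->second;
    }

    int main ( ) {
            printf ( "%s\n" , extToContentType ( "warc" , 4 ) );
            return 0;
    }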
const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
// assume text/html if no extension provided
if ( ! ext || ! ext[0] ) return "text/html";
@ -1051,7 +1075,10 @@ static char *s_ext[] = {
"xwd" , "image/x-xwindowdump",
"xyz" , "chemical/x-pdb",
"zip" , "application/zip" ,
"xpi", "application/x-xpinstall"
"xpi", "application/x-xpinstall",
// new: web archive formats
"warc", "application/warc",
"arc", "application/arc"
};
// . init s_mimeTable in this call

@ -9,6 +9,8 @@
// convert application/json to CT_JSON for instance
int32_t getContentTypeFromStr ( char *s ) ;
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ;
#include <time.h>
void getTime ( char *s , int *sec , int *min , int *hour ) ;
@ -42,6 +44,9 @@ time_t atotime5 ( char *s ) ;
#define CT_JSON 16
#define CT_IMAGE 17
#define CT_STATUS 18 // an internal type indicating spider reply
#define CT_GZ 19
#define CT_ARC 20
#define CT_WARC 21
#define ET_IDENTITY 0
#define ET_GZIP 1
@ -127,6 +132,7 @@ class HttpMime {
int32_t getContentEncoding () {return m_contentEncoding;}
char *getContentEncodingPos() {return m_contentEncodingPos;}
char *getContentLengthPos() {return m_contentLengthPos;}
char *getContentTypePos() {return m_contentTypePos;}
// private:
@ -166,6 +172,7 @@ class HttpMime {
int32_t m_contentEncoding;
char *m_contentEncodingPos;
char *m_contentLengthPos;
char *m_contentTypePos;
// the size of the terminating boundary, either 1 or 2 bytes.
// just the last \n in the case of a \n\n or \r in the case

@ -1548,8 +1548,8 @@ void HttpRequest::parseFieldsMultipart ( char *s , int32_t slen ) {
// Content-Disposition: form-data; name=\"file\"; filename=\"poo.txt\"\r\nContent-Type: text/plain\r\n\r\nsomething here\n=====\nagain we do it...
char *equal2 = strstr ( s , "\"" );
// debug point
if ( strncmp(s,"file",4) == 0 )
log("hey");
// if ( strncmp(s,"file",4) == 0 )
// log("hey");
// so if we had that then we had an uploaded file
bool uploadedFile = false;
if ( equal2 && equal && equal2 < equal ) {

@ -197,7 +197,6 @@ bool HttpServer::getDoc ( char *url ,
if ( ! ip || useHttpTunnel )
host = getHostFast ( url , &hostLen , &port );
// this returns false and sets g_errno on error
if ( ! fullRequest ) {
if ( ! r.set ( url , offset , size , ifModifiedSince ,
@ -212,6 +211,7 @@ bool HttpServer::getDoc ( char *url ,
// TODO: ensure we close the socket on this error!
return true;
}
//log("archive: %s",r.m_reqBuf.getBufStart());
reqSize = r.getRequestLen();
int32_t need = reqSize + pcLen;
// if we are requesting an HTTPS url through a proxy then
@ -1035,6 +1035,44 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( strncmp ( path , "/download/", 10 ) == 0 )
return sendBackDump ( s , r );
if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) {
SafeBuf cmd;
char *iaItem = path + 10;
// NUL-terminate the item name temporarily. iaItem is path+10,
// so the name itself is only pathLen-10 bytes long.
char c = iaItem[pathLen-10];
iaItem[pathLen-10] = '\0';
// iaItem is like "webgroup-20100422114008-00011"
// print out the warc files as if they were urls
// so we can spider them through the spider pipeline as-is.
// this hack only works on internet archive servers
// that have the '/home/mwells/ia' tool installed, obviously
cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
"awk '{print \"<a "
"href=http://archive.org/download/"
"%s/\"$1\">\"$1\"</a><br>\"}' > ./tmpiaout"
//, g_hostdb.m_dir
,iaItem
,iaItem
);
iaItem[pathLen-10] = c;
log("system: %s",cmd.getBufStart());
gbsystem ( cmd.getBufStart() );
SafeBuf sb;
sb.safePrintf("<title>%s</title>\n<br>\n",iaItem);
sb.load ( "./tmpiaout" );
// remove those pesky ^M guys. i guess ia is windows based.
sb.safeReplace3("\r","");
//log("system: output(%"INT32"=%s",sb.getBufStart(),
//sb.length());
return g_httpServer.sendDynamicPage(s,
sb.getBufStart(),
sb.length(),
0, false,
"text/html",
-1, NULL,
"UTF-8");
}
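
Using the sample item name from the comment above ("webgroup-20100422114008-00011"), the format string expands to roughly this pipeline, with the same item filling both %s slots:

    /home/mwells/ia list webgroup-20100422114008-00011 --glob='*arc.gz' |
        awk '{print "<a href=http://archive.org/download/webgroup-20100422114008-00011/"$1">"$1"</a><br>"}' > ./tmpiaout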
// . is it a diffbot api request, like "GET /api/*"
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
//if ( strncmp ( path , "/api/" , 5 ) == 0 )
@ -2357,6 +2395,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
}
// if has no content then it must end in \n\r\n\r or \r\n\r\n
if ( ! hasContent ) return bufSize;
// look for a Content-Type: field because we now limit how much
// we read based on this
char *p = buf;
@ -2380,45 +2419,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
// as well index that at least.
if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0)
allOrNothing = true;
if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0)
allOrNothing = true;
// adjust "max to read" if we don't have an html/plain doc
if ( ! isPost ) {
max = s->m_maxOtherDocLen + 10*1024 ;
if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff;
}
}
// if it is a warc or arc.gz allow it for now but we should
// only allow one spider at a time per host
if ( s->m_sendBuf ) {
char *p = s->m_sendBuf;
char *pend = p + s->m_sendBufSize;
if ( strncmp(p,"GET /",5) == 0 ) p += 4;
// find end of url we are getting
char *e = p;
for ( ; e < pend && *e && ! is_wspace_a(*e) ; e++ );
if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 )
max = 0x7fffffff;
if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 )
max = 0x7fffffff;
}
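
The "e - 8 > p" style guards are just bounds-checked suffix tests that also refuse a match when the suffix would be the entire path; a minimal sketch of the same idea:

    #include <cstring>
    #include <cstdio>

    // true if url ends with suffix but is longer than the suffix itself
    static bool endsWith ( const char *url , const char *suffix ) {
            size_t ulen = strlen ( url );
            size_t slen = strlen ( suffix );
            if ( ulen <= slen ) return false;
            return strncmp ( url + ulen - slen , suffix , slen ) == 0;
    }

    int main ( ) {
            printf ( "%d\n" , endsWith ( "/crawl/foo.warc.gz" , ".warc.gz" ) ); // 1
            printf ( "%d\n" , endsWith ( "/foo.arc.gz"       , ".arc.gz"  ) ); // 1
            return 0;
    }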
int32_t contentSize = 0;
int32_t totalReplySize = 0;
// now look for Content-Length in the mime
for ( int32_t j = 0; j < i ; j++ ) {
int32_t j; for ( j = 0; j < i ; j++ ) {
if ( buf[j] != 'c' && buf[j] != 'C' ) continue;
if ( j + 16 >= i ) break;
if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 )
continue;
int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) );
int32_t totalReplySize = contentSize + mimeSize ;
// all-or-nothing filter
if ( totalReplySize > max && allOrNothing ) {
log(LOG_INFO,
"http: pdf reply/request size of %"INT32" is larger "
"than limit of %"INT32". Cutoff pdf's are useless. "
"Abandoning.",totalReplySize,max);
// do not read any more than what we have
return bufSize;
}
// warn if we received a post that was truncated
if ( totalReplySize > max && isPost ) {
log("http: Truncated POST request from %"INT32" "
"to %"INT32" bytes. Increase \"max other/text doc "
"len\" in Spider Controls page to prevent this.",
totalReplySize,max);
}
// truncate the reply if we have to
if ( totalReplySize > max ) {
log("http: truncating reply of %"INT32" to %"INT32" bytes",
totalReplySize,max);
totalReplySize = max;
}
// truncate if we need to
return totalReplySize;
contentSize = atol2 ( &buf[j+15] , i - (j+15) );
totalReplySize = contentSize + mimeSize ;
break;
}
// all-or-nothing filter
if ( totalReplySize > max && allOrNothing ) {
log(LOG_INFO,
"http: reply/request size of %"INT32" is larger "
"than limit of %"INT32". Cutoff documents "
"of this type are useless. "
"Abandoning.",totalReplySize,max);
// do not read any more than what we have
return bufSize;
}
// warn if we received a post that was truncated
if ( totalReplySize > max && isPost ) {
log("http: Truncated POST request from %"INT32" "
"to %"INT32" bytes. Increase \"max other/text doc "
"len\" in Spider Controls page to prevent this.",
totalReplySize,max);
}
// truncate the reply if we have to
if ( totalReplySize > max ) {
log("http: truncating reply of %"INT32" to %"INT32" bytes",
totalReplySize,max);
totalReplySize = max;
}
// truncate if we need to
if ( totalReplySize )
return totalReplySize;
// if it is a POST request with content but no content length...
// we don't know how big it is...
if ( isPost ) {
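
With illustrative numbers (a 1MB m_maxOtherDocLen is an assumption, not a gb default), the clamp above works out like this:

    #include <cstdio>
    #include <cstdint>

    int main ( ) {
            int32_t mimeSize       = 300;                // parsed mime bytes
            int32_t contentSize    = 5000000;            // from Content-Length:
            int32_t max            = 1048576 + 10*1024;  // m_maxOtherDocLen + slack
            int32_t totalReplySize = contentSize + mimeSize;
            if ( totalReplySize > max ) totalReplySize = max; // truncate
            printf ( "read at most %d bytes\n" , totalReplySize );
            return 0;
    }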
@ -2849,16 +2914,34 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
// so we need to rewrite the Content-Length: and the
// Content-Encoding: http mime field values so they are no longer
// "gzip" and use the uncompressed content-length.
char *ptr1 = NULL;
char *ptr2 = NULL;
if(mime.getContentEncodingPos() &&
mime.getContentEncodingPos() < mime.getContentLengthPos()) {
ptr1 = mime.getContentEncodingPos();
ptr2 = mime.getContentLengthPos();
}
else {
ptr1 = mime.getContentLengthPos();
ptr2 = mime.getContentEncodingPos();
char *ptr1 = mime.getContentEncodingPos();
char *ptr2 = mime.getContentLengthPos();
char *ptr3 = NULL;
// change the content type based on the extension before the
// .gz extension since we are uncompressing it
char *p = s->m_sendBuf + 4;
char *pend = s->m_sendBuf + s->m_sendBufSize;
const char *newCT = NULL;
char *lastPeriod = NULL;
// get the extension, if any, before the .gz
for ( ; p < pend && *p && ! is_wspace_a(*p) ; p++ ) {
if ( p[0] != '.' ) continue;
if ( p[1] != 'g' ) { lastPeriod = p; continue; }
if ( p[2] != 'z' ) { lastPeriod = p; continue; }
if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; }
// no prev?
if ( ! lastPeriod ) break;
// skip period
lastPeriod++;
// back up
newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod);
// this is NULL if the file extension is unrecognized
if ( ! newCT ) break;
// this should be like text/html or
// WARC/html or something like that...
ptr3 = mime.getContentTypePos();
break;
}
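
A standalone walk-through of the same scan on a hypothetical request line; it remembers the period before the extension and stops once it sees the trailing ".gz":

    #include <cstdio>
    #include <ctype.h>

    int main ( ) {
            const char *p = "/x/item.warc.gz HTTP/1.0";
            const char *lastPeriod = NULL;
            for ( ; *p && ! isspace ( (unsigned char)*p ) ; p++ ) {
                    if ( p[0] != '.' ) continue;
                    if ( p[1] != 'g' || p[2] != 'z' ||
                         ! isspace ( (unsigned char)p[3] ) ) {
                            lastPeriod = p; // remember a candidate period
                            continue;
                    }
                    if ( lastPeriod ) // extension sits between the two periods
                            printf ( "ext = %.*s\n" ,
                                     (int)(p - lastPeriod - 1) , lastPeriod + 1 );
                    break;
            }
            return 0; // prints "ext = warc"
    }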
// this was writing a number at the start of the mime and messing
@ -2870,38 +2953,47 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
char *src = s->m_readBuf;
// sometimes they are missing Content-Length:
if ( ptr1 ) {
// copy ptr1 to src
gbmemcpy ( pnew, src, ptr1 - src );
pnew += ptr1 - src;
src += ptr1 - src;
// store either the new content encoding or new length
if(ptr1 == mime.getContentEncodingPos())
pnew += sprintf(pnew, " identity");
else
pnew += sprintf(pnew, " %"INT32"",newSize);
// scan to \r\n at end of that line we replace
while ( *src != '\r' && *src != '\n') src++;
subloop:
char *nextMin = (char *)-1;
if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1;
if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2;
if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3;
// if all ptrs are NULL then copy the tail
if ( nextMin == (char *)-1 ) nextMin = mimeEnd;
// copy ptr1 to src
gbmemcpy ( pnew, src, nextMin - src );
pnew += nextMin - src;
src += nextMin - src;
// store either the new content encoding or new length
if ( nextMin == mime.getContentEncodingPos()) {
pnew += sprintf(pnew, " identity");
ptr1 = NULL;
}
else if ( nextMin == mime.getContentLengthPos() ) {
pnew += sprintf(pnew, " %"INT32"",newSize);
ptr2 = NULL;
}
else if ( nextMin == mime.getContentTypePos() ) {
pnew += sprintf(pnew," %s",newCT);
ptr3 = NULL;
}
if ( ptr2 ) {
// copy ptr2 to src
gbmemcpy ( pnew , src , ptr2 - src );
pnew += ptr2 - src;
src += ptr2 - src;
// now insert the new shit
if(ptr2 == mime.getContentEncodingPos())
pnew += sprintf(pnew, " identity");
else
pnew += sprintf(pnew, " %"INT32"",newSize);
// loop for more
if ( nextMin < mimeEnd ) {
// scan to \r\n at end of that line we replace
while ( *src != '\r' && *src != '\n') src++;
goto subloop;
}
// copy the rest
gbmemcpy ( pnew , src , mimeEnd - src );
pnew += mimeEnd - src;
src += mimeEnd - src;
// gbmemcpy ( pnew , src , mimeEnd - src );
// pnew += mimeEnd - src;
// src += mimeEnd - src;
// before restLen was negative because we were skipping over
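
Assuming a .warc.gz fetch, the subloop above rewrites the three fields in whatever order they occur in the mime. With illustrative sizes, headers like

    Content-Encoding: gzip
    Content-Length: 48231
    Content-Type: application/x-gzip

come out as

    Content-Encoding: identity
    Content-Length: 193722
    Content-Type: application/warc

where 193722 is the hypothetical uncompressed newSize and application/warc is the newCT looked up from the extension preceding the .gz.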

@ -394,8 +394,8 @@ RdbBuckets.o:
Linkdb.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
XmlDoc.o:
$(CC) $(DEFS) $(CPPFLAGS) $(XMLDOCOPT) -c $*.cpp
#XmlDoc.o:
# $(CC) $(DEFS) $(CPPFLAGS) $(XMLDOCOPT) -c $*.cpp
# final gigabit generation in here:
Msg40.o:

@ -301,6 +301,8 @@ bool Msg13::forwardRequest ( ) {
if ( ++hostId >= nh ) hostId = 0;
}
hostId = 0; // HACK!!
// forward it to self if we are the spider proxy!!!
if ( g_hostdb.m_myHost->m_isProxy )
h = g_hostdb.m_myHost;

@ -242,7 +242,7 @@ bool sendReply ( void *state , bool addUrlEnabled ) {
mbuf.safePrintf("<center><font color=red>");
mbuf.safePrintf("<b><u>");
mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
mbuf.safePrintf("</u></b> added to spider "
mbuf.safePrintf("</u></b></font> added to spider "
"queue "
"successfully<br><br>");
mbuf.safePrintf("</font></center>");

File diff suppressed because one or more lines are too long

@ -1,6 +1,8 @@
#ifndef GBINJECT_H
#define GBINJECT_H
void handleRequest7Import ( class UdpSlot *slot , int32_t netnice ) ;
void handleRequest7 ( class UdpSlot *slot , int32_t netnice ) ;
bool sendPageInject ( class TcpSocket *s, class HttpRequest *hr );
@ -12,27 +14,88 @@ void saveImportStates ( ) ;
#include "XmlDoc.h"
#include "Users.h"
#include "Parms.h" // GigablastRequest
#include "Parms.h"
void setInjectionRequestFromParms ( class TcpSocket *sock ,
class HttpRequest *hr ,
class CollectionRec *cr ,
class InjectionRequest *ir ) ;
class InjectionRequest {
public:
int32_t m_injectDocIp;
char m_injectLinks;
char m_spiderLinks;
char m_shortReply;
char m_newOnly;
char m_deleteUrl;
char m_recycle;
char m_dedup;
char m_hasMime;
char m_doConsistencyTesting;
char m_getSections;
char m_gotSections;
int32_t m_charset;
int32_t m_hopCount;
collnum_t m_collnum; // more reliable than m_coll
uint32_t m_firstIndexed;
uint32_t m_lastSpidered;
char *ptr_url;
char *ptr_queryToScrape;
char *ptr_contentDelim;
char *ptr_contentFile;
char *ptr_contentTypeStr;
char *ptr_content;
char *ptr_diffbotReply; // secret thing from dan
int32_t size_url;
int32_t size_queryToScrape;
int32_t size_contentDelim;
int32_t size_contentFile;
int32_t size_contentTypeStr;
int32_t size_content;
int32_t size_diffbotReply; // secret thing from dan
// serialized space for the ptr_* strings above
char m_buf[0];
};
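
Note that the ptr_* block must stay contiguous and exactly parallel to the size_* block (same order, same count), because serializeMsg2()/deserializeMsg2(), added later in this commit, derive the pointer count from the distance between the two blocks. On the wire the request is laid out roughly as:

    [fixed fields ... m_collnum, m_firstIndexed, m_lastSpidered]
    [ptr_url .. ptr_diffbotReply]    (rewritten to point into the tail)
    [size_url .. size_diffbotReply]
    [string bytes: url, queryToScrape, ..., diffbotReply]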
class Msg7 {
public:
GigablastRequest m_gr;
SafeBuf m_injectUrlBuf;
bool m_firstTime;
char *m_start;
bool m_fixMe;
int32_t m_injectCount;
//GigablastRequest m_gr;
InjectionRequest m_injectionRequest;
int32_t m_replyIndexCode;
int64_t m_replyDocId;
//SafeBuf m_injectUrlBuf;
//bool m_firstTime;
//char *m_start;
//bool m_fixMe;
//char m_saved;
//int32_t m_injectCount;
//bool m_isDoneInjecting;
char *m_sir;
int32_t m_sirSize;
bool m_needsSet;
XmlDoc m_xd;
XmlDoc *m_xd;
TcpSocket *m_socket;
SafeBuf m_sb;
char m_round;
char m_useAhrefs;
HashTableX m_linkDedupTable;
// referenced by InjectionRequest::ptr_content
SafeBuf m_contentBuf;
SafeBuf m_sbuf; // for holding entire titlerec for importing
void *m_state;
@ -49,27 +112,39 @@ public:
Msg7 ();
~Msg7 ();
bool m_inUse;
int32_t m_format;
HttpRequest m_hr;
class XmlDoc *m_stashxd;
void reset();
bool scrapeQuery ( );
bool inject ( char *coll,
char *proxiedUrl,
int32_t proxiedUrlLen,
char *content,
void *state ,
void (*callback)(void *state) );
void gotUdpReply ( class UdpSlot *slot ) ;
bool inject ( void *state ,
void (*callback)(void *state) );
bool sendInjectionRequestToHost ( InjectionRequest *ir ,
void *state ,
void (* callback)(void *) );
// msg7request m_req7 must be valid
//bool inject ( char *coll,
// char *proxiedUrl,
// int32_t proxiedUrlLen,
// char *content,
// void *state ,
// void (*callback)(void *state) );
// msg7request m_req7 must be valid
// bool inject2 ( void *state , */
// void (*callback)(void *state) ); */
//bool injectTitleRec ( void *state ,
// void (*callback)(void *state) ,
// class CollectionRec *cr );
void gotMsg7Reply ();
//void gotMsg7Reply ();
};

@ -2246,11 +2246,11 @@ public:
//SpiderRequest m_sreq;
};
static void doneInjectingWrapper3 ( void *st1 ) ;
// only allow up to 1 Msg10 to be in progress at a time
static bool s_inprogress = false;
void doneInjectingWrapper3 ( void *st ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
@ -2511,17 +2511,30 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
}
*/
Msg7 *msg7 = &st1->m_msg7;
// set this.
InjectionRequest *ir = &msg7->m_injectionRequest;
// default to zero
memset ( ir , 0 , sizeof(InjectionRequest) );
// set this. also sets gr->m_hr
GigablastRequest *gr = &st1->m_msg7.m_gr;
// this will fill in GigablastRequest so all the parms we need are set
g_parms.setGigablastRequest ( sock , hr , gr );
//setInjectionRequestFromParms ( sock , hr , cr , ir );
ir->ptr_url = hr->getString("u",NULL);
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("url",NULL);
// get back a short reply so we can show the status code easily
ir->m_shortReply = 1;
ir->m_spiderLinks = st1->m_spiderLinks;
// this is really an injection, not add url, so make
// GigablastRequest::m_url point to Gigablast::m_urlsBuf because
// the PAGE_ADDURLS2 parms in Parms.cpp fill in the m_urlsBuf.
// HACK!
gr->m_url = gr->m_urlsBuf;
//gr->m_url = gr->m_urlsBuf;
//ir->ptr_url = gr->m_urlsBuf;
//
// inject using msg7
@ -2529,10 +2542,8 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
// . pass in the cleaned url
// . returns false if blocked, true otherwise
if ( ! st1->m_msg7.inject ( //s ,
//r ,
st1 ,
doneInjectingWrapper3 ) )
if ( ! msg7->sendInjectionRequestToHost ( ir, st1 , doneInjectingWrapper3 ) )
return false;
// some kinda error, g_errno should be set i guess
@ -2551,10 +2562,14 @@ void doneInjectingWrapper3 ( void *st ) {
// in order to see what sites are being added log it, then we can
// more easily remove sites from sitesearch.gigablast.com that are
// being added but not being searched
char *url = st1->m_msg7.m_xd.m_firstUrl.m_url;
//char *url = st1->m_msg7.m_xd.m_firstUrl.m_url;
Msg7 *msg7 = &st1->m_msg7;
InjectionRequest *ir = &msg7->m_injectionRequest;
char *url = ir->ptr_url;
log(LOG_INFO,"http: add url %s (%s)",url ,mstrerror(g_errno));
// extract info from state
TcpSocket *sock = st1->m_socket;
//bool isAdmin = st1->m_isMasterAdmin;
//char *url = NULL;
//if ( st1->m_urlLen ) url = st1->m_url;
@ -2654,11 +2669,12 @@ void doneInjectingWrapper3 ( void *st ) {
// " is enabled.");
sb.safePrintf("%s",pm);
}
else if ( st1->m_msg7.m_xd.m_indexCodeValid &&
st1->m_msg7.m_xd.m_indexCode ) {
int32_t ic = st1->m_msg7.m_xd.m_indexCode;
else if ( msg7->m_replyIndexCode ) {
//st1->m_msg7.m_xd.m_indexCodeValid &&
// st1->m_msg7.m_xd.m_indexCode ) {
//int32_t ic = st1->m_msg7.m_xd.m_indexCode;
sb.safePrintf("<b>Had error injecting url: %s</b>",
mstrerror(ic));
mstrerror(msg7->m_replyIndexCode));
}
/*
if ( url && ! st1->m_ufu[0] && url[0] && printUrl ) {

Parms.cpp (141 changed lines)

@ -32,7 +32,7 @@
#include "Test.h"
#include "Rebalance.h"
#include "SpiderProxy.h" // buildProxyTable()
#include "PageInject.h"
#include "PageInject.h" // InjectionRequest
// width of input box in characters for url filter expression
#define REGEX_TXT_MAX 80
@ -1085,6 +1085,9 @@ bool Parms::setGigablastRequest ( TcpSocket *socket ,
return false;
}
// just in case
memset ( gr , 0 , sizeof(GigablastRequest) );
gr->m_socket = socket;
// make a copy of the httprequest because the original is on the stack
@ -1798,6 +1801,8 @@ bool Parms::printParms2 ( SafeBuf* sb ,
GigablastRequest gr;
g_parms.setToDefault ( (char *)&gr , OBJ_GBREQUEST , NULL);
InjectionRequest ir;
g_parms.setToDefault ( (char *)&ir , OBJ_IR , NULL);
// Begin "parms":[]
if (format == FORMAT_JSON ) {
@ -1841,6 +1846,8 @@ bool Parms::printParms2 ( SafeBuf* sb ,
}
if ( m->m_obj == OBJ_GBREQUEST )
THIS = (char *)&gr;
if ( m->m_obj == OBJ_IR )
THIS = (char *)&ir;
// might have an array, do not exceed the array size
int32_t jend = m->m_max;
int32_t size = jend ;
@ -2237,6 +2244,7 @@ bool Parms::printParm ( SafeBuf* sb,
// test it
if ( m->m_def &&
m->m_obj != OBJ_NONE &&
m->m_obj != OBJ_IR && // do not do for injectionrequest
m->m_obj != OBJ_GBREQUEST && // do not do for GigablastRequest
strcmp ( val1.getBufStart() , m->m_def ) )
// put non-default valued parms in orange!
@ -4883,6 +4891,8 @@ void Parms::init ( ) {
GigablastRequest gr;
InjectionRequest ir;
/*
m->m_title = "delete collection";
m->m_desc = "A collection name to delete. You can specify multiple "
@ -14881,50 +14891,57 @@ void Parms::init ( ) {
"The injection api is described on the "
"<a href=/admin/api>api</a> page. "
"Make up a fake url if you are injecting content that "
"does not have one.";
"does not have one."
"<br>"
"<br>"
"If the url ends in .warc or .arc or .warc.gz or .arc.gz "
"Gigablast will index the contained documents as individual "
"documents, using the appropriate dates and other meta "
"information contained in the containing archive file."
;
m->m_cgi = "url";
//m->m_cgi2 = "u";
//m->m_cgi3 = "seed"; // pagerawlbot
//m->m_cgi4 = "injecturl";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
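
The m_off idiom above is plain member-offset arithmetic; a minimal sketch (hypothetical mini struct) of why it equals offsetof, which the parm code later applies to the live object as base + m_off:

    #include <cstdint>
    #include <cstddef>
    #include <cstdio>

    struct IR {              // hypothetical mini InjectionRequest
            int32_t m_injectDocIp;
            char   *ptr_url;
    };

    int main ( ) {
            IR ir;
            // the Parms.cpp idiom:
            int32_t off = (int32_t)((char *)&ir.ptr_url - (char *)&ir);
            // same value, no instance required:
            printf ( "%d == %d\n" , off , (int)offsetof ( IR , ptr_url ) );
            return 0;
    }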
// alias #1
m->m_title = "url";
m->m_cgi = "u";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #2
m->m_title = "url";
m->m_cgi = "seed";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN | PF_DIFFBOT;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #3
m->m_title = "url";
m->m_cgi = "injecturl";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
@ -14933,24 +14950,24 @@ void Parms::init ( ) {
"and inject their links. You are not required to supply "
"the <i>url</i> parm if you supply this parm.";
m->m_cgi = "qts";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_queryToScrape - (char *)&gr;
m->m_off = (char *)&ir.ptr_queryToScrape - (char *)&ir;
m++;
m->m_title = "inject links";
m->m_desc = "Should we inject the links found in the injected "
"content as well?";
m->m_cgi = "injectlinks";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_injectLinks - (char *)&gr;
m->m_off = (char *)&ir.m_injectLinks - (char *)&ir;
m++;
@ -14958,47 +14975,47 @@ void Parms::init ( ) {
m->m_desc = "Add the outlinks of the injected content into spiderdb "
"for spidering?";
m->m_cgi = "spiderlinks";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
// leave off because could start spidering whole web unintentionally
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_spiderLinks - (char *)&gr;
m->m_off = (char *)&ir.m_spiderLinks - (char *)&ir;
m++;
m->m_title = "int16_t reply";
m->m_desc = "Should the injection response be int16_t and simple?";
m->m_title = "short reply";
m->m_desc = "Should the injection response be short and simple?";
m->m_cgi = "quick";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_shortReply - (char *)&gr;
m->m_off = (char *)&ir.m_shortReply - (char *)&ir;
m++;
m->m_title = "only inject content if new";
m->m_desc = "If the specified url is already in the index then "
"skip the injection.";
m->m_cgi = "newonly";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_newOnly - (char *)&gr;
m->m_off = (char *)&ir.m_newOnly - (char *)&ir;
m++;
m->m_title = "delete from index";
m->m_desc = "Delete the specified url from the index.";
m->m_cgi = "deleteurl";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_deleteUrl - (char *)&gr;
m->m_off = (char *)&ir.m_deleteUrl - (char *)&ir;
m++;
m->m_title = "recycle content";
@ -15006,68 +15023,68 @@ void Parms::init ( ) {
"re-download the content, just use the content that was "
"stored in the cache from last time.";
m->m_cgi = "recycle";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_recycle - (char *)&gr;
m->m_off = (char *)&ir.m_recycle - (char *)&ir;
m++;
m->m_title = "dedup url";
m->m_desc = "Do not index the url if there is already another "
"url in the index with the same content.";
m->m_cgi = "dedup";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_dedup - (char *)&gr;
m->m_off = (char *)&ir.m_dedup - (char *)&ir;
m++;
m->m_title = "do consistency checking";
m->m_desc = "Turn this on for debugging.";
m->m_cgi = "consist";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_doConsistencyTesting - (char *)&gr;
m->m_off = (char *)&ir.m_doConsistencyTesting - (char *)&ir;
m++;
m->m_title = "hop count";
m->m_desc = "Use this hop count when injecting the page.";
m->m_cgi = "hopcount";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_hopCount - (char *)&gr;
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
m++;
m->m_title = "last spider time";
m->m_desc = "Override last time spidered";
m->m_cgi = "lastspidered";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_lastSpidered - (char *)&gr;
m->m_off = (char *)&ir.m_lastSpidered - (char *)&ir;
m++;
m->m_title = "first indexed";
m->m_desc = "Override first indexed time";
m->m_cgi = "firstindexed";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_firstIndexed - (char *)&gr;
m->m_off = (char *)&ir.m_firstIndexed - (char *)&ir;
m++;
@ -15075,12 +15092,12 @@ void Parms::init ( ) {
m->m_desc = "If the content of the url is provided below, does "
"it begin with an HTTP mime header?";
m->m_cgi = "hasmime";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_hasMime - (char *)&gr;
m->m_off = (char *)&ir.m_hasMime - (char *)&ir;
m++;
m->m_title = "content delimeter";
@ -15094,12 +15111,12 @@ void Parms::init ( ) {
"injected url. Otherwise it will append numbers to the "
"url you provide above.";
m->m_cgi = "delim";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_contentDelim - (char *)&gr;
m->m_off = (char *)&ir.ptr_contentDelim - (char *)&ir;
m++;
@ -15110,12 +15127,12 @@ void Parms::init ( ) {
"Possible values: <b>text/html text/plain text/xml "
"application/json</b>";
m->m_cgi = "contenttype";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR; //text/html application/json application/xml
m->m_def = "text/html";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_contentTypeStr - (char *)&gr;
m->m_off = (char *)&ir.ptr_contentTypeStr - (char *)&ir;
m++;
m->m_title = "content charset";
@ -15125,24 +15142,24 @@ void Parms::init ( ) {
"which is 106. "
"See iana_charset.h for the numeric values.";
m->m_cgi = "charset";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "106";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_charset - (char *)&gr;
m->m_off = (char *)&ir.m_charset - (char *)&ir;
m++;
m->m_title = "upload content file";
m->m_desc = "Instead of specifying the content to be injected in "
"the text box below, upload this file for it.";
m->m_cgi = "file";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_def = NULL;
m->m_flags = PF_NOAPI;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_contentFile - (char *)&gr;
m->m_off = (char *)&ir.ptr_contentFile - (char *)&ir;
m++;
m->m_title = "content";
@ -15156,35 +15173,35 @@ void Parms::init ( ) {
"inject empty content, otherwise the content will "
"be downloaded from the url.";
m->m_cgi = "content";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_content - (char *)&gr;
m->m_off = (char *)&ir.ptr_content - (char *)&ir;
m++;
m->m_title = "get sectiondb voting info";
m->m_desc = "Return section information of injected content for "
"the injected subdomain. ";
m->m_cgi = "sections";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API|PF_NOHTML;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_getSections - (char *)&gr;
m->m_off = (char *)&ir.m_getSections - (char *)&ir;
m++;
m->m_title = "diffbot reply";
m->m_desc = "Used exclusively by diffbot. Do not use.";
m->m_cgi = "diffbotreply";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA|PF_NOHTML; // do not show in our api
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_diffbotReply - (char *)&gr;
m->m_off = (char *)&ir.ptr_diffbotReply - (char *)&ir;
m++;
@ -16436,7 +16453,6 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
m->m_title = "use robots.txt";
m->m_desc = "If this is true Gigablast will respect "
"the robots.txt convention.";
@ -16524,6 +16540,20 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
m->m_title = "use time axis";
m->m_desc = "If this is true Gigablast will index the same "
"url multiple times if its content varies over time, "
"rather than overwriting the older version in the index. "
"Useful for archive web pages as they change over time.";
m->m_cgi = "usetimeaxis";
m->m_off = (char *)&cr.m_useTimeAxis - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
/*
m->m_title = "add url enabled";
m->m_desc = "If this is enabled others can add "
@ -20084,6 +20114,7 @@ void Parms::overlapTest ( char step ) {
SearchInput tmpsi;
GigablastRequest tmpgr;
InjectionRequest tmpir;
CollectionRec tmpcr;
Conf tmpconf;
char b;
@ -20109,6 +20140,7 @@ void Parms::overlapTest ( char step ) {
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi;
if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr;
if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir;
if ( p1 ) p1 += m_parms[i].m_off;
p2 = NULL;
int32_t size = m_parms[i].m_size;
@ -20157,6 +20189,7 @@ void Parms::overlapTest ( char step ) {
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi;
if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr;
if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir;
if ( p1 ) p1 += m_parms[i].m_off;
p2 = NULL;
int32_t size = m_parms[i].m_size;
@ -20184,6 +20217,8 @@ void Parms::overlapTest ( char step ) {
objStr = "SearchInput.h";
if ( m_parms[i].m_obj == OBJ_GBREQUEST )
objStr = "GigablastRequest/Parms.h";
if ( m_parms[i].m_obj == OBJ_IR )
objStr = "InjectionRequest/PageInject.h";
// save it
infringerB = p1[j];
savedi = i;

Parms.h (47 changed lines)

@ -39,6 +39,7 @@ enum {
OBJ_COLL ,
OBJ_SI , // SearchInput class
OBJ_GBREQUEST , // for GigablastRequest class of parms
OBJ_IR , // InjectionRequest class from PageInject.h
OBJ_NONE
};
@ -121,28 +122,32 @@ class GigablastRequest {
////////////
// these all reference into m_hr or into the Parm::m_def string!
char *m_url; // also for /get
char *m_queryToScrape;
char *m_contentDelim;
char *m_contentTypeStr;
char *m_contentFile;
char *m_content;
char *m_diffbotReply; // secret thing from dan
char m_injectLinks;
char m_spiderLinks;
char m_shortReply;
char m_newOnly;
char m_deleteUrl;
char m_recycle;
char m_dedup;
char m_hasMime;
char m_doConsistencyTesting;
char m_getSections;
char m_gotSections;
int32_t m_charset;
int32_t m_hopCount; // hopcount
//char *m_queryToScrape;
//char *m_contentDelim;
//char m_containerContentType; // CT_UNKNOWN, CT_WARC, CT_ARC
//int32_t m_injectDocIp;
//char *m_contentTypeStr;
//char *m_contentFile;
//char *m_content;
//char *m_diffbotReply; // secret thing from dan
//char m_injectLinks;
//char m_spiderLinks;
//char m_shortReply;
//char m_newOnly;
//char m_deleteUrl;
//char m_recycle;
//char m_dedup;
//char m_hasMime;
//char m_doConsistencyTesting;
//char m_getSections;
//char m_gotSections;
//int32_t m_charset;
//int32_t m_hopCount; // hopcount
//collnum_t m_collnum; // more reliable than m_coll
// older ones
uint32_t m_firstIndexed; // firstimdexed
uint32_t m_lastSpidered; // lastspidered;
//uint32_t m_firstIndexed; // firstimdexed
//uint32_t m_lastSpidered; // lastspidered;
//SafeBuf m_contentBuf; // for holding a warc/arc file

@ -961,7 +961,10 @@ float getDiskUsage ( int64_t *diskAvail ) {
g_hostdb.m_dir,
out);
errno = 0;
// time it to see how long it took. could it be causing load spikes?
//log("process: begin df -ka");
int err = system ( cmd );
//log("process: end df -ka");
if ( err == 127 ) {
log("build: /bin/sh does not exist. can not get disk usage.");
return -1.0; // unknown

@ -11729,11 +11729,14 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto gotOne;
}
// two letter extensions
else if ( ext[1] == '.' ) {
if ( to_lower_a(ext[2]) == 'g' &&
to_lower_a(ext[3]) == 'z' )
goto gotOne;
}
// .warc.gz and .arc.gz is ok
// take this out for now
// else if ( ext[1] == '.' ) {
// if ( to_lower_a(ext[2]) == 'g' &&
// to_lower_a(ext[3]) == 'z' )
// goto gotOne;
// }
// check for ".css?" substring
// these two suck up a lot of time:
// take them out for now. MDW 2/21/2015

Url.cpp (72 changed lines)

@ -32,6 +32,8 @@ void Url::reset() {
//m_siteLen = 0;
// ip related stuff
m_ip = 0;
// m_isWarcValid = false;
// m_isArcValid = false;
}
// set from another Url, does a copy
@ -1426,13 +1428,79 @@ bool Url::isBadExtension ( int32_t version ) {
s_badExtInitialized = true;
}
int myKey = hash64Lower_a(m_extension,m_elen);
// zero unless we have a bad extension; otherwise we return
// the TR (titlerec) version in which it was banned
int32_t badVersion = s_badExtTable.getValue(myKey);
if (badVersion == 0) return false;
if(badVersion <= version) return true;
//if(badVersion <= version) return true;
if ( badVersion > version ) return false;
// exceptions for .warc .warc.gz .arc .arc.gz
if ( isWarc() || isArc() ) return false;
return true;
}
bool Url::isWarc ( ) {
// if ( ulen>8 && strncmp(uend-8,".warc.gz",8)==0 )
// m_isWarc = true;
// if ( ulen>8 && strncmp(uend-5,".warc" ,5)==0 )
// m_isWarc = true;
// if ( ulen>8 && strncmp(uend-7,".arc.gz",7)==0 )
// m_isArc = true;
// if ( ulen>8 && strncmp(uend-4,".arc" ,4)==0 )
// m_isArc = true;
if ( m_elen == 4 &&
m_extension[0] == 'w' &&
m_extension[1] == 'a' &&
m_extension[2] == 'r' &&
m_extension[3] == 'c' )
return true;
if ( m_elen == 2 &&
m_extension[0] == 'g' &&
m_extension[1] == 'z' &&
m_ulen > 10 &&
m_extension[-1] == '.' &&
m_extension[-2] == 'c' &&
m_extension[-3] == 'r' &&
m_extension[-4] == 'a' &&
m_extension[-5] == 'w' &&
m_extension[-6] == '.' ) {
// m_isWarc = true;
// m_isWarcValid = true;
return true;
}
return false;
}
bool Url::isArc ( ) {
if ( m_elen == 3 &&
m_extension[0] == 'a' &&
m_extension[1] == 'r' &&
m_extension[2] == 'c' )
return true;
// hack to allow for .gz if it is .warc.gz or .arc.gz
if ( m_elen == 2 &&
m_extension[0] == 'g' &&
m_extension[1] == 'z' &&
m_ulen > 10 &&
m_extension[-1] == '.' &&
m_extension[-2] == 'c' &&
m_extension[-3] == 'r' &&
m_extension[-4] == 'a' &&
m_extension[-5] == '.' ) {
// m_isArc = true;
// m_isArcValid = true;
return true;
}
return false;
}
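
Hypothetical inputs and what the two predicates return; the bare .gz cases only match when the bytes immediately before the extension spell ".warc." or ".arc.":

    http://a.org/x.warc      isWarc()=1  isArc()=0
    http://a.org/x.warc.gz   isWarc()=1  isArc()=0
    http://a.org/x.arc       isWarc()=0  isArc()=1
    http://a.org/x.arc.gz    isWarc()=0  isArc()=1
    http://a.org/x.gz        isWarc()=0  isArc()=0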

Url.h (5 changed lines)

@ -92,6 +92,11 @@ public:
bool isBadExtension(int32_t xxx);
bool isSet() { return m_ulen != 0; }
// is this url a warc or arc url? i.e. ends in .warc or .arc or
// .warc.gz or .arc.gz?
bool isWarc ( );
bool isArc ( );
// does it end in .xml, .rdb or .rss, etc. kinda thing
//bool isRSSFormat ( ) ;

XmlDoc.cpp (1119 changed lines)

File diff suppressed because it is too large Load Diff

@ -249,6 +249,8 @@ public:
#define MAX_XML_DOCS 4
#define MAXMSG7S 50
class XmlDoc {
public:
@ -339,7 +341,7 @@ class XmlDoc {
uint16_t m_isDiffbotJSONObject:1;
uint16_t m_sentToDiffbot:1;
uint16_t m_gotDiffbotSuccessfulReply:1;
uint16_t m_reserved804:1;
uint16_t m_useTimeAxis:1; // m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
@ -473,7 +475,9 @@ class XmlDoc {
int32_t forcedIp = 0 ,
uint8_t contentType = CT_HTML ,
uint32_t spideredTime = 0 , // time_t
bool contentHasMime = false ) ;
bool contentHasMime = false ,
// for container docs, what is the separator of subdocs?
char *contentDelim = NULL ) ;
// we now call this right away rather than at download time!
int32_t getSpideredTime();
@ -499,6 +503,9 @@ class XmlDoc {
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
bool indexDoc ( );
bool indexDoc2 ( );
bool isContainerDoc ( );
bool indexContainerDoc ( );
bool indexWarcOrArc ( char ct ) ;
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
@ -609,7 +616,7 @@ class XmlDoc {
//int32_t *getNumBannedOutlinks ( ) ;
uint16_t *getCountryId ( ) ;
class XmlDoc **getOldXmlDoc ( ) ;
bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
//bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
class XmlDoc **getExtraDoc ( char *url , int32_t maxCacheAge = 0 ) ;
bool getIsPageParser ( ) ;
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
@ -687,6 +694,8 @@ class XmlDoc {
char **getRawUtf8Content ( ) ;
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
// we download large files to a file on disk, like warcs and arcs
File *getUtf8ContentInFile ( int64_t *fileSizeArg );
int32_t *getContentHash32 ( ) ;
int32_t *getContentHashJson32 ( ) ;
//int32_t *getTagHash32 ( ) ;
@ -800,6 +809,8 @@ class XmlDoc {
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool getUseTimeAxis ( ) ;
SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
@ -1046,6 +1057,33 @@ class XmlDoc {
SafeBuf m_zbuf;
SafeBuf m_kbuf;
// warc parsing member vars
class Msg7 *m_msg7;
class Msg7 *m_msg7s[MAXMSG7S];
char *m_warcContentPtr;
char *m_arcContentPtr;
char *m_anyContentPtr;
char *m_contentDelim;
SafeBuf m_injectUrlBuf;
bool m_subDocsHaveMime;
int32_t m_warcError ;
int32_t m_arcError ;
bool m_doneInjectingWarc ;
bool m_doneInjectingArc ;
int64_t m_fileOff ;
char *m_fileBuf ;
int32_t m_fileBufAllocSize;
char *m_fptr ;
char *m_fptrEnd ;
File m_file;
int64_t m_fileSize;
bool m_hasMoreToRead;
int32_t m_numInjectionsOut;
bool m_calledWgetThread;
// used by msg7 to store udp slot
class UdpSlot *m_injectionSlot;
// . same thing, a little more complicated
// . these classes are only set on demand
Xml m_xml;
@ -1116,6 +1154,8 @@ class XmlDoc {
//bool m_storedVoteCache;
//SafeBuf m_cacheRecBuf;
SafeBuf m_timeAxisUrl;
HashTableX m_turkVotingTable;
HashTableX m_turkBitsTable;
uint32_t m_confirmedTitleContentHash ;
@ -1156,6 +1196,8 @@ class XmlDoc {
class SafeBuf *m_savedSb;
class HttpRequest *m_savedHr;
char m_savedChar;
// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
@ -1165,10 +1207,14 @@ class XmlDoc {
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
char m_contentDelimValid;
char m_fileValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
char m_currentUrlValid;
char m_useTimeAxisValid;
char m_timeAxisUrlValid;
char m_firstUrlValid;
char m_firstUrlHash48Valid;
char m_firstUrlHash64Valid;
@ -2399,7 +2445,10 @@ class XmlDoc {
void (*callback)(void *state) ,
uint32_t firstIndexedTime = 0,
uint32_t lastSpideredDate = 0 );
uint32_t lastSpideredDate = 0 ,
int32_t injectDocIp = 0 ,
// for container docs consisting of subdocs to inject
char *contentDelim = NULL );
bool injectLinks ( HashTableX *linkDedupTable ,

@ -2408,6 +2408,83 @@ char *serializeMsg ( int32_t baseSize ,
return buf;
}
char *serializeMsg2 ( void *thisPtr ,
int32_t objSize ,
char **firstStrPtr ,
int32_t *firstSizeParm ,
int32_t *retSize ) {
// make a buffer to serialize into
char *buf = NULL;
int32_t baseSize = (char *)firstStrPtr - (char *)thisPtr;
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
int32_t need = baseSize;
need += nptrs * sizeof(char *);
need += nptrs * sizeof(int32_t);
// tally up the string sizes
int32_t *srcSizePtr = (int32_t *)firstSizeParm;
char **srcStrPtr = (char **)firstStrPtr;
int32_t totalStringSizes = 0;
for ( int i = 0 ; i < nptrs ; i++ ) {
if ( srcStrPtr[i] == NULL ) continue;
totalStringSizes += srcSizePtr[i];
}
int32_t stringBufferOffset = need;
need += totalStringSizes;
// alloc if we should
if ( ! buf ) buf = (char *)mmalloc ( need , "sm2" );
// bail on error, g_errno should be set
if ( ! buf ) return NULL;
// set how many bytes we will serialize into
*retSize = need;
// copy everything over except strings themselves
char *p = buf;
gbmemcpy ( p , (char *)thisPtr , stringBufferOffset );//need );
// point to the string buffer
p += stringBufferOffset;
// then store the strings!
char **dstStrPtr = (char **)(buf + baseSize );
int32_t *dstSizePtr = (int32_t *)(buf + baseSize+sizeof(char *)*nptrs);
for ( int count = 0 ; count < nptrs ; count++ ) {
// copy ptrs
//*dstStrPtr = *srcStrPtr;
//*dstSizePtr = *srcSizePtr;
// if we are NULL, we are a "bookmark", so
// we alloc'd space for it, but don't copy into
// the space until after this call toe serialize()
if ( ! *srcStrPtr )
goto skip;
// if this is valid then size can't be 0! fix upstream.
if ( ! *srcSizePtr ) { char *xx=NULL;*xx=0; }
// if size is 0 use gbstrlen. helps with InjectionRequest
// where we set ptr_url or ptr_content but not size_url, etc.
//if ( ! *srcSizePtr )
// *srcSizePtr = gbstrlen(*strPtr);
// sanity check -- cannot copy onto ourselves
if ( p > *srcStrPtr && p < *srcStrPtr + *srcSizePtr ) {
char *xx = NULL; *xx = 0; }
// copy the string into the buffer
gbmemcpy ( p , *srcStrPtr , *srcSizePtr );
skip:
// point it now into the string buffer
*dstStrPtr = p;
// if it is 0 length, make ptr NULL in destination
if ( *srcSizePtr == 0 || *srcStrPtr == NULL ) {
*dstStrPtr = NULL;
*dstSizePtr = 0;
}
// advance our destination ptr
p += *dstSizePtr;
// advance both ptrs to next string
srcSizePtr++;
srcStrPtr++;
dstSizePtr++;
dstStrPtr++;
}
return buf;
}
// convert offsets back into ptrs
int32_t deserializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
@ -2437,6 +2514,33 @@ int32_t deserializeMsg ( int32_t baseSize ,
return baseSize + (p - stringBuf);//getStringBuf());
}
void deserializeMsg2 ( char **firstStrPtr , // ptr_url
int32_t *firstSizeParm ) { // size_url
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
// point to our string buffer
char *p = ((char *)firstSizeParm + sizeof(int32_t)*nptrs);
// then store the strings!
int32_t *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
//int32_t *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displ
char **strPtr = firstStrPtr;//getFirstStrPtr (); // &ptr_qbuf;
int count = 0;
for ( ; count < nptrs ; count++ ) { // sizePtr <= sizeEnd ; ) {
// convert the offset to a ptr
*strPtr = p;
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
sizePtr++;
strPtr++;
}
// return how many bytes we processed
//return baseSize + (p - stringBuf);//getStringBuf());
}
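
A compact round trip of the two helpers above, using a hypothetical two-string message struct; this is a sketch assuming serializeMsg2()/deserializeMsg2() as defined in this commit are linked in:

    // hypothetical message: fixed fields, then a contiguous ptr_* block,
    // then a parallel size_* block in the same order
    struct TwoStrings {
            int32_t m_id;
            char   *ptr_a;
            char   *ptr_b;
            int32_t size_a;
            int32_t size_b;
    };

    TwoStrings ts;
    ts.m_id   = 7;
    ts.ptr_a  = (char *)"hello"; ts.size_a = 6;  // include the trailing \0
    ts.ptr_b  = NULL;            ts.size_b = 0;  // NULL strings stay NULL

    int32_t wireSize = 0;
    char *wire = serializeMsg2 ( &ts , sizeof(TwoStrings) ,
                                 &ts.ptr_a , &ts.size_a , &wireSize );
    // ... ship wire/wireSize over udp ...

    // receiver side, operating on its own copy of the bytes:
    TwoStrings *rx = (TwoStrings *)wire;
    deserializeMsg2 ( &rx->ptr_a , &rx->size_a );
    // rx->ptr_a now points at "hello" inside the buffer; rx->ptr_b is NULL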
// print it to stdout for debugging Dates.cpp
int32_t printTime ( time_t ttt ) {
//char *s = ctime(&ttt);

@ -612,6 +612,13 @@ char *serializeMsg ( int32_t baseSize ,
char *userBuf ,
int32_t userBufSize ,
bool makePtrsRefNewBuf ) ;
char *serializeMsg2 ( void *thisPtr ,
int32_t objSize ,
char **firstStrPtr ,
int32_t *firstSizeParm ,
int32_t *retSize );
// convert offsets back into ptrs
int32_t deserializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
@ -619,4 +626,6 @@ int32_t deserializeMsg ( int32_t baseSize ,
char **firstStrPtr ,
char *stringBuf ) ;
void deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
#endif

qa.cpp (218 changed lines)

@ -75,6 +75,127 @@ void markOut ( char *content , char *needle ) {
goto loop;
}
void markOut2 ( char *content , char *needle ) {
if ( ! content ) return;
int32_t nlen = gbstrlen(needle);
loop:
char *s = strstr ( content , needle );
if ( ! s ) return;
// advance over name like "rand64=" to avoid hitting those digits
//s += gbstrlen(needle);
for (int32_t i = 0 ; i < nlen ; i++ )
*s++ = ' ';
//for ( ; *s && ! is_digit(*s); s++ );
// find end of digit stream
//char *end = s;
//while ( ; *end && is_digit(*s); end++ );
// just bury the digit stream now, zeroing out was not
// a consistent LENGTH if we had 10 hits vs 9... making the hash
// different
// space out digits. including decimal point.
//for ( ; *s && (is_digit(*s)||*s=='.'); s++ ) *s = ' ';
// loop for more for the "rand64=" thing
content = s;
goto loop;
}
void markOutBuf ( char *content ) {
// take out <responseTimeMS>
markOut ( content , "<currentTimeUTC>");
markOut ( content , "<responseTimeMS>");
// ...from an index of about 429 pages in 0.91 seconds in collection...
markOut ( content , " pages in ");
// until i figure this one out, take it out
markOut ( content , "<docsInCollection>");
markOut ( content , "spider is done (");
markOut ( content , "spider is paused (");
markOut ( content , "spider queue empty (");
markOut ( content , "spider is active (");
markOut ( content , "<totalShards>");
// 3 Collections etc.
markOut ( content , "/rocket.jpg></div></a></center><br><br><div style=\"width:190px;padding:4px;margin-left:10px;background-color:white;border-top-left-radius:10px;border-bottom-left-radius:10px;border-color:blue;border-width:3px;border-style:solid;margin-right:-3px;border-right-color:white;overflow-y:auto;overflow-x:hidden;line-height:23px;color:black;\"><center><nobr><b>" );
// until i figure this one out, take it out
markOut ( content , "<hits>");
// for those links in the html pages
markOut ( content, "rand64=");
// for json
markOut ( content , "\"currentTimeUTC\":" );
markOut ( content , "\"responseTimeMS\":");
markOut ( content , "\"docsInCollection\":");
// if the results are in json, then status doc is encoded json
markOut ( content , "\\\"gbssDownloadStartTime\\\":");
markOut ( content , "\\\"gbssDownloadEndTime\\\":");
markOut ( content , "\\\"gbssDownloadStartTimeMS\\\":");
markOut ( content , "\\\"gbssDownloadEndTimeMS\\\":");
markOut ( content , "\\\"gbssDownloadDurationMS\\\":");
markOut ( content , "\\\"gbssAgeInIndex\\\":");
markOut ( content , "\\\"gbssDiscoveredTime\\\":");
// if the results are in xml, then the status doc is xml encoded
markOut ( content , "\"gbssDownloadStartTime\":");
markOut ( content , "\"gbssDownloadEndTime\":");
markOut ( content , "\"gbssDownloadStartTimeMS\":");
markOut ( content , "\"gbssDownloadEndTimeMS\":");
markOut ( content , "\"gbssDownloadDurationMS\":");
markOut ( content , "\"gbssAgeInIndex\":");
// for xml
markOut ( content , "<currentTimeUTC>" );
markOut ( content , "<responseTimeMS>");
markOut ( content , "<docsInCollection>");
markOut ( content , "<firstIndexedDateUTC>");
// indexed 1 day ago
markOut ( content,"indexed:");
// modified 1 day ago
markOut ( content,"modified:");
// s_gigabitCount... it is perpetually incrementing static counter
// in PageResults.cpp
markOut(content,"ccc(");
markOut(content,"id=fd");
markOut(content,"id=sd");
// for some reason the term freq seems to change a little in
// the scoring table
markOut(content,"id=tf");
// # of collections in the admin page: ..."4 Collections"
markOut(content,"px;color:black;\"><center><nobr><b>");
markOut(content,"spider is done (");
markOut(content,"spider is paused (");
markOut(content,"spider is active (");
markOut(content,"spider queue empty (");
markOut2(content,"bgcolor=#c0c0f0");
markOut2(content,"bgcolor=#d0d0e0");
}
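
Roughly, the effect on a sample reply fragment: markOut() keeps the needle and spaces out the volatile number after it, while markOut2() spaces out the needle itself, so two runs of the same test hash identically:

    markOut  ( buf , "<responseTimeMS>" ):
        before: <responseTimeMS>0.91</responseTimeMS>
        after:  <responseTimeMS>    </responseTimeMS>

    markOut2 ( buf , "bgcolor=#c0c0f0" ):
        before: <tr bgcolor=#c0c0f0>
        after:  <tr                >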
// do not hash
int32_t qa_hash32 ( char *s ) {
uint32_t h = 0;
@ -171,84 +292,8 @@ void processReply ( char *reply , int32_t replyLen ) {
s_content = content;
// take out <responseTimeMS>
markOut ( content , "<currentTimeUTC>");
markOut ( content , "<responseTimeMS>");
markOutBuf ( content );
// ...from an index of about 429 pages in 0.91 seconds in collection...
markOut ( content , " pages in ");
// until i figure this one out, take it out
markOut ( content , "<docsInCollection>");
markOut ( content , "spider is done (");
markOut ( content , "spider is paused (");
markOut ( content , "spider queue empty (");
markOut ( content , "spider is active (");
markOut ( content , "<totalShards>");
// 3 Collections etc.
markOut ( content , "/rocket.jpg></div></a></center><br><br><div style=\"width:190px;padding:4px;margin-left:10px;background-color:white;border-top-left-radius:10px;border-bottom-left-radius:10px;border-color:blue;border-width:3px;border-style:solid;margin-right:-3px;border-right-color:white;overflow-y:auto;overflow-x:hidden;line-height:23px;color:black;\"><center><nobr><b>" );
// until i figure this one out, take it out
markOut ( content , "<hits>");
// for those links in the html pages
markOut ( content, "rand64=");
// for json
markOut ( content , "\"currentTimeUTC\":" );
markOut ( content , "\"responseTimeMS\":");
markOut ( content , "\"docsInCollection\":");
// if the results are in json, then status doc is encoded json
markOut ( content , "\\\"gbssDownloadStartTime\\\":");
markOut ( content , "\\\"gbssDownloadEndTime\\\":");
markOut ( content , "\\\"gbssDownloadStartTimeMS\\\":");
markOut ( content , "\\\"gbssDownloadEndTimeMS\\\":");
markOut ( content , "\\\"gbssDownloadDurationMS\\\":");
markOut ( content , "\\\"gbssAgeInIndex\\\":");
markOut ( content , "\\\"gbssDiscoveredTime\\\":");
// if the results are in xml, then the status doc is xml encoded
markOut ( content , "\"gbssDownloadStartTime\":");
markOut ( content , "\"gbssDownloadEndTime\":");
markOut ( content , "\"gbssDownloadStartTimeMS\":");
markOut ( content , "\"gbssDownloadEndTimeMS\":");
markOut ( content , "\"gbssDownloadDurationMS\":");
markOut ( content , "\"gbssAgeInIndex\":");
// for xml
markOut ( content , "<currentTimeUTC>" );
markOut ( content , "<responseTimeMS>");
markOut ( content , "<docsInCollection>");
markOut ( content , "<firstIndexedDateUTC>");
// indexed 1 day ago
markOut ( content,"indexed:");
// modified 1 day ago
markOut ( content,"modified:");
// s_gigabitCount... it is perpetually incrementing static counter
// in PageResults.cpp
markOut(content,"ccc(");
markOut(content,"id=fd");
markOut(content,"id=sd");
// for some reason the term freq seems to change a little in
// the scoring table
markOut(content,"id=tf");
// # of collections in the admin page: ..."4 Collections"
markOut(content,"px;color:black;\"><center><nobr><b>");
markOut(content,"spider is done (");
markOut(content,"spider is paused (");
markOut(content,"spider is active (");
markOut(content,"spider queue empty (");
// make checksum. we ignore back to back spaces so this
// hash works for <docsInCollection>10 vs <docsInCollection>9
@ -361,9 +406,26 @@ void processReply ( char *reply , int32_t replyLen ) {
fb1.load(fn1);
fb1.nullTerm();
// markout both
markOutBuf ( fb1.getBufStart() );
markOutBuf ( fb2.getBufStart() );
// save temps
SafeBuf tmpfn1;
SafeBuf tmpfn2;
tmpfn1.safePrintf("%strash/tmpdiff1.txt",g_hostdb.m_dir);
tmpfn2.safePrintf("%strash/tmpdiff2.txt",g_hostdb.m_dir);
fb1.save(tmpfn1.getBufStart());
fb2.save(tmpfn2.getBufStart());
// do the diff between the two replies so we can see what changed
// now do the diffs between the marked out versions so it is less
// spammy
char cmd[1024];
sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
sprintf(cmd,"diff %s %s > /tmp/diffout",
tmpfn1.getBufStart(),
tmpfn2.getBufStart());
//fn1,fn2);
//log("qa: %s\n",cmd);
gbsystem(cmd);