Add qa test for arc and warc files. Change XmlDoc to use timeaxis url
when creating the titlerec key instead of the firsturl.
This commit is contained in:
Zak Betz
2015-05-21 15:19:33 -06:00
parent 9b8065589a
commit e399a8b0aa
4 changed files with 186 additions and 49 deletions

@ -193,6 +193,8 @@ case ENOTITLEREC: return "No title rec found when recycling content";
case EQUERYINGDISABLED: return "Querying is disabled in the master controls";
case EJSONMISSINGLASTCURLY: return "JSON was missing last curly bracket";
case EADMININTERFERENCE: return "Adminstrative interference";
case ETHREADSDISABLED:return "Threads Disabled";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -197,6 +197,7 @@ enum {
ENOTITLEREC,
EQUERYINGDISABLED,
EJSONMISSINGLASTCURLY,
EADMININTERFERENCE
EADMININTERFERENCE,
ETHREADSDISABLED
};
#endif

@ -5209,7 +5209,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
// time it
int64_t startTime = gettimeofdayInMilliseconds();
//////
//
// fill in m_titleRecBuf
@ -10915,6 +10914,12 @@ int64_t XmlDoc::getFirstUrlHash48() {
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
// this must work
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( getUseTimeAxis() ) {
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
return m_firstUrlHash48;
}
m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
return m_firstUrlHash48;
@ -10924,6 +10929,13 @@ int64_t XmlDoc::getFirstUrlHash64() {
if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
// this must work
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( getUseTimeAxis() ) {
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
}
m_firstUrlHash64 = hash64b ( m_firstUrl.m_url );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
@ -11635,12 +11647,13 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
// valid if we are a docid based doc and THIS function was called
// from getFirstUrl() -- we end up in a recursive loop.
if ( ! m_setFromDocId ) {
int64_t uh48 = getFirstUrl()->getUrlHash48();
//int64_t uh48 = getFirstUrl()->getUrlHash48();
int64_t uh48 = getFirstUrlHash48();
int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr );
if ( uh48 != tuh48 ) {
log("xmldoc: docid collision uh48 mismatch. cannot "
"index "
"%s",getFirstUrl()->getUrl() );
"index "
"%s",getFirstUrl()->getUrl() );
g_errno = EDOCIDCOLLISION;
return NULL;
}
@ -12024,9 +12037,8 @@ SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
if ( m_setFromDocId ) return &m_timeAxisUrl;
m_timeAxisUrlValid = true;
Url *fu = getFirstUrl();
int32_t spideredTime = getSpideredTime ();
m_timeAxisUrl.reset();
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),spideredTime);
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
return &m_timeAxisUrl;
}
@ -12096,7 +12108,7 @@ char **XmlDoc::getOldTitleRec ( ) {
SafeBuf *tau = getTimeAxisUrl();
u = tau->getBufStart();
}
// the title must be local since we're spidering it
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
u ,
@ -19254,6 +19266,12 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
return (File *)-1;
// failed?
log("build: failed to launch wget thread");
// If we run it in this thread then if we are fetching
// a local url it will block forever.
// systemStartWrapper_r(this,NULL);
// return getUtf8ContentInFile ( fileSizeArg );
g_errno = ETHREADSDISABLED;
return NULL;
}
@ -22240,7 +22258,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
// do not do this if not test collection for now
if ( strcmp(cr->m_coll,"qatest123") ) return true;
log("xmldoc: VERIFYING METALIST");
log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");
// store each record in the list into the send buffers
for ( ; p < pend ; ) {

196
qa.cpp

@ -46,6 +46,7 @@ static SafeBuf s_cbuf2;
static Url s_url;
static char *s_expect = NULL;
static char **s_ignore = NULL;
void markOut ( char *content , char *needle ) {
@ -323,38 +324,46 @@ void processReply ( char *reply , int32_t replyLen ) {
}
*/
// Just look at a substring of the response so we don't have to worry about
// minuscule changes in output formats or changing dates.
if(s_expect) {
if(gb_strcasestr(content, s_expect)) {
g_qaOutput.safePrintf("<b style=color:green;>"
"passed test</b><br>%s : "
"<a href=%s>%s</a> Found %s (crc=%"UINT32")<br>"
"<hr>",
s_qt->m_testName,
s_url.getUrl(),
s_url.getUrl(),
s_expect,
contentCRC);
} else {
g_numErrors++;
g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
"<a href=%s>%s</a><br> Expected: %s in reply"
" (crc=%"UINT32")<br>"
"<hr>",
s_qt->m_testName,
s_url.getUrl(),
s_url.getUrl(),
s_expect,
contentCRC);
if(s_ignore) {
for(int i = 0;;i++) {
if(!s_ignore[i]) break;
if(gb_strcasestr(content, s_ignore[i])) return;
}
s_ignore = NULL;
}
// Just look at a substring of the response so we don't have to worry about
// minuscule changes in output formats or changing dates.
if(s_expect) {
if(gb_strcasestr(content, s_expect)) {
g_qaOutput.safePrintf("<b style=color:green;>"
"passed test</b><br>%s : "
"<a href=%s>%s</a> Found %s (crc=%"UINT32")<br>"
"<hr>",
s_qt->m_testName,
s_url.getUrl(),
s_url.getUrl(),
s_expect,
contentCRC);
} else {
g_numErrors++;
g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
"<a href=%s>%s</a><br> Expected: %s in reply"
" (crc=%"UINT32")<br>"
"<hr>",
s_qt->m_testName,
s_url.getUrl(),
s_url.getUrl(),
s_expect,
contentCRC);
}
s_expect = NULL;
}
s_expect = NULL;
return;
}
}
// this means caller does not care about the response
if ( ! s_checkCRC ) {
@ -539,7 +548,7 @@ static void gotReplyWrapper ( void *state , TcpSocket *sock ) {
// returns false if blocked, true otherwise, like on quick connect error
bool getUrl( char *path , int32_t checkCRC = 0 , char *post = NULL ,
char* expect = NULL) {
char* expect = NULL, char** ignore = NULL) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%"INT32"%s"
@ -557,6 +566,7 @@ bool getUrl( char *path , int32_t checkCRC = 0 , char *post = NULL ,
//Url u;
s_url.set ( sb.getBufStart() );
s_expect = expect;
s_ignore = ignore;
log("qa: getting %s",sb.getBufStart());
if ( ! g_httpServer.getDoc ( s_url.getUrl() ,
@ -1366,6 +1376,7 @@ typedef enum {
WAIT_A_BIT = 3,
EXAMINE_RESULTS = 16
} TimeAxisFlags;
char* g_timeAxisIgnore[3] = {"Bad IP", "Doc is error page", NULL};
bool qaTimeAxis ( ) {
@ -1397,7 +1408,7 @@ bool qaTimeAxis ( ) {
"&obeyRobots=0"
// This is what we are testing
"&usetimeaxis=1"
"&de=1"
"&de=0"
,
// checksum of reply expected
238170006 ) )
@ -1409,6 +1420,7 @@ bool qaTimeAxis ( ) {
loadUrls();
int32_t numDocsToInject = s_ubuf2.length()/(int32_t)sizeof(char *);
//
// Inject urls, return false if not done yet.
// Here we alternate sending the same url -> content pair with sending
@ -1416,7 +1428,7 @@ bool qaTimeAxis ( ) {
// at about half the rate that we spider them.
if ( ! s_flags[ADD_INITIAL_URLS] ) {
for ( ; s_flags[URL_COUNTER] < numDocsToInject &&
s_flags[CONTENT_COUNTER] < numDocsToInject; ) {
s_flags[URL_COUNTER] + s_flags[CONTENT_COUNTER] < numDocsToInject; ) {
// inject using html api
SafeBuf sb;
@ -1426,8 +1438,8 @@ bool qaTimeAxis ( ) {
s_flags[CONTENT_COUNTER] - flipFlop ;
char* expect = "[Success]";
if(flipFlop) {
expect = "[Doc is a dup]";
if(flipFlop && urlIndex != contentIndex) {
expect = "[Doc unchanged]";
}
log("sending url num %d with content num %d, flip %d expect %s",
@ -1442,21 +1454,22 @@ bool qaTimeAxis ( ) {
sb.nullTerm();
if(s_flags[CONTENT_COUNTER] >= 6) {
if(s_flags[CONTENT_COUNTER] >= 5) {
s_flags[URL_COUNTER] += s_flags[CONTENT_COUNTER];
s_flags[CONTENT_COUNTER] = 0;
}
s_flags[CONTENT_COUNTER]++;
if(s_flags[URL_COUNTER] >= 12) {
s_flags[ADD_INITIAL_URLS] = true;
}
// if(s_flags[URL_COUNTER] >= 12) {
// s_flags[ADD_INITIAL_URLS] = true;
// }
wait(1.0);
//wait(1.0);
if ( ! getUrl("/admin/inject",
0, // no idea what crc to expect
sb.getBufStart(),
expect)
expect,
g_timeAxisIgnore)
)
return false;
return false;
@ -1481,6 +1494,101 @@ bool qaTimeAxis ( ) {
return true;
}
// . QA test: make sure the spider handles .warc.gz and .arc.gz
//   archive container files.
// . Runs as a re-entrant state machine: each call walks the steps in
//   order, returning false whenever a network request (getUrl) blocks,
//   and true only once every step has completed. s_flags[] records
//   which steps are already done so a re-entry skips them.
bool qaWarcFiles ( ) {

	// step 1: wipe any leftover test collection
	if ( ! s_flags[DELETE_COLLECTION] ) {
		s_flags[DELETE_COLLECTION] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	// step 2: create a fresh test collection restricted to localhost
	if ( ! s_flags[ADD_COLLECTION] ) {
		s_flags[ADD_COLLECTION] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1&"
				"collectionips=127.0.0.1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// step 3: configure the spider for the test
	if ( ! s_flags[SET_PARAMETERS] ) {
		s_flags[SET_PARAMETERS] = true;
		if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
				// no spider replies because it messes
				// up our last test to make sure posdb
				// is 100% empty.
				// see "index spider replies" in Parms.cpp.
				"&isr=0"
				// turn off use robots to avoid that
				// xyz.com/robots.txt redir to seekseek.com
				"&obeyRobots=0"
				// This is what we are testing
				"&usetimeaxis=1"
				,
				// checksum of reply expected
				0 ) )
			return false;
	}

	// step 4: add a .warc.gz url served by this very host so the
	// spider has to unpack a warc archive and index the docs inside
	if ( s_flags[ADD_INITIAL_URLS] == 0 ) {
		s_flags[ADD_INITIAL_URLS]++;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=http://%s:%"INT32"/test.warc.gz"
			      , iptoa(g_hostdb.m_myHost->m_ip)
			      , (int32_t)g_hostdb.m_myHost->m_httpPort
			      );
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	// step 5: search to verify the warc content got indexed.
	// BUGFIX: was "if ( ! s_flags[EXAMINE_RESULTS] == 0 )". since
	// '!' binds tighter than '==' that tested (!flag)==0, i.e. the
	// opposite of the intended "flag == 0", so the sequencing of
	// the two search steps was wrong.
	if ( s_flags[EXAMINE_RESULTS] == 0 ) {
		s_flags[EXAMINE_RESULTS]++;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
				"&dsrt=500",
				702467314 ) )
			return false;
	}

	// step 6: now add a .arc.gz url to exercise the arc format too
	if ( s_flags[ADD_INITIAL_URLS] == 1 ) {
		s_flags[ADD_INITIAL_URLS]++;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=http://%s:%"INT32"/test.arc.gz"
			      , iptoa(g_hostdb.m_myHost->m_ip)
			      , (int32_t)g_hostdb.m_myHost->m_httpPort);
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	// step 7: search again to verify the arc content got indexed
	// (same precedence BUGFIX as step 5: was "! ... == 1")
	if ( s_flags[EXAMINE_RESULTS] == 1 ) {
		s_flags[EXAMINE_RESULTS]++;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
				"&dsrt=500",
				702467314 ) )
			return false;
	}

	// all steps completed
	return true;
}
bool qaimport () {
//
@ -1584,6 +1692,9 @@ bool qaimport () {
return true;
}
bool qainlinks() {
//
@ -3084,7 +3195,12 @@ static QATest s_qatests[] = {
"timeAxisTest",
"Use Inject api to inject the same url at different times, "
"sometimes changed and sometimes not. Ensure docId is different "
"when content has changed, even if the url is the same. "}
"when content has changed, even if the url is the same. "},
{qaWarcFiles,
"indexWarcFiles",
"Ensure the spider handles arc.gz and warc.gz file formats."}
};