Add qa test for arc and warc files. Change XmlDoc to use timeaxis url
when creating the titlerec key instead of the firsturl.
This commit is contained in:
@ -193,6 +193,8 @@ case ENOTITLEREC: return "No title rec found when recycling content";
|
||||
case EQUERYINGDISABLED: return "Querying is disabled in the master controls";
|
||||
case EJSONMISSINGLASTCURLY: return "JSON was missing last curly bracket";
|
||||
case EADMININTERFERENCE: return "Adminstrative interference";
|
||||
case ETHREADSDISABLED:return "Threads Disabled";
|
||||
|
||||
}
|
||||
// if the remote error bit is clear it must be a regular errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
3
Errno.h
3
Errno.h
@ -197,6 +197,7 @@ enum {
|
||||
ENOTITLEREC,
|
||||
EQUERYINGDISABLED,
|
||||
EJSONMISSINGLASTCURLY,
|
||||
EADMININTERFERENCE
|
||||
EADMININTERFERENCE,
|
||||
ETHREADSDISABLED
|
||||
};
|
||||
#endif
|
||||
|
34
XmlDoc.cpp
34
XmlDoc.cpp
@ -5209,7 +5209,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
|
||||
// time it
|
||||
int64_t startTime = gettimeofdayInMilliseconds();
|
||||
|
||||
|
||||
//////
|
||||
//
|
||||
// fill in m_titleRecBuf
|
||||
@ -10915,6 +10914,12 @@ int64_t XmlDoc::getFirstUrlHash48() {
|
||||
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
|
||||
// this must work
|
||||
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( getUseTimeAxis() ) {
|
||||
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
|
||||
m_firstUrlHash48Valid = true;
|
||||
return m_firstUrlHash48;
|
||||
}
|
||||
|
||||
m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL;
|
||||
m_firstUrlHash48Valid = true;
|
||||
return m_firstUrlHash48;
|
||||
@ -10924,6 +10929,13 @@ int64_t XmlDoc::getFirstUrlHash64() {
|
||||
if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
|
||||
// this must work
|
||||
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( getUseTimeAxis() ) {
|
||||
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
|
||||
m_firstUrlHash64Valid = true;
|
||||
return m_firstUrlHash64;
|
||||
}
|
||||
|
||||
m_firstUrlHash64 = hash64b ( m_firstUrl.m_url );
|
||||
m_firstUrlHash64Valid = true;
|
||||
return m_firstUrlHash64;
|
||||
@ -11635,12 +11647,13 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
|
||||
// valid if we are a docid based doc and THIS function was called
|
||||
// from getFirstUrl() -- we end up in a recursive loop.
|
||||
if ( ! m_setFromDocId ) {
|
||||
int64_t uh48 = getFirstUrl()->getUrlHash48();
|
||||
//int64_t uh48 = getFirstUrl()->getUrlHash48();
|
||||
int64_t uh48 = getFirstUrlHash48();
|
||||
int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr );
|
||||
if ( uh48 != tuh48 ) {
|
||||
log("xmldoc: docid collision uh48 mismatch. cannot "
|
||||
"index "
|
||||
"%s",getFirstUrl()->getUrl() );
|
||||
"index "
|
||||
"%s",getFirstUrl()->getUrl() );
|
||||
g_errno = EDOCIDCOLLISION;
|
||||
return NULL;
|
||||
}
|
||||
@ -12024,9 +12037,8 @@ SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
|
||||
if ( m_setFromDocId ) return &m_timeAxisUrl;
|
||||
m_timeAxisUrlValid = true;
|
||||
Url *fu = getFirstUrl();
|
||||
int32_t spideredTime = getSpideredTime ();
|
||||
m_timeAxisUrl.reset();
|
||||
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),spideredTime);
|
||||
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
|
||||
return &m_timeAxisUrl;
|
||||
}
|
||||
|
||||
@ -12096,7 +12108,7 @@ char **XmlDoc::getOldTitleRec ( ) {
|
||||
SafeBuf *tau = getTimeAxisUrl();
|
||||
u = tau->getBufStart();
|
||||
}
|
||||
|
||||
|
||||
// the title must be local since we're spidering it
|
||||
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
|
||||
u ,
|
||||
@ -19254,6 +19266,12 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
|
||||
return (File *)-1;
|
||||
// failed?
|
||||
log("build: failed to launch wget thread");
|
||||
// If we run it in this thread then if we are fetching
|
||||
// a local url it will block forever.
|
||||
// systemStartWrapper_r(this,NULL);
|
||||
// return getUtf8ContentInFile ( fileSizeArg );
|
||||
g_errno = ETHREADSDISABLED;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -22240,7 +22258,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
|
||||
// do not do this if not test collection for now
|
||||
if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
||||
|
||||
log("xmldoc: VERIFYING METALIST");
|
||||
log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");
|
||||
|
||||
// store each record in the list into the send buffers
|
||||
for ( ; p < pend ; ) {
|
||||
|
196
qa.cpp
196
qa.cpp
@ -46,6 +46,7 @@ static SafeBuf s_cbuf2;
|
||||
|
||||
static Url s_url;
|
||||
static char *s_expect = NULL;
|
||||
static char **s_ignore = NULL;
|
||||
|
||||
void markOut ( char *content , char *needle ) {
|
||||
|
||||
@ -323,38 +324,46 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
}
|
||||
*/
|
||||
|
||||
// Just look at a substring of the response so we don't have to worry about
|
||||
// minuscule changes in output formats or changing dates.
|
||||
if(s_expect) {
|
||||
if(gb_strcasestr(content, s_expect)) {
|
||||
g_qaOutput.safePrintf("<b style=color:green;>"
|
||||
"passed test</b><br>%s : "
|
||||
"<a href=%s>%s</a> Found %s (crc=%"UINT32")<br>"
|
||||
"<hr>",
|
||||
s_qt->m_testName,
|
||||
s_url.getUrl(),
|
||||
s_url.getUrl(),
|
||||
s_expect,
|
||||
contentCRC);
|
||||
} else {
|
||||
g_numErrors++;
|
||||
|
||||
g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
|
||||
"<a href=%s>%s</a><br> Expected: %s in reply"
|
||||
" (crc=%"UINT32")<br>"
|
||||
"<hr>",
|
||||
s_qt->m_testName,
|
||||
s_url.getUrl(),
|
||||
s_url.getUrl(),
|
||||
s_expect,
|
||||
contentCRC);
|
||||
if(s_ignore) {
|
||||
for(int i = 0;;i++) {
|
||||
if(!s_ignore[i]) break;
|
||||
if(gb_strcasestr(content, s_ignore[i])) return;
|
||||
}
|
||||
s_ignore = NULL;
|
||||
}
|
||||
|
||||
// Just look at a substring of the response so we don't have to worry about
|
||||
// minuscule changes in output formats or changing dates.
|
||||
if(s_expect) {
|
||||
if(gb_strcasestr(content, s_expect)) {
|
||||
g_qaOutput.safePrintf("<b style=color:green;>"
|
||||
"passed test</b><br>%s : "
|
||||
"<a href=%s>%s</a> Found %s (crc=%"UINT32")<br>"
|
||||
"<hr>",
|
||||
s_qt->m_testName,
|
||||
s_url.getUrl(),
|
||||
s_url.getUrl(),
|
||||
s_expect,
|
||||
contentCRC);
|
||||
} else {
|
||||
g_numErrors++;
|
||||
|
||||
g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
|
||||
"<a href=%s>%s</a><br> Expected: %s in reply"
|
||||
" (crc=%"UINT32")<br>"
|
||||
"<hr>",
|
||||
s_qt->m_testName,
|
||||
s_url.getUrl(),
|
||||
s_url.getUrl(),
|
||||
s_expect,
|
||||
contentCRC);
|
||||
|
||||
|
||||
}
|
||||
s_expect = NULL;
|
||||
}
|
||||
s_expect = NULL;
|
||||
return;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// this means caller does not care about the response
|
||||
if ( ! s_checkCRC ) {
|
||||
@ -539,7 +548,7 @@ static void gotReplyWrapper ( void *state , TcpSocket *sock ) {
|
||||
|
||||
// returns false if blocked, true otherwise, like on quick connect error
|
||||
bool getUrl( char *path , int32_t checkCRC = 0 , char *post = NULL ,
|
||||
char* expect = NULL) {
|
||||
char* expect = NULL, char** ignore = NULL) {
|
||||
|
||||
SafeBuf sb;
|
||||
sb.safePrintf ( "http://%s:%"INT32"%s"
|
||||
@ -557,6 +566,7 @@ bool getUrl( char *path , int32_t checkCRC = 0 , char *post = NULL ,
|
||||
//Url u;
|
||||
s_url.set ( sb.getBufStart() );
|
||||
s_expect = expect;
|
||||
s_ignore = ignore;
|
||||
|
||||
log("qa: getting %s",sb.getBufStart());
|
||||
if ( ! g_httpServer.getDoc ( s_url.getUrl() ,
|
||||
@ -1366,6 +1376,7 @@ typedef enum {
|
||||
WAIT_A_BIT = 3,
|
||||
EXAMINE_RESULTS = 16
|
||||
} TimeAxisFlags;
|
||||
// NULL-terminated list of reply substrings handed to getUrl() as its
// "ignore" argument by qaTimeAxis(). processReply() walks the list until the
// NULL entry and appears to return early when the reply contains any of
// these strings -- NOTE(review): confirm against processReply().
char* g_timeAxisIgnore[3] = {"Bad IP", "Doc is error page", NULL};
|
||||
|
||||
|
||||
bool qaTimeAxis ( ) {
|
||||
@ -1397,7 +1408,7 @@ bool qaTimeAxis ( ) {
|
||||
"&obeyRobots=0"
|
||||
// This is what we are testing
|
||||
"&usetimeaxis=1"
|
||||
"&de=1"
|
||||
"&de=0"
|
||||
,
|
||||
// checksum of reply expected
|
||||
238170006 ) )
|
||||
@ -1409,6 +1420,7 @@ bool qaTimeAxis ( ) {
|
||||
loadUrls();
|
||||
int32_t numDocsToInject = s_ubuf2.length()/(int32_t)sizeof(char *);
|
||||
|
||||
|
||||
//
|
||||
// Inject urls, return false if not done yet.
|
||||
// Here we alternate sending the same url -> content pair with sending
|
||||
@ -1416,7 +1428,7 @@ bool qaTimeAxis ( ) {
|
||||
// at about half the rate that we spider them.
|
||||
if ( ! s_flags[ADD_INITIAL_URLS] ) {
|
||||
for ( ; s_flags[URL_COUNTER] < numDocsToInject &&
|
||||
s_flags[CONTENT_COUNTER] < numDocsToInject; ) {
|
||||
s_flags[URL_COUNTER] + s_flags[CONTENT_COUNTER] < numDocsToInject; ) {
|
||||
// inject using html api
|
||||
SafeBuf sb;
|
||||
|
||||
@ -1426,8 +1438,8 @@ bool qaTimeAxis ( ) {
|
||||
s_flags[CONTENT_COUNTER] - flipFlop ;
|
||||
|
||||
char* expect = "[Success]";
|
||||
if(flipFlop) {
|
||||
expect = "[Doc is a dup]";
|
||||
if(flipFlop && urlIndex != contentIndex) {
|
||||
expect = "[Doc unchanged]";
|
||||
}
|
||||
|
||||
log("sending url num %d with content num %d, flip %d expect %s",
|
||||
@ -1442,21 +1454,22 @@ bool qaTimeAxis ( ) {
|
||||
sb.nullTerm();
|
||||
|
||||
|
||||
if(s_flags[CONTENT_COUNTER] >= 6) {
|
||||
if(s_flags[CONTENT_COUNTER] >= 5) {
|
||||
s_flags[URL_COUNTER] += s_flags[CONTENT_COUNTER];
|
||||
s_flags[CONTENT_COUNTER] = 0;
|
||||
}
|
||||
s_flags[CONTENT_COUNTER]++;
|
||||
|
||||
if(s_flags[URL_COUNTER] >= 12) {
|
||||
s_flags[ADD_INITIAL_URLS] = true;
|
||||
}
|
||||
// if(s_flags[URL_COUNTER] >= 12) {
|
||||
// s_flags[ADD_INITIAL_URLS] = true;
|
||||
// }
|
||||
|
||||
wait(1.0);
|
||||
//wait(1.0);
|
||||
if ( ! getUrl("/admin/inject",
|
||||
0, // no idea what crc to expect
|
||||
sb.getBufStart(),
|
||||
expect)
|
||||
expect,
|
||||
g_timeAxisIgnore)
|
||||
)
|
||||
return false;
|
||||
return false;
|
||||
@ -1481,6 +1494,101 @@ bool qaTimeAxis ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool qaWarcFiles ( ) {
|
||||
if ( ! s_flags[DELETE_COLLECTION] ) {
|
||||
s_flags[DELETE_COLLECTION] = true;
|
||||
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( ! s_flags[ADD_COLLECTION] ) {
|
||||
s_flags[ADD_COLLECTION] = true;
|
||||
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1&"
|
||||
"collectionips=127.0.0.1" ,
|
||||
// checksum of reply expected
|
||||
238170006 ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( ! s_flags[SET_PARAMETERS] ) {
|
||||
s_flags[SET_PARAMETERS] = true;
|
||||
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
|
||||
// no spider replies because it messes
|
||||
// up our last test to make sure posdb
|
||||
// is 100% empty.
|
||||
// see "index spider replies" in Parms.cpp.
|
||||
"&isr=0"
|
||||
// turn off use robots to avoid that
|
||||
// xyz.com/robots.txt redir to seekseek.com
|
||||
"&obeyRobots=0"
|
||||
// This is what we are testing
|
||||
"&usetimeaxis=1"
|
||||
,
|
||||
// checksum of reply expected
|
||||
0 ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Inject urls, return false if not done yet.
|
||||
// Here we alternate sending the same url -> content pair with sending
|
||||
// the same url with different content to simulate a site that is updated
|
||||
// at about half the rate that we spider them.
|
||||
if ( s_flags[ADD_INITIAL_URLS] == 0) {
|
||||
s_flags[ADD_INITIAL_URLS]++;
|
||||
SafeBuf sb;
|
||||
|
||||
sb.safePrintf("&c=qatest123"
|
||||
"&format=json"
|
||||
"&strip=1"
|
||||
"&spiderlinks=1"
|
||||
"&urls=http://%s:%"INT32"/test.warc.gz"
|
||||
, iptoa(g_hostdb.m_myHost->m_ip)
|
||||
, (int32_t)g_hostdb.m_myHost->m_httpPort
|
||||
|
||||
);
|
||||
if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
|
||||
return false;
|
||||
}
|
||||
if ( ! s_flags[EXAMINE_RESULTS] == 0) {
|
||||
s_flags[EXAMINE_RESULTS]++;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
|
||||
"&dsrt=500",
|
||||
702467314 ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
if ( s_flags[ADD_INITIAL_URLS] == 1) {
|
||||
s_flags[ADD_INITIAL_URLS]++;
|
||||
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("&c=qatest123"
|
||||
"&format=json"
|
||||
"&strip=1"
|
||||
"&spiderlinks=1"
|
||||
"&urls=http://%s:%"INT32"/test.arc.gz"
|
||||
, iptoa(g_hostdb.m_myHost->m_ip)
|
||||
, (int32_t)g_hostdb.m_myHost->m_httpPort);
|
||||
|
||||
if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( ! s_flags[EXAMINE_RESULTS] == 1) {
|
||||
s_flags[EXAMINE_RESULTS]++;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
|
||||
"&dsrt=500",
|
||||
702467314 ) )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
bool qaimport () {
|
||||
|
||||
//
|
||||
@ -1584,6 +1692,9 @@ bool qaimport () {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
bool qainlinks() {
|
||||
|
||||
//
|
||||
@ -3084,7 +3195,12 @@ static QATest s_qatests[] = {
|
||||
"timeAxisTest",
|
||||
"Use Inject api to inject the same url at different times, "
|
||||
"sometimes changed and sometimes not. Ensure docId is different "
|
||||
"when content has changed, even if the url is the same. "}
|
||||
"when content has changed, even if the url is the same. "},
|
||||
|
||||
|
||||
{qaWarcFiles,
|
||||
"indexWarcFiles",
|
||||
"Ensure the spider handles arc.gz and warc.gz file formats."}
|
||||
|
||||
|
||||
};
|
||||
|
Reference in New Issue
Block a user