// Copyright 2009, Gigablast Inc.

// . runs a series of tests on a gigablast instance
// . right now just performs injections to test parsing and indexing

#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500

#include "gb-include.h"

#include "Test.h"
#include "Rdb.h"
#include "Spider.h"
#include "Pages.h"
#include "PingServer.h"
#include "Process.h"
#include "Threads.h"
#include "Msge1.h"
#include "Parms.h"

//static void testWrapper ( int fd , void *state ) ;
static void injectedWrapper ( void *state ) ;

// the global class
Test g_test;

Test::Test() {
	m_urlBuf = NULL;
	m_isRunning = false;
	m_isAdding  = false;
	m_urlsAdded = 0;
	m_urlsIndexed = 0;
	//m_spiderLinks = true;//false;
	m_bypassMenuElimination = false;
	// assume if they just turn spiders on we use this
	//m_testDir = "test-spider";
}

// main.cpp calls g_test.init()
bool Test::init ( ) {
	m_isRunning = false;
	m_isAdding  = false;
	m_urlsAdded = 0;
	m_urlsIndexed = 0;
	//if( ! g_loop.registerSleepCallback( 1 , NULL , testWrapper ) )
	//	return log("repair: Failed register callback.");
	// record current value
	//m_testSpiderEnabledSaved = g_conf.m_testSpiderEnabled;
	//m_testParserEnabledSaved = g_conf.m_testParserEnabled;
	return true;
}

void Test::reset ( ) {
	if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test999");
	// guard against a double free if reset() is called again
	m_urlBuf = NULL;
	//m_spiderLinks = true;//false;
	m_bypassMenuElimination = false;
}

// . call this once every second
// . this is responsible for advancing from one g_repairMode to the next
//void testWrapper ( int fd , void *state ) {
//	// call it from the class
//	g_test.loop();
//}

char *Test::getTestDir ( ) {
	// sanity
	if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }
	if ( g_conf.m_testSpiderEnabled )
		return "test-spider";
	if ( g_conf.m_testParserEnabled )
		return "test-parser";
	// default if they just turn on spiders (spiders on cmd)
	//return "test-spider";
	//if ( ! m_testDir ) { char *xx=NULL;*xx=0; }
	char *xx=NULL;*xx=0;
	return NULL;
}
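
// Directory layout this class assumes under the working dir (a sketch,
// inferred from the sprintf() calls below; not an exhaustive list):
//   <g_hostdb.m_dir>/test-spider/  or  <g_hostdb.m_dir>/test-parser/
//     urls.txt                       list of urls to inject, in order
//     run.<id>.confparms.txt         g_conf dumped as xml for run <id>
//     run.<id>.collparms.txt         coll rec dumped as xml for run <id>
//     run.<id>.version.txt           gb version banner + run start time
//     parse.<urlhash>.<id>.html      parser output per url per run
//     critical.<urlhash>.<id>.txt    validateOutput() stats per url
//     qa.html                        the summary page built by stopIt()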

void Test::removeFiles ( ) {
	// reset
	m_errno = 0;

	if ( g_conf.m_testParserEnabled ) {
		// remove all old files for now to avoid system diffs
		log("test: removing old parse, critical and run files from "
		    "last run.");
		//system ("rm /home/mwells/gigablast/test/parse*.?.*" );
		//system ("rm /home/mwells/gigablast/test/critical*.?.*" );
		char sbuf[1024];
		char *testDir = getTestDir();
		sprintf(sbuf,"rm %s/%s/run.?.*" ,
			g_hostdb.m_dir,testDir);
		system (sbuf);
		// use this one instead since rm doesn't always work
		sprintf(sbuf,"ls -1 %s/%s/ | grep parse | xargs --verbose "
			"-I xxx rm %s/%s/xxx" ,
			g_hostdb.m_dir,
			testDir ,
			g_hostdb.m_dir,
			testDir );
		log("test: %s",sbuf);
		system(sbuf);

		sprintf(sbuf,"ls -1 %s/%s/ | grep critical | xargs --verbose "
			"-I xxx rm %s/%s/xxx" ,
			g_hostdb.m_dir,
			testDir ,
			g_hostdb.m_dir,
			testDir );
		log("test: %s",sbuf);
		system(sbuf);
	}
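	// For example, with g_hostdb.m_dir = "/home/gb" and testDir =
	// "test-parser", the cleanup above expands to:
	//   ls -1 /home/gb/test-parser/ | grep parse | \
	//     xargs --verbose -I xxx rm /home/gb/test-parser/xxx
	// i.e. every filename containing "parse" is removed one at a
	// time, which presumably sidesteps the arg-list-too-long failures
	// a plain "rm parse*" can hit on big test dirs. (note also that
	// the "run.?.*" glob above only matches single-digit run ids.)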

	// do not crash for lack of quickpoll now
	int32_t saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;

	CollectionRec *cr = g_collectiondb.getRec("qatest123");

	// . reset the qatest collection to zero docs
	// . TODO: implement this. only allow it for qatest coll.
	// . kinda like Collectiondb::deleteRec() i guess but we need to
	//   preserve the parms!!
	// . deletetagdb = false
	if ( cr ) g_collectiondb.resetColl2 ( cr->m_collnum ,
					      cr->m_collnum ,
					      true );

	// reset event count
	//g_collectiondb.countEvents();

	// turn it back on
	g_conf.m_useQuickpoll = saved;
}

// come here once per second i guess
void Test::initTestRun ( ) {

	g_errno = 0;

	// sanity: at least one of the two test modes must be enabled
	// for a test run to be initialized
	if ( ! g_conf.m_testSpiderEnabled && ! g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }

	// if both enabled, core
	if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }

	// return if currently running
	// no, admin can re-init even if running now
	//if ( m_isRunning ) { char *xx=NULL;*xx=0; }//return;

	// must be host #0 only
	if ( g_hostdb.m_myHost->m_hostId != 0 ) return;

	// if was initially in this mode, don't do anything
	//if ( m_testSpiderEnabledSaved ) return;
	//if ( m_testParserEnabledSaved ) return;

	// you must have the "qatest123" coll already setup!
	CollectionRec *cr = g_collectiondb.getRec("qatest123");
	if ( ! cr ) {
		// note it
		log("test: please add a collection named \"qatest123\" "
		    "first.");
		// stop the test
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}

	char *testDir = getTestDir();

	// scan for the first unused run id by probing for the
	// run.%"INT32".collparms.txt file, a dump of all the conf
	// and parms for that run
	char filename[100];
	File f;
	int32_t i; for ( i = 0 ; i < 9999 ; i++ ) {
		// make filename. base it off working dir, g_hostdb.m_dir
		sprintf ( filename,"%s/%s/run.%"INT32".collparms.txt",
			  g_hostdb.m_dir,testDir,i );
		// exist?
		f.set ( filename );
		// 1 if it exists, 0 if not, -1 on error
		int32_t status = f.doesExist();
		// error?
		if ( status == -1 ) {
			// note it in the log
			log("test: doesExist() returned -1");
			// end the test
			g_conf.m_testParserEnabled = false;
			g_conf.m_testSpiderEnabled = false;
			// all done
			return;
		}
		// try next i if this one in use
		if ( status ) continue;
		// got one
		break;
	}
	// close it
	f.close();
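	// e.g. the scan above probes, in order,
	//   <dir>/<testDir>/run.0.collparms.txt
	//   <dir>/<testDir>/run.1.collparms.txt
	//   ...
	// and stops at the first id with no file, so run ids keep
	// incrementing across test runs until someone wipes the test dir.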

	// create the run.%"INT32".version.txt file
	char cmd[1000];
	char vfile[200];
	sprintf(vfile,"%s/%s/run.%"INT32".version.txt",g_hostdb.m_dir,testDir,i);
	sprintf(cmd,
		"%s/gb -v >& %s ; "
		"echo -n \"RUN START TIME: \" >> %s ; "
		"date >> %s",
		g_hostdb.m_dir,vfile,
		vfile,
		vfile);
	system(cmd);
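	// run.<id>.version.txt now holds the "gb -v" version banner
	// followed by a "RUN START TIME: <date>" line. (caveat: the
	// ">&" redirect is a bash/csh-ism; if system() hands this to a
	// strict POSIX sh, stderr will not be captured.)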

	// save it
	m_runId = i;

	cr = g_collectiondb.getRec ( "qatest123" );
	if ( ! cr ) {
		// and no more of this
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		return;
	}
	// set these
	m_coll = cr->m_coll;

	// turn on spiders
	//cr->m_spideringEnabled = 1;

	// crap i guess this too!!!
	//g_conf.m_spideringEnabled = 1;

	//
	// log out the global parms
	//
	char fbuf[100];
	// print our global parms into a file called run.%"INT32".confparms.txt
	sprintf(fbuf,"%s/%s/run.%"INT32".confparms.txt",g_hostdb.m_dir,testDir,i);
	// this saves it as xml i think
	g_parms.saveToXml ( (char *)&g_conf , fbuf , OBJ_CONF);

	//
	// log out the coll specific parms
	//
	// update name
	sprintf(fbuf,"%s/%s/run.%"INT32".collparms.txt",g_hostdb.m_dir,testDir,i);
	// save that
	g_parms.saveToXml ( (char *)cr , fbuf , OBJ_COLL);

	// get the list of urls to download and inject in order
	sprintf(fbuf,"%s/%s/urls.txt",g_hostdb.m_dir,testDir);
	// set it
	f.set ( fbuf ) ;
	// get its size
	int32_t fsize = f.getFileSize();
	// add one for \0 termination
	int32_t need = fsize + 1;
	// alloc a buf for it
	char *buf = (char *)mmalloc ( need ,"qatest");
	// error?
	if ( ! buf ) {
		// note it
		log("test: failed to alloc %"INT32" bytes for url buf",fsize);
		// disable testing
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}
	// open it
	f.open ( O_RDONLY );
	// read it in
	int32_t rs = f.read ( buf , fsize , 0 ) ;
	f.close();
	// check it
	if ( rs != fsize ) {
		// note it
		log("test: failed to read %"INT32" bytes of urls.txt file",fsize);
		// don't leak the buffer on this error path
		mfree ( buf , need , "qatest" );
		// disable testing
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}
	// save it
	m_urlBuf = buf;
	// null term it just in case
	buf[need-1] = '\0';
	// end of it, including the terminating \0
	m_urlEnd = buf + need;
	// init url offset
	m_urlPtr = m_urlBuf;

	// reset just in case
	//m_spiderLinks = false;
	m_bypassMenuElimination = false;

	// first check for spiderlinks=1|true
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		//if ( p[0] != 's' ) continue;
		//if ( p[1] != 'p' ) continue;
		//if ( ! strncmp(p,"spiderlinks",11) ) 
		//	m_spiderLinks = true;
		//if ( ! strncmp(p,"bypassmenuelimination",21) ) 
		//	m_bypassMenuElimination = true;
	}

	// force max spiders to one because one page is often dependent
	// on the previous page!
	//if ( ! m_spiderLinks ) cr->m_maxNumSpiders = 1;
	// need to make it 6 since some priorities essentially lock the
	// ips up that have urls in higher priorities. i.e. once we dole 
	// a url out for ip X, then if later we add a high priority url for
	// IP X it can't get spidered until the one that is doled does.
	//else                   cr->m_maxNumSpiders = 6;

	// . first space out all comments
	// . comments are nice because we know why the url is in urls.txt
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		// skip if not start of a comment line
		if ( *p != '#' ) continue;
		// if not preceded by a \n or start, skip
		if ( p > m_urlBuf && *(p-1) != '\n' ) continue;
		// ok, nuke it
		for ( ; *p && *p !='\n' ; p++ ) *p = ' ';
	}

	// if we hit "\nSTOP\n" then white out that and all past it
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		// skip if not start of a line
		if ( *p != '\n' ) continue;
		// check it
		if ( strncmp(p,"\nSTOP\n",6) ) continue;
		// white out
		for ( ; *p ; p++ ) {
			// until we HIT RESUME
			if ( *p == '\n' && ! strncmp(p,"\nRESUME\n",8) ) {
				// blank out "RESUME" but keep the \n's
				p[1] = ' ';
				p[2] = ' ';
				p[3] = ' ';
				p[4] = ' ';
				p[5] = ' ';
				p[6] = ' ';
				break;
			}
			*p = ' ';
		}
		// all done
		//break;
	}

	// then NULL terminate all urls by converting all white space to \0s
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ )
		// all non url chars to \0
		if ( is_wspace_a(*p) ) *p = '\0';
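
	// Worked example (hypothetical urls.txt; format inferred from
	// the three passes above):
	//
	//   # seed pages
	//   http://a.com/x.html
	//   STOP
	//   http://b.com/skipped.html
	//   RESUME
	//   http://c.com/y.html
	//
	// pass 1 blanks the "#..." line, pass 2 blanks everything from
	// "\nSTOP\n" through "\nRESUME\n" inclusive, and pass 3 turns all
	// whitespace into '\0', leaving a buffer of NUL-separated urls:
	//   ...\0http://a.com/x.html\0...\0http://c.com/y.html\0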

	// flag this
	m_isRunning = true;

	// and this
	m_isAdding = true;

	m_testStartTime = gettimeofdayInMilliseconds();

	// set up dedup table
	m_dt.set ( 8,0,0,NULL,0,false,MAX_NICENESS,"testdedup");
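	// m_dt is used as a set of 8-byte url hashes: presumably the args
	// are (keySize=8, dataSize=0, initialSlots=0, buf=NULL, bufSize=0,
	// allowDups=false, niceness, allocName) per the HashTableX-style
	// set() used elsewhere in this codebase; injectLoop() below only
	// calls isInTable()/addKey() on it.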

	// note it
	log("test: beginning injection");

	// . now inject each url in order, one at a time using msg7 i guess
	// . returns true if all done
	if ( ! injectLoop() ) return;
	// close it up
	//stopIt();
}


// this should be called when all docs have finished spidering
void Test::stopIt ( ) {

	// sanity
	if ( m_isAdding ) { char *xx=NULL;*xx=0; }
	// flag that we are done
	m_isRunning = false;

	// print time
	log("test: took %"INT64" ms to complete injections.",
	    gettimeofdayInMilliseconds() - m_testStartTime );

	// get this before setting testParserEnabled to false
	char *testDir = g_test.getTestDir();

	// turn this off now too
	g_conf.m_testParserEnabled = false;
	g_conf.m_testSpiderEnabled = false;

	// save all!
	bool disabled = g_threads.m_disabled;
	g_threads.disableThreads();
	// save it blocking style
	g_process.save();
	if ( ! disabled ) g_threads.enableThreads();

	// save ips.txt
	saveTestBuf ( testDir );

	log("test: test completed. making qa.html");

	//
	//
	// NOW MAKE THE qa.html FILE
	//
	//
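
	// qa.html layout (sketch): a summary table of up to the last 7
	// runs (conf/coll parm diff sizes plus the version banner), then
	// one table per injected url with per-run parser-diff and
	// validation stats, sorted so the most broken urls come first.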

	// only analyze up to last 7 runs
	int32_t start = m_runId - 7;
	if ( start < 0 ) start = 0;

	SafeBuf sb;
	sb.safePrintf("<table border=1>\n");
	sb.safePrintf("<tr>"
		      "<td><b><nobr>run id</nobr></b></td>"
		      "<td><b><nobr>conf diff</nobr></b></td>"
		      "<td><b><nobr>coll diff</nobr></b></td>"
		      "<td><b><nobr>run info</nobr></b></td>"
		      "</tr>\n");

	// take diffs between this run and the last run for confparms
	for ( int32_t i = m_runId ; i > start ; i-- ) {
		// shortcut
		char *dir = g_hostdb.m_dir;
		// make diff filename
		char diff1[200];
		sprintf(diff1,"%s/%s/run.%"INT32".confparms.txt.diff",dir,
			testDir,i);
		File f1;
		f1.set(diff1);
		if ( ! f1.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%"INT32".confparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%"INT32".confparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff1);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		int32_t fs1 = f1.getFileSize();
		sb.safePrintf("<tr><td>%"INT32"</td><td>%"INT32"</td>", i,fs1);

		// make diff filename
		char diff2[200];
		sprintf(diff2,"%s/%s/run.%"INT32".collparms.txt.diff",dir,
			testDir,i);
		File f2;
		f2.set(diff2);
		if ( ! f2.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%"INT32".collparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%"INT32".collparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff2);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		int32_t fs2 = f2.getFileSize();
		sb.safePrintf("<td>%"INT32"</td>", fs2);

		// the version
		char vf[200];
		sprintf(vf,"%s/%s/run.%"INT32".version.txt",dir,testDir,i);
		File f3;
		f3.set ( vf );
		int32_t fs3 = f3.getFileSize();
		char vbuf[1000];
		vbuf[0] = 0;
		// clamp so the '\0' we write below stays inside vbuf
		if ( fs3 > (int32_t)sizeof(vbuf) - 1 )
			fs3 = (int32_t)sizeof(vbuf) - 1;
		if ( fs3 > 0 ) {
			f3.open(O_RDONLY);
			int32_t rs = f3.read(vbuf,fs3,0);
			vbuf[fs3] = '\0';
			// close before the continue below so we don't
			// leak the fd
			f3.close();
			if ( rs <= 0 ) continue;
		}
		// show it
		sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf);
	}
	sb.safePrintf("</table>\n");
	sb.safePrintf("<br>\n");

	//
	// now diff each parser output file for each url in urls.txt
	//

	//
	// loop over url buf first so we can print one table per url
	//

	char *next = NULL;
	// reset the url buf ptr
	m_urlPtr = m_urlBuf;
	// count em
	int32_t count = 0;

	// ptrs to each url table
	int32_t  un = 0;
	int32_t  uptr [5000]; // offsets now, not char ptr since buf gets reallocd
	char  udiff[5000];
	int32_t  ulen [5000];
	int32_t  uhits[5000]; // validateOutput() hits (good)
	int32_t  uunchecked[5000]; // events/addresses found but not validated
	int32_t  umiss[5000]; // validateOutput() misses. critical errors!
	int32_t  usort[5000];
	int32_t  uevents[5000];
	SafeBuf tmp;
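
	// Each url's html chunk is printed into "tmp" and later copied
	// into "sb" in sorted order; uptr/ulen record each chunk as an
	// offset+length pair because tmp's internal buffer can realloc
	// while we keep printing, which would invalidate raw pointers.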

	// advance to next url
	for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// we converted all non-url chars into \0's so skip those!
		for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ );
		// breach check
		if ( m_urlPtr >= m_urlEnd ) break;
		// set this up
		next = m_urlPtr;
		// compute next url ptr
		for ( ; next < m_urlEnd && *next ; next++ );
		// point to this url
		char *u = m_urlPtr;
		// get hash
		int64_t h = hash64 ( u , gbstrlen(u) );
		// shortcut
		char *dir = g_hostdb.m_dir;

		// print into a secondary safe buf with a ptr to
		// it so we can sort that and transfer into the
		// primary safebuf later
		uptr[un] = tmp.length();
		// assume no diff
		udiff[un] = 0;

		// print number
		tmp.safePrintf("%"INT32") ",count++);
		// . link to our stored http server reply
		// . TODO: link it to our [cached] copy in the test coll!!!
		char local[1200];
		sprintf(local,"/%s/doc.%"UINT64".html",testDir,h);
		tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u);
		// link to live page
		tmp.safePrintf(" <a href=\"%s\">live</a> ",u);
		// link to page parser (the test coll is "qatest123")
		char ubuf[2000];
		urlEncode(ubuf,2000,u,gbstrlen(u),true);
		tmp.safePrintf(" <a href=\"/admin/parser?c=qatest123&"
			       "u=%s\">parser</a> ",ubuf);
		//tmp.safePrintf(" (%"UINT64")",h);
		tmp.safePrintf("<br>\n");
		//tmp.safePrintf("<br>\n");
		tmp.safePrintf("<table border=1>\n");
		tmp.safePrintf("<tr>"
			      "<td><b><nobr>run id</nobr></b></td>"
			      "<td><b><nobr>crit hits</nobr></b></td>"
			      "<td><b><nobr>crit errors</nobr></b></td>"
			      "<td><b><nobr># e</nobr></b></td>"
			      "<td><b><nobr>unchecked</nobr></b></td>"
			      "<td><b><nobr>diff chars</nobr></b></td>"
			      "<td><b><nobr>diff file</nobr></b></td>"
			      "<td><b><nobr>full output</nobr></b></td>"
			      "</tr>\n");

		//SafeBuf sd;

		// loop over all the runs now, starting with latest run first
		for ( int32_t ri = m_runId ; ri >= start ; ri-- ) {

			QUICKPOLL(MAX_NICENESS);

			// the diff filename
			char pdiff[200];
			sprintf(pdiff,"%s/%s/parse.%"UINT64".%"INT32".html.diff",dir,
				testDir,h,ri);
			File f;
			f.set(pdiff);
			int32_t fs = f.getFileSize();
			if ( ! f.doesExist() && ri > 0 ) {
				// make the parse filename
				char pbuf1[200];
				char pbuf2[200];
				sprintf(pbuf1,"%s/%s/parse.%"UINT64".%"INT32".html",
					dir,testDir,h,ri);
				sprintf(pbuf2,"%s/%s/parse.%"UINT64".%"INT32".html",
					dir,testDir,h,ri-1);
				// sanity check
				//File tf; tf.set(pbuf1);
				//if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;}
				// tmp file name
				char tmp1[200];
				char tmp2[200];
				sprintf(tmp1,"%s/%s/t1.html",dir,testDir);
				sprintf(tmp2,"%s/%s/t2.html",dir,testDir);
				// filter first
				char cmd[600];
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf1,tmp1);
				system(cmd);
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf2,tmp2);
				system(cmd);
				// make the system cmd to do the diff
				sprintf(cmd,
					"echo \"<pre>\" > %s ; "
					"diff -w --text %s %s "
					// ignore this table header row
					//" | grep -v \"R#4\""
					" >> %s",
					pdiff,
					tmp1,tmp2,pdiff);
				log("test: system(\"%s\")",cmd);
				system(cmd);
				// try again
				f.set(pdiff);
				fs = f.getFileSize();
			}
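
			// The "<!--ignore-->" filter above drops lines the
			// parser marked as volatile before diffing; e.g. a
			// (hypothetical) parser-output line like
			//   <tr><td>download time</td><td>312 ms</td></tr><!--ignore-->
			// would otherwise make every run diff against its
			// predecessor.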

			QUICKPOLL(MAX_NICENESS);

			// this means 0 . it just has the <pre> tag in it!
			if ( fs < 0 || fs == 6 ) fs = 0;
			// . if no diff and NOT current run, do not print it
			// . print it if the run right before the current 
			//   now always too
			if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) 
				continue;
			// relative filename
			char rel[200];
			sprintf(rel,"/%s/parse.%"UINT64".%"INT32".html.diff",
				testDir,h,ri);
			char full[200];
			sprintf(full,"/%s/parse.%"UINT64".%"INT32".html",
				testDir,h,ri);
			char validate[200];
			sprintf(validate,
				"/%s/parse-shortdisplay.%"UINT64".%"INT32".html",
				testDir,h,ri);
			// use pink font for a current run that has a diff!
			char *t1 = "";
			char *t2 = "";
			if ( ri == m_runId && fs != 0 ) {
				t1 = "<font color=pink><b>";
				t2 = "</b></font>";
				// a diff
				udiff[un] = 1;
			}

			// . get critical errors
			// . i.e. XmlDoc::validateOutput() could not validate
			//   a particular event or address that was in the
			//   url's "validated.uh64.txt" file since the admin
			//   clicked on the checkbox in the page parser output
			// . if we do not find such a tag in the parser output
			//   any more then Spider.cpp creates this file!
			if ( ri == m_runId ) {
				char cfile[256];
				sprintf(cfile,"%s/%s/critical.%"UINT64".%"INT32".txt",
					g_hostdb.m_dir,testDir,h,ri);
				SafeBuf ttt;
				ttt.fillFromFile(cfile);
				// the four int32_t's are misses, hits,
				// events, then unchecked
				umiss[un] = 0;
				uhits[un] = 0;
				uevents[un] = 0;
				uunchecked[un] = 0;
				if ( ttt.length() >= 3 )
					sscanf(ttt.getBufStart(),
					       "%"INT32" %"INT32" %"INT32" %"INT32"",
					       &umiss[un],
					       &uhits[un],
					       &uevents[un],
					       &uunchecked[un]);
				usort[un] = umiss[un] + uunchecked[un];
				//File cf;
				//cf.set(cfile);
				//if ( cf.doesExist()) ucrit[un] = 1;
				//else                 ucrit[un] = 0;
			}
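
			// e.g. a critical.<urlhash>.<run>.txt containing
			// "2 5 3 1" would mean 2 validateOutput() misses,
			// 5 hits, 3 events and 1 unchecked item for that
			// url in that run (illustrative numbers).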

			// more critical?
			if ( ri == m_runId && umiss[un] != 0 ) {
				t1 = "<font color=red><b>";
				t2 = "</b></font>";
			}

			// . these are good to have
			// . if you don't have 1+ critical hits then you
			//   probably need to be validated by the qa guy
			char *uhb1 = "";
			char *uhb2 = "";
			if ( ri == m_runId && uhits[un] != 0 ) {
				uhb1 = "<font color=green><b>**";
				uhb2 = "**</b></font>";
			}

			QUICKPOLL(MAX_NICENESS);

			char *e1 = "<td>";
			char *e2 = "</td>";
			int32_t ne = uevents[un];
			if ( ne ) {
				e1="<td bgcolor=orange><b><font color=brown>";
				e2="</font></b></td>";
			}
			char *u1 = "<td>";
			char *u2 = "</td>";
			if ( uunchecked[un] ) {
				u1="<td bgcolor=purple><b><font color=white>";
				u2="</font></b></td>";
			}

			// print the row!
			tmp.safePrintf("<tr>"
				      "<td>%s%"INT32"%s</td>"
				      "<td>%s%"INT32"%s</td>" // critical hits
				      "<td>%s%"INT32"%s</td>" // critical misses
				      "%s%"INT32"%s" // # events
				      "%s%"INT32"%s" // unchecked
				      "<td>%s%"INT32"%s</td>" // filesize of diff
				      // diff filename
				      "<td><a href=\"%s\">%s%s%s</a></td>"
				      // full parser output
				      "<td>"
				      "<a href=\"%s\">full</a> | "
				      "<a href=\"%s\">validate</a> "
				      "</td>"
				      "</tr>\n",
				      t1,ri,t2,
				      uhb1,uhits[un],uhb2,
				      t1,umiss[un],t2,
				      e1,ne,e2,
				      u1,uunchecked[un],u2,
				      t1,fs,t2,
				      rel,t1,rel,t2,
				      full,
				      validate);

			// only fill "sd" for the most recent guy
			if ( ri != m_runId ) continue;

			// now concatenate the parse-shortdisplay file
			// to this little table so qa admin can check/uncheck
			// validation checkboxes for addresses and events
			//sprintf(cfile,
			//	"%s/test/parse-shortdisplay.%"UINT64".%"INT32".html",
			//	g_hostdb.m_dir,h,ri);
			//sd.fillFromFile ( cfile );
		}
		// end table
		tmp.safePrintf("</table>\n");

		// . and a separate little section for the checkboxes
		// . should already be in tables, etc.
		// . each checkbox should provide its own uh64 when it
		//   calls senddiv() when clicked now
		//tmp.cat ( sd );

		tmp.safePrintf("<br>\n");
		tmp.safePrintf("<br>\n");
		// set this
		ulen[un] = tmp.length() - uptr[un] ;
		// sanity check
		if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; }
		// inc it
		un++;
		// increase the 5000!!
		if ( un >= 5000 ) { char *xx=NULL; *xx=0; }
	}
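
	// Sort the per-url chunks, worst first: descending by usort
	// (= critical misses + unchecked), ties broken by whether the
	// latest run produced a diff. A bubble sort is fine here since
	// un is capped at 5000 and this runs once per test.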

	char flag ;
 bubble:
	flag = 0;
	// sort the url tables
	for ( int32_t i = 0 ; i < un - 1 ; i++ ) {
		QUICKPOLL(MAX_NICENESS);
		if ( usort[i] >  usort[i+1] ) continue;
		if ( usort[i] == usort[i+1] ) 
			if ( udiff[i] >= udiff[i+1] ) continue;
		// swap em
		int32_t  tp = uptr[i];
		int32_t  td = udiff[i];
		int32_t  um = umiss[i];
		int32_t  us = usort[i];
		int32_t  uh = uhits[i];
		int32_t  tl = ulen [i];
		uptr[i] = uptr[i+1];
		umiss[i] = umiss[i+1];
		usort[i] = usort[i+1];
		uhits[i] = uhits[i+1];
		udiff[i] = udiff[i+1];
		ulen[i]  = ulen[i+1];
		uptr[i+1] = tp;
		umiss[i+1] = um;
		usort[i+1] = us;
		uhits[i+1] = uh;
		udiff[i+1] = td;
		ulen [i+1] = tl;
		flag = 1;
	}
	if ( flag ) goto bubble;

	// transfer into primary safe buf now
	for ( int32_t i = 0 ; i < un ; i++ ) 
		sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]);

	sb.safePrintf("</html>\n");

	char dfile[200];
	sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir);
	sb.dumpToFile ( dfile );

	// free the buffer of urls
	reset();

	// turn off spiders
	g_conf.m_spideringEnabled = 0;

	// all done
	return;
}


void injectedWrapper ( void *state ) {
	// wait for all msg4 buffers to flush
	//if ( ! flushMsg4Buffers ( state , injectedWrapper ) ) return;
	// this function is in Msge1.cpp. save ip file in test subdir
	//saveTestBuf();
	if ( ! g_test.injectLoop() ) return;
	//g_test.stopIt();
}
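
// Callback chain (sketch): injectLoop() fires a Msg4::addMetaList()
// per url; when an add blocks, injectLoop() returns false and
// injectedWrapper() re-enters it once the add completes, resuming at
// the next url. The final flushMsg4Buffers() call reuses the same
// wrapper, so the loop also finishes through this path.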

static int32_t s_count = 0;

// . returns true if all done!
// . returns false if still doing stuff
bool Test::injectLoop ( ) {

	int32_t  dlen   ;
	char *dom    ;
	int32_t  fakeIp ;

 loop:
	// advance to next url
	for ( ; m_urlPtr < m_urlEnd && ! *m_urlPtr ; m_urlPtr++ ) ;
	// all done?
	if ( m_urlPtr >= m_urlEnd ) {
		// flush em out
		if ( ! flushMsg4Buffers ( this , injectedWrapper ) ) 
			return false;
		// note it
		m_isAdding = false;
		// all done
		return true;
	}
	// error means all done
	if ( m_errno ) { m_isAdding = false; return true; }
	// point to it
	char *u = m_urlPtr;
	// advance to point to the next url for the next loop!
	for ( ; m_urlPtr < m_urlEnd && *m_urlPtr ; m_urlPtr++ ) ;

	// hash it
	int64_t h = hash64b ( u );
	// dedup it lest we freeze up and stopIt() never gets called because
	// m_urlsAdded is never decremented all the way to zero in Spider.cpp
	if ( m_dt.isInTable ( &h ) ) goto loop;
	// add it. return true with g_errno set on error
	if ( ! m_dt.addKey ( &h ) ) goto hadError;

	// make the SpiderRequest from it
	m_sreq.reset();
	// url
	strcpy ( m_sreq.m_url , u );
	// get domain of url
	dom = getDomFast ( m_sreq.m_url , &dlen );
	// make a fake ip
	fakeIp = 0x123456;
	// use domain if we got that
	if ( dom && dlen ) fakeIp = hash32 ( dom , dlen );
	// first ip is fake
	m_sreq.m_firstIp = fakeIp; // 0x123456;
	// these too
	m_sreq.m_domHash32  = fakeIp;
	m_sreq.m_hostHash32 = fakeIp;
	m_sreq.m_siteHash32 = fakeIp;
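
	// Hashing the domain into the fake first ip means every url on
	// the same domain shares one "ip", which presumably keeps
	// Spider.cpp's per-ip scheduling/throttling behaving sensibly
	// for test urls whose real ips we never look up here.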
	//m_sreq.m_probDocId = Titledb::getProbableDocId( m_sreq.m_url );
	// this crap is fake
	m_sreq.m_isInjecting = 1;
	// use test-spider subdir for storing pages and spider times?
	// MDW: this was replaced by m_isParentSiteMap bit.
	//if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
	// use this later
	// injected requests use this as the spider time i guess
	// so we can sort them by this
	m_sreq.m_addedTime = ++s_count;

	// no, because to compute XmlDoc::m_min/maxPubDate we need this to
	// be valid for our test run.. no no we will fix it to be
	// basically 2 days before spider time in the code...
	//m_sreq.m_addedTime = spiderTime;

	m_sreq.m_fakeFirstIp = 1;

	// make the key (parentDocId=0)
	m_sreq.setKey ( fakeIp, 0LL , false );
	// test it
	if ( g_spiderdb.getFirstIp(&m_sreq.m_key) != fakeIp ) {
		char *xx=NULL;*xx=0;}
	// sanity check. check for http(s)://
	if ( m_sreq.m_url[0] != 'h' ) { char *xx=NULL;*xx=0; }

	// reset this
	g_errno = 0;

	// count it
	m_urlsAdded++;

	// note it
	//log("crazyout: %s",m_sreq.m_url );
	logf(LOG_DEBUG,"spider: injecting test url %s",m_sreq.m_url);

	// the receiving end will realize that we are injecting into the test
	// collection and use the "/test/" subdir to load the file
	// "ips.txt" to do our ip lookups, and search for any downloads in
	// that subdirectory as well.
	if ( ! m_msg4.addMetaList ( (char *)&m_sreq     ,
				    m_sreq.getRecSize() ,
				    m_coll              ,
				    NULL                ,
				    injectedWrapper     ,
				    RDB_SPIDERDB        ) )
		// return false if blocked
		return false;
	// error?
	if ( g_errno ) {
		// jump down here from above on error
	hadError:
		// save it
		m_errno = g_errno;
		// flag it
		m_isAdding = false;
		// note it
		log("test: inject had error: %s",mstrerror(g_errno));
		// stop, we are all done!
		return true;
	}
	// add the next spider request
	goto loop;
}