// Copyright 2009, Gigablast Inc.

// . runs a series of tests on a gigablast instance
// . right now just performs injections to test parsing and indexing

#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500

#include "gb-include.h"

#include "Test.h"
#include "Rdb.h"
#include "Spider.h"
#include "Pages.h"
#include "PingServer.h"
#include "Process.h"
#include "Threads.h"
#include "Msge1.h"
#include "Parms.h"

//static void testWrapper ( int fd , void *state ) ;
static void injectedWrapper ( void *state ) ;

// the global class
Test g_test;

Test::Test() {
	m_urlBuf = NULL;
	m_isRunning = false;
	m_isAdding  = false;
	m_urlsAdded = 0;
	m_urlsIndexed = 0;
	//m_spiderLinks = true;//false;
	m_bypassMenuElimination = false;
	// assume if they just turn spiders on we use this
	//m_testDir = "test-spider";
}

// main.cpp calls g_test.init()
bool Test::init ( ) {
	m_isRunning = false;
	m_isAdding  = false;
	m_urlsAdded = 0;
	m_urlsIndexed = 0;
	//if( ! g_loop.registerSleepCallback( 1 , NULL , testWrapper ) )
	//	return log("repair: Failed register callback.");
	// record current value
	//m_testSpiderEnabledSaved = g_conf.m_testSpiderEnabled;
	//m_testParserEnabledSaved = g_conf.m_testParserEnabled;
	return true;
}

void Test::reset ( ) {
	if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test999");
	// guard against a double free if reset() is called again
	m_urlBuf = NULL;
	//m_spiderLinks = true;//false;
	m_bypassMenuElimination = false;
}

// . call this once every second
// . this is responsible for advancing from one g_repairMode to the next
//void testWrapper ( int fd , void *state ) {
//	// call it from the class
//	g_test.loop();
//}

char *Test::getTestDir ( ) {
	// sanity
	if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }
	if ( g_conf.m_testSpiderEnabled )
		return "test-spider";
	if ( g_conf.m_testParserEnabled )
		return "test-parser";
	// default if they just turn on spiders (spiders on cmd)
	//return "test-spider";
	//if ( ! m_testDir ) { char *xx=NULL;*xx=0; }
	char *xx=NULL;*xx=0;
	return NULL;
}
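
// Directory layout this class assumes under the working dir (a sketch,
// inferred from the sprintf() calls below; not an exhaustive list):
//   <g_hostdb.m_dir>/test-spider/  or  <g_hostdb.m_dir>/test-parser/
//     urls.txt                       list of urls to inject, in order
//     run.<id>.confparms.txt         g_conf dumped as xml for run <id>
//     run.<id>.collparms.txt         coll rec dumped as xml for run <id>
//     run.<id>.version.txt           gb version banner + run start time
//     parse.<urlhash>.<id>.html      parser output per url per run
//     critical.<urlhash>.<id>.txt    validateOutput() stats per url
//     qa.html                        the summary page built by stopIt()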

void Test::removeFiles ( ) {
	// reset
	m_errno = 0;

	if ( g_conf.m_testParserEnabled ) {
		// remove all old files for now to avoid system diffs
		log("test: removing old parse, critical and run files from "
		    "last run.");
		//system ("rm /home/mwells/gigablast/test/parse*.?.*" );
		//system ("rm /home/mwells/gigablast/test/critical*.?.*" );
		char sbuf[1024];
		char *testDir = getTestDir();
		sprintf(sbuf,"rm %s/%s/run.?.*" ,
			g_hostdb.m_dir,testDir);
		system (sbuf);
		// use this one instead since rm doesn't always work
		sprintf(sbuf,"ls -1 %s/%s/ | grep parse | xargs --verbose "
			"-I xxx rm %s/%s/xxx" ,
			g_hostdb.m_dir,
			testDir ,
			g_hostdb.m_dir,
			testDir );
		log("test: %s",sbuf);
		system(sbuf);

		sprintf(sbuf,"ls -1 %s/%s/ | grep critical | xargs --verbose "
			"-I xxx rm %s/%s/xxx" ,
			g_hostdb.m_dir,
			testDir ,
			g_hostdb.m_dir,
			testDir );
		log("test: %s",sbuf);
		system(sbuf);
	}
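	// For example, with g_hostdb.m_dir = "/home/gb" and testDir =
	// "test-parser", the cleanup above expands to:
	//   ls -1 /home/gb/test-parser/ | grep parse | \
	//     xargs --verbose -I xxx rm /home/gb/test-parser/xxx
	// i.e. every filename containing "parse" is removed one at a
	// time, which presumably sidesteps the arg-list-too-long failures
	// a plain "rm parse*" can hit on big test dirs. (note also that
	// the "run.?.*" glob above only matches single-digit run ids.)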

	// do not crash for lack of quickpoll now
	int32_t saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;

	CollectionRec *cr = g_collectiondb.getRec("qatest123");

	// . reset the qatest collection to zero docs
	// . TODO: implement this. only allow it for qatest coll.
	// . kinda like Collectiondb::deleteRec() i guess but we need to
	//   preserve the parms!!
	// . deletetagdb = false
	if ( cr ) g_collectiondb.resetColl2 ( cr->m_collnum ,
					      cr->m_collnum ,
					      true );

	// reset event count
	//g_collectiondb.countEvents();

	// turn it back on
	g_conf.m_useQuickpoll = saved;
}

// come here once per second i guess
void Test::initTestRun ( ) {

	g_errno = 0;

	// sanity: at least one of the two test modes must be enabled
	// for a test run to be initialized
	if ( ! g_conf.m_testSpiderEnabled && ! g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }

	// if both enabled, core
	if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }

	// return if currently running
	// no, admin can re-init even if running now
	//if ( m_isRunning ) { char *xx=NULL;*xx=0; }//return;

	// must be host #0 only
	if ( g_hostdb.m_myHost->m_hostId != 0 ) return;

	// if was initially in this mode, don't do anything
	//if ( m_testSpiderEnabledSaved ) return;
	//if ( m_testParserEnabledSaved ) return;

	// you must have the "qatest123" coll already setup!
	CollectionRec *cr = g_collectiondb.getRec("qatest123");
	if ( ! cr ) {
		// note it
		log("test: please add a collection named \"qatest123\" "
		    "first.");
		// stop the test
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}

	char *testDir = getTestDir();

	// scan for the first unused run id by probing for the
	// run.%"INT32".collparms.txt file, a dump of all the conf
	// and parms for that run
	char filename[100];
	File f;
	int32_t i; for ( i = 0 ; i < 9999 ; i++ ) {
		// make filename. base it off working dir, g_hostdb.m_dir
		sprintf ( filename,"%s/%s/run.%"INT32".collparms.txt",
			  g_hostdb.m_dir,testDir,i );
		// exist?
		f.set ( filename );
		// 1 if it exists, 0 if not, -1 on error
		int32_t status = f.doesExist();
		// error?
		if ( status == -1 ) {
			// note it in the log
			log("test: doesExist() returned -1");
			// end the test
			g_conf.m_testParserEnabled = false;
			g_conf.m_testSpiderEnabled = false;
			// all done
			return;
		}
		// try next i if this one in use
		if ( status ) continue;
		// got one
		break;
	}
	// close it
	f.close();
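	// e.g. the scan above probes, in order,
	//   <dir>/<testDir>/run.0.collparms.txt
	//   <dir>/<testDir>/run.1.collparms.txt
	//   ...
	// and stops at the first id with no file, so run ids keep
	// incrementing across test runs until someone wipes the test dir.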

	// create the run.%"INT32".version.txt file
	char cmd[1000];
	char vfile[200];
	sprintf(vfile,"%s/%s/run.%"INT32".version.txt",g_hostdb.m_dir,testDir,i);
	sprintf(cmd,
		"%s/gb -v >& %s ; "
		"echo -n \"RUN START TIME: \" >> %s ; "
		"date >> %s",
		g_hostdb.m_dir,vfile,
		vfile,
		vfile);
	system(cmd);
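	// run.<id>.version.txt now holds the "gb -v" version banner
	// followed by a "RUN START TIME: <date>" line. (caveat: the
	// ">&" redirect is a bash/csh-ism; if system() hands this to a
	// strict POSIX sh, stderr will not be captured.)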

	// save it
	m_runId = i;

	cr = g_collectiondb.getRec ( "qatest123" );
	if ( ! cr ) {
		// and no more of this
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		return;
	}
	// set these
	m_coll = cr->m_coll;

	// turn on spiders
	//cr->m_spideringEnabled = 1;

	// crap i guess this too!!!
	//g_conf.m_spideringEnabled = 1;

	//
	// log out the global parms
	//
	char fbuf[100];
	// print our global parms into a file called run.%"INT32".confparms.txt
	sprintf(fbuf,"%s/%s/run.%"INT32".confparms.txt",g_hostdb.m_dir,testDir,i);
	// this saves it as xml i think
	g_parms.saveToXml ( (char *)&g_conf , fbuf , OBJ_CONF);

	//
	// log out the coll specific parms
	//
	// update name
	sprintf(fbuf,"%s/%s/run.%"INT32".collparms.txt",g_hostdb.m_dir,testDir,i);
	// save that
	g_parms.saveToXml ( (char *)cr , fbuf , OBJ_COLL);

	// get the list of urls to download and inject in order
	sprintf(fbuf,"%s/%s/urls.txt",g_hostdb.m_dir,testDir);
	// set it
	f.set ( fbuf ) ;
	// get its size
	int32_t fsize = f.getFileSize();
	// add one for \0 termination
	int32_t need = fsize + 1;
	// alloc a buf for it
	char *buf = (char *)mmalloc ( need ,"qatest");
	// error?
	if ( ! buf ) {
		// note it
		log("test: failed to alloc %"INT32" bytes for url buf",fsize);
		// disable testing
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}
	// open it
	f.open ( O_RDONLY );
	// read it in
	int32_t rs = f.read ( buf , fsize , 0 ) ;
	f.close();
	// check it
	if ( rs != fsize ) {
		// note it
		log("test: failed to read %"INT32" bytes of urls.txt file",fsize);
		// don't leak the buffer on this error path
		mfree ( buf , need , "qatest" );
		// disable testing
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}
	// save it
	m_urlBuf = buf;
	// null term it just in case
	buf[need-1] = '\0';
	// end of it, including the terminating \0
	m_urlEnd = buf + need;
	// init url offset
	m_urlPtr = m_urlBuf;

	// reset just in case
	//m_spiderLinks = false;
	m_bypassMenuElimination = false;

	// first check for spiderlinks=1|true
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		//if ( p[0] != 's' ) continue;
		//if ( p[1] != 'p' ) continue;
		//if ( ! strncmp(p,"spiderlinks",11) ) 
		//	m_spiderLinks = true;
		//if ( ! strncmp(p,"bypassmenuelimination",21) ) 
		//	m_bypassMenuElimination = true;
	}

	// force max spiders to one because one page is often dependent
	// on the previous page!
	//if ( ! m_spiderLinks ) cr->m_maxNumSpiders = 1;
	// need to make it 6 since some priorities essentially lock the
	// ips up that have urls in higher priorities. i.e. once we dole 
	// a url out for ip X, then if later we add a high priority url for
	// IP X it can't get spidered until the one that is doled does.
	//else                   cr->m_maxNumSpiders = 6;

	// . first space out all comments
	// . comments are nice because we know why the url is in urls.txt
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		// skip if not start of a comment line
		if ( *p != '#' ) continue;
		// if not preceded by a \n or start, skip
		if ( p > m_urlBuf && *(p-1) != '\n' ) continue;
		// ok, nuke it
		for ( ; *p && *p !='\n' ; p++ ) *p = ' ';
	}

	// if we hit "\nSTOP\n" then white out that and all past it
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		// skip if not start of a line
		if ( *p != '\n' ) continue;
		// check it
		if ( strncmp(p,"\nSTOP\n",6) ) continue;
		// white out
		for ( ; *p ; p++ ) {
			// until we HIT RESUME
			if ( *p == '\n' && ! strncmp(p,"\nRESUME\n",8) ) {
				// blank out "RESUME" but keep the \n's
				p[1] = ' ';
				p[2] = ' ';
				p[3] = ' ';
				p[4] = ' ';
				p[5] = ' ';
				p[6] = ' ';
				break;
			}
			*p = ' ';
		}
		// all done
		//break;
	}

	// then NULL terminate all urls by converting all white space to \0s
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ )
		// all non url chars to \0
		if ( is_wspace_a(*p) ) *p = '\0';
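
	// Worked example (hypothetical urls.txt; format inferred from
	// the three passes above):
	//
	//   # seed pages
	//   http://a.com/x.html
	//   STOP
	//   http://b.com/skipped.html
	//   RESUME
	//   http://c.com/y.html
	//
	// pass 1 blanks the "#..." line, pass 2 blanks everything from
	// "\nSTOP\n" through "\nRESUME\n" inclusive, and pass 3 turns all
	// whitespace into '\0', leaving a buffer of NUL-separated urls:
	//   ...\0http://a.com/x.html\0...\0http://c.com/y.html\0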

	// flag this
	m_isRunning = true;

	// and this
	m_isAdding = true;

	m_testStartTime = gettimeofdayInMilliseconds();

	// set up dedup table
	m_dt.set ( 8,0,0,NULL,0,false,MAX_NICENESS,"testdedup");
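	// m_dt is used as a set of 8-byte url hashes: presumably the args
	// are (keySize=8, dataSize=0, initialSlots=0, buf=NULL, bufSize=0,
	// allowDups=false, niceness, allocName) per the HashTableX-style
	// set() used elsewhere in this codebase; injectLoop() below only
	// calls isInTable()/addKey() on it.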

	// note it
	log("test: beginning injection");

	// . now inject each url in order, one at a time using msg7 i guess
	// . returns true if all done
	if ( ! injectLoop() ) return;
	// close it up
	//stopIt();
}


// this should be called when all docs have finished spidering
void Test::stopIt ( ) {

	// sanity
	if ( m_isAdding ) { char *xx=NULL;*xx=0; }
	// flag that we are done
	m_isRunning = false;

	// print time
	log("test: took %"INT64" ms to complete injections.",
	    gettimeofdayInMilliseconds() - m_testStartTime );

	// get this before setting testParserEnabled to false
	char *testDir = g_test.getTestDir();

	// turn this off now too
	g_conf.m_testParserEnabled = false;
	g_conf.m_testSpiderEnabled = false;

	// save all!
	bool disabled = g_threads.m_disabled;
	g_threads.disableThreads();
	// save it blocking style
	g_process.save();
	if ( ! disabled ) g_threads.enableThreads();

	// save ips.txt
	saveTestBuf ( testDir );

	log("test: test completed. making qa.html");

	//
	//
	// NOW MAKE THE qa.html FILE
	//
	//
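
	// qa.html layout (sketch): a summary table of up to the last 7
	// runs (conf/coll parm diff sizes plus the version banner), then
	// one table per injected url with per-run parser-diff and
	// validation stats, sorted so the most broken urls come first.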

	// only analyze up to last 7 runs
	int32_t start = m_runId - 7;
	if ( start < 0 ) start = 0;

	SafeBuf sb;
	sb.safePrintf("<table border=1>\n");
	sb.safePrintf("<tr>"
		      "<td><b><nobr>run id</nobr></b></td>"
		      "<td><b><nobr>conf diff</nobr></b></td>"
		      "<td><b><nobr>coll diff</nobr></b></td>"
		      "<td><b><nobr>run info</nobr></b></td>"
		      "</tr>\n");

	// take diffs between this run and the last run for confparms
	for ( int32_t i = m_runId ; i > start ; i-- ) {
		// shortcut
		char *dir = g_hostdb.m_dir;
		// make diff filename
		char diff1[200];
		sprintf(diff1,"%s/%s/run.%"INT32".confparms.txt.diff",dir,
			testDir,i);
		File f1;
		f1.set(diff1);
		if ( ! f1.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%"INT32".confparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%"INT32".confparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff1);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		int32_t fs1 = f1.getFileSize();
		sb.safePrintf("<tr><td>%"INT32"</td><td>%"INT32"</td>", i,fs1);

		// make diff filename
		char diff2[200];
		sprintf(diff2,"%s/%s/run.%"INT32".collparms.txt.diff",dir,
			testDir,i);
		File f2;
		f2.set(diff2);
		if ( ! f2.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%"INT32".collparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%"INT32".collparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff2);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		int32_t fs2 = f2.getFileSize();
		sb.safePrintf("<td>%"INT32"</td>", fs2);

		// the version
		char vf[200];
		sprintf(vf,"%s/%s/run.%"INT32".version.txt",dir,testDir,i);
		File f3;
		f3.set ( vf );
		int32_t fs3 = f3.getFileSize();
		char vbuf[1000];
		vbuf[0] = 0;
		// clamp so the '\0' we write below stays inside vbuf
		if ( fs3 > (int32_t)sizeof(vbuf) - 1 )
			fs3 = (int32_t)sizeof(vbuf) - 1;
		if ( fs3 > 0 ) {
			f3.open(O_RDONLY);
			int32_t rs = f3.read(vbuf,fs3,0);
			vbuf[fs3] = '\0';
			// close before the continue below so we don't
			// leak the fd
			f3.close();
			if ( rs <= 0 ) continue;
		}
		// show it
		sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf);
	}
	sb.safePrintf("</table>\n");
	sb.safePrintf("<br>\n");

	//
	// now diff each parser output file for each url in urls.txt
	//

	//
	// loop over url buf first so we can print one table per url
	//

	char *next = NULL;
	// reset the url buf ptr
	m_urlPtr = m_urlBuf;
	// count em
	int32_t count = 0;

	// ptrs to each url table
	int32_t  un = 0;
	int32_t  uptr [5000]; // offsets now, not char ptr since buf gets reallocd
	char  udiff[5000];
	int32_t  ulen [5000];
	int32_t  uhits[5000]; // validateOutput() hits (good)
	int32_t  uunchecked[5000]; // events/addresses found but not validated
	int32_t  umiss[5000]; // validateOutput() misses. critical errors!
	int32_t  usort[5000];
	int32_t  uevents[5000];
	SafeBuf tmp;
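
	// Each url's html chunk is printed into "tmp" and later copied
	// into "sb" in sorted order; uptr/ulen record each chunk as an
	// offset+length pair because tmp's internal buffer can realloc
	// while we keep printing, which would invalidate raw pointers.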

	// advance to next url
	for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// we converted all non-url chars into \0's so skip those!
		for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ );
		// breach check
		if ( m_urlPtr >= m_urlEnd ) break;
		// set this up
		next = m_urlPtr;
		// compute next url ptr
		for ( ; next < m_urlEnd && *next ; next++ );
		// point to this url
		char *u = m_urlPtr;
		// get hash
		int64_t h = hash64 ( u , gbstrlen(u) );
		// shortcut
		char *dir = g_hostdb.m_dir;

		// print into a secondary safe buf with a ptr to
		// it so we can sort that and transfer into the
		// primary safebuf later
		uptr[un] = tmp.length();
		// assume no diff
		udiff[un] = 0;

		// print number
		tmp.safePrintf("%"INT32") ",count++);
		// . link to our stored http server reply
		// . TODO: link it to our [cached] copy in the test coll!!!
		char local[1200];
		sprintf(local,"/%s/doc.%"UINT64".html",testDir,h);
		tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u);
		// link to live page
		tmp.safePrintf(" <a href=\"%s\">live</a> ",u);
		// link to page parser (the test coll is "qatest123")
		char ubuf[2000];
		urlEncode(ubuf,2000,u,gbstrlen(u),true);
		tmp.safePrintf(" <a href=\"/admin/parser?c=qatest123&"
			       "u=%s\">parser</a> ",ubuf);
		//tmp.safePrintf(" (%"UINT64")",h);
		tmp.safePrintf("<br>\n");
		//tmp.safePrintf("<br>\n");
		tmp.safePrintf("<table border=1>\n");
		tmp.safePrintf("<tr>"
			      "<td><b><nobr>run id</nobr></b></td>"
			      "<td><b><nobr>crit hits</nobr></b></td>"
			      "<td><b><nobr>crit errors</nobr></b></td>"
			      "<td><b><nobr># e</nobr></b></td>"
			      "<td><b><nobr>unchecked</nobr></b></td>"
			      "<td><b><nobr>diff chars</nobr></b></td>"
			      "<td><b><nobr>diff file</nobr></b></td>"
			      "<td><b><nobr>full output</nobr></b></td>"
			      "</tr>\n");

		//SafeBuf sd;

		// loop over all the runs now, starting with latest run first
		for ( int32_t ri = m_runId ; ri >= start ; ri-- ) {

			QUICKPOLL(MAX_NICENESS);

			// the diff filename
			char pdiff[200];
			sprintf(pdiff,"%s/%s/parse.%"UINT64".%"INT32".html.diff",dir,
				testDir,h,ri);
			File f;
			f.set(pdiff);
			int32_t fs = f.getFileSize();
			if ( ! f.doesExist() && ri > 0 ) {
				// make the parse filename
				char pbuf1[200];
				char pbuf2[200];
				sprintf(pbuf1,"%s/%s/parse.%"UINT64".%"INT32".html",
					dir,testDir,h,ri);
				sprintf(pbuf2,"%s/%s/parse.%"UINT64".%"INT32".html",
					dir,testDir,h,ri-1);
				// sanity check
				//File tf; tf.set(pbuf1);
				//if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;}
				// tmp file name
				char tmp1[200];
				char tmp2[200];
				sprintf(tmp1,"%s/%s/t1.html",dir,testDir);
				sprintf(tmp2,"%s/%s/t2.html",dir,testDir);
				// filter first
				char cmd[600];
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf1,tmp1);
				system(cmd);
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf2,tmp2);
				system(cmd);
				// make the system cmd to do the diff
				sprintf(cmd,
					"echo \"<pre>\" > %s ; "
					"diff -w --text %s %s "
					// ignore this table header row
					//" | grep -v \"R#4\""
					" >> %s",
					pdiff,
					tmp1,tmp2,pdiff);
				log("test: system(\"%s\")",cmd);
				system(cmd);
				// try again
				f.set(pdiff);
				fs = f.getFileSize();
			}
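
			// The "<!--ignore-->" filter above drops lines the
			// parser marked as volatile before diffing; e.g. a
			// (hypothetical) parser-output line like
			//   <tr><td>download time</td><td>312 ms</td></tr><!--ignore-->
			// would otherwise make every run diff against its
			// predecessor.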

			QUICKPOLL(MAX_NICENESS);

			// this means 0 . it just has the <pre> tag in it!
			if ( fs < 0 || fs == 6 ) fs = 0;
			// . if no diff and NOT current run, do not print it
			// . print it if the run right before the current 
			//   now always too
			if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) 
				continue;
			// relative filename
			char rel[200];
			sprintf(rel,"/%s/parse.%"UINT64".%"INT32".html.diff",
				testDir,h,ri);
			char full[200];
			sprintf(full,"/%s/parse.%"UINT64".%"INT32".html",
				testDir,h,ri);
			char validate[200];
			sprintf(validate,
				"/%s/parse-shortdisplay.%"UINT64".%"INT32".html",
				testDir,h,ri);
			// use pink font for a current run that has a diff!
			char *t1 = "";
			char *t2 = "";
			if ( ri == m_runId && fs != 0 ) {
				t1 = "<font color=pink><b>";
				t2 = "</b></font>";
				// a diff
				udiff[un] = 1;
			}

			// . get critical errors
			// . i.e. XmlDoc::validateOutput() could not validate
			//   a particular event or address that was in the
			//   url's "validated.uh64.txt" file since the admin
			//   clicked on the checkbox in the page parser output
			// . if we do not find such a tag in the parser output
			//   any more then Spider.cpp creates this file!
			if ( ri == m_runId ) {
				char cfile[256];
				sprintf(cfile,"%s/%s/critical.%"UINT64".%"INT32".txt",
					g_hostdb.m_dir,testDir,h,ri);
				SafeBuf ttt;
				ttt.fillFromFile(cfile);
				// the four int32_t's are misses, hits,
				// events, then unchecked
				umiss[un] = 0;
				uhits[un] = 0;
				uevents[un] = 0;
				uunchecked[un] = 0;
				if ( ttt.length() >= 3 )
					sscanf(ttt.getBufStart(),
					       "%"INT32" %"INT32" %"INT32" %"INT32"",
					       &umiss[un],
					       &uhits[un],
					       &uevents[un],
					       &uunchecked[un]);
				usort[un] = umiss[un] + uunchecked[un];
				//File cf;
				//cf.set(cfile);
				//if ( cf.doesExist()) ucrit[un] = 1;
				//else                 ucrit[un] = 0;
			}
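
			// e.g. a critical.<urlhash>.<run>.txt containing
			// "2 5 3 1" would mean 2 validateOutput() misses,
			// 5 hits, 3 events and 1 unchecked item for that
			// url in that run (illustrative numbers).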

			// more critical?
			if ( ri == m_runId && umiss[un] != 0 ) {
				t1 = "<font color=red><b>";
				t2 = "</b></font>";
			}

			// . these are good to have
			// . if you don't have 1+ critical hits then you
			//   probably need to be validated by the qa guy
			char *uhb1 = "";
			char *uhb2 = "";
			if ( ri == m_runId && uhits[un] != 0 ) {
				uhb1 = "<font color=green><b>**";
				uhb2 = "**</b></font>";
			}

			QUICKPOLL(MAX_NICENESS);

			char *e1 = "<td>";
			char *e2 = "</td>";
			int32_t ne = uevents[un];
			if ( ne ) {
				e1="<td bgcolor=orange><b><font color=brown>";
				e2="</font></b></td>";
			}
			char *u1 = "<td>";
			char *u2 = "</td>";
			if ( uunchecked[un] ) {
				u1="<td bgcolor=purple><b><font color=white>";
				u2="</font></b></td>";
			}

			// print the row!
			tmp.safePrintf("<tr>"
				      "<td>%s%"INT32"%s</td>"
				      "<td>%s%"INT32"%s</td>" // critical hits
				      "<td>%s%"INT32"%s</td>" // critical misses
				      "%s%"INT32"%s" // # events
				      "%s%"INT32"%s" // unchecked
				      "<td>%s%"INT32"%s</td>" // filesize of diff
				      // diff filename
				      "<td><a href=\"%s\">%s%s%s</a></td>"
				      // full parser output
				      "<td>"
				      "<a href=\"%s\">full</a> | "
				      "<a href=\"%s\">validate</a> "
				      "</td>"
				      "</tr>\n",
				      t1,ri,t2,
				      uhb1,uhits[un],uhb2,
				      t1,umiss[un],t2,
				      e1,ne,e2,
				      u1,uunchecked[un],u2,
				      t1,fs,t2,
				      rel,t1,rel,t2,
				      full,
				      validate);

			// only fill "sd" for the most recent guy
			if ( ri != m_runId ) continue;

			// now concatenate the parse-shortdisplay file
			// to this little table so qa admin can check/uncheck
			// validation checkboxes for addresses and events
			//sprintf(cfile,
			//	"%s/test/parse-shortdisplay.%"UINT64".%"INT32".html",
			//	g_hostdb.m_dir,h,ri);
			//sd.fillFromFile ( cfile );
		}
		// end table
		tmp.safePrintf("</table>\n");

		// . and a separate little section for the checkboxes
		// . should already be in tables, etc.
		// . each checkbox should provide its own uh64 when it
		//   calls senddiv() when clicked now
		//tmp.cat ( sd );

		tmp.safePrintf("<br>\n");
		tmp.safePrintf("<br>\n");
		// set this
		ulen[un] = tmp.length() - uptr[un] ;
		// sanity check
		if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; }
		// inc it
		un++;
		// increase the 5000!!
		if ( un >= 5000 ) { char *xx=NULL; *xx=0; }
	}
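
	// Sort the per-url chunks, worst first: descending by usort
	// (= critical misses + unchecked), ties broken by whether the
	// latest run produced a diff. A bubble sort is fine here since
	// un is capped at 5000 and this runs once per test.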

	char flag ;
 bubble:
	flag = 0;
	// sort the url tables
	for ( int32_t i = 0 ; i < un - 1 ; i++ ) {
		QUICKPOLL(MAX_NICENESS);
		if ( usort[i] >  usort[i+1] ) continue;
		if ( usort[i] == usort[i+1] ) 
			if ( udiff[i] >= udiff[i+1] ) continue;
		// swap em
		int32_t  tp = uptr[i];
		int32_t  td = udiff[i];
		int32_t  um = umiss[i];
		int32_t  us = usort[i];
		int32_t  uh = uhits[i];
		int32_t  tl = ulen [i];
		uptr[i] = uptr[i+1];
		umiss[i] = umiss[i+1];
		usort[i] = usort[i+1];
		uhits[i] = uhits[i+1];
		udiff[i] = udiff[i+1];
		ulen[i]  = ulen[i+1];
		uptr[i+1] = tp;
		umiss[i+1] = um;
		usort[i+1] = us;
		uhits[i+1] = uh;
		udiff[i+1] = td;
		ulen [i+1] = tl;
		flag = 1;
	}
	if ( flag ) goto bubble;

	// transfer into primary safe buf now
	for ( int32_t i = 0 ; i < un ; i++ ) 
		sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]);

	sb.safePrintf("</html>\n");

	char dfile[200];
	sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir);
	sb.dumpToFile ( dfile );

	// free the buffer of urls
	reset();

	// turn off spiders
	g_conf.m_spideringEnabled = 0;

	// all done
	return;
}


void injectedWrapper ( void *state ) {
	// wait for all msg4 buffers to flush
	//if ( ! flushMsg4Buffers ( state , injectedWrapper ) ) return;
	// this function is in Msge1.cpp. save ip file in test subdir
	//saveTestBuf();
	if ( ! g_test.injectLoop() ) return;
	//g_test.stopIt();
}
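
// Callback chain (sketch): injectLoop() fires a Msg4::addMetaList()
// per url; when an add blocks, injectLoop() returns false and
// injectedWrapper() re-enters it once the add completes, resuming at
// the next url. The final flushMsg4Buffers() call reuses the same
// wrapper, so the loop also finishes through this path.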

static int32_t s_count = 0;

// . returns true if all done!
// . returns false if still doing stuff
bool Test::injectLoop ( ) {

	int32_t  dlen   ;
	char *dom    ;
	int32_t  fakeIp ;

 loop:
	// advance to next url
	for ( ; m_urlPtr < m_urlEnd && ! *m_urlPtr ; m_urlPtr++ ) ;
	// all done?
	if ( m_urlPtr >= m_urlEnd ) {
		// flush em out
		if ( ! flushMsg4Buffers ( this , injectedWrapper ) ) 
			return false;
		// note it
		m_isAdding = false;
		// all done
		return true;
	}
	// error means all done
	if ( m_errno ) { m_isAdding = false; return true; }
	// point to it
	char *u = m_urlPtr;
	// advance to point to the next url for the next loop!
	for ( ; m_urlPtr < m_urlEnd && *m_urlPtr ; m_urlPtr++ ) ;

	// hash it
	int64_t h = hash64b ( u );
	// dedup it lest we freeze up and stopIt() never gets called because
	// m_urlsAdded is never decremented all the way to zero in Spider.cpp
	if ( m_dt.isInTable ( &h ) ) goto loop;
	// add it. return true with g_errno set on error
	if ( ! m_dt.addKey ( &h ) ) goto hadError;

	// make the SpiderRequest from it
	m_sreq.reset();
	// url
	strcpy ( m_sreq.m_url , u );
	// get domain of url
	dom = getDomFast ( m_sreq.m_url , &dlen );
	// make a fake ip
	fakeIp = 0x123456;
	// use domain if we got that
	if ( dom && dlen ) fakeIp = hash32 ( dom , dlen );
	// first ip is fake
	m_sreq.m_firstIp = fakeIp; // 0x123456;
	// these too
	m_sreq.m_domHash32  = fakeIp;
	m_sreq.m_hostHash32 = fakeIp;
	m_sreq.m_siteHash32 = fakeIp;
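
	// Hashing the domain into the fake first ip means every url on
	// the same domain shares one "ip", which presumably keeps
	// Spider.cpp's per-ip scheduling/throttling behaving sensibly
	// for test urls whose real ips we never look up here.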
	//m_sreq.m_probDocId = Titledb::getProbableDocId( m_sreq.m_url );
	// this crap is fake
	m_sreq.m_isInjecting = 1;
	// use test-spider subdir for storing pages and spider times?
	// MDW: this was replaced by m_isParentSiteMap bit.
	//if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
	// use this later
	// injected requests use this as the spider time i guess
	// so we can sort them by this
	m_sreq.m_addedTime = ++s_count;

	// no, because to compute XmlDoc::m_min/maxPubDate we need this to
	// be valid for our test run.. no no we will fix it to be
	// basically 2 days before spider time in the code...
	//m_sreq.m_addedTime = spiderTime;

	m_sreq.m_fakeFirstIp = 1;

	// make the key (parentDocId=0)
	m_sreq.setKey ( fakeIp, 0LL , false );
	// test it
	if ( g_spiderdb.getFirstIp(&m_sreq.m_key) != fakeIp ) {
		char *xx=NULL;*xx=0;}
	// sanity check. check for http(s)://
	if ( m_sreq.m_url[0] != 'h' ) { char *xx=NULL;*xx=0; }

	// reset this
	g_errno = 0;

	// count it
	m_urlsAdded++;

	// note it
	//log("crazyout: %s",m_sreq.m_url );
	logf(LOG_DEBUG,"spider: injecting test url %s",m_sreq.m_url);

	// the receiving end will realize that we are injecting into the test
	// collection and use the "/test/" subdir to load the file
	// "ips.txt" to do our ip lookups, and search for any downloads in
	// that subdirectory as well.
	if ( ! m_msg4.addMetaList ( (char *)&m_sreq     ,
				    m_sreq.getRecSize() ,
				    m_coll              ,
				    NULL                ,
				    injectedWrapper     ,
				    RDB_SPIDERDB        ) )
		// return false if blocked
		return false;
	// error?
	if ( g_errno ) {
		// jump down here from above on error
	hadError:
		// save it
		m_errno = g_errno;
		// flag it
		m_isAdding = false;
		// note it
		log("test: inject had error: %s",mstrerror(g_errno));
		// stop, we are all done!
		return true;
	}
	// add the next spider request
	goto loop;
}