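// Scraper.cpp
// . only host #0 scrapes. on a sleep callback it grabs a random wikipedia
//   title phrase, builds a google web/news/blog search url for it and
//   indexes that results page with XmlDoc, writing only to spiderdb so the
//   outlinks get added as new urls/sites to crawl.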
#include "gb-include.h"
|
|
|
|
#include "Scraper.h"
|
|
//#include "CollectionRec.h"
|
|
#include "HttpMime.h"
|
|
#include "Xml.h"
|
|
//#include "Links.h"
|
|
#include "HashTableT.h"
|
|
#include "Wiki.h"
|
|
#include "HttpServer.h"
|
|
#include "Speller.h"
|
|
#include "Repair.h"
|
|
|
|

static void scraperSleepWrapper ( int fd , void *state ) ;
static void gotPhraseWrapper    ( void *state ) ;
//static void gotPagesWrapper ( void *state , TcpSocket *s ) ;
//static void addedScrapedSitesWrapper ( void *state ) ;
//static void gotUrlInfoWrapper ( void *state ) ;
//static void addedUrlsWrapper ( void *state ) ;
static void indexedDocWrapper   ( void *state );

Scraper g_scraper;

Scraper::Scraper ( ) {
	m_registered = false;
}

Scraper::~Scraper ( ) {
	// unregister it
	//if ( m_registered )
	//	g_loop.unregisterSleepCallback (NULL,scraperSleepWrapper);
}

// returns false and sets g_errno on error
bool Scraper::init ( ) {
	// . set the sleep callback
	// . this is in milliseconds. 1000 ms = 1 second. (1000*60*10 would
	//   restore the original once-per-10-minutes interval.)
	int32_t wait = 1000; // 1000*60*10;
	if ( ! g_loop.registerSleepCallback(wait,NULL,scraperSleepWrapper))
		return false;
	m_registered  = true;
	m_numReceived = 0;
	m_numSent     = 0;
	//m_bufEnd = m_buf + 50000;
	return true;
}
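
// a minimal sketch of how this is presumably hooked in at startup (the
// exact call site, e.g. in main.cpp, is an assumption, not shown here):
//
//	if ( ! g_scraper.init() )
//		log("scraper: init failed: %s",mstrerror(g_errno));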

void scraperSleepWrapper ( int fd , void *state ) {
	g_scraper.wakeUp ( );
}

void Scraper::wakeUp ( ) {
	// this is weird, a scrape must still be outstanding
	if ( m_numReceived != 0 || m_numSent != 0 ) {
		log("scraper: seems like a scrape is outstanding.");
		return;
	}
	// only host #0 scrapes
	if ( g_hostdb.m_myHost->m_hostId != 0 ) return;
	// no writing/adding if we are the tmp cluster
	if ( g_hostdb.m_useTmpCluster ) return;
	// scraping is off when repairing obviously
	if ( g_repairMode ) return;
	// . we are only allowed one collection scraping at a time
	// . find the first collection, if any
	CollectionRec *cr = NULL;
	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		cr = g_collectiondb.m_recs[i];
		if ( cr ) break;
	}
	// it was deleted, just return
	if ( ! cr ) return;

	// bail if scraping not enabled
	if ( ! cr->m_scrapingEnabledWeb   &&
	     ! cr->m_scrapingEnabledNews  &&
	     ! cr->m_scrapingEnabledBlogs &&
	     ! cr->m_scrapingEnabledProCog )
		return;
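
	// note: the sleep callback is unregistered below once we commit to
	// a scrape, and nothing in this file re-registers it afterward.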

	// unregister it
	g_loop.unregisterSleepCallback (NULL,scraperSleepWrapper);
	m_registered = false;

	// get its coll
	strcpy ( m_coll , cr->m_coll );

	// try procog query log scraping
	//if ( cr->m_scrapingEnabledProCog )
	//	scrapeProCog();

	// bail now if only scraping procog
	if ( ! cr->m_scrapingEnabledWeb  &&
	     ! cr->m_scrapingEnabledNews &&
	     ! cr->m_scrapingEnabledBlogs )
		return;

	log(LOG_INFO,"scraper: getting random phrase to scrape.");

	// . get a random phrase from the wikipedia titles
	// . returns false if it blocked; gotPhraseWrapper is called later
	if ( ! g_wiki.getRandomPhrase ( NULL , gotPhraseWrapper ) ) return;
	// it did not block, so call it ourselves
	gotPhrase();
}

void gotPhraseWrapper ( void *state ) {
	g_scraper.gotPhrase();
}

void Scraper::gotPhrase ( ) {
	// error getting the random phrase? just log it and press on with
	// whatever is in the phrase buffer
	if ( g_errno ) log("scraper: got error getting random phrase: %s",
			   mstrerror(g_errno));

	CollectionRec *cr = g_collectiondb.getRec ( m_coll );
	// collection got deleted on us? bail
	if ( ! cr ) return;

 loop:
	// what type of query should we do?
	m_qtype = rand() % 3;

	// . make sure web, news, blog is enabled
	// . wakeUp() guaranteed at least one of these, so this terminates
	if ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb   ) goto loop;
	if ( m_qtype == 1 && ! cr->m_scrapingEnabledNews  ) goto loop;
	if ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) goto loop;

	// scraping is off when repairing obviously
	if ( g_repairMode ) return;

	// get it
	char *s = g_wiki.m_randPhrase;
	// convert _'s to spaces
	for ( char *p = s ; *p ; p++ )
		if ( *p == '_' ) *p = ' ';
	// . url encode the random phrase
	// . truncate it to 200 bytes to keep things sane
	// . Wiki::doneReadingWiki() keeps it below 128 i think anyway
	char qe[400];
	urlEncode(qe, 200, s , gbstrlen(s) );
	char *end = qe + 390;

	// half the time append a random word from the dictionary so that we
	// discover those tail-end sites better
	if ( m_qtype == 0 && (rand() % 2) ) {
		// point into it for appending
		char *p = qe + gbstrlen(qe);
		// add a space, url encoded
		*p++ = '+';
		// append a random word to it from the dictionary
		char *rw = g_speller.getRandomWord();
		// append that in
		urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
	}

	// make a query to scrape
	char buf[2048];

	char *uf ;
	if ( m_qtype == 0 )
		uf="http://www.google.com/search?num=50&q=%s&scoring=d"
			"&filter=0";
	// google news query? sort by date.
	else if ( m_qtype == 1 )
		uf="http://news.google.com/news?num=50&q=%s&sort=n"
			"&filter=0";
	// google blog query?
	else if ( m_qtype == 2 )
		uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
			"&filter=0";
	// sanity check
	else { char *xx=NULL;*xx=0; }

	// make the url we will download
	sprintf ( buf , uf , qe );
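
	// e.g. with phrase "Throttle position sensor" and m_qtype 0 the
	// final url looks something like (illustrative only; the exact
	// escaping depends on urlEncode()):
	//   http://www.google.com/search?num=50&q=Throttle+position+sensor&scoring=d&filter=0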

	SpiderRequest sreq;
	// . set the SpiderRequest
	// . use the expanded url in "buf", not the format string "uf"
	strcpy(sreq.m_url, buf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	// fake the first ip from a hash of the url (0 and -1 are reserved)
	int32_t firstIp = hash32n(buf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDel = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard robots.txt for the scrape
	m_xd.m_useRobotsTxt = false;

	// call this when indexing completes
	m_xd.setCallback ( NULL , indexedDocWrapper );

	// assume it blocked
	m_numSent++;

	// scraper is special. write only to spiderdb, skip the other rdbs.
	m_xd.m_usePosdb     = false;
	//m_xd.m_useDatedb  = false;
	m_xd.m_useClusterdb = false;
	m_xd.m_useLinkdb    = false;
	m_xd.m_useSpiderdb  = true; // only this one i guess
	m_xd.m_useTitledb   = false;
	m_xd.m_useTagdb     = false;
	m_xd.m_usePlacedb   = false;
	//m_xd.m_useTimedb    = false;
	//m_xd.m_useSectiondb = false;
	//m_xd.m_useRevdb     = false;

	// . return false if this blocks
	// . will add the spider recs to spiderdb of the outlinks
	// . will add "ingoogle", etc. tags for each outlink
	if ( ! m_xd.indexDoc ( ) ) return ;

	// we didn't block
	indexedDoc ( );
}

void indexedDocWrapper ( void *state ) {
	// indexDoc() blocked and has now completed
	g_scraper.indexedDoc ( );
}
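
// called once per completed scrape; when all outstanding docs (normally
// just one) are in, reset the counters for the next round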
bool Scraper::indexedDoc ( ) {
	// got one completed now
	m_numReceived++;
	// if not done, leave
	if ( m_numReceived < m_numSent ) return false;
	// ok, all done, reset
	m_numSent     = 0;
	m_numReceived = 0;
	return true;
}
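
// . the block below is the disabled ProCog query-log scraping path that
//   wakeUp() references above; kept for reference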
/*
void gotDocIdsWrapper ( void *state ) {
	Msg40 *THIS = (Msg40 *) state;
	// this should not block, just cache the score/docid from the search
	THIS->gotDocIds();
	// keep scraping!
	g_scraper.scrapeProCog();
}

// . uses parameters assigned to local member vars above
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Scraper::scrapeProCog ( ) {

	// done?
	if ( m_nextQuery >= m_queryBufEnd ) return true;

	// get the next query to scrape
	m_currentQuery = m_nextQuery;

	// advance (queries are stored back to back, NUL separated)
	m_nextQuery += gbstrlen(m_currentQuery) + 1;

	// do a local msg39
	if ( ! m_msg39 ) {
		try { m_msg39 = new ( Msg39 ); }
		catch ( ... ) {
			g_errno = ENOMEM;
			return false;
		}
		mnew ( m_msg39 , sizeof(Msg39),"scrm39");
	}

	// need to make the request
	m_r.reset();
	m_r.ptr_coll  = m_coll;
	m_r.size_coll = gbstrlen(m_coll)+1;
	m_r.m_maxAge     = 0; // no caching!
	m_r.m_addToCache = false;
	m_r.m_docsToGet  = 2; // m_docsToGet;
	m_r.m_niceness   = MAX_NICENESS;
	m_r.m_isDebug    = false;
	m_r.m_getDocIdScoringInfo = false;
	m_r.m_doSiteClustering    = true;
	m_r.m_useMinAlgo          = false;
	m_r.m_useNewAlgo          = true; // for speed
	m_r.m_doMaxScoreAlgo      = true; // filter #1 of the new algo
	m_r.m_fastIntersection    = -1;
	m_r.m_doIpClustering      = true;
	m_r.m_doDupContentRemoval = true;
	m_r.m_restrictIndexdbForQuery = false;
	m_r.m_queryExpansion = true;
	m_r.m_boolFlag       = 2;
	m_r.m_familyFilter   = false;
	// TODO: make language that of the query! not always english
	m_r.m_language = langEnglish;
	m_r.ptr_query  = m_currentQuery;
	m_r.size_query = gbstrlen(m_currentQuery)+1;
	m_r.m_timeout  = 9999; // wait a long time, in secs

	// callback hack
	m_msg39->m_callback    = gotDocIdsWrapper;
	m_msg39->m_state       = NULL;
	m_msg39->m_queryHash32 = hash32n ( m_currentQuery );

	// . get the docIds
	// . this sets m_msg3a.m_clusterLevels[] for us
	// . TODO: consider limiting to first 20% of docids, i.e. so we'd
	//   have to read 5x less from disk!!
	if ( ! m_msg39->getDocIds2 ( &m_r ) )
		return false;

	// call again w/o parameters now
	return gotDocIds ( );
}

// . return false if blocked, true otherwise
// . sets g_errno on error
bool Scraper::gotDocIds ( ) {
	// cache the top result
	//int64_t docId = m_msg39->m_topDocId;
	//float   score = m_msg39->m_topScore;
	// store in local cache
	int32_t key32 = m_msg39->m_queryHash32;
	// make the record
	char rec[128];
	char *p = rec;
	// the flag to indicate we're doing a single node query
	*p = QLRC_LOCALQUERY;
	p++;
	*(float *)p = m_msg39->m_topScore;
	p += sizeof(float);
	*(float *)p = m_msg39->m_topScore2;
	p += sizeof(float);
	// how big is that?
	int32_t recSize = p - rec;
	// cache it
	g_queryLogResultCache.addRec ( &key32 , rec , recSize );
	// we did not block
	return true;
}
*/