mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-13 02:36:06 -04:00
Remove commented out DateParse & code from main.cpp
This commit is contained in:
@ -66,7 +66,6 @@ class Address *g_address; // for debug
|
||||
#include "gb-include.h"
|
||||
#include "Address.h"
|
||||
#include "Sections.h"
|
||||
//#include "DateParse2.h"
|
||||
#include "Abbreviations.h"
|
||||
#include "Phrases.h"
|
||||
//#include "Weights.h"
|
||||
|
12
CatRec.cpp
12
CatRec.cpp
@ -3,7 +3,6 @@
|
||||
#include "CatRec.h"
|
||||
//#include "SiteBonus.h"
|
||||
#include "Lang.h"
|
||||
//#include "DateParse.h"
|
||||
|
||||
//static int32_t getY(Xml *xml, int32_t n0,int32_t n1,int32_t X,
|
||||
// char *strx,char *stry,int32_t def);
|
||||
@ -922,15 +921,4 @@ char* CatRec::getAdultStr() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
char *CatRec::getPubDateFmtStr() {
|
||||
int32_t fmt = getScoreForType(SiteType::DATE_FORMAT);
|
||||
switch (fmt) {
|
||||
case DateParse::DATE_FMT_AMER:
|
||||
return "American";
|
||||
case DateParse::DATE_FMT_EURO:
|
||||
return "European";
|
||||
}
|
||||
return "Unknown/Ambiguous";
|
||||
}
|
||||
*/
|
||||
|
2
CatRec.h
2
CatRec.h
@ -123,8 +123,6 @@ class CatRec {
|
||||
bool isKidSafe() { return m_adultLevel == RATED_G; }
|
||||
char* getAdultStr();
|
||||
|
||||
char *getPubDateFmtStr();
|
||||
|
||||
int32_t getTimeStamp() { return m_timeStamp; }
|
||||
char *getComment() { return m_comment; }
|
||||
char *getUsername() { return m_username; }
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include "Dns.h"
|
||||
//#include "Thesaurus.h" // SynonymLinkGroup structure
|
||||
#include "PostQueryRerank.h" // for ComTopInDmozRec
|
||||
//#include "DateParse.h" // TimeZoneInfo structure
|
||||
#include "types.h"
|
||||
|
||||
template<class Key_t, class Val_t>
|
||||
|
@ -16,7 +16,6 @@
|
||||
#include "Users.h"
|
||||
#include "Tagdb.h"
|
||||
#include "Spider.h"
|
||||
//#include "DateParse2.h"
|
||||
|
||||
// TODO: meta redirect tag to host if hostId not ours
|
||||
static bool gotTitleRec ( void *state );
|
||||
|
183
Pops.cpp
183
Pops.cpp
@ -14,189 +14,6 @@ Pops::~Pops() {
|
||||
mfree ( m_pops , m_popsSize , "Pops" );
|
||||
}
|
||||
|
||||
/*
|
||||
// should be one for each host in the network
|
||||
bool Pops::readPopFiles ( ) {
|
||||
int32_t n = g_hostdb.getNumGroups();
|
||||
for ( int32_t i = 0 ; i < n ; i++ ) {
|
||||
// note it
|
||||
log(LOG_INIT,"db: Reading %s/pops.%"INT32" of %"INT32".",
|
||||
g_conf.m_dir,i,n);
|
||||
//
|
||||
}
|
||||
}
|
||||
|
||||
bool Pops::makeFinalPopFile ( char *coll ) {
|
||||
|
||||
int32_t n = g_hostdb.getNumGroups();
|
||||
|
||||
// tell each host to write his pop file to html directory
|
||||
Msg3e msg3e;
|
||||
for ( int32_t i = 0 ; i < n ; i++ )
|
||||
msg3e.sendRequest ( );
|
||||
|
||||
// no more than 4096 groups supported for this now, but up later maybe
|
||||
char *buf [ 4096 ];
|
||||
|
||||
// retrieve it from each host (msg3f = getFile)
|
||||
for ( int32_t i = 0 ; i < n ; i++ ) {
|
||||
// get over http
|
||||
g_httpServer.getDoc ( ... );
|
||||
// save to disk
|
||||
out[i].write ( content , contentLen );
|
||||
}
|
||||
|
||||
// merge out file
|
||||
BigFile out;
|
||||
|
||||
// then merge all of them out
|
||||
for ( int32_t i = 0 ; i < n ; i++ ) {
|
||||
|
||||
}
|
||||
|
||||
// merge final
|
||||
|
||||
// distribute final copy to all
|
||||
|
||||
// clean up locals
|
||||
}
|
||||
|
||||
// . make the pop file from indexdb
|
||||
// . a bunch of wordhash/#docs pairs
|
||||
// . word hash is lower 4 bytes of the termid
|
||||
// . first int64_t in file is the # of docs
|
||||
bool Pops::makeLocalPopFile ( char *coll ) {
|
||||
// get the rdbmap of the first indexdb file
|
||||
RdbBase *base = g_indexdb.getBase ( coll );
|
||||
//RdbMap *map = base->getMap(0);
|
||||
if ( ! base )
|
||||
return log("admin: Collection \"%s\" does not exist.",coll);
|
||||
BigFile *f = base->getFile(0);
|
||||
// term must be in at least this many docs
|
||||
int32_t minDocs = 4000;
|
||||
// log it
|
||||
log(LOG_INFO,"admin: Making popularity file from %s for coll \"%s\".",
|
||||
f->getFilename(),coll);
|
||||
log(LOG_INFO,"admin: Using cutoff of %"INT32" docs.",minDocs);
|
||||
|
||||
// output the wordId/count pairs to this file
|
||||
BigFile out;
|
||||
char outFilename[256];
|
||||
sprintf(outFilename,"%s/popout.%"INT32"",g_conf.m_dir,g_hostdb.m_hostId);
|
||||
out.set ( outFilename );
|
||||
|
||||
// store # of docs
|
||||
int64_t n = g_titledb.getGlobalNumDocs();
|
||||
out.write ( &n , 8 );
|
||||
|
||||
// store key read from disk into here
|
||||
char tmp [ MAX_KEY_BYTES ];
|
||||
|
||||
//
|
||||
//
|
||||
// this part is taken from main.cpp:dumpIndexdb()
|
||||
//
|
||||
//
|
||||
char buf [ 1000000 ];
|
||||
int32_t bufSize = 1000000;
|
||||
if ( ! f.open ( O_RDONLY ) ) return;
|
||||
// init our vars
|
||||
bool haveTop = false;
|
||||
char top[6];
|
||||
memset ( top , 0 , 6 );
|
||||
bool warned = false;
|
||||
// how big is this guy?
|
||||
int64_t filesize = f.getFileSize();
|
||||
// reset error number
|
||||
g_errno = 0;
|
||||
// the big read loop
|
||||
loop:
|
||||
int64_t readSize = bufSize;
|
||||
if ( off + readSize > filesize ) readSize = filesize - off;
|
||||
// return if we're done reading the whole file
|
||||
if ( readSize <= 0 ) return;
|
||||
// read in as much as we can
|
||||
f.read ( buf , readSize , off );
|
||||
// bail on read error
|
||||
if ( g_errno ) {
|
||||
log("admin: Read of %s failed.",f.getFilename());
|
||||
return;
|
||||
}
|
||||
char *p = buf;
|
||||
char *pend = buf + readSize;
|
||||
inner:
|
||||
// parse out the keys
|
||||
int32_t size;
|
||||
if ( ((*p) & 0x02) == 0x00 ) size = ks;
|
||||
else size = ks-6;
|
||||
if ( p + size > pend ) {
|
||||
// skip what we read
|
||||
off += readSize ;
|
||||
// back up so we don't split a key we should not
|
||||
off -= ( pend - p );
|
||||
// read more
|
||||
goto loop;
|
||||
}
|
||||
// new top?
|
||||
if ( size == ks ) { gbmemcpy ( top , p + (ks-6) , 6 ); haveTop = true; }
|
||||
// warning msg
|
||||
if ( ! haveTop && ! warned ) {
|
||||
warned = true;
|
||||
log("admin: Warning: first key is a half key.");
|
||||
}
|
||||
|
||||
//
|
||||
// BUT i added this part to the main.cpp stuff
|
||||
//
|
||||
|
||||
// was it the same as last key?
|
||||
if ( ks == 6 )
|
||||
count++;
|
||||
// ok, this starts a new key
|
||||
else {
|
||||
// did the previous key meet the min count requirement?
|
||||
if ( count >= minDocs ) {
|
||||
// if so, store the upper 4 bytes of the termid
|
||||
int32_t h;
|
||||
gbmemcpy ( &h , tmp+8 , 4 );
|
||||
// write it out
|
||||
out.write ( &h , 4 );
|
||||
// and the count
|
||||
out.write ( &count , 4 );
|
||||
}
|
||||
// reset, we got a new termid
|
||||
count = 1;
|
||||
}
|
||||
|
||||
//
|
||||
// end new stuff
|
||||
//
|
||||
|
||||
|
||||
// make the key
|
||||
gbmemcpy ( tmp , p , ks-6 );
|
||||
gbmemcpy ( tmp + ks-6 , top , 6 );
|
||||
// print the key
|
||||
//if ( ks == 12 )
|
||||
// fprintf(stdout,"%08lli) %08"XINT32" %016"XINT64"\n",
|
||||
// off + (p - buf) ,
|
||||
// *(int32_t *)(tmp+8),*(int64_t *)tmp );
|
||||
//else
|
||||
// fprintf(stdout,"%08lli) %016"XINT64" %016"XINT64"\n",
|
||||
// off + (p - buf) ,
|
||||
// *(int64_t *)(tmp+8),*(int64_t *)tmp );
|
||||
|
||||
// go to next key
|
||||
p += size;
|
||||
// loop up
|
||||
goto inner;
|
||||
|
||||
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
bool Pops::set ( Words *words , int32_t a , int32_t b ) {
|
||||
int32_t nw = words->getNumWords();
|
||||
int64_t *wids = words->getWordIds ();
|
||||
|
@ -1992,8 +1992,6 @@ bool Process::saveBlockingFiles2 ( ) {
|
||||
// this one too
|
||||
// g_classifier.save();
|
||||
//g_siteBonus.save();
|
||||
// save state for top docs
|
||||
//g_pageTopDocs.saveStateToDisk();
|
||||
|
||||
// save the turk url cache, urls and user states
|
||||
//g_pageTurk.saveCache();
|
||||
@ -2034,8 +2032,6 @@ void Process::resetAll ( ) {
|
||||
g_profiler .reset();
|
||||
g_autoBan .reset();
|
||||
//g_qtable .reset();
|
||||
//g_pageTopDocs .destruct();
|
||||
//g_pageNetTest .destructor();
|
||||
|
||||
for ( int32_t i = 0; i < MAX_GENERIC_CACHES; i++ )
|
||||
g_genericCache[i].reset();
|
||||
|
1
Xml.cpp
1
Xml.cpp
@ -6,7 +6,6 @@
|
||||
#include "Unicode.h" // for html entities that return unicode
|
||||
#include "Titledb.h"
|
||||
#include "Words.h"
|
||||
//#include "DateParse2.h"
|
||||
|
||||
Xml::Xml () {
|
||||
m_xml = NULL;
|
||||
|
3
XmlDoc.h
3
XmlDoc.h
@ -541,7 +541,6 @@ class XmlDoc {
|
||||
|
||||
int64_t **getWikiDocIds ( ) ;
|
||||
void gotWikiResults ( class UdpSlot *slot );
|
||||
//class DateParse2 *getDateParse2 ( ) ;
|
||||
//class HashTableX *getClockCandidatesTable();
|
||||
int32_t getOutlinkAge ( int32_t outlinkNum ) ;
|
||||
char *getIsPermalink ( ) ;
|
||||
@ -1448,7 +1447,6 @@ class XmlDoc {
|
||||
bool m_isWWWDupValid;
|
||||
bool m_linkInfo1Valid;
|
||||
bool m_linkSiteHashesValid;
|
||||
//bool m_dateParse2Valid;
|
||||
bool m_sectionsReplyValid;
|
||||
bool m_sectionsVotesValid;
|
||||
bool m_sectiondbDataValid;
|
||||
@ -1542,7 +1540,6 @@ class XmlDoc {
|
||||
//Query m_wq;
|
||||
//SearchInput m_si;
|
||||
//Msg40 m_msg40;
|
||||
//DateParse2 m_dateParse2;
|
||||
bool m_printedMenu;
|
||||
//HashTableX m_clockCandidatesTable;
|
||||
//SafeBuf m_cctbuf;
|
||||
|
102
main.cpp
102
main.cpp
@ -93,8 +93,6 @@ bool registerMsgHandlers3 ( ) ;
|
||||
|
||||
void allExitWrapper ( int fd , void *state ) ;
|
||||
|
||||
//bool QuerySerializeTest( char *ff ); // Query.cpp
|
||||
|
||||
void rmTest();
|
||||
|
||||
int g_inMemcpy=0;
|
||||
@ -202,8 +200,6 @@ void countdomains( char* coll, int32_t numRecs, int32_t verb, int32_t output );
|
||||
|
||||
UdpProtocol g_dp; // Default Proto
|
||||
|
||||
//void zlibtest ( );
|
||||
|
||||
// installFlag konstants
|
||||
typedef enum {
|
||||
ifk_install = 1,
|
||||
@ -650,14 +646,6 @@ int main2 ( int argc , char *argv[] ) {
|
||||
"<file>. Used to only check performance of "
|
||||
"getPhrasePopularity.\n\n"
|
||||
|
||||
//"stemmertest <file>\n"
|
||||
//"\truns the stemmer on words in <file>.\n\n"
|
||||
|
||||
//"queryserializetest <file>\n"
|
||||
//"\tserializes every query in <file> and tracks "
|
||||
//"statistics, as well as \t\nverifying consistency; "
|
||||
//"takes raw strings or URLs as input\n\n"
|
||||
|
||||
// less common things
|
||||
"gendict <coll> [numWordsToDump]\n\tgenerate "
|
||||
"dictionary used for spellchecker "
|
||||
@ -3035,24 +3023,10 @@ int main2 ( int argc , char *argv[] ) {
|
||||
//return 1;
|
||||
}
|
||||
|
||||
//if( !g_pageTopDocs.init() ) {
|
||||
// log( "init: PageTopDocs init failed." );
|
||||
// return 1;
|
||||
//}
|
||||
|
||||
//if( !g_pageNetTest.init() ) {
|
||||
// log( "init: PageNetTest init failed." );
|
||||
// return 1;
|
||||
//}
|
||||
|
||||
//if(!Msg6a::init()) {
|
||||
// log( "init: Quality Agent init failed." );
|
||||
//}
|
||||
|
||||
//if ( ! DateParse::init() ) {
|
||||
// log("db: DateParse init failed." ); return 1;
|
||||
//}
|
||||
|
||||
//countdomains was HERE, moved up to access more mem.
|
||||
|
||||
// load up the dmoz categories here
|
||||
@ -3090,12 +3064,6 @@ int main2 ( int argc , char *argv[] ) {
|
||||
// //return 1;
|
||||
//}
|
||||
|
||||
// deprecated in favor of Msg13-based throttling
|
||||
//if ( !g_msg6.init() ) {
|
||||
// log ( "init: msg6 init failed." );
|
||||
// return 1;
|
||||
//}
|
||||
|
||||
// if(!g_profiler.init()) {
|
||||
// log("profiler: init failed.");
|
||||
// }
|
||||
@ -3211,26 +3179,31 @@ int main2 ( int argc , char *argv[] ) {
|
||||
// 20 , // pollTime in ms
|
||||
// 1000 )){ // max udp slots
|
||||
// log("db: UdpServer2 init failed." ); return 1; }
|
||||
|
||||
// start pinging right away
|
||||
if ( ! g_pingServer.init() ) {
|
||||
log("db: PingServer init failed." ); return 1; }
|
||||
|
||||
// start up repair loop
|
||||
if ( ! g_repair.init() ) {
|
||||
log("db: Repair init failed." ); return 1; }
|
||||
|
||||
// start up daily merge loop
|
||||
if ( ! g_dailyMerge.init() ) {
|
||||
log("db: Daily merge init failed." ); return 1; }
|
||||
|
||||
// . then dns Distributed client
|
||||
// . server should listen to a socket and register with g_loop
|
||||
// . Only the distributed cache shall call the dns server.
|
||||
if ( ! g_dns.init( h9->m_dnsClientPort ) ) {
|
||||
log("db: Dns distributed client init failed." ); return 1; }
|
||||
|
||||
// . then dns Local client
|
||||
//if ( ! g_dnsLocal.init( 0 , false ) ) {
|
||||
// log("db: Dns local client init failed." ); return 1; }
|
||||
|
||||
// . then webserver
|
||||
// . server should listen to a socket and register with g_loop
|
||||
// again:
|
||||
if ( ! g_httpServer.init( h9->m_httpPort, h9->m_httpsPort ) ) {
|
||||
log("db: HttpServer init failed. Another gb already "
|
||||
"running?" );
|
||||
@ -3254,8 +3227,7 @@ int main2 ( int argc , char *argv[] ) {
|
||||
if ( ! registerMsgHandlers() ) {
|
||||
log("db: registerMsgHandlers failed" ); return 1; }
|
||||
|
||||
// for Events.cpp event extraction we need to parse out "places" from
|
||||
// each doc
|
||||
// for Events.cpp event extraction we need to parse out "places" from each doc
|
||||
//if ( ! initPlaceDescTable ( ) ) {
|
||||
// log("events: places table init failed"); return 1; }
|
||||
|
||||
@ -3292,35 +3264,6 @@ int main2 ( int argc , char *argv[] ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// gb stemmertest
|
||||
//if ( strcmp ( cmd , "stemmertest" ) == 0 ) {
|
||||
// if ( argc != cmdarg + 2 ) goto printHelp;
|
||||
// g_stemmer.test ( argv[cmdarg + 1] );
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
// gb queryserializetest
|
||||
/*
|
||||
if ( strcmp ( cmd , "queryserializetest" ) == 0 ) {
|
||||
if ( argc != cmdarg + 2 ) goto printHelp;
|
||||
int64_t starttime = gettimeofdayInMilliseconds();
|
||||
QuerySerializeTest( argv[cmdarg + 1] );
|
||||
log(LOG_INFO, "query: took %"INT64"msecs for query serialize" \
|
||||
"test on %s", gettimeofdayInMilliseconds() - starttime,
|
||||
argv[cmdarg + 1]);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
#ifdef _LIMIT10_
|
||||
// how many pages have we indexed so far?
|
||||
//int64_t numPages = g_titledb.getRdb()->getNumGlobalRecs();
|
||||
int64_t numPages = g_clusterdb.getRdb()->getNumGlobalRecs();
|
||||
if ( numPages > 10123466 )
|
||||
log("WARNING: Over 10 million documents are in the index. "
|
||||
"You have exceeded the terms of your license. "
|
||||
"Please contact mwells@gigablast.com for a new license.");
|
||||
#endif
|
||||
// bdflush needs to be turned off because we need to control the
|
||||
// writes directly. we do this by killing the write thread.
|
||||
// we kill it when we need to do important reads, otherwise, if
|
||||
@ -3338,13 +3281,6 @@ int main2 ( int argc , char *argv[] ) {
|
||||
//log("REMINDER: remove mem leack checking");
|
||||
//log("REMINDER: put thread back in Msg39");
|
||||
|
||||
// . now check with gigablast.com (216.243.113.1) to see if we
|
||||
// are licensed, for now, just get the doc
|
||||
// . TODO: implement this (GET /license.html \r\n
|
||||
// Host: www.gigablast.com\r\n\r)
|
||||
|
||||
// do the zlib test
|
||||
//zlibtest();
|
||||
// . now m_minToMerge might have changed so try to do a merge
|
||||
// . only does one merge at a time
|
||||
// . other rdb's will sleep and retry until it's their turn
|
||||
@ -3352,6 +3288,7 @@ int main2 ( int argc , char *argv[] ) {
|
||||
//g_loop.registerSleepCallback ( 1000 ,
|
||||
// NULL ,
|
||||
// tryMergingWrapper );
|
||||
|
||||
// . register a callback to try to merge everything every 2 seconds
|
||||
// . do not exit if we couldn't do this, not a huge deal
|
||||
// . put this in here instead of Rdb.cpp because we don't want
|
||||
@ -3372,29 +3309,6 @@ int main2 ( int argc , char *argv[] ) {
|
||||
if ( ! g_loop.registerSleepCallback(1000,NULL,tryToSyncWrapper,0))
|
||||
return false;
|
||||
|
||||
//if( !g_loop.registerSleepCallback(2000,(void *)1,controlDumpTopDocs) )
|
||||
// log("db: Failed to init dump TopDocs sleep callback.");
|
||||
|
||||
// MTS: removing nettest, this breaks NetGear switches when all links
|
||||
// are transmitting full bore and full duplex.
|
||||
//if( !g_loop.registerSleepCallback(2000,(void *)1,controlNetTest) )
|
||||
// log("db: Failed to init network test sleep callback.");
|
||||
|
||||
//if( !g_loop.registerSleepCallback(60000,(void *)1,takeSnapshotWrapper))
|
||||
// log("db: Failed to init Statsdb snapshot sleep callback.");
|
||||
|
||||
// check to make sure we have the latest parms
|
||||
//Msg3e msg3e;
|
||||
//msg3e.checkForNewParms();
|
||||
|
||||
// this stuff is similar to alden's msg3e but will sync collections
|
||||
// that were added/deleted
|
||||
//if ( ! g_parms.syncParmsWithHost0() ) {
|
||||
// log("parms: error syncing parms: %s",mstrerror(g_errno));
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
|
||||
if(g_recoveryMode) {
|
||||
//now that everything is init-ed send the message.
|
||||
char buf[256];
|
||||
|
Reference in New Issue
Block a user