fixed 'gb inject titledb-DIR newhosts.conf' command

for populating an index from the titledb files in DIR
and transmitting them to the appropriate host in newhosts.conf.
Also prettied up the 'gb -h' output by running it through a
formatting function.
This commit is contained in:
Matt Wells
2014-01-02 01:20:08 -07:00
parent 935a4faccf
commit 7df2111ceb
5 changed files with 218 additions and 85 deletions

@ -963,6 +963,15 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we , bool purgeSeeds) {
return true;
}
// . a hack function
// . maps the 64-bit hash of the collection name "coll" to "collnum"
//   in g_collTable, the hashtable that maps name to collnum
// . returns whatever g_collTable.addKey() returns
// . NOTE(review): g_collTable.set() is invoked on every call and its
//   return value is ignored; confirm whether set() discards entries
//   added by earlier calls and whether it can fail
bool addCollToTable ( char *coll , collnum_t collnum ) {
	// the table is keyed by the hash of the name, not the name itself
	long long nameHash = hash64n ( coll );
	// (re)configure the table before inserting: 8 matches the size of
	// the 64-bit hash key and sizeof(collnum_t) the stored value.
	// presumably 256 is the initial slot count -- verify against the
	// table's set() declaration
	g_collTable.set ( 8 ,
			  sizeof(collnum_t) ,
			  256 ,
			  NULL ,
			  0 ,
			  false ,
			  0 ,
			  "nhshtbl" );
	// re-add it to the hashtable that maps name to collnum too
	bool added = g_collTable.addKey ( &nameHash , &collnum );
	return added;
}
// get coll rec specified in the HTTP request
CollectionRec *Collectiondb::getRec ( HttpRequest *r ) {

@ -13,6 +13,8 @@
#include "SafeBuf.h"
// . hack helper: adds an entry mapping the hash of the collection
//   name "coll" to "collnum" in the name-to-collnum hashtable
// . returns the result of the table's addKey() call
bool addCollToTable ( char *coll , collnum_t collnum ) ;
class WaitEntry {
public:
void (* m_callback) (void *state);

@ -2982,14 +2982,16 @@ bool SafeBuf::brify ( char *s , long slen , long niceness ) {
}
*/
bool SafeBuf::brify2 ( char *s , long cols ) {
return brify ( s, gbstrlen(s), 0 , cols );
bool SafeBuf::brify2 ( char *s , long cols , char *sep , bool isHtml ) {
return brify ( s, gbstrlen(s), 0 , cols , sep , isHtml );
}
bool SafeBuf::brify ( char *s ,
long slen ,
long niceness ,
long maxCharsPerLine ) {
long maxCharsPerLine ,
char *sep ,
bool isHtml ) {
// count the xml tags so we know how much buf to allocated
char *p = s;
char *pend = s + slen;
@ -3001,6 +3003,8 @@ bool SafeBuf::brify ( char *s ,
char *pstart = s;
char *breakPoint = NULL;
bool inTag = false;
long sepLen = gbstrlen(sep);
bool forced = false;
redo:
@ -3012,12 +3016,18 @@ bool SafeBuf::brify ( char *s ,
if ( *p == '>' ) inTag = false;
continue;
}
if ( *p == '<' ) {
if ( *p == '<' && isHtml ) {
inTag = true;
continue;
}
col++;
if ( is_wspace_utf8(p) ) {
// reset?
if ( ! isHtml && *p == '\n' ) {
forced = true;
breakPoint = p;
goto forceBreak;
}
// apostrophe exceptions
//if ( *p == '\'' ) continue;
// break AFTER this punct
@ -3025,17 +3035,22 @@ bool SafeBuf::brify ( char *s ,
continue;
}
if ( col < maxCharsPerLine ) continue;
forceBreak:
// now add the break point i guess
// if none, gotta break here for sure!!!
if ( ! breakPoint ) breakPoint = p;
// count that
brSizes += 4;
brSizes += sepLen;//4;
// print only for last round
if ( lastRound ) {
// print up to that
// . print up to that
// . this includes the \n if forced is true
safeMemcpy ( pstart , breakPoint - pstart + 1 );
// then br
safeMemcpy ( "<br>" , 4 );
//if ( forced ) pushChar('\n');
if ( ! forced ) safeMemcpy ( sep , sepLen ) ; // "<br>"
forced = false;
}
// start right after breakpoint for next line
p = breakPoint;

@ -161,9 +161,11 @@ struct SafeBuf {
void zeroOut() { memset ( m_buf , 0 , m_capacity ); }
bool brify2 ( char *s , long cols ) ;
bool brify2 ( char *s , long cols , char *sep = "<br>" ,
bool isHtml = true ) ;
bool brify ( char *s , long slen , long niceness , long cols );
bool brify ( char *s , long slen , long niceness , long cols ,
char *sep = "<br>" , bool isHtml = true );
bool fixIsolatedPeriods ( ) ;

257
main.cpp

@ -386,25 +386,32 @@ int main ( int argc , char *argv[] ) {
if (argc < 1) {
printHelp:
fprintf(stdout,
"Usage: gb [-c hostsConf] <CMD>\n\n");
fprintf (stdout,
"Items in []'s are optional, and items in <>'s are "
"required.\n\n");
fprintf (stdout,
SafeBuf sb;
sb.safePrintf(
"Usage: gb [-c hostsConf] <CMD>\n");
sb.safePrintf(
"\tItems in []'s are optional, and items "
"in <>'s are "
"required.");
sb.safePrintf(
"\n\t"
"[hostsConf] is the hosts.conf config file as "
"described in overview.html. If not\nprovided then "
"it is assumed to be ./hosts.conf.\n\n" );
fprintf(stdout,
"described in overview.html. If not provided then "
"it is assumed to be ./hosts.conf. If "
"./localhosts.conf exists then that will be "
"used instead of ./hosts.conf. That is "
"convenient to use since it will not be "
"overwritten from git pulls.\n\n" );
sb.safePrintf(
"<CMD> can have the following values:\n\n"
"-h\tprint this help.\n\n"
"-v\tprint version and exit.\n\n"
"-o\tprint the overview documentation in HTML. "
"Contains the format of\n\thosts.conf.\n\n"
"Contains the format of hosts.conf.\n\n"
"-r\tindicates recovery mode, "
"sends email to addresses "
"specified in Conf.h\n\tupon startup.\n\n"
"specified in Conf.h upon startup.\n\n"
"<hostId>\n"
"\tstart the gb process for this <hostId> locally.\n\n"
@ -431,19 +438,19 @@ int main ( int argc , char *argv[] ) {
"tmpstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified, but\n\t"
"[hostId] if specified, but "
"use the ports specified in hosts.conf PLUS one. "
"Then you can switch the\n\t"
"Then you can switch the "
"proxy over to point to those and upgrade the "
"original cluster's gb.\n\t"
"original cluster's gb. "
"That can be done in the Master Controls of the "
"proxy using the 'use\n\t"
"proxy using the 'use "
"temporary cluster'. Also, this assumes the binary "
"name is tmpgb not gb.\n\n"
"tmpstop [hostId]\n"
"\tsaves and exits for all gb hosts or "
"just on [hostId] if specified, for\n\tthe "
"just on [hostId] if specified, for the "
"tmpstart command.\n\n"
"spidersoff [hostId]\n"
@ -456,7 +463,7 @@ int main ( int argc , char *argv[] ) {
"cacheoff [hostId]\n"
"\tdisables all disk PAGE caches on all hosts or "
"just on [hostId] if\n\tspecified.\n\n"
"just on [hostId] if specified.\n\n"
"freecache [maxShmid]\n"
"\tfinds and frees all shared memory up to shmid "
@ -484,18 +491,18 @@ int main ( int argc , char *argv[] ) {
"dsh <CMD>\n"
"\trun this command on the primary IPs of "
"all active hosts in\n\thosts.conf. Example: "
"all active hosts in hosts.conf. Example: "
"gb dsh 'ps auxw; uptime'\n\n"
"dsh2 <CMD>\n"
"\trun this command on the secondary IPs of "
"all active hosts in\n\thosts.conf. Example: "
"all active hosts in hosts.conf. Example: "
"gb dsh2 'ps auxw; uptime'\n\n"
"install [hostId]\n"
"\tinstall all required files for gb from "
"current working directory\n"
"\tto [hostId]. If no [hostId] is specified install "
"current working directory "
"to [hostId]. If no [hostId] is specified install "
"to ALL hosts.\n\n"
"install2 [hostId]\n"
@ -538,30 +545,30 @@ int main ( int argc , char *argv[] ) {
"backupcopy <backupSubdir>\n"
"\tsave a copy of all xml, config, data and map files "
"into <backupSubdir>\n\twhich is relative "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backupmove <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"into <backupSubdir> which \n\tis relative "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backuprestore <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"in <backupSubdir>, which\n\tis relative "
"in <backupSubdir>, which is relative "
"to the working dir, into the working dir. "
"Will NOT\n\toverwrite anything. Done for all "
"Will NOT overwrite anything. Done for all "
"hosts.\n\n"
"proxy start [proxyId]\n"
"\tStart a proxy that acts as a frontend to gb "
"and passes on\n\trequests to random machines on "
"the cluster given in hosts.conf.\n\tHelps to "
"and passes on requests to random machines on "
"the cluster given in hosts.conf. Helps to "
"distribute the load evenly across all machines.\n\n"
"proxy load <proxyId>\n"
"\tStart a proxy process directly without calling "
"ssh. Called\n\tby 'gb proxy start'.\n\n"
"ssh. Called by 'gb proxy start'.\n\n"
"proxy stop [proxyId]\n"
"\tStop a proxy that acts as a frontend to gb.\n\n"
@ -569,22 +576,22 @@ int main ( int argc , char *argv[] ) {
"blasterdiff [-v] [-j] [-p] <file1> <file2> "
"<maxNumThreads> <wait>\n"
"\tcompare search results between urls in file1 and"
"file2 and output the\n\tsearch results in the url"
" from file1 not found in the url from file2\n\t"
"file2 and output the search results in the url"
" from file1 not found in the url from file2 "
"maxNumThreads is the number of concurrent "
"comparisons "
"that should be\n\tdone at one time and wait is the"
"time to wait between comparisons. \n\t-v is for "
"that should be done at one time and wait is the"
"time to wait between comparisons. -v is for "
"verbose "
" and -j is to just display links not found and "
"not\n\t"
"not "
"search for them on server2. If you do not want to"
" use the proxy server\n\t"
" use the proxy server "
"on gk10, use -p\n\n"
"blaster [-l|-u|-i] <file> <maxNumThreads> <wait>\n"
"\tget documents from the urls given in file. The "
"-l argument is to\n\t"
"-l argument is to "
"automatically get documents "
"from the gigablast log file.\n"
"\t-u means to inject/index the url into gb.\n"
@ -594,35 +601,44 @@ int main ( int argc , char *argv[] ) {
"which also entails a DNS lookup on each outlink.\n"
"\tmaxNumThreads is the"
" number of concurrent threads at one time and wait "
"\n\tis the time to wait between threads.\n\n"
" is the time to wait between threads.\n\n"
"scale <newHosts.conf>\n"
"\tGenerate a script to be called to migrate the "
"data to the new places.\n\tRemaining hosts will "
"keep the data they have, but it will be\n\t"
"data to the new places. Remaining hosts will "
"keep the data they have, but it will be "
"filtered during the next merge operations.\n\n"
"collcopy <newHosts.conf> <coll> <collnum>\n"
"\tGenerate a script to copy the collection data on "
"the cluster defined by\n\tnewHosts.conf to the "
"the cluster defined by newHosts.conf to the "
"current cluster. Remote network must have "
"called\n\t\"gb ddump\" twice in a row just before to "
"ensure all of its data is\n\ton disk.\n\n"
"called \"gb ddump\" twice in a row just before to "
"ensure all of its data is on disk.\n\n"
// gb inject <file> <ip:port> [startdocid]
// gb inject titledb <newhosts.conf> [startdocid]
"inject <file> <ip:port> [startdocid]\n"
"inject titledb <newhosts.conf> [startdocid]\n"
"\tInject all documents in <file> into [hostId]. If "
"[hostId] not given,\n\t0 is assumed. Each document "
"must be preceeded by a valid HTTP mime with\n\t"
"a Content-Length: field.\n\n"
"\tInject all documents in <file> into the host "
"at ip:port. "
"Each document "
"must be preceeded by a valid HTTP mime with "
"a Content-Length: field. "
"\n\n"
"inject titledb-<DIR> <newhosts.conf> [startdocid]\n"
"\tInject all pages from all the titledb "
"files in the <DIR> directory into the appropriate "
"host defined by the newhosts.conf config file. This "
"is useful for populating one search engine with "
"another. "
"\n\n"
"injecttest <requestLen> [hostId]\n"
"\tinject random documents into [hostId]. If [hostId] "
"not given\n\t0 is assumed.\n\n"
"not given 0 is assumed.\n\n"
"ping <hostId> [clientport]\n"
"\tperforms pings to <hostId>. [clientport] defaults "
@ -633,7 +649,7 @@ int main ( int argc , char *argv[] ) {
"dictlookuptest <file>\n"
"\tgets the popularities of the entries in the "
"<file>.\n Used to only check performance of "
"<file>. Used to only check performance of "
"getPhrasePopularity.\n\n"
//"stemmertest <file>\n"
@ -647,18 +663,18 @@ int main ( int argc , char *argv[] ) {
// less common things
"gendict <coll> [numWordsToDump]\n\tgenerate "
"dictionary used for spellchecker "
"from titledb\n\tfiles in collection <coll>. Use "
"from titledb files in collection <coll>. Use "
"first [numWordsToDump] words.\n\n"
//#ifndef _LARS_
//"gendbs <coll> [hostId]\n\tgenerate missing spiderdb, "
//"tfndb and checksumdb files from titledb\n\tfiles.\n\n"
//"tfndb and checksumdb files from titledb files.\n\n"
//"gentfndb <coll> [hostId]\n\tgenerate missing tfndb. "
//"titledb disk dumps and tight merges are no\n\t"
//"titledb disk dumps and tight merges are no "
//"longer necessary. Also "
//"generates tfndb from spiderdb. tfndb-saved.dat\n\t"
//"generates tfndb from spiderdb. tfndb-saved.dat "
//"and all tfndb* files in the collection subdir "
//"must not exist, so move\n\tthem to a temp dir.\n\n"
//"must not exist, so move them to a temp dir.\n\n"
//"fixtfndb <coll> [hostId]\n\tremove tfndb recs "
//"referring to non-existent titledb recs.\n\n"
@ -694,8 +710,8 @@ int main ( int argc , char *argv[] ) {
// Quality Tests
"countdomains <coll> <X>\n"
"\tCounts the domains and IPs in collection coll and "
"in the first X\n\ttitledb records. Results are sorted"
"by popularity and stored in\n\tthe log file. \n\n"
"in the first X titledb records. Results are sorted"
"by popularity and stored in the log file. \n\n"
"cachetest\n\t"
"cache stability and speed tests\n\n"
@ -705,7 +721,7 @@ int main ( int argc , char *argv[] ) {
"dosopen <ip> <port> <numThreads>\n"
"\tOpen numThreads tcp sockets to ip:port and just "
"sit there. For\n\ttestingthe robustness of gb.\n\n"
"sit there. For testingthe robustness of gb.\n\n"
"xmldiff [-td] <file1> <file2>\n"
"\tTest xml diff routine on file1 and file2.\n"
@ -768,9 +784,10 @@ int main ( int argc , char *argv[] ) {
#endif
"\tT is the first docId to dump. Applies only to "
"titledb. "
"(default none)\n\n"
//"(default none)\n\n"
"\tV is c to dump cached recs.\n"
"\n"
"dump s [X [Y [Z [C]]]\n"
@ -805,22 +822,22 @@ int main ( int argc , char *argv[] ) {
"dumpmissing <coll> [hostId]\n\t"
"dump the docIds in indexdb but not "
"in tfndb/titledb to stderr. "
"\n\tUsed for passing in to removedocids.\n"
" Used for passing in to removedocids.\n"
"\n"
"dumpdups <coll> [hostId]\n\t"
"dump the docIds in duplicated in indexdb when "
"they should not be to\n\tstderr. Usually a sign "
"of mis-indexing. Used for passing in to\n\t"
"they should not be to stderr. Usually a sign "
"of mis-indexing. Used for passing in to "
"removedocids.\n"
"\n"
"removedocids <coll> <fileOfDocIds> "
"[hostId|hostId1-hostId2]"
"\n\tremoves the docids in fileOfDocIds from indexdb, "
"clusterdb, checksumdb\n\tand tfndb. Effectively "
"clusterdb, checksumdb and tfndb. Effectively "
"completely deleting that docid. "
"fileOfDocIds\n\tcontains one "
"fileOfDocIds contains one "
"docId per line, and nothing more.\n"
"\n"
@ -836,18 +853,21 @@ int main ( int argc , char *argv[] ) {
"replacehost <hostid> <spareid>"
"\n\treplaces host with hostid <hostid> with the "
"spare that has the spareid\n\t<spareid>. the host "
"spare that has the spareid <spareid>. the host "
"being replaced should already be shut down or dead.\n"
"\n"
"synchost <hostid>"
"\n\trecopies this host from its twin. host directory "
"must be empty and\n\tthe host must be marked as dead "
"must be empty and the host must be marked as dead "
"in the current gb. Use synchost2 to use secondary "
"IPs.\n"
"\n"
//#endif
);
SafeBuf sb2;
sb2.brify2 ( sb.getBufStart() , 60 , "\n\t" , false );
fprintf(stdout,sb2.getBufStart());
// disable printing of used memory
g_mem.m_used = 0;
return 0;
@ -1198,6 +1218,25 @@ int main ( int argc , char *argv[] ) {
else goto printHelp;
}
// gb inject <file> <ip:port> [startdocid]
// gb inject titledb-coll.main.0 <newhosts.conf> [startdocid]
// gb inject titledb-somedir <newhosts.conf> [startdocid]
// gb inject titledb-coll.foobar.5 <newhosts.conf> [startdocid]
if ( strcmp ( cmd , "inject" ) == 0 ) {
if ( argc != cmdarg+3 &&
argc != cmdarg+4 &&
argc != cmdarg+5 )
goto printHelp;
char *file = argv[cmdarg+1];
char *ips = argv[cmdarg+2];
long long startDocId = 0LL;
long long endDocId = DOCID_MASK;
if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) endDocId = atoll(argv[cmdarg+4]);
injectFile ( file , ips , startDocId , endDocId , false );
return 0;
}
// load up hosts.conf
if ( ! g_hostdb.init(hostsConf, hostId, NULL, isProxy,useTmpCluster)){
log("db: hostdb init failed." ); return 1; }
@ -1607,6 +1646,7 @@ int main ( int argc , char *argv[] ) {
return 0;
}
*/
/*
// gb inject <file> <ip:port> [startdocid]
// gb inject titledb <newhosts.conf> [startdocid]
if ( strcmp ( cmd , "inject" ) == 0 ) {
@ -1623,6 +1663,7 @@ int main ( int argc , char *argv[] ) {
injectFile ( file , ips , startDocId , endDocId , false );
return 0;
}
*/
if ( strcmp ( cmd , "reject" ) == 0 ) {
if ( argc != cmdarg+3 &&
argc != cmdarg+4 &&
@ -13382,7 +13423,7 @@ int injectFile ( char *filename , char *ips ,
long long endDocId ,
bool isDelete ) {
g_mem.init ( 50000000 );
g_mem.init ( 4000000000LL );
// set up the loop
if ( ! g_loop.init() ) return log("build: inject: Loop init "
@ -13440,13 +13481,14 @@ int injectFile ( char *filename , char *ips ,
}
s_injectTitledb = false;
char *coll = "main";
if ( strcmp(filename,"titledb") == 0 ) {
long hostId = 0;
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) { log("db: No host has id %li.",hostId); exit(0);}
if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
log("db: Conf init failed." ); exit(0); }
//char *coll = "main";
if ( strncmp(filename,"titledb",7) == 0 ) {
//long hostId = 0;
//Host *h = g_hostdb.getHost ( hostId );
//if ( ! h ) { log("db: No host has id %li.",hostId); exit(0);}
//if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
// log("db: Conf init failed." ); exit(0); }
// a new thing, titledb-gk144 or titledb-coll.main.0
// init the loop, needs g_conf
if ( ! g_loop.init() ) {
log("db: Loop init failed." ); exit(0); }
@ -13458,7 +13500,8 @@ int injectFile ( char *filename , char *ips ,
// read where we left off from file if possible
char fname[256];
sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir);
//sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir);
sprintf(fname,"./lastinjectdocid.dat");
SafeBuf ff;
ff.fillFromFile(fname);
if ( ff.length() > 1 ) {
@ -13488,9 +13531,69 @@ int injectFile ( char *filename , char *ips ,
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
//g_conf.m_urldbMaxDiskPageCacheMem = 0;
g_collectiondb.init(true);
// . add a fake coll just for it.
// . make the subdir just gk144 not coll.gk144.0 so rick
// can inject the titledb bigfile
//g_collectiondb.init(true);
/*
g_collectiondb.addRec ( "poo" , // char *coll ,
NULL , // char *cpc ,
0 , // long cpclen ,
false , // bool isNew ,
-1 , // collnum_t collnum ,
false , // bool isDump ,
false ); // bool saveIt
*/
CollectionRec *cr = new (CollectionRec);
SafeBuf *rb = &g_collectiondb.m_recPtrBuf;
rb->reserve(4);
g_collectiondb.m_recs = (CollectionRec **)rb->getBufStart();
g_collectiondb.m_recs[0] = cr;
// right now this is just for the main collection
char *coll = "main";
addCollToTable ( coll , (collnum_t) 0 );
// force RdbTree.cpp not to bitch about corruption
// assume we are only getting out collnum 0 recs i guess
g_collectiondb.m_numRecs = 1;
g_titledb.init ();
g_titledb.addColl ( coll, false );
//g_titledb.addColl ( coll, false );
// msg5::readList() requires the RdbBase for collnum 0
// which holds the array of files and the tree
Rdb *rdb = g_titledb.getRdb();
static RdbBase *s_base = new ( RdbBase );
// so getRdbBase always returns
rdb->m_collectionlessBase = s_base;
rdb->m_isCollectionLess = true;
//CollectionRec *pcr = g_collectiondb.getRec((collnum_t)0);
//pcr->m_bases[RDB_TITLEDB] = s_base;
// dir for tree loading
sprintf(g_hostdb.m_dir , "./" );
rdb->loadTree();
// titledb-
if ( gbstrlen(filename)<=8 )
return log("build: need titledb-coll.main.0 or "
"titledb-gk144 not just 'titledb'");
char *coll2 = filename + 8;
char tmp[1024];
sprintf(tmp,"./%s",coll2);
s_base->m_dir.set(tmp);
strcpy(s_base->m_dbname,rdb->m_dbname);
s_base->m_dbnameLen = gbstrlen(rdb->m_dbname);
s_base->m_coll = "main";
s_base->m_collnum = (collnum_t)0;
s_base->m_rdb = rdb;
s_base->m_fixedDataSize = rdb->m_fixedDataSize;
s_base->m_useHalfKeys = rdb->m_useHalfKeys;
s_base->m_ks = rdb->m_ks;
s_base->m_pageSize = rdb->m_pageSize;
s_base->m_isTitledb = rdb->m_isTitledb;
s_base->m_minToMerge = 99999;
// try to set the file info now!
s_base->setFiles();
}
else {
// open file
@ -13608,6 +13711,7 @@ void doInject ( int fd , void *state ) {
if ( contentLen > 0 ) contentLen--;
char c = content[contentLen];
content[contentLen] = '\0';
//log("inject: %s",xd.m_firstUrl.m_url);
// form what we would read from disk
reqLen = sprintf(req,
// print as unencoded content for speed
@ -13999,7 +14103,8 @@ void injectedWrapper ( void *state , TcpSocket *s ) {
SafeBuf sb;
sb.safePrintf("%lli\n",minDocId);
char fname[256];
sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir);
//sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir
sprintf(fname,"./lastinjectdocid.dat");
sb.dumpToFile(fname);
}
}