mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
added option to dump wanted docids (excluding those that gets blocked by blocklists and parameter stripping)
This commit is contained in:
148
main.cpp
148
main.cpp
@ -116,6 +116,7 @@ static void dumpClusterdb(const char *coll, int32_t sfn, int32_t numFiles, bool
|
||||
static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, const char *url);
|
||||
|
||||
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
|
||||
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
|
||||
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
|
||||
|
||||
static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);
|
||||
@ -517,6 +518,8 @@ int main2 ( int argc , char *argv[] ) {
|
||||
"\t\tdump t <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
|
||||
"\ttitledb (Unwanted documents, checked against blocklist, plugins):\n"
|
||||
"\t\tdump u <collection> <fileNum> <numFiles> <includeTree>\n"
|
||||
"\ttitledb (Wanted documents, checked against blocklist, plugins):\n"
|
||||
"\t\tdump wt <collection> <fileNum> <numFiles> <includeTree>\n"
|
||||
"\ttitledb (duplicates only):\n"
|
||||
"\t\tdump D <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
|
||||
"\twaiting tree:\n"
|
||||
@ -1306,8 +1309,9 @@ int main2 ( int argc , char *argv[] ) {
|
||||
|
||||
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, true);
|
||||
}
|
||||
else if ( argv[cmdarg+1][0] == 'w' )
|
||||
else if (strcmp(argv[cmdarg+1], "w") == 0) {
|
||||
dumpWaitingTree(coll);
|
||||
}
|
||||
else if ( argv[cmdarg+1][0] == 'x' )
|
||||
dumpDoledb (coll,startFileNum,numFiles,includeTree);
|
||||
else if ( argv[cmdarg+1][0] == 's' ) {
|
||||
@ -1349,6 +1353,8 @@ int main2 ( int argc , char *argv[] ) {
|
||||
dumpPosdb( coll, startFileNum, numFiles, includeTree, termId, false );
|
||||
} else if (strcmp(argv[cmdarg+1], "u") == 0) {
|
||||
dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
|
||||
} else if (strcmp(argv[cmdarg+1], "wt") == 0) {
|
||||
dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
|
||||
} else if (strcmp(argv[cmdarg+1], "us") == 0) {
|
||||
dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
|
||||
} else {
|
||||
@ -3577,6 +3583,146 @@ static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int3
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
|
||||
|
||||
if(startFileNum!=0 && numFiles<0) {
|
||||
//this may apply to all files, but I haven't checked into hash-based ones yet
|
||||
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
|
||||
return;
|
||||
}
|
||||
if (!ucInit(g_hostdb.m_dir)) {
|
||||
log("Unicode initialization failed!");
|
||||
return;
|
||||
}
|
||||
// init our table for doing zobrist hashing
|
||||
if ( ! hashinit() ) {
|
||||
log("db: Failed to init hashtable." );
|
||||
return;
|
||||
}
|
||||
|
||||
g_titledb.init ();
|
||||
g_titledb.getRdb()->addRdbBase1(coll);
|
||||
key96_t startKey ;
|
||||
key96_t endKey ;
|
||||
key96_t lastKey ;
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
lastKey.setMin();
|
||||
startKey = Titledb::makeFirstKey(0);
|
||||
Msg5 msg5;
|
||||
RdbList list;
|
||||
HashTableX dedupTable;
|
||||
dedupTable.set(4,0,10000,NULL,0,false,"maintitledb");
|
||||
|
||||
// make this
|
||||
XmlDoc *xd;
|
||||
try {
|
||||
xd = new (XmlDoc);
|
||||
}
|
||||
catch(std::bad_alloc&) {
|
||||
fprintf(stdout,"could not alloc for xmldoc\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
if(cr==NULL) {
|
||||
fprintf(stderr,"Unknown collection '%s'\n", coll);
|
||||
return;
|
||||
}
|
||||
|
||||
// initialize shlib & blacklist
|
||||
if (!WantedChecker::initialize()) {
|
||||
fprintf(stderr, "Unable to initialize WantedChecker");
|
||||
return;
|
||||
}
|
||||
|
||||
g_urlBlackList.init();
|
||||
g_urlWhiteList.init();
|
||||
|
||||
for(;;) {
|
||||
// use msg5 to get the list, should ALWAYS block since no threads
|
||||
if ( ! msg5.getList ( RDB_TITLEDB ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
&startKey ,
|
||||
&endKey ,
|
||||
commandLineDumpdbRecSize,
|
||||
includeTree ,
|
||||
startFileNum ,
|
||||
numFiles ,
|
||||
NULL , // state
|
||||
NULL , // callback
|
||||
0 , // niceness
|
||||
false , // err correction?
|
||||
-1 , // maxRetries
|
||||
false)) // isRealMerge
|
||||
{
|
||||
log(LOG_LOGIC,"db: getList did not block.");
|
||||
return;
|
||||
}
|
||||
// all done if empty
|
||||
if ( list.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
// loop over entries in list
|
||||
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
|
||||
key96_t k = list.getCurrentKey();
|
||||
char *rec = list.getCurrentRec();
|
||||
int32_t recSize = list.getCurrentRecSize();
|
||||
int64_t docId = Titledb::getDocIdFromKey(&k);
|
||||
|
||||
if ( k <= lastKey ) {
|
||||
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
|
||||
lastKey.n1, lastKey.n0, k.n1, k.n0);
|
||||
}
|
||||
|
||||
lastKey = k;
|
||||
|
||||
if ( (k.n0 & 0x01) == 0) {
|
||||
// delete key
|
||||
continue;
|
||||
}
|
||||
|
||||
// free the mem
|
||||
xd->reset();
|
||||
|
||||
// uncompress the title rec
|
||||
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
|
||||
//set2() may have logged something but not the docid
|
||||
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
|
||||
continue;
|
||||
}
|
||||
|
||||
// extract the url
|
||||
Url *url = xd->getFirstUrl();
|
||||
const char *reason = NULL;
|
||||
|
||||
if( ! isUrlUnwanted(*url, &reason)) {
|
||||
Url **redirUrlPtr = xd->getRedirUrl();
|
||||
if (redirUrlPtr && *redirUrlPtr) {
|
||||
Url *redirUrl = *redirUrlPtr;
|
||||
if (isUrlUnwanted(*redirUrl, &reason)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stdout, "%" PRId64 "|%s\n", docId, url->getUrl());
|
||||
}
|
||||
}
|
||||
startKey = *(key96_t *)list.getLastKey();
|
||||
startKey++;
|
||||
|
||||
// watch out for wrap around
|
||||
if ( startKey < *(key96_t *)list.getLastKey() ) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
|
||||
if (startFileNum < 0) {
|
||||
log(LOG_LOGIC, "db: Start file number is < 0. Must be >= 0.");
|
||||
|
Reference in New Issue
Block a user