added option to dump wanted docids (excluding those that gets blocked by blocklists and parameter stripping)

This commit is contained in:
Brian Rasmusson
2017-09-03 19:56:20 +02:00
parent 8a72b10b33
commit 6cd4ae0950

148
main.cpp

@ -116,6 +116,7 @@ static void dumpClusterdb(const char *coll, int32_t sfn, int32_t numFiles, bool
static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, const char *url);
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);
@ -517,6 +518,8 @@ int main2 ( int argc , char *argv[] ) {
"\t\tdump t <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
"\ttitledb (Unwanted documents, checked against blocklist, plugins):\n"
"\t\tdump u <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Wanted documents, checked against blocklist, plugins):\n"
"\t\tdump wt <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (duplicates only):\n"
"\t\tdump D <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
"\twaiting tree:\n"
@ -1306,8 +1309,9 @@ int main2 ( int argc , char *argv[] ) {
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, true);
}
else if ( argv[cmdarg+1][0] == 'w' )
else if (strcmp(argv[cmdarg+1], "w") == 0) {
dumpWaitingTree(coll);
}
else if ( argv[cmdarg+1][0] == 'x' )
dumpDoledb (coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 's' ) {
@ -1349,6 +1353,8 @@ int main2 ( int argc , char *argv[] ) {
dumpPosdb( coll, startFileNum, numFiles, includeTree, termId, false );
} else if (strcmp(argv[cmdarg+1], "u") == 0) {
dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "wt") == 0) {
dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "us") == 0) {
dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
} else {
@ -3577,6 +3583,146 @@ static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int3
}
}
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
return;
}
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
return;
}
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll);
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
HashTableX dedupTable;
dedupTable.set(4,0,10000,NULL,0,false,"maintitledb");
// make this
XmlDoc *xd;
try {
xd = new (XmlDoc);
}
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
exit(-1);
}
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
return;
}
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
return;
}
g_urlBlackList.init();
g_urlWhiteList.init();
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) {
return;
}
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
}
lastKey = k;
if ( (k.n0 & 0x01) == 0) {
// delete key
continue;
}
// free the mem
xd->reset();
// uncompress the title rec
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
}
// extract the url
Url *url = xd->getFirstUrl();
const char *reason = NULL;
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
continue;
}
}
fprintf(stdout, "%" PRId64 "|%s\n", docId, url->getUrl());
}
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
return;
}
}
}
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if (startFileNum < 0) {
log(LOG_LOGIC, "db: Start file number is < 0. Must be >= 0.");