Added option to dump wanted docids (excluding those that get blocked by blocklists and parameter stripping)

2025-07-15 02:36:08 -04:00 · 2017-09-03 19:56:20 +02:00
parent 8a72b10b33
commit 6cd4ae0950
1 changed files with 147 additions and 1 deletions
--- a/main.cpp
+++ b/main.cpp
@ -116,6 +116,7 @@ static void dumpClusterdb(const char *coll, int32_t sfn, int32_t numFiles, bool
 static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, const char *url);

 static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
+static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
 static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);

 static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);
@ -517,6 +518,8 @@ int main2 ( int argc , char *argv[] ) {
 			"\t\tdump t <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
 			"\ttitledb (Unwanted documents, checked against blocklist, plugins):\n"
 			"\t\tdump u <collection> <fileNum> <numFiles> <includeTree>\n"
+			"\ttitledb (Wanted documents, checked against blocklist, plugins):\n"
+			"\t\tdump wt <collection> <fileNum> <numFiles> <includeTree>\n"
 			"\ttitledb (duplicates only):\n"
 			"\t\tdump D <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
 			"\twaiting tree:\n"
@ -1306,8 +1309,9 @@ int main2 ( int argc , char *argv[] ) {

 			dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, true);
 		}
-		else if ( argv[cmdarg+1][0] == 'w' )
+		else if (strcmp(argv[cmdarg+1], "w") == 0) {
 		       dumpWaitingTree(coll);
+		}
 		else if ( argv[cmdarg+1][0] == 'x' )
 			dumpDoledb  (coll,startFileNum,numFiles,includeTree);
 		else if ( argv[cmdarg+1][0] == 's' ) {
@ -1349,6 +1353,8 @@ int main2 ( int argc , char *argv[] ) {
 			dumpPosdb( coll, startFileNum, numFiles, includeTree, termId, false );
 		}  else if (strcmp(argv[cmdarg+1], "u") == 0) {
 			dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
+		}  else if (strcmp(argv[cmdarg+1], "wt") == 0) {
+			dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
 		} else if (strcmp(argv[cmdarg+1], "us") == 0) {
 			dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
 		} else {
@ -3577,6 +3583,146 @@ static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int3
 	}
 }

+
+// Dumps the "wanted" titledb records of a collection to stdout, one "<docId>|<firstUrl>"
+// line per record. A record is printed when isUrlUnwanted() rejects neither its first url
+// nor its redirect url (when it has one). Delete keys and records that XmlDoc::set2()
+// cannot load are skipped.
+//
+// coll         - collection name; must be known to g_collectiondb
+// startFileNum - handed to Msg5::getList() as the start file number
+// numFiles     - handed to Msg5::getList(); must be non-negative whenever startFileNum
+//                is non-zero
+// includeTree  - handed to Msg5::getList()
+static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
+
+	if(startFileNum!=0 && numFiles<0) {
+		//this may apply to all files, but I haven't checked into hash-based ones yet
+		fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
+		return;
+	}
+	if (!ucInit(g_hostdb.m_dir)) {
+		log("Unicode initialization failed!");
+		return;
+	}
+	// init our table for doing zobrist hashing
+	if ( ! hashinit() ) {
+		log("db: Failed to init hashtable." );
+		return;
+	}
+
+	g_titledb.init ();
+	g_titledb.getRdb()->addRdbBase1(coll);
+
+	const CollectionRec *cr = g_collectiondb.getRec(coll);
+	if(cr==NULL) {
+		fprintf(stderr,"Unknown collection '%s'\n", coll);
+		return;
+	}
+
+	// initialize shlib & blacklist
+	if (!WantedChecker::initialize()) {
+		fprintf(stderr, "Unable to initialize WantedChecker\n");
+		return;
+	}
+
+	g_urlBlackList.init();
+	g_urlWhiteList.init();
+
+	key96_t startKey ;
+	key96_t endKey   ;
+	key96_t lastKey  ;
+	startKey = Titledb::makeFirstKey(0);
+	endKey.setMax();
+	lastKey.setMin();
+	Msg5 msg5;
+	RdbList list;
+
+	// Allocate the document only after every early-return check above, so none of those
+	// exits can leak it. It is released by the single delete after the read loop.
+	XmlDoc *xd;
+	try {
+		xd = new (XmlDoc);
+	}
+	catch(const std::bad_alloc&) {
+		// stdout carries the dump itself, so the diagnostic goes to stderr
+		fprintf(stderr,"could not alloc for xmldoc\n");
+		exit(-1);
+	}
+
+	// Every exit from this loop uses "break" so that control always reaches the delete below.
+	for(;;) {
+		// use msg5 to get the list, should ALWAYS block since no threads
+		if ( ! msg5.getList ( RDB_TITLEDB   ,
+				      cr->m_collnum          ,
+				      &list         ,
+				      &startKey      ,
+				      &endKey        ,
+				      commandLineDumpdbRecSize,
+				      includeTree   ,
+				      startFileNum  ,
+				      numFiles      ,
+				      NULL          , // state
+				      NULL          , // callback
+				      0             , // niceness
+				      false         , // err correction?
+				      -1            , // maxRetries
+				      false))          // isRealMerge
+		{
+			log(LOG_LOGIC,"db: getList did not block.");
+			break;
+		}
+		// all done if empty
+		if ( list.isEmpty() ) {
+			break;
+		}
+
+		// loop over entries in list
+		for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
+			key96_t k = list.getCurrentKey();
+			char *rec = list.getCurrentRec();
+			int32_t recSize = list.getCurrentRecSize();
+			int64_t docId = Titledb::getDocIdFromKey(&k);
+
+			// keys are expected in strictly increasing order; report but keep going
+			if ( k <= lastKey ) {
+				log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
+				    lastKey.n1, lastKey.n0, k.n1, k.n0);
+			}
+
+			lastKey = k;
+
+			if ( (k.n0 & 0x01) == 0) {
+				// delete key
+				continue;
+			}
+
+			// free the mem
+			xd->reset();
+
+			// uncompress the title rec
+			if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
+				//set2() may have logged something but not the docid
+				log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
+				continue;
+			}
+
+			// extract the url
+			Url *url = xd->getFirstUrl();
+			// isUrlUnwanted() requires an out-parameter; the reason itself is not printed here
+			const char *reason = NULL;
+
+			if (isUrlUnwanted(*url, &reason)) {
+				continue;
+			}
+
+			// a wanted first url is still dropped when it redirects to an unwanted url
+			Url **redirUrlPtr = xd->getRedirUrl();
+			if (redirUrlPtr && *redirUrlPtr && isUrlUnwanted(**redirUrlPtr, &reason)) {
+				continue;
+			}
+
+			fprintf(stdout, "%" PRId64 "|%s\n", docId, url->getUrl());
+		}
+
+		// continue reading just past the last key of this list
+		startKey = *(key96_t *)list.getLastKey();
+		startKey++;
+
+		// watch out for wrap around
+		if ( startKey < *(key96_t *)list.getLastKey() ) {
+			break;
+		}
+	}
+
+	delete xd;
+}
+
+
+
 static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
 	if (startFileNum < 0) {
 		log(LOG_LOGIC, "db: Start file number is < 0. Must be >= 0.");