#include "XmlDoc.h" #include "Collectiondb.h" #include "SpiderCache.h" #include "Titledb.h" #include "Doledb.h" #include "CountryCode.h" #include "Log.h" #include "Conf.h" #include "Mem.h" #include <libgen.h> static void print_usage(const char *argv0) { fprintf(stdout, "Usage: %s [-h] PATH\n", argv0); fprintf(stdout, "Verify titledb\n"); fprintf(stdout, "\n"); fprintf(stdout, " -h, --help display this help and exit\n"); } static void cleanup() { g_log.m_disabled = true; g_linkdb.reset(); g_clusterdb.reset(); g_spiderCache.reset(); g_doledb.reset(); g_spiderdb.reset(); g_tagdb.reset(); g_titledb.reset(); g_posdb.reset(); g_collectiondb.reset(); g_loop.reset(); } int main(int argc, char **argv) { if (argc < 2) { print_usage(argv[0]); return 1; } if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) { print_usage(argv[0]); return 1; } g_log.m_disabled = true; // initialize library g_mem.init(); hashinit(); // current dir char path[PATH_MAX]; realpath(argv[1], path); size_t pathLen = strlen(path); if (path[pathLen] != '/') { strcat(path, "/"); } g_hostdb.init(-1, false, false, path); g_conf.init(path); ucInit(); // initialize rdbs g_loop.init(); g_collectiondb.loadAllCollRecs(); g_posdb.init(); g_titledb.init(); g_tagdb.init(); g_spiderdb.init(); g_doledb.init(); g_spiderCache.init(); g_clusterdb.init(); g_linkdb.init(); g_collectiondb.addRdbBaseToAllRdbsForEachCollRec(); g_log.m_disabled = true; g_log.m_logPrefix = false; CollectionRec *cr = g_collectiondb.getRec("main"); if (!cr) { logf(LOG_TRACE, "No main collection found"); } Msg5 msg5; RdbList list; key96_t startKey; key96_t endKey; endKey.setMax(); for (;;) { if (!msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 500000000, true, 0, -1, NULL, NULL, 0, true, -1, false)) { logf(LOG_TRACE, "msg5.getlist didn't block"); break; } if (list.isEmpty()) { break; } for (list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) { XmlDoc xmlDoc; key96_t key = list.getCurrentKey(); int64_t docId = Titledb::getDocIdFromKey(&key); if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) { logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRIu64, docId); break; } //fprintf(stdout, "Processing docid=%" PRId64"\r", docId); time_t ts = xmlDoc.m_spideredTime; struct tm tm_buf; struct tm *timeStruct = localtime_r(&ts,&tm_buf); char buf[128]; strftime(buf, 128, "%b-%d-%Y %H:%M:%S", timeStruct); // validate linkinfo if (xmlDoc.ptr_linkInfo1->m_version != 0 || xmlDoc.ptr_linkInfo1->m_lisize < 0 || xmlDoc.ptr_linkInfo1->m_lisize != xmlDoc.size_linkInfo1 || xmlDoc.ptr_linkInfo1->m_numStoredInlinks < 0 || xmlDoc.ptr_linkInfo1->m_numGoodInlinks < 0) { fprintf(stderr, "\ndocid=%" PRId64" url='%.*s spidered='%s'\n", docId, xmlDoc.size_firstUrl, xmlDoc.ptr_firstUrl, buf); } } startKey = *(key96_t *)list.getLastKey(); startKey++; // watch out for wrap around if (startKey < *(key96_t *) list.getLastKey()) { break; } } cleanup(); return 0; }