#include "XmlDoc.h" #include "Collectiondb.h" #include "SpiderCache.h" #include "Titledb.h" #include "Doledb.h" #include "CountryCode.h" #include "Log.h" #include "Conf.h" #include "Mem.h" #include <libgen.h> static void print_usage(const char *argv0) { fprintf(stdout, "Usage: %s [-h] PATH DOCID\n", argv0); fprintf(stdout, "Print titlerec\n"); fprintf(stdout, "\n"); fprintf(stdout, " -h, --help display this help and exit\n"); } static void cleanup() { g_log.m_disabled = true; g_linkdb.reset(); g_clusterdb.reset(); g_spiderCache.reset(); g_doledb.reset(); g_spiderdb.reset(); g_tagdb.reset(); g_titledb.reset(); g_posdb.reset(); g_collectiondb.reset(); g_loop.reset(); } int main(int argc, char **argv) { if (argc < 3) { print_usage(argv[0]); return 1; } if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 ) { print_usage(argv[0]); return 1; } g_log.m_disabled = true; // initialize library g_mem.init(); hashinit(); // current dir char path[PATH_MAX]; realpath(argv[1], path); size_t pathLen = strlen(path); if (path[pathLen] != '/') { strcat(path, "/"); } g_hostdb.init(-1, false, false, true, path); g_conf.init(path); ucInit(); // initialize rdbs g_loop.init(); g_collectiondb.loadAllCollRecs(); g_posdb.init(); g_titledb.init(); g_tagdb.init(); g_spiderdb.init(); g_doledb.init(); g_spiderCache.init(); g_clusterdb.init(); g_linkdb.init(); g_collectiondb.addRdbBaseToAllRdbsForEachCollRec(); g_log.m_disabled = false; g_log.m_logPrefix = false; CollectionRec *cr = g_collectiondb.getRec("main"); if (!cr) { logf(LOG_TRACE, "No main collection found"); } uint64_t docId = strtoul(argv[2], NULL, 10); logf(LOG_TRACE, "Getting titlerec for docId=%" PRIu64, docId); Msg5 msg5; RdbList list; key96_t startKey = Titledb::makeFirstKey(docId); key96_t endKey = Titledb::makeLastKey(docId); if (!msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) { logf(LOG_TRACE, "msg5.getlist didn't block"); cleanup(); exit(1); } if (list.isEmpty()) { logf(LOG_TRACE, "Unable to find titlerec for docId=%" PRIu64, docId); cleanup(); exit(1); } for (list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) { XmlDoc xmlDoc; if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) { logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRIu64, docId); cleanup(); exit(1); } logf(LOG_TRACE, "XmlDoc info"); logf(LOG_TRACE, "\tfirstUrl : %.*s", xmlDoc.size_firstUrl, xmlDoc.ptr_firstUrl); logf(LOG_TRACE, "\tredirUrl : %.*s", xmlDoc.size_redirUrl, xmlDoc.ptr_redirUrl); logf(LOG_TRACE, "\trootTitle : %.*s", xmlDoc.size_rootTitleBuf, xmlDoc.ptr_rootTitleBuf); // logf(LOG_TRACE, "\timageData :"); logf(LOG_TRACE, "\t"); loghex(LOG_TRACE, xmlDoc.ptr_utf8Content, xmlDoc.size_utf8Content, "\tutf8Content:"); logf(LOG_TRACE, "\tsite : %.*s", xmlDoc.size_site, xmlDoc.ptr_site); logf(LOG_TRACE, "\tlinkInfo (from titlerec)"); LinkInfo *linkInfo = xmlDoc.getLinkInfo1(); logf(LOG_TRACE, "\t\tm_numGoodInlinks : %d", linkInfo->m_numGoodInlinks); logf(LOG_TRACE, "\t\tm_numInlinksInternal : %d", linkInfo->m_numInlinksInternal); logf(LOG_TRACE, "\t\tm_numStoredInlinks : %d", linkInfo->m_numStoredInlinks); int i = 0; for (Inlink *inlink = linkInfo->getNextInlink(NULL); inlink; inlink = linkInfo->getNextInlink(inlink)) { logf(LOG_TRACE, "\t\tinlink #%d", i++); logf(LOG_TRACE, "\t\t\tdocId : %" PRIu64, inlink->m_docId); logf(LOG_TRACE, "\t\t\turl : %s", inlink->getUrl()); logf(LOG_TRACE, "\t\t\tlinktext : %s", inlink->getLinkText()); logf(LOG_TRACE, "\t\t\tcountry : %s", getCountryCode(inlink->m_country)); logf(LOG_TRACE, "\t\t\tlanguage : %s", getLanguageAbbr(inlink->m_language)); } loghex(LOG_TRACE, xmlDoc.ptr_linkdbData, xmlDoc.size_linkdbData, "\tlinkdbData"); logf(LOG_TRACE, "\ttagRec (from titlerec)"); TagRec *tagRec = xmlDoc.getTagRec(); for (Tag *tag = tagRec->getFirstTag(); tag; tag = tagRec->getNextTag(tag)) { SafeBuf sb; tag->printDataToBuf(&sb); logf(LOG_TRACE, "\t\t%-12s: %s", getTagStrFromType(tag->m_type), sb.getBufStart()); } logf(LOG_TRACE, "\t"); logf(LOG_TRACE, "Links info (from parsed titlerec)"); g_log.m_disabled = true; Links *links = xmlDoc.getLinks(); g_log.m_disabled = false; for (int i = 0; i < links->getNumLinks(); ++i) { Url u; u.set( links->getLinkPtr(i), links->getLinkLen(i), true, false ); uint32_t h32 = u.getHostHash32(); int64_t uh64 = hash64n(u.getUrl(), u.getUrlLen()); logf(LOG_TRACE, "\tlink : %.*s", links->getLinkLen(i), links->getLinkPtr(i)); logf(LOG_TRACE, "\tsitehash32: 0x%08" PRIx32 ", urlhash=0x%012" PRIx64 "", h32, uh64); } } cleanup(); return 0; }