privacore-open-source-searc.../DumpSpiderdbSqlite.cpp
Ivan Skytte Jørgensen beeddcf35d Got rid of gb-include.h
2018-07-26 17:29:51 +02:00

309 lines
11 KiB
C++

#include "DumpSpiderdbSqlite.h"
#include "Collectiondb.h"
#include "SpiderdbSqlite.h"
#include "ip.h"
#include "Lang.h"
#include "Errno.h"
#include <time.h>
/**
 * Render a timestamp as an ISO-8601 UTC string ("YYYY-MM-DDTHH:MM:SSZ").
 *
 * @param when  seconds since the epoch
 * @param buf   caller-supplied output buffer of (at least) 32 bytes
 * @return      buf, so the call can be used directly as a printf argument
 */
static const char *formatTime(time_t when, char buf[32]) {
	// Break the timestamp down in UTC; the literal 'Z' suffix below depends on that.
	struct tm utc;
	gmtime_r(&when, &utc);

	// The size passed to strftime matches the buffer size promised in the signature.
	strftime(buf, 32, "%Y-%m-%dT%H:%M:%SZ", &utc);

	return buf;
}
/**
 * Render a spider request flag word as a '|'-separated list of two-letter tags.
 *
 * Every flag owns a fixed position in the output; a cleared flag leaves its
 * position empty, so the result always contains exactly 14 separators.
 *
 * @param rf   raw request flags value, decoded through SpiderdbRequestFlags
 * @param buf  output buffer; the longest possible result is 15 tags of two
 *             characters plus 14 separators plus the terminating NUL, so it
 *             must have room for at least 45 bytes
 * @return     buf
 */
static const char *formatRequestFlags(int rf, char *buf) {
	SpiderdbRequestFlags flags(rf);
	sprintf(buf,"%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s",
		flags.m_isAddUrl ? "au":"",
		flags.m_isPageReindex ? "Ri":"",
		flags.m_isUrlCanonical ? "Cn":"",
		flags.m_isPageParser ? "PP":"",
		flags.m_urlIsDocId ? "Di":"",
		flags.m_isRSSExt ? "rs":"",
		flags.m_isUrlPermalinkFormat ? "pl":"",
		flags.m_recycleContent ? "rc":"",
		flags.m_forceDelete ? "fd":"",
		// "ij" reports the injecting flag; it used to test m_forceDelete a second time
		flags.m_isInjecting ? "ij":"",
		flags.m_hadReply ? "rp":"",
		flags.m_fakeFirstIp ? "ff":"",
		flags.m_hasAuthorityInlink ? "ai":"",
		flags.m_hasAuthorityInlinkValid ? "aV":"",
		flags.m_avoidSpiderLinks ? "sl":"");
	return buf;
}
static const char *formatReplyFlags(int rf, char *buf) {
SpiderdbReplyFlags flags(rf);
sprintf(buf,"%s|%s|%s|%s|%s",
flags.m_isRSS ? "rs":"",
flags.m_isPermalink ? "pl":"",
flags.m_isIndexed ? "ix":"",
flags.m_fromInjectionRequest ? "ir":"",
flags.m_isIndexedINValid ? "iX":"");
return buf;
}
void dumpSpiderdbSqliteCsv(const char *collname, bool interpret_values) {
const CollectionRec *cr = g_collectiondb.getRec(collname);
if(!cr) {
fprintf(stderr,"Unknown collection: %s\n", collname);
return;
}
sqlite3 *db = g_spiderdb_sqlite.getDb(cr->m_collnum);
if(!db) {
fprintf(stderr,"Could not open spiderdb for collection: %s (%d)\n", collname, cr->m_collnum);
return;
}
static const char statement_text[] =
"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
" m_replyFlags"
" FROM spiderdb"
" ORDER BY m_firstIp, m_uh48";
const char *pzTail="";
sqlite3_stmt *stmt;
if(sqlite3_prepare_v2(db, statement_text, -1, &stmt, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
g_errno = EBADENGINEER;
return;
}
if(interpret_values)
printf("#%14s,%15s,%10s,%10s,%10s,%5s,%5s,%20s,%20s,%10s,%25s,%3s,%3s,%s,%s,%s,%s,%s,%s,%s,%s\n", "firstip","uh48","hosthash","domhash","sitehash","slink","plink","add-time","discovery-time","chash","reqflags","pri","err","sme","url","pctchange","spidertime","errcode","http","lang","replyflags");
else
printf("#%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n", "firstip","uh48","hosthash","domhash","sitehash","slink","plink","add-time","discovery-time","chash","reqflags","pri","err","sme","url","pctchange","spidertime","errcode","http","lang","replyflags");
int rc;
while((rc=sqlite3_step(stmt))==SQLITE_ROW) {
//fetch all columns. null checks are done later
int32_t firstIp = sqlite3_column_int(stmt, 0);
int64_t uh48 = sqlite3_column_int64(stmt, 1);
int32_t hosthash32 = sqlite3_column_int(stmt, 2);
int32_t domHash32 = sqlite3_column_int(stmt, 3);
int32_t siteHash32 = sqlite3_column_int(stmt, 4);
int32_t siteNumInlinks = sqlite3_column_int(stmt, 5);
int32_t pageNumInlinks = sqlite3_column_int(stmt, 6);
int32_t addedTime = sqlite3_column_int(stmt, 7);
int32_t discoveryTime = sqlite3_column_int(stmt, 8);
int32_t contentHash32 = sqlite3_column_int(stmt, 9);
int32_t requestFlags = sqlite3_column_int(stmt, 10);
int32_t priority = sqlite3_column_int(stmt, 11);
int32_t errCount = sqlite3_column_int(stmt, 12);
int32_t sameErrCount = sqlite3_column_int(stmt, 13);
const unsigned char *url = sqlite3_column_text(stmt, 14);
double percentChangedPerDay = sqlite3_column_double(stmt, 15);
int32_t spideredTime = sqlite3_column_int(stmt, 16);
int32_t errCode = sqlite3_column_int(stmt, 17);
int32_t httpStatus = sqlite3_column_int(stmt, 18);
int32_t langId = sqlite3_column_int(stmt, 19);
int32_t replyFlags = sqlite3_column_int(stmt, 20);
char firstIpBuf[16];
char timebuf[32];
char requestflagbuf[32];
char replyflagsbuf[32];
if(interpret_values)
printf("%15s,", iptoa(firstIp,firstIpBuf));
else
printf("%10u,", firstIp);
printf("%15lu,", uh48);
printf("%10u,",hosthash32);
printf("%10u,",domHash32);
printf("%10u,",siteHash32);
printf("%5d,",siteNumInlinks);
printf("%5u,",pageNumInlinks);
if(interpret_values)
printf("%s,",formatTime(addedTime,timebuf));
else
printf("%10d,",addedTime);
if(interpret_values)
printf("%s,",formatTime(discoveryTime,timebuf));
else
printf("%10d,",discoveryTime);
if(sqlite3_column_type(stmt,9)!=SQLITE_NULL)
printf("%10u,",contentHash32);
else
printf("%10s,","");
if(interpret_values)
printf("%-25s,",formatRequestFlags(requestFlags,requestflagbuf));
else
printf("%5u,",requestFlags);
if(sqlite3_column_type(stmt,11)!=SQLITE_NULL)
printf("%3u,",priority);
else
printf("%3s,","");
printf("%3u,",errCount);
printf("%3u,",sameErrCount);
printf("%s,",url);
if(sqlite3_column_type(stmt,20)!=SQLITE_NULL) {
printf("%.2f,",percentChangedPerDay);
if(interpret_values)
printf("%s,",formatTime(spideredTime,timebuf));
else
printf("%u,",spideredTime);
printf("%u,",errCode);
printf("%u,",httpStatus);
printf("%u,",langId);
if(interpret_values)
printf("%s",formatReplyFlags(replyFlags,replyflagsbuf));
else
printf("%u,",replyFlags);
} else
printf(",,,,,,");
printf("\n");
}
sqlite3_finalize(stmt);
}
void dumpSpiderdbSqlite(const char *collname, int32_t firstIp) {
const CollectionRec *cr = g_collectiondb.getRec(collname);
if(!cr) {
fprintf(stderr,"Unknown collection: %s\n", collname);
return;
}
sqlite3 *db = g_spiderdb_sqlite.getDb(cr->m_collnum);
if(!db) {
fprintf(stderr,"Could not open spiderdb for collection: %s (%d)\n", collname, cr->m_collnum);
return;
}
static const char statement_text[] =
"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
" m_replyFlags"
" FROM spiderdb"
" ORDER BY m_firstIp, m_uh48";
static const char statement_firstip_text[] =
"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
" m_replyFlags"
" FROM spiderdb"
" WHERE m_firstIp = ?"
" ORDER BY m_firstIp, m_uh48";
const char *pzTail="";
sqlite3_stmt *stmt;
if(sqlite3_prepare_v2(db, (firstIp == 0) ? statement_text : statement_firstip_text, -1, &stmt, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
g_errno = EBADENGINEER;
return;
}
if (firstIp != 0) {
sqlite3_bind_int64(stmt, 1, (uint32_t)firstIp);
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
//fetch all columns. null checks are done later
int32_t firstIp = sqlite3_column_int(stmt, 0);
int64_t uh48 = sqlite3_column_int64(stmt, 1);
int32_t hosthash32 = sqlite3_column_int(stmt, 2);
int32_t domHash32 = sqlite3_column_int(stmt, 3);
int32_t siteHash32 = sqlite3_column_int(stmt, 4);
int32_t siteNumInlinks = sqlite3_column_int(stmt, 5);
int32_t pageNumInlinks = sqlite3_column_int(stmt, 6);
int32_t addedTime = sqlite3_column_int(stmt, 7);
int32_t discoveryTime = sqlite3_column_int(stmt, 8);
int32_t contentHash32 = sqlite3_column_int(stmt, 9);
int32_t requestFlags = sqlite3_column_int(stmt, 10);
int32_t priority = sqlite3_column_int(stmt, 11);
int32_t errCount = sqlite3_column_int(stmt, 12);
int32_t sameErrCount = sqlite3_column_int(stmt, 13);
const unsigned char *url = sqlite3_column_text(stmt, 14);
double percentChangedPerDay = sqlite3_column_double(stmt, 15);
int32_t spideredTime = sqlite3_column_int(stmt, 16);
int32_t errCode = sqlite3_column_int(stmt, 17);
int32_t httpStatus = sqlite3_column_int(stmt, 18);
int32_t langId = sqlite3_column_int(stmt, 19);
int32_t replyFlags = sqlite3_column_int(stmt, 20);
printf("uh48=%lu", uh48);
char firstIpBuf[16];
printf(" firstip=%s", iptoa(firstIp,firstIpBuf));
printf(" hostHash32=0x%x", hosthash32);
printf(" domHash32=0x%x", domHash32);
printf(" siteHash32=0x%x", siteHash32);
char timebuf[32];
printf(" addedTime=%s",formatTime(addedTime,timebuf));
printf(" siteNumInLinks=%d", siteNumInlinks);
printf(" pageNumInLinks=%u", pageNumInlinks);
if(sqlite3_column_type(stmt,11)!=SQLITE_NULL)
printf(" priority=%u",priority);
else
printf(" priority=%s","-1");
SpiderdbRequestFlags reqflags(requestFlags);
if (reqflags.m_isAddUrl) printf(" ISADDURL");
if (reqflags.m_isPageReindex) printf("ISPAGEREINDEX");
if (reqflags.m_isPageParser) printf(" ISPAGEPARSER");
if (reqflags.m_urlIsDocId) printf(" URLISDOCID");
if (reqflags.m_isRSSExt) printf(" ISRSSEXT");
if (reqflags.m_isUrlPermalinkFormat) printf(" ISURLPERMALINKFORMAT");
if (reqflags.m_fakeFirstIp) printf(" ISFAKEFIRSTIP");
if (reqflags.m_isInjecting) printf(" ISINJECTING");
if (reqflags.m_forceDelete) printf(" FORCEDELETE");
if (reqflags.m_hasAuthorityInlink) printf(" HASAUTHORITYINLINK");
if (reqflags.m_avoidSpiderLinks) printf(" AVOIDSPIDERLINKS");
printf(" url=%s", url);
if (reqflags.m_hadReply) {
printf(" hadReply=1");
}
printf(" discoveryTime=%s",formatTime(discoveryTime,timebuf));
if(sqlite3_column_type(stmt,9)!=SQLITE_NULL)
printf(" ch32=%u",contentHash32);
else
printf(" ch32=0");
printf(" errorCount=%u",errCount);
printf(" sameErrorCount=%u",sameErrCount);
if(sqlite3_column_type(stmt,20)!=SQLITE_NULL) {
printf(" percentChangedPerDay=%.2f%%", percentChangedPerDay);
printf(" spideredTime=%s", formatTime(spideredTime, timebuf));
printf(" errCode=%s(%u)", mstrerror(errCode), errCode);
printf(" httpStatus=%u", httpStatus);
printf(" langId=%s(%u)", getLanguageString(langId), langId);
SpiderdbReplyFlags repFlags(replyFlags);
if (repFlags.m_isRSS) printf(" ISRSS");
if (repFlags.m_isPermalink) printf(" ISPERMALINK");
if (!repFlags.m_isIndexedINValid && repFlags.m_isIndexed) {
printf(" ISINDEXED");
}
}
printf("\n");
}
sqlite3_finalize(stmt);
}