Use `./gb dump z main 0 -1 1` to generate sitelinks.txt

This commit is contained in:
Matt 2015-01-25 18:45:40 -07:00
parent e7a12fc2e5
commit 1ef3932b32
2 changed files with 51 additions and 6 deletions

@ -35,7 +35,9 @@ int32_t Tag::print ( ) {
bool Tag::printToBuf ( SafeBuf *sb ) {
sb->safePrintf("k.hsthash=%016"XINT64" k.duphash=%08"XINT32" k.sitehash=%08"XINT32" ",
sb->safePrintf("k.hsthash=%016"XINT64" "
"k.duphash=%08"XINT32" "
"k.sitehash=%08"XINT32" ",
m_key.n1,
(int32_t)(m_key.n0>>32),
(int32_t)(m_key.n0&0xffffffff));

@ -2835,6 +2835,9 @@ int main2 ( int argc , char *argv[] ) {
dumpRevdb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'S' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0);
else if ( argv[cmdarg+1][0] == 'z' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'z');
else if ( argv[cmdarg+1][0] == 'A' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'A');
@ -11924,7 +11927,8 @@ void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTre
}
void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
bool includeTree,
int32_t c , char req, int32_t rdbId ) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_tagdb.init ();
@ -11952,6 +11956,13 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTre
if ( req == 'D') strcpy(action,"&deleterec=1&useNew=1");
CollectionRec *cr = g_collectiondb.getRec(coll);
int64_t hostHash = -1;
int64_t lastHostHash = -2;
char *site = NULL;
int32_t siteNumInlinks = -1;
int32_t typeSite = hash64Lower_a("site",4);
int32_t typeInlinks = hash64Lower_a("sitenuminlinksuniquecblock",26);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( rdbId, //RDB_TAGDB ,
@ -11984,7 +11995,9 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTre
int32_t size = list.getCurrentDataSize();
// is it a delete?
if ( (k.n0 & 0x01) == 0 ) {
printf("k.n1=%016"XINT64" k.n0=%016"XINT64" (delete)\n",
if ( req == 'z' ) continue;
printf("k.n1=%016"XINT64" "
"k.n0=%016"XINT64" (delete)\n",
k.n1 , k.n0 | 0x01 ); // fix it!
continue;
}
@ -12005,7 +12018,8 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTre
size ,
false);
fprintf(stdout,
"key=%s caturl=%s #catids=%"INT32" version=%"INT32"\n"
"key=%s caturl=%s #catids=%"INT32" "
"version=%"INT32"\n"
,KEYSTR(&k,12)
,crec.m_url
,(int32_t)crec.m_numCatids
@ -12019,6 +12033,35 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTre
// print the version and site
char tmpBuf[1024];
SafeBuf sb(tmpBuf, 1024);
// making sitelinks.txt?
if ( tag->m_type == typeSite ) {
hostHash = tag->m_key.n1;
site = tag->getTagData();
if ( lastHostHash == hostHash && siteNumInlinks>=0) {
if ( siteNumInlinks > 0 )
printf("%i %s\n",siteNumInlinks,site);
siteNumInlinks = -1;
}
lastHostHash = hostHash;
continue;
}
if ( tag->m_type == typeInlinks ) {
hostHash = tag->m_key.n1;
siteNumInlinks = atoi(tag->getTagData());
if ( lastHostHash == hostHash && site ) {
if ( siteNumInlinks > 0 )
printf("%i %s\n",siteNumInlinks,site);
site = NULL;
}
lastHostHash = hostHash;
continue;
}
if ( req == 'z' )
continue;
// print as an add request or just normal
if ( req == 'A' ) tag->printToBufAsAddRequest ( &sb );
else tag->printToBuf ( &sb );
@ -15804,7 +15847,7 @@ void membustest ( int32_t nb , int32_t loops , bool readf ) {
// time stamp
int64_t t = gettimeofdayInMilliseconds();
fprintf(stderr,"memtest: start = %llu\n",t);
fprintf(stderr,"memtest: start = %"INT64"\n",t);
// . time the read loop
// . each read should only be 2 assembly movl instructions:
@ -15860,7 +15903,7 @@ void membustest ( int32_t nb , int32_t loops , bool readf ) {
g_clockNeedsUpdate = true;
// completed
int64_t now = gettimeofdayInMilliseconds();
fprintf(stderr,"memtest: now = %llu\n",t);
fprintf(stderr,"memtest: now = %"INT64"\n",t);
// multiply by 4 since these are int32_ts
char *op = "read";
if ( ! readf ) op = "wrote";