87285ba3cd
since memcpy can't be interrupted and backtrace() called.
245 lines
5.7 KiB
C++
245 lines
5.7 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "Pops.h"
|
|
#include "Words.h"
|
|
#include "StopWords.h"
|
|
#include "Speller.h"
|
|
|
|
Pops::Pops () {
|
|
m_pops = NULL;
|
|
}
|
|
|
|
Pops::~Pops() {
|
|
if ( m_pops && m_pops != (int32_t *)m_localBuf )
|
|
mfree ( m_pops , m_popsSize , "Pops" );
|
|
}
|
|
|
|
/*
|
|
// should be one for each host in the network
|
|
bool Pops::readPopFiles ( ) {
|
|
int32_t n = g_hostdb.getNumGroups();
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// note it
|
|
log(LOG_INIT,"db: Reading %s/pops.%"INT32" of %"INT32".",
|
|
g_conf.m_dir,i,n);
|
|
//
|
|
}
|
|
}
|
|
|
|
bool Pops::makeFinalPopFile ( char *coll ) {
|
|
|
|
int32_t n = g_hostdb.getNumGroups();
|
|
|
|
// tell each host to write his pop file to html directory
|
|
Msg3e msg3e;
|
|
for ( int32_t i = 0 ; i < n ; i++ )
|
|
msg3e.sendRequest ( );
|
|
|
|
// no more than 4096 groups supported for this now, but up later maybe
|
|
char *buf [ 4096 ];
|
|
|
|
// retrieve it from each host (msg3f = getFile)
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get over http
|
|
g_httpServer.getDoc ( ... );
|
|
// save to disk
|
|
out[i].write ( content , contentLen );
|
|
}
|
|
|
|
// merge out file
|
|
BigFile out;
|
|
|
|
// then merge all of them out
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
|
|
}
|
|
|
|
// merge final
|
|
|
|
// distribute final copy to all
|
|
|
|
// clean up locals
|
|
}
|
|
|
|
// . make the pop file from indexdb
|
|
// . a bunch of wordhash/#docs pairs
|
|
// . word hash is lower 4 bytes of the termid
|
|
// . first int64_t in file is the # of docs
|
|
bool Pops::makeLocalPopFile ( char *coll ) {
|
|
// get the rdbmap of the first indexdb file
|
|
RdbBase *base = g_indexdb.getBase ( coll );
|
|
//RdbMap *map = base->getMap(0);
|
|
if ( ! base )
|
|
return log("admin: Collection \"%s\" does not exist.",coll);
|
|
BigFile *f = base->getFile(0);
|
|
// term must be in at least this many docs
|
|
int32_t minDocs = 4000;
|
|
// log it
|
|
log(LOG_INFO,"admin: Making popularity file from %s for coll \"%s\".",
|
|
f->getFilename(),coll);
|
|
log(LOG_INFO,"admin: Using cutoff of %"INT32" docs.",minDocs);
|
|
|
|
// output the wordId/count pairs to this file
|
|
BigFile out;
|
|
char outFilename[256];
|
|
sprintf(outFilename,"%s/popout.%"INT32"",g_conf.m_dir,g_hostdb.m_hostId);
|
|
out.set ( outFilename );
|
|
|
|
// store # of docs
|
|
int64_t n = g_titledb.getGlobalNumDocs();
|
|
out.write ( &n , 8 );
|
|
|
|
// store key read from disk into here
|
|
char tmp [ MAX_KEY_BYTES ];
|
|
|
|
//
|
|
//
|
|
// this part is taken from main.cpp:dumpIndexdb()
|
|
//
|
|
//
|
|
char buf [ 1000000 ];
|
|
int32_t bufSize = 1000000;
|
|
if ( ! f.open ( O_RDONLY ) ) return;
|
|
// init our vars
|
|
bool haveTop = false;
|
|
char top[6];
|
|
memset ( top , 0 , 6 );
|
|
bool warned = false;
|
|
// how big is this guy?
|
|
int64_t filesize = f.getFileSize();
|
|
// reset error number
|
|
g_errno = 0;
|
|
// the big read loop
|
|
loop:
|
|
int64_t readSize = bufSize;
|
|
if ( off + readSize > filesize ) readSize = filesize - off;
|
|
// return if we're done reading the whole file
|
|
if ( readSize <= 0 ) return;
|
|
// read in as much as we can
|
|
f.read ( buf , readSize , off );
|
|
// bail on read error
|
|
if ( g_errno ) {
|
|
log("admin: Read of %s failed.",f.getFilename());
|
|
return;
|
|
}
|
|
char *p = buf;
|
|
char *pend = buf + readSize;
|
|
inner:
|
|
// parse out the keys
|
|
int32_t size;
|
|
if ( ((*p) & 0x02) == 0x00 ) size = ks;
|
|
else size = ks-6;
|
|
if ( p + size > pend ) {
|
|
// skip what we read
|
|
off += readSize ;
|
|
// back up so we don't split a key we should not
|
|
off -= ( pend - p );
|
|
// read more
|
|
goto loop;
|
|
}
|
|
// new top?
|
|
if ( size == ks ) { gbmemcpy ( top , p + (ks-6) , 6 ); haveTop = true; }
|
|
// warning msg
|
|
if ( ! haveTop && ! warned ) {
|
|
warned = true;
|
|
log("admin: Warning: first key is a half key.");
|
|
}
|
|
|
|
//
|
|
// BUT i added this part to the main.cpp stuff
|
|
//
|
|
|
|
// was it the same as last key?
|
|
if ( ks == 6 )
|
|
count++;
|
|
// ok, this starts a new key
|
|
else {
|
|
// did the previous key meet the min count requirement?
|
|
if ( count >= minDocs ) {
|
|
// if so, store the upper 4 bytes of the termid
|
|
int32_t h;
|
|
gbmemcpy ( &h , tmp+8 , 4 );
|
|
// write it out
|
|
out.write ( &h , 4 );
|
|
// and the count
|
|
out.write ( &count , 4 );
|
|
}
|
|
// reset, we got a new termid
|
|
count = 1;
|
|
}
|
|
|
|
//
|
|
// end new stuff
|
|
//
|
|
|
|
|
|
// make the key
|
|
gbmemcpy ( tmp , p , ks-6 );
|
|
gbmemcpy ( tmp + ks-6 , top , 6 );
|
|
// print the key
|
|
//if ( ks == 12 )
|
|
// fprintf(stdout,"%08lli) %08"XINT32" %016"XINT64"\n",
|
|
// off + (p - buf) ,
|
|
// *(int32_t *)(tmp+8),*(int64_t *)tmp );
|
|
//else
|
|
// fprintf(stdout,"%08lli) %016"XINT64" %016"XINT64"\n",
|
|
// off + (p - buf) ,
|
|
// *(int64_t *)(tmp+8),*(int64_t *)tmp );
|
|
|
|
// go to next key
|
|
p += size;
|
|
// loop up
|
|
goto inner;
|
|
|
|
|
|
|
|
}
|
|
*/
|
|
|
|
|
|
bool Pops::set ( Words *words , int32_t a , int32_t b ) {
|
|
int32_t nw = words->getNumWords();
|
|
int64_t *wids = words->getWordIds ();
|
|
char **wp = words->m_words;
|
|
int32_t *wlen = words->m_wordLens;
|
|
|
|
// point to scores
|
|
//int32_t *ss = NULL;
|
|
//if ( scores ) ss = scores->m_scores;
|
|
|
|
int32_t need = nw * 4;
|
|
if ( need > POPS_BUF_SIZE ) m_pops = (int32_t *)mmalloc(need,"Pops");
|
|
else m_pops = (int32_t *)m_localBuf;
|
|
if ( ! m_pops ) return false;
|
|
m_popsSize = need;
|
|
|
|
for ( int32_t i = a ; i < b && i < nw ; i++ ) {
|
|
// skip if not indexable
|
|
if ( ! wids[i] ) { m_pops[i] = 0; continue; }
|
|
// or if score <= 0
|
|
//if ( ss && ss[i] <= 0 ) { m_pops[i] = 0; continue; }
|
|
// it it a common word? like "and" "the"... see StopWords.cpp
|
|
/*
|
|
if ( isCommonWord ( (int32_t)wids[i] ) ) {
|
|
max:
|
|
m_pops[i] = MAX_POP;
|
|
continue;
|
|
}
|
|
|
|
else if ( wlen[i] <= 1 && is_lower(wp[i][0]) )
|
|
goto max;
|
|
*/
|
|
// once again for the 50th time partap's utf16 crap gets in
|
|
// the way... we have to have all kinds of different hashing
|
|
// methods because of it...
|
|
uint64_t key ; // = wids[i];
|
|
key = hash64d(wp[i],wlen[i]);
|
|
m_pops[i] = g_speller.getPhrasePopularity(wp[i], key,true);
|
|
// sanity check
|
|
if ( m_pops[i] < 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( m_pops[i] == 0 ) m_pops[i] = 1;
|
|
}
|
|
return true;
|
|
}
|
|
|