simplify Inlinks class in LinkInfo.cpp.

fix some more 64-bit related cores.
This commit is contained in:
Matt
2014-11-18 16:50:31 -08:00
parent 0032052930
commit 2977845375
10 changed files with 113 additions and 85 deletions

@ -1194,7 +1194,10 @@ void DiskPageCache::writeToCache( int32_t bigOff, int32_t smallOff, void *inBuf
memcpy(p + smallOff, inBuf, size);
}
void DiskPageCache::readFromCache( void *outBuf, int32_t bigOff, int32_t smallOff,
// . store cached disk info into "outBuf". up to "size" bytes of it.
void DiskPageCache::readFromCache( void *outBuf,
int32_t bigOff,
int32_t smallOff,
int32_t size ){
#ifdef GBUSESHM
if ( m_useSHM ) {

@ -708,7 +708,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
// set up the hashtable if our first time
if ( ! g_lineTable.isInitialized() )
g_lineTable.set ( 8,4,256,NULL,0,false,MAX_NICENESS,"lht25");
g_lineTable.set ( 8,sizeof(Msg25Request *),256,
NULL,0,false,MAX_NICENESS,"lht25");
// . if already working on this same request, wait for it, don't
// overload server with duplicate requests
@ -976,7 +977,8 @@ bool Msg25::getLinkInfo2( char *site ,
m_adBanTable.reset();
m_adBanTable.set(4,0,0,NULL,0,false,m_niceness,"adbans");
m_table.set (4,4,0,NULL,0,false,m_niceness,"msg25tab");
m_table.set (4,sizeof(NoteEntry *),0,
NULL,0,false,m_niceness,"msg25tab");
QUICKPOLL(m_niceness);
@ -3934,7 +3936,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
// set our header
info->m_version = 0;
info->m_size = need;
info->m_lisize = need;
info->m_lastUpdated = lastUpdateTime;//getTimeGlobal();
// how many Inlinks we stored in info->m_buf[]
info->m_numStoredInlinks = count;
@ -4024,8 +4026,8 @@ LinkInfo *makeLinkInfo ( char *coll ,
return info;
}
static Inlink s_inlink;
static Inlink *s_orig;
static Inlink s_inlink;
// if we are an old version, we have to set s_inlink and return
// a ptr to that
@ -4037,16 +4039,16 @@ Inlink *LinkInfo::getNextInlink ( Inlink *k ) {
// if none, we are done
if ( ! p ) return p;
// sanity checks
if (p->m_numStrings==0 && p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
if (p->m_numStrings && p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
//if(p->m_numStrings==0&& p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
//if(p->m_numStrings&& p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
// fix this for the really old guy. we did not store these two
// things initially, but they should have been set to this...
// luckily, we had a "reserved1" int32_t...
if ( p->m_numStrings == 0 ) {
// urlBuf,linkText,surroundingText,rssItem
p->m_numStrings = 4;
p->m_firstStrPtrOffset = 64;
}
// if ( p->m_numStrings == 0 ) {
// // urlBuf,linkText,surroundingText,rssItem
// p->m_numStrings = 4;
// p->m_firstStrPtrOffset = 64;
// }
// MDW: now we just use offsets for 64bit conversion so no ptrs...
// if latest, return that
//if ( p->m_numStrings == p->getBaseNumStrings() &&
@ -4078,7 +4080,7 @@ Inlink *LinkInfo::getNextInlink2 ( Inlink *k ) {
// get the inlink to return
Inlink *next = (Inlink *)((char *)k + size);
// return NULL if breached
if ( (char *)next >= ((char *)this)+m_size ) return NULL;
if ( (char *)next >= ((char *)this)+m_lisize ) return NULL;
// otherwise, we are still good
return next;
}
@ -4337,9 +4339,9 @@ void Inlink::set ( Msg20Reply *r ) {
// . these two things are used for version-based deserializing
// . our current version has 5 strings
m_numStrings = getBaseNumStrings();
//m_numStrings = getBaseNumStrings();
// and our current string offset
m_firstStrPtrOffset = (char *)getFirstOffPtr() - (char *)this;
//m_firstStrPtrOffset = (char *)getFirstOffPtr() - (char *)this;
// set ourselves now
m_ip = r->m_ip;
@ -4525,7 +4527,7 @@ int32_t Inlink::updateStringPtrs ( char *buf ) {
// . zero out this Inlink in place with memset()
// . NOTE(review): this listing looks like a rendered diff with the removed
//   and added lines interleaved, so the two memset() calls below are
//   presumably the before/after versions of a single line rather than two
//   sequential calls -- confirm against the real file
void Inlink::reset ( ) {
// clear ourselves out
// . this form zeroes all sizeof(Inlink) bytes
memset ( (char *)this,0,sizeof(Inlink) );
// . this form zeroes only the first sizeof(Inlink)-MAXINLINKSTRINGBUFSIZE
//   bytes and leaves the last MAXINLINKSTRINGBUFSIZE bytes untouched
// . presumably those trailing bytes are the m_buf[] string area -- TODO
//   confirm m_buf is the last member of the class
memset ( (char *)this,0,sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE);
}
// . set a new Inlink from an older versioned Inlink
@ -4534,14 +4536,17 @@ void Inlink::set2 ( Inlink *old ) {
// clear ourselves
reset();
// copy what is legit to us
int fullSize = sizeof(Inlink);
//int fullSize = sizeof(Inlink);
// add in the sizes of all strings
int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
for ( ; sizePtr <= sizeEnd ; sizePtr++ )
fullSize += *sizePtr;
//int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
//int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
//for ( ; sizePtr <= sizeEnd ; sizePtr++ )
// fullSize += *sizePtr;
int fullSize = old->getStoredSize();
// return how many bytes we processed
memcpy ( (char *)this , (char *)old , fullSize );
return;
// this old way is pre-64bit
@ -4577,18 +4582,28 @@ void Inlink::set2 ( Inlink *old ) {
// . return the number of bytes this Inlink takes up when stored: a fixed
//   header portion plus the sizes of its string buffers
// . NOTE(review): this listing looks like a rendered diff with the removed
//   and added lines interleaved ("size" is declared twice and there are two
//   "return size;" statements), so two alternative bodies are shown here:
//   - one starts at m_firstStrPtrOffset, adds 8 bytes per string, then
//     walks m_numStrings int32_t size values and adds each one
//   - one starts at sizeof(Inlink)-MAXINLINKSTRINGBUFSIZE and adds the
//     seven size_* members by name
//   confirm against the real file which lines are live
int32_t Inlink::getStoredSize ( ) {
//int32_t size = (int32_t)sizeof(Msg);
//int32_t size = getBaseSize();
// offset-driven form: start at the offset of the first string entry
int32_t size = m_firstStrPtrOffset;
// add in string offsets AND size ptrs
// . 8 bytes per string; presumably a 4-byte offset plus a 4-byte size each
size += 8 * m_numStrings;
// fixed-layout form: everything in the class except the
// MAXINLINKSTRINGBUFSIZE bytes subtracted here (presumably the m_buf[]
// array -- TODO confirm)
int32_t size = sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE;
// then add the byte size of each string, by name
size += size_urlBuf;
size += size_linkText;
size += size_surroundingText;
size += size_rssItem;
size += size_categories;
size += size_gigabitQuery;
size += size_templateVector;
return size;
// add in string offsets AND size, 4 bytes each
//size += 8 * m_numStrings;
// start of first offset
// int32_t *sizePtr = &size_urlBuf;
// int32_t *sizeEnd = (int32_t *)((char *)this + sizeof(Inlink));
// add up string buffer sizes
//int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
//int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
// . offset-driven form again: point at the int32_t that sits
//   4*m_numStrings bytes past m_firstStrPtrOffset and treat the next
//   m_numStrings int32_t values as sizes
int32_t *sizePtr =
(int32_t *)((char *)this + m_firstStrPtrOffset+4*m_numStrings);
int32_t *sizeEnd = sizePtr + m_numStrings;
// sum those m_numStrings size values into the total
for ( ; sizePtr < sizeEnd ; sizePtr++ )
size += *sizePtr;
return size;
//int32_t *sizePtr =
// (int32_t *)((char *)this + m_firstStrPtrOffset+4*m_numStrings);
//int32_t *sizeEnd = sizePtr + m_numStrings;
}
// . return ptr to the buffer we serialize into
@ -4611,38 +4626,40 @@ char *Inlink::serialize ( int32_t *retSize ,
// copy the easy stuff
char *p = buf;
char *pend = buf + need;
memcpy ( p , (char *)this , getBaseSize() );
p += getBaseSize();
// then store the strings!
int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
int32_t *offPtr = getFirstOffPtr (); // &ptr_qbuf;
for ( ; sizePtr <= sizeEnd ; ) {
if ( p > pend ) { char *xx=NULL;*xx=0; }
// if we are NULL, we are a "bookmark", so
// we alloc'd space for it, but don't copy into
// the space until after this call to serialize()
// MDW: we can't use NULL now because we are offsets and 0 is
// legit. because of the 64bit conversion.
// well if empty, *sizePtr will be 0... so we don't need this.
//if ( *offPtr == -1 ) goto skip;
// sanity check -- cannot copy onto ourselves
if ( p > m_buf+*offPtr && p < m_buf+*offPtr + *sizePtr ) {
char *xx = NULL; *xx = 0; }
// copy the string into the buffer
memcpy ( p , m_buf + *offPtr , *sizePtr );
//skip:
// . make it point into the buffer now
// . MDW: why? that is causing problems for the re-call in
// Msg3a, it calls this twice with the same "m_r"
// . MDW: took out for 64bit
//if ( makePtrsRefNewBuf ) *offPtr = (p-buf);
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
sizePtr++;
offPtr++;
}
memcpy ( p , (char *)this , need );
p += need;
if ( p != pend ) { char *xx=NULL;*xx=0; }
// int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
// int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
// int32_t *offPtr = getFirstOffPtr (); // &ptr_qbuf;
// for ( ; sizePtr <= sizeEnd ; ) {
// if ( p > pend ) { char *xx=NULL;*xx=0; }
// // if we are NULL, we are a "bookmark", so
// // we alloc'd space for it, but don't copy into
// // the space until after this call to serialize()
// // MDW: we can't use NULL now because we are offsets and 0 is
// // legit. because of the 64bit conversion.
// // well if empty, *sizePtr will be 0... so we don't need this.
// //if ( *offPtr == -1 ) goto skip;
// // sanity check -- cannot copy onto ourselves
// if ( p > m_buf+*offPtr && p < m_buf+*offPtr + *sizePtr ) {
// char *xx = NULL; *xx = 0; }
// // copy the string into the buffer
// memcpy ( p , m_buf + *offPtr , *sizePtr );
// //skip:
// // . make it point into the buffer now
// // . MDW: why? that is causing problems for the re-call in
// // Msg3a, it calls this twice with the same "m_r"
// // . MDW: took out for 64bit
// //if ( makePtrsRefNewBuf ) *offPtr = (p-buf);
// // advance our destination ptr
// p += *sizePtr;
// // advance both ptrs to next string
// sizePtr++;
// offPtr++;
// }
return buf;
}

@ -733,8 +733,8 @@ class LinkInfo {
public:
int32_t getStoredSize ( ) { return m_size; };
int32_t getSize ( ) { return m_size; };
int32_t getStoredSize ( ) { return m_lisize; };
int32_t getSize ( ) { return m_lisize; };
time_t getLastUpdated ( ) { return m_lastUpdated; };
//int32_t getNumTotalInlinks ( ) {
@ -798,7 +798,8 @@ class LinkInfo {
char m_numInlinksInternal;
char m_reserved1; // was m_siteRootQuality
char m_reserved2;
int32_t m_size;
// includes Inlinks in m_buf[] below
int32_t m_lisize;
time_t m_lastUpdated;
// this is precisely how many inlinks we stored in m_buf[] below
int32_t m_numStoredInlinks;//m_numTotalInlinks;
@ -827,18 +828,20 @@ class LinkInfo {
};
#define MAXINLINKSTRINGBUFSIZE 2048
class Inlink { // : public Msg {
public:
int32_t *getFirstSizeParm () { return &size_urlBuf; };
int32_t *getLastSizeParm () { return &size_rssItem; };
int32_t *getFirstOffPtr () { return &off_urlBuf; };
int32_t getBaseSize () { return sizeof(Inlink);};
char *getStringBuf () { return m_buf; };
//int32_t *getFirstSizeParm () { return &size_urlBuf; };
//int32_t *getLastSizeParm () { return &size_rssItem; };
//int32_t *getFirstOffPtr () { return &off_urlBuf; };
//int32_t getBaseSize () { return sizeof(Inlink);};
//char *getStringBuf () { return m_buf; };
int32_t getBaseNumStrings() {
return (char **)&size_urlBuf - (char **)&off_urlBuf; };
//int32_t getBaseNumStrings() {
// return (char **)&size_urlBuf - (char **)&off_urlBuf; };
// zero ourselves out
void reset() ;
@ -896,10 +899,10 @@ class Inlink { // : public Msg {
// . int32_t m_reserved1 ;
// . how many strings do we have?
// . makes it easy to add new strings later
uint16_t m_numStrings ;
uint16_t m_reserved_NumStrings ;
// . and where our first string ptr starts
// . allows us to set ourselves from an "old" Inlink
uint16_t m_firstStrPtrOffset ;
uint16_t m_reserved_FirstStrPtrOffset ;
uint16_t m_numOutlinks ;
// i guess no need to store this stuff if we are storing the url
@ -1029,7 +1032,7 @@ class Inlink { // : public Msg {
int32_t size_templateVector ;
char m_buf[0] ;
char m_buf[MAXINLINKSTRINGBUFSIZE] ;
};
// . this function is normally called like "info = makeLinkInfo()"

@ -717,7 +717,7 @@ int32_t Msg20Reply::deserialize ( ) {
strPtr++;
}
// sanity
if ( ptr_linkInfo && ((LinkInfo *)ptr_linkInfo)->m_size !=
if ( ptr_linkInfo && ((LinkInfo *)ptr_linkInfo)->m_lisize !=
size_linkInfo ) {
log("xmldoc: deserialize msg20 reply corruption error");
log("xmldoc: DO YOU NEED TO NUKE CACHEDB.DAT?????");

@ -3340,7 +3340,7 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
// and stale. Both are really only for BuzzLogic.
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
// sanity
if ( info && mr->size_linkInfo != info->m_size ){char *xx=NULL;*xx=0; }
if ( info && mr->size_linkInfo!=info->m_lisize ){char *xx=NULL;*xx=0; }
// NULLify if empty
if ( mr->size_linkInfo <= 0 ) info = NULL;
// do not bother if none
@ -7408,7 +7408,7 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , int32_t catId ,
sb->htmlEncode ( qstr , qlen , false );
// if it was an advanced search, this can be empty
if ( qlen == 0 && si->m_displayQuery )
if ( qlen == 0 && si && si->m_displayQuery )
sb->htmlEncode ( si->m_displayQuery );
sb->safePrintf ("\">"

@ -2714,6 +2714,11 @@ int64_t Rdb::getNumGlobalRecs ( ) {
// . return number of positive records - negative records
int64_t Rdb::getNumTotalRecs ( bool useCache ) {
// are we catdb or statsdb? then we have no associated collections
// because we are used globally, by all collections
if ( m_isCollectionLess )
return m_collectionlessBase->getNumTotalRecs();
// this gets slammed w/ too many collections so use a cache...
//if ( g_collectiondb.m_numRecsUsed > 10 ) {
int32_t now = 0;

@ -795,7 +795,7 @@ bool RdbCache::addRecord ( collnum_t collnum ,
if ( need >= m_totalBufSize )
return log(LOG_INFO,
"db: Could not fit record of %"INT32" bytes into %s "
"cache. Max size is %"INT64".",need,m_dbname,
"cache. Max size is %"INT32".",need,m_dbname,
m_totalBufSize);
if ( need >= BUFSIZE )
return log(LOG_INFO,

@ -334,7 +334,7 @@ class RdbCache {
char *m_bufs [32];
int32_t m_bufSizes [32]; // size of the alloc'd space
int32_t m_numBufs;
int64_t m_totalBufSize;
int32_t m_totalBufSize; // gbpwrite() assumes 32 bits
int32_t m_offset; // where next rec is stored
int32_t m_tail; // next rec to delete
@ -358,7 +358,7 @@ class RdbCache {
bool m_useDisk; // load/save from disk?
// have we wrapped yet?
bool m_wrapped;
int8_t m_wrapped;
// keySize of cache keys in bytes
char m_cks;

@ -3240,7 +3240,7 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
len2 = k2->size_linkText - 1; // exclude \0
if ( len1 != len2 )
goto changed;
if ( memcmp(s1,s2,len1) != 0 )
if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
goto changed;
}
// no change in link text, look for change in page content now
@ -13562,7 +13562,7 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( cr->m_isCustomCrawl ) {
m_linkInfo1Valid = true;
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_size = sizeof(LinkInfo);
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
@ -13576,7 +13576,7 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( *ip == 0 || *ip == -1 ) {
m_linkInfo1Valid = true;
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_size = sizeof(LinkInfo);
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;

@ -946,7 +946,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<br><b>Searching DMOZ:</b>
<ul>
<li>Gigablast provides the unique ability to search the content of the pages in the DMOZ directory. But in order to search the pages in DMOZ we have to index them. You can't search what is not indexed.
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.) It will create several large files. Each file it creates is basically a VERY LARGE page of links and each link is a url in dmoz. Each of these files has a <i>&lt;meta name=spiderlinkslinks content=0&gt; special Gigablast meta tag that says NOT to follow the links OF THE LINKS. So it will just spider the outlinks on this massive page and then stop. Furthermore, the massive page also has a &lt;meta name=noindex content=1&gt; tag that tells Gigablast to not index this massive page itself, but only spider the outlinks.
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.) It will create several large files. Each file it creates is basically a VERY LARGE page of links and each link is a url in dmoz. Each of these files has a <i>&lt;meta name=spiderlinkslinks content=0&gt;</i> special Gigablast meta tag that says NOT to follow the links OF THE LINKS. So it will just spider the outlinks on this massive page and then stop. Furthermore, the massive page also has a &lt;meta name=noindex content=1&gt; tag that tells Gigablast to not index this massive page itself, but only spider the outlinks.
<br><b>$ ./dmozparse urldump -s</b>