Simplify the Inlink class in LinkInfo.cpp.
Fix some more 64-bit-related core dumps.
This commit is contained in:
@ -1194,7 +1194,10 @@ void DiskPageCache::writeToCache( int32_t bigOff, int32_t smallOff, void *inBuf
|
||||
memcpy(p + smallOff, inBuf, size);
|
||||
}
|
||||
|
||||
void DiskPageCache::readFromCache( void *outBuf, int32_t bigOff, int32_t smallOff,
|
||||
// . store cached disk info into "outBuf". up to "size" bytes of it.
|
||||
void DiskPageCache::readFromCache( void *outBuf,
|
||||
int32_t bigOff,
|
||||
int32_t smallOff,
|
||||
int32_t size ){
|
||||
#ifdef GBUSESHM
|
||||
if ( m_useSHM ) {
|
||||
|
139
Linkdb.cpp
139
Linkdb.cpp
@ -708,7 +708,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
|
||||
|
||||
// set up the hashtable if our first time
|
||||
if ( ! g_lineTable.isInitialized() )
|
||||
g_lineTable.set ( 8,4,256,NULL,0,false,MAX_NICENESS,"lht25");
|
||||
g_lineTable.set ( 8,sizeof(Msg25Request *),256,
|
||||
NULL,0,false,MAX_NICENESS,"lht25");
|
||||
|
||||
// . if already working on this same request, wait for it, don't
|
||||
// overload server with duplicate requests
|
||||
@ -976,7 +977,8 @@ bool Msg25::getLinkInfo2( char *site ,
|
||||
m_adBanTable.reset();
|
||||
m_adBanTable.set(4,0,0,NULL,0,false,m_niceness,"adbans");
|
||||
|
||||
m_table.set (4,4,0,NULL,0,false,m_niceness,"msg25tab");
|
||||
m_table.set (4,sizeof(NoteEntry *),0,
|
||||
NULL,0,false,m_niceness,"msg25tab");
|
||||
|
||||
QUICKPOLL(m_niceness);
|
||||
|
||||
@ -3934,7 +3936,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
|
||||
|
||||
// set our header
|
||||
info->m_version = 0;
|
||||
info->m_size = need;
|
||||
info->m_lisize = need;
|
||||
info->m_lastUpdated = lastUpdateTime;//getTimeGlobal();
|
||||
// how many Inlinks we stored in info->m_buf[]
|
||||
info->m_numStoredInlinks = count;
|
||||
@ -4024,8 +4026,8 @@ LinkInfo *makeLinkInfo ( char *coll ,
|
||||
return info;
|
||||
}
|
||||
|
||||
static Inlink s_inlink;
|
||||
static Inlink *s_orig;
|
||||
static Inlink s_inlink;
|
||||
|
||||
// if we are an old version, we have to set s_inlink and return
|
||||
// a ptr to that
|
||||
@ -4037,16 +4039,16 @@ Inlink *LinkInfo::getNextInlink ( Inlink *k ) {
|
||||
// if none, we are done
|
||||
if ( ! p ) return p;
|
||||
// sanity checks
|
||||
if (p->m_numStrings==0 && p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
|
||||
if (p->m_numStrings && p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
|
||||
//if(p->m_numStrings==0&& p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
|
||||
//if(p->m_numStrings&& p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
|
||||
// fix this for the really old guy. we did not store these two
|
||||
// things initially, but they should have been set to this...
|
||||
// luckily, we had a "reserved1" int32_t...
|
||||
if ( p->m_numStrings == 0 ) {
|
||||
// urlBuf,linkText,surroundingText,rssItem
|
||||
p->m_numStrings = 4;
|
||||
p->m_firstStrPtrOffset = 64;
|
||||
}
|
||||
// if ( p->m_numStrings == 0 ) {
|
||||
// // urlBuf,linkText,surroundingText,rssItem
|
||||
// p->m_numStrings = 4;
|
||||
// p->m_firstStrPtrOffset = 64;
|
||||
// }
|
||||
// MDW: now we just use offsets for 64bit conversion so no ptrs...
|
||||
// if latest, return that
|
||||
//if ( p->m_numStrings == p->getBaseNumStrings() &&
|
||||
@ -4078,7 +4080,7 @@ Inlink *LinkInfo::getNextInlink2 ( Inlink *k ) {
|
||||
// get the inlink to return
|
||||
Inlink *next = (Inlink *)((char *)k + size);
|
||||
// return NULL if breached
|
||||
if ( (char *)next >= ((char *)this)+m_size ) return NULL;
|
||||
if ( (char *)next >= ((char *)this)+m_lisize ) return NULL;
|
||||
// otherwise, we are still good
|
||||
return next;
|
||||
}
|
||||
@ -4337,9 +4339,9 @@ void Inlink::set ( Msg20Reply *r ) {
|
||||
|
||||
// . these two things are used for version-based deserializing
|
||||
// . our current version has 5 strings
|
||||
m_numStrings = getBaseNumStrings();
|
||||
//m_numStrings = getBaseNumStrings();
|
||||
// and our current string offset
|
||||
m_firstStrPtrOffset = (char *)getFirstOffPtr() - (char *)this;
|
||||
//m_firstStrPtrOffset = (char *)getFirstOffPtr() - (char *)this;
|
||||
|
||||
// set ourselves now
|
||||
m_ip = r->m_ip;
|
||||
@ -4525,7 +4527,7 @@ int32_t Inlink::updateStringPtrs ( char *buf ) {
|
||||
|
||||
void Inlink::reset ( ) {
|
||||
// clear ourselves out
|
||||
memset ( (char *)this,0,sizeof(Inlink) );
|
||||
memset ( (char *)this,0,sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE);
|
||||
}
|
||||
|
||||
// . set a new Inlink from an older versioned Inlink
|
||||
@ -4534,14 +4536,17 @@ void Inlink::set2 ( Inlink *old ) {
|
||||
// clear ourselves
|
||||
reset();
|
||||
// copy what is legit to us
|
||||
int fullSize = sizeof(Inlink);
|
||||
//int fullSize = sizeof(Inlink);
|
||||
// add in the sizes of all strings
|
||||
int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
|
||||
int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
|
||||
for ( ; sizePtr <= sizeEnd ; sizePtr++ )
|
||||
fullSize += *sizePtr;
|
||||
//int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
|
||||
//int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
|
||||
//for ( ; sizePtr <= sizeEnd ; sizePtr++ )
|
||||
// fullSize += *sizePtr;
|
||||
|
||||
int fullSize = old->getStoredSize();
|
||||
// return how many bytes we processed
|
||||
memcpy ( (char *)this , (char *)old , fullSize );
|
||||
|
||||
return;
|
||||
|
||||
// this old way is pre-64bit
|
||||
@ -4577,18 +4582,28 @@ void Inlink::set2 ( Inlink *old ) {
|
||||
int32_t Inlink::getStoredSize ( ) {
|
||||
//int32_t size = (int32_t)sizeof(Msg);
|
||||
//int32_t size = getBaseSize();
|
||||
int32_t size = m_firstStrPtrOffset;
|
||||
// add in string offsets AND size ptrs
|
||||
size += 8 * m_numStrings;
|
||||
int32_t size = sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE;
|
||||
|
||||
size += size_urlBuf;
|
||||
size += size_linkText;
|
||||
size += size_surroundingText;
|
||||
size += size_rssItem;
|
||||
size += size_categories;
|
||||
size += size_gigabitQuery;
|
||||
size += size_templateVector;
|
||||
|
||||
return size;
|
||||
// add in string offsets AND size, 4 bytes each
|
||||
//size += 8 * m_numStrings;
|
||||
// start of first offset
|
||||
// int32_t *sizePtr = &size_urlBuf;
|
||||
// int32_t *sizeEnd = (int32_t *)((char *)this + sizeof(Inlink));
|
||||
// add up string buffer sizes
|
||||
//int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
|
||||
//int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
|
||||
int32_t *sizePtr =
|
||||
(int32_t *)((char *)this + m_firstStrPtrOffset+4*m_numStrings);
|
||||
int32_t *sizeEnd = sizePtr + m_numStrings;
|
||||
for ( ; sizePtr < sizeEnd ; sizePtr++ )
|
||||
size += *sizePtr;
|
||||
return size;
|
||||
//int32_t *sizePtr =
|
||||
// (int32_t *)((char *)this + m_firstStrPtrOffset+4*m_numStrings);
|
||||
//int32_t *sizeEnd = sizePtr + m_numStrings;
|
||||
}
|
||||
|
||||
// . return ptr to the buffer we serialize into
|
||||
@ -4611,38 +4626,40 @@ char *Inlink::serialize ( int32_t *retSize ,
|
||||
// copy the easy stuff
|
||||
char *p = buf;
|
||||
char *pend = buf + need;
|
||||
memcpy ( p , (char *)this , getBaseSize() );
|
||||
p += getBaseSize();
|
||||
// then store the strings!
|
||||
int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
|
||||
int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
|
||||
int32_t *offPtr = getFirstOffPtr (); // &ptr_qbuf;
|
||||
for ( ; sizePtr <= sizeEnd ; ) {
|
||||
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
||||
// if we are NULL, we are a "bookmark", so
|
||||
// we alloc'd space for it, but don't copy into
|
||||
// the space until after this call to serialize()
|
||||
// MDW: we can't use NULL now because we are offsets and 0 is
|
||||
// legit. because of the 64bit conversion.
|
||||
// well if empty, *sizePtr will be 0... so we don't need this.
|
||||
//if ( *offPtr == -1 ) goto skip;
|
||||
// sanity check -- cannot copy onto ourselves
|
||||
if ( p > m_buf+*offPtr && p < m_buf+*offPtr + *sizePtr ) {
|
||||
char *xx = NULL; *xx = 0; }
|
||||
// copy the string into the buffer
|
||||
memcpy ( p , m_buf + *offPtr , *sizePtr );
|
||||
//skip:
|
||||
// . make it point into the buffer now
|
||||
// . MDW: why? that is causing problems for the re-call in
|
||||
// Msg3a, it calls this twice with the same "m_r"
|
||||
// . MDW: took out for 64bit
|
||||
//if ( makePtrsRefNewBuf ) *offPtr = (p-buf);
|
||||
// advance our destination ptr
|
||||
p += *sizePtr;
|
||||
// advance both ptrs to next string
|
||||
sizePtr++;
|
||||
offPtr++;
|
||||
}
|
||||
memcpy ( p , (char *)this , need );
|
||||
p += need;
|
||||
|
||||
if ( p != pend ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
|
||||
// int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
|
||||
// int32_t *offPtr = getFirstOffPtr (); // &ptr_qbuf;
|
||||
// for ( ; sizePtr <= sizeEnd ; ) {
|
||||
// if ( p > pend ) { char *xx=NULL;*xx=0; }
|
||||
// // if we are NULL, we are a "bookmark", so
|
||||
// // we alloc'd space for it, but don't copy into
|
||||
// // the space until after this call to serialize()
|
||||
// // MDW: we can't use NULL now because we are offsets and 0 is
|
||||
// // legit. because of the 64bit conversion.
|
||||
// // well if empty, *sizePtr will be 0... so we don't need this.
|
||||
// //if ( *offPtr == -1 ) goto skip;
|
||||
// // sanity check -- cannot copy onto ourselves
|
||||
// if ( p > m_buf+*offPtr && p < m_buf+*offPtr + *sizePtr ) {
|
||||
// char *xx = NULL; *xx = 0; }
|
||||
// // copy the string into the buffer
|
||||
// memcpy ( p , m_buf + *offPtr , *sizePtr );
|
||||
// //skip:
|
||||
// // . make it point into the buffer now
|
||||
// // . MDW: why? that is causing problems for the re-call in
|
||||
// // Msg3a, it calls this twice with the same "m_r"
|
||||
// // . MDW: took out for 64bit
|
||||
// //if ( makePtrsRefNewBuf ) *offPtr = (p-buf);
|
||||
// // advance our destination ptr
|
||||
// p += *sizePtr;
|
||||
// // advance both ptrs to next string
|
||||
// sizePtr++;
|
||||
// offPtr++;
|
||||
// }
|
||||
return buf;
|
||||
}
|
||||
|
||||
|
29
Linkdb.h
29
Linkdb.h
@ -733,8 +733,8 @@ class LinkInfo {
|
||||
|
||||
public:
|
||||
|
||||
int32_t getStoredSize ( ) { return m_size; };
|
||||
int32_t getSize ( ) { return m_size; };
|
||||
int32_t getStoredSize ( ) { return m_lisize; };
|
||||
int32_t getSize ( ) { return m_lisize; };
|
||||
time_t getLastUpdated ( ) { return m_lastUpdated; };
|
||||
|
||||
//int32_t getNumTotalInlinks ( ) {
|
||||
@ -798,7 +798,8 @@ class LinkInfo {
|
||||
char m_numInlinksInternal;
|
||||
char m_reserved1; // was m_siteRootQuality
|
||||
char m_reserved2;
|
||||
int32_t m_size;
|
||||
// includes Inlinks in m_buf[] below
|
||||
int32_t m_lisize;
|
||||
time_t m_lastUpdated;
|
||||
// this is precisely how many inlinks we stored in m_buf[] below
|
||||
int32_t m_numStoredInlinks;//m_numTotalInlinks;
|
||||
@ -827,18 +828,20 @@ class LinkInfo {
|
||||
};
|
||||
|
||||
|
||||
#define MAXINLINKSTRINGBUFSIZE 2048
|
||||
|
||||
class Inlink { // : public Msg {
|
||||
|
||||
public:
|
||||
|
||||
int32_t *getFirstSizeParm () { return &size_urlBuf; };
|
||||
int32_t *getLastSizeParm () { return &size_rssItem; };
|
||||
int32_t *getFirstOffPtr () { return &off_urlBuf; };
|
||||
int32_t getBaseSize () { return sizeof(Inlink);};
|
||||
char *getStringBuf () { return m_buf; };
|
||||
//int32_t *getFirstSizeParm () { return &size_urlBuf; };
|
||||
//int32_t *getLastSizeParm () { return &size_rssItem; };
|
||||
//int32_t *getFirstOffPtr () { return &off_urlBuf; };
|
||||
//int32_t getBaseSize () { return sizeof(Inlink);};
|
||||
//char *getStringBuf () { return m_buf; };
|
||||
|
||||
int32_t getBaseNumStrings() {
|
||||
return (char **)&size_urlBuf - (char **)&off_urlBuf; };
|
||||
//int32_t getBaseNumStrings() {
|
||||
// return (char **)&size_urlBuf - (char **)&off_urlBuf; };
|
||||
|
||||
// zero ourselves out
|
||||
void reset() ;
|
||||
@ -896,10 +899,10 @@ class Inlink { // : public Msg {
|
||||
// . int32_t m_reserved1 ;
|
||||
// . how many strings do we have?
|
||||
// . makes it easy to add new strings later
|
||||
uint16_t m_numStrings ;
|
||||
uint16_t m_reserved_NumStrings ;
|
||||
// . and where our first string ptr starts
|
||||
// . allows us to set ourselves from an "old" Inlink
|
||||
uint16_t m_firstStrPtrOffset ;
|
||||
uint16_t m_reserved_FirstStrPtrOffset ;
|
||||
|
||||
uint16_t m_numOutlinks ;
|
||||
// i guess no need to store this stuff if we are storing the url
|
||||
@ -1029,7 +1032,7 @@ class Inlink { // : public Msg {
|
||||
int32_t size_templateVector ;
|
||||
|
||||
|
||||
char m_buf[0] ;
|
||||
char m_buf[MAXINLINKSTRINGBUFSIZE] ;
|
||||
};
|
||||
|
||||
// . this function is normally called like "info = makeLinkInfo()"
|
||||
|
@ -717,7 +717,7 @@ int32_t Msg20Reply::deserialize ( ) {
|
||||
strPtr++;
|
||||
}
|
||||
// sanity
|
||||
if ( ptr_linkInfo && ((LinkInfo *)ptr_linkInfo)->m_size !=
|
||||
if ( ptr_linkInfo && ((LinkInfo *)ptr_linkInfo)->m_lisize !=
|
||||
size_linkInfo ) {
|
||||
log("xmldoc: deserialize msg20 reply corruption error");
|
||||
log("xmldoc: DO YOU NEED TO NUKE CACHEDB.DAT?????");
|
||||
|
@ -3340,7 +3340,7 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
|
||||
// and stale. Both are really only for BuzzLogic.
|
||||
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
|
||||
// sanity
|
||||
if ( info && mr->size_linkInfo != info->m_size ){char *xx=NULL;*xx=0; }
|
||||
if ( info && mr->size_linkInfo!=info->m_lisize ){char *xx=NULL;*xx=0; }
|
||||
// NULLify if empty
|
||||
if ( mr->size_linkInfo <= 0 ) info = NULL;
|
||||
// do not bother if none
|
||||
@ -7408,7 +7408,7 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , int32_t catId ,
|
||||
sb->htmlEncode ( qstr , qlen , false );
|
||||
|
||||
// if it was an advanced search, this can be empty
|
||||
if ( qlen == 0 && si->m_displayQuery )
|
||||
if ( qlen == 0 && si && si->m_displayQuery )
|
||||
sb->htmlEncode ( si->m_displayQuery );
|
||||
|
||||
sb->safePrintf ("\">"
|
||||
|
5
Rdb.cpp
5
Rdb.cpp
@ -2714,6 +2714,11 @@ int64_t Rdb::getNumGlobalRecs ( ) {
|
||||
// . return number of positive records - negative records
|
||||
int64_t Rdb::getNumTotalRecs ( bool useCache ) {
|
||||
|
||||
// are we catdb or statsdb? then we have no associated collections
|
||||
// because we are used globally, by all collections
|
||||
if ( m_isCollectionLess )
|
||||
return m_collectionlessBase->getNumTotalRecs();
|
||||
|
||||
// this gets slammed w/ too many collections so use a cache...
|
||||
//if ( g_collectiondb.m_numRecsUsed > 10 ) {
|
||||
int32_t now = 0;
|
||||
|
@ -795,7 +795,7 @@ bool RdbCache::addRecord ( collnum_t collnum ,
|
||||
if ( need >= m_totalBufSize )
|
||||
return log(LOG_INFO,
|
||||
"db: Could not fit record of %"INT32" bytes into %s "
|
||||
"cache. Max size is %"INT64".",need,m_dbname,
|
||||
"cache. Max size is %"INT32".",need,m_dbname,
|
||||
m_totalBufSize);
|
||||
if ( need >= BUFSIZE )
|
||||
return log(LOG_INFO,
|
||||
|
@ -334,7 +334,7 @@ class RdbCache {
|
||||
char *m_bufs [32];
|
||||
int32_t m_bufSizes [32]; // size of the alloc'd space
|
||||
int32_t m_numBufs;
|
||||
int64_t m_totalBufSize;
|
||||
int32_t m_totalBufSize; // gbpwrite() assumes 32 bits
|
||||
int32_t m_offset; // where next rec is stored
|
||||
int32_t m_tail; // next rec to delete
|
||||
|
||||
@ -358,7 +358,7 @@ class RdbCache {
|
||||
bool m_useDisk; // load/save from disk?
|
||||
|
||||
// have we wrapped yet?
|
||||
bool m_wrapped;
|
||||
int8_t m_wrapped;
|
||||
|
||||
// keySize of cache keys in bytes
|
||||
char m_cks;
|
||||
|
@ -3240,7 +3240,7 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
|
||||
len2 = k2->size_linkText - 1; // exclude \0
|
||||
if ( len1 != len2 )
|
||||
goto changed;
|
||||
if ( memcmp(s1,s2,len1) != 0 )
|
||||
if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
|
||||
goto changed;
|
||||
}
|
||||
// no change in link text, look for change in page content now
|
||||
@ -13562,7 +13562,7 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
||||
if ( cr->m_isCustomCrawl ) {
|
||||
m_linkInfo1Valid = true;
|
||||
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
||||
s_dummy2.m_size = sizeof(LinkInfo);
|
||||
s_dummy2.m_lisize = sizeof(LinkInfo);
|
||||
ptr_linkInfo1 = &s_dummy2;
|
||||
size_linkInfo1 = sizeof(LinkInfo);
|
||||
return ptr_linkInfo1;
|
||||
@ -13576,7 +13576,7 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
||||
if ( *ip == 0 || *ip == -1 ) {
|
||||
m_linkInfo1Valid = true;
|
||||
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
|
||||
s_dummy2.m_size = sizeof(LinkInfo);
|
||||
s_dummy2.m_lisize = sizeof(LinkInfo);
|
||||
ptr_linkInfo1 = &s_dummy2;
|
||||
size_linkInfo1 = sizeof(LinkInfo);
|
||||
return ptr_linkInfo1;
|
||||
|
@ -946,7 +946,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
|
||||
<br><b>Searching DMOZ:</b>
|
||||
<ul>
|
||||
<li>Gigablast provides the unique ability to search the content of the pages in the DMOZ directory. But in order to search the pages in DMOZ we have to index them. You can't search what is not indexed.
|
||||
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.) It will create several large files. Each file it creates is basically a VERY LARGE page of links and each link is a url in dmoz. Each of these files has a <i><meta name=spiderlinkslinks content=0> special Gigablast meta tag that says NOT to follow the links OF THE LINKS. So it will just spider the outlinks on this massive page and then stop. Furthermore, the massive page also has a <meta name=noindex content=1> tag that tells Gigablast to not index this massive page itself, but only spider the outlinks.
|
||||
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.) It will create several large files. Each file it creates is basically a VERY LARGE page of links and each link is a url in dmoz. Each of these files has a <i><meta name=spiderlinkslinks content=0></i> special Gigablast meta tag that says NOT to follow the links OF THE LINKS. So it will just spider the outlinks on this massive page and then stop. Furthermore, the massive page also has a <meta name=noindex content=1> tag that tells Gigablast to not index this massive page itself, but only spider the outlinks.
|
||||
|
||||
<br><b>$ ./dmozparse urldump -s</b>
|
||||
|
||||
|
Reference in New Issue
Block a user