simplify the Inlink class in Linkdb.cpp and Linkdb.h.

fix some more 64-bit related core dumps.
2014-11-18 16:50:31 -08:00
parent 0032052930
commit 2977845375
10 changed files with 113 additions and 85 deletions
--- a/DiskPageCache.cpp
+++ b/DiskPageCache.cpp
@ -1194,7 +1194,10 @@ void DiskPageCache::writeToCache( int32_t bigOff, int32_t smallOff,  void *inBuf
 	memcpy(p + smallOff, inBuf, size);
 }

-void DiskPageCache::readFromCache( void *outBuf, int32_t bigOff, int32_t smallOff,
+// . store cached disk info into "outBuf". up to "size" bytes of it.
+void DiskPageCache::readFromCache( void *outBuf, 
+				   int32_t bigOff, 
+				   int32_t smallOff,
 				   int32_t size ){
 #ifdef GBUSESHM
 	if ( m_useSHM ) {
--- a/Linkdb.cpp
+++ b/Linkdb.cpp
@ -708,7 +708,8 @@ void  handleRequest25 ( UdpSlot *slot , int32_t netnice ) {

 	// set up the hashtable if our first time
 	if ( ! g_lineTable.isInitialized() )
-		g_lineTable.set ( 8,4,256,NULL,0,false,MAX_NICENESS,"lht25");
+		g_lineTable.set ( 8,sizeof(Msg25Request *),256,
+				  NULL,0,false,MAX_NICENESS,"lht25");

 	// . if already working on this same request, wait for it, don't
 	//   overload server with duplicate requests
@ -976,7 +977,8 @@ bool Msg25::getLinkInfo2( char      *site                ,
 	m_adBanTable.reset();
 	m_adBanTable.set(4,0,0,NULL,0,false,m_niceness,"adbans");

-	m_table.set (4,4,0,NULL,0,false,m_niceness,"msg25tab");
+	m_table.set (4,sizeof(NoteEntry *),0,
+		     NULL,0,false,m_niceness,"msg25tab");

 	QUICKPOLL(m_niceness);

@ -3934,7 +3936,7 @@ LinkInfo *makeLinkInfo ( char        *coll                    ,
 	
 	// set our header
 	info->m_version                = 0;
-	info->m_size                   = need;
+	info->m_lisize                 = need;
 	info->m_lastUpdated            = lastUpdateTime;//getTimeGlobal();
 	// how many Inlinks we stored in info->m_buf[]
 	info->m_numStoredInlinks       = count;
@ -4024,8 +4026,8 @@ LinkInfo *makeLinkInfo ( char        *coll                    ,
 	return info;
 }

-static Inlink  s_inlink;
 static Inlink *s_orig;
+static Inlink  s_inlink;

 // if we are an old version, we have to set s_inlink and return
 // a ptr to that
@ -4037,16 +4039,16 @@ Inlink *LinkInfo::getNextInlink ( Inlink *k ) {
 	// if none, we are done
 	if ( ! p ) return p;
 	// sanity checks
-	if (p->m_numStrings==0 && p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
-	if (p->m_numStrings && p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
+	//if(p->m_numStrings==0&& p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
+	//if(p->m_numStrings&& p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
 	// fix this for the really old guy. we did not store these two
 	// things initially, but they should have been set to this...
 	// luckily, we had a "reserved1" int32_t...
-	if ( p->m_numStrings == 0 ) {
-		// urlBuf,linkText,surroudingText,rssItem
-		p->m_numStrings        = 4;
-		p->m_firstStrPtrOffset = 64;
-	}
+	// if ( p->m_numStrings == 0 ) {
+	// 	// urlBuf,linkText,surroundingText,rssItem
+	// 	p->m_numStrings        = 4;
+	// 	p->m_firstStrPtrOffset = 64;
+	// }
 	// MDW: now we just use offsets for 64bit conversion so no ptrs...
 	// if latest, return that
 	//if ( p->m_numStrings        == p->getBaseNumStrings() &&
@ -4078,7 +4080,7 @@ Inlink *LinkInfo::getNextInlink2 ( Inlink *k ) {
 	// get the inlink to return
 	Inlink *next = (Inlink *)((char *)k + size);
 	// return NULL if breached
-	if ( (char *)next >= ((char *)this)+m_size ) return NULL;
+	if ( (char *)next >= ((char *)this)+m_lisize ) return NULL;
 	// otherwise, we are still good
 	return next;
 }
@ -4337,9 +4339,9 @@ void Inlink::set ( Msg20Reply *r ) {

 	// . these two things are used for version-based deserializing
 	// . our current version has 5 strings
-	m_numStrings         = getBaseNumStrings();
+	//m_numStrings         = getBaseNumStrings();
 	// and our current string offset
-	m_firstStrPtrOffset  = (char *)getFirstOffPtr() - (char *)this;
+	//m_firstStrPtrOffset  = (char *)getFirstOffPtr() - (char *)this;

 	// set ourselves now
 	m_ip                 = r->m_ip;
@ -4525,7 +4527,7 @@ int32_t Inlink::updateStringPtrs ( char *buf ) {

 void Inlink::reset ( ) {
 	// clear ourselves out
-	memset ( (char *)this,0,sizeof(Inlink) );
+	memset ( (char *)this,0,sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE);
 }

 // . set a new Inlink from an older versioned Inlink
@ -4534,14 +4536,17 @@ void Inlink::set2 ( Inlink *old ) {
 	// clear ouselves
 	reset();
 	// copy what is legit to us
-	int fullSize = sizeof(Inlink);
+	//int fullSize = sizeof(Inlink);
 	// add in the sizes of all strings
-	int32_t  *sizePtr = getFirstSizeParm(); // &size_qbuf;
-	int32_t  *sizeEnd = getLastSizeParm (); // &size_displayMetas;
-	for ( ; sizePtr <= sizeEnd ;  sizePtr++ ) 
-		fullSize += *sizePtr;
+	//int32_t  *sizePtr = getFirstSizeParm(); // &size_qbuf;
+	//int32_t  *sizeEnd = getLastSizeParm (); // &size_displayMetas;
+	//for ( ; sizePtr <= sizeEnd ;  sizePtr++ ) 
+	//	fullSize += *sizePtr;
+
+	int fullSize = old->getStoredSize();
 	// return how many bytes we processed
 	memcpy ( (char *)this , (char *)old , fullSize );
+
 	return;

 	// this old way is pre-64bit
@ -4577,18 +4582,28 @@ void Inlink::set2 ( Inlink *old ) {
 int32_t Inlink::getStoredSize ( ) {
 	//int32_t size = (int32_t)sizeof(Msg);
 	//int32_t size = getBaseSize();
-	int32_t size = m_firstStrPtrOffset;
-	// add in string offsets AND size ptrs
-	size += 8 * m_numStrings;
+	int32_t size = sizeof(Inlink) -	MAXINLINKSTRINGBUFSIZE;
+
+	size += size_urlBuf;
+	size += size_linkText;
+	size += size_surroundingText;
+	size += size_rssItem;
+	size += size_categories;
+	size += size_gigabitQuery;
+	size += size_templateVector;
+
+	return size;
+	// add in string offsets AND size, 4 bytes each
+	//size += 8 * m_numStrings;
+	// start of first offset
+	// int32_t *sizePtr = &size_urlBuf;
+	// int32_t *sizeEnd = (int32_t *)((char *)this + sizeof(Inlink));
 	// add up string buffer sizes
 	//int32_t *sizePtr = getFirstSizeParm(); // &size_qbuf;
 	//int32_t *sizeEnd = getLastSizeParm (); // &size_displayMetas;
-	int32_t *sizePtr = 
-		(int32_t *)((char *)this + m_firstStrPtrOffset+4*m_numStrings);
-	int32_t *sizeEnd = sizePtr + m_numStrings;
-	for ( ; sizePtr < sizeEnd ; sizePtr++ )
-		size += *sizePtr;
-	return size;
+	//int32_t *sizePtr = 
+	//	(int32_t *)((char *)this + m_firstStrPtrOffset+4*m_numStrings);
+	//int32_t *sizeEnd = sizePtr + m_numStrings;
 }

 // . return ptr to the buffer we serialize into
@ -4611,38 +4626,40 @@ char *Inlink::serialize ( int32_t *retSize     ,
 	// copy the easy stuff
 	char *p = buf;
 	char *pend = buf + need;
-	memcpy ( p , (char *)this , getBaseSize() );
-	p += getBaseSize();
-	// then store the strings!
-	int32_t  *sizePtr = getFirstSizeParm(); // &size_qbuf;
-	int32_t  *sizeEnd = getLastSizeParm (); // &size_displayMetas;
-	int32_t  *offPtr  = getFirstOffPtr  (); // &ptr_qbuf;
-	for ( ; sizePtr <= sizeEnd ;  ) {
-		if ( p > pend ) { char *xx=NULL;*xx=0; }
-		// if we are NULL, we are a "bookmark", so
-		// we alloc'd space for it, but don't copy into
-		// the space until after this call toe serialize()
-		// MDW: we can't use NULL now because we are offsets and 0 is 
-		// legit. because of the 64bit conversion.
-		// well if empty, *sizePtr will be 0... so we don't need this.
-		//if ( *offPtr == -1 ) goto skip;
-		// sanity check -- cannot copy onto ourselves
-		if ( p > m_buf+*offPtr && p < m_buf+*offPtr + *sizePtr ) {
-			char *xx = NULL; *xx = 0; }
-		// copy the string into the buffer
-		memcpy ( p , m_buf + *offPtr , *sizePtr );
-		//skip:
-		// . make it point into the buffer now
-		// . MDW: why? that is causing problems for the re-call in
-		//   Msg3a, it calls this twice with the same "m_r"
-		// . MDW: took out for 64bit
-		//if ( makePtrsRefNewBuf ) *offPtr = (p-buf);
-		// advance our destination ptr
-		p += *sizePtr;
-		// advance both ptrs to next string
-		sizePtr++;
-		offPtr++;
-	}
+	memcpy ( p , (char *)this , need );
+	p += need;
+
+	if ( p != pend ) { char *xx=NULL;*xx=0; }
+
+	// int32_t  *sizePtr = getFirstSizeParm(); // &size_qbuf;
+	// int32_t  *sizeEnd = getLastSizeParm (); // &size_displayMetas;
+	// int32_t  *offPtr  = getFirstOffPtr  (); // &ptr_qbuf;
+	// for ( ; sizePtr <= sizeEnd ;  ) {
+	// 	if ( p > pend ) { char *xx=NULL;*xx=0; }
+	// 	// if we are NULL, we are a "bookmark", so
+	// 	// we alloc'd space for it, but don't copy into
+	// 	// the space until after this call to serialize()
+	// 	// MDW: we can't use NULL now because we are offsets and 0 is 
+	// 	// legit. because of the 64bit conversion.
+	// 	// well if empty, *sizePtr will be 0... so we don't need this.
+	// 	//if ( *offPtr == -1 ) goto skip;
+	// 	// sanity check -- cannot copy onto ourselves
+	// 	if ( p > m_buf+*offPtr && p < m_buf+*offPtr + *sizePtr ) {
+	// 		char *xx = NULL; *xx = 0; }
+	// 	// copy the string into the buffer
+	// 	memcpy ( p , m_buf + *offPtr , *sizePtr );
+	// 	//skip:
+	// 	// . make it point into the buffer now
+	// 	// . MDW: why? that is causing problems for the re-call in
+	// 	//   Msg3a, it calls this twice with the same "m_r"
+	// 	// . MDW: took out for 64bit
+	// 	//if ( makePtrsRefNewBuf ) *offPtr = (p-buf);
+	// 	// advance our destination ptr
+	// 	p += *sizePtr;
+	// 	// advance both ptrs to next string
+	// 	sizePtr++;
+	// 	offPtr++;
+	// }
 	return buf;
 }

--- a/Linkdb.h
+++ b/Linkdb.h
@ -733,8 +733,8 @@ class LinkInfo {

 public:

-	int32_t   getStoredSize  ( ) { return m_size; };
-	int32_t   getSize        ( ) { return m_size; };
+	int32_t   getStoredSize  ( ) { return m_lisize; };
+	int32_t   getSize        ( ) { return m_lisize; };
 	time_t getLastUpdated ( ) { return m_lastUpdated; };

 	//int32_t   getNumTotalInlinks   ( ) { 
@ -798,7 +798,8 @@ class LinkInfo {
 	char       m_numInlinksInternal;
 	char       m_reserved1; // was m_siteRootQuality
 	char       m_reserved2;
-	int32_t       m_size;
+	// includes Inlinks in m_buf[] below
+	int32_t       m_lisize;
 	time_t     m_lastUpdated;
 	// this is precisely how many inlinks we stored in m_buf[] below
 	int32_t       m_numStoredInlinks;//m_numTotalInlinks;
@ -827,18 +828,20 @@ class LinkInfo {
 };


+#define MAXINLINKSTRINGBUFSIZE 2048
+
 class Inlink { // : public Msg {

 public:

-	int32_t  *getFirstSizeParm () { return &size_urlBuf; };
-	int32_t  *getLastSizeParm  () { return &size_rssItem; };
-	int32_t  *getFirstOffPtr   () { return &off_urlBuf; };
-	int32_t   getBaseSize      () { return sizeof(Inlink);};
-	char  *getStringBuf     () { return m_buf; };
+	//int32_t  *getFirstSizeParm () { return &size_urlBuf; };
+	//int32_t  *getLastSizeParm  () { return &size_rssItem; };
+	//int32_t  *getFirstOffPtr   () { return &off_urlBuf; };
+	//int32_t   getBaseSize      () { return sizeof(Inlink);};
+	//char  *getStringBuf     () { return m_buf; };

-	int32_t getBaseNumStrings() { 
-		return (char **)&size_urlBuf - (char **)&off_urlBuf; };
+	//int32_t getBaseNumStrings() { 
+	//	return (char **)&size_urlBuf - (char **)&off_urlBuf; };
 	
 	// zero ourselves out
 	void reset() ;
@ -896,10 +899,10 @@ class Inlink { // : public Msg {
 	// . int32_t     m_reserved1           ;
 	// . how many strings do we have?
 	// . makes it easy to add new strings later
-	uint16_t   m_numStrings          ;
+	uint16_t   m_reserved_NumStrings          ;
 	// . and were our first string ptrs starts
 	// . allows us to set ourselves from an "old" Inlink 
-	uint16_t   m_firstStrPtrOffset   ;
+	uint16_t   m_reserved_FirstStrPtrOffset   ;

 	uint16_t   m_numOutlinks         ;
 	// i guess no need to store this stuff if we are storing the url
@ -1029,7 +1032,7 @@ class Inlink { // : public Msg {
 	int32_t       size_templateVector   ;


-	char       m_buf[0]              ;
+	char       m_buf[MAXINLINKSTRINGBUFSIZE] ;
 };

 // . this function is normally called like "info = makeLinkInfo()"
--- a/Msg20.cpp
+++ b/Msg20.cpp
@ -717,7 +717,7 @@ int32_t Msg20Reply::deserialize ( ) {
 		strPtr++;
 	}
 	// sanity
-	if ( ptr_linkInfo && ((LinkInfo *)ptr_linkInfo)->m_size !=
+	if ( ptr_linkInfo && ((LinkInfo *)ptr_linkInfo)->m_lisize !=
 		    size_linkInfo ) { 
 		log("xmldoc: deserialize msg20 reply corruption error");
 		log("xmldoc: DO YOU NEED TO NUKE CACHEDB.DAT?????");
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -3340,7 +3340,7 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
 	//   and stale. Both are really only for BuzzLogic.
 	LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
 	// sanity
-	if ( info && mr->size_linkInfo != info->m_size ){char *xx=NULL;*xx=0; }
+	if ( info && mr->size_linkInfo!=info->m_lisize ){char *xx=NULL;*xx=0; }
 	// NULLify if empty
 	if ( mr->size_linkInfo <= 0 ) info = NULL;
 	// do not both if none
@ -7408,7 +7408,7 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , int32_t catId ,
 	sb->htmlEncode ( qstr , qlen , false );

 	// if it was an advanced search, this can be empty
-	if ( qlen == 0 && si->m_displayQuery )
+	if ( qlen == 0 && si && si->m_displayQuery )
 		sb->htmlEncode ( si->m_displayQuery );

 	sb->safePrintf ("\">"
--- a/Rdb.cpp
+++ b/Rdb.cpp
@ -2714,6 +2714,11 @@ int64_t Rdb::getNumGlobalRecs ( ) {
 // . return number of positive records - negative records
 int64_t Rdb::getNumTotalRecs ( bool useCache ) {

+	// are we catdb or statsdb? then we have no associated collections
+	// because we are used globally, by all collections
+	if ( m_isCollectionLess )
+		return m_collectionlessBase->getNumTotalRecs();
+
 	// this gets slammed w/ too many collections so use a cache...
 	//if ( g_collectiondb.m_numRecsUsed > 10 ) {
 	int32_t now = 0;
--- a/RdbCache.cpp
+++ b/RdbCache.cpp
@ -795,7 +795,7 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 	if ( need >= m_totalBufSize )
 		return log(LOG_INFO,
 			   "db: Could not fit record of %"INT32" bytes into %s "
-			   "cache. Max size is %"INT64".",need,m_dbname,
+			   "cache. Max size is %"INT32".",need,m_dbname,
 			   m_totalBufSize);
 	if ( need >= BUFSIZE )
 		return log(LOG_INFO,
--- a/RdbCache.h
+++ b/RdbCache.h
@ -334,7 +334,7 @@ class RdbCache {
 	char      *m_bufs     [32];
 	int32_t       m_bufSizes [32]; // size of the alloc'd space
 	int32_t       m_numBufs;
-	int64_t  m_totalBufSize;
+	int32_t  m_totalBufSize; // gbpwrite() assumes 32 bits
 	int32_t       m_offset; // where next rec is stored
 	int32_t       m_tail;   // next rec to delete

@ -358,7 +358,7 @@ class RdbCache {
 	bool m_useDisk;  // load/save from disk?

 	// have we wrapped yet?
-	bool m_wrapped;
+	int8_t m_wrapped;

 	// keySize of cache keys in bytes
 	char m_cks;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -3240,7 +3240,7 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
 			len2 = k2->size_linkText - 1; // exclude \0
 			if ( len1 != len2 )
 				goto changed;
-			if ( memcmp(s1,s2,len1) != 0 )
+			if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
 				goto changed;
 		}
 		// no change in link text, look for change in page content now
@ -13562,7 +13562,7 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
 	if ( cr->m_isCustomCrawl ) {
 		m_linkInfo1Valid = true;
 		memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
-		s_dummy2.m_size = sizeof(LinkInfo);
+		s_dummy2.m_lisize = sizeof(LinkInfo);
 		ptr_linkInfo1  = &s_dummy2;
 		size_linkInfo1 = sizeof(LinkInfo);
 		return ptr_linkInfo1;
@ -13576,7 +13576,7 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
 	if ( *ip == 0 || *ip == -1 ) {
 		m_linkInfo1Valid = true;
 		memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
-		s_dummy2.m_size = sizeof(LinkInfo);
+		s_dummy2.m_lisize = sizeof(LinkInfo);
 		ptr_linkInfo1  = &s_dummy2;
 		size_linkInfo1 = sizeof(LinkInfo);
 		return ptr_linkInfo1;
--- a/html/faq.html
+++ b/html/faq.html
@ -946,7 +946,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
 <br><b>Searching DMOZ:</b>
 <ul>
 <li>Gigablast provides the unique ability to search the content of the pages in the DMOZ directory. But in order to search the pages in DMOZ we have to index them. You can't search what is not indexed. 
-So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.) It will create several large files. Each file it creates is basically a VERY LARGE page of links and each link is a url in dmoz. Each of these files has a <i>&lt;meta name=spiderlinkslinks content=0&gt; special Gigablast meta tag that says NOT to follow the links OF THE LINKS. So it will just spider the outlinks on this massive page and then stop. Furthermore, the massive page also has a &lt;meta name=noindex content=1&gt; tag that tells Gigablast to not index this massive page itself, but only spider the outlinks.
+So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.) It will create several large files. Each file it creates is basically a VERY LARGE page of links and each link is a url in dmoz. Each of these files has a <i>&lt;meta name=spiderlinkslinks content=0&gt;</i> special Gigablast meta tag that says NOT to follow the links OF THE LINKS. So it will just spider the outlinks on this massive page and then stop. Furthermore, the massive page also has a &lt;meta name=noindex content=1&gt; tag that tells Gigablast to not index this massive page itself, but only spider the outlinks.

 <br><b>$ ./dmozparse urldump -s</b>