Trash files of length 0 that are holding up a merge.

If we can't merge files, we end up stockpiling them and things get slow fast.
Matt Wells 2016-05-13 13:21:43 -07:00
parent dfca68ec46
commit 39e621f655
3 changed files with 261 additions and 34 deletions

@@ -762,15 +762,30 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum ,
f->set ( getDir() , name , NULL ); // getStripeDir() );
// if new insure does not exist
// if new ensure does not exist
if ( isNew && f->doesExist() ) {
log("rdb: creating NEW file %s/%s which already exists!",
f->getDir(),
f->getFilename());
mdelete ( f , sizeof(BigFile),"RdbBFile");
delete (f);
return -1;
char *xx=NULL;*xx=0;
// if length is NOT 0 then that sucks, just return
if ( f->getFileSize() != 0 ) {
mdelete ( f , sizeof(BigFile),"RdbBFile");
delete (f);
return -1;
char *xx=NULL;*xx=0;
}
// otherwise, move it to the trash
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
f->getDir(),
f->getFilename(),
g_hostdb.m_dir);
log("rdb: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
// ok, now re-set it since it is no longer there
f->reset();
// and set it again
f->set ( getDir() , name , NULL );
}
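
For readers outside the codebase: the new path checks the on-disk size of the leftover file and, if it is empty, shoves it into the trash directory before re-creating it. A minimal standalone sketch of that idea, assuming POSIX stat()/rename(); the helper name and paths are hypothetical, and the commit itself builds an "mv" command in a SafeBuf and runs it with gbsystem() instead:

#include <cstdio>
#include <string>
#include <sys/stat.h>

// Hypothetical helper: if "path" exists and is empty, move it into
// "trashDir" so a fresh file of the same name can be created.
// Returns false when the file exists with real data (caller should bail,
// as the commit does with its return -1).
static bool trashIfZeroLength ( const std::string &path ,
                                const std::string &trashDir ) {
    struct stat st;
    if ( stat ( path.c_str() , &st ) != 0 ) return true;  // nothing there
    if ( st.st_size != 0 ) return false;                  // real data, bail
    std::string dest = trashDir + "/" + path.substr ( path.rfind('/') + 1 );
    // rename() is atomic but only works within one filesystem; the commit
    // shells out to "mv", which also handles a trash dir on another mount.
    return rename ( path.c_str() , dest.c_str() ) == 0;
}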

@@ -770,8 +770,8 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
// bad url in spider request?
if ( g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ){
SpiderRequest *sr = (SpiderRequest *)rec;
if ( strncmp(sr->m_url,"http",4) != 0 ) {
log("db: spider req url");
if ( sr->isCorrupt() ) {
log("db: spider req corrupt");
char *xx=NULL;*xx=0;
}
}
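
The isCorrupt() call replaces the bare "http" prefix test, so that legitimate docid-style requests (page reindex) are not treated as corrupt. A condensed sketch of the check, following the SpiderRequest::isCorrupt hunk at the bottom of this diff (the free-function form and its parameters are just for illustration):

#include <cctype>

// Condensed form of the url sanity check; isPageReindex / urlIsDocId
// mirror SpiderRequest::m_isPageReindex and m_urlIsDocId.
static bool urlLooksCorrupt ( const char *url ,
                              bool isPageReindex ,
                              bool urlIsDocId ) {
    // a normal spider request carries an http(s):// url
    if ( url[0]=='h' && url[1]=='t' && url[2]=='t' && url[3]=='p' )
        return false;
    // anything else has to be a bare docid, which is only legal for
    // page-reindex / docid-based requests
    if ( ! isPageReindex && ! urlIsDocId ) return true;
    // and a docid starts with a digit (the real method logs which branch
    // fired; its further checks are omitted here)
    if ( ! isdigit ( (unsigned char)url[0] ) ) return true;
    return false;
}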

@@ -13599,9 +13599,31 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
int64_t reqUh48 = 0LL;
int64_t repUh48 = 0LL;
SpiderReply *oldRep = NULL;
SpiderRequest *oldReq = NULL;
//SpiderRequest *oldReq = NULL;
char *lastKey = NULL;
char *prevLastKey = NULL;
int32_t oldSize = list->m_listSize;
int32_t corrupt = 0;
// debug
// static int32_t s_count = 0;
// s_count++;
// if ( s_count == 2524 )
// log("gotit");
int32_t numToFilter = 0;
class Link {
public:
uint32_t m_srh;
SpiderRequest *m_sreq;
class Link *m_prev;
class Link *m_next;
};
#define MAXLINKS 30
Link *headLink = NULL;
Link *tailLink = NULL;
Link links[MAXLINKS];
int32_t numLinks = 0;
// save list ptr in case of re-read?
//char *saved = list->m_listPtr;
@@ -13624,10 +13646,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
log("spider: filter got negative key");
char *xx=NULL;*xx=0;
}
// save this
prevLastKey = lastKey;
lastKey = dst;
// otherwise, keep it
lastKey = dst;
memmove ( dst , rec , sizeof(key128_t) );
dst += sizeof(key128_t);
continue;
@@ -13655,15 +13675,13 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
continue;
// otherwise, erase him
dst = restorePoint;
lastKey = prevLastKey;
}
// save in case we get erased
restorePoint = dst;
prevLastKey = lastKey;
lastKey = dst;
// get our size
int32_t recSize = srep->getRecSize();
// and add us
lastKey = dst;
memmove ( dst , rec , recSize );
// advance
dst += recSize;
@@ -13677,6 +13695,12 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
// shortcut
SpiderRequest *sreq = (SpiderRequest *)rec;
// might as well filter out corruption
if ( sreq->isCorrupt() ) {
corrupt += sreq->getRecSize();
continue;
}
// shortcut
int64_t uh48 = sreq->getUrlHash48();
@@ -13714,26 +13738,15 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
}
// if we are not the same url as last request, add it
// if we are not the same url as last request, then
// we will not need to dedup, but should add ourselves to
// the linked list, which we also reset here.
if ( uh48 != reqUh48 ) {
// a nice hook in
addIt:
// save in case we get erased
restorePoint = dst;
prevLastKey = lastKey;
// get our size
int32_t recSize = sreq->getRecSize();
// save this
lastKey = dst;
// and add us
memmove ( dst , rec , recSize );
// advance
dst += recSize;
// update this crap for comparing to next reply
reqUh48 = uh48;
oldReq = sreq;
// get next spiderdb record
continue;
numLinks = 0;
headLink = NULL;
tailLink = NULL;
// we are the new banner carrier
reqUh48 = uh48;
}
// try to kinda grab the min hop count as well
@@ -13745,9 +13758,140 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
// oldReq->m_hopCount = sreq->m_hopCount;
// }
// why does sitehash32 matter really?
uint32_t srh = sreq->m_siteHash32;
if ( sreq->m_isNewOutlink ) srh ^= 0xb714d3a3;
if ( sreq->m_isInjecting ) srh ^= 0x42538909;
if ( sreq->m_hasContent ) srh ^= 0xbbbefd59;
if ( sreq->m_isAddUrl ) srh ^= 0x587c5a0b;
if ( sreq->m_isPageReindex ) srh ^= 0x70fb3911;
if ( sreq->m_forceDelete ) srh ^= 0x4e6e9aee;
if ( sreq->m_parentIsSiteMap ) srh ^= 0xe0c20e3f;
if ( sreq->m_urlIsDocId ) srh ^= 0xee015b07;
if ( sreq->m_fakeFirstIp ) srh ^= 0x95b8d376;
if ( sreq->m_parentIsRSS ) srh ^= 0xb08c7545;
if ( sreq->m_parentIsPermalink ) srh ^= 0xbd688268;
if ( sreq->m_parentIsPingServer ) srh ^= 0xb4c8a811;
if ( sreq->m_isMenuOutlink ) srh ^= 0xd97bb80b;
// we may assign url filter priority based on parent langid
srh ^= (uint32_t)g_hashtab[0][(uint8_t)sreq->m_parentLangId];
// if he's essentially different input parms but for the
// same url, we want to keep him because he might map the
// url to a different url priority!
bool skipUs = false;
Link *myLink = NULL;
Link *link = headLink;
// debug point. should be deduped by
// if ( sreq->m_key.n0==7199823231990374913LL &&
// sreq->m_key.n1==6511615362168588088 )
// log("hey1");
// if ( sreq->m_key.n0==7199823542662487041LL &&
// sreq->m_key.n1==6511615362168588088 )
// log("hey2");
// now we keep a list of the last ten
for ( ; link ; link = link->m_next ) {
if ( srh != link->m_srh ) continue;
SpiderRequest *prevReq = link->m_sreq;
// if we are better, replace him and stop
if ( sreq->m_hopCount < prevReq->m_hopCount )
goto replacePrevReq;
// skip us if previous guy is better
if ( sreq->m_hopCount > prevReq->m_hopCount ) {
skipUs = true;
break;
}
// TODO: for pro, base on parentSiteNumInlinks here,
// and hash hopcounts, but only 0,1,2,3. use 3
// for all that are >=3. we can also have two hashes,
// m_srh and m_srh2 in the Link class, and if your
// new secondary hash is unique we can let you in
// if your parentpageinlinks is the highest of all.
// resort to added time if hopcount is tied
// . if the same check who has the most recentaddedtime
// . if we are not the most recent, just do not add us
// . no, now i want the oldest so we can do
// gbssDiscoveryTime and set sreq->m_discoveryTime
// accurately, above
if ( sreq->m_addedTime >= prevReq->m_addedTime ) {
skipUs = true;
break;
}
// otherwise, replace him
replacePrevReq:
if ( prevReq->m_url[0] != 'h' ) { char *xx=NULL;*xx=0;}
prevReq->m_url[0] = 'x'; // mark for removal. xttp://
myLink = link;
// make a note of this so we physically remove these
// entries after we are done with this scan.
numToFilter++;
goto promoteLinkToHead;
}
// if we were not as good as someone that was basically the
// same SpiderRequest before us, keep going
if ( skipUs )
continue;
// add to linked list
if ( numLinks < MAXLINKS ) {
myLink = &links[numLinks++];
myLink->m_prev = NULL;
myLink->m_next = NULL;
// if first one, we are head and tail
if ( numLinks == 1 ) {
headLink = myLink;
tailLink = myLink;
}
}
// if full, just supplant the tail link
else
myLink = tailLink;
promoteLinkToHead:
myLink->m_srh = srh;
myLink->m_sreq = (SpiderRequest *)dst;//sreq;
// move link to head if not already
if ( myLink != headLink ) {
// if we are the tail, there will be a new tail
if ( myLink == tailLink ) tailLink = myLink->m_prev;
// make previous link point over us
if ( myLink->m_prev )
myLink->m_prev->m_next = myLink->m_next;
// make next link ptr point backward over us
if ( myLink->m_next )
myLink->m_next->m_prev = myLink->m_prev;
// make current head point backward to us
headLink->m_prev = myLink;
// and we point forward to him
myLink->m_next = headLink;
// and backward to nobody
myLink->m_prev = NULL;
// and we are the head now
headLink = myLink;
}
// get our size
int32_t recSize = sreq->getRecSize();
// and add us
lastKey = dst;
memmove ( dst , rec , recSize );
// advance
dst += recSize;
// get next spiderdb record
continue;
/*
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
// use hopcount now too!
@@ -13771,11 +13915,14 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
// . no, now i want the oldest so we can do gbssDiscoveryTime
// and set sreq->m_discoveryTime accurately, above
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
// otherwise, erase over him
dst = restorePoint;
lastKey = prevLastKey;
// and add us over top of him
goto addIt;
*/
}
// free the old list
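
Stepping back from the diff: the dedup now keeps, for the current url hash (uh48), a fixed-capacity move-to-front list of the last MAXLINKS distinct flag hashes (srh). A hit either drops the incoming request or marks the previously kept copy for removal, and the winning node is promoted to the head; when the pool is full the tail node is recycled. A stripped-down sketch of that structure, with the SpiderRequest specifics replaced by a generic payload (all names here are illustrative, not from the codebase):

#include <cstdint>

// Fixed-capacity move-to-front list, shaped like the Link/MAXLINKS code
// above but with the spiderdb specifics stripped out.
struct Node {
    uint32_t key;      // plays the role of m_srh
    void    *payload;  // plays the role of m_sreq (points at the kept copy)
    Node    *prev;
    Node    *next;
};

struct MtfList {
    enum { CAP = 30 };          // MAXLINKS in the commit
    Node  pool[CAP];
    int   used = 0;
    Node *head = nullptr;
    Node *tail = nullptr;

    // called when a new url hash starts (uh48 changes)
    void reset ( ) { used = 0; head = tail = nullptr; }

    Node *find ( uint32_t key ) {
        for ( Node *n = head ; n ; n = n->next )
            if ( n->key == key ) return n;
        return nullptr;
    }

    // same pointer surgery as the promoteLinkToHead label above
    void promote ( Node *n ) {
        if ( n == head ) return;
        if ( n == tail ) tail = n->prev;
        if ( n->prev ) n->prev->next = n->next;
        if ( n->next ) n->next->prev = n->prev;
        head->prev = n;
        n->next    = head;
        n->prev    = nullptr;
        head       = n;
    }

    // add a new key, recycling the tail node when the pool is full
    Node *insert ( uint32_t key , void *payload ) {
        Node *n = ( used < CAP ) ? &pool[used++] : tail;
        n->key     = key;
        n->payload = payload;
        if ( ! head ) { n->prev = n->next = nullptr; head = tail = n; }
        else {
            if ( n != tail ) n->prev = n->next = nullptr;  // fresh node
            promote ( n );
        }
        return n;
    }
};

On a hit, the commit compares hop counts and added times to choose between the skipUs and replacePrevReq branches; note that m_sreq is pointed at the copy already written at dst, which is what lets the losing record be marked ('h' -> 'x') in the output buffer for the sweep below.
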
@@ -13786,6 +13933,57 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
if ( dst < list->m_list || dst > list->m_list + list->m_listSize ) {
char *xx=NULL;*xx=0; }
/////////
//
// now remove xttp:// urls if we had some
//
/////////
if ( numToFilter > 0 ) {
// update list so for-loop below works
list->m_listSize = dst - newList;
list->m_listPtr = newList;//dst;
list->m_listEnd = list->m_list + list->m_listSize;
list->m_listPtrHi = NULL;
// and we'll re-write everything back into itself at "dst"
dst = newList;
}
for ( ; ! list->isExhausted() ; ) {
// breathe. NO! assume in thread!!
//QUICKPOLL(niceness);
// get rec
char *rec = list->getCurrentRec();
// pre skip it
list->skipCurrentRec();
// skip if negative, just copy over
if ( ( rec[0] & 0x01 ) == 0x00 ) {
lastKey = dst;
memmove ( dst , rec , sizeof(key128_t) );
dst += sizeof(key128_t);
continue;
}
// is it a reply?
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
SpiderReply *srep = (SpiderReply *)rec;
int32_t recSize = srep->getRecSize();
lastKey = dst;
memmove ( dst , rec , recSize );
dst += recSize;
continue;
}
SpiderRequest *sreq = (SpiderRequest *)rec;
// skip if filtered out
if ( sreq->m_url[0] == 'x' )
continue;
int32_t recSize = sreq->getRecSize();
lastKey = dst;
memmove ( dst , rec , recSize );
dst += recSize;
// if ( sreq->getUrlHash48() == 49553538838LL )
// log("napkins");
}
// and stick our newly filtered list in there
//list->m_list = newList;
list->m_listSize = dst - newList;
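
That for-loop is the sweep half of a two-pass mark-and-sweep over the flat record buffer: the first pass flips a doomed request's leading 'h' to 'x' (hence "xttp://"), and this pass rewrites the buffer over itself, skipping marked records. A generic sketch of the sweep, where recSize/isMarked stand in for getRecSize() and the m_url[0] == 'x' test (illustrative only):

#include <cstdint>
#include <cstring>

// In-place compaction of a flat buffer of variable-size records.
typedef int32_t (*RecSizeFn)  ( const char *rec );
typedef bool    (*IsMarkedFn) ( const char *rec );

static int32_t compactMarked ( char *buf , int32_t size ,
                               RecSizeFn recSize , IsMarkedFn isMarked ) {
    char *src = buf;
    char *end = buf + size;
    char *dst = buf;
    while ( src < end ) {
        int32_t rs = recSize ( src );
        if ( ! isMarked ( src ) ) {
            // memmove, not memcpy: dst trails src in the same buffer
            memmove ( dst , src , rs );
            dst += rs;
        }
        src += rs;
    }
    return dst - buf;   // new list size
}

The commit then patches m_listSize, m_listEnd, m_listPtrHi and m_lastKey to match the shrunken list, as the next hunk shows.
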
@@ -13797,6 +13995,15 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
list->m_listPtrHi = NULL;
//KEYSET(list->m_lastKey,lastKey,list->m_ks);
// log("spiderdb: remove ME!!!");
// // check it
// list->checkList_r(false,false,RDB_SPIDERDB);
// list->resetListPtr();
int32_t delta = oldSize - list->m_listSize;
log("spider: deduped %i bytes (of which %i were corrupted) "
"out of %i",(int)delta,(int)corrupt,(int)oldSize);
if ( lastKey ) KEYSET(list->m_lastKey,lastKey,list->m_ks);
//mfree ( oldbuf , oldSize, "oldspbuf");
@@ -15019,6 +15226,11 @@ bool SpiderRequest::isCorrupt ( ) {
if ( m_url[0] == 'h' && m_url[1]=='t' && m_url[2]=='t' &&
m_url[3] == 'p' )
return false;
// to be a docid as url must have this set
if ( ! m_isPageReindex && ! m_urlIsDocId ) {
log("spider: got corrupt 3 spiderRequest");
return true;
}
// might be a docid from a pagereindex.cpp
if ( ! is_digit(m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest");