Merge branch 'master' of github.com:privacore/open-source-search-engine

2015-11-17 16:17:27 +01:00
parent 2b3b840e4a cc53fd1c4b
commit 48c38a7bc4
5 changed files with 41 additions and 62 deletions
--- a/Hostdb.cpp
+++ b/Hostdb.cpp
@ -1668,22 +1668,33 @@ int32_t Hostdb::getHostIdWithSpideringEnabled ( uint32_t shardNum ) {
 	return hosts [ hostNum ].m_hostId ;
 }

-
-Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum ) {
+// . niceness 0 : skip hosts whose m_queryEnabled is false ('no query' hosts)
+// . niceness > 0: skip hosts whose m_spiderEnabled is false ('no spider' hosts)
+Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
 	int32_t minOutstandingRequests = 0x7fffffff;
 	int32_t minOutstandingRequestsIndex = -1;
 	Host *shard = getShard ( shardNum );
+	Host *bestDead = NULL;
 	for(int32_t i = 0; i < m_numHostsPerShard; i++) {
 		Host *hh = &shard[i];
+		// don't pick a 'no spider' host if niceness is > 0
+		if ( niceness >  0 && ! hh->m_spiderEnabled ) continue;
+		// don't pick a 'no query' host if niceness is 0
+		if ( niceness == 0 && ! hh->m_queryEnabled  ) continue;
+		if ( ! bestDead ) bestDead = hh;
 		if(isDead(hh)) continue;
 		// log("host %"INT32 " numOutstanding is %"INT32, hh->m_hostId, 
 		// 	hh->m_pingInfo.m_udpSlotsInUseIncoming);
-		if(hh->m_pingInfo.m_udpSlotsInUseIncoming > minOutstandingRequests) continue;
+		if ( hh->m_pingInfo.m_udpSlotsInUseIncoming > 
+		     minOutstandingRequests )
+			continue;

-		minOutstandingRequests = hh->m_pingInfo.m_udpSlotsInUseIncoming;
+		minOutstandingRequests =hh->m_pingInfo.m_udpSlotsInUseIncoming;
 		minOutstandingRequestsIndex = i;
 	}
-	if(minOutstandingRequestsIndex == -1) return shard;
+	// never return a host that is ineligible for this niceness: if no
+	// eligible host is alive, return bestDead (first eligible host, or NULL)
+	if(minOutstandingRequestsIndex == -1) return bestDead;//shard;
 	return &shard[minOutstandingRequestsIndex];
 }

--- a/Hostdb.h
+++ b/Hostdb.h
@ -450,7 +450,7 @@ class Hostdb {

 	//Host *getLiveHostInGroup ( int32_t groupId );
 	Host *getLiveHostInShard ( int32_t shardNum );
-	Host *getLeastLoadedInShard ( uint32_t shardNum );
+	Host *getLeastLoadedInShard ( uint32_t shardNum , char niceness );
 	int32_t getHostIdWithSpideringEnabled ( uint32_t shardNum );

 	// in the entire cluster. return host #0 if its alive, otherwise
--- a/Msg22.cpp
+++ b/Msg22.cpp
@ -4,6 +4,7 @@
 #include "Titledb.h"
 #include "UdpServer.h"

+
 static void handleRequest22 ( UdpSlot *slot , int32_t netnice ) ;

 Msg22Request::Msg22Request()
@ -165,46 +166,13 @@ bool Msg22::getTitleRec ( Msg22Request  *r              ,
 	if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
 	firstHostId = hosts [ hostNum ].m_hostId ;
 	*/
+	
+	Host *firstHost ;
+	// if niceness 0 can't pick noquery host.
+	// if niceness 1 can't pick nospider host.
+	firstHost = g_hostdb.getLeastLoadedInShard ( shardNum, r->m_niceness );
+	int32_t firstHostId = firstHost->m_hostId;

-	// get our group
-	int32_t  allNumHosts = g_hostdb.getNumHostsPerShard();
-	Host *allHosts    = g_hostdb.getShard ( shardNum );//Group ( groupId );
-
-	// put all alive hosts in this array
-	Host *cand[32];
-	int64_t  nc = 0;
-	for ( int32_t i = 0 ; i < allNumHosts ; i++ ) {
-		// get that host
-		Host *hh = &allHosts[i];
-		// skip if dead
-		if ( g_hostdb.isDead(hh) ) continue;
-		// add it if alive
-		cand[nc++] = hh;
-	}
-	// if none alive, make them all candidates then
-	bool allDead = (nc == 0);
-	for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ ) 
-		cand[nc++] = &allHosts[i];
-
-	// route based on docid region, not parity, because we want to hit
-	// the urldb page cache as much as possible
-	int64_t sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
-	// we mod by 1MB since tied scores resort to sorting by docid
-	// so we don't want to overload the host responsible for the lowest
-	// range of docids. CAUTION: do this for msg22 too!
-	// in this way we should still ensure a pretty good biased urldb
-	// cache... 
-	// . TODO: fix the urldb cache preload logic
-	int32_t hostNum = (docId % (128LL*1024*1024)) / sectionWidth;
-	if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
-	if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
-	int32_t firstHostId = cand [ hostNum ]->m_hostId ;
-
-	// while this prevents tfndb seeks, it also causes bottlenecks
-	// if one host is particularly slow, because load balancing is
-	// bypassed.
-	//if ( ! g_conf.m_useBiasedTfndb ) firstHostId = -1;
-	// flag it
 	m_outstanding = true;
 	r->m_inUse    = 1;

--- a/Tagdb.cpp
+++ b/Tagdb.cpp
@ -2803,24 +2803,15 @@ bool Msg8a::launchGetRequests ( ) {
 	//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
 	//Host *group = g_hostdb.getGroup ( gid );
 	int32_t shardNum = getShardNum ( m_rdbId , &startKey );//, true );
-	Host *group = g_hostdb.getShard ( shardNum );
-
-	//int32_t numTwins = g_hostdb.getNumHostsPerShard();
-	// use top byte!
-	uint8_t *sks = (uint8_t *)&startKey;
-	uint8_t top = sks[sizeof(TAGDB_KEY)-1];
-	//int32_t hostNum = 0;
-	//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
-	// TODO: fix this!
-	//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
-	// support more than 2 stripes now...
-	int32_t hostNum = top % g_hostdb.getNumHostsPerShard();
-	int32_t hostId = group[hostNum].m_hostId;
-
+	Host *firstHost ;
+	// if niceness 0 can't pick noquery host.
+	// if niceness 1 can't pick nospider host.
+	firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
+	int32_t firstHostId = firstHost->m_hostId;

 	// . launch this request, even if to ourselves
 	// . TODO: just use msg0!!
-	bool status = m->getList ( hostId     , // hostId
+	bool status = m->getList ( firstHostId     , // hostId
 				   0          , // ip
 				   0          , // port
 				   0          , // maxCacheAge
@ -2837,7 +2828,7 @@ bool Msg8a::launchGetRequests ( ) {
 				   true                , // error correction?
 				   true                , // include tree?
 				   true                , // doMerge?
-				   -1                  , // firstHostId
+				   firstHostId         , // firstHostId
 				   0                   , // startFileNum
 				   -1                  , // numFiles
 				   3600*24*365         );// timeout
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -20898,6 +20898,10 @@ char *XmlDoc::getIsSiteRoot ( ) {
 	if ( ! site || site == (char *)-1 ) return (char *)site;
 	// get our url without the http:// or https://
 	char *u = getFirstUrl()->getHost();
+	if ( ! u ) {
+		g_errno = EBADURL;
+		return NULL;
+	}
 	// assume valid now
 	m_isSiteRootValid = true;
 	// get it
@ -21808,7 +21812,12 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
 	// like how we index it, do not include the filename. so we can
 	// have a bunch of pathdepth 0 urls with filenames like xyz.com/abc.htm
 	if ( m_firstUrlValid ) {
-		int32_t pd = m_firstUrl.getPathDepth(false);
+		int32_t pd = -1;
+		// fix core
+		if ( m_firstUrl.m_url &&
+		     m_firstUrl.m_ulen > 0 &&
+		     m_firstUrl.m_path )
+			pd = m_firstUrl.getPathDepth(false);
 		sb->safePrintf("pathdepth=%"INT32" ",pd);
 	}
 	else {