forked from Mirrors/privacore-open-source-search-engine
got tag: support working in the sitelist and url filters
This commit is contained in:
115
PageBasic.cpp
115
PageBasic.cpp
@ -67,6 +67,9 @@ public:
|
||||
// offset of the url path in the pattern, 0 means none
|
||||
short m_pathOff;
|
||||
short m_pathLen;
|
||||
// offset into buffer. for 'tag:shallow site:walmart.com' type stuff
|
||||
long m_tagOff;
|
||||
short m_tagLen;
|
||||
};
|
||||
|
||||
|
||||
@ -180,7 +183,8 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
// skip to end of line marker
|
||||
for ( ; *pn && *pn != '\n' ; pn++ ) ;
|
||||
|
||||
char *start = s;
|
||||
// point to the pattern (skips over "tag:xxx " if there)
|
||||
char *patternStart = s;
|
||||
|
||||
// back p up over spaces in case ended in spaces
|
||||
char *pe = pn;
|
||||
@ -197,13 +201,15 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
bool isNeg = false;
|
||||
bool isFilter = true;
|
||||
|
||||
innerLoop:
|
||||
// skip spaces at start of line
|
||||
if ( *s == ' ' ) s++;
|
||||
for ( ; *s && *s == ' ' ; s++ );
|
||||
|
||||
// comment?
|
||||
if ( *s == '#' ) continue;
|
||||
|
||||
// empty line?
|
||||
if ( s[0] == '\r' && s[1] == '\n' ) { s++; continue; }
|
||||
|
||||
// empty line?
|
||||
if ( *s == '\n' ) continue;
|
||||
|
||||
@ -213,11 +219,11 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
// continue;
|
||||
//}
|
||||
|
||||
if ( *s == '-' ) {
|
||||
sc->m_siteListHasNegatives = true;
|
||||
isNeg = true;
|
||||
s++;
|
||||
}
|
||||
innerLoop:
|
||||
|
||||
// skip spaces
|
||||
for ( ; *s && *s == ' ' ; s++ );
|
||||
|
||||
|
||||
// exact:?
|
||||
//if ( strncmp(s,"exact:",6) == 0 ) {
|
||||
@ -233,6 +239,30 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
goto innerLoop;
|
||||
}
|
||||
|
||||
|
||||
// does it start with "tag:xxxxx "?
|
||||
char *tag = NULL;
|
||||
long tagLen = 0;
|
||||
if ( *s == 't' &&
|
||||
s[1] == 'a' &&
|
||||
s[2] == 'g' &&
|
||||
s[3] == ':' ) {
|
||||
tag = s+4;
|
||||
for ( ; *s && ! is_wspace_a(*s) ; s++ );
|
||||
tagLen = s - tag;
|
||||
// skip over white space after tag:xxxx so "s"
|
||||
// point to the url or contains: or whatever
|
||||
for ( ; *s && is_wspace_a(*s) ; s++ );
|
||||
// set pattern start to AFTER the tag stuff
|
||||
patternStart = s;
|
||||
}
|
||||
|
||||
if ( *s == '-' ) {
|
||||
sc->m_siteListHasNegatives = true;
|
||||
isNeg = true;
|
||||
s++;
|
||||
}
|
||||
|
||||
if ( strncmp(s,"site:",5) == 0 ) {
|
||||
s += 5;
|
||||
seedMe = false;
|
||||
@ -252,20 +282,35 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
if ( slen <= 0 )
|
||||
continue;
|
||||
|
||||
// add to string buffers
|
||||
if ( ! isUrl && isNeg ) {
|
||||
if ( !sc->m_negSubstringBuf.safeMemcpy(s,slen))
|
||||
return true;
|
||||
if ( !sc->m_negSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
if ( ! tagLen ) continue;
|
||||
// append tag
|
||||
if ( !sc->m_negSubstringBuf.safeMemcpy("tag:",4))
|
||||
return true;
|
||||
if ( !sc->m_negSubstringBuf.safeMemcpy(tag,tagLen) )
|
||||
return true;
|
||||
if ( !sc->m_negSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
}
|
||||
if ( ! isUrl ) {
|
||||
// add to string buffers
|
||||
if ( isNeg ) {
|
||||
if ( !sc->m_negSubstringBuf.safeMemcpy(s,slen))
|
||||
return true;
|
||||
if ( !sc->m_negSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
// add to string buffers
|
||||
if ( ! sc->m_posSubstringBuf.safeMemcpy(s,slen) )
|
||||
return true;
|
||||
if ( ! sc->m_posSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
if ( ! tagLen ) continue;
|
||||
// append tag
|
||||
if ( !sc->m_posSubstringBuf.safeMemcpy("tag:",4))
|
||||
return true;
|
||||
if ( !sc->m_posSubstringBuf.safeMemcpy(tag,tagLen) )
|
||||
return true;
|
||||
if ( !sc->m_posSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -288,6 +333,8 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
// a "site:" directive mean no seeding
|
||||
// a "contains:" directive mean no seeding
|
||||
seedMe &&
|
||||
// do not seed stuff after tag:xxx directives
|
||||
! tag &&
|
||||
! dedup.isInTable ( &h32 ) ) {
|
||||
// make spider request
|
||||
SpiderRequest sreq;
|
||||
@ -316,9 +363,18 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
// . store offset since CommandUpdateSiteList() passes us
|
||||
// a temp buf that will be freed before copying the buf
|
||||
// over to its permanent place at cr->m_siteListBuf
|
||||
pd.m_patternStrOff = start - siteListArg;
|
||||
pd.m_patternStrOff = patternStart - siteListArg;
|
||||
// offset of the url path in the pattern, 0 means none
|
||||
pd.m_pathOff = 0;
|
||||
// did we have a tag?
|
||||
if ( tag ) {
|
||||
pd.m_tagOff = tag - siteListArg;
|
||||
pd.m_tagLen = tagLen;
|
||||
}
|
||||
else {
|
||||
pd.m_tagOff = -1;
|
||||
pd.m_tagLen = 0;
|
||||
}
|
||||
// scan url pattern, it should start at "s"
|
||||
char *x = s;
|
||||
// go all the way to the end
|
||||
@ -337,7 +393,7 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
|
||||
// calc length from "start" of line so we can
|
||||
// jump to the path quickly for compares. inc "/"
|
||||
pd.m_pathOff = (x-1) - start;
|
||||
pd.m_pathOff = (x-1) - patternStart;
|
||||
pd.m_pathLen = pe - (x-1);
|
||||
break;
|
||||
}
|
||||
@ -384,7 +440,9 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
// . the url patterns all contain a domain now, so this can use the domain
|
||||
// hash to speed things up
|
||||
// . return ptr to the start of the line in case it has "tag:" i guess
|
||||
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
char *getMatchingUrlPattern ( SpiderColl *sc ,
|
||||
SpiderRequest *sreq ,
|
||||
char *tagArg ) { // tagArg can be NULL
|
||||
|
||||
// if it has * and no negatives, we are in!
|
||||
//if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
|
||||
@ -485,6 +543,25 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
}
|
||||
nomatch:
|
||||
|
||||
|
||||
// if caller also gave a tag we'll want to see if this
|
||||
// "pd" has an entry for this domain that has that tag
|
||||
if ( tagArg ) {
|
||||
// skip if entry has no tag
|
||||
if ( pd->m_tagLen <= 0 ) continue;
|
||||
// skip if does not match domain or host
|
||||
if ( pd->m_thingHash32 != sreq->m_domHash32 &&
|
||||
pd->m_thingHash32 != sreq->m_hostHash32 )
|
||||
continue;
|
||||
// compare tags
|
||||
char *pdtag = pd->m_tagOff + buf;
|
||||
if ( strncmp(tagArg,pdtag,pd->m_tagLen) ) continue;
|
||||
// must be nothing after
|
||||
if ( is_alnum_a(tagArg[pd->m_tagLen]) ) continue;
|
||||
// that's a match
|
||||
return patternStr;
|
||||
}
|
||||
|
||||
// was the line just a domain and not a subdomain?
|
||||
if ( pd->m_thingHash32 == sreq->m_domHash32 )
|
||||
// this will be false if negative pattern i guess
|
||||
|
40
Spider.cpp
40
Spider.cpp
@ -9986,7 +9986,7 @@ bool isAggregator ( long siteHash32,long domHash32,char *url,long urlLen ) {
|
||||
#define SIGN_LE 6
|
||||
|
||||
// from PageBasic.cpp
|
||||
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) ;
|
||||
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq, char *tag);
|
||||
|
||||
// . this is called by SpiderCache.cpp for every url it scans in spiderdb
|
||||
// . we must skip certain rules in getUrlFilterNum() when doing to for Msg20
|
||||
@ -10405,7 +10405,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
// only do once for speed
|
||||
checkedRow = true;
|
||||
// this function is in PageBasic.cpp
|
||||
row = getMatchingUrlPattern ( sc, sreq );
|
||||
row = getMatchingUrlPattern ( sc, sreq ,NULL);
|
||||
}
|
||||
// if we are not submitted from the add url api, skip
|
||||
if ( (bool)row == val ) continue;
|
||||
@ -10418,7 +10418,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
|
||||
// . was it submitted from PageAddUrl.cpp?
|
||||
// . replaces the "add url priority" parm
|
||||
@ -10851,6 +10850,41 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
if ( *p=='d' && ! strcmp(p,"default" ) )
|
||||
return i;
|
||||
|
||||
// is it in the big list of sites?
|
||||
if ( *p == 't' && strncmp(p,"tag:",4) == 0 ) {
|
||||
// skip for msg20
|
||||
//if ( isForMsg20 ) continue;
|
||||
// if only seeds in the sitelist and no
|
||||
|
||||
// if there is no domain or url explicitly listed
|
||||
// then assume user is spidering the whole internet
|
||||
// and we basically ignore "insitelist"
|
||||
if ( sc->m_siteListIsEmpty &&
|
||||
sc->m_siteListIsEmptyValid ) {
|
||||
row = NULL;// no row
|
||||
}
|
||||
else if ( ! checkedRow ) {
|
||||
// only do once for speed
|
||||
checkedRow = true;
|
||||
// this function is in PageBasic.cpp
|
||||
// . it also has to match "tag" at (p+4)
|
||||
row = getMatchingUrlPattern ( sc, sreq ,p+4);
|
||||
}
|
||||
// if we are not submitted from the add url api, skip
|
||||
if ( (bool)row == val ) continue;
|
||||
// skip tag:
|
||||
p += 4;
|
||||
// skip to next constraint
|
||||
p = strstr(p, "&&");
|
||||
// all done?
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// set the sign
|
||||
char *s = p;
|
||||
// skip s to after
|
||||
|
@ -40,6 +40,11 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a href=#multisetup>Setting up a Cluster</a> - how to run multiple gb instances in a sharded cluster.
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a href=#scaling>Scaling the Cluster</a> - how to add more gb instances.
|
||||
<br>
|
||||
<br>
|
||||
@ -367,6 +372,27 @@ Each directory should have the following files and subdirectories:<br><br>
|
||||
<br>
|
||||
-->
|
||||
|
||||
<a name=multisetup></a>
|
||||
<table cellpadding=1 border=0 width=100% bgcolor=#0079ba>
|
||||
<tr><td><center><b><font color=#ffffff size=+1>Setting up a Cluster</td></tr></table>
|
||||
<br>
|
||||
&lt;<i>Last Updated July 2014</i>&gt;
|
||||
<br>
|
||||
<br>
|
||||
1. Locate the hosts.conf file. If installing from binaries it should be in the /var/gigablast/data0/ directory. If it does not exist yet then run <b>gb</b> or <b>./gb</b> which will create one. You will then have to exit gb after it does.
|
||||
<br><br>
|
||||
2. Update the <b>num-mirrors</b> in the hosts.conf file. Leave it as 0 if you do not want redundancy. If you want each shard to be mirrored by one other gb instance, then set this to 1. I find that 1 is typically good enough, provided that the twin is on a different physical server. So if one server gets trashed, there is another to serve that shard. The sole advantage of not mirroring your cluster is that you will have twice the disk space for storing documents. Query speed should be unaffected because Gigablast is smart enough to split the load evenly between mirrors when processing queries. You can send your queries to any shard and it will communicate with all the other shards to aggregate the results. If one shard fails and you are not mirroring then you will lose that part of the index, unfortunately.
|
||||
|
||||
<br><br>
|
||||
3. Make one entry in the hosts.conf per physical core you have on your server. If an entry is on the same server as another, then it will need a completely different set of ports. Each gb instance also requires 4GB of RAM, so you may be limited by your RAM before being limited by your cores. You can of course run multiple gb instances on a single core if you have the RAM, but performance will not be optimal.
|
||||
|
||||
<br><br>
|
||||
4. Continue following the instructions for <a href=#scaling>Scaling the Cluster</a> below in order to get the other shards set up and running.
|
||||
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=scaling></a>
|
||||
@ -376,11 +402,11 @@ Each directory should have the following files and subdirectories:<br><br>
|
||||
&lt;<i>Last Updated June 2014</i>&gt;
|
||||
<br>
|
||||
<br>
|
||||
1. Turn off spidering in the <a href=/admin/master>master controls</a>.
|
||||
1. If your spiders are active, then turn off spidering in the <a href=/admin/master>master controls</a>.
|
||||
<br><br>
|
||||
2. Shut down the clustering by doing a <b>gb stop</b> command on the command line OR by clicking on "save & exit" in the <a href=/admin/master>master controls</a>
|
||||
2. If your cluster is running, shut down the clustering by doing a <b>gb stop</b> command on the command line OR by clicking on "save & exit" in the <a href=/admin/master>master controls</a>
|
||||
<br><br>
|
||||
3. Edit the hosts.conf file in the working directory to add the new hosts. (<a href=/hosts.conf.txt>sample hosts.conf</a>)
|
||||
3. Edit the hosts.conf file in the working directory of host #0 (the first host entry in the hosts.conf file) to add the new hosts. (<a href=/hosts.conf.txt>sample hosts.conf</a>)
|
||||
<br><br>
|
||||
4. Ensure you can do passwordless ssh from host #0 to each new IP address you added. This generally requires running <b>ssh-keygen -t dsa</b> on host #0 to create the files <i>~/.ssh/id_dsa</i> and <i>~/.ssh/id_dsa.pub</i>. Then you need to insert the key in <i>~/.ssh/id_dsa.pub</i> into the <i>~/.ssh/authorized_keys2</i> file on every host, including host #0, in your cluster. Furthermore, you must do a <b>chmod 700 ~/.ssh/authorized_keys2</b> on each one otherwise the passwordless ssh will not work.
|
||||
<br><br>
|
||||
@ -388,7 +414,7 @@ Each directory should have the following files and subdirectories:<br><br>
|
||||
<br><br>
|
||||
6. Run <b>gb start</b> on the command line to start up all gb instances/processes in the cluster.
|
||||
<br><br>
|
||||
7. Click on <b>rebalance shards</b> in the <a href=/admin/master>master controls</a> to begin moving data from the old shards to the new shards. The <a href=/admin/hosts>hosts table</a> will let you know when the rebalance operation is complete. It should be able to serve queries during the rebalancing, but spidering can not resume until it is completed.
|
||||
7. If your index was not empty, then click on <b>rebalance shards</b> in the <a href=/admin/master>master controls</a> to begin moving data from the old shards to the new shards. The <a href=/admin/hosts>hosts table</a> will let you know when the rebalance operation is complete. It should be able to serve queries during the rebalancing, but spidering can not resume until it is completed.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
Reference in New Issue
Block a user