Merge branch 'diffbot-testing' into diffbot-dan

Matt Wells
2014-05-12 15:33:15 -07:00
78 changed files with 4633 additions and 1242 deletions

@ -467,13 +467,13 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime=
gettimeofdayInMillisecondsGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0LL;
}
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
@ -807,6 +807,11 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
sc->clearLocks();
//sc->m_collnum = newCollnum;
//sc->reset();
// you have to set this for tryToDeleteSpiderColl to
// actually have a shot at deleting it
sc->m_deleteMyself = true;
// cr will be invalid shortly after this
sc->m_cr = NULL;
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc );
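The deleteRec2() change above defers destruction rather than freeing the SpiderColl inline: it marks m_deleteMyself, detaches m_cr, and hands the object to tryToDeleteSpiderColl(), which only frees it once no list read or merge still references it. A minimal standalone sketch of that deferred-deletion pattern (the Worker type and tryToDelete() are illustrative stand-ins, not the Gigablast API):
#include <cstdio>
// Illustrative stand-in for SpiderColl: deletion is deferred until no
// in-flight operation still references the object.
struct Worker {
    bool m_deleteMyself = false;   // owner wants it gone
    bool m_busy         = false;   // an async read/merge is in flight
};
// Called when the owner abandons the object and again when an in-flight
// operation finishes; it frees the object only when both are true.
void tryToDelete(Worker *w) {
    if (!w->m_deleteMyself) return;   // owner still wants it
    if (w->m_busy)          return;   // still referenced; try again later
    printf("freeing worker\n");
    delete w;
}
int main() {
    Worker *w = new Worker;
    w->m_busy = true;            // an operation is in flight
    w->m_deleteMyself = true;    // owner puts it on "death row"
    tryToDelete(w);              // no-op: still busy
    w->m_busy = false;           // the operation completes...
    tryToDelete(w);              // ...and now it really gets freed
    return 0;
}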
@ -1125,6 +1130,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// to any rdb...
cr->m_collnum = newCollnum;
// update the timestamps since we are restarting/resetting
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
////////
//
// ALTER m_recs[] array
@ -1252,6 +1262,33 @@ CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
return g_collectiondb.getRec ( coll );
}
char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
if ( coll ) return coll;
CollectionRec *cr = NULL;
// default to main first
if ( ! coll ) {
cr = g_collectiondb.getRec("main");
// CAUTION: cr could be deleted so don't trust this ptr
// if you give up control of the cpu
if ( cr ) return cr->m_coll;
}
// try next in line
if ( ! coll ) {
cr = getFirstRec ();
if ( cr ) return cr->m_coll;
}
// give up?
return NULL;
}
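getDefaultColl() resolves the collection for a request in a fixed order: an explicit, non-empty c= parameter wins, then the "main" collection, then the first collection record, and NULL only if no collections exist; per the CAUTION comment, the returned m_coll pointer should be used before yielding the CPU. A self-contained sketch of the same fallback chain (pickColl() and the sample names are illustrative, not the real API):
#include <cstring>
#include <cstdio>
// Fallback chain mirroring getDefaultColl(): explicit name first,
// then "main", then the first known collection, else NULL.
const char *pickColl(const char *requested, const char **known, int numKnown) {
    if (requested && requested[0]) return requested;       // explicit c=...
    for (int i = 0; i < numKnown; i++)                      // prefer "main"
        if (strcmp(known[i], "main") == 0) return known[i];
    if (numKnown > 0) return known[0];                      // next in line
    return NULL;                                            // give up
}
int main() {
    const char *colls[] = { "crawl7", "main" };
    printf("%s\n", pickColl(NULL, colls, 2));      // "main"
    printf("%s\n", pickColl("", colls, 2));        // empty string -> "main"
    printf("%s\n", pickColl("crawl7", colls, 2));  // explicit name wins
    return 0;
}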
//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
// char *coll = getDefaultColl();
// return g_collectiondb.getRec(coll);
//}
// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
@ -1584,12 +1621,14 @@ void CollectionRec::reset() {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
}
tryToDeleteSpiderColl ( sc );
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&
// ! sc->m_msg1.m_mcast.m_inUse ) {
// mdelete ( sc, sizeof(SpiderColl),"nukecr2");
// delete ( sc );
// }
}
CollectionRec *g_cr = NULL;
@ -1617,7 +1656,7 @@ bool CollectionRec::load ( char *coll , long i ) {
strcpy ( m_coll , coll );
if ( ! g_conf.m_doingCommandLine )
log(LOG_INFO,"db: loading conf for collection %s (%li)",coll,
log(LOG_INFO,"db: Loading conf for collection %s (%li)",coll,
(long)m_collnum);
// collection name HACK for backwards compatibility
@ -1649,7 +1688,7 @@ bool CollectionRec::load ( char *coll , long i ) {
// LOAD LOCAL
snprintf ( tmp1 , 1023, "%scoll.%s.%li/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log(LOG_DEBUG,"db: loading %s",tmp1);
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_localCrawlInfo.reset();
SafeBuf sb;
// fillFromFile returns 0 if the file does not exist, -1 on read error
@ -1660,7 +1699,7 @@ bool CollectionRec::load ( char *coll , long i ) {
if ( ! g_conf.m_doingCommandLine )
log("coll: loaded %s (%li) local hasurlsready=%li",
log("coll: Loaded %s (%li) local hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_localCrawlInfo.m_hasUrlsReadyToSpider);
@ -1698,7 +1737,7 @@ bool CollectionRec::load ( char *coll , long i ) {
// LOAD GLOBAL
snprintf ( tmp1 , 1023, "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log(LOG_DEBUG,"db: loading %s",tmp1);
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_globalCrawlInfo.reset();
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
@ -1707,7 +1746,7 @@ bool CollectionRec::load ( char *coll , long i ) {
memcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
if ( ! g_conf.m_doingCommandLine )
log("coll: loaded %s (%li) global hasurlsready=%li",
log("coll: Loaded %s (%li) global hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
@ -1865,6 +1904,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
// addDefault = true;
if ( ! rebuild ) return true;
if ( m_urlFiltersProfile == UFP_CHINESE )
return rebuildChineseRules();
long n = 0;
/*
@ -1948,7 +1990,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
@ -2111,6 +2152,383 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
return true;
}
bool CollectionRec::rebuildChineseRules ( ) {
long n = 0;
m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
n++;
m_regExs[n].set("hopcount==0 && iswww && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].set("hopcount==0 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].set("hopcount==0 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;
m_regExs[n].set("hopcount==1 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;
m_regExs[n].set("hopcount==1 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].set("hopcount==2 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;
m_regExs[n].set("hopcount==2 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;
m_regExs[n].set("hopcount>=3 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].set("hopcount>=3 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
// done rebuilding CHINESE rules
return true;
}
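rebuildChineseRules() fills the URL-filter table as parallel arrays: row n pairs the expression in m_regExs[n] with that row's harvest-links flag, respider frequency in days, spider caps, same-IP wait in milliseconds, and priority, with rows checked in order and "default" as the catch-all. A minimal sketch of that parallel-array, first-match-wins layout (the matcher below is a placeholder, not the real URL-filter expression language):
#include <cstdio>
#include <cstring>
const int MAX_RULES = 8;
// Parallel arrays, one row per rule, mirroring m_regExs[], m_spiderFreqs[],
// m_spiderPriorities[] and friends in CollectionRec.
const char *s_expr     [MAX_RULES];
float       s_freqDays [MAX_RULES];
int         s_priority [MAX_RULES];
int         s_numRules = 0;
void addRule(const char *expr, float freqDays, int priority) {
    int n = s_numRules++;
    s_expr[n]     = expr;
    s_freqDays[n] = freqDays;
    s_priority[n] = priority;
}
// Placeholder matcher; the real table evaluates expressions like
// hopcount==0, iswww, tld==cn, parentlang==zh_cn,zh_tw,xx and so on.
bool matches(const char *expr, const char *tag) {
    return strcmp(expr, "default") == 0 || strcmp(expr, tag) == 0;
}
// The first matching row wins, exactly like the URL-filter table.
int pickPriority(const char *tag) {
    for (int n = 0; n < s_numRules; n++)
        if (matches(s_expr[n], tag)) return s_priority[n];
    return -1;
}
int main() {
    addRule("hopcount==0", 10.0f, 17);
    addRule("hopcount==1", 20.0f, 15);
    addRule("default",     60.0f,  1);
    printf("%d\n", pickPriority("hopcount==1"));   // 15
    printf("%d\n", pickPriority("hopcount==5"));   // falls through to 1
    return 0;
}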
/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%lli\n"
@ -2350,7 +2768,6 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
}
bool expandRegExShortcuts ( SafeBuf *sb ) ;
//bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);
void nukeDoledb ( collnum_t collnum );
// . anytime the url filters are updated, this function is called
@ -2358,7 +2775,7 @@ void nukeDoledb ( collnum_t collnum );
bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! g_conf.m_doingCommandLine )
log("coll: rebuilding url filters for %s ufp=%li",m_coll,
log("coll: Rebuilding url filters for %s ufp=%li",m_coll,
(long)m_urlFiltersProfile);
// if not a custom crawl, and no expressions, add a default one
@ -2762,3 +3179,4 @@ void testRegex ( ) {
url,rx);
exit(0);
}

@ -75,6 +75,12 @@ class Collectiondb {
class CollectionRec *getRec ( class HttpRequest *r ,
bool useDefaultRec = true );
// do not support diffbot style token/name style for this one:
char *getDefaultColl ( HttpRequest *r ) ;
//class CollectionRec *getRec2 ( class HttpRequest *r ,
// bool useDefaultRec = true );
// . get collectionRec from name
// returns NULL if not available
class CollectionRec *getRec ( char *coll );
@ -368,6 +374,8 @@ class CollectionRec {
// for regular crawls
bool rebuildUrlFilters2();
bool rebuildChineseRules();
bool m_urlFiltersHavePageCounts;
// moved from SpiderColl so we can load up at startup
@ -408,6 +416,8 @@ class CollectionRec {
// spidered and begin the next round
long m_spiderRoundNum;
char m_makeImageThumbnails;
char m_indexSpiderReplies;
char m_indexBody;
//char m_useDatedb ;
@ -659,8 +669,9 @@ class CollectionRec {
long long m_maxToProcess;
long m_maxCrawlRounds;
long long m_diffbotCrawlStartTime;
long long m_diffbotCrawlEndTime;
// in seconds now
long m_diffbotCrawlStartTime;
long m_diffbotCrawlEndTime;
// for testing their regexes etc...
//char m_isDiffbotTestCrawl;

@ -297,8 +297,8 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
log(LOG_INFO,"db: Split is FULL");
*/
// sanity check
if ( g_hostdb.m_indexSplits > MAX_INDEXDB_SPLIT ) {
log("db: Increase MAX_INDEXDB_SPLIT");
if ( g_hostdb.m_indexSplits > MAX_SHARDS ) {
log("db: Increase MAX_SHARDS");
char *xx = NULL; *xx = 0;
}
// and always keep a decent site quality cache of at least 3M

Conf.h

@ -628,6 +628,7 @@ class Conf {
bool m_logDebugDownloads;
bool m_logDebugFacebook;
bool m_logDebugHttp ;
bool m_logDebugImage ;
bool m_logDebugLoop ;
bool m_logDebugLang ;
bool m_logDebugLinkInfo ;

@ -169,6 +169,7 @@ case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
case EBADIMG: return "Bad image";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -172,6 +172,7 @@ enum {
EWAITINGTOSYNCHOSTSCONF,
EDOCNONCANONICAL,
ECUSTOMCRAWLMISMATCH, // a crawl request was made with a name that already existed for bulk request (or the other way around)
ENOTOKEN
ENOTOKEN,
EBADIMG
};
#endif

@ -43,6 +43,7 @@ void Hostdb::resetPortTables () {
}
static int cmp ( const void *h1 , const void *h2 ) ;
//static int cmp2 ( const void *h1 , const void *h2 ) ;
//static void *syncStartWrapper_r ( void *state );
@ -98,7 +99,7 @@ char *Hostdb::getNetName ( ) {
// . gets filename that contains the hosts from the Conf file
// . return false on error
// . g_errno may NOT be set
bool Hostdb::init ( long hostId , char *netName ,
bool Hostdb::init ( long hostIdArg , char *netName ,
bool proxyHost , char useTmpCluster , char *cwd ) {
// reset my ip and port
m_myIp = 0;
@ -118,6 +119,12 @@ bool Hostdb::init ( long hostId , char *netName ,
char *filename = "hosts.conf";
//if ( strcmp ( filename , "hosts.conf" ) == 0 )
// filename = "localhosts.conf";
//bool triedEtc = false;
// for now we autodetermine
if ( hostIdArg != -1 ) { char *xx=NULL;*xx=0; }
// init to -1
m_hostId = -1;
retry:
@ -136,11 +143,11 @@ bool Hostdb::init ( long hostId , char *netName ,
m_netName[0] = '\0';
if ( netName ) strncpy ( m_netName , netName , 31 );
// make sure our hostId is in our conf file
if ( hostId < 0 )
return log(
"conf: Negative hostId %li supplied",hostId);
//if ( hostId < 0 )
// return log(
// "conf: Negative hostId %li supplied",hostId);
// set early for calling log()
m_hostId = hostId;
//m_hostId = hostId;
// set clock in sync in fctypes.cpp
//if ( m_hostId == 0 ) g_clockInSync = true;
// log it
@ -174,15 +181,18 @@ bool Hostdb::init ( long hostId , char *netName ,
if ( this == &g_hostdb2 ) return true;
g_errno = ENOHOSTSFILE;
// if doing localhosts.conf now try hosts.conf
if ( strcmp(filename,"localhosts.conf") == 0 ) {
filename = "hosts.conf";
g_errno = 0;
goto retry;
}
// if ( ! triedEtc ) { //strcmp(filename,"hosts.conf") == 0 ) {
// triedEtc = true;
// dir = "/etc/gigablast/";
// //filename = "hosts.conf";
// g_errno = 0;
// goto retry;
// }
// now we generate one if that is not there
if ( ! m_created ) {
m_created = true;
g_errno = 0;
dir = cwd;
createHostsConf( cwd );
goto retry;
}
@ -200,6 +210,10 @@ bool Hostdb::init ( long hostId , char *netName ,
filename,m_bufSize,
(long)(MAX_HOSTS+MAX_SPARES)*128);
}
// note it
//log("host: reading %s",f.getFilename());
// save it
//m_hostsConfFilename.safePrintf("%s",f.getFilename());
// open the file
if ( ! f.open ( O_RDONLY ) ) return false;
// read in the file
@ -1016,14 +1030,19 @@ bool Hostdb::init ( long hostId , char *netName ,
// set # of machines
m_numMachines = next;
// get IPs of this server. last entry is 0.
long *localIps = getLocalIps();
// now get host based on cwd and ip
Host *host = getHost2 ( cwd , localIps );
// now set m_myIp, m_myPort, m_myPort2 and m_myMachineNum
Host *host = getHost ( hostId );
//Host *host = getHost ( hostId );
if ( proxyHost )
host = getProxy ( hostId );
host = getProxy2 ( cwd , localIps ); //hostId );
if ( ! host )
return log(
"conf: Could not find host with hostId %li in "
"%s.",hostId,filename);
return log("conf: Could not find host with path %s and "
"local ip in %s",cwd,filename);
m_myIp = host->m_ip; // internal IP
m_myIpShotgun = host->m_ipShotgun;
m_myPort = host->m_port; // low priority udp port
@ -1098,7 +1117,7 @@ bool Hostdb::init ( long hostId , char *netName ,
*/
// THIS hostId
m_hostId = hostId;
m_hostId = m_myHost->m_hostId;
// set hosts per shard (mirror group)
m_numHostsPerShard = m_numHosts / m_numShards;
@ -1131,17 +1150,17 @@ bool Hostdb::init ( long hostId , char *netName ,
}
// get THIS host
Host *h = getHost ( hostId );
Host *h = getHost ( m_hostId );
if ( proxyHost )
h = getProxy ( hostId );
h = getProxy ( m_hostId );
if ( ! h ) return log(
"conf: HostId %li not found in %s.",
hostId,filename);
m_hostId,filename);
// set m_dir to THIS host's working dir
strcpy ( m_dir , h->m_dir );
// likewise, set m_htmlDir to this host's html dir
sprintf ( m_httpRootDir , "%shtml/" , m_dir );
sprintf ( m_logFilename , "%slog%03li", m_dir , hostId );
sprintf ( m_logFilename , "%slog%03li", m_dir , m_hostId );
if ( ! g_conf.m_runAsDaemon )
sprintf(m_logFilename,"/dev/stderr");
@ -2297,6 +2316,14 @@ uint32_t Hostdb::getShardNumByTermId ( void *k ) {
return m_map [(*(uint16_t *)((char *)k + 16))>>3];
}
// uint32_t Hostdb::getShardNumFromTermId ( long long termId ) {
// key144_t sk;
// // make fake posdb key
// makeStartKey ( &sk, termId );
// // and use this
// return getShardNumByTermId ( &sk );
// }
// . if false, we don't split index and date lists, other dbs are unaffected
// . this obsoletes the g_*.getGroupId() functions
// . this allows us to have any # of groups in a stripe, not just power of 2
@ -2494,6 +2521,12 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# Tells us what hosts are participating in the distributed search engine.\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
// put our cwd here
sb.safePrintf("0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 %s\n",cwd);
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# How many mirrors do you want? If this is 0 then your data\n");
sb.safePrintf("# will NOT be replicated. If it is 1 then each host listed\n");
sb.safePrintf("# below will have one host that mirrors it, thereby decreasing\n");
@ -2548,10 +2581,6 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# 'gb' binary resides.\n");
sb.safePrintf("#\n");
// put our cwd here
sb.safePrintf("0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 %s\n",cwd);
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("#\n");
sb.safePrintf("# Example of a four-node distributed search index running on a single\n");
sb.safePrintf("# server with four cores. The working directories are /home/mwells/hostN/.\n");
@ -2616,3 +2645,74 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.save ( cwd , "hosts.conf" );
return true;
}
static long s_localIps[20];
#include <sys/types.h>
#include <ifaddrs.h>
long *getLocalIps ( ) {
static bool s_valid = false;
if ( s_valid ) return s_localIps;
s_valid = true;
struct ifaddrs *ifap = NULL;
getifaddrs( &ifap );
ifaddrs *p = ifap;
long ni = 0;
// store loopback just in case
long loopback = atoip("127.0.0.1");
s_localIps[ni++] = loopback;
for ( ; p && ni < 18 ; p = p->ifa_next ) {
// skip entries with no address at all (getifaddrs can return those)
if ( ! p->ifa_addr ) continue;
long ip = ((struct sockaddr_in*)p->ifa_addr)->sin_addr.s_addr;
// skip if loopback we stored above
if ( ip == loopback ) continue;
// skip bogus ones
if ( (unsigned long)ip <= 10 ) continue;
// show it
//log("host: detected local ip %s",iptoa(ip));
// otherwise store it
s_localIps[ni++] = ip;
}
// mark the end of it
s_localIps[ni] = 0;
// free that memory
freeifaddrs ( ifap );
// return the static buffer
return s_localIps;
}
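getLocalIps() walks the getifaddrs() list into a static, zero-terminated array, keeping 127.0.0.1 first and skipping tiny bogus values. It still reads every remaining entry as a sockaddr_in, though; on a host with IPv6 or packet-socket interfaces those entries are not IPv4, so a stricter variant filters on the address family first. A hedged, self-contained sketch of that variant (not the code above):
#include <sys/types.h>
#include <ifaddrs.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <cstdio>
// Collect up to maxIps IPv4 addresses, zero-terminated, skipping
// entries with no address or a non-IPv4 family.
int getIPv4Addrs(unsigned long *out, int maxIps) {
    struct ifaddrs *ifap = NULL;
    if (getifaddrs(&ifap) != 0) { out[0] = 0; return 0; }
    int n = 0;
    for (struct ifaddrs *p = ifap; p && n < maxIps - 1; p = p->ifa_next) {
        if (!p->ifa_addr) continue;                      // no address at all
        if (p->ifa_addr->sa_family != AF_INET) continue; // IPv6, AF_PACKET...
        struct sockaddr_in *sin = (struct sockaddr_in *)p->ifa_addr;
        out[n++] = sin->sin_addr.s_addr;
    }
    out[n] = 0;                                          // mark the end
    freeifaddrs(ifap);
    return n;
}
int main() {
    unsigned long ips[20];
    int n = getIPv4Addrs(ips, 20);
    for (int i = 0; i < n; i++) {
        struct in_addr a; a.s_addr = ips[i];
        printf("%s\n", inet_ntoa(a));
    }
    return 0;
}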
Host *Hostdb::getHost2 ( char *cwd , long *localIps ) {
for ( long i = 0 ; i < m_numHosts ; i++ ) {
Host *h = &m_hosts[i];
// . get the path. guaranteed to end in '/'
// as well as cwd!
// . if the gb binary does not reside in the working dir
// for this host, skip it, it's not our host
if ( strcmp(h->m_dir,cwd) ) continue;
// now it must be our ip as well!
long *ipPtr = localIps;
for ( ; *ipPtr ; ipPtr++ )
// return the host if it also matches the ip!
if ( (long)h->m_ip == *ipPtr ) return h;
}
// what, no host?
return NULL;
}
Host *Hostdb::getProxy2 ( char *cwd , long *localIps ) {
for ( long i = 0 ; i < m_numProxyHosts ; i++ ) {
Host *h = getProxy(i);
if ( ! (h->m_type & HT_PROXY ) ) continue;
// . get the path. guaranteed to end in '/'
// as well as cwd!
// . if the gb binary does not reside in the working dir
// for this host, skip it, it's not our host
if ( strcmp(h->m_dir,cwd) ) continue;
// now it must be our ip as well!
long *ipPtr = localIps;
for ( ; *ipPtr ; ipPtr++ )
// return the host if it also matches the ip!
if ( (long)h->m_ip == *ipPtr ) return h;
}
// what, no host?
return NULL;
}
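getHost2() and getProxy2() identify "this" host by comparing each configured host's working directory against the process cwd and its IP against the machine's local interface addresses, which is what lets init() drop the explicit hostId argument. A self-contained sketch of that matching rule (HostRec and the sample data are made up for illustration):
#include <cstring>
#include <cstdio>
struct HostRec { int id; const char *dir; unsigned ip; };
// Return the host whose working dir matches cwd AND whose ip appears in
// the zero-terminated list of local ips -- the same rule as getHost2().
const HostRec *findSelf(const HostRec *hosts, int n,
                        const char *cwd, const unsigned *localIps) {
    for (int i = 0; i < n; i++) {
        if (strcmp(hosts[i].dir, cwd) != 0) continue;    // wrong working dir
        for (const unsigned *p = localIps; *p; p++)      // list ends at 0
            if (hosts[i].ip == *p) return &hosts[i];     // dir + ip match
    }
    return NULL;                                         // not in hosts.conf
}
int main() {
    HostRec hosts[] = { { 0, "/home/gb/host0/", 0x0100007f },
                        { 1, "/home/gb/host1/", 0x0100007f } };
    unsigned localIps[] = { 0x0100007f, 0 };   // 127.0.0.1 as stored by s_addr
    const HostRec *me = findSelf(hosts, 2, "/home/gb/host1/", localIps);
    printf("my hostId = %d\n", me ? me->id : -1);        // 1
    return 0;
}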

@ -62,6 +62,8 @@ enum {
#define HT_SCPROXY 0x10
#define HT_ALL_PROXIES (HT_PROXY|HT_QCPROXY|HT_SCPROXY)
long *getLocalIps ( ) ;
class EventStats {
public:
long m_expired;
@ -455,9 +457,17 @@ class Hostdb {
// get the host in this group with the smallest avg roundtrip time
//Host *getFastestHostInGroup ( unsigned long groupId );
// get the host that has this path/ip
Host *getHost2 ( char *cwd , long *localIps ) ;
Host *getProxy2 ( char *cwd , long *localIps ) ;
// . like above but just gets one host
// Host *getHost ( long hostId ) { return m_groups[hostId]; };
Host *getHost ( long hostId ) { return m_hostPtrs[hostId]; };
Host *getHost ( long hostId ) {
if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
return m_hostPtrs[hostId];
};
Host *getSpare ( long spareId ) {
return m_spareHosts[spareId]; };
@ -672,6 +682,8 @@ inline uint32_t getShardNumFromDocId ( long long d ) {
return g_hostdb.getShardNumFromDocId ( d );
};
//inline uint32_t getShardNumFromTermId ( long long termId );
//inline uint32_t getGroupId ( char rdbId, void *key,bool split = true) {
// return g_hostdb.getGroupId ( rdbId , key , split );
//};

@ -24,7 +24,9 @@ char *g_contentTypeStrings [] = {
"bmp" , // 13
"javascript" , // 14
"css" , // 15
"json" // 16
"json" , // 16
"image", // 17
"status" // 18
};
HttpMime::HttpMime () { reset(); }

@ -37,6 +37,7 @@ time_t atotime5 ( char *s ) ;
#define CT_CSS 15
#define CT_JSON 16
#define CT_IMAGE 17
#define CT_STATUS 18 // an internal type indicating spider reply
#define ET_IDENTITY 0
#define ET_GZIP 1

@ -36,6 +36,8 @@
#define FORMAT_PROCOG 6
#define FORMAT_WIDGET_IFRAME 7
#define FORMAT_WIDGET_AJAX 8
// used by ajax widget to create search results to APPEND to the end of widget
#define FORMAT_WIDGET_APPEND 9
class HttpRequest {

@ -95,11 +95,11 @@ bool HttpServer::init ( short port,
m_ssltcp.reset();
}
// log an innocent msg
log(LOG_INIT,"http: listening on TCP port %i with sd=%i",
log(LOG_INIT,"http: Listening on TCP port %i with sd=%i",
port, m_tcp.m_sock );
// log for https
if (m_ssltcp.m_ready)
log(LOG_INIT,"https: listening on TCP port %i with sd=%i",
log(LOG_INIT,"https: Listening on TCP port %i with sd=%i",
sslPort, m_ssltcp.m_sock );
return true;

@ -5,16 +5,14 @@
#include "Sections.h"
#include "XmlDoc.h"
#include "Threads.h"
//#include "Msg16.h" // my_system_r()
#include "Hostdb.h"
#include "XmlDoc.h" // my_system_r()
// TODO: image is bad if repeated on same page, check for that
static void gotTermFreqWrapper ( void *state ) ;
//static void gotTermFreqWrapper ( void *state ) ;
static void gotTermListWrapper ( void *state ) ;
static void gotImageWrapper ( void *state ) ;
static void *thumbStartWrapper_r ( void *state , ThreadEntry *te );
static void thumbDoneWrapper ( void *state , ThreadEntry *te );
static void getImageInfo ( char *buf, long size, long *dx, long *dy, long *it);
Images::Images ( ) {
@ -26,10 +24,12 @@ void Images::reset() {
m_imgDataSize = 0;
m_setCalled = false;
m_thumbnailValid = false;
m_imgBuf = NULL;
m_imgBufLen = 0;
m_imgBufMaxLen = 0;
m_imgReply = NULL;
m_imgReplyLen = 0;
m_imgReplyMaxLen = 0;
m_numImages = 0;
m_imageBufValid = false;
m_phase = 0;
}
/*
@ -74,7 +74,7 @@ bool Images::hash ( long titleRecVersion ,
*/
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
Sections *sections ) {
Sections *sections , XmlDoc *xd ) {
// not valid for now
m_thumbnailValid = false;
// reset our array of image node candidates
@ -82,10 +82,15 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
// flag it
m_setCalled = true;
// strange...
if ( m_imgBuf ) { char *xx=NULL;*xx=0; }
if ( m_imgReply ) { char *xx=NULL;*xx=0; }
// save this
m_xml = xml;
m_pageUrl = pageUrl;
// if we are a diffbot json reply, trust that diffbot got the
// best candidate, and just use that
if ( xd->m_isDiffbotJSONObject ) return;
//m_pageSite = pageSite;
// scan the words
long nw = words->getNumWords();
@ -156,7 +161,7 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
if ( height != -1 && height < 50 ) continue;
// get the url of the image
long srcLen;
char *src = xml->getString(nn,nn+1,"src",&srcLen);
char *src = xml->getString(nn,"src",&srcLen);
// skip if none
if ( srcLen <= 2 ) continue;
// set it to the full url
@ -180,6 +185,7 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
if ( strncasestr(u,ulen,"header" ) ) continue;
if ( strncasestr(u,ulen,"footer" ) ) continue;
if ( strncasestr(u,ulen,"menu" ) ) continue;
if ( strncasestr(u,ulen,"button" ) ) continue;
if ( strncasestr(u,ulen,"banner" ) ) continue;
if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
if ( strncasestr(u,ulen,"ads.webfeat." ) ) continue;
@ -222,7 +228,7 @@ bool Images::getThumbnail ( char *pageSite ,
long long docId ,
XmlDoc *xd ,
collnum_t collnum,//char *coll ,
char **statusPtr ,
//char **statusPtr ,
long hopCount,
void *state ,
void (*callback)(void *state) ) {
@ -235,6 +241,7 @@ bool Images::getThumbnail ( char *pageSite ,
// reset here now
m_i = 0;
m_j = 0;
m_phase = 0;
// sanity check
if ( ! m_pageUrl ) { char *xx=NULL;*xx=0; }
@ -244,10 +251,18 @@ bool Images::getThumbnail ( char *pageSite ,
//if ( ! isPermalink ) return true;
// save these
m_statusPtr = statusPtr;
//m_statusPtr = statusPtr;
// save this
m_collnum = collnum;
m_docId = docId;
m_callback = callback;
m_state = state;
// if this doc is a json diffbot reply it already has the primary
// image selected so just use that
m_xd = xd;
if ( m_xd->m_isDiffbotJSONObject )
return downloadImages();
// if no candidates, we are done, no error
if ( m_numImages == 0 ) return true;
@ -280,43 +295,97 @@ bool Images::getThumbnail ( char *pageSite ,
// store the termid
long long termId = q.getTermId(0);
if ( ! m_msg36.getTermFreq ( m_collnum ,
0 , // maxAge
termId ,
this ,
gotTermFreqWrapper ,
MAX_NICENESS ,
true , // exact count?
false , // inc count?
false , // dec count?
false )) // is split?
key144_t startKey ;
key144_t endKey ;
g_posdb.makeStartKey(&startKey,termId);
g_posdb.makeEndKey (&endKey ,termId);
// get shard of that (this termlist is sharded by termid -
// see XmlDoc.cpp::hashNoSplit() where it hashes gbsitetemplate: term)
long shardNum = g_hostdb.getShardNumByTermId ( &startKey );
// if ( ! m_msg36.getTermFreq ( m_collnum ,
// 0 , // maxAge
// termId ,
// this ,
// gotTermFreqWrapper ,
// MAX_NICENESS ,
// true , // exact count?
// false , // inc count?
// false , // dec count?
// false )) // is split?
// return false;
// just use msg0 and limit to like 1k or something
if ( ! m_msg0.getList ( -1 , // hostid
-1 , // ip
-1 , // port
0 , // maxAge
false , // addToCache?
RDB_POSDB ,
m_collnum ,
&m_list , // RdbList ptr
(char *)&startKey ,
(char *)&endKey ,
1024 , // minRecSize
this ,
gotTermListWrapper ,
MAX_NICENESS ,
false , // err correction?
true , // inc tree?
true , // domergeobsolete
-1 , // firstHostId
0 , // start filenum
-1 , // numFiles
30 , // timeout
-1 , // syncpoint
-1 , // preferlocalreads
NULL , // msg5
NULL , // msg5b
false , // isRealMerge?
true , // allow pg cache
false , // forcelocalindexdb
false , // doIndexdbSplit?
shardNum ))// force paritysplit
return false;
// did not block
return gotTermFreq();
}
void gotTermFreqWrapper ( void *state ) {
Images *THIS = (Images *)state;
// process/store the reply
if ( ! THIS->gotTermFreq() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
// void gotTermFreqWrapper ( void *state ) {
// Images *THIS = (Images *)state;
// // process/store the reply
// if ( ! THIS->gotTermFreq() ) return;
// // all done
// THIS->m_callback ( THIS->m_state );
// }
// returns false if blocked, true otherwise
bool Images::gotTermFreq ( ) {
// error?
if ( g_errno ) return true;
// bail if less than 10
long long nt = m_msg36.getTermFreq();
// return true, without g_errno set, we are done
if ( nt < 10 ) return true;
//long long nt = m_msg36.getTermFreq();
// each key but the first is 12 bytes (compressed)
long long nt = (m_list.getListSize() - 6)/ 12;
// . return true, without g_errno set, we are done
// . if we do not have 10 or more webpages that share this same
// template then do not do image extraction at all, it is too risky
// that we get a bad image
// . MDW: for debugging, do not require 10 pages of same template
//if ( nt < 10 ) return true;
if ( nt < -2 ) return true;
// now see which of the image urls are unique
if ( ! launchRequests () ) return false;
// i guess we did not block
return true;
}
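The rewritten gotTermFreq() estimates how many pages share the gbsitetemplate: term straight from the Msg0 posdb list. Assuming the first key in the list is a full 18-byte posdb key and every later key uses the 12-byte compressed form, as the comment says, a list of n keys occupies 18 + 12*(n-1) = 12n + 6 bytes, which is where (listSize - 6) / 12 comes from. A tiny check of that arithmetic:
#include <cstdio>
// listSize = 18 + 12*(n-1) = 12n + 6  =>  n = (listSize - 6) / 12
long keysInPosdbList(long listSize) {
    if (listSize < 18) return 0;           // not even one full key
    return (listSize - 6) / 12;
}
int main() {
    printf("%ld\n", keysInPosdbList(18));   // 1 key (just the full key)
    printf("%ld\n", keysInPosdbList(54));   // 18 + 3*12 -> 4 keys
    printf("%ld\n", keysInPosdbList(126));  // 18 + 9*12 -> 10 keys
    return 0;
}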
// . returns false if blocked, true otherwise
// . see if other pages we've indexed have this same image url
bool Images::launchRequests ( ) {
// loop over all images
for ( long i = m_i ; i < m_numImages ; i++ ) {
@ -324,26 +393,30 @@ bool Images::launchRequests ( ) {
m_i++;
// assume no error
m_errors[i] = 0;
// make the keys
key_t startKey = g_indexdb.makeStartKey(m_termIds[i]);
key_t endKey = g_indexdb.makeEndKey (m_termIds[i]);
// make the keys. each term is a gbimage:<imageUrl> term
// so we are searching for the image url to see how often
// it is repeated on other pages.
key144_t startKey ;
key144_t endKey ;
g_posdb.makeStartKey(&startKey,m_termIds[i]);
g_posdb.makeEndKey (&endKey ,m_termIds[i]);
// get our residing groupid
//unsigned long gid = g_indexdb.getNoSplitGroupId(&startKey);
// no split is true for this one, so we do not split by docid
//uint32_t gid = getGroupId(RDB_INDEXDB,&startKey,false);
unsigned long shardNum;
shardNum = getShardNum(RDB_INDEXDB,&startKey);
shardNum = getShardNum(RDB_POSDB,&startKey);
// get the termlist
if ( ! m_msg0.getList ( -1 , // hostid
-1 , // ip
-1 , // port
0 , // maxAge
false , // addToCache?
RDB_INDEXDB ,
RDB_POSDB,
m_collnum ,
&m_list , // RdbList ptr
startKey ,
endKey ,
(char *)&startKey ,
(char *)&endKey ,
1024 , // minRecSize
this ,
gotTermListWrapper ,
@ -408,71 +481,235 @@ void Images::gotTermList ( ) {
bool Images::downloadImages () {
// all done if we got a valid thumbnail
if ( m_thumbnailValid ) return true;
// if not valid free old image
if ( m_imgBuf ) {
mfree ( m_imgBuf , m_imgBufMaxLen , "Image" );
m_imgBuf = NULL;
}
//if ( m_thumbnailValid ) return true;
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
long srcLen;
char *src = NULL;
long node;
// downloading an image from diffbot json reply?
if ( m_xd->m_isDiffbotJSONObject ) {
// i guess this better not block cuz we'll core!
char **iup = m_xd->getDiffbotPrimaryImageUrl();
// if no image, nothing to download
if ( ! *iup ) {
//log("no diffbot image url for %s",
// m_xd->m_firstUrl.m_url);
return true;
}
// force image count to one
m_numImages = 1;
// do not error out
m_errors[0] = 0;
// set it to the full url
src = *iup;
srcLen = gbstrlen(src);
// need this
m_imageUrl.set ( src , srcLen );
// jump into the for loop below
//if ( m_phase == 0 ) goto insertionPoint;
}
// . download each leftover image
// . stop as soon as we get one with good dimensions
// . make a thumbnail of that one
for ( long i = m_j ; i < m_numImages ; i++ ) {
// advance now
m_j++;
// if we should stop, stop
if ( m_stopDownloading ) break;
// skip if bad or not unique
if ( m_errors[i] ) continue;
// set status msg
sprintf ( m_statusBuf ,"downloading image %li",i);
// point to it
*m_statusPtr = m_statusBuf;
// get the url of the image
long srcLen;
char *src = m_xml->getString(i,i+1,"src",&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
iu.set ( m_pageUrl , src , srcLen );
// assume success
m_httpStatus = 200;
// set the request
Msg13Request *r = &m_msg13Request;
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
for ( ; m_j < m_numImages ; m_j++ , m_phase = 0 ) {
// did collection get nuked?
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// clear error
g_errno = 0;
if ( m_phase == 0 ) {
// advance
m_phase++;
// only if not diffbot, we set "src" above for it
if ( ! m_xd->m_isDiffbotJSONObject ) {
// get img tag node
node = m_imageNodes[m_j];
// get the url of the image
src = m_xml->getString(node,"src",&srcLen);
// use "pageUrl" as the baseUrl
m_imageUrl.set ( m_pageUrl , src , srcLen );
}
// if we should stop, stop
if ( m_stopDownloading ) break;
// skip if bad or not unique
if ( m_errors[m_j] ) continue;
// set status msg
sprintf ( m_statusBuf ,"downloading image %li",m_j);
// point to it
if ( m_xd ) m_xd->setStatus ( m_statusBuf );
}
// url is the most important
strcpy(r->m_url,iu.getUrl());
// . try to download it
// . i guess we are ignoring hammers at this point
if ( ! m_msg13.getDoc(r,false,this,gotImageWrapper))
return false;
// handle it
gotImage ( );
// get image ip
if ( m_phase == 1 ) {
// advance
m_phase++;
// this increments phase if it should
if ( ! getImageIp() ) return false;
// error?
if ( g_errno ) continue;
}
// download the actual image
if ( m_phase == 2 ) {
// advance
m_phase++;
// download image data
if ( ! downloadImage() ) return false;
// error downloading?
if ( g_errno ) continue;
}
// get thumbnail using threaded call to netpbm stuff
if ( m_phase == 3 ) {
// advance
m_phase++;
// call pnmscale etc. to make thumbnail
if ( ! makeThumb() ) return false;
// error downloading?
if ( g_errno ) continue;
}
// error making thumb or just not a good thumb size?
if ( ! m_thumbnailValid ) {
// free old image we downloaded, if any
m_msg13.reset();
// i guess do this too, it was pointing at it in msg13
m_imgReply = NULL;
// try the next image candidate
continue;
}
// it's a keeper
long urlSize = m_imageUrl.getUrlLen() + 1; // include \0
// . make our ThumbnailArray out of it
long need = 0;
// the array itself
need += sizeof(ThumbnailArray);
// and each thumbnail it contains
need += urlSize;
need += m_thumbnailSize;
need += sizeof(ThumbnailInfo);
// reserve it
m_imageBuf.reserve ( need );
// point to array
ThumbnailArray *ta =(ThumbnailArray *)m_imageBuf.getBufStart();
// set that as much as possible, version...
ta->m_version = 0;
// and thumb count
ta->m_numThumbnails = 1;
// now store the thumbnail info
ThumbnailInfo *ti = ta->getThumbnailInfo (0);
// and set our one thumbnail
ti->m_origDX = m_dx;
ti->m_origDY = m_dy;
ti->m_dx = m_tdx;
ti->m_dy = m_tdy;
ti->m_urlSize = urlSize;
ti->m_dataSize = m_thumbnailSize;
// now copy the data over sequentially
char *p = ti->m_buf;
// the image url
memcpy(p,m_imageUrl.getUrl(),urlSize);
p += urlSize;
// the image thumbnail data
memcpy(p,m_imgData,m_thumbnailSize);
p += m_thumbnailSize;
// update buf length of course
m_imageBuf.setLength ( p - m_imageBuf.getBufStart() );
// validate the buffer
m_imageBufValid = true;
// save mem. do this after because m_imgData uses m_msg13's
// reply buf to store the thumbnail for now...
m_msg13.reset();
m_imgReply = NULL;
g_errno = 0;
return true;
}
// now get the thumbnail from it
return gotImage ( );
// don't tell the caller EBADIMG; it would make him fail to index the doc
g_errno = 0;
return true;
}
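downloadImages() is now a resumable control loop: m_j tracks the current image candidate and m_phase the step within it (set the URL, look up the IP, fetch the image, make the thumbnail). Whenever a step blocks, the function returns false and its completion callback simply re-enters downloadImages(), which skips the phases already done because m_phase was advanced before the call. A stripped-down sketch of that pattern (the Downloader type and its three steps are illustrative, and blocking is not simulated):
#include <cstdio>
// Generic resumable loop: m_j = which item, m_phase = which step within it.
// Each step returns false if it "blocked"; a callback re-enters run().
struct Downloader {
    int  m_j     = 0;
    int  m_phase = 0;
    int  m_numItems;
    bool (*m_step[3])(Downloader *);   // per-item steps, in order
    // Returns false if blocked (a callback will call run() again later).
    bool run() {
        for (; m_j < m_numItems; m_j++, m_phase = 0) {
            while (m_phase < 3) {
                int phase = m_phase++;                    // advance first
                if (!m_step[phase](this)) return false;   // blocked
            }
        }
        return true;                   // all items, all steps done
    }
};
static bool stepA(Downloader *) { printf("resolve\n"); return true; }
static bool stepB(Downloader *) { printf("fetch\n");   return true; }
static bool stepC(Downloader *) { printf("thumb\n");   return true; }
int main() {
    Downloader d;
    d.m_numItems = 2;
    d.m_step[0] = stepA; d.m_step[1] = stepB; d.m_step[2] = stepC;
    d.run();                           // prints three lines per item
    return 0;
}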
void gotImageWrapper ( void *state ) {
static void gotImgIpWrapper ( void *state , long ip ) {
Images *THIS = (Images *)state;
// process/store the reply
if ( ! THIS->gotImage ( ) ) return;
// download the images. will set m_stopDownloading when we get one
// control loop
if ( ! THIS->downloadImages() ) return;
// call callback at this point, we are done with the download loop
THIS->m_callback ( THIS->m_state );
}
bool Images::getImageIp ( ) {
if ( ! m_msgc.getIp ( m_imageUrl.getHost () ,
m_imageUrl.getHostLen() ,
&m_latestIp ,
this ,
gotImgIpWrapper ))
// we blocked
return false;
return true;
}
static void downloadImageWrapper ( void *state ) {
Images *THIS = (Images *)state;
// control loop
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
bool Images::gotImage ( ) {
bool Images::downloadImage ( ) {
// error?
if ( m_latestIp == 0 || m_latestIp == -1 ) {
log(LOG_DEBUG,"images: ip of %s is %li (%s)",
m_imageUrl.getUrl(),m_latestIp,mstrerror(g_errno));
// ignore errors
g_errno = 0;
return true;
}
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// assume success
m_httpStatus = 200;
// set the request
Msg13Request *r = &m_msg13Request;
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
r->m_urlIp = m_latestIp;
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
}
// url is the most important
strcpy(r->m_url,m_imageUrl.getUrl());
// . try to download it
// . i guess we are ignoring hammers at this point
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))
return false;
return true;
}
static void makeThumbWrapper ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
// control loop
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
bool Images::makeThumb ( ) {
// did it have an error?
if ( g_errno ) {
// just give up on all of them if one has an error
@ -489,12 +726,12 @@ bool Images::gotImage ( ) {
m_imgData = NULL;
m_imgDataSize = 0;
log( LOG_DEBUG, "image: Msg16::gotImage() entered." );
log( LOG_DEBUG, "image: gotImage() entered." );
// . if there was a problem, just ignore, don't let it stop getting
// the real page.
if ( g_errno ) {
log( "ERROR? g_errno puked: %s", mstrerror(g_errno) );
g_errno = 0;
//g_errno = 0;
return true;
}
//if ( ! slot ) return true;
@ -503,12 +740,24 @@ bool Images::gotImage ( ) {
bufLen = m_msg13.m_replyBufSize;
bufMaxLen = m_msg13.m_replyBufAllocSize;
// no image?
if ( ! buf || bufLen <= 0 ) return true;
if ( ! buf || bufLen <= 0 ) {
g_errno = EBADIMG;
return true;
}
// we are image candidate #i
long i = m_j - 1;
//long i = m_j - 1;
// get img tag node
// get the url of the image
long srcLen;
char *src = m_xml->getString(i,i+1,"src",&srcLen);
char *src = NULL;
if ( m_xd->m_isDiffbotJSONObject ) {
src = *m_xd->getDiffbotPrimaryImageUrl();
srcLen = gbstrlen(src);
}
else {
long node = m_imageNodes[m_j];
src = m_xml->getString(node,"src",&srcLen);
}
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
@ -518,6 +767,7 @@ bool Images::gotImage ( ) {
log ( "image: MIME.set() failed in gotImage()" );
// give up on the remaining images then
m_stopDownloading = true;
g_errno = EBADIMG;
return true;
}
// set the status so caller can see
@ -528,6 +778,7 @@ bool Images::gotImage ( ) {
m_httpStatus);
// give up on the remaining images then
m_stopDownloading = true;
g_errno = EBADIMG;
return true;
}
// make sure this is an image
@ -536,6 +787,7 @@ bool Images::gotImage ( ) {
log( LOG_DEBUG, "image: gotImage() states that this image is "
"not in a format we currently handle." );
// try the next image if any
g_errno = EBADIMG;
return true;
}
// get the content
@ -543,41 +795,64 @@ bool Images::gotImage ( ) {
m_imgDataSize = bufLen - mime.getMimeLen();
// Reset socket, so socket doesn't free the data, now we own
// We must free the buf after thumbnail is inserted in TitleRec
m_imgBuf = buf;//slot->m_readBuf;
m_imgBufLen = bufLen;//slot->m_readBufSize;
m_imgBufMaxLen = bufMaxLen;//slot->m_readBufMaxSize;
m_imgReply = buf;//slot->m_readBuf;
m_imgReplyLen = bufLen;//slot->m_readBufSize;
m_imgReplyMaxLen = bufMaxLen;//slot->m_readBufMaxSize;
// do not let UdpServer free the reply, we own it now
//slot->m_readBuf = NULL;
if ( ! m_imgBuf || m_imgBufLen == 0 ) {
log( LOG_DEBUG, "image: Returned empty image data!" );
if ( ! m_imgReply || m_imgReplyLen == 0 ) {
log( LOG_DEBUG, "image: Returned empty image reply!" );
g_errno = EBADIMG;
return true;
}
// get next if too small
if ( m_imgDataSize < 20 ) return true;
if ( m_imgDataSize < 20 ) { g_errno = EBADIMG; return true; }
long imageType;
getImageInfo ( m_imgData, m_imgDataSize, &m_dx, &m_dy, &imageType );
// log the image dimensions
log( LOG_DEBUG, "image: Image Link: %s", iu.getUrl() );
log( LOG_DEBUG, "image: Max Buffer Size: %lu bytes.",m_imgBufMaxLen );
log( LOG_DEBUG, "image: Image Original Size: %lu bytes.",m_imgBufLen);
log( LOG_DEBUG, "image: Image Buffer @ 0x%lx - 0x%lx.",(long)m_imgBuf,
(long)m_imgBuf+m_imgBufMaxLen );
log( LOG_DEBUG,"image: Image Link: %s", iu.getUrl() );
log( LOG_DEBUG,"image: Max Buffer Size: %lu bytes.",m_imgReplyMaxLen);
log( LOG_DEBUG,"image: Image Original Size: %lu bytes.",m_imgReplyLen);
log( LOG_DEBUG,"image: Image Buffer @ 0x%lx - 0x%lx",(long)m_imgReply,
(long)m_imgReply+m_imgReplyMaxLen );
log( LOG_DEBUG, "image: Size: %lupx x %lupx", m_dx, m_dy );
// what is this?
if ( m_dx <= 0 || m_dy <= 0 ) {
log(LOG_DEBUG, "image: Image has bad dimensions.");
g_errno = EBADIMG;
return true;
}
// skip if bad dimensions
if( ((m_dx < 50) || (m_dy < 50)) && ((m_dx > 0) && (m_dy > 0)) ) {
log( "image: Image is too small to represent a news article." );
return true;
log(LOG_DEBUG,
"image: Image is too small to represent a news article." );
g_errno = EBADIMG;
return true;
}
// skip if bad aspect ratio. 5x1 or 1x5 is bad i guess
if ( m_dx > 0 && m_dy > 0 ) {
float aspect = (float)m_dx / (float)m_dy;
if ( aspect < .2 || aspect > 5.0 ) {
log(LOG_DEBUG,
"image: Image aspect ratio is worse that 5 to 1");
g_errno = EBADIMG;
return true;
}
}
// update status
*m_statusPtr = "making thumbnail";
if ( m_xd ) m_xd->setStatus ( "making thumbnail" );
// log it
log ( LOG_DEBUG, "image: Msg16::gotImage() thumbnailing image." );
log ( LOG_DEBUG, "image: gotImage() thumbnailing image." );
// create the thumbnail...
// reset this... why?
g_errno = 0;
@ -587,23 +862,14 @@ bool Images::gotImage ( ) {
if ( g_threads.call ( FILTER_THREAD ,
MAX_NICENESS ,
this ,
thumbDoneWrapper ,
makeThumbWrapper ,
thumbStartWrapper_r ) ) return false;
// threads might be off
logf ( LOG_DEBUG, "image: Calling thumbnail gen without thread.");
thumbStartWrapper_r ( NULL , NULL );
thumbStartWrapper_r ( this , NULL );
return true;
}
void thumbDoneWrapper ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
// . download another image if we ! m_thumbnailValid
// . should also free m_imgBuf if ! m_thumbnailValid
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
void *thumbStartWrapper_r ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
THIS->thumbStart_r ( true /* am thread?*/ );
@ -614,30 +880,33 @@ void Images::thumbStart_r ( bool amThread ) {
long long start = gettimeofdayInMilliseconds();
static char scmd[200] = "%stopnm %s | "
"pnmscale -xysize 100 100 - | "
"ppmtojpeg - > %s";
//static char scmd[200] = "%stopnm %s | "
// "pnmscale -xysize 100 100 - | "
// "ppmtojpeg - > %s";
log( LOG_DEBUG, "image: thumbStart_r entered." );
//DIR *d;
char cmd[250];
sprintf( cmd, "%strash", g_hostdb.m_dir );
//char cmd[2500];
//sprintf( cmd, "%strash", g_hostdb.m_dir );
makeTrashDir();
// get thread id
long id = getpid();
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
char in[64];
sprintf ( in , "%strash/in.%li", g_hostdb.m_dir, id );
char in[364];
snprintf ( in , 363,"%strash/in.%li", g_hostdb.m_dir, id );
unlink ( in );
log( LOG_DEBUG, "image: thumbStart_r create in file." );
// collect the output from the filter from this file
char out[64];
sprintf ( out , "%strash/out.%li", g_hostdb.m_dir, id );
char out[364];
snprintf ( out , 363,"%strash/out.%li", g_hostdb.m_dir, id );
unlink ( out );
log( LOG_DEBUG, "image: thumbStart_r create out file." );
@ -695,11 +964,28 @@ void Images::thumbStart_r ( bool amThread ) {
break;
}
sprintf( cmd, scmd, ext, in, out);
long xysize = 250;//100;
// make thumbnail a little bigger for diffbot for widget
if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
// i hope 2500 is big enough!
char cmd[2501];
//sprintf( cmd, scmd, ext, in, out);
char *wdir = g_hostdb.m_dir;
snprintf( cmd, 2500 ,
"LD_LIBRARY_PATH=%s %s/%stopnm %s | "
"LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
"LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s"
, wdir , wdir , ext , in
, wdir , wdir , xysize , xysize
, wdir , wdir , out
);
// Call clone function for the shell to execute command
// This call WILL BLOCK . timeout is 30 seconds.
int err = my_system_r( cmd, 30 ); // m_thmbconvTimeout );
//int err = my_system_r( cmd, 30 ); // m_thmbconvTimeout );
int err = system( cmd ); // m_thmbconvTimeout );
//if( (m_dx != 0) && (m_dy != 0) )
// unlink( in );
@ -736,13 +1022,13 @@ void Images::thumbStart_r ( bool amThread ) {
return;
}
if( m_thumbnailSize > m_imgBufMaxLen ) {
log( "image: Image thumbnail larger than buffer!" );
log( LOG_DEBUG, "\t\t\tFile Read Bytes: %ld", m_thumbnailSize);
log( LOG_DEBUG, "\t\t\tBuf Max Bytes : %ld", m_imgBufMaxLen );
log( LOG_DEBUG, "\t\t\t-----------------------" );
log( LOG_DEBUG, "\t\t\tDiff : %ld",
m_imgBufMaxLen-m_thumbnailSize );
if( m_thumbnailSize > m_imgReplyMaxLen ) {
log(LOG_DEBUG,"image: Image thumbnail larger than buffer!" );
log(LOG_DEBUG,"image: File Read Bytes: %ld", m_thumbnailSize);
log(LOG_DEBUG,"image: Buf Max Bytes : %ld",m_imgReplyMaxLen );
log(LOG_DEBUG,"image: -----------------------" );
log(LOG_DEBUG,"image: Diff : %ld",
m_imgReplyMaxLen-m_thumbnailSize );
return;
}
@ -777,10 +1063,16 @@ void Images::thumbStart_r ( bool amThread ) {
// tell the loop above not to download anymore, we got one
m_thumbnailValid = true;
getImageInfo ( m_imgBuf , m_thumbnailSize , &m_tdx , &m_tdy , NULL );
// MDW: this was m_imgReply
getImageInfo ( m_imgData , m_thumbnailSize , &m_tdx , &m_tdy , NULL );
log( LOG_DEBUG, "image: Thumbnailed size: %li bytes.", m_imgDataSize );
log( LOG_DEBUG, "image: Thumbnaile dx=%li dy=%li.", m_tdx,m_tdy );
// now make the meta data struct
// <imageUrl>\0<width><height><thumbnailData>
log( LOG_DEBUG, "image: Thumbnail size: %li bytes.", m_imgDataSize );
log( LOG_DEBUG, "image: Thumbnail dx=%li dy=%li.", m_tdx,m_tdy );
log( LOG_DEBUG, "image: Thumbnail generated in %lldms.", stop-start );
}
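thumbStart_r() now builds the conversion pipeline with explicit paths and LD_LIBRARY_PATH so the netpbm binaries shipped in the working directory are used, scales into a 250x250 bounding box, and runs the whole thing through system(). The snippet below just reproduces that snprintf to show roughly what the final command looks like; the working directory, pid, and the "jpeg" extension are assumed values, not captured output:
#include <cstdio>
int main() {
    // Illustrative expansion of the command built in thumbStart_r(),
    // assuming a jpeg input, this working dir, and process id 1234.
    const char *wdir = "/var/gigablast/data0/";
    const char *ext  = "jpeg";          // picked from the detected image type
    const char *in   = "/var/gigablast/data0/trash/in.1234";
    const char *out  = "/var/gigablast/data0/trash/out.1234";
    long xysize = 250;                  // thumbnail bounding box
    char cmd[2501];
    snprintf(cmd, 2500,
             "LD_LIBRARY_PATH=%s %s/%stopnm %s | "
             "LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
             "LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s",
             wdir, wdir, ext, in,
             wdir, wdir, xysize, xysize,
             wdir, wdir, out);
    printf("%s\n", cmd);                // jpegtopnm | pnmscale | ppmtojpeg
    return 0;
}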
@ -841,6 +1133,9 @@ void getImageInfo ( char *buf , long bufSize ,
if( bufSize > 25 ) {
*dx=(unsigned long)(*(unsigned long *)&buf[16]);
*dy=(unsigned long)(*(unsigned long *)&buf[20]);
// these are in network order
*dx = ntohl(*dx);
*dy = ntohl(*dy);
}
}
else if( (strPtr = strncasestr( buf, 20, "MM" )) ) {
@ -886,3 +1181,46 @@ void getImageInfo ( char *buf , long bufSize ,
log( LOG_DEBUG, "image: Image Corrupted? No type found in "
"data." );
}
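The getImageInfo() fix above byte-swaps the PNG dimensions with ntohl(): after the 8-byte PNG signature comes the IHDR chunk (4-byte length plus the 4-byte "IHDR" tag), so the big-endian width and height live at file offsets 16 and 20 and must be converted from network order on little-endian hosts. A small worked example of reading them from a buffer:
#include <arpa/inet.h>   // ntohl
#include <cstdio>
#include <cstring>
// Read PNG width/height: signature is 8 bytes, then the IHDR chunk
// (4-byte length, 4-byte "IHDR" tag), so width is at offset 16 and
// height at offset 20, both big-endian.
bool pngDims(const unsigned char *buf, long size, long *dx, long *dy) {
    if (size < 24) return false;
    unsigned int w, h;
    memcpy(&w, buf + 16, 4);
    memcpy(&h, buf + 20, 4);
    *dx = ntohl(w);
    *dy = ntohl(h);
    return true;
}
int main() {
    // Minimal fake header: 8-byte signature + IHDR length/tag + 640x480.
    unsigned char buf[24] = { 0x89,'P','N','G',0x0D,0x0A,0x1A,0x0A,
                              0,0,0,13, 'I','H','D','R',
                              0,0,0x02,0x80,    // width  = 640
                              0,0,0x01,0xE0 };  // height = 480
    long dx, dy;
    if (pngDims(buf, sizeof(buf), &dx, &dy))
        printf("%ldx%ld\n", dx, dy);            // 640x480
    return 0;
}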
// container is maxWidth X maxHeight, so try to fit the thumbnail in there
bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
long maxWidth ,
long maxHeight,
bool printLink ,
long *retNewdx ,
char *style ) {
if ( ! style ) style = "";
// account for scrollbar on the right
//maxSide -= (long)SCROLLBAR_WIDTH;
// avoid distortion.
// if image is wide, use that to scale
if ( m_dx <= 0 ) return true;
if ( m_dy <= 0 ) return true;
float xscale =
(float)maxWidth/
(float)m_dx;
float yscale =
(float)maxHeight/
(float)m_dy;
float min = xscale;
if ( yscale < min ) min = yscale;
long newdx = (long)((float)m_dx * min);
long newdy = (long)((float)m_dy * min);
if ( printLink ) sb->safePrintf("<a href=%s>", getUrl() );
sb->safePrintf("<img width=%li height=%li align=left "
"%s"
"src=\"data:image/"
"jpg;base64,"
, newdx
, newdy
, style
);
// encode image in base 64
sb->base64Encode ( getData(), m_dataSize , 0 ); // 0 niceness
sb->safePrintf("\">");
if ( printLink ) sb->safePrintf ("</a>");
// widget needs to know the width of the thumb for formatting
// the text either on top of the thumb or to the right of it
if ( retNewdx ) *retNewdx = newdx;
return true;
}
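printThumbnailInHtml() avoids distortion by scaling with the smaller of the two per-axis factors before emitting the JPEG inline as a base64 data: URI. For example, fitting a 250x187 thumbnail into a 100x100 box gives xscale = 0.40 and yscale of roughly 0.53, so 0.40 is used and the image is drawn at 100x74. A standalone version of just the fit arithmetic:
#include <cstdio>
// Fit (dx, dy) into (maxW, maxH) preserving aspect ratio, as in
// printThumbnailInHtml(): use the smaller of the per-axis scale factors.
void fitBox(long dx, long dy, long maxW, long maxH, long *outW, long *outH) {
    float xscale = (float)maxW / (float)dx;
    float yscale = (float)maxH / (float)dy;
    float s = xscale < yscale ? xscale : yscale;
    *outW = (long)((float)dx * s);
    *outH = (long)((float)dy * s);
}
int main() {
    long w, h;
    fitBox(250, 187, 100, 100, &w, &h);
    printf("%ldx%ld\n", w, h);   // 100x74
    return 0;
}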

@ -7,9 +7,59 @@
#include "Msg36.h"
#include "Msg13.h"
#include "IndexList.h"
#include "MsgC.h"
#include "SafeBuf.h"
#define MAX_IMAGES 500
// a single serialized thumbnail:
class ThumbnailInfo {
public:
long m_origDX;
long m_origDY;
long m_dx;
long m_dy;
long m_urlSize;
long m_dataSize;
char m_buf[];
char *getUrl() { return m_buf; };
char *getData() { return m_buf + m_urlSize; };
long getDataSize() { return m_dataSize; };
long getSize () { return sizeof(ThumbnailInfo)+m_urlSize+m_dataSize;};
// make sure neither the x or y side is > maxSize
bool printThumbnailInHtml ( SafeBuf *sb ,
long maxWidth,
long maxHeight,
bool printLink ,
long *newdx ,
char *style = NULL ) ;
};
// XmlDoc::ptr_imgData is a ThumbnailArray
class ThumbnailArray {
public:
// 1st byte is format version
char m_version;
// # of thumbs
long m_numThumbnails;
// list of ThumbnailInfos
char m_buf[];
long getNumThumbnails() { return m_numThumbnails;};
ThumbnailInfo *getThumbnailInfo ( long x ) {
if ( x >= m_numThumbnails ) return NULL;
char *p = m_buf;
for ( long i = 0 ; i < m_numThumbnails ; i++ ) {
if ( i == x ) return (ThumbnailInfo *)p;
ThumbnailInfo *ti = (ThumbnailInfo *)p;
p += ti->getSize();
}
return NULL;
};
};
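XmlDoc::ptr_imgData now points at a ThumbnailArray: a one-byte version, a thumbnail count, then ThumbnailInfo records packed back to back, each holding its dimensions followed by a NUL-terminated URL and the raw JPEG bytes. Because every record is variable length, getThumbnailInfo() walks from the start adding getSize() each time. A simplified, self-contained sketch of walking such a packed buffer (Rec is a stand-in with just a size and payload):
#include <cstdio>
#include <cstring>
// Simplified variable-length record: header gives the payload size,
// payload follows immediately, next record starts right after it.
struct Rec {
    long m_dataSize;
    long getSize() { return (long)sizeof(Rec) + m_dataSize; }
    char m_buf[];                        // flexible array member, as in ThumbnailInfo
};
// Walk a packed buffer of num records and return record x -- the same
// pointer arithmetic ThumbnailArray::getThumbnailInfo() uses.
Rec *getRec(char *buf, int num, int x) {
    char *p = buf;
    for (int i = 0; i < num; i++) {
        if (i == x) return (Rec *)p;
        p += ((Rec *)p)->getSize();
    }
    return NULL;
}
int main() {
    alignas(long) char buf[64];
    char *p = buf;
    ((Rec *)p)->m_dataSize = 8;                  // first payload: 8 bytes
    memcpy(((Rec *)p)->m_buf, "abcdefg", 8);     // includes the NUL
    p += ((Rec *)p)->getSize();                  // jump to the next record
    ((Rec *)p)->m_dataSize = 6;
    memcpy(((Rec *)p)->m_buf, "hello", 6);
    Rec *r = getRec(buf, 2, 1);
    printf("%s\n", r ? r->m_buf : "none");       // hello
    return 0;
}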
class Images {
public:
@ -31,7 +81,8 @@ class Images {
void setCandidates ( class Url *pageUrl ,
class Words *words ,
class Xml *xml ,
class Sections *sections );
class Sections *sections ,
class XmlDoc *xd );
// . returns false if blocked, true otherwise
// . sets errno on error
@ -42,25 +93,38 @@ class Images {
long long docId ,
class XmlDoc *xd ,
collnum_t collnum,
char **statusPtr ,
//char **statusPtr ,
long hopCount,
void *state ,
void (*callback)(void *state) );
char *getImageData () { return m_imgData; };
long getImageDataSize() { return m_imgDataSize; };
//char *getImageData () { return m_imgData; };
//long getImageDataSize() { return m_imgDataSize; };
//long getImageType () { return m_imageType; };
SafeBuf m_imageBuf;
bool m_imageBufValid;
long m_phase;
bool gotTermFreq();
bool launchRequests();
void gotTermList();
bool downloadImages();
bool gotImage ( );
bool getImageIp();
bool downloadImage();
bool makeThumb();
//bool gotImage ( );
void thumbStart_r ( bool amThread );
long m_i;
long m_j;
class XmlDoc *m_xd;
// callback information
void *m_state ;
void (* m_callback)(void *state );
@ -69,17 +133,21 @@ class Images {
long m_errno;
long m_hadError;
bool m_stopDownloading;
char **m_statusPtr;
//char **m_statusPtr;
char m_statusBuf[128];
collnum_t m_collnum;
long long m_docId;
IndexList m_list;
long m_latestIp;
MsgC m_msgc;
Url m_imageUrl;
long m_numImages;
long m_imageNodes[MAX_IMAGES];
// termids for doing gbimage:<url> lookups for uniqueness
long m_termIds [MAX_IMAGES];
long long m_termIds [MAX_IMAGES];
// for the msg0 lookup, did we have an error?
long m_errors [MAX_IMAGES];
@ -106,9 +174,9 @@ class Images {
long m_imgType;
// udp slot buffer
char *m_imgBuf;
long m_imgBufLen; // how many bytes the image is
long m_imgBufMaxLen; // allocated for the image
char *m_imgReply;
long m_imgReplyLen; // how many bytes the image is
long m_imgReplyMaxLen; // allocated for the image
long m_dx; // width of image in pixels
long m_dy; // height of image in pixels
bool m_thumbnailValid; // is it a valid thumbnail image
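
The new members above (m_phase, getImageIp(), downloadImage(), makeThumb(), thumbStart_r()) imply the old gotImage() path has been replaced by a resumable, phase-driven download loop. The following is a hypothetical driver, not the real Images::downloadImages(); it only illustrates the "return false if blocked, resume at m_phase later" pattern the declarations suggest:

// hypothetical sketch, NOT the actual implementation
static bool driveImagePhases ( Images *im ) {
	for ( ; im->m_j < im->m_numImages ; im->m_j++ ) {
		// phase 0: resolve the image host's IP (may block)
		if ( im->m_phase == 0 ) {
			if ( ! im->getImageIp() ) return false;
			im->m_phase++;
		}
		// phase 1: download the raw image bytes (may block)
		if ( im->m_phase == 1 ) {
			if ( ! im->downloadImage() ) return false;
			im->m_phase++;
		}
		// phase 2: scale it into a thumbnail (thread; may block)
		if ( im->m_phase == 2 ) {
			if ( ! im->makeThumb() ) return false;
			im->m_phase = 0;
		}
	}
	return true;
}
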

@ -33,7 +33,7 @@
//#define INDEXDB_SPLIT 8
//#define DOCID_OFFSET_MASK (INDEXDB_SPLIT-1)
#define DOCID_OFFSET_MASK (g_conf.m_indexdbSplit-1)
#define MAX_INDEXDB_SPLIT 128
#define MAX_SHARDS 128
class Indexdb {

@ -170,6 +170,7 @@ bool Log::shouldLog ( long type , char *msg ) {
if (msg[0]=='d'&&msg[1]=='n' ) return g_conf.m_logDebugDns ;
if (msg[0]=='d'&&msg[1]=='o' ) return g_conf.m_logDebugDownloads;
if (msg[0]=='h'&&msg[1]=='t' ) return g_conf.m_logDebugHttp ;
if (msg[0]=='i'&&msg[1]=='m' ) return g_conf.m_logDebugImage ;
if (msg[0]=='l'&&msg[1]=='o' ) return g_conf.m_logDebugLoop ;
if (msg[0]=='l'&&msg[1]=='a' ) return g_conf.m_logDebugLang ;
if (msg[0]=='m'&&msg[2]=='m' ) return g_conf.m_logDebugMem ;
@ -302,8 +303,8 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
// back up over spaces
while ( p[-1] == ' ' ) p--;
// end in period or ? or !
if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
*p++ = '.';
//if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
// *p++ = '.';
*p ='\0';
// the total length, not including the \0
long tlen = p - tt;

@ -466,6 +466,22 @@ Msg6a.o:
geo_ip_table.o: geo_ip_table.cpp geo_ip_table.h
$(CC) $(DEFS) -m32 -Wall -pipe -c $*.cpp
install:
# gigablast will copy over the necessary files. it has a list of the
# necessary files and that list changes over time so it is better to let gb
# deal with it.
mkdir -p /var/gigablast/data0/
./gb copyfiles /var/gigablast/data0/
# if user types 'gb' it will use the binary in /var/gigablast/data0/
rm -f /usr/bin/gb
ln -s /var/gigablast/data0/gb /usr/bin/gb
# restart gb automatically when the machine reboots
# (the new way that does not use run-levels anymore)
rm -f /etc/init.d/gb
ln -s /lib/init/upstart-job /etc/init.d/gb
# initctl upstart-job conf file (gb stop|start|reload)
cp init.gb.conf /etc/init/gb.conf
.cpp.o:
$(CC) $(DEFS) $(CPPFLAGS) -c $*.cpp

@ -462,6 +462,10 @@ bool Mem::init ( long long maxMem ) {
log(LOG_INIT,"mem: using electric fence!!!!!!!");
#endif
/*
take this out for now it seems to hang the OS when running
as root
#ifndef TITAN
// if we can't alloc 3gb exit and retry
long long start = gettimeofdayInMilliseconds();
@ -486,6 +490,7 @@ bool Mem::init ( long long maxMem ) {
// return if could not alloc the full 3GB
if ( i < 30 ) return false;
#endif
*/
// reset this, our max mem used over time ever because we don't
// want the mem test we did above to count towards it

@ -31,7 +31,7 @@ void Msg0::constructor ( ) {
m_msg5b = NULL;
//#ifdef SPLIT_INDEXDB
//for ( long i = 0; i < INDEXDB_SPLIT; i++ )
//for ( long i = 0; i < MAX_INDEXDB_SPLIT; i++ )
//for ( long i = 0; i < MAX_SHARDS; i++ )
// m_mcast[i].constructor();
m_mcast.constructor();
m_mcasts = NULL;
@ -726,8 +726,8 @@ void Msg0::gotSplitReply ( ) {
char *xx=NULL;*xx=0;
// get all the split lists
long totalSize = 0;
RdbList lists[MAX_INDEXDB_SPLIT];
RdbList *listPtrs[MAX_INDEXDB_SPLIT];
RdbList lists[MAX_SHARDS];
RdbList *listPtrs[MAX_SHARDS];
for ( long i = 0; i < m_numSplit; i++ ) {
listPtrs[i] = &lists[i];
long replySize;

2
Msg0.h

@ -216,7 +216,7 @@ class Msg0 {
// used for multicasting the request
//#ifdef SPLIT_INDEXDB
//Multicast m_mcast[INDEXDB_SPLIT];
//Multicast m_mcast[MAX_INDEXDB_SPLIT];
//Multicast m_mcast[MAX_SHARDS];
// casting to multiple splits is obsolete, but for PageIndexdb.cpp
// we still need to do it, but we alloc for it
Multicast m_mcast;

@ -192,7 +192,7 @@ class Msg20Request {
char *ptr_affWeights ;
char *ptr_linkee ; // used by Msg25 for getting link text
//char *ptr_coll ;
char *ptr_imgUrl ;
//char *ptr_imgUrl ;
char *ptr_displayMetas ;
// . from here down: automatically set in Msg20Request::serialize()
@ -209,7 +209,7 @@ class Msg20Request {
long size_affWeights ;
long size_linkee ; // size includes terminating \0
//long size_coll ; // size includes terminating \0
long size_imgUrl ;
//long size_imgUrl ;
long size_displayMetas ; // size includes terminating \0
char m_buf[0] ;
@ -428,6 +428,7 @@ public:
char *ptr_tvbuf ; // title vector
char *ptr_gbvecbuf ; // gigabit vector
char *ptr_imgUrl ; // youtube/metacafe vid thumb
char *ptr_imgData ; // for encoded images
//char *ptr_eventEnglishTime ; // "every saturday [[]] jan"
//char *ptr_eventDateIntervals ;
char *ptr_likedbList ;
@ -523,6 +524,7 @@ public:
long size_tvbuf ;
long size_gbvecbuf ;
long size_imgUrl ; // youtube/metacafe vid thumb
long size_imgData;
//long size_eventEnglishTime ;
//long size_eventDateIntervals ;
long size_likedbList ;

@ -23,6 +23,34 @@ Msg22::~Msg22(){
static void gotReplyWrapper22 ( void *state1 , void *state2 ) ;
// . sets m_availDocId or sets g_errno to ENOTFOUND on error
// . calls callback(state) when done
// . returns false if blocked true otherwise
bool Msg22::getAvailDocIdOnly ( Msg22Request *r ,
long long preferredDocId ,
char *coll ,
void *state ,
void (* callback)(void *state) ,
long niceness ) {
return getTitleRec ( r ,
NULL , // url
preferredDocId ,
coll ,
NULL , // **titleRecPtrPtr
NULL , // *titleRecSizePtr
false , // justCheckTfndb
true , // getAvailDocIdOnly
state ,
callback ,
niceness ,
false , // addToCache
0 , // maxCacheAge
9999999 , // timeout
false ); // doLoadBalancing
}
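
A hedged sketch of how a caller might use the new getAvailDocIdOnly() entry point; the state object and callback below are hypothetical, but the argument order matches the wrapper above, and the comment above says the result lands in m_availDocId (assuming that member is readable on the Msg22):

// somewhere in an indexing path that just needs an unused docid
// near a preferred one (e.g. for a spider reply "document")
static void gotAvailDocIdWrapper ( void *state ) {
	Msg22 *m22 = (Msg22 *)state;
	// m_availDocId should now hold a docid that did not collide
	// with an existing titlerec at lookup time
	log ( "build: got available docid %lli", m22->m_availDocId );
}

bool requestAvailDocId ( Msg22 *m22, Msg22Request *req,
			 long long preferredDocId, char *coll ) {
	// returns false if it blocked; the callback fires later.
	// keep "req" and "coll" alive until the callback is called.
	return m22->getAvailDocIdOnly ( req,
					preferredDocId,
					coll,
					m22,                 // state
					gotAvailDocIdWrapper,
					1 );                 // niceness
}
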
// . if url is NULL use the docId to get the titleRec
// . if titleRec is NULL use our own internal m_myTitleRec
// . sets g_errno to ENOTFOUND if TitleRec does not exist for this url/docId
@ -37,6 +65,10 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
char **titleRecPtrPtr ,
long *titleRecSizePtr,
bool justCheckTfndb ,
// when indexing spider replies we just want
// a unique docid... "docId" should be the desired
// one, but we might have to change it.
bool getAvailDocIdOnly ,
void *state ,
void (* callback) (void *state) ,
long niceness ,
@ -45,6 +77,9 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
long timeout ,
bool doLoadBalancing ) {
// sanity
if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
if ( getAvailDocIdOnly && url ) { char *xx=NULL;*xx=0; }
//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
// sanity checks
@ -56,7 +91,7 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
if ( r->m_inUse ) { char *xx=NULL;*xx=0; }
if ( m_outstanding ) { char *xx = NULL;*xx=0; }
// sanity check
if ( ! justCheckTfndb ) {
if ( ! justCheckTfndb && ! getAvailDocIdOnly ) {
if ( ! titleRecPtrPtr ) { char *xx=NULL;*xx=0; }
if ( ! titleRecSizePtr ) { char *xx=NULL;*xx=0; }
}
@ -79,6 +114,7 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
r->m_docId = docId;
r->m_niceness = niceness;
r->m_justCheckTfndb = (bool)justCheckTfndb;
r->m_getAvailDocIdOnly = (bool)getAvailDocIdOnly;
r->m_doLoadBalancing = (bool)doLoadBalancing;
r->m_collnum = g_collectiondb.getCollnum ( coll );
r->m_addToCache = false;
@ -391,6 +427,21 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
st->m_docId1 = r->m_docId;
st->m_docId2 = r->m_docId;
}
// but if we are requesting an available docid, it might be taken
// so try the range
if ( r->m_getAvailDocIdOnly ) {
long long pd = r->m_docId;
long long d1 = g_titledb.getFirstProbableDocId ( pd );
long long d2 = g_titledb.getLastProbableDocId ( pd );
// sanity - bad url with bad subdomain?
if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
// make sure we get a decent sample in titledb then in
// case the docid we wanted is not available
st->m_docId1 = d1;
st->m_docId2 = d2;
}
// . otherwise, url was given, like from Msg15
// . we may get multiple tfndb recs
if ( r->m_url[0] ) {
@ -827,11 +878,18 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
//if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
}
// the probable docid is the PREFERRED docid in this case
if ( r->m_getAvailDocIdOnly ) pd = st->m_r->m_docId;
// . these are both meant to be available docids
// . if ad2 gets exhausted we use ad1
long long ad1 = st->m_docId1;
long long ad2 = pd;
bool docIdWasFound = false;
// scan the titleRecs in the list
for ( ; ! tlist->isExhausted() ; tlist->skipCurrentRecord ( ) ) {
// breathe
@ -844,11 +902,16 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
// skip negative recs, first one should not be negative however
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
// get docid of that guy
// get docid of that titlerec
long long dd = g_titledb.getDocId(k);
if ( r->m_getAvailDocIdOnly ) {
// make sure our available docids are actually available!
if ( dd == ad1 ) ad1++;
if ( dd == ad2 ) ad2++;
}
// if we had a url make sure uh48 matches
if ( r->m_url[0] ) {
else if ( r->m_url[0] ) {
// get it
long long uh48 = g_titledb.getUrlHash48(k);
// sanity check
@ -865,6 +928,9 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
if ( r->m_docId != dd ) continue;
}
// flag that we matched m_docId
docIdWasFound = true;
// ok, if just "checking tfndb" no need to go further
if ( r->m_justCheckTfndb ) {
// send back a good reply (empty means found!)
@ -907,12 +973,16 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
long long ad = ad2;
// but wrap around if we need to
if ( ad == 0LL ) ad = ad1;
// if "docId" was unmatched that should be the preferred available
// docid then...
if ( ! docIdWasFound && r->m_getAvailDocIdOnly && ad != r->m_docId ) {
char *xx=NULL;*xx=0; }
// remember it
st->m_availDocId = ad;
// . ok, return an available docid
if ( r->m_url[0] || r->m_justCheckTfndb ) {
if ( r->m_url[0] || r->m_justCheckTfndb || r->m_getAvailDocIdOnly ) {
// store docid in reply
char *p = st->m_slot->m_tmpBuf;
// send back the available docid

@ -16,6 +16,7 @@ public:
long m_maxCacheAge;
collnum_t m_collnum;
char m_justCheckTfndb :1;
char m_getAvailDocIdOnly:1;
char m_doLoadBalancing :1;
char m_addToCache :1;
char m_inUse :1;
@ -35,6 +36,13 @@ class Msg22 {
static bool registerHandler ( ) ;
bool getAvailDocIdOnly ( class Msg22Request *r ,
long long preferredDocId ,
char *coll ,
void *state ,
void (* callback)(void *state) ,
long niceness ) ;
// . make sure you keep url/coll on your stack cuz we just point to it
// . see the other getTitleRec() description below for more details
// . use a maxCacheAge of 0 to avoid the cache
@ -45,6 +53,7 @@ class Msg22 {
char **titleRecPtrPtr ,
long *titleRecSizePtr ,
bool justCheckTfndb ,
bool getAvailDocIdOnly ,
void *state ,
void (* callback) (void *state ),
long niceness ,

@ -930,7 +930,7 @@ bool Msg3::doneScanning ( ) {
ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s%s. vfd=%li parts=%li. "
"%s: %s/%s. vfd=%li parts=%li. "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->m_dir ,

@ -65,12 +65,12 @@ class Msg36 {
//#else
// char m_reply[8];
//#endif
char m_reply[8*MAX_INDEXDB_SPLIT];
char m_reply[8*MAX_SHARDS];
// for sending the request
//#ifdef SPLIT_INDEXDB
//Multicast m_mcast[INDEXDB_SPLIT];
Multicast m_mcast[1];//MAX_INDEXDB_SPLIT];
Multicast m_mcast[1];//MAX_SHARDS];
long m_numRequests;
long m_numReplies;
long m_errno;

@ -219,7 +219,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
log("query: Query parsing inconsistency for q=%s. "
"langid=%li. Check langids and m_queryExpansion parms "
"which are the only parms that could be different in "
"Query::set2()."
"Query::set2(). You probably have different mysynoyms.txt "
"files on two different hosts! check that!!"
,m_tmpq.m_orig
,(long)m_r->m_language
);

10
Msg39.h

@ -55,7 +55,7 @@ class Msg39Request {
m_doMaxScoreAlgo = true;
m_seoDebug = false;
m_useSeoResultsCache = false;
ptr_readSizes = NULL;
ptr_query = NULL; // in utf8?
ptr_whiteList = NULL;
@ -72,6 +72,10 @@ class Msg39Request {
m_minDocId = -1;
m_maxDocId = -1;
// for widget, to only get results to append to last docid
m_maxSerpScore = 0.0;
m_minSerpDocId = 0LL;
m_makeReply = true;
// . search results knobs
@ -122,6 +126,10 @@ class Msg39Request {
long long m_maxDocId;
bool m_makeReply;
// for widget, to only get results to append to last docid
double m_maxSerpScore;
long long m_minSerpDocId;
// msg3a stuff
long m_timeout; // in seconds
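
m_maxSerpScore and m_minSerpDocId let the widget's "append" mode ask only for results that rank below the last one already shown, so paging stays stable while new documents are still being indexed. A hedged sketch of filling them in (the request object and the lastSerpScore/lastDocId variables are hypothetical placeholders for the score and docid of the last displayed result):

Msg39Request mr;
// ... normal query setup elided ...
// only return results ranking below the last result the widget
// already displayed, so appended pages do not repeat or shift
mr.m_maxSerpScore = lastSerpScore;
mr.m_minSerpDocId = lastDocId;
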

208
Msg3a.cpp

@ -26,17 +26,17 @@ void Msg3a::constructor ( ) {
m_rbuf2.constructor();
// NULLify all the reply buffer ptrs
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_reply[j] = NULL;
m_rbufPtr = NULL;
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].constructor();
m_seoCacheList.constructor();
}
Msg3a::~Msg3a ( ) {
reset();
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].destructor();
m_seoCacheList.freeList();
}
@ -48,12 +48,12 @@ void Msg3a::reset ( ) {
m_siteHashes26 = NULL;
// . NULLify all the reply buffer ptrs
// . have to count DOWN with "i" because of the m_reply[i-1][j] check
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ ) {
for ( long j = 0; j < MAX_SHARDS; j++ ) {
if ( ! m_reply[j] ) continue;
mfree(m_reply[j],m_replyMaxSize[j], "Msg3aR");
m_reply[j] = NULL;
}
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].reset();
// and the buffer that holds the final docids, etc.
if ( m_finalBuf )
@ -89,7 +89,7 @@ static void gotCacheReplyWrapper ( void *state ) {
// . sets g_errno on error
// . "query/coll" should NOT be on the stack in case we block
// . uses Msg36 to retrieve term frequencies for each termId in query
// . sends Msg39 request to get docids from each indexdb split
// . sends Msg39 request to get docids from each indexdb shard
// . merges replies together
// . we print out debug info if debug is true
// . "maxAge"/"addToCache" is talking about the clusterdb cache as well
@ -337,7 +337,7 @@ bool Msg3a::gotCacheReply ( ) {
}
}
// time how long to get each split's docids
// time how long to get each shard's docids
if ( m_debug )
m_startTime = gettimeofdayInMilliseconds();
@ -483,7 +483,7 @@ bool Msg3a::gotCacheReply ( ) {
Multicast *m = &m_mcast[i];
// clear it for transmit
m->reset();
// . send out a msg39 request to each split
// . send out a msg39 request to each shard
// . multicasts to a host in group "groupId"
// . we always block waiting for the reply with a multicast
// . returns false and sets g_errno on error
@ -532,10 +532,10 @@ bool Msg3a::gotCacheReply ( ) {
if ( m_numReplies < m_numHosts ) return false;//indexdbSplit )
// . otherwise, we did not block... error?
// . it must have been an error or just no new lists available!!
// . if we call gotAllSplitReplies() here, and we were called by
// . if we call gotAllShardReplies() here, and we were called by
// mergeLists() we end up calling mergeLists() again... bad. so
// just return true in that case.
//return gotAllSplitReplies();
//return gotAllShardReplies();
return true;
}
@ -553,7 +553,7 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
" err=%s", (long)THIS, THIS->m_numReplies ,
mstrerror(g_errno) );
// if one split times out, ignore it!
// if one shard times out, ignore it!
if ( g_errno == EQUERYTRUNCATED ||
g_errno == EUDPTIMEDOUT )
g_errno = 0;
@ -576,7 +576,7 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
// . sanity check
// . ntpd can screw with our local time and make this negative
if ( delta >= 0 ) {
// count the split
// count the shards
h->m_splitsDone++;
// accumulate the times so we can do an average display
// in PageHosts.cpp.
@ -587,8 +587,8 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
THIS->m_numReplies++;
// bail if still awaiting more replies
if ( THIS->m_numReplies < THIS->m_numHosts ) return;
// return if gotAllSplitReplies() blocked
if ( ! THIS->gotAllSplitReplies( ) ) return;
// return if gotAllShardReplies() blocked
if ( ! THIS->gotAllShardReplies( ) ) return;
// set g_errno i guess so parent knows
if ( THIS->m_errno ) g_errno = THIS->m_errno;
// call callback if we did not block, since we're here. all done.
@ -603,9 +603,9 @@ static void gotSerpdbReplyWrapper ( void *state ) {
THIS->m_callback ( THIS->m_state );
}
bool Msg3a::gotAllSplitReplies ( ) {
bool Msg3a::gotAllShardReplies ( ) {
// if any of the split requests had an error, give up and set m_errno
// if any of the shard requests had an error, give up and set m_errno
// but don't set if for non critical errors like query truncation
if ( m_errno ) {
g_errno = m_errno;
@ -705,23 +705,23 @@ bool Msg3a::gotAllSplitReplies ( ) {
if ( mr->m_nqt != m_q->getNumTerms() ) {
g_errno = EBADREPLY;
m_errno = EBADREPLY;
log("query: msg3a: Split reply qterms=%li != %li.",
log("query: msg3a: Shard reply qterms=%li != %li.",
(long)mr->m_nqt,(long)m_q->getNumTerms() );
return true;
}
// return if split had an error, but not for a non-critical
// return if shard had an error, but not for a non-critical
// error like query truncation
if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
g_errno = mr->m_errno;
m_errno = mr->m_errno;
log("query: msg3a: Split had error: %s",
log("query: msg3a: Shard had error: %s",
mstrerror(g_errno));
return true;
}
// skip down here if reply was already set
//skip:
// add of the total hits from each split, this is how many
// total results the lastest split is estimated to be able to
// add of the total hits from each shard, this is how many
// total results the lastest shard is estimated to be able to
// return
// . THIS should now be exact since we read all termlists
// of posdb...
@ -732,19 +732,19 @@ bool Msg3a::gotAllSplitReplies ( ) {
// cast these for printing out
long long *docIds = (long long *)mr->ptr_docIds;
double *scores = (double *)mr->ptr_scores;
// print out every docid in this split reply
// print out every docid in this shard reply
for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
// print out score_t
logf( LOG_DEBUG,
"query: msg3a: [%lu] %03li) "
"split=%li docId=%012llu domHash=0x%02lx "
"shard=%li docId=%012llu domHash=0x%02lx "
"score=%f" ,
(unsigned long)this ,
j ,
i ,
docIds [j] ,
(long)g_titledb.getDomHash8FromDocId(docIds[j]),
(float)scores[j] );
scores[j] );
}
}
@ -849,13 +849,13 @@ bool Msg3a::mergeLists ( ) {
// shortcut
//long numSplits = m_numHosts;//indexdbSplit;
// . point to the various docids, etc. in each split reply
// . point to the various docids, etc. in each shard reply
// . tcPtr = term count. how many required query terms does the doc
// have? formerly called topExplicits in IndexTable2.cpp
long long *diPtr [MAX_INDEXDB_SPLIT];
double *rsPtr [MAX_INDEXDB_SPLIT];
key_t *ksPtr [MAX_INDEXDB_SPLIT];
long long *diEnd [MAX_INDEXDB_SPLIT];
long long *diPtr [MAX_SHARDS];
double *rsPtr [MAX_SHARDS];
key_t *ksPtr [MAX_SHARDS];
long long *diEnd [MAX_SHARDS];
for ( long j = 0; j < m_numHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
// if we have gbdocid:| in query this could be NULL
@ -953,7 +953,7 @@ bool Msg3a::mergeLists ( ) {
return true;
//
// ***MERGE ALL SPLITS INTO m_docIds[], etc.***
// ***MERGE ALL SHARDS INTO m_docIds[], etc.***
//
// . merge all lists in m_replyDocIds[splitNum]
// . we may be re-called later after m_docsToGet is increased
@ -966,7 +966,7 @@ bool Msg3a::mergeLists ( ) {
//Msg39Reply *mr;
long hslot;
// get the next highest-scoring docids from all split lists
// get the next highest-scoring docids from all shard termlists
for ( long j = 0; j < m_numHosts; j++ ) {
// . skip exhausted lists
// . these both should be NULL if reply was skipped because
@ -1026,82 +1026,84 @@ bool Msg3a::mergeLists ( ) {
// . only add it to the final list if the docid is "unique"
// . BUT since different event ids share the same docid, exception!
if ( hslot < 0 ) {
// always inc this
//m_totalDocCount++;
// only do this if we need more
if ( m_numDocIds < m_docsToGet ) {
// get DocIdScore class for this docid
Msg39Reply *mr = m_reply[maxj];
// point to the array of DocIdScores
DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
long nds = mr->size_scoreInfo/sizeof(DocIdScore);
DocIdScore *dp = NULL;
for ( long i = 0 ; i < nds ; i++ ) {
if ( ds[i].m_docId != *diPtr[maxj] ) continue;
dp = &ds[i];
break;
}
// add the max to the final merged lists
m_docIds [m_numDocIds] = *diPtr[maxj];
if ( hslot >= 0 ) goto skip; // < 0 ) {
// wtf?
if ( ! dp ) {
// this is empty if no scoring info
// supplied!
if ( m_r->m_getDocIdScoringInfo )
log("msg3a: CRAP! got empty score "
"info for "
"d=%lli",
m_docIds[m_numDocIds]);
//char *xx=NULL; *xx=0; 261561804684
// qry = www.yahoo
}
// point to the single DocIdScore for this docid
m_scoreInfos[m_numDocIds] = dp;
// always inc this
//m_totalDocCount++;
// only do this if we need more
if ( m_numDocIds < m_docsToGet ) {
// get DocIdScore class for this docid
Msg39Reply *mr = m_reply[maxj];
// point to the array of DocIdScores
DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
long nds = mr->size_scoreInfo/sizeof(DocIdScore);
DocIdScore *dp = NULL;
for ( long i = 0 ; i < nds ; i++ ) {
if ( ds[i].m_docId != *diPtr[maxj] ) continue;
dp = &ds[i];
break;
}
// add the max to the final merged lists
m_docIds [m_numDocIds] = *diPtr[maxj];
// reset this just in case
if ( dp ) {
dp->m_singleScores = NULL;
dp->m_pairScores = NULL;
}
// wtf?
if ( ! dp ) {
// this is empty if no scoring info
// supplied!
if ( m_r->m_getDocIdScoringInfo )
log("msg3a: CRAP! got empty score "
"info for "
"d=%lli",
m_docIds[m_numDocIds]);
//char *xx=NULL; *xx=0; 261561804684
// qry = www.yahoo
}
// point to the single DocIdScore for this docid
m_scoreInfos[m_numDocIds] = dp;
// now fix DocIdScore::m_pairScores and m_singleScores
// ptrs so they reference into the
// Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
// like they should. it seems we do not free the
// Msg39Replies so we should be ok referencing them.
if ( dp && dp->m_singlesOffset >= 0 )
dp->m_singleScores =
(SingleScore *)(mr->ptr_singleScoreBuf+
dp->m_singlesOffset) ;
if ( dp && dp->m_pairsOffset >= 0 )
dp->m_pairScores =
(PairScore *)(mr->ptr_pairScoreBuf +
dp->m_pairsOffset );
// reset this just in case
if ( dp ) {
dp->m_singleScores = NULL;
dp->m_pairScores = NULL;
}
// now fix DocIdScore::m_pairScores and m_singleScores
// ptrs so they reference into the
// Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
// like they should. it seems we do not free the
// Msg39Replies so we should be ok referencing them.
if ( dp && dp->m_singlesOffset >= 0 )
dp->m_singleScores =
(SingleScore *)(mr->ptr_singleScoreBuf+
dp->m_singlesOffset) ;
if ( dp && dp->m_pairsOffset >= 0 )
dp->m_pairScores =
(PairScore *)(mr->ptr_pairScoreBuf +
dp->m_pairsOffset );
// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
//m_eventIdBits[m_numDocIds].clear();
// set this for use below
hslot = m_numDocIds;
// point to next available slot to add to
m_numDocIds++;
}
// if it has ALL the required query terms, count it
//if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
// . add it, this should be pre-allocated!
// . returns false and sets g_errno on error
if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
//m_eventIdBits[m_numDocIds].clear();
// set this for use below
hslot = m_numDocIds;
// point to next available slot to add to
m_numDocIds++;
}
// if it has ALL the required query terms, count it
//if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
// . add it, this should be pre-allocated!
// . returns false and sets g_errno on error
if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
skip:
// increment the split pointers from which we took the max
// increment the shard pointers from which we took the max
rsPtr[maxj]++;
diPtr[maxj]++;
ksPtr[maxj]++;
@ -1113,7 +1115,7 @@ bool Msg3a::mergeLists ( ) {
if ( m_debug ) {
// show how long it took
logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li "
"splits in %llu ms. "
"shards in %llu ms. "
,
(unsigned long)this,
m_numDocIds, (long)m_numHosts,
@ -1128,17 +1130,17 @@ bool Msg3a::mergeLists ( ) {
// print out score_t
logf(LOG_DEBUG,"query: msg3a: [%lu] "
"%03li) merged docId=%012llu "
"score=%.01f hosthash=0x%lx",
"score=%f hosthash=0x%lx",
(unsigned long)this,
i,
m_docIds [i] ,
(float)m_scores [i] ,
(double)m_scores [i] ,
sh );
}
}
// if we had a full split, we should have gotten the cluster recs
// from each split already
// from each shard already
memset ( m_clusterLevels , CR_OK , m_numDocIds );
return true;
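
The merge above repeatedly takes the highest-scoring head element across all shard reply arrays and dedups by docid. A stripped-down, self-contained sketch of that pattern (simplified hypothetical types; no site clustering or scoring info):

#include <vector>
#include <set>
#include <cstdint>

struct ShardReply {
	std::vector<int64_t> docIds;   // sorted best-first
	std::vector<double>  scores;   // parallel to docIds
	size_t               next = 0; // head of the unconsumed portion
};

// merge the shard replies into one ranked, deduped docid list
std::vector<int64_t> mergeShards ( std::vector<ShardReply> &shards,
				   size_t docsToGet ) {
	std::vector<int64_t> merged;
	std::set<int64_t>    seen;
	while ( merged.size() < docsToGet ) {
		long best = -1;
		// find the shard whose head element scores highest
		for ( size_t j = 0 ; j < shards.size() ; j++ ) {
			ShardReply &s = shards[j];
			if ( s.next >= s.docIds.size() ) continue;
			if ( best < 0 ||
			     s.scores[s.next] >
			     shards[best].scores[shards[best].next] )
				best = (long)j;
		}
		if ( best < 0 ) break; // all shard lists exhausted
		ShardReply &s = shards[best];
		int64_t d = s.docIds[s.next++];
		// only keep the first occurrence of a docid
		if ( seen.insert ( d ).second ) merged.push_back ( d );
	}
	return merged;
}
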

10
Msg3a.h

@ -80,7 +80,7 @@ public:
return m_numTotalEstimatedHits; };
// called when we got a reply of docIds
bool gotAllSplitReplies ( );
bool gotAllShardReplies ( );
bool gotCacheReply ( );
@ -135,13 +135,13 @@ public:
float m_termFreqWeights[MAX_QUERY_TERMS];
// a multicast class to send the request, one for each split
Multicast m_mcast[MAX_INDEXDB_SPLIT];
Multicast m_mcast[MAX_SHARDS];
// for timing how long things take
long long m_startTime;
// this buffer should be big enough to hold all requests
//char m_request [MAX_MSG39_REQUEST_SIZE * MAX_INDEXDB_SPLIT];
//char m_request [MAX_MSG39_REQUEST_SIZE * MAX_SHARDS];
long m_numReplies;
// . # estimated total hits
@ -157,8 +157,8 @@ public:
SafeBuf m_rbuf2;
// each split gives us a reply
class Msg39Reply *m_reply [MAX_INDEXDB_SPLIT];
long m_replyMaxSize[MAX_INDEXDB_SPLIT];
class Msg39Reply *m_reply [MAX_SHARDS];
long m_replyMaxSize[MAX_SHARDS];
char m_debug;

155
Msg40.cpp

@ -100,6 +100,7 @@ Msg40::Msg40() {
m_sendsIn = 0;
m_printi = 0;
m_numDisplayed = 0;
m_numPrintedSoFar = 0;
m_lastChunk = false;
//m_numGigabitInfos = 0;
}
@ -555,6 +556,9 @@ bool Msg40::getDocIds ( bool recall ) {
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
mr.m_realMaxTop = m_si->m_realMaxTop;
mr.m_minSerpDocId = m_si->m_minSerpDocId;
mr.m_maxSerpScore = m_si->m_maxSerpScore;
// . get the docIds
// . this sets m_msg3a.m_clusterLevels[] for us
//if(! m_msg3a.getDocIds ( &m_r, m_si->m_q, this , gotDocIdsWrapper))
@ -721,7 +725,6 @@ bool Msg40::gotDocIds ( ) {
if ( m_needFirstReplies > 100 ) m_needFirstReplies = 100;
}
// we have received m_numGood contiguous Msg20 replies!
//m_numContiguous = 0;
//m_visibleContiguous = 0;
@ -1591,6 +1594,7 @@ bool Msg40::gotSummary ( ) {
for ( ; m_si && m_si->m_streamResults&&m_printi<m_msg3a.m_numDocIds ;
m_printi++){
// if we are waiting on our previous send to complete... wait..
if ( m_sendsOut > m_sendsIn ) break;
@ -1658,18 +1662,38 @@ bool Msg40::gotSummary ( ) {
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or success
// code
mr->m_contentType != CT_STATUS &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%li (%lu)",m_printi,
mr->m_contentHash32);
log("msg40: dup sum #%li (%lu)(d=%lli)",m_printi,
mr->m_contentHash32,mr->m_docId);
// make it available to be reused
m20->reset();
continue;
}
// static long s_bs = 0;
// if ( (s_bs++ % 5) != 0 ) {
// log("msg40: FAKE dup sum #%li (%lu)(d=%lli)",m_printi,
// mr->m_contentHash32,mr->m_docId);
// // make it available to be reused
// m20->reset();
// continue;
// }
// return true with g_errno set on error
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or success
// code
mr->m_contentType != CT_STATUS &&
! m_dedupTable.addKey ( &mr->m_contentHash32 ) ) {
m_hadPrintError = true;
log("msg40: error adding to dedup table: %s",
@ -1678,19 +1702,25 @@ bool Msg40::gotSummary ( ) {
// assume we show this to the user
m_numDisplayed++;
//log("msg40: numdisplayed=%li",m_numDisplayed);
// do not print it if before the &s=X start position though
if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
log("msg40: hiding #%li (%lu)",
m_printi,mr->m_contentHash32);
log("msg40: hiding #%li (%lu)(d=%lli)",
m_printi,mr->m_contentHash32,mr->m_docId);
m20->reset();
continue;
}
log("msg40: printing #%li (%lu)",m_printi,mr->m_contentHash32);
log("msg40: printing #%li (%lu)(d=%lli)",
m_printi,mr->m_contentHash32,mr->m_docId);
// . ok, we got it, so print it and stream it
// . this might set m_hadPrintError to true
printSearchResult9 ( m_printi );
printSearchResult9 ( m_printi , m_numPrintedSoFar );
m_numPrintedSoFar++;
//log("msg40: printedsofar=%li",m_numPrintedSoFar);
// now free the reply to save memory since we could be
// streaming back 1M+. we call reset below, no need for this.
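
The same content-hash dedup now appears in both the streaming and non-streaming paths with the identical CT_STATUS exemption, so it can be read as one predicate. A hedged restatement (the function name is hypothetical; the member and HashTableX calls are the ones used above):

// true if this reply should be dropped as a duplicate of an earlier
// result. spider-status "documents" (CT_STATUS) are never deduped
// because many of them legitimately share a content hash.
bool isDupResult ( Msg20Reply *mr, HashTableX *dedupTable ) {
	if ( ! mr->m_contentHash32 )          return false;
	if ( mr->m_contentType == CT_STATUS ) return false;
	if ( dedupTable->isInTable ( &mr->m_contentHash32 ) )
		return true;
	// first time we have seen this hash: remember it
	dedupTable->addKey ( &mr->m_contentHash32 );
	return false;
}
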
@ -1705,6 +1735,62 @@ bool Msg40::gotSummary ( ) {
if ( m_si->m_streamResults )
st->m_socket->m_streamingMode = true;
// if streaming results, and too many results were clustered or
// deduped then try to get more by merging the docid lists that
// we already have from the shards. if this still does not provide
// enough docids then we will need to issue a new msg39 request to
// each shard to get even more docids from each shard.
if ( m_si && m_si->m_streamResults &&
// this is coring as well on multi collection federated searches
// so disable that for now too. it is because Msg3a::m_r is
// NULL.
m_numCollsToSearch == 1 &&
// must have no streamed chunk sends out
m_sendsOut == m_sendsIn &&
// if we did not ask for enough docids and they were mostly
// dups so they got deduped, then ask for more.
// m_numDisplayed includes results before the &s=X parm.
// and so does m_docsToGetVisible, so we can compare them.
m_numDisplayed < m_docsToGetVisible &&
// wait for us to have exhausted the docids we have merged
m_printi >= m_msg3a.m_numDocIds &&
// wait for us to have available msg20s to get summaries
m_numReplies == m_numRequests &&
// this is true if we can get more docids from merging
// more of the termlists from the shards together.
// otherwise, we will have to ask each shard for a
// higher number of docids.
m_msg3a.m_moreDocIdsAvail &&
// do not do this if client closed connection
! m_socketHadError ) { //&&
// doesn't work on multi-coll just yet, it cores.
// MAKE it.
//m_numCollsToSearch == 1 ) {
// can it cover us?
long need = m_msg3a.m_docsToGet + 20;
// note it
log("msg40: too many summaries deduped. "
"getting more "
"docids from msg3a merge and getting summaries. "
"%li are visible, need %li. "
"changing docsToGet from %li to %li. "
"numReplies=%li numRequests=%li",
m_numDisplayed,
m_docsToGetVisible,
m_msg3a.m_docsToGet,
need,
m_numReplies,
m_numRequests);
// merge more docids from the shards' termlists
m_msg3a.m_docsToGet = need;
// sanity. the original msg39request must be there
if ( ! m_msg3a.m_r ) { char *xx=NULL;*xx=0; }
// this should increase m_msg3a.m_numDocIds
m_msg3a.mergeLists();
}
// . wrap it up with Next 10 etc.
// . this is in PageResults.cpp
if ( m_si && m_si->m_streamResults && ! m_printedTail &&
@ -1987,6 +2073,11 @@ bool Msg40::gotSummary ( ) {
//long m = oldNumContiguous;
// get it
Msg20Reply *mri = m_msg20[i]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or
// success code
if ( mri->m_contentType == CT_STATUS ) continue;
// never let it be i
//if ( m <= i ) m = i + 1;
// see if any result lower-scoring than #i is a dup of #i
@ -1997,6 +2088,11 @@ bool Msg40::gotSummary ( ) {
if ( *level != CR_OK ) continue;
// get it
Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or
// success code
if ( mrm->m_contentType == CT_STATUS ) continue;
// use gigabit vector to do topic clustering, etc.
long *vi = (long *)mri->ptr_vbuf;
long *vm = (long *)mrm->ptr_vbuf;
@ -5175,7 +5271,7 @@ bool Msg40::addFacts ( HashTableX *queryTable,
// . printSearchResult into "sb"
bool Msg40::printSearchResult9 ( long ix ) {
bool Msg40::printSearchResult9 ( long ix , long numPrintedSoFar ) {
// . we stream results right onto the socket
// . useful for thousands of results... and saving mem
@ -5192,27 +5288,23 @@ bool Msg40::printSearchResult9 ( long ix ) {
// then print each result
// don't display more than docsWanted results
if ( m_numPrinted >= msg40->getDocsWanted() ) return true;
// prints in xml or html
if ( m_numPrinted < msg40->getDocsWanted() ) {
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%li csv",(long)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
// count it
m_numPrinted++;
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%li csv",(long)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
// count it
m_numPrinted++;
return true;
}
@ -5241,6 +5333,8 @@ bool printHttpMime ( State0 *st ) {
ct = "application/json";
if ( si->m_format == FORMAT_XML )
ct = "text/xml";
if ( si->m_format == FORMAT_HTML )
ct = "text/html";
//if ( si->m_format == FORMAT_TEXT )
// ct = "text/plain";
if ( si->m_format == FORMAT_CSV )
@ -5360,6 +5454,10 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
if ( ! ji->getCompoundName ( tmpBuf ) )
return false;
// skip the "html" column, strip that out now
if ( strcmp(tmpBuf.getBufStart(),"html") == 0 )
continue;
// is it new?
long long h64 = hash64n ( tmpBuf.getBufStart() );
if ( nameTable.isInTable ( &h64 ) ) continue;
@ -5492,6 +5590,9 @@ bool Msg40::printJsonItemInCSV ( State0 *st , long ix ) {
// is it new?
long long h64 = hash64n ( tmpBuf.getBufStart() );
// ignore the "html" column
if ( strcmp(tmpBuf.getBufStart(),"html") == 0 ) continue;
long slot = columnTable->getSlot ( &h64 ) ;
// MUST be in there
// get col #

@ -171,7 +171,7 @@ class Msg40 {
long long getDocId ( long i ){return m_msg3a.m_docIds[i]; };
long long *getDocIds( ){return m_msg3a.m_docIds; };
float getScore ( long i ){return m_msg3a.m_scores[i]; };
double getScore ( long i ){return m_msg3a.m_scores[i]; };
class DocIdScore *getScoreInfo(long i){
if ( ! m_msg3a.m_scoreInfos ) return NULL;
return m_msg3a.m_scoreInfos[i];
@ -208,7 +208,7 @@ class Msg40 {
long m_lastHeartbeat;
bool printSearchResult9 ( long ix ) ;
bool printSearchResult9 ( long ix , long numPrintedSoFar ) ;
HashTableX m_columnTable;
bool printCSVHeaderRow ( class SafeBuf *sb );
bool printJsonItemInCSV ( class State0 *st , long ix );
@ -265,6 +265,7 @@ class Msg40 {
long m_sendsIn ;
long m_printi ;
long m_numDisplayed ;
long m_numPrintedSoFar;
long m_socketHadError;

@ -802,7 +802,9 @@ bool Msg5::needsRecall ( ) {
RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
// if collection was deleted from under us, base will be NULL
if ( ! base && ! g_errno ) {
log("msg5: base lost for collnum %li",(long)m_collnum);
log("msg5: base lost for rdbid=%li collnum %li",
(long)m_rdbId,(long)m_collnum);
g_errno = ENOCOLLREC;
return false;
}
// sanity check
@ -1535,7 +1537,9 @@ void Msg5::repairLists_r ( ) {
if ( i < nn && base ) {
long fn = m_msg3.m_fileNums[i];
BigFile *bf = base->getFile ( fn );
log("db: Corrupt filename is %s.",bf->getFilename());
log("db: Corrupt filename is %s in collnum %li."
,bf->getFilename()
,(long)m_collnum);
//key_t sk = m_listPtrs[i]->getStartKey();
//key_t ek = m_listPtrs[i]->getEndKey ();
//log("db: "
@ -1551,10 +1555,10 @@ void Msg5::repairLists_r ( ) {
}
// . remove the bad eggs from the list
// . TODO: support non-fixed data sizes
if ( m_listPtrs[i]->getFixedDataSize() >= 0 )
m_listPtrs[i]->removeBadData_r();
else
m_listPtrs[i]->reset();
//if ( m_listPtrs[i]->getFixedDataSize() >= 0 )
m_listPtrs[i]->removeBadData_r();
//else
//m_listPtrs[i]->reset();
// otherwise we have a patchable error
m_hadCorruption = true;
// don't add a list with errors to cache, please

@ -4,6 +4,10 @@
#include "Pages.h"
#include "Parms.h"
#include "Spider.h"
#include "PageResults.h" // for RESULT_HEIGHT
// widget reload interval in milliseconds
#define DEFAULT_WIDGET_RELOAD 1000
//bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) ;
@ -68,12 +72,18 @@ public:
// . Collectiondb.cpp calls this when any parm flagged with
// PF_REBUILDURLFILTERS is updated
// . it only adds sites via msg4 that are in "siteListArg" but NOT in the
// current CollectionRec::m_siteListBuf
// . updates SpiderColl::m_siteListDomTable to see what doms we can spider
// . updates SpiderColl::m_negSubstringBuf and m_posSubStringBuf to
// see what substrings in urls are disallowed/allowable for spidering
// . this returns false if it blocks
// . returns true and sets g_errno on error
// . uses msg4 to add seeds to spiderdb if necessary
// . uses msg4 to add seeds to spiderdb if necessary if "siteListArg"
// has new urls that are not currently in cr->m_siteListBuf
// . only adds seeds for the shard we are on iff we are responsible for
// the fake firstip!!!
bool updateSiteListTables ( collnum_t collnum ,
// the fake firstip!!! that way only one shard does the add.
bool updateSiteListBuf ( collnum_t collnum ,
bool addSeeds ,
char *siteListArg ) {
@ -402,7 +412,7 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// need to build dom table for pattern matching?
if ( dt->getNumSlotsUsed() == 0 && cr ) {
// do not add seeds, just make siteListDomTable, etc.
updateSiteListTables ( sc->m_collnum ,
updateSiteListBuf ( sc->m_collnum ,
false , // add seeds?
cr->m_siteListBuf.getBufStart() );
}
@ -771,6 +781,461 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
// table to split between widget and stats in left and right panes
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<TABLE id=pane>"
"<TR><TD valign=top>");
}
long savedLen1, savedLen2;
//
// widget
//
// put the widget in here, just sort results by spidered date
//
// the scripts do "infinite" scrolling both up and down.
// but if you are at the top then new results will load above
// you and we try to maintain your current visual state even though
// the scrollbar position will change.
//
if ( fmt == FORMAT_HTML ) {
// save position so we can output the widget code
// so user can embed it into their own web page
savedLen1 = sb.length();
sb.safePrintf("<script type=\"text/javascript\">\n\n");
// if user has the scrollbar at the top
// in the widget we do a search every 15 secs
// to try to load more recent results. we should
// return up to 10 results above your last
// top docid and 10 results below it. that way
// no matter which of the 10 results you were
// viewing, your view should remain unchanged.
sb.safePrintf(
// global var
"var forcing;"
"function widget123_handler_reload() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
// if error or empty reply then do nothing
"if(!this.responseText)return;"
// get the widget container
"var w=document.getElementById(\"widget123\");"
// GET DOCID of first div/searchresult
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"var cd;"
"if ( sd ) cd=sd.firstChild;"
"var fd=0;"
"if(cd) fd=cd.getAttribute('docid');"
// if the searchbox has the focus then do not
// update the content just yet...
"var qb=document.getElementById(\"qbox\");"
"if(qb&&qb==document.activeElement)"
"return;"
// or if not forced and they scrolled down
// don't jerk them back up again
"if(!forcing&&sd&&sd.scrollTop!=0)return;"
// just set the widget content to the reply
"w.innerHTML=this.responseText;"
//
// find that SAME docid in response and see
// how many new results were added above it
//
"var added=0;"
// did we find the docid?
"var found=0;"
// get div again since we updated innerHTML
"sd=document.getElementById("
"\"widget123_scrolldiv\");"
// scan the kids
"var kid=sd.firstChild;"
// begin the while loop to scan the kids
"while (kid) {"
// if div had no docid it might have been a line
// break div, so ignore
"if (!kid.hasAttribute('docid') ) {"
"kid=kid.nextSibling;"
"continue;"
"}"
// set kd to docid of kid
"var kd=kid.getAttribute('docid');"
// stop if we hit our original top docid
"if(kd==fd) {found=1;break;}"
// otherwise count it as a NEW result we got
"added++;"
// advance kid
"kid=kid.nextSibling;"
// end while loop
"}"
//"alert(\"added=\"+added);"
// how many results did we ADD above the
// reported "topdocid" of the widget?
// it should be in the ajax reply from the
// search engine. how many result were above
// the given "topdocid".
//"var ta=document.getElementById(\"topadd\");"
//"var added=0;"
//"if(ta)added=ta.value;"
// if nothing added do nothing
"if (added==0)return;"
// if original top docid not found, i guess we
// added too many new guys to the top of the
// search results, so don't bother scrolling
// just reset to top
"if (!found) return;"
// show that
//"alert(this.responseText);"
// get the div that has the scrollbar
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
// save current scroll pos
"var oldpos=parseInt(sd.scrollTop);"
// note it
//"alert (sd.scrollTop);"
// preserve the relative scroll position so we
// do not jerk around since we might have added
// "added" new results to the top.
"sd.scrollTop += added*%li;"
// try to scroll out new results if we are
// still at the top of the scrollbar and
// there are new results to scroll.
"if(oldpos==0)widget123_scroll();}\n\n"
// for preserving scrollbar position
,(long)RESULT_HEIGHT +2*PADDING
);
// scroll the widget up until we hit the 0 position
sb.safePrintf(
"function widget123_scroll() {"
// only scroll if at the top of the widget
// and not scrolled down so we do not
// interrupt
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
// TODO: need parseInt here?
"var pos=parseInt(sd.scrollTop);"
// note it
//"alert (sd.scrollTop);"
// if already at the top of widget, return
"if(pos==0)return;"
// decrement by 3 pixels
"pos=pos-3;"
// do not go negative
"if(pos<0)pos=0;"
// assign to scroll up. TODO: need +\"px\"; ?
"sd.scrollTop=pos;"
// all done, then return
"if(pos==0) return;"
// otherwise, scroll more in 3ms
// TODO: make this 1000ms on result boundaries
// so it delays on each new result. perhaps make
// it less than 1000ms if we have a lot of
// results above us!
"setTimeout('widget123_scroll()',3);}\n\n"
);
// this function appends the search results to what is
// already in the widget.
sb.safePrintf(
"function widget123_handler_append() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
// i guess we are done... release the lock
"outstanding=0;"
// if error or empty reply then do nothing
"if(!this.responseText)return;"
// if too small
"if(this.responseText.length<=3)return;"
// get the widget container
"var w=document.getElementById("
"\"widget123_scrolldiv\");"
// just set the widget content to the reply
"w.innerHTML+=this.responseText;"
"}\n\n"
);
//sb.safePrintf ( "</script>\n\n" );
long widgetWidth = 300;
long widgetHeight = 500;
// make the ajax url that gets the search results
SafeBuf ub;
ub.safePrintf("/search"
//"format=ajax"
"?c=%s"
//"&prepend=gbsortbyint%%3Agbspiderdate"
"&q=-gbstatus:0+gbsortbyint%%3Agbspiderdate"
"&sc=0" // no site clustering
"&dr=0" // no deduping
// 10 results at a time
"&n=10"
"&widgetheight=%li"
"&widgetwidth=%li"
, cr->m_coll
, widgetHeight
, widgetWidth
);
//ub.safePrintf("&topdocid="
// );
// get the search results from neo as soon as this div is
// being rendered, and set its contents to them
sb.safePrintf(//"<script type=text/javascript>"
"function widget123_reload(force) {"
// when the user submits a new query in the
// query box we set force to false when
// we call this (see PageResults.cpp) so that
// we do not register multiple timeouts
"if ( ! force ) "
"setTimeout('widget123_reload(0)',%li);"
// get the query box
"var qb=document.getElementById(\"qbox\");"
// if forced then turn off focus for searchbox
// since it was either 1) the initial call
// or 2) someone submitted a query and
// we got called from PageResults.cpp
// onsubmit event.
"if (force&&qb) qb.blur();"
// if the searchbox has the focus then do not
// reload!! unless force is true..
"if(qb&&qb==document.activeElement&&!force)"
"return;"
//"var ee=document.getElementById(\"sbox\");"
//"if (ee)alert('reloading '+ee.style.display);"
// do not do timer reload if searchbox is
// visible because we do not want to interrupt
// a possible search
//"if(!force&&ee && ee.style.display=='')return;"
// do not bother timed reloading if scrollbar pos
// not at top or near bottom
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"if ( sd && !force ) {"
"var pos=parseInt(sd.scrollTop);"
"if (pos!=0) return;"
"}"
"var client=new XMLHttpRequest();"
"client.onreadystatechange="
"widget123_handler_reload;"
// . this url gets the search results
// . get them in "ajax" format so we can embed
// them into the base html as a widget
"var u='%s&format=ajax';"
// append our query from query box if there
"var qv;"
"if (qb) qv=qb.value;"
"if (qv){"
//"u+='&q=';"
"u+='&prepend=';"
"u+=encodeURI(qv);"
"}"
// set global var so handler knows if we were
// forced or not
"forcing=force;"
// get the docid at the top of the widget
// so we can get SURROUNDING search results,
// like 10 before it and 10 after it for
// our infinite scrolling
//"var td=document.getElementById('topdocid');"
//"if ( td ) u=u+\"&topdocid=\"+td.value;"
//"alert('reloading');"
"client.open('GET',u);"
"client.send();"
"}\n\n"
// when page loads, populate the widget immed.
"widget123_reload(1);\n\n"
// initiate the timer loop since it was
// not initiated on that call since we had to
// set force=1 to load in case the query box
// was currently visible.
"setTimeout('widget123_reload(0)',%li);"
//, widgetHeight
, (long)DEFAULT_WIDGET_RELOAD
, ub.getBufStart()
, (long)DEFAULT_WIDGET_RELOAD
);
//
// . call this when scrollbar gets 5 up from bottom
// . but if < 10 new results are appended, then stop!
//
sb.safePrintf(
"var outstanding=0;\n\n"
"function widget123_append() {"
// bail if already outstanding
"if (outstanding) return;"
// if scrollbar not near bottom, then return
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"if ( sd ) {"
"var pos=parseInt(sd.scrollTop);"
"if (pos < (sd.scrollHeight-%li)) "
"return;"
"}"
// . this url gets the search results
// . just get them so we can APPEND them to
// the widget, so it will be just the
// "results" divs
"var u='%s&format=append';"
// . get score of the last docid in our widget
// . it should be persistent.
// . it is like a bookmark for scrolling
// . append results AFTER it into the widget
// . this way we can deal with the fact that
// we may be adding 100s of results to this
// query per second, especially if spidering
// at a high rate. and this will keep the
// results we append persistent.
// . now we scan the children "search result"
// divs of the "widget123_scrolldiv" div
// container to get the last child and get
// its score/docid so we can re-do the search
// and just get the search results with
// a score/docid LESS THAN that. THEN our
// results should be contiguous.
// . get the container div, "cd"
"var cd=document.getElementById("
"'widget123_scrolldiv');"
// must be there
"if(!cd)return;"
// get the last child div in there
"var d=cd.lastChild.previousSibling;"
// must be there
"if(!d)return;"
// get docid/score
"u=u+\"&maxserpscore=\"+d.getAttribute('score');"
"u=u+\"&minserpdocid=\"+d.getAttribute('docid');"
// append our query from query box if there
"var qb=document.getElementById(\"qbox\");"
"var qv;"
"if (qb) qv=qb.value;"
"if (qv){"
//"u+='&q=';"
"u+='&prepend=';"
"u+=encodeURI(qv);"
"}"
// turn on the lock to prevent excessive calls
"outstanding=1;"
//"alert(\"scrolling2 u=\"+u);"
"var client=new XMLHttpRequest();"
"client.onreadystatechange="
"widget123_handler_append;"
//"alert('appending scrollTop='+sd.scrollTop+' scrollHeight='+sd.scrollHeight+' 5results=%li'+u);"
"client.open('GET',u);"
"client.send();"
"}\n\n"
"</script>\n\n"
// if (pos < (sd.scrollHeight-%li)) return...
// once user scrolls down to within last 5
// results then try to append to the results.
, widgetHeight +5*((long)RESULT_HEIGHT+2*PADDING)
, ub.getBufStart()
//,widgetHeight +5*((long)RESULT_HEIGHT+2*PADDING
);
// then the WIDGET MASTER div. set the "id" so that the
// style tag the user sets can control its appearance.
// when the browser loads this the ajax sets the contents
// to the reply from neo.
// on scroll call widget123_append() which will append
// more search results if we are near the bottom of the
// widget.
sb.safePrintf("<div id=widget123 "
"style=\"border:2px solid black;"
"position:relative;border-radius:10px;"
"width:%lipx;height:%lipx;\">"
, widgetWidth
, widgetHeight
);
//sb.safePrintf("<style>"
// "a{color:white;}"
// "</style>");
sb.safePrintf("Waiting for Server...");
// end the containing div
sb.safePrintf("</div>");
savedLen2 = sb.length();
}
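
For reference, with a hypothetical collection named "mycoll" and the 300x500 defaults above, the ajax URL the widget polls would look roughly like:

/search?c=mycoll&q=-gbstatus:0+gbsortbyint%3Agbspiderdate&sc=0&dr=0&n=10&widgetheight=500&widgetwidth=300&format=ajax

with &prepend=<query> appended when the user has typed something into the widget's search box, and &maxserpscore=/&minserpdocid= added by the append path so newly fetched results continue below the last one already shown.
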
// the right table pane is the crawl stats
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("</TD><TD valign=top>");
}
//
// show stats
@ -797,10 +1262,10 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
hurts = "Yes";
sb.safePrintf("<TABLE border=0>"
"<TR><TD valign=top>"
sb.safePrintf(//"<TABLE border=0>"
//"<TR><TD valign=top>"
"<table border=0 cellpadding=5>"
"<table id=stats border=0 cellpadding=5>"
"<tr>"
"<td><b>Crawl Status Code:</td>"
@ -830,8 +1295,8 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//"</tr>"
"<tr>"
"<td><b>URLs Harvested</b> "
"(may include dups)</td>"
"<td><b><nobr>URLs Harvested</b> "
"(may include dups)</nobr></td>"
"<td>%lli</td>"
"</tr>"
@ -863,8 +1328,83 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
char tmp3[64];
struct tm *timeStruct;
timeStruct = localtime((time_t *)&cr->m_diffbotCrawlStartTime);
// Jan 01 1970 at 10:30:00
strftime ( tmp3,64 , "%b %d %Y at %H:%M:%S",timeStruct);
sb.safePrintf("<tr><td><b>Collection Created</b></td>"
"<td>%s (local time)</td></tr>",tmp3);
// print link to embed the code in their own site
SafeBuf embed;
embed.htmlEncode(sb.getBufStart()+savedLen1,
savedLen2-savedLen1,
false); // encodePoundSign #?
// convert all ''s to "'s for php's echo ''; cmd
embed.replaceChar('\'','\"');
sb.safePrintf("<tr>"
"<td valign=top>"
"<a onclick=\""
"var dd=document.getElementById('hcode');"
"if ( dd.style.display=='none' ) "
"dd.style.display=''; "
"else "
"dd.style.display='none';"
"\" style=color:blue;>"
"<u>"
"show Widget HTML code"
"</u>"
"</a>"
"</td><td>"
"<div id=hcode style=display:none;"
"max-width:800px;>"
"%s"
"</div>"
"</td></tr>"
, embed.getBufStart() );
sb.safePrintf("<tr>"
"<td valign=top>"
"<a onclick=\""
"var dd=document.getElementById('pcode');"
"if ( dd.style.display=='none' ) "
"dd.style.display=''; "
"else "
"dd.style.display='none';"
"\" style=color:blue;>"
"<u>"
"show Widget PHP code"
"</u>"
"</a>"
"</td>"
"<td>"
"<div id=pcode style=display:none;"
"max-width:800px;>"
"<i>"
"echo '"
"%s"
"';"
"</i>"
"</div>"
"</td></tr>"
, embed.getBufStart() );
sb.safePrintf("</table>\n\n");
}
// end the right table pane
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("</TD></TR></TABLE>");
}
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );

@ -2355,10 +2355,13 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//nomen = "job";
}
sb->safePrintf("\n\n{"
"\"name\":\"%s\",\n"
"\"type\":\"%s\",\n"
"\"jobCreationTimeUTC\":%li,\n"
"\"jobCompletionTimeUTC\":%li,\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"jobStatus\":{" // nomen = jobStatus / crawlStatus
@ -2384,6 +2387,11 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
, crawlTypeStr
, cx->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cx->m_diffbotCrawlEndTime
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
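
With the two new fields above, the opening of the JSON object emitted by printCrawlDetailsInJson() would look roughly like this (values are hypothetical; jobCompletionTimeUTC stays 0 while the crawl is still running):

{
"name":"mycrawl",
"type":"crawl",
"jobCreationTimeUTC":1400000000,
"jobCompletionTimeUTC":0,
"jobStatus":{ ... }
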

@ -245,19 +245,24 @@ bool processLoop ( void *state ) {
//xd->set3 ( st->m_docId , st->m_coll , 0 );
// callback
xd->setCallback ( state , processLoop );
// and tell it to load from the old title rec
// . and tell it to load from the old title rec
// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
// . this sets xd->ptr_* and all other member vars from
// the old title rec if found in titledb.
if ( ! xd->loadFromOldTitleRec ( ) ) return false;
}
if ( g_errno ) return sendErrorReply ( st , g_errno );
// now force it to load old title rec
char **tr = xd->getTitleRec();
//char **tr = xd->getTitleRec();
SafeBuf *tr = xd->getTitleRecBuf();
// blocked? return false if so. it will call processLoop() when it rets
if ( tr == (void *)-1 ) return false;
// we did not block. check for error? this will free "st" too.
if ( ! tr ) return sendErrorReply ( st , g_errno );
// if title rec was empty, that is a problem
if ( xd->m_titleRecSize == 0 ) return sendErrorReply ( st , ENOTFOUND);
if ( xd->m_titleRecBuf.length() == 0 )
return sendErrorReply ( st , ENOTFOUND);
// set callback
char *na = xd->getIsNoArchive();

@ -400,26 +400,40 @@ bool sendPageParser2 ( TcpSocket *s ,
"<td>"
"<input type=text name=\"q\" size=\"20\" value=\"\"> "
"</td>"
"</tr>"
"</tr>",
TABLE_STYLE,
us ,
dd,
rr,
render
);
xbuf->safePrintf(
"<tr class=poo>"
"<td>"
"<b>content below is xml</b>"
"<b>content type below is</b>"
"<br><font size=-2>"
"Is the content below XML?"
"Is the content below HTML? XML? JSON?"
"</font>"
"</td>"
"<td>"
"<input type=checkbox name=xml value=1> "
//"<input type=checkbox name=xml value=1> "
"<select name=ctype>\n"
"<option value=%li selected>HTML</option>\n"
"<option value=%li selected>XML</option>\n"
"<option value=%li selected>JSON</option>\n"
"</select>\n"
"</td>"
"</tr>"
"</tr>",
(long)CT_HTML,
(long)CT_XML,
(long)CT_JSON
);
xbuf->safePrintf(
"<tr class=poo>"
"<td><b>content</b>"
@ -440,15 +454,6 @@ bool sendPageParser2 ( TcpSocket *s ,
"</form>"
"<br>",
TABLE_STYLE,
us ,
//(long)st->m_hopCount,
//rtu,
dd,
//artr ,
rr,
//rr2,
render ,
//oips ,
contentParm );
@ -807,8 +812,9 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
// ensure null
if ( contentLen == 0 ) content = NULL;
uint8_t contentType = CT_HTML;
//uint8_t contentType = CT_HTML;
//if ( isXml ) contentType = CT_XML;
long ctype = r->getLong("ctype",CT_HTML);
// . use the enormous power of our new XmlDoc class
// . this returns false if blocked
@ -821,7 +827,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
content ,
false, // deletefromindex
0, // forced ip
contentType ))
ctype ))
// return error reply if g_errno is set
return sendErrorReply ( st , g_errno );
// make this our callback in case something blocks
@ -900,16 +906,15 @@ bool gotXmlDoc ( void *state ) {
}
long isXml = st->m_r.getLong("xml",0);
char ctype = CT_HTML;
if ( isXml ) ctype = CT_XML;
char ctype2 = CT_HTML;
if ( isXml ) ctype2 = CT_XML;
// now encapsulate it in html head/tail and send it off
bool status = g_httpServer.sendDynamicPage( st->m_s ,
xbuf->getBufStart(),
xbuf->length() ,
-1, //cachtime
false ,//postreply?
&ctype,
&ctype2,
-1 , //httpstatus
NULL,//cookie
"utf-8");

@ -249,6 +249,58 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// get the dmoz catid if given
//long searchingDmoz = hr->getLong("dmoz",0);
//
// DO WE NEED TO ALTER cr->m_siteListBuf for a widget?
//
// when a wordpress user changes the "Websites to Include" for
// her widget, it should send a /search?sites=xyz.com&wpid=xxx
// request here...
// so we need to remove her old sites and add in her new ones.
//
/*
MDW TURN BACK ON IN A DAY. do indexing or err pages first.
// get wordpressid supplied with all widget requests
char *wpid = hr->getString("wpid");
// we have to add set &spidersites=1 which all widgets should do
if ( wpid ) {
// this returns NULL if cr->m_siteListBuf would be unchanged
// because we already have the whiteListBuf sites in there
// for this wordPressId (wpid)
SafeBuf newSiteListBuf;
makeNewSiteList( &si->m_whiteListBuf,
cr->m_siteListBuf ,
wpid ,
&newSiteListBuf);
// . update the list of sites to crawl/search & show in widget
// . if they give an empty list then allow that, stops crawling
SafeBuf parmList;
g_parms.addNewParmToList1 ( &parmList,
cr->m_collnum,
newSiteListBuf,
0,
"sitelist");
// send the parms to all hosts in the network
g_parms.broadcastParmList ( &parmList ,
NULL,//s,// state is socket i guess
NULL);//doneBroadcastingParms2 );
// nothing left to do now
return g_httpServer.sendDynamicPage(s,
"OK",//sb.getBufStart(),
2,//sb.length(),
cacheTime,//0,
false, // POST?
"text/html",
200, // httpstatus
NULL, // cookie
"UTF-8"); // charset
}
*/
//
// . send back page frame with the ajax call to get the real
// search results. do not do this if a "&dir=" (dmoz category)
@ -404,7 +456,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
"Copyright &copy; 2014. "
"All Rights Reserved.<br/>"
"Powered by the "
"<a href='http://www.gigablast.com/'>"
"<a href=\"http://www.gigablast.com/\">"
"GigaBlast</a> open source search engine."
"</font>"
"</center>\n"
@ -718,11 +770,6 @@ void freeMsg4Wrapper( void *st ) {
delete stau;
}
// height of each result div in the widget
#define RESULT_HEIGHT 120
#define SERP_SPACER 1
#define PADDING 8
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
@ -909,45 +956,61 @@ bool gotResults ( void *state ) {
//
long numResults = msg40->getNumResults();
// if user is doing ajax widget we need to know the current docid
// that is listed at the top of their widget display so we can
// hide the new docids above that and scroll them down slowly.
long numResults = msg40->getNumResults();
long topDocIdPos = -1;
/*
//long topDocIdPos = -1;
bool hasInvisibleResults = false;
long numInvisible = 0;
//long numInvisible = 0;
long numAbove = 0;
HttpRequest *hr = &st->m_hr;
long long oldTop = 0LL;
long long lastDocId = 0LL;
double lastSerpScore = 0.0;
if ( si->m_format == FORMAT_WIDGET_AJAX ) {
// sanity, no stream mode here, it won't work
if ( si->m_streamResults )
log("results: do not use stream=1 for widget");
// get current top docid
long long topDocId = hr->getLongLong("topdocid",0LL);
// DEBUG: force it on for now
//topDocId = 4961990748LL;
// scan results
// scan results. this does not support &stream=1 streaming
// mode. it doesn't make sense that it needs to.
for ( long i = 0 ; i < numResults ; i++ ) {
// skip if already invisible
if ( msg40->m_msg3a.m_clusterLevels[i] != CR_OK )
continue;
// get it
Msg20 *m20 ;
if ( si->m_streamResults )
m20 = msg40->getCompletedSummary(i);
else
m20 = msg40->m_msg20[i];
Msg20 *m20 = msg40->m_msg20[i];
if ( ! m20 ) continue;
// checkdocid
Msg20Reply *mr = m20->m_r;
if ( ! mr ) continue;
// save this
lastDocId = mr->m_docId;
lastSerpScore = msg40->m_msg3a.m_scores[i];
// set "oldTop" to first docid we encounter
if ( ! oldTop ) oldTop = mr->m_docId;
// stop if no topdocid otherwise. oldTop is now set
if ( topDocId == 0 ) break;
if ( ! topDocId ) continue; // == 0 ) break;
if ( mr->m_docId != topDocId ) {
hasInvisibleResults = true;
numInvisible++;
// count # of docids above top docid
numAbove++;
continue;
}
topDocIdPos = i;
break;
// we match it, so set this if not already set
//if ( topDocIdPos != -1 ) topDocIdPos = i;
//break;
}
}
*/
SafeBuf *sb = &st->m_sb;
@ -992,20 +1055,48 @@ bool gotResults ( void *state ) {
// propagate "topdocid" so when he does another query every 30 secs
// or so we know what docid was on top for scrolling purposes
if ( si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("<input type=hidden "
"id=topdocid name=topdocid value=%lli>\n",
oldTop);
//if ( si->m_format == FORMAT_WIDGET_AJAX )
// sb->safePrintf("<input type=hidden "
// "id=topdocid name=topdocid value=%lli>\n",
// oldTop);
// report how many results we added above the topdocid provided, if any
// so widget can scroll down automatically
//if ( si->m_format == FORMAT_WIDGET_AJAX && numAbove )
// sb->safePrintf("<input type=hidden "
// "id=topadd name=topadd value=%li>\n",numAbove);
// we often can add 100s of things to the widget's result set per
// second especially when sorting by last spidered time and spidering
// a lot. setting the maxserpscore to the serp score of the last result
// allows us to append new search results to what we have in a
// consistent manner.
// if ( si->m_format == FORMAT_WIDGET_AJAX ) {
// // let's make this ascii encoded crap
// sb->safePrintf("<input type=hidden "
// "id=maxserpscore "
// "value=%f>\n",
// lastSerpScore);
// // let's make this ascii encoded crap
// sb->safePrintf("<input type=hidden "
// "id=maxserpdocid "
// "value=%lli>\n",
// lastDocId);
// }
// then print each result
// don't display more than docsWanted results
long count = msg40->getDocsWanted();
bool hadPrintError = false;
long numPrintedSoFar = 0;
//long widgetHeight = hr->getLong("widgetheight",400);
//long widgetwidth = hr->getLong("widgetwidth",250);
for ( long i = 0 ; count > 0 && i < numResults ; i++ ) {
/*
if ( hasInvisibleResults ) {
//
// MAKE THESE RESULTS INVISIBLE!
@ -1037,14 +1128,14 @@ bool gotResults ( void *state ) {
"position:absolute;>"
);
}
*/
//////////
//
// prints in xml or html
//
//////////
if ( ! printResult ( st , i ) ) {
if ( ! printResult ( st , i , numPrintedSoFar++ ) ) {
hadPrintError = true;
break;
}
@ -1066,7 +1157,7 @@ bool gotResults ( void *state ) {
// if we split the serps into 2 divs for scrolling purposes
// then close up the 2nd one
if ( hasInvisibleResults ) sb->safePrintf("</div>");
//if ( hasInvisibleResults ) sb->safePrintf("</div>");
// END SERP DIV
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
@ -1111,9 +1202,7 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("<body>");
}
if ( ! g_conf.m_isMattWells &&
(si->m_format==FORMAT_WIDGET_IFRAME ||
si->m_format==FORMAT_WIDGET_AJAX) ) {
if ( ! g_conf.m_isMattWells && si->m_format==FORMAT_WIDGET_IFRAME ) {
printCSSHead ( sb ,si->m_format );
sb->safePrintf("<body style=padding:0px;margin:0px;>");
}
@ -1155,26 +1244,43 @@ bool printSearchResultsHeader ( State0 *st ) {
// put image in this div which will have top:0px JUST like
// the div holding the search results we print out below
// so that the image does not scroll when you use the
// scrollbar.
sb->safePrintf("<div style=\"position:absolute;"
// scrollbar. holds the magnifying glass img and searchbox.
sb->safePrintf("<div class=magglassdiv "
"style=\"position:absolute;"
"right:15px;"
"z-index:10;"
"top:0px;\">");
long refresh = hr->getLong("refresh",15);
//long refresh = hr->getLong("refresh",15);
char *oq = hr->getString("q",NULL);
if ( ! oq ) oq = "";
char *prepend = hr->getString("prepend");
if ( ! prepend ) prepend = "";
char *displayStr = "none";
if ( prepend && prepend[0] ) displayStr = "";
sb->safePrintf("<form method=get action=/search>");
// to do a search we need to re-call the ajax,
// just call reload like the one that is called every 15s or so
sb->safePrintf("<form "//method=get action=/search "
// use "1" as arg to force reload
"onsubmit=\"widget123_reload(1);"
// let user know we are loading
"var w=document.getElementById("
"'widget123_scrolldiv');"
// just set the widget content to the reply
"if (w) "
"w.innerHTML='<br><br><b>Loading Results..."
"</b>';"
// prevent it from actually submitting
"return false;\">");
sb->safePrintf("<img "
"style=\""
//"position:absolute;" // absolute or relative?
// put it on TOP of the other stuff
"z-index:10;"
"margin-top:3px;"
//"right:10px;"
//"right:2px;"
//"width:%lipx;"
@ -1185,36 +1291,46 @@ bool printSearchResultsHeader ( State0 *st ) {
"var e=document.getElementById('sbox');"
"if(e.style.display == 'none') {"
"e.style.display = '';"
// give it focus
"var qb=document.getElementById('qbox');"
"qb.focus();"
"} else {"
"e.style.display = 'none';"
"}"
"\" " // end function
" "
"width=25 "
"height=25 "
"src=\"http://etc-mysitemyway.s3.amazonaws.com/icons/legacy-previews/icons/simple-black-square-icons-business/126715-simple-black-square-icon-business-magnifying-glass-ps.png\">"
"width=35 "
"height=31 "
"src=\"/magglass.png\">"
);
sb->safePrintf("<div id=sbox style=float:left;display:%s;>"
"<input type=text name=prepend size=%li "
"value=\"%s\" style=\"z-index:10;"
//char *origq = hr->getString("q");
// we sort all results by spider date now so PREPEND
// the actual user query
char *origq = hr->getString("prepend");
if ( ! origq ) origq = "";
sb->safePrintf("<div id=sbox style=\"float:left;"
"display:%s;"
"opacity:0.83;"
//"background-color:gray;"
//"padding:5px;"
"\">"
// the box that holds the query
"<input type=text id=qbox name=qbox "
"size=%li " //name=prepend "
"value=\"%s\" "
"style=\"z-index:10;"
"font-weight:bold;"
"font-size:18px;"
"border:4px solid black;"
"margin:3px;"
"\">"
// hidden parms like collection
"<input name=c type=hidden value=\"%s\">"
"<input name=format type=hidden value=widget>"
"<input name=widgetwidth type=hidden value=%li>"
"<input name=refresh type=hidden value=%li>"
"<input name=q type=hidden value=\"%s\">"
"</div>"
"</form>\n"
, displayStr
, widgetwidth / 15
, prepend
, coll
, widgetwidth
, refresh
, oq
, widgetwidth / 23
, origq
);
sb->safePrintf("</div>"
"</form>\n"
);
// . BEGIN SERP DIV
@ -1222,9 +1338,12 @@ bool printSearchResultsHeader ( State0 *st ) {
// . this will have the scrollbar to just scroll the serps
// and not the magnifying glass
sb->safePrintf("</div>"
"<div style=\"position:absolute;"
"<div id=widget123_scrolldiv "
"onscroll=widget123_append(); "
"style=\"position:absolute;"
"top:0px;"
"overflow-y:auto;"
"overflow-x:hidden;"
"width:%lipx;"
"height:%lipx;\">"
, widgetwidth
@ -1492,7 +1611,8 @@ bool printSearchResultsHeader ( State0 *st ) {
else if ( numResults == 0 &&
( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) ) {
sb->safePrintf ( "No results found.");
sb->safePrintf ( "No results found. Wait for spider to "
"kick in.");
}
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
if ( isAdmin && si->m_docsToScanForReranking > 1 )
@ -1927,12 +2047,13 @@ bool printSearchResultsTail ( State0 *st ) {
// carry over the sites we are restricting the search results to
if ( si->m_whiteListBuf.length() )
args.safePrintf("&sites=%s",si->m_whiteListBuf.getBufStart());
if ( firstNum > 0 &&
(si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_AJAX ||
si->m_format == FORMAT_WIDGET_IFRAME ) ) {
si->m_format == FORMAT_WIDGET_IFRAME //||
//si->m_format == FORMAT_WIDGET_AJAX
) ) {
long ss = firstNum - msg40->getDocsWanted();
sb->safePrintf("<a href=\"/search?s=%li&q=",ss);
// our current query parameters
@ -1949,8 +2070,9 @@ bool printSearchResultsTail ( State0 *st ) {
// now print "Next X Results"
if ( msg40->moreResultsFollow() &&
(si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX )) {
si->m_format == FORMAT_WIDGET_IFRAME
//si->m_format == FORMAT_WIDGET_AJAX
)) {
long ss = firstNum + msg40->getDocsWanted();
// print a separator first if we had a prev results before us
if ( sb->length() > remember ) sb->safePrintf ( " &nbsp; " );
@ -2044,8 +2166,8 @@ bool printSearchResultsTail ( State0 *st ) {
"<font color=gray>"
"Copyright &copy; 2014. All Rights "
"Reserved.<br/>"
"Powered by the <a href='https://www."
"gigablast.com/'>GigaBlast</a> open source "
"Powered by the <a href=\"http://www."
"gigablast.com/\">GigaBlast</a> open source "
"search engine."
"</font>"
"</center>\n"
@ -2359,7 +2481,7 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
// use this for xml as well as html
bool printResult ( State0 *st, long ix ) {
bool printResult ( State0 *st, long ix , long numPrintedSoFar ) {
SafeBuf *sb = &st->m_sb;
@ -2440,7 +2562,7 @@ bool printResult ( State0 *st, long ix ) {
if ( mr->ptr_content ) {
// for json items separate with \n,\n
if ( si->m_format != FORMAT_HTML && ix>0 )
if ( si->m_format != FORMAT_HTML && numPrintedSoFar > 0 )
sb->safePrintf(",\n");
sb->safeStrcpy ( mr->ptr_content );
@ -2566,56 +2688,139 @@ bool printResult ( State0 *st, long ix ) {
// http://www.youtube.com/watch?v=auQbi_fkdGE
// http://img.youtube.com/vi/auQbi_fkdGE/2.jpg
// get the thumbnail url
if ( mr->ptr_imgUrl && si->m_format == FORMAT_HTML )
sb->safePrintf ("<a href=%s><image src=%s></a>",
if ( mr->ptr_imgUrl &&
si->m_format == FORMAT_HTML &&
// if we got thumbnail use that not this
! mr->ptr_imgData )
sb->safePrintf ("<a href=%s><img src=%s></a>",
url,mr->ptr_imgUrl);
// if we have a thumbnail show it next to the search result
if ( si->m_format == FORMAT_HTML &&
//! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
ti->printThumbnailInHtml ( sb ,
100 , // max width
100 , // max height
true , // add <a href>
NULL ,
" style=\"margin:10px;\" ");
}
// print image for widget
if ( //mr->ptr_imgUrl &&
( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX) ) {
si->m_format == FORMAT_WIDGET_AJAX ||
si->m_format == FORMAT_WIDGET_APPEND ) ) {
long widgetwidth = hr->getLong("widgetwidth",200);
// make a div around this for widget so we can print text
// on top
long widgetWidth = hr->getLong("widgetwidth",200);
// prevent coring
if ( widgetWidth < 1 ) widgetWidth = 1;
// each search result in widget has a div around it
sb->safePrintf("<div "
"class=result "
// we need the docid and score of last result
// when we append new results to the end
// of the widget for infinite scrolling
// using the scripts in PageBasic.cpp
"docid=%lli "
"score=%f " // double
"style=\""
"width:%lipx;"
"min-height:%lipx;"//140px;"
"height:%lipx;"//140px;"
"padding:%lipx;"
"display:table-cell;"
"vertical-align:bottom;"
, widgetwidth - 2*8 // padding is 8px
"position:relative;"
//"display:table-cell;"
//"vertical-align:bottom;"
"\""
">"
, mr->m_docId
// this is a double now. this won't work
// for streaming...
, msg40->m_msg3a.m_scores[ix]
, widgetWidth - 2*8 // padding is 8px
, (long)RESULT_HEIGHT
, (long)RESULT_HEIGHT
, (long)PADDING
);
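// Illustrative sketch (not part of this change): for a 200px-wide widget
// the div printed above would come out roughly as
//   <div class=result docid=123456789 score=3.500000
//    style="width:184px;min-height:120px;height:120px;padding:8px;
//    position:relative;">
// (docid/score values here are hypothetical). The append script can then
// read the last result's docid and score attributes and hand them back as
// minserpdocid/maxserpscore on the next request.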
if ( mr->ptr_imgUrl )
sb->safePrintf("background-repeat:no-repeat;"
"background-size:%lipx 140px;"
"background-image:url('%s');"
, widgetwidth - 2*8 // padding is 8px
, mr->ptr_imgUrl);
// if ( mr->ptr_imgUrl )
// sb->safePrintf("background-repeat:no-repeat;"
// "background-size:%lipx 140px;"
// "background-image:url('%s');"
// , widgetwidth - 2*8 // padding is 8px
// , mr->ptr_imgUrl);
long newdx = 0;
if ( mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
// account for scrollbar on the right
long maxWidth = widgetWidth - (long)SCROLLBAR_WIDTH;
long maxHeight = (long)RESULT_HEIGHT;
// false = do not print <a href> link on image
ti->printThumbnailInHtml ( sb ,
maxWidth ,
maxHeight ,
false , // add <a href>
&newdx );
}
// end the div style attribute and div tag
sb->safePrintf("\">");
//sb->safePrintf("\">");
sb->safePrintf ( "<a "
"target=_blank "
"style=text-decoration:none; href=" );
"style=\"text-decoration:none;"
// don't let scroll bar obscure text
"margin-right:%lipx;"
,(long)SCROLLBAR_WIDTH
);
// if thumbnail is wide enough put text on top of it, otherwise
// image is to the left and text is to the right of image
if ( newdx > .5 * widgetWidth )
sb->safePrintf("position:absolute;"
"bottom:%li;"
"left:%li;"
, (long) PADDING
, (long) PADDING
);
// to align the text vertically we gotta make a textbox div
// otherwise it wraps below image! mdw
//else
// sb->safePrintf("vertical-align:middle;");
else
sb->safePrintf("position:absolute;"
"bottom:%li;"
"left:%li;"
, (long) PADDING
, (long) PADDING + newdx + 10 );
// close the style and begin the url
sb->safePrintf( "\" "
"href=\""
);
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
sb->safePrintf ( "\">");//<font size=+0>" );
sb->safePrintf("<b style=\""
"text-decoration:none;"
"font-size: 15px;"
"font-weight:bold;"
// add padding so shadow does not stick out
//"padding-left:4px;"
//"padding-right:4px;"
"background-color:rgba(0,0,0,.5);"
"color:white;"
"font-family:arial;"
@ -2634,11 +2839,28 @@ bool printResult ( State0 *st, long ix ) {
//"2px -2px 0 #000 "
//"-2px -2px 0 #000;"
"\">");
//sb->safePrintf ("<image width=50 height=50 src=%s></a>",
//sb->safePrintf ("<img width=50 height=50 src=%s></a>",
// mr->ptr_imgUrl);
// then title over image
}
// only do link here if we have no thumbnail so no bg image
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgData ) {
sb->safePrintf ( "<a style=text-decoration:none;"
"color:white; "
"href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
}
// the a href tag
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" );
@ -2668,20 +2890,6 @@ bool printResult ( State0 *st, long ix ) {
}
// only do link here
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgUrl ) {
sb->safePrintf ( "<a href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
}
// . then the title (should be NULL terminated)
// . the title can be NULL
// . highlight it first
@ -2737,6 +2945,7 @@ bool printResult ( State0 *st, long ix ) {
backTag = "</b>";
}
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
@ -2784,10 +2993,11 @@ bool printResult ( State0 *st, long ix ) {
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ;
// close the image div
// close the title tag stuff
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</b></a></div>\n");
sb->safePrintf("</b></a>\n");
/////
@ -2796,7 +3006,7 @@ bool printResult ( State0 *st, long ix ) {
//
/////
unsigned char ctype = mr->m_contentType;
if ( ctype >= CT_HTML && ctype <= CT_JSON ) {
if ( ctype != CT_HTML && ctype != CT_UNKNOWN ){//&&ctype <= CT_JSON ) {
char *cs = g_contentTypeStrings[ctype];
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<contentType>"
@ -2805,7 +3015,7 @@ bool printResult ( State0 *st, long ix ) {
"]]>"
"</contentType>\n",
cs);
else if ( si->m_format == FORMAT_HTML ) {
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
sb->safePrintf(" <b><font style=color:white;"
"background-color:maroon;>");
char *p = cs;
@ -2845,6 +3055,7 @@ bool printResult ( State0 *st, long ix ) {
// do not print summaries for widgets by default unless overridden
// with &summary=1
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
hr->getLong("summaries",0) == 0 )
printSummary = false;
@ -3164,7 +3375,7 @@ bool printResult ( State0 *st, long ix ) {
sb->safePrintf(" - <a style=color:red; href=\"/addurl?u=");
sb->urlEncode ( url , gbstrlen(url) , false );
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
sb->safePrintf("&rand64=%llu\">respider</a>",rand64);
sb->safePrintf("&rand64=%llu&force=1\">respider</a>",rand64);
}
@ -3379,12 +3590,20 @@ bool printResult ( State0 *st, long ix ) {
*/
// end serp div
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</div>");
if ( si->m_format == FORMAT_HTML )
sb->safePrintf ( "<br><br>\n");
// search result spacer
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX )
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("<div style=line-height:%lipx;><br></div>",
(long)SERP_SPACER);
@ -5839,6 +6058,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
return true;
}
/*
RIP: OLD IFRAME WIDGET CODE HACK
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
//
@ -6113,23 +6335,21 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
"</td>"
"<td>"
// begin div with source in it
/*
"<div "
//"class=grad3 "
"style=\""
"border-radius:10px;"
"box-shadow: 6px 6px 3px %s;"
"border:2px solid black;"
"padding:15px;"
"width:600px;"
//"background-image:url('/ss.jpg');"
//"background-repeat:repeat;"
//"background-attachment:fixed;"
"background-color:lightgray;"
"\">"
, SHADOWCOLOR
//"<br>"
*/
// "<div "
// //"class=grad3 "
// "style=\""
// "border-radius:10px;"
// "box-shadow: 6px 6px 3px %s;"
// "border:2px solid black;"
// "padding:15px;"
// "width:600px;"
// //"background-image:url('/ss.jpg');"
// //"background-repeat:repeat;"
// //"background-attachment:fixed;"
// "background-color:lightgray;"
// "\">"
// , SHADOWCOLOR
// //"<br>"
);
// space widget to the right using this table
@ -6157,35 +6377,32 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
// this iframe contains the WIDGET
sb->safePrintf (
/*
"<div "
"id=scrollerxyz "
"style=\""
// "<div "
// "id=scrollerxyz "
// "style=\""
//"width:%lipx;" // 200;"
//"height:%lipx;" // 400;"
//"overflow:hidden;"
"padding:0px;"
"margin:0px;"
"background-color:white;"
// "padding:0px;"
// "margin:0px;"
// "background-color:white;"
//"padding-left:7px;"
"%s"
//"%s"
//"background-color:%s;"//lightblue;"
//"foreground-color:%s;"
//"overflow:scroll;"
//"overflow-scrolling:touch;"
"\">"
*/
"<iframe width=\"%lipx\" height=\"%lipx\" "
//"scrolling=yes "
/*
"style=\"background-color:white;"
"padding-right:0px;"
//"style=\"background-color:white;"
//"padding-right:0px;"
//"%s\" "
"scrolling=no "
"frameborder=no "
//"scrolling=no "
//"frameborder=no "
//"src=\"http://neo.diffbot.com:8000/search?"
*/
// frameborder=no
"%s"
@ -6389,3 +6606,4 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
NULL, // cookie
"UTF-8"); // charset
}
*/

@ -6,6 +6,14 @@
#include "Msg40.h"
#include "Msg0.h"
// height of each search result div in the widget
#define RESULT_HEIGHT 120
// other widget parms
#define SERP_SPACER 1
#define PADDING 8
#define SCROLLBAR_WIDTH 20
class State0 {
public:
@ -50,7 +58,7 @@ public:
bool printSearchResultsHeader ( class State0 *st ) ;
bool printResult ( class State0 *st, long ix );
bool printResult ( class State0 *st, long ix , long numPrintedSoFar );
bool printSearchResultsTail ( class State0 *st ) ;

@ -171,7 +171,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
if ( cr && cr->m_coll && strcmp(cr->m_coll,"main") ) {
if ( cr && cr->m_coll ) { // && strcmp(cr->m_coll,"main") ) {
sb.safePrintf("<center>"
"Searching the <b>%s</b> collection."
"</center>",

@ -83,7 +83,9 @@ bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
xd->set3 ( docId , coll , 0 );
// callback
xd->setCallback ( st , gotTitleRec );
// and tell it to load from old title rec
// . and tell it to load from old title rec
// . this sets all the member vars from it and also sets
// m_titleRecBuf to contain the actual compressed title rec
if ( ! xd->loadFromOldTitleRec ( ) ) return false;
// we got it without blocking. cached?
return gotTitleRec ( st );
@ -118,7 +120,7 @@ bool gotTitleRec ( void *state ) {
// . deal with errors
// . print none if no title rec at or after the provided docId
if ( g_errno || docId == 0LL || xd->m_titleRecSize <= 0 ) {
if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) {
// print docId in box
sb.safePrintf ( "<center>\nEnter docId: "
"<input type=text name=d value=%lli size=15>",

@ -69,9 +69,9 @@ static WebPage s_pages[] = {
{ PAGE_RESULTS , "search" , 0 , "search" , 0 , 0 ,
"results page",
sendPageResults, 0 },
{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
"widget page",
sendPageWidget, 0 },
//{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
// "widget page",
// sendPageWidget, 0 },
{ PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 ,
"Page where you can add url for spidering",
sendPageAddUrl, 0 },
@ -914,8 +914,9 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//long user = getUserType ( s , r );
//char *username = g_users.getUsername ( r );
char *username = NULL;
char *coll = r->getString ( "c" );
if ( ! coll ) coll = "main";
//char *coll = r->getString ( "c" );
//if ( ! coll ) coll = "main";
char *coll = g_collectiondb.getDefaultColl(r);
//char *pwd = r->getString ( "pwd" );
// get username
@ -1041,8 +1042,8 @@ bool Pages::printAdminTop (SafeBuf *sb ,
username , pwd ,
coll, NULL, s->m_ip, qs );
if ( g_hostdb.getNumHosts() > 1 )
sb->safePrintf("<br><br>");
//if ( g_hostdb.getNumHosts() > 1 )
sb->safePrintf("<br><br>");
// end table
//sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n");
@ -1655,10 +1656,11 @@ bool Pages::printHostLinks ( SafeBuf* sb ,
// and proxies
total += g_hostdb.m_numProxyHosts;
// don't print host buttons if only 1 host
if ( total <= 1 ) return status;
//if ( total <= 1 ) return status;
sb->safePrintf ( //"&nbsp; &nbsp; &nbsp; "
"<a href=/admin/hosts>hosts</a>: ");
"<a style=text-decoration:none; href=/admin/hosts>"
"<b>hosts in cluster</b></a>: ");
if ( ! qs ) qs = "";
//if ( ! pwd ) pwd = "";

@ -38,7 +38,7 @@ bool sendPageBasicStatus ( TcpSocket *s , HttpRequest *r );
bool sendPageRoot ( TcpSocket *s , HttpRequest *r );
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie );
bool sendPageResults ( TcpSocket *s , HttpRequest *r );
bool sendPageWidget ( TcpSocket *s , HttpRequest *r );
//bool sendPageWidget ( TcpSocket *s , HttpRequest *r );
//bool sendPageEvents ( TcpSocket *s , HttpRequest *r );
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r );
bool sendPageGet ( TcpSocket *s , HttpRequest *r );
@ -295,7 +295,7 @@ enum {
// public pages
PAGE_ROOT ,
PAGE_RESULTS ,
PAGE_WIDGET,
//PAGE_WIDGET,
PAGE_ADDURL , // 5
PAGE_GET ,
PAGE_LOGIN ,

Parms.cpp

@ -124,7 +124,7 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) ;
// from PageBasic.cpp:
bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);
bool updateSiteListBuf(collnum_t collnum,bool addSeeds,char *siteListArg);
bool CommandUpdateSiteList ( char *rec ) {
// caller must specify collnum
@ -145,11 +145,12 @@ bool CommandUpdateSiteList ( char *rec ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the sitelist
char *data = getDataFromParmRec ( rec );
// update it
updateSiteListTables ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// update the table that maps site to whether we should spider it
// and also add newly introduced sites in "data" into spiderdb.
updateSiteListBuf ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// now that we deduped the old site list with the new one for
// purposes of adding NEW seeds, we can do the final copy
cr->m_siteListBuf.set ( data );
@ -445,7 +446,7 @@ bool CommandRestartColl ( char *rec , WaitEntry *we ) {
// re-add the buf so it re-seeds spiderdb. it will not dedup these
// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
// "true" = addSeeds.
updateSiteListTables ( newCollnum , true , oldSiteList );
updateSiteListBuf ( newCollnum , true , oldSiteList );
// now put it back
if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
@ -501,7 +502,7 @@ bool CommandResetColl ( char *rec , WaitEntry *we ) {
// re-add the buf so it re-seeds spiderdb. it will not dedup these
// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
// "true" = addSeeds.
updateSiteListTables ( newCollnum , true , oldSiteList );
updateSiteListBuf ( newCollnum , true , oldSiteList );
// now put it back
if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
@ -1318,9 +1319,9 @@ bool printDropDown ( long n , SafeBuf* sb, char *name, long select,
bool printDropDownProfile ( SafeBuf* sb, char *name, long select ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
char *items[] = {"custom","web","news"};
char *items[] = {"custom","web","news","chinese"};
char *s;
for ( long i = 0 ; i < 3 ; i++ ) {
for ( long i = 0 ; i < 4 ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%li%s>%s",i,s,items[i]);
@ -1386,9 +1387,13 @@ bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) {
long page = g_pages.getDynamicPageNumber ( r );
long nc = r->getLong("nc",1);
long pd = r->getLong("pd",1);
char *coll = r->getString ( "c" );
if ( ! coll || ! coll[0] ) coll = "main";
CollectionRec *cr = g_collectiondb.getRec ( coll );
char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
//char *coll = r->getString ( "c" );
//if ( ! coll || ! coll[0] ) coll = "main";
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// if "main" collection does not exist, try another
//if ( ! cr ) cr = getCollRecFromHttpRequest ( r );
printParms2 ( sb, page, cr, nc, pd,0,0 , s);
return true;
}
@ -5453,7 +5458,7 @@ void Parms::init ( ) {
m->m_cgi = "live";
m->m_off = (char *)&g_conf.m_isLive - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
@ -7702,7 +7707,9 @@ void Parms::init ( ) {
"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
"urls</a> interface. <b>IF YOU WANT TO SPIDER THE WHOLE "
"WEB</b> then only use the <i>seed:</i> directives here "
"lest you limit yourself to a set of domains.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;
@ -8084,6 +8091,14 @@ void Parms::init ( ) {
m->m_priv = 1;
m++;
m->m_title = "log debug image messages";
m->m_cgi = "ldi";
m->m_off = (char *)&g_conf.m_logDebugImage - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m++;
m->m_title = "log debug loop messages";
m->m_cgi = "ldl";
m->m_off = (char *)&g_conf.m_logDebugLoop - g;
@ -8518,6 +8533,29 @@ void Parms::init ( ) {
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "createdtime";
m->m_xml = "collectionCreatedTime";
m->m_desc = "Time when this collection was created, or time of "
"the last reset or restart.";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = 0;//PF_DIFFBOT;
m++;
m->m_cgi = "spiderendtime";
m->m_xml = "crawlEndTime";
m->m_desc = "If spider is done, when did it finish.";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = 0;//PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlname";
m->m_xml = "diffbotCrawlName";
m->m_off = (char *)&cr.m_diffbotCrawlName - x;
@ -10030,6 +10068,28 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
m->m_title = "make image thumbnails";
m->m_desc = "Try to find the best image on each page and "
"store it as a thumbnail for presenting in the search "
"results.";
m->m_cgi = "mit";
m->m_off = (char *)&cr.m_makeImageThumbnails - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "index spider replies";
m->m_desc = "Index the spider replies of every url the spider "
"attempts to spider. Search for them using special "
"query operators like type:status or gberrorstr:success or "
"stats:gberrornum to get a histogram. They will not otherwise "
"show up in the search results.";
m->m_cgi = "isr";
m->m_off = (char *)&cr.m_indexSpiderReplies - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
// i put this in here so i can save disk space for my global
// diffbot json index
m->m_title = "index body";
@ -15493,8 +15553,9 @@ void Parms::init ( ) {
m++;
m->m_title = "stream search results";
m->m_desc = "Stream search results back on socket as they arrive. Useful "
"when thousands of search results are requested.";
m->m_desc = "Stream search results back on socket as they arrive. "
"Useful when thousands/millions of search results are "
"requested.";
m->m_soff = (char *)&si.m_streamResults - y;
m->m_type = TYPE_CHAR;
m->m_obj = OBJ_SI;
@ -15505,6 +15566,36 @@ void Parms::init ( ) {
m++;
m->m_title = "max serp docid";
m->m_desc = "Start displaying results after this score/docid pair. "
"Used by widget to append results to end when index is "
"volatile.";
m->m_def = "0";
m->m_soff = (char *)&si.m_minSerpDocId - y;
m->m_type = TYPE_LONG_LONG;
m->m_sparm = 1;
m->m_scgi = "minserpdocid";
m->m_flags = PF_API;
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m++;
m->m_title = "max serp score";
m->m_desc = "Start displaying results after this score/docid pair. "
"Used by widget to append results to end when index is "
"volatile.";
m->m_def = "0";
m->m_soff = (char *)&si.m_maxSerpScore - y;
m->m_type = TYPE_DOUBLE;
m->m_sparm = 1;
m->m_scgi = "maxserpscore";
m->m_flags = PF_API;
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m++;
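// Illustrative sketch (assumption, not part of this change): in the
// widget's append mode these two cursor values would be handed back on
// the next search request, e.g.
//   /search?q=test&c=main&format=append&maxserpscore=3.500000&minserpdocid=1234567890
// so only results ranked after that score/docid pair come back.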
m->m_title = "restrict search to this url";
m->m_desc = "X is the url.";
m->m_sparm = 1;
@ -16387,6 +16478,7 @@ void Parms::init ( ) {
if ( t == TYPE_DATE2 ) size = 4;
if ( t == TYPE_DATE ) size = 4;
if ( t == TYPE_FLOAT ) size = 4;
if ( t == TYPE_DOUBLE ) size = 8;
if ( t == TYPE_IP ) size = 4;
if ( t == TYPE_RULESET ) size = 4;
if ( t == TYPE_LONG ) size = 4;
@ -18735,13 +18827,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"<td>"
"This is true if the url was directly "
"injected from the "
"/inject page or API."
"<a href=/admin/inject>inject page</a> or API."
"</td></tr>"
"<tr class=poo><td>isdocidbased | !isdocidbased</td>"
"<td>"
"This is true if the url was added from the "
"reindex interface. The request does not contain "
"<a href=/admin/reindex>query reindex</a> "
"interface. The request does not contain "
"a url, but only a docid, that way we can add "
"millions of search results very quickly without "
"having to lookup each of their urls. You should "
@ -18932,6 +19025,16 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"See table below for supported language "
"abbreviations.</td></tr>"
"<tr class=poo><td><nobr>parentlang==zh_cn,zh_tw,xx"
"</nobr></td>"
"<td>Matches if "
"the url's referring parent url is primarily in "
"this language. Useful for prioritizing spidering "
"pages of a certain language."
"See table below for supported language "
"abbreviations."
"</td></tr>"
/*
"<tr class=poo><td>link:gigablast</td>"
"<td>Matches if the document links to gigablast."

@ -22,7 +22,8 @@ enum {
UFP_CUSTOM = 0 ,
UFP_NONE = 0 ,
UFP_WEB = 1 ,
UFP_NEWS = 2
UFP_NEWS = 2 ,
UFP_CHINESE = 3
};
// special priorities for the priority drop down
@ -71,7 +72,8 @@ enum {
TYPE_SITERULE , // 29
TYPE_SAFEBUF ,
TYPE_UFP ,
TYPE_FILEUPLOADBUTTON
TYPE_FILEUPLOADBUTTON,
TYPE_DOUBLE
};
//forward decls to make compiler happy:

@ -4100,6 +4100,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
m_minScoreTermNumInt = -1;
m_maxScoreTermNumInt = -1;
m_hasMaxSerpScore = false;
if ( m_r->m_minSerpDocId )
m_hasMaxSerpScore = true;
//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
@ -6618,6 +6622,10 @@ void PosdbTable::intersectLists10_r ( ) {
// no term?
if ( ! miniMergedList[m_sortByTermNumInt] ) goto advance;
intScore = g_posdb.getInt( miniMergedList[m_sortByTermNumInt]);
// do this so hasMaxSerpScore below works, although
// because of roundoff errors we might lose a docid
// through the cracks in the widget.
//score = (float)intScore;
}
// skip docid if outside of range
@ -6656,12 +6664,36 @@ void PosdbTable::intersectLists10_r ( ) {
if ( score3 > m_maxScoreValInt ) goto advance;
}
// now we have a maxscore/maxdocid upper range so the widget
// can append only new results to an older result set.
if ( m_hasMaxSerpScore ) {
// if dealing with an "int" score use the extra precision
// of the double that m_maxSerpScore is!
if ( m_sortByTermNumInt >= 0 ) {
if ( intScore > (long)m_r->m_maxSerpScore )
goto advance;
if ( intScore == (long)m_r->m_maxSerpScore &&
(long long)m_docId <= m_r->m_minSerpDocId )
goto advance;
}
else {
if ( score > (float)m_r->m_maxSerpScore )
goto advance;
if ( score == m_r->m_maxSerpScore &&
(long long)m_docId <= m_r->m_minSerpDocId )
goto advance;
}
}
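// Illustrative worked example (not part of this change): the checks above
// act like a pagination cursor. With maxSerpScore=3.5 and minSerpDocId=200
// a candidate (score,docid) is handled as follows:
//   (4.0, 150) -> skipped, score above the cursor score
//   (3.5, 150) -> skipped, tie on score but docid <= 200
//   (3.5, 999) -> kept, tie on score and docid past the cursor
//   (2.0,   7) -> kept, score below the cursor score
// so the widget can keep appending without re-receiving older results.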
// . seoDebug hack so we can set "dcs"
// . we only come here if we actually made it into m_topTree
if ( secondPass || m_r->m_seoDebug ) {
dcs.m_siteRank = siteRank;
dcs.m_finalScore = score;
// a double can capture an int without dropping any bits,
// unlike a mere float
if ( m_sortByTermNumInt >= 0 )
dcs.m_finalScore = (double)intScore;
dcs.m_docId = m_docId;
dcs.m_numRequiredTerms = m_numQueryTermInfos;
dcs.m_docLang = docLang;

@ -576,6 +576,8 @@ class PosdbTable {
unsigned long long m_docIdHack;
bool m_hasMaxSerpScore;
// hack for seo.cpp:
float m_finalScore;
float m_preFinalScore;
@ -795,7 +797,10 @@ class DocIdScore {
bool serialize ( class SafeBuf *sb );
long long m_docId;
float m_finalScore;
// made this a double because of intScores which can't be captured
// fully with a float. intScores are used to sort by spidered time
// for example. see Posdb.cpp "intScore".
double m_finalScore;
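// Illustrative note (not part of this change): a float only has a 24-bit
// significand, so integers above 2^24 lose precision; e.g. the spider
// timestamp 1391749680 rounds to 1391749632 as a float, while a double
// (53-bit significand) stores any 32-bit intScore exactly.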
char m_siteRank;
long m_docLang; // langId
long m_numRequiredTerms;

@ -112,6 +112,9 @@ char *g_files[] = {
// required for SSL server support for both getting web pages
// on https:// sites and for serving https:// pages
"gb.pem",
// the main binary!
"gb",
//"dict/unifiedDict",
//"dict/thesaurus.txt",
@ -150,30 +153,37 @@ char *g_files[] = {
"antiword-dir/koi8-r.txt",
"antiword-dir/koi8-u.txt",
"antiword-dir/roman.txt",
// . thumbnail generation
// . use 'apt-get install netpbm' to install
//"/usr/bin/giftopnm",
//"/usr/bin/tifftopnm",
//"/usr/bin/pngtopnm",
//"/usr/bin/jpegtopnm",
//"/usr/bin/bmptopnm",
//"/usr/bin/pnmscale",
//"/usr/bin/ppmtojpeg",
//"/usr/sbin/smartctl",
//"giftopnm",
//"tifftopnm",
//"pngtopnm",
//"jpegtopnm",
//"bmptopnm",
//"pnmscale",
//"ppmtojpeg",
// . thumbnail generation
// . i used 'apt-get install netpbm' to install
"bmptopnm",
"giftopnm",
"jpegtopnm",
"libjpeg.so.62",
"libnetpbm.so.10",
"libpng12.so.0",
"libtiff.so.4",
"libz.so.1",
"LICENSE",
"pngtopnm",
"pnmscale",
"ppmtojpeg",
"tifftopnm",
"mysynonyms.txt",
//"smartctl",
"wikititles.txt.part1",
"wikititles.txt.part2",
"wiktionary-buf.txt",
"wiktionary-lang.txt",
"wiktionary-syns.dat",
"unifiedDict.txt",
//"unifiedDict-buf.txt",
//"unifiedDict-map.dat",
//
// this junk can be generated
@ -188,6 +198,31 @@ char *g_files[] = {
};
bool Process::getFilesToCopy ( char *srcDir , SafeBuf *buf ) {
// sanity
long slen = gbstrlen(srcDir);
if ( srcDir[slen-1] != '/' ) { char *xx=NULL;*xx=0; }
for ( long i = 0 ; i < (long)sizeof(g_files)/4 ; i++ ) {
// terminate?
if ( ! g_files[i] ) break;
// skip subdir shit it won't work
if ( strstr(g_files[i],"/") ) continue;
// if not first
if ( i > 0 ) buf->pushChar(' ');
// append it
buf->safePrintf("%s%s"
, srcDir
, g_files[i] );
}
// and the required runtime subdirs
buf->safePrintf(" %santiword-dir",srcDir);
buf->safePrintf(" %sucdata",srcDir);
buf->safePrintf(" %shtml",srcDir);
return true;
}
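// Minimal usage sketch (assumption, not part of this change): build the
// space-separated list of files to copy when installing to another host.
//   SafeBuf fileList;
//   g_process.getFilesToCopy ( "/home/gb/" , &fileList );
//   // fileList now holds "/home/gb/gb.pem /home/gb/gb ... /home/gb/html"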
bool Process::checkFiles ( char *dir ) {
@ -265,6 +300,11 @@ bool Process::checkFiles ( char *dir ) {
}
if ( needsFiles ) {
log("db: Missing files. See above. Exiting.");
return false;
}
//if ( needsFiles ) {
// log("db: use 'apt-get install -y netpbm' to install "
// "pnmfiles");
@ -286,12 +326,16 @@ bool Process::checkFiles ( char *dir ) {
if ( ! g_conf.m_isLive ) return true;
m_swapEnabled = 0;
// first check to make sure swap is off
SafeBuf psb;
if ( psb.fillFromFile("/proc/swaps") < 0 ) {
log("gb: failed to read /proc/swaps");
if ( ! g_errno ) g_errno = EBADENGINEER;
return true;
//if ( ! g_errno ) g_errno = EBADENGINEER;
//return true;
// if we don't know if swap is enabled or not, use -1
m_swapEnabled = -1;
}
/*
@ -307,9 +351,15 @@ bool Process::checkFiles ( char *dir ) {
mstrerror(g_errno));
buf[size] = '\0';
*/
char *buf = psb.getBufStart();
if ( strstr ( buf,"dev" ) )
return log("gb: can not start live gb with swap enabled.");
// we should redbox this! or at least be on the optimizations page
if ( m_swapEnabled == 0 ) {
char *buf = psb.getBufStart();
if ( strstr ( buf,"dev" ) )
//return log("gb: can not start live gb with swap "
//"enabled.");
m_swapEnabled = 1;
}
// . make sure elvtune is being set right
// . must be in /etc/rcS.d/S99local
@ -336,6 +386,9 @@ bool Process::checkFiles ( char *dir ) {
mfree ( buf , size+1, "S99" );
*/
// now that we are open source skip the checks below
return true;
// check kernel version
FILE *fd;
fd = fopen ( "/proc/version" , "r" );
@ -377,7 +430,7 @@ bool Process::checkFiles ( char *dir ) {
"MST 2008\n")== 0)
return true;
log("gb: kernel version is not an approved version.");
return false;
//return false;
return true;
}

@ -16,6 +16,7 @@ class Process {
public:
bool getFilesToCopy ( char *srcDir , class SafeBuf *buf ) ;
bool checkFiles ( char *dir );
// . the big save command
@ -94,6 +95,7 @@ class Process {
long m_desiredFanState;
float m_diskUsage;
long long m_diskAvail;
char m_swapEnabled;
};
extern Process g_process;

@ -3084,9 +3084,15 @@ struct QueryField g_fields[] = {
{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
{"gbrss", FIELD_GBRSS, true,"Matches all pages that are rss feeds."},
//{"gbruleset",FIELD_GBRULESET, true,"Obsolete."},
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension."},
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension. Examples: type:doc type:status type:json type:xls"},
{"filetype", FIELD_TYPE, false,"Same as type:"},
{"gbisadult",FIELD_TYPE,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."},
{"gbimage",FIELD_URL,false,"use gbimage:<url> to return all documents containing that image url."},
{"gbstatus",FIELD_TYPE,false,"If document is a spider reply, then search the spider status as a number using this. 0 means success, so gbstatus:0 would return all successful statuses."},
{"gbstatusmsg",FIELD_TYPE,false,"If document is a spider reply, then search the spider status description, which might be something like 'TCP Timed out' or 'Robots.txt disallows' or 'Success', if no error."},
{"gbhasthumbnail",FIELD_TYPE,false,"use gbhasthumbnail:0 and gbhasthumbnail:1 to restrict results to those that do not have or have thumbnails respectively."},
{"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."},
{"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."},
{"zipcode", FIELD_ZIP, false,"Same as zip:"},
@ -3143,7 +3149,7 @@ struct QueryField g_fields[] = {
},
{"gbminint", FIELD_GBNUMBERMININT, false,
"Example: 'gbminint:spiderdate:1391749680' "
"Example: 'gbminint:gbspiderdate:1391749680' "
"'gbminint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
@ -3151,7 +3157,7 @@ struct QueryField g_fields[] = {
},
{"gbmaxint", FIELD_GBNUMBERMAXINT, false,
"Example: 'gbmaxint:spiderdate:1391749680' "
"Example: 'gbmaxint:gbspiderdate:1391749680' "
"'gbmaxint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "

Rdb.cpp

@ -241,6 +241,9 @@ bool Rdb::init ( char *dir ,
// . set tree to use our fixed data size
// . returns false and sets g_errno on error
if(m_useTree) {
long rdbId = m_rdbId;
// statsdb is really collectionless so pass -1 on to the tree
if ( rdbId == RDB_STATSDB ) rdbId = -1;
if ( ! m_tree.set ( fixedDataSize ,
maxTreeNodes , // max # nodes in tree
isTreeBalanced ,
@ -253,7 +256,7 @@ bool Rdb::init ( char *dir ,
// make useProtection true for debugging
false , // use protection?
false , // alowdups?
m_rdbId ) )
rdbId ) )
return false;
}
else {
@ -621,6 +624,24 @@ bool Rdb::deleteAllRecs ( collnum_t collnum ) {
return true;
}
bool makeTrashDir() {
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
if ( ::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 ) {
if ( errno != EEXIST ) {
log("dir: mkdir %s had error: %s",
trash,mstrerror(errno));
return false;
}
// clear it
errno = 0;
}
return true;
}
bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
@ -685,12 +706,7 @@ bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
(long)collnum,gettimeofdayInMilliseconds());
//Dir d; d.set ( dname );
// ensure ./trash dir is there
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
makeTrashDir();
// move into that dir
::rename ( oldname , newname );
@ -1089,8 +1105,8 @@ bool Rdb::loadTree ( ) {
return log("db: Could not load saved buckets.");
long numKeys = m_buckets.getNumKeys();
log("db: Loaded %li recs from %s's buckets on disk.",
numKeys, m_dbname);
// log("db: Loaded %li recs from %s's buckets on disk.",
// numKeys, m_dbname);
if(!m_buckets.testAndRepair()) {
log("db: unrepairable buckets, "
@ -1482,6 +1498,8 @@ bool Rdb::dumpCollLoop ( ) {
// just modify DiskPageCache.cpp to ignore breaches.
if(m_useTree) maxFileSize = m_tree.getMemOccupiedForList ();
else maxFileSize = m_buckets.getMemOccupied();
// sanity
if ( maxFileSize < 0 ) { char *xx=NULL;*xx=0; }
// because we are actively spidering the list we dump ends up
// being more, by like 20% or so, otherwise we do not make a
// big enough diskpagecache and it logs breach msgs... does not
@ -2389,9 +2407,10 @@ bool Rdb::addRecord ( collnum_t collnum,
}
}
// . cancel any spider request that is a dup in the dupcache to save disk space
// . twins might have different dupcaches so they might have different dups, but
// it shouldn't be a big deal because they are dups!
// . cancel any spider request that is a dup in the dupcache to save
// disk space
// . twins might have different dupcaches so they might have different
// dups, but it shouldn't be a big deal because they are dups!
if ( m_rdbId == RDB_SPIDERDB && ! KEYNEG(key) ) {
// . this will create it if spiders are on and its NULL
// . even if spiders are off we need to create it so
@ -2402,12 +2421,18 @@ bool Rdb::addRecord ( collnum_t collnum,
SpiderRequest *sreq=(SpiderRequest *)(orig-4-sizeof(key128_t));
// is it really a request and not a SpiderReply?
char isReq = g_spiderdb.isSpiderRequest ( &sreq->m_key );
// skip if in dup cache. do NOT add to cache since addToWaitingTree()
// in Spider.cpp will do that when called from addSpiderRequest() below
if ( isReq && sc->isInDupCache ( sreq , false ) ) return true;
// skip if in dup cache. do NOT add to cache since
// addToWaitingTree() in Spider.cpp will do that when called
// from addSpiderRequest() below
if ( isReq && sc->isInDupCache ( sreq , false ) ) {
if ( g_conf.m_logDebugSpider )
log("spider: adding spider req %s is dup. "
"skipping.",sreq->m_url);
return true;
}
}
if ( m_useTree && (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
if ( m_useTree && (tn=m_tree.addNode (collnum,key,data,dataSize))>=0) {
// if adding to spiderdb, add to cache, too
if ( m_rdbId != RDB_SPIDERDB && m_rdbId != RDB_DOLEDB )
return true;
@ -2453,15 +2478,18 @@ bool Rdb::addRecord ( collnum_t collnum,
// add the request
if ( isReq ) {
// log that. why isn't this undoling always
/*
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: rdb: got spider "
logf(LOG_DEBUG,"spider: rdb: added spider "
"request to spiderdb rdb tree "
"addnode=%li "
"request for uh48=%llu prntdocid=%llu "
"firstIp=%s",
"firstIp=%s spiderdbkey=%s",
tn,
sreq->getUrlHash48(),
sreq->getParentDocId(),
iptoa(sreq->m_firstIp));
*/
iptoa(sreq->m_firstIp),
KEYSTR((char *)&sreq->m_key,
sizeof(key128_t)));
// false means to NOT call evaluateAllRequests()
// because we call it below. the reason we do this
// is because it does not always get called

Rdb.h

@ -13,6 +13,8 @@
//#include "Dir.h"
#include "RdbBuckets.h"
bool makeTrashDir() ;
// . each Rdb instance has an ID
// . these ids are also return values for getIdFromRdb()
#define RDB_START 1

@ -594,9 +594,9 @@ bool RdbBuckets::set ( long fixedDataSize , long maxMem,
return false;
}
log("init: Successfully initialized buckets for %s, "
"keysize is %li, max mem is %li, datasize is %li",
m_dbname, (long)m_ks, m_maxMem, m_fixedDataSize);
// log("init: Successfully initialized buckets for %s, "
// "keysize is %li, max mem is %li, datasize is %li",
// m_dbname, (long)m_ks, m_maxMem, m_fixedDataSize);
/*
@ -719,12 +719,12 @@ bool RdbBuckets::resizeTable(long numNeeded) {
g_errno = ENOMEM;
return false;
}
log(LOG_INFO,
"db: scaling down request for buckets. "
"Currently have %li "
"buckets, asked for %li, max number of buckets"
" for %li bytes is %li.",
m_maxBuckets, numNeeded, m_maxMem, m_maxBucketsCapacity);
// log(LOG_INFO,
// "db: scaling down request for buckets. "
// "Currently have %li "
// "buckets, asked for %li, max number of buckets"
// " for %li bytes is %li.",
// m_maxBuckets, numNeeded, m_maxMem, m_maxBucketsCapacity);
numNeeded = m_maxBucketsCapacity;
}
@ -1114,6 +1114,7 @@ bool RdbBuckets::selfTest(bool thorough, bool core) {
last = kk;
lastcoll = b->getCollnum();
}
if ( totalNumKeys != m_numKeysApprox )
log(LOG_WARN, "db have %li keys, should have %li. "
"%li buckets in %li colls for db %s",
totalNumKeys, m_numKeysApprox, m_numBuckets,

@ -1091,7 +1091,14 @@ bool RdbList::removeBadData_r ( ) {
// . if not fixed size, remove all the data for now
// . TODO: make this better, man
if ( m_fixedDataSize == -1 ) {
reset();
// don't call reset because it sets m_ks back to 12
//reset();
m_listSize = 0;
m_list = NULL;
m_listPtr = NULL;
m_listEnd = NULL;
m_mergeMinListSize = -1;
m_lastKeyIsValid = false;
return true;
}
//key_t oldk;

@ -1140,7 +1140,7 @@ void RdbTree::deleteOrderedList ( collnum_t collnum ,
bool RdbTree::fixTree ( ) {
// on error, fix the linked list
//log("RdbTree::fixTree: tree was corrupted on disk?");
log("db: Trying to fix tree.");
log("db: Trying to fix tree for %s.",m_dbname);
log("db: %li occupied nodes and %li empty "
"of top %li nodes.",
m_numUsedNodes , m_minUnusedNode - m_numUsedNodes ,
@ -1171,6 +1171,9 @@ bool RdbTree::fixTree ( ) {
// verify collnum
if ( cn < 0 ) continue;
if ( cn >= max ) continue;
// collnum of non-existent coll
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
continue;
// now add just to set m_right/m_left/m_parent
if ( m_fixedDataSize == 0 )
addNode(cn,&m_keys[i*m_ks], NULL, 0 );
@ -1183,11 +1186,11 @@ bool RdbTree::fixTree ( ) {
count++;
}
log("db: Fix tree removed %li nodes.",n - count);
log("db: Fix tree removed %li nodes for %s.",n - count,m_dbname);
// ensure it is still good
if ( ! checkTree ( false , true ) )
return log("db: Fix tree failed.");
log("db: Fix tree succeeded.");
log("db: Fix tree succeeded for %s.",m_dbname);
return true;
}
@ -1229,6 +1232,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// for posdb
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
// bad collnum?
collnum_t cn = m_collnums[i];
if ( m_rdbId>=0 && (cn >= g_collectiondb.m_numRecs || cn < 0) )
return log("db: bad collnum in tree");
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
return log("db: collnum is obsolete in tree");
// if no left/right kid it MUST be -1
if ( m_left[i] < -1 )
return log(
@ -1305,8 +1314,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( ! doChainTest ) continue;
// ensure i goes back to head node
long j = i;
long loopCount = 0;
while ( j >= 0 ) {
if ( j == m_headNode ) break;
// sanity -- loop check
if ( ++loopCount > 10000 )
return log("db: tree had loop");
j = m_parents[j];
}
if ( j != m_headNode )
@ -2799,8 +2812,10 @@ long RdbTree::fastLoadBlock ( BigFile *f ,
m_corrupt++;
continue;
}
// must have rec as well
if ( ! recs[c] ) {
// must have rec as well. unless it is the statsdb tree
// or m_waitingTree which are collection-less and always use
// 0 for their collnum. if collection-less m_rdbId==-1.
if ( ! recs[c] && m_rdbId >= 0 ) {
m_corrupt++;
continue;
}
@ -3063,10 +3078,15 @@ long RdbTree::oldLoadBlock ( BigFile *f, long remainingNodes , RdbMem *stack,
void RdbTree::cleanTree ( ) { // char **bases ) {
// some trees always use 0 for all node collnum_t's like
// statsdb, waiting tree etc.
if ( m_rdbId < 0 ) return;
// the liberation count
long count = 0;
collnum_t collnum;
long max = g_collectiondb.m_numRecs;
for ( long i = 0 ; i < m_minUnusedNode ; i++ ) {
// skip node if parents is -2 (unoccupied)
if ( m_parents[i] == -2 ) continue;
@ -3103,7 +3123,8 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
}
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
// fix for statsdb or other collectionless rdbs
if ( m_rdbId < 0 ) return m_numNegativeKeys;
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
//if ( ! m_countsInitialized ) { char *xx=NULL;*xx=0; }
@ -3111,7 +3132,8 @@ long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
}
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
// fix for statsdb or other collectionless rdbs
if ( m_rdbId < 0 ) return m_numPositiveKeys;
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
//if ( ! m_countsInitialized ) { char *xx=NULL;*xx=0; }

@ -2027,11 +2027,11 @@ bool Repair::injectTitleRec ( ) {
xd->m_tagRecValid = false;
// rebuild the title rec! otherwise we re-add the old one!!!!!!!
xd->m_titleRecValid = false;
xd->m_titleRecBufValid = false;
// free it since set2() should have uncompressed it!
//mfree ( titleRec , titleRecSize, "repair" );
// and so xd doesn't free it
xd->m_titleRec = NULL;
xd->m_titleRecBuf.purge();// = NULL;
// use the ptr_utf8Content that we have
xd->m_recycleContent = true;

@ -3285,3 +3285,92 @@ bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
return true;
}
bool SafeBuf::base64Encode ( char *sx , long len , long niceness ) {
unsigned char *s = (unsigned char *)sx;
if ( ! s ) return true;
// base64 output is about 4/3 the input size; reserving 2x the input
// plus '=' padding is more than enough
long need = len * 2 + 1 +3; // +3 for = padding
if ( ! reserve ( need ) ) return false;
// tmp vars
char *dst = m_buf + m_length;
long round = 0;
// the table of 64 entities
static char tab[] = {
'A','B','C','D','E','F','G','H','I','J','K','L','M',
'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
'a','b','c','d','e','f','g','h','i','j','k','l','m',
'n','o','p','q','r','s','t','u','v','w','x','y','z',
'0','1','2','3','4','5','6','7','8','9','+','/'
};
unsigned char val;
// scan through all
unsigned char *send = s + len;
for ( ; s < send ; ) {
// breathe
QUICKPOLL ( niceness );
unsigned char c1 = s[0];
unsigned char c2 = 0;
//unsigned char c3 = 0;
if ( s+1 < send ) c2 = s[1];
else c2 = 0;
if ( round == 0 ) {
val = c1 >>2;
}
else if ( round == 1 ) {
val = (c1 & 0x03) << 4;
val |= c2 >> 4;
// done with the first input byte now
s++;
}
else if ( round == 2 ) {
val = ((c1 & 0x0f) << 2);
val |= ((c2 & 0xc0) >> 6);
s++;
}
else if ( round == 3 ) {
val = (c1 & 0x3f);
s++;
}
// map the 6-bit value to its base64 character
*dst = tab[val];
// point to next char
dst++;
// keep going if more left
if ( s < send ) {
// repeat every 4 cycles since it is aligned then
if ( ++round == 4 ) round = 0;
continue;
}
// if we are done do padding
if ( round == 0 ) {
*dst++ = '=';
}
if ( round == 1 ) {
*dst++ = '=';
*dst++ = '=';
}
if ( round == 2 ) {
*dst++ = '=';
}
}
m_length += dst - (m_buf + m_length);
nullTerm();
return true;
}
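// Minimal usage sketch (not part of this change): the output matches
// standard base64, including '=' padding for short tails.
//   SafeBuf out;
//   out.base64Encode ( "Man" , 3 ); // out now holds "TWFu"
//   // "Ma" -> "TWE="   "M" -> "TQ=="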

@ -110,6 +110,8 @@ struct SafeBuf {
bool csvEncode ( char *s , long len , long niceness = 0 );
bool base64Encode ( char *s , long len , long niceness = 0 );
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"

@ -394,6 +394,22 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
v = atof(m->m_def);
*(float *)x = (float)v;
}
else if ( m->m_type == TYPE_DOUBLE ) {
double v = 0;
if ( def )
v = *(double *)def;
else if ( m->m_def )
v = atof(m->m_def);
*(double *)x = (double)v;
}
else if ( m->m_type == TYPE_LONG_LONG ) {
long long v = 0;
if ( def )
v = *(long long *)def;
else if ( m->m_def )
v = atoll(m->m_def);
*(long long *)x = (long long)v;
}
else if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ) {
//if ( m->m_cgi && strcmp ( m->m_cgi, "erpc" ) == 0 )
@ -549,6 +565,27 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// "name=%s value=\"%li\">\n",
// cgi , v );
}
else if ( m->m_type == TYPE_LONG_LONG ) {
// default was set above
			long long def = *(long long *)x;
// assume default
long long v = def;
// but cgi parms override cookie
v = r->getLongLong ( cgi , v );
			// but if it's a privileged parm and we're not an admin
// then do not allow overrides, but m_priv of 3 means
// to not display for clients, but to allow overrides
if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
// set it
*(long long *)x = v;
// if it is the same as its default, and the default is
// always from m_def and never from the CollectionRec,
			// then do not bother storing it in here! what's the
// point?
if ( v == def && m->m_off < 0 ) continue;
// if not default do not propagate
if ( v == def ) continue;
}
else if ( m->m_type == TYPE_FLOAT ) {
// default was set above
float def = *(float *)x;
@ -587,6 +624,34 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// "name=%s value=\"%f\">\n",
// cgi , v );
}
else if ( m->m_type == TYPE_DOUBLE ) {
// default was set above
double def = *(double *)x;
// get overriding from http request, if any
double v;
			// but if it's a privileged parm and we're not an admin
// then do not allow overrides
if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
else v = r->getDouble( cgi , def );
// bounds checks
if ( v < m->m_smin ) v = m->m_smin;
if ( v > m->m_smax ) v = m->m_smax;
if ( m->m_sminc >= 0 ) {
double vmin=*(double *)((char *)cr+m->m_sminc);
if ( v < vmin ) v = vmin;
}
if ( m->m_smaxc >= 0 ) {
double vmax=*(double *)((char *)cr+m->m_smaxc);
if ( v > vmax ) v = vmax;
}
// set it
*(double *)x = v;
// include for sure if explicitly provided
char *vp = r->getValue(cgi, NULL, NULL);
if ( ! vp ) continue;
// unchanged from default?
if ( v == def ) continue;
}
else if ( m->m_type == TYPE_BOOL ) {
// default was set above
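The new TYPE_LONG_LONG and TYPE_DOUBLE branches follow the same pattern as the existing long/float handlers: start from the default, let the cgi parm override it unless the parm is privileged and the caller is not an admin, clamp doubles against the static and dynamic bounds, and skip storage when the value is still the default. A stripped-down sketch of that pattern with made-up helper names (not the actual Parms.cpp code):

	// hedged sketch; "isPriv" stands in for the m->m_priv checks above
	double overrideDouble ( HttpRequest *r , char *cgi , double def ,
				bool isAdmin , bool isPriv ,
				double smin , double smax ) {
		double v = def;
		// privileged parms can only be overridden by the admin
		if ( isAdmin || ! isPriv ) v = r->getDouble ( cgi , def );
		// clamp to the static bounds from the Parm entry
		if ( v < smin ) v = smin;
		if ( v > smax ) v = smax;
		return v;
	}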
@ -927,11 +992,12 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
if(m_firstResultNum < 0) m_firstResultNum = 0;
// DEBUG: temp hack
static bool first = true;
if ( first ) {
first = false;
m_firstResultNum = 1;
}
// static bool first = true;
// if ( first ) {
// first = false;
// m_firstResultNum = 10;
// }
// if useCache is -1 then pick a default value
if ( m_useCache == -1 ) {
@ -1422,6 +1488,8 @@ char getFormatFromRequest ( HttpRequest *r ) {
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;
// support old api &xml=1 to mean &format=1
@ -1446,5 +1514,9 @@ char getFormatFromRequest ( HttpRequest *r ) {
format = FORMAT_WIDGET_AJAX;
}
if ( r->getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}
return format;
}
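With both hooks in place a widget client can request the append format either way; two hypothetical example requests (the /search path and q parm are just illustrative):

	/search?q=test&format=append
	/search?q=test&append=1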

@ -362,6 +362,9 @@ class SearchInput {
long m_urlLen2;
char *m_url2;
double m_maxSerpScore;
long long m_minSerpDocId;
// for /get?d=xxxxx&strip=0&ih=1&qh=1
long long m_docId;
long m_strip;

@ -1039,13 +1039,14 @@ bool Speller::loadUnifiedDict() {
char *tail2 = m_unifiedBuf.getBufStart()+h2-1000;
h = hash64 ( tail1 , 1000 , h );
h = hash64 ( tail2 , 1000 , h );
long long n = 8346765853685546681LL;
//long long n = 8346765853685546681LL;
long long n = -14450509118443930LL;
if ( h != n ) {
log("gb: unifiedDict-buf.txt or "
"unifiedDict-map.dat "
"checksum is not approved for "
"live service (%lli != %lli)" ,h,n);
return false;
//return false;
}
return true;

@ -110,7 +110,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
sb->safePrintf("parentDomHash32=0x%lx ",m_parentDomHash32 );
sb->safePrintf("parentSiteHash32=0x%lx ",m_parentSiteHash32 );
sb->safePrintf("hopCount=%li ",m_hopCount );
sb->safePrintf("hopCount=%li ",(long)m_hopCount );
//timeStruct = gmtime ( &m_spiderTime );
//time[0] = 0;
@ -301,7 +301,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
sb->safePrintf(" <td>%li</td>\n",m_siteNumInlinks );
//sb->safePrintf(" <td>%li</td>\n",m_pageNumInlinks );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
@ -436,7 +436,7 @@ long SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status ,
sb->safePrintf(" <td>%li</td>\n",(long)m_errCount );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
@ -1026,14 +1026,22 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
(long)sc,(long)sc->m_collnum);
return true;
}
// this means msg5 is out
if ( sc->m_msg5.m_waitingForList ) {
log("spider: deleting sc=0x%lx for collnum=%li waiting4",
(long)sc,(long)sc->m_collnum);
return true;
}
	// there's still a core of someone trying to write to something
	// in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
	// or spider.cpp. everyone should get sc from cr every time, i'd think
log("spider: deleting sc=0x%lx for collnum=%li",
(long)sc,(long)sc->m_collnum);
// . make sure nobody has it
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
// have nuked it
CollectionRec *cr = sc->m_cr;
// make sure nobody has it
cr->m_spiderColl = NULL;
if ( cr ) cr->m_spiderColl = NULL;
mdelete ( sc , sizeof(SpiderColl),"postdel1");
delete ( sc );
return true;
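The intent of the checks above: a SpiderColl belonging to a deleted or reset collection is only freed once no Msg5 list read still points at it, and the back-pointer in the CollectionRec is cleared first, null-safely, since deleteRec2() may already have nuked m_cr. A rough sketch of that teardown order (not the literal Spider.cpp code):

	// hedged sketch of the deferred-deletion guard
	void freeSpiderColl ( SpiderColl *sc ) {
		// a list read is still in flight? then defer; caller retries later
		if ( sc->m_msg5.m_waitingForList ) return;
		// unlink from the CollectionRec if it still exists
		if ( sc->m_cr ) sc->m_cr->m_spiderColl = NULL;
		mdelete ( sc , sizeof(SpiderColl) , "postdel1" );
		delete ( sc );
	}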
@ -3075,6 +3083,8 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// reset this
long maxWinners = (long)MAX_WINNER_NODES;
if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
if ( m_winnerTree.m_numNodes == 0 &&
! m_winnerTree.set ( -1 , // fixeddatasize
maxWinners , // maxnumnodes
@ -3348,7 +3358,8 @@ bool SpiderColl::evalIpLoop ( ) {
// if we started reading, then assume we got a fresh list here
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read2");
log("spider: back from msg5 spiderdb read2 of %li bytes",
m_list.m_listSize);
// . set the winning request for all lists we read so far
@ -3539,7 +3550,8 @@ bool SpiderColl::readListFromSpiderdb ( ) {
return false ;
// note its return
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read");
log("spider: back from msg5 spiderdb read of %li bytes",
m_list.m_listSize);
// no longer getting list
m_gettingList1 = false;
@ -4091,6 +4103,7 @@ bool SpiderColl::scanListForWinners ( ) {
// get the top 100 spider requests by priority/time/etc.
long maxWinners = (long)MAX_WINNER_NODES; // 40
if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// only put 40 urls from the same firstIp into doledb if
// we have a lot of urls in our spiderdb already.
@ -6139,9 +6152,23 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// get priority from doledb key
long pri = g_doledb.getPriority ( doledbKey );
if ( g_conf.m_logDebugSpider )
log("spider: setting pri2=%li nextkey to %s",
m_sc->m_pri2,KEYSTR(&m_sc->m_nextDoledbKey,12));
// if the key went out of its priority because its priority had no
// spider requests then it will bleed over into another priority so
// in that case reset it to the top of its priority for next time
long pri3 = g_doledb.getPriority ( &m_sc->m_nextDoledbKey );
if ( pri3 != m_sc->m_pri2 ) {
m_sc->m_nextDoledbKey = g_doledb.makeFirstKey2 ( m_sc->m_pri2);
// the key must match the priority queue its in as nextKey
//if ( pri3 != m_sc->m_pri2 ) { char *xx=NULL;*xx=0; }
}
if ( g_conf.m_logDebugSpider ) {
long pri4 = g_doledb.getPriority ( &m_sc->m_nextDoledbKey );
log("spider: setting pri2=%li queue doledb nextkey to "
"%s (pri=%li)",
m_sc->m_pri2,KEYSTR(&m_sc->m_nextDoledbKey,12),pri4);
if ( pri4 != m_sc->m_pri2 ) { char *xx=NULL;*xx=0; }
}
// update next doledbkey for this priority to avoid having to
// process excessive positive/negative key annihilations (mdw)
@ -9912,6 +9939,13 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
langLen = gbstrlen(lang);
}
// . get parent language in the request
	// . primary language of the parent page that linked to this url
char *plang = NULL;
long plangLen = 0;
plang = getLanguageAbbr(sreq->m_parentLangId);
if ( plang ) plangLen = gbstrlen(plang);
char *tld = (char *)-1;
long tldLen;
@ -10259,7 +10293,16 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( strncmp(p,"insitelist",10) == 0 ) {
// skip for msg20
//if ( isForMsg20 ) continue;
if ( ! checkedRow ) {
			// if the sitelist contains only seeds and no domain
			// or url is explicitly listed,
// then assume user is spidering the whole internet
// and we basically ignore "insitelist"
if ( sc->m_siteListIsEmpty ) {
// use a dummy row match
row = (char *)1;
}
else if ( ! checkedRow ) {
// only do once for speed
checkedRow = true;
// this function is in PageBasic.cpp
@ -11026,6 +11069,67 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
// come here if we did not match the tld
}
// parentlang=en,zh_cn
if ( *p=='p' && strncmp(p,"parentlang",10)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
//if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like parentlang==en,es,...
subloop2b:
// get length of it in the expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had parentlang==en,es,...
if ( sign == SIGN_EQ &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// if we matched any, that's great
goto matched2b;
// if its parentlang!=en,es,...
			// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// we do not match this rule if we matched
				// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2b;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2b;
// come here on a match
matched2b:
// we matched, now look for &&
p = strstr ( b , "&&" );
			// if nothing else, then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
			// come here if we did not match the parentlang
}
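So a url filter row can now key off the language of the referring page, with the same ==/!= and comma-list syntax as the lang and tld rules. Hypothetical example expressions:

	parentlang==zh_cn,zh_tw
	parentlang!=en,de,fr && hopcount==1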
// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
@ -12244,6 +12348,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
ci->m_hasUrlsReadyToSpider = 0;
// save that!
cr->m_needsSave = true;
// set the time that this happens
cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
}
// save it

@ -528,29 +528,37 @@ class SpiderRequest {
// . this is zero if none or invalid
long m_contentHash32;
/*
char m_reserved1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
// . i made this a short from long to support m_parentLangId etc above
short m_hopCount;
	// when creating a Chinese search engine, for instance, it is nice
	// to know the language of the parent of the page we are spidering.
	// typically a Chinese page will link to another Chinese page,
	// though not always of course. this is the primary language of
	// the parent.
uint8_t m_parentLangId;//reserved1;
// the new add url control will allow user to control link spidering
// on each url they add. they can also specify file:// instead of
// http:// to index local files. so we have to allow file://
char m_onlyAddSameDomainLinks :1;
char m_onlyAddSameSubdomainLinks :1;
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
/* char m_onlyAddSameDomainLinks :1; */
/* char m_onlyAddSameSubdomainLinks :1; */
/* char m_onlyDoNotAddLinksLinks :1; // max hopcount 1 */
/* char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2 */
char m_reserved2a:1;
char m_reserved2b:1;
char m_reserved2c:1;
char m_reserved2d:1;
char m_reserved2e:1;
char m_reserved2f:1;
char m_reserved2g:1;
char m_reserved2h:1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
short m_hopCount;
*/
long m_hopCount;
//long m_hopCount;
// . this is now computed dynamically often based on the latest
// m_addedTime and m_percentChanged of all the SpideRec *replies*.
@ -715,6 +723,8 @@ class SpiderRequest {
m_ufn = -1;
// this too
m_priority = -1;
// this happens to be zero already, but just in case it changes
m_parentLangId = langUnknown;
};
static long getNeededSize ( long urlLen ) {

@ -116,12 +116,28 @@ bool Title::setTitle ( XmlDoc *xd ,
SafeBuf jsonTitle;
long vlen = 0;
if ( xd->m_contentType == CT_JSON ) {
// shortcut
char *s = xd->ptr_utf8Content;
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
jt = getJSONFieldValue(s,"title",&vlen);
if ( jt && vlen > 0 ) {
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness);
//true ); // decodeAll?
jsonTitle.nullTerm();
}
// if we got a product, try getting price
long oplen;
char *op = getJSONFieldValue(s,"offerPrice",&oplen);
if ( op && oplen ) {
if ( ! is_digit(op[0]) ) { op++; oplen--; }
float price = atof2(op,oplen);
// print without decimal point if ends in .00
if ( (float)(long)price == price )
jsonTitle.safePrintf(", &nbsp; $%li",
(long)price);
else
jsonTitle.safePrintf(", &nbsp; $%.02f",price);
}
if ( jsonTitle.length() ) {
val = jsonTitle.getBufStart();
vlen = jsonTitle.length();
}
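For a product object this means the displayed title carries the price as well; a hypothetical example of what the block above produces:

	// hypothetical JSON input (ptr_utf8Content):
	//   { "title":"Acme Widget", "offerPrice":"$19.99" }
	// resulting title text:
	//   Acme Widget, &nbsp; $19.99
	// whole-dollar prices such as "$20.00" drop the decimals:
	//   Acme Widget, &nbsp; $20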

@ -50,9 +50,16 @@ bool Wiki::load() {
close ( fd2 );
// save text size for getRandomPhrase() function below
m_txtSize = stats1.st_size;
// just use the .dat if we got it
if ( ! errno2 ) {
log(LOG_INFO,"wiki: Loading %s",ff2);
// "dir" is NULL since already included in ff2
return m_ht.load ( NULL , ff2 );
}
// if we got a newer binary version, use that
if ( ! errno2 && ! errno1 && stats2.st_mtime > stats1.st_mtime ) {
log(LOG_INFO,"wiki: loading %s",ff2);
	// add in 10 seconds of slack, i guess
if ( ! errno2 && ! errno1 && stats2.st_mtime +10> stats1.st_mtime ) {
log(LOG_INFO,"wiki: Loading %s",ff2);
// "dir" is NULL since already included in ff2
return m_ht.load ( NULL , ff2 );
}
@ -70,15 +77,17 @@ bool Wiki::load() {
bool Wiki::loadText ( long fileSize ) {
log(LOG_INFO,"wiki: generating wikititles2.dat file");
SafeBuf sb;
char ff1[256];
sprintf(ff1, "%swikititles.txt.part1", g_hostdb.m_dir);
log(LOG_INFO,"wiki: loading %s",ff1);
log(LOG_INFO,"wiki: Loading %s",ff1);
if ( ! sb.fillFromFile(ff1) ) return false;
char ff2[256];
sprintf(ff2, "%swikititles.txt.part2", g_hostdb.m_dir);
log(LOG_INFO,"wiki: loading %s",ff2);
log(LOG_INFO,"wiki: Loading %s",ff2);
if ( ! sb.catFile(ff2) ) return false;
@ -312,6 +321,9 @@ bool Wiki::loadText ( long fileSize ) {
//char ff2[256];
//sprintf(ff2, "%s/wikititles2.dat", g_hostdb.m_dir);
if ( ! m_ht.save ( g_hostdb.m_dir , "wikititles2.dat" ) ) return false;
log(LOG_INFO,"wiki: done generating wikititles2.dat file");
// success
return true;
}

@ -261,10 +261,10 @@ bool Wiktionary::load() {
( errno1 || stats3.st_mtime > stats1.st_mtime )
//&& ( errno2 || stats3.st_mtime > stats2.st_mtime )
) {
log(LOG_INFO,"wikt: loading %s",ff3);
log(LOG_INFO,"wikt: Loading %s",ff3);
if ( ! m_synTable .load ( NULL , ff3 ) )
return false;
log(LOG_INFO,"wikt: loading %s",ff4);
log(LOG_INFO,"wikt: Loading %s",ff4);
if ( m_synBuf.fillFromFile ( NULL , ff4 ) <= 0 )
return false;
@ -288,7 +288,7 @@ bool Wiktionary::load() {
log("gb: %s or %s checksum is not approved for "
"live service (%lli != %lli)", ff3, ff4,
h,nn);
return false;
//return false;
}
return true;
@ -517,7 +517,7 @@ bool Wiktionary::generateHashTableFromWiktionaryTxt ( long sizen ) {
//
char ff1[256];
sprintf(ff1, "%swiktionary.txt.aa", g_hostdb.m_dir);
log(LOG_INFO,"wikt: loading %s",ff1);
log(LOG_INFO,"wikt: Loading %s",ff1);
int fd1 = open ( ff1 , O_RDONLY );
if ( fd1 < 0 ) {
log("wikt: open %s : %s",ff1,mstrerror(errno));
@ -558,7 +558,7 @@ bool Wiktionary::generateHashTableFromWiktionaryTxt ( long sizen ) {
round++;
offset = 0;
sprintf(ff1,"%swiktionary.txt.ab",g_hostdb.m_dir);
log(LOG_INFO,"wikt: loading %s",ff1);
log(LOG_INFO,"wikt: Loading %s",ff1);
int fd1 = open ( ff1 , O_RDONLY );
if ( fd1 < 0 ) {
log("wikt: open %s : %s",ff1,mstrerror(errno));

1576
XmlDoc.cpp

File diff suppressed because it is too large.

@ -273,7 +273,9 @@ class XmlDoc {
// . we can avoid setting Xml and Words classes etc...
long m_contentHash32;
// like the above but hash of all tags in TagRec for this url
long m_tagHash32;
//long m_tagHash32;
	// this is a hash of all adjacent tag pairs for template identification
uint32_t m_tagPairHash32;
long m_siteNumInlinks;
long m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
long m_siteNumInlinksUniqueCBlock; // m_sitePop;
@ -490,7 +492,13 @@ class XmlDoc {
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
char **getTitleRec ( ) ;
// store TitleRec into "buf" so it can be added to metalist
bool setTitleRecBuf ( SafeBuf *buf , long long docId, long long uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
SafeBuf *getSpiderReplyMetaList ( class SpiderReply *reply ) ;
SafeBuf *getSpiderReplyMetaList2 ( class SpiderReply *reply ) ;
SafeBuf m_spiderReplyMetaList;
char *getIsAdult ( ) ;
long **getIndCatIds ( ) ;
long **getCatIds ( ) ;
@ -540,8 +548,6 @@ class XmlDoc {
class HashTableX *getCountTable ( ) ;
bool hashString_ct ( class HashTableX *ht, char *s , long slen ) ;
uint8_t *getSummaryLangId ( ) ;
long *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
long *getSummaryVector ( ) ;
long *getPageSampleVector ( ) ;
long *getPostLinkTextVector ( long linkNode ) ;
@ -601,6 +607,7 @@ class XmlDoc {
//bool *updateRootLangId ( );
char **getRootTitleRec ( ) ;
//char **getContactTitleRec ( char *url ) ;
long long *getAvailDocIdOnly ( long long preferredDocId ) ;
long long *getDocId ( ) ;
char *getIsIndexed ( ) ;
class TagRec *getTagRec ( ) ;
@ -666,11 +673,13 @@ class XmlDoc {
char **getUtf8Content ( ) ;
long *getContentHash32 ( ) ;
long *getContentHashJson32 ( ) ;
long *getTagHash32 ( ) ;
//long *getTagHash32 ( ) ;
long *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
long getHostHash32a ( ) ;
long getHostHash32b ( ) ;
long getDomHash32 ( );
char **getImageData();
char **getThumbnailData();
class Images *getImages ( ) ;
int8_t *getNextSpiderPriority ( ) ;
long *getPriorityQueueNum ( ) ;
@ -696,7 +705,7 @@ class XmlDoc {
SafeBuf *getNewTagBuf ( ) ;
char *updateTagdb ( ) ;
bool logIt ( ) ;
bool logIt ( class SafeBuf *bb = NULL ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
long printMetaList ( ) ;
@ -733,7 +742,9 @@ class XmlDoc {
// bool nosplit ) ;
long getSiteRank ();
bool addTable144 ( class HashTableX *tt1 );
bool addTable144 ( class HashTableX *tt1 ,
long long docId ,
class SafeBuf *buf = NULL );
bool addTable224 ( HashTableX *tt1 ) ;
@ -749,6 +760,7 @@ class XmlDoc {
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
long getBoostFromSiteNumInlinks ( long inlinks ) ;
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
@ -756,7 +768,7 @@ class XmlDoc {
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table , bool hashNonFieldTerms=true) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
@ -783,10 +795,12 @@ class XmlDoc {
bool hashAds(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
bool hashSubmitUrls ( class HashTableX *table ) ;
bool hashImageStuff ( class HashTableX *table ) ;
bool hashIsAdult ( class HashTableX *table ) ;
void set20 ( Msg20Request *req ) ;
class Msg20Reply *getMsg20Reply ( ) ;
char **getDiffbotPrimaryImageUrl ( ) ;
char **getImageUrl() ;
class MatchOffsets *getMatchOffsets () ;
Query *getQuery() ;
@ -823,6 +837,8 @@ class XmlDoc {
bool hashString ( char *s ,
long slen ,
class HashInfo *hi ) ;
bool hashString ( char *s ,
class HashInfo *hi ) ;
@ -1057,7 +1073,7 @@ class XmlDoc {
// fear of getting the buffer overwritten by crap
//TagRec m_savedTagRec1;
//char *m_sampleVector ;
uint32_t m_tagPairHash;
//uint32_t m_tagPairHash32;
long m_firstIp;
class SafeBuf *m_savedSb;
@ -1077,6 +1093,7 @@ class XmlDoc {
char m_firstUrlHash64Valid;
char m_lastUrlValid;
char m_docIdValid;
char m_availDocIdValid;
//char m_collValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
@ -1162,11 +1179,9 @@ class XmlDoc {
//char m_msge2Valid;
//char m_sampleVectorValid;
char m_gigabitHashesValid;
char m_tagPairHashValid;
//char m_oldsrValid;
char m_sreqValid;
char m_srepValid;
char m_titleRecValid;
bool m_ipValid;
bool m_firstIpValid;
@ -1219,7 +1234,9 @@ class XmlDoc {
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
bool m_tagHash32Valid;
//bool m_tagHash32Valid;
bool m_tagPairHash32Valid;
bool m_linkInfo2Valid;
bool m_spiderLinksValid;
//bool m_nextSpiderPriorityValid;
@ -1320,6 +1337,7 @@ class XmlDoc {
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
bool m_imageUrl2Valid;
bool m_matchOffsetsValid;
bool m_queryValid;
bool m_matchesValid;
@ -1332,11 +1350,13 @@ class XmlDoc {
bool m_newTermInfoBufValid;
bool m_summaryValid;
bool m_gsbufValid;
bool m_spiderReplyMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
//bool m_isVisibleValid;
bool m_clockCandidatesTableValid;
bool m_clockCandidatesDataValid;
bool m_titleRecBufValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_isHijackedValid;
@ -1402,6 +1422,7 @@ class XmlDoc {
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;
long long m_docIdWeAreADupOf;
long m_ei;
long m_lastLaunch;
Msg22Request m_msg22Request;
@ -1943,8 +1964,10 @@ class XmlDoc {
//long m_gsbufAllocSize;
char *m_note;
char *m_imageUrl;
char *m_imageUrl2;
//char m_imageUrlBuf[100];
SafeBuf m_imageUrlBuf;
SafeBuf m_imageUrlBuf2;
//long m_imageUrlSize;
MatchOffsets m_matchOffsets;
Query m_query;
@ -1973,11 +1996,12 @@ class XmlDoc {
bool m_deleteFromIndex;
// ptrs to stuff
char *m_titleRec;
long m_titleRecSize;
bool m_freeTitleRec;
long m_titleRecAllocSize;
key_t m_titleRecKey;
//char *m_titleRec;
SafeBuf m_titleRecBuf;
//long m_titleRecSize;
//bool m_freeTitleRec;
//long m_titleRecAllocSize;
key_t m_titleRecKey;
// for isDupOfUs()
char *m_dupTrPtr;
@ -2335,6 +2359,8 @@ public:
//m_useWeights = false;
m_useSynonyms = false;
m_hashGroup = -1;
m_useCountTable = true;
m_useSections = true;
m_startDist = 0;
m_siteHash32 = 0;
};
@ -2350,6 +2376,8 @@ public:
char m_hashGroup;
long m_startDist;
long m_siteHash32;
bool m_useCountTable;
bool m_useSections;
};

@ -281,6 +281,7 @@
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugImageMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>

@ -63,15 +63,29 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
<br><br><a name=quickstart></a>
<h1>Quick Start</h1>
Until I get the binary packages ready, <a href=#src>build from the source code</a>, it should only take about 30 seconds to type the three commands.
<!--
Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM.<br><br>
<!--Until I get the binary packages ready, <a href=#src>build from the source code</a>, it should only take about 30 seconds to type the three commands.-->
Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM to run one instance/host of gb.<br><br>
Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigablast-1.0-1.rpm>Gigablast package for RedHat</a>.
<br><br>
Once installed visit your <a href=http://127.0.0.1:8000/>local port 8000</a> to access the search engine controls and begin configuration. It could take up to 20 seconds to start the search engine for the first time.
<br><br>
<table><tr><td colspan=2><b>Installed Files</b></td></tr>
<tr><td>/var/gigablast/data0/</td><td>Directory of Gigablast binary and data files</td></tr>
<tr><td>/etc/gigablast/hosts.conf</td><td>Describes the hosts in the distributed cluster. Multiple hosts may exist on one physical server. Initially hosts.conf is just configured to use /var/gigablast/data0/ as the only host. See the section on <a href=#scaling>scaling</a> to add more hosts.</td></tr>
<tr><td>/etc/init.d/gb</td><td>start up script link</td></tr>
<tr><td>/etc/init/gb.conf</td><td>upstart conf file so you can type 'start gb' or 'stop gb', but that will only work on local instances of gb.</td></tr>
<tr><td>/usr/bin/gb</td><td>Link to /var/gigablast/data0/gb</td></tr>
</table>
Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigablast-1.0-1.rpm>Gigablast package for RedHat</a>.
<br><br>
If you run into any bugs, let me know so I can fix them right away: mattdwells@hotmail.com.
-->
<br>
<br>
@ -100,11 +114,11 @@ You will need the following packages installed<br>
<b>1.</b> Do <b>apt-get install make g++ gcc-multilib lib32stdc++6</b>
<br>
<b>2.</b> Download the <a href=https://github.com/gigablast/open-source-search-engine>Gigablast source code</a> using <b>wget --no-check-certificate "https://github.com/gigablast/open-source-search-engine/archive/master.zip"</b>, unzip it and cd into it.
<b>2.</b> Download the <a href=https://github.com/gigablast/open-source-search-engine>Gigablast source code</a> using <b>wget --no-check-certificate "https://github.com/gigablast/open-source-search-engine/archive/master.zip"</b>, unzip it and cd into it. (optionally use <b>git clone https://github.com/gigablast/open-source-search-engine.git ./github</b> if you have <i>git</i> installed.)
<br>
<b>3.</b> Run <b>make</b> to compile. (e.g. use 'make -j 4' to compile on four cores)
<br>
<b>4.</b> Run <b>./gb 0 -d</b> to start a single gigablast node which listens on port 8000 running in daemon mode.
<b>4.</b> Run <b>./gb -d</b> to start a single Gigablast node that listens on port 8000, running in daemon mode (-d).
<br>
<b>5.</b> The first time you run gb, wait about 30 seconds for it to build some files. Check the log file to see when it completes.
<br>
@ -157,7 +171,9 @@ You will need the following packages installed<br>
<li> Sorting. Sort the search results by meta tags or JSON fields that contain numbers, simply by adding something like gbsortby:price or gbrevsortby:price as a query term, assuming you have meta price tags.
<li> Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
<li> Use &stream=1 to stream back millions of search results for a query without running out of memory.
<li> Makes and displays thumbnail images in the search results.
<li> Nested boolean queries using AND, OR, NOT operators.
<li> Federated search over multiple Gigablast collections using syntax like &c=mycoll1+mycoll2+mycoll3+...
<li> Built-in support for <a href=http://www.diffbot.com/products/automatic/>diffbot.com's api</a>, which extracts various entities from web sites, like products, articles, etc. But you will need to get a free token from them for access to their API.
<li> Spellchecker will be re-enabled shortly.
</ul>
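For example, several of the features above combine in one request (hypothetical collection names):
<br><br>
<b>http://127.0.0.1:8000/search?c=mycoll1+mycoll2&stream=1&q=shoes+gbsortby%3Aprice</b>
<br><br>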

BIN
html/magglass.png Normal file

Binary file not shown.

Size: 2.0 KiB

22
init.gb.conf Normal file

@ -0,0 +1,22 @@
# Gigablast Search Engine Service
description "Gigablast Search Engine Service"
author "Matt Wells <gigablast@mail.com>"
start on runlevel [2345]
stop on starting rc RUNLEVEL=[016]
#respawn
#respawn limit 2 5
env HOME=/var/gigablast/shard0/
umask 007
# The default of 5 seconds is too low for mysql which needs to flush buffers
#kill timeout 300
# this will read /etc/gigablast/hosts.conf and start up the
# hosts in there that are local on this machine based on its ip address.
# if one is already running it should detect that it can not bind to the
# port and just exit right away without doing any harm.
exec /var/gigablast/shard0/gb localstart
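Since the package installs this job as /etc/init/gb.conf (see the installed-files table above), the local gb instances can be managed with the usual upstart commands, e.g.:

	sudo start gb
	sudo stop gb
	sudo status gb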

BIN
libjpeg.so.62 Normal file

Binary file not shown.

BIN
libnetpbm.so.10 Normal file

Binary file not shown.

BIN
libpng12.so.0 Normal file

Binary file not shown.

BIN
libtiff.so.4 Normal file

Binary file not shown.

BIN
libz.so.1 Normal file

Binary file not shown.

283
main.cpp

@ -181,6 +181,8 @@ bool g_recoveryMode = false;
bool isRecoveryFutile ( ) ;
int copyFiles ( char *dstDir ) ;
//////
//
// if seo.o is being linked to it needs to override these weak stubs:
@ -402,17 +404,22 @@ int main2 ( int argc , char *argv[] ) {
//vpointerObject.isValidPointer(&vpointerObject); // whiny compiler
// End Pointer Check setup
if (argc < 1) {
if (argc < 0) {
printHelp:
SafeBuf sb;
sb.safePrintf(
"\n"
"Usage: gb [-w workingDir] <CMD>\n");
"Usage: gb [CMD]\n");
sb.safePrintf(
"\n"
"\tItems in []'s are optional, and items "
"in <>'s are "
"required.");
"\tgb will first try to load "
"the hosts.conf in the same directory as the "
"gb binary, if not found, then it will try "
"/etc/gigablast/hosts.conf. "
"Then it will determine its hostId based on "
"the directory and IP address listed in the "
"hosts.conf file it loaded. Things in []'s "
"are optional.");
/*
sb.safePrintf(
"\n\t"
@ -425,26 +432,30 @@ int main2 ( int argc , char *argv[] ) {
"overwritten from git pulls.\n\n" );
*/
sb.safePrintf(
"<CMD> can have the following values:\n\n"
"[CMD] can have the following values:\n\n"
"-h\tprint this help.\n\n"
"-v\tprint version and exit.\n\n"
"<hostId>\n"
"\tstart the gb process for this <hostId> locally."
" <hostId> is 0 to run as host #0, for instance."
"\n\n"
//"<hostId>\n"
//"\tstart the gb process for this <hostId> locally."
//" <hostId> is 0 to run as host #0, for instance."
//"\n\n"
"<hostId> -d\n\trun as daemon.\n\n"
//"<hostId> -d\n\trun as daemon.\n\n"
"-d\trun as daemon.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
"<hostId> -r\n\tindicates recovery mode, "
// "<hostId> -r\n\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
"-r\tindicates recovery mode, "
"sends email to addresses "
"specified in Conf.h upon startup.\n\n"
"start [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified using an ssh command.\n\n"
@ -947,20 +958,30 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}
//SafeBuf tt;
//tt.base64Encode("any carnal pleas",16);
//fprintf(stderr,"%s\n",tt.getBufStart());
//exit(0);
// get hosts.conf file
//char *hostsConf = "./hosts.conf";
long hostId = 0;
long cmdarg = 1;
char *workingDir = NULL;
if ( argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='w'&&argv[1][2]=='\0') {
//hostsConf = argv[2];
workingDir = argv[2];
cmdarg = 3;
}
//long hostId = -1;
long cmdarg = 0;
//char *workingDir = NULL;
//if(argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='w'&&argv[1][2]=='\0') {
// //hostsConf = argv[2];
// workingDir = argv[2];
// cmdarg = 3;
// }
// get command
if ( argc <= cmdarg ) goto printHelp;
char *cmd = argv[cmdarg];
//if ( argc <= cmdarg ) goto printHelp;
// it might not be there, might be a simple "./gb"
char *cmd = "";
if ( argc >= 2 ) {
cmdarg = 1;
cmd = argv[1];
}
// help
if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp;
@ -979,18 +1000,18 @@ int main2 ( int argc , char *argv[] ) {
// return 0;
//}
bool hadHostId = false;
//bool hadHostId = false;
// assume our hostId is the command!
// now we advance 'cmd' past the hostId if we detect
// the presence of more args
if ( is_digit(argv[cmdarg][0]) ) {
hostId = atoi(argv[cmdarg]);
if(argc > cmdarg+1) {
cmd = argv[++cmdarg];
}
hadHostId = true;
}
// the presence of more args.
// WE NO LONGER do it this way...
// if ( is_digit(argv[cmdarg][0]) ) {
// hostId = atoi(argv[cmdarg]);
// if(argc > cmdarg+1) {
// cmd = argv[++cmdarg];
// }
// hadHostId = true;
// }
if ( strcmp ( cmd , "dosopen" ) == 0 ) {
long ip;
@ -1024,6 +1045,25 @@ int main2 ( int argc , char *argv[] ) {
testMandrill = true;
}
/*
class foo {
public:
long poo;
};
class fart {
public:
short fart3;
char fart1;
char fart2;
};
foo xxx;
xxx.poo = 38123;
fart *yyy = (fart *)&xxx;
fprintf(stderr,"fart1=%li fart2=%li fart3=%li\n",
(long)yyy->fart1,(long)yyy->fart2,(long)yyy->fart3);
exit(0);
*/
// gb gendbs, preset the hostid at least
if ( //strcmp ( cmd , "gendbs" ) == 0 ||
//strcmp ( cmd , "gentfndb" ) == 0 ||
@ -1037,7 +1077,7 @@ int main2 ( int argc , char *argv[] ) {
// ensure we got a collection name after the cmd
if ( cmdarg + 2 > argc ) goto printHelp;
// may also have an optional hostid
if ( cmdarg + 3 == argc ) hostId = atoi ( argv[cmdarg+2] );
//if ( cmdarg + 3 == argc ) hostId = atoi ( argv[cmdarg+2] );
}
if( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 2)) ) {
@ -1047,7 +1087,7 @@ int main2 ( int argc , char *argv[] ) {
}
// set it for g_hostdb and for logging
g_hostdb.m_hostId = hostId;
//g_hostdb.m_hostId = hostId;
//if ( strcmp ( cmd , "gzip" ) == 0 ) {
// if ( argc > cmdarg+1 ) gbgzip(argv[cmdarg+1]);
@ -1061,7 +1101,6 @@ int main2 ( int argc , char *argv[] ) {
// return 0;
//}
// these tests do not need a hosts.conf
/*
if ( strcmp ( cmd , "trietest" ) == 0 ) {
@ -1111,8 +1150,8 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
// load up hosts.conf
if ( ! g_hostdb.init(hostId) ) {
log("db: hostdb init failed." ); return 1; }
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
@ -1157,8 +1196,8 @@ int main2 ( int argc , char *argv[] ) {
*/
if ( strcmp ( cmd , "booltest" ) == 0 ){
if ( ! g_hostdb.init(hostId) ) {
log("db: hostdb init failed." ); return 1; }
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
@ -1282,7 +1321,7 @@ int main2 ( int argc , char *argv[] ) {
strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
isProxy = true;
// we need to parse out the hostid too!
if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
//if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
}
// this is just like starting up a gb process, but we add one to
@ -1298,8 +1337,8 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
useTmpCluster = 1;
// we need to parse out the hostid too!
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
else goto printHelp;
//if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
//else goto printHelp;
}
// gb inject <file> <ip:port> [startdocid]
@ -1325,16 +1364,23 @@ int main2 ( int argc , char *argv[] ) {
// get current working dir that the gb binary is in. all the data
// files should in there too!!
//
if ( ! workingDir ) workingDir = getcwd2 ( argv[0] );
//if ( ! workingDir ) workingDir = getcwd2 ( argv[0] );
char *workingDir = getcwd2 ( argv[0] );
//log("host: working directory is %s",workingDir);
// load up hosts.conf
if ( ! g_hostdb.init(hostId,
// . it will determine our hostid based on the directory path of this
// gb binary and the ip address of this server
if ( ! g_hostdb.init(-1, // we don't know it!!!hostId,
NULL,
isProxy,
useTmpCluster,
workingDir)){
log("db: hostdb init failed." ); return 1; }
Host *h9 = g_hostdb.m_myHost;
	// set clock file name so gettimeofdayInMillisecondsGlobal()
	// sees g_clockInSync as true... unless clockadjust.dat is more
	// than 2 days old, in which case not!
@ -1788,6 +1834,12 @@ int main2 ( int argc , char *argv[] ) {
char *cmd = argv[cmdarg+1];
return install ( ifk_dsh2 , -1,NULL,NULL,-1, cmd );
}
// gb copyfiles, like gb install but takes a dir not a host #
if ( strcmp ( cmd , "copyfiles" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
char *dir = argv[cmdarg+1];
return copyFiles ( dir );
}
// gb install
if ( strcmp ( cmd , "install" ) == 0 ) {
// get hostId to install TO (-1 means all)
@ -2307,8 +2359,8 @@ int main2 ( int argc , char *argv[] ) {
// mainStart:
// get host info for this host
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) { log("db: No host has id %li.",hostId); return 1;}
//Host *h = g_hostdb.getHost ( hostId );
//if ( ! h ) { log("db: No host has id %li.",hostId); return 1;}
// once we are in recoverymode, that means we are being restarted
// from having cored, so to prevent immediate core and restart
@ -2329,7 +2381,7 @@ int main2 ( int argc , char *argv[] ) {
// name gbHID.conf
// . now that hosts.conf has more of the burden, all gbHID.conf files
// can be identical
if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
if ( ! g_conf.init ( h9->m_dir ) ) { // , h->m_hostId ) ) {
log("db: Conf init failed." ); return 1; }
//if ( ! g_hostdb.validateIps ( &g_conf ) ) {
// log("db: Failed to validate ips." ); return 1;}
@ -2421,10 +2473,10 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 &&
argv[cmdarg+1][0]=='I') {
if ( ! hadHostId ) {
log("you must supply hostid in the dump cmd");
return 0;
}
//if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
//}
long fileNum = 0;
long long off = 0LL;
@ -2440,10 +2492,10 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 &&
argv[cmdarg+1][0]=='T') {
if ( ! hadHostId ) {
log("you must supply hostid in the dump cmd");
return 0;
}
//if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
//}
long fileNum = 0;
long long off = 0LL;
@ -2462,10 +2514,10 @@ int main2 ( int argc , char *argv[] ) {
// [priority] [printStats?]
if ( strcmp ( cmd , "dump" ) == 0 ) {
if ( ! hadHostId ) {
log("you must supply hostid in the dump cmd");
return 0;
}
// if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
// }
//
// tell Collectiondb, not to verify each rdb's data
@ -2749,6 +2801,8 @@ int main2 ( int argc , char *argv[] ) {
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort))
return 1;
long *ips;
//if ( strcmp ( cmd , "gendbs" ) == 0 ) goto jump;
//if ( strcmp ( cmd , "gentfndb" ) == 0 ) goto jump;
if ( strcmp ( cmd , "gencatdb" ) == 0 ) goto jump;
@ -2760,7 +2814,8 @@ int main2 ( int argc , char *argv[] ) {
g_hostdb.m_logFilename );
if ( ! g_conf.m_runAsDaemon )
log("db: Use ./gb <hostid> -d to run as daemon.");
log("db: Use ./gb -d to run as daemon. Example: "
"./gb 0 -d");
/*
// tmp stuff to generate new query log
@ -2776,7 +2831,16 @@ int main2 ( int argc , char *argv[] ) {
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed.\n" ); return 1; }
fprintf (stderr,"db: Log file init failed. Exiting.\n" );
return 1;
}
// in case we do not have one, we need it for Images.cpp
if ( ! makeTrashDir() ) {
fprintf (stderr,"db: failed to make trash dir. Exiting.\n" );
return 1;
}
g_errno = 0;
@ -2807,6 +2871,20 @@ int main2 ( int argc , char *argv[] ) {
g_log.m_logTimestamps = true;
// show current working dir
log("host: Working directory is %s",workingDir);
log("host: Using %shosts.conf",g_hostdb.m_dir);
// from Hostdb.cpp
ips = getLocalIps();
for ( ; ips && *ips ; ips++ )
log("host: Detected local ip %s",iptoa(*ips));
// show it
log("host: Running as host id #%li",g_hostdb.m_hostId );
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
return 1;
@ -3275,7 +3353,7 @@ int main2 ( int argc , char *argv[] ) {
// . then dns Distributed client
// . server should listen to a socket and register with g_loop
// . Only the distributed cache shall call the dns server.
if ( ! g_dns.init( h->m_dnsClientPort ) ) {
if ( ! g_dns.init( h9->m_dnsClientPort ) ) {
log("db: Dns distributed client init failed." ); return 1; }
// . then dns Local client
//if ( ! g_dnsLocal.init( 0 , false ) ) {
@ -3283,7 +3361,7 @@ int main2 ( int argc , char *argv[] ) {
// . then webserver
// . server should listen to a socket and register with g_loop
// again:
if ( ! g_httpServer.init( h->m_httpPort, h->m_httpsPort ) ) {
if ( ! g_httpServer.init( h9->m_httpPort, h9->m_httpsPort ) ) {
log("db: HttpServer init failed. Another gb already "
"running?" );
// this is dangerous!!! do not do the shutdown thing
@ -3453,7 +3531,7 @@ int main2 ( int argc , char *argv[] ) {
char buf[256];
log("admin: Sending emails.");
sprintf(buf, "Host %li respawning after crash.(%s)",
hostId, iptoa(g_hostdb.getMyIp()));
h9->m_hostId, iptoa(g_hostdb.getMyIp()));
g_pingServer.sendEmail(NULL, buf);
}
@ -4642,28 +4720,31 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"rcp %sgb.conf %s:%sgb.conf &",
"scp %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
iptoa(h2->m_ip),
h2->m_dir);
h2->m_dir,
//h2->m_hostId);
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %shosts.conf %s:%shosts.conf &",
dir ,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %shosts2.conf %s:%shosts2.conf &",
dir ,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// sprintf(tmp,
// "scp %shosts.conf %s:%shosts.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
// sprintf(tmp,
// "scp %shosts2.conf %s:%shosts2.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
}
else if ( installFlag == ifk_start ) {
// . save old log now, too
@ -4743,7 +4824,7 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
"ssh %s \"cd %s ; "
"cp -f tmpgb tmpgb.oldsave ; "
"mv -f tmpgb.installed tmpgb ; "
"./tmpgb -w %s tmpstarthost "
"%s/tmpgb tmpstarthost "
"%li >& ./tmplog%03li &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
@ -16878,8 +16959,8 @@ bool isRecoveryFutile ( ) {
// get time stamp
long timestamp = ff.getLastModifiedTime ( );
		// skip if not within last minute
if ( timestamp < now - 60 ) continue;
		// skip if not within 2 minutes
if ( timestamp < now - 2*60 ) continue;
// open it up to see if ends with sighandle
long toRead = 3000;
@ -16931,16 +17012,27 @@ char *getcwd2 ( char *arg ) {
// store the relative path of gb in there now
static char s_cwdBuf[1025];
getcwd ( s_cwdBuf , 1024 );
getcwd ( s_cwdBuf , 1020 );
char *end = s_cwdBuf + gbstrlen(s_cwdBuf);
// if "arg" is a RELATIVE path then append it
if ( arg && arg[0]!='/' ) {
memcpy ( end , arg , alen );
end += alen;
*end = '\0';
}
// if our path started with / then it was absolute...
else {
strncpy(s_cwdBuf,arg,alen);
}
// make sure it ends in / for consistency
long clen = gbstrlen(s_cwdBuf);
if ( s_cwdBuf[clen-1] != '/' ) {
s_cwdBuf[clen++] = '/';
s_cwdBuf[clen++] = '\0';
}
*end = '\0';
// size of the whole thing
//long clen = gbstrlen(s_cwdBuf);
@ -16954,3 +17046,22 @@ char *getcwd2 ( char *arg ) {
return s_cwdBuf;
}
int copyFiles ( char *dstDir ) {
char *srcDir = "./";
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
SafeBuf tmp;
tmp.safePrintf(
"cp -r %s %s"
, fileListBuf.getBufStart()
, dstDir
);
//log(LOG_INIT,"admin: %s", tmp.getBufStart());
fprintf(stderr,"\nRunning cmd: %s\n",tmp.getBufStart());
system ( tmp.getBufStart() );
return 0;
}
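The new copyfiles command wired up above just shells out to cp with the file list from g_process.getFilesToCopy(); e.g. to seed a second host's working directory (hypothetical target path):

	./gb copyfiles /var/gigablast/data1/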

BIN
pdftohtml

Binary file not shown.