privacore-open-source-searc…/StopWords.cpp

// Matt Wells, copyright Jul 2001

#include "StopWords.h"
#include "gb-include.h"
#include "HashTableX.h"
#include "Speller.h"
#include "Loop.h"
#include "Lang.h"
#include "Posdb.h" // MAXLANGID
#include "GbMutex.h"
#include "ScopedLock.h"

// . h is the lower ascii 64bit hash of a word
// . this returns true if h is the hash of an ENGLISH stop word
// . list taken from www.superjournal.ac.uk/sj/application/demo/stopword.htm 
// . stop words with "mdw" next to them are ones I added


// . i shrunk this list a lot
// . see backups for the old list
// English stop words looked up by isStopWord().
// . the list is NULL-terminated; initWordTable() counts entries up to the NULL
// . initWordTable() stores each word under its hash, with the word's 1-based
//   position in this array as the score, so reordering entries changes scores
// . entries that are commented out are intentionally NOT stop words
static const char * const s_stopWords[] = {
	"a",
	"b",
	"c",
	"d",
	"e",
	"f",
	"g",
	"h",
	"i",
	"j",
	"k",
	"l",
	"m",
	"n",
	"o",
	"p",
	"q",
	"r",
	"s",
	"t",
	"u",
	"v",
	"w",
	"x",
	"y",
	"z",
	"0",
	"1",
	"2",
	"3",
	"4",
	"5",
	"6",
	"7",
	"8",
	"9",
	"an",
	"as",
	"at",
	"be",
	"by",
	"of",
	"on",
	"or",
	"do",
	"he",
	"if",
	"is",
	"it",
	"in",
	"me",
	"my",
	"re",
	"so",
	"to",
	"us",
	"vs",
	"we",
	"the",
	"and",
	"are",
	"can",
	"did",
	"per",
	"for",
//		"get",
	"had",
	"has",
	"her",
	"him",
	"its",
//              "may",	  // disabled: collides with the month name
	// history of "not": it was once removed because wikipedia has it in
	// lower case in titles (e.g. 'to be or not to be'), but per the note
	// below that removal was reverted, so "not" IS a stop word here
	"not", // fix 'to be or not to be'... no, revert
	"our",
	"she",
	"you",
	"also",
	"been",
	"from",
	"have",
	"here",
	"hers",
//	        "mine",   // disabled: collides with "land mine"
	"ours",
	"that",
	"them",
	"then",
	"they",
	"this",
	"were",
	"will",
	"with",
	"your",
	"about",
	"above",
	//"ain",   // ain't
	"could",
	//"isn",   // isn't
	"their",
	"there",
	"these",
	"those",
	"through", // fix ceder.net "Mainstream thru A1 Dance" event title
	"thru",    // fix ceder.net "Mainstream thru A1 Dance" event title
	"until", // fix event title for blackbirdbuvette.com
	"under", // fix title for http://www.harwoodmuseum.org/press_detail.php?ID=44
	"would",
	"yours",
	"theirs",
	// contraction fragments below are disabled (not stop words)
	//"aren",  // aren't
	//"hadn",  // hadn't
	//"didn",  // didn't
	//"hasn",  // hasn't
	//"ll",    // they'll this'll that'll you'll
	//"ve",    // would've should've
	//"should",
	//"shouldn", // shouldn't
	NULL	// terminator required by initWordTable()
};
static HashTableX s_stopWordTable;
static bool       s_stopWordsInitialized = false;
static GbMutex    s_stopWordTableMutex;

// . fills "table" with the hashes of the words in "words"
// . "words" must be terminated by a NULL entry
// . each word is stored under hash64Lower_utf8(word) with a score equal to its
//   1-based position in the list
// . "label" is handed through to HashTableX::set()
// . returns false if the table could not be set up or a word could not be added
static bool initWordTable(HashTableX *table, const char * const words[], const char *label) {
	// walk to the NULL terminator to learn how many words there are
	int32_t numWords = 0;
	while ( words[numWords] ) numWords++;

	// size the table at twice the word count
	// (8 and 4 are presumably the key and value sizes in bytes - confirm
	//  against HashTableX::set)
	if ( ! table->set ( 8,4,numWords * 2,NULL,0,false,label ) ) {
		log(LOG_INIT, "build: Could not init stop words table.");
		return false;
	}

	// add every word; the score is the list position starting at 1
	for ( int32_t pos = 0 ; pos < numWords ; pos++ ) {
		const char *word     = words[pos];
		int32_t     wordLen  = strlen ( word );
		int64_t     wordHash = hash64Lower_utf8 ( word , wordLen );
		if ( ! table->addTerm(wordHash,pos+1) ) return false;
	}
	return true;
}

// . returns true if the word is a stop word from s_stopWords
// . s/len is the word itself, h is its hash; the table is keyed by
//   hash64Lower_utf8() so h is presumably expected to come from that function
// . the table is built on the first call, under s_stopWordTableMutex
// . returns false if the table could not be built
bool isStopWord ( const char *s , int32_t len , int64_t h ) {
	ScopedLock sl(s_stopWordTableMutex);
	if ( ! s_stopWordsInitialized ) {
		const bool built = initWordTable(&s_stopWordTable, s_stopWords, "stopwords");
		s_stopWordsInitialized = built;
		// no table, so nothing can be reported as a stop word;
		// the next call will try to build it again
		if ( ! built ) return false;
	}
	// lookups below run without the lock
	sl.unlock();

	// . every 1-char letter word is a stop word, whatever its hash
	// . good for initials and some contractions
	const bool isSingleLetter = ( len == 1 && is_alpha_a(*s) );
	if ( isSingleLetter ) return true;

	// otherwise the table decides: a non-zero score means stop word
	return s_stopWordTable.getScore(h);
}


#include "query_stop_words_list.inc"

static HashTableX s_queryStopWordTables[MAXLANGID+1];
static bool       s_queryStopWordsInitialized = false;
static GbMutex    s_queryStopWordsMutex;


// . builds the query-stop-word table for one language
// . if <host dir>/query_stop_words.<lang abbrev>.txt can be opened then the
//   words in that file are used, and only those
// . file format: one word per line; the line is cut at the first '/', blank,
//   tab, CR or LF, and lines that end up empty are skipped
// . otherwise the built-in list for the language is used, or the default
//   list (s_query_stop_words_xx) if the language has none
// . on table setup failure the error is logged and the table is left as is
static void initializeQueryStopWords(lang_t lang) {
	HashTableX &table = s_queryStopWordTables[lang];
	if(!table.set(8,4,50,NULL,0,false,"qrystops") ) {
		log(LOG_INIT, "build: Could not init stop words table.");
		return;
	}
	
	const char *abbrev = getLanguageAbbr(lang);
	char filename[1024];
	// bounded formatting: a long host dir must not overrun the buffer
	int pathLen = snprintf(filename, sizeof(filename), "%s/query_stop_words.%s.txt", g_hostdb.getMyHost()->m_dir, abbrev);
	FILE *fp = NULL;
	if(pathLen >= 0 && (size_t)pathLen < sizeof(filename))
		fp = fopen(filename,"r");
	else
		log(LOG_INIT, "build: query-stop-words path too long for '%s', using built-in list", abbrev);
	if(fp) {
		//use what is in the file, and only that
		log(LOG_DEBUG,"Loading query-stop-words for '%s' from %s", abbrev, filename);
		char line[128];
		while(fgets(line,sizeof(line),fp)) {
			//keep only the leading word: stop at a comment start ('/'),
			//at whitespace, or at the line ending (LF or CRLF)
			size_t word_len = strcspn(line, "/ \t\r\n");
			line[word_len] = '\0';
			if(word_len==0)
				continue;
			
			int64_t  word_hash = hash64Lower_utf8(line,(int)word_len);
			//best effort: a failed add just leaves that word out
			table.addTerm(word_hash,1);
		}
		fclose(fp);
		log(LOG_DEBUG,"Loaded query-stop-words for '%s' from %s", abbrev, filename);
	} else {
		//fall back to built-in default, if any
		//log(LOG_DEBUG,"Using builtin-default query-stop-words for '%s'", abbrev); //too noisy log upon startup
		const char * const *words = NULL;
		for(size_t i=0; i<sizeof(s_query_stop_words_lang)/sizeof(s_query_stop_words_lang[0]); i++) {
			if(s_query_stop_words_lang[i].lang==lang) {
				words = s_query_stop_words_lang[i].words;
				break;
			}
		}
		if(!words)
			words = s_query_stop_words_xx; //use default table
		
		//the built-in lists are NULL-terminated
		for(;
		    *words;
		    words++)
		{
			int word_len = strlen(*words);
			int64_t word_hash = hash64Lower_utf8(*words,word_len);
			table.addTerm(word_hash,1);
		}
	}
}


// . builds the query-stop-word tables for every language from langUnknown up
//   to (but not including) langUnwanted
// . does the work only once; callers are serialized by s_queryStopWordsMutex
static void initializeQueryStopWords() {
	ScopedLock sl(s_queryStopWordsMutex);
	if( ! s_queryStopWordsInitialized ) {
		int lang = langUnknown;
		while(lang < langUnwanted) {
			initializeQueryStopWords((lang_t)lang);
			lang++;
		}
		s_queryStopWordsInitialized = true;
	}
}


// . returns true if h is the hash of a query stop word for language langId
// . a langId outside [0,MAXLANGID] is treated as langUnknown
// . s and len are not used: unlike isStopWord() single letters get no special
//   treatment here (that check was disabled to fix 'j. w. eagan'), so 'a' is
//   a query stop word only when the language's table lists it (mdw 7/16/12)
bool isQueryStopWord ( const char *s , int32_t len , int64_t h , int32_t langId ) {
	// make sure the per-language tables exist
	initializeQueryStopWords();

	// fall back to the langUnknown table for bogus language ids
	const bool badLangId = ( langId < 0 || langId > MAXLANGID );
	const int32_t tableIdx = badLangId ? (int32_t)langUnknown : langId;

	// a non-zero score means the word is in the table
	return s_queryStopWordTables[tableIdx].getScore(h);
}

// . common words, looked up by isCommonWord() below
// . these have the stop words above plus some foreign stop words
// . i shrunk this list a lot
// . see backups for the old list
static const char      *s_commonWords[] = {
	"to",   // score = 1
	"and",  // score = 2
	"of",   // score = 3
	"the",  // score = 4
	"this", // score = 5
	"between",
	"onto",
	"now",
	"during",
	"after",
	"before",
	"since",
	"his",
	"more",
	"all",
	"most",
	"each",
	"other",
	"others",
	"same",
	"throughout",
	"through",
	"part",
	"being",
	"any",
	"many",
	"than",
	"within",
	"without",
	"since",
	"because",
	"whether",
	"both",
	"able",
	"prior",
	"under",
	"beneath",
	"shall",
	"around",
	"while",
	"must",
	"inside",
	"just",
	"until",
	"behind",
	"my",
	"inc",  // incorporated
	"one",
	"two",
	"three",
	"four",
	"1",
	"2",
	"3",
	"4",
	"et",
	"est",
	"against",
	"mr",
	"mrs",
	"miss",
	"out",
	"outside",
	"well",
	"only",
	"some",
	"even",
	"may",
	"still",
	"such",
	"much",
	"ever",
	"every",
	"become",
	"along",
		
	"tion", // broken words
	"ture", // broken words

	"use",
	"used",
	"using",
	"following", // the following
	"home"     ,
	"copyright",
	"tm",          // trademark
	"information",
	"info",
	"number",    // number of
	"welcome",
	"online",
	//"contact",
	"today",
	"said",
	"says",
	"say",
	"told",
	"became",
	"again",
	"later",
	"began",
	"gotta",
	"yet",
	"maybe",
	"someone",
	"something",
	"oh",
	"thanks",
	"co.uk",
	"first",
	"takes",
	"rest",
	"might",
	"never",
	"ever",
	"ok",
	"himself",
	"herself",
	"southern",
	"northern",
	"beyond",
	"saw",
	"truly",
	"turns",
	"tonight",
	"took",
	"came",
	"seeing",
	"expect",
	"arrives",
	"arrive",
	"starts",
	"recently",
	"land",
	"born",
	"ah",
	"attack",
	"kill",
	"states",
	"down",
	"up",
	"shit",
	"fuck",
	"damn",
	"wait",
	"leave",
	"exit",
	"sleep",
	"anymore",
	"presents",
	"shares",
	"wrote",
	"pleasure",
	"mention",
	"gets",
	"get",
	"feels",
	"feeling",
	"across",
	"entirely",
	"really",
	// until we add the rule to allow month/day names only
	// if adjacent to an alpha word with only a space between
	// let's get this out of there
	"jan",
	"feb",
	"mar",
	"apr",
	"may",
	"jun",
	"jul",
	"aug",
	"sep",
	"oct",
	"nov",
	"dec",
	"january",
	"february",
	"march",
	"april",
	"may",
	"june",
	"july",
	"august",
	"september",
	"october",
	"november",
	"december",
	"sun",
	"mon",
	"tue",
	"wed",
	"thu",
	"fri",
	"sat",
	"sunday",
	"monday",
	"tuesday",
	"wednesday",
	"thursday",
	"friday",
	"saturday",
	// unfortunately com is portuguese for with.
	//"org",
	//"com", 
	//"central",
	//"click", chops off pay per click
	//"website",
	//"site",
	//"place",
	//"web",
	"best",
	"does",
	"see",
	"2003",
	"2004",
	"2005",
	"2006",
	"2007",
	"2008",
	"2009",
	"2010",
	"2011",
	"2012",
	"2013",
	"2014",
	"2015",

	"at",
	"be",
	"by",
	"on",
	"or",
	"do",
	"he",
	"if",
	"is",
	"it",
	"it's",
	"don't",
	"doesn't",
	"can't",
	"won't,"
	"shouldn't",
	"wouldn't",
	"couldn't",
	"should've",
	"would've",
	"could've",
	"wasn't",
	"hasn't",
	"hadn't",
	"like", // in too many gigabits
	"know", // in too many gigabits
	"you'd",
	"we'd",
	"i'd",
	"haven't",
	"he'd",
	"she'd",
	"they'd",
	"dont",
	"won't",
	"you're",
	"very",
	"seem",
	"seems",
	"thats",
	"aren't",
	"arent",
	"let's",
	"let",
	"you've",
	"they're",
	"you'll",
	"didn't",
	"i've",
	"we've",
	"they've",
	"we'll",
	"they'll",
	"i'll",
	"he'll",
	"she'll",
	"he's",
	"she's",
	"we're",
	"i'm",
	"though",
	"isn't",
	"in",
	"into",
	"me", 
	"my", 
	"re",
	"so",
	"us",
	"vs",
	"we",
	"are",
	"but",
	"over",
	"can",
	"did",
	"per",
	"for",
	"get",
	"had",
	"has",
	"her",
	"him",
	"its",
	"may",
	//	"not",
	"our",
	"she",
	"you",
	"also",
	"been",
	"from",
	"have",
	"here",
	"here's",
	"there's",
	"that's",
	"hers",
	//"mine",
	"ours",
	"that",
	"them",
	"then",
	"they",
	"were",
	"will",
	"with",
	"your",
	"about",
	"above",
	"ain",   // ain't
	"could",
	"isn",   // isn't
	"their",
	"there",
	"these",
	"those",
	"would",
	"yours",
	"theirs",
	"aren",  // aren't
	"hadn",  // hadn't
	"didn",  // didn't
	"hasn",  // hasn'y
	"ll",    // they'll this'll that'll you'll
	"ve",    // would've should've
	"should",
	"shouldn", // shouldn't

	// . additional english stop words for queries
	// . we don't want to require any of these words
	// . 'second hand smoke and how it affects children' 
	//   should essentially reduce to 5 words instead of 8
	"i",		// subject,
	"it",		// subject
	"what",		// 
	"what's",
	"which",	// 
	"who",		// common word
	"that",		// 
	"is",		// -s
	"are",		// present
	"was",		// 1st
	"be",		// infinitive
	"will",	        // 
	"a",		// 
	"an",		// 
	"or",		// 
	"as",		// 
	"at",		// 
	"by",		// 
	"for",		// 
	"with",		// 
	"about",	// 
	"from",		// 
	"in",		// 
	"on",		// 
	"when",		// 
	"where",	// 
	"why",		// common word
	"how",		// 

	"finally",
	"own",

	// danish stop words (in project/stopwords)
	// cat danish.txt | awk '{print "\t\t\""$1"\",\t\t// "$3}'
	"i",		// in
	"jeg",		// I
	"det",		// that
	"at",		// that
	"en",		// a/an
	"den",		// it
	"til",		// to/at/for/until/against/by/of/into,
	"er",		// present
	"som",		// who,
	"på",		// on/upon/in/on/at/to/after/of/with/for,
	"de",		// they
	"med",		// with/by/in,
	"han",		// he
	"af",		// of/by/from/off/for/in/with/on,
	"for",		// at/for/to/from/by/of/ago,
	"ikke",		// not
	"der",		// who/which,
	"var",		// past
	"mig",		// me/myself
	"sig",		// oneself/himself/herself/itself/themselves
	"men",		// but
	"et",		// a/an/one,
	"har",		// present
	"om",		// round/about/for/in/a,
	"vi",		// we
	"min",		// my
	"havde",	// past
	"ham",		// him
	"hun",		// she
	"nu",		// now
	"over",		// over/above/across/by/beyond/past/on/about,
	"da",		// then,
	"fra",		// from/off/since,
	"du",		// you
	"ud",		// out
	"sin",		// his/her/its/one's
	"dem",		// them
	"os",		// us/ourselves
	"op",		// up
	"man",		// you/one
	"hans",		// his
	"hvor",		// where
	"eller",	// or
	"hvad",		// what
	"skal",		// must/shall
	"selv",		// myself/youself/herself/ourselves
	"her",		// here
	"alle",		// all/everyone/everybody
	"vil",		// will
	"blev",		// past
	"kunne",	// could
	"ind",		// in
	"når",	// when
	"være",	// present
	"dog",	// however/yet/after
	"noget",	// something
	"ville",	// would
	"jo",		// you
	"deres",	// their/theirs
	"efter",	// after/behind/according
	"ned",	// down
	"skulle",	// should
	"denne",	// this
	"end",	// than
	"dette",	// this
	"mit",	// my/mine
	"også",		// also
	"ogsa",		// also
	"under",	// under/beneath/below/during,
	"have",		// have
	"dig",	// you
	"anden",	// other
	"hende",	// her
	"mine",		// my
	"alt",	// everything
	"meget",	// much/very,
	"sit",	// his,
	"sine",	// his,
	"vor",		// our
	"mod",	// against
	"disse",	// these
	"hvis",	// if
	"din",		// your/yours
	"nogle",	// some
	"hos",		// by/at
	"blive",	// be/become
	"mange",	// many
	"ad",		// by/through
	"bliver",	// present
	"hendes",	// her/hers
	"været",	// be
	"vaeret",	// be
	"thi",		// for
	"jer",		// you
	"sådan",	// such,

	// dutch stop words
	"de",		// the
	"en",		// and
	"van",		// of,
	"ik",		// I,
	"te",		// (1)
	"dat",		// that,
	"die",		// that,
	"in",		// in,
	"een",		// a,
	"hij",		// he
	"het",		// the,
	"niet",		// not,
	"zijn",		// (1)
	"is",		// is
	"was",		// (1)
	"op",		// on,
	"aan",		// on,
	"met",		// with,
	"als",		// like,
	"voor",		// (1)
	"had",		// had,
	"er",		// there
	"maar",		// but,
	"om",		// round,
	"hem",		// him
	"dan",		// then
	"zou",		// should/would,
	"wat",		// what,
	"mijn",		// possessive
	"men",		// people,
	"dit",		// this
	"zo",		// so,
	"door",		// through
	"over",		// over,
	"ze",		// she,
	"zich",		// oneself
	"bij",		// (1)
	"ook",		// also,
	"tot",		// till,
	"je",		// you
	"mij",		// me
	"uit",		// out
	"der",		// Old
	"daar",		// (1)
	"haar",		// (1)
	"naar",		// (1)
	"heb",		// present
	"hoe",		// how,
	"heeft",	// present
	"hebben",	// 'to
	"deze",		// this
	"u",		// you
	"want",		// (1)
	"nog",		// yet,
	"zal",		// 'shall',
	"me",		// me
	"zij",		// she,
	"nu",		// now
	"ge",		// 'thou',
	"geen",		// none
	"omdat",	// because
	"iets",		// something,
	"worden",	// to
	"toch",		// yet,
	"al",		// all,
	"waren",	// (1)
	"veel",		// much,
	"meer",		// (1)
	"doen",		// to
	"toen",		// then,
	"moet",		// noun
	"ben",		// (1)
	"zonder",	// without
	"kan",		// noun
	"hun",		// their,
	"dus",		// so,
	"alles",	// all,
	"onder",	// under,
	"ja",		// yes,
	"eens",		// once,
	"hier",		// here
	"wie",		// who
	"werd",		// imperfect
	"altijd",	// always
	"doch",		// yet,
	"wordt",	// present
	"wezen",	// (1)
	"kunnen",	// to
	"ons",		// us/our
	"zelf",		// self
	"tegen",	// against,
	"na",		// after,
	"reeds",	// already
	"wil",		// (1)
	"kon",		// could;
	"niets",	// nothing
	"uw",		// your
	"iemand",	// somebody
	"geweest",	// been;
	"andere",	// other


	// french stop words
	"au",		// a
	"aux",		// a
	"avec",		// with
	"ce",		// this
	"ces",		// these
	"dans",		// with
	"de",		// of
	"des",		// de
	"du",		// de
	"elle",		// she
	"en",		// `of
	"et",		// and
	"eux",		// them
	"il",		// he
	"je",		// I
	"la",		// the
	"le",		// the
	"leur",		// their
	"lui",		// him
	"ma",		// my
	"mais",		// but
	"me",		// me
	"même",		// same;
	"mes",		// me
	"moi",		// me
	"mon",		// my
	"ne",		// not
	"nos",		// our
	"notre",	// our
	"nous",		// we
	"on",		// one
	"ou",		// where
	"par",		// by
	"pas",		// not
	"pour",		// for
	"qu",		// que
	"que",		// that
	"qui",		// who
	"sa",		// his,
	"se",		// oneself
	"ses",		// his
	"son",		// his,
	"sur",		// on
	"ta",		// thy
	"te",		// thee
	"tes",		// thy
	"toi",		// thee
	"ton",		// thy
	"tu",		// thou
	"un",		// a
	"une",		// a
	"vos",		// your
	"votre",	// your
	"vous",		// you

	// german stop words
	"aber",		// but
	"alle",		// all
	"allem",	// 
	"allen",	// 
	"aller",	// 
	"alles",	// 
	"als",		// than,
	"also",		// so
	"am",		// an
	"an",		// at
	"ander",	// other
	"andere",	// 
	"anderem",	// 
	"anderen",	// 
	"anderer",	// 
	"anderes",	// 
	"anderm",	// 
	"andern",	// 
	"anderr",	// 
	"anders",	// 
	"auch",		// also
	"auf",		// on
	"aus",		// out
	"bei",		// by
	"bin",		// am
	"bis",		// until
	"bist",		// art
	"da",		// there
	"damit",	// with
	"dann",		// then
	"der",		// the
	"den",		// 
	"des",		// 
	"dem",		// 
	"die",		// 
	"das",		// 
	"daъ",		// that
	"derselbe",	// the
	"derselben",	// 
	"denselben",	// 
	"desselben",	// 
	"demselben",	// 
	"dieselbe",	// 
	"dieselben",	// 
	"dasselbe",	// 
	"dazu",		// to
	"dein",		// thy
	"deine",	// 
	"deinem",	// 
	"deinen",	// 
	"deiner",	// 
	"deines",	// 
	"denn",		// because
	"derer",	// of
	"dessen",	// of
	"dich",		// thee
	"dir",		// to
	"du",		// thou
	"dies",		// this
	"diese",	// 
	"diesem",	// 
	"diesen",	// 
	"dieser",	// 
	"dieses",	// 
	"doch",		// (several
	"dort",		// (over)
	"durch",	// through
	"ein",		// a
	"eine",		// 
	"einem",	// 
	"einen",	// 
	"einer",	// 
	"eines",	// 
	"einig",	// some
	"einige",	// 
	"einigem",	// 
	"einigen",	// 
	"einiger",	// 
	"einiges",	// 
	"einmal",	// once
	"er",		// he
	"ihn",		// him
	"ihm",		// to
	"es",		// it
	"etwas",	// something
	"euer",		// your
	"eure",		// 
	"eurem",	// 
	"euren",	// 
	"eurer",	// 
	"eures",	// 
	"für",		// for
	"gegen",	// towards
	"gewesen",	// p.p.
	"hab",		// have
	"habe",		// have
	"haben",	// have
	"hat",		// has
	"hatte",	// had
	"hatten",	// had
	"hier",		// here
	"hin",		// there
	"hinter",	// behind
	"ich",		// I
	"mich",		// me
	"mir",		// to
	"ihr",		// you,
	"ihre",		// 
	"ihrem",	// 
	"ihren",	// 
	"ihrer",	// 
	"ihres",	// 
	"euch",		// to
	"im",		// in
	"in",		// in
	"indem",	// while
	"ins",		// in
	"ist",		// is
	"jede",		// each,
	"jedem",	// 
	"jeden",	// 
	"jeder",	// 
	"jedes",	// 
	"jene",		// that
	"jenem",	// 
	"jenen",	// 
	"jener",	// 
	"jenes",	// 
	"jetzt",	// now
	"kann",		// can
	"kein",		// no
	"keine",	// 
	"keinem",	// 
	"keinen",	// 
	"keiner",	// 
	"keines",	// 
	"können",	// can
	"könnte",	// could
	"machen",	// do
	"man",		// one
	"manche",	// some,
	"manchem",	// 
	"manchen",	// 
	"mancher",	// 
	"manches",	// 
	"mein",		// my
	"meine",	// 
	"meinem",	// 
	"meinen",	// 
	"meiner",	// 
	"meines",	// 
	"mit",		// with
	"muss",		// must
	"musste",	// had
	"nach",		// to(wards)
	"nicht",	// not
	"nichts",	// nothing
	"noch",		// still,
	"nun",		// now
	"nur",		// only
	"ob",		// whether
	"oder",		// or
	"ohne",		// without
	"sehr",		// very
	"sein",		// his
	"seine",	// 
	"seinem",	// 
	"seinen",	// 
	"seiner",	// 
	"seines",	// 
	"selbst",	// self
	"sich",		// herself
	"sie",		// they,
	"ihnen",	// to
	"sind",		// are
	"so",		// so
	"solche",	// such
	"solchem",	// 
	"solchen",	// 
	"solcher",	// 
	"solches",	// 
	"soll",		// shall
	"sollte",	// should
	"sondern",	// but
	"sonst",	// else
	"über",		// over
	"um",		// about,
	"und",		// and
	"uns",		// us
	"unse",		// 
	"unsem",	// 
	"unsen",	// 
	"unser",	// 
	"unses",	// 
	"unter",	// under
	"viel",		// much
	"vom",		// von
	"von",		// from
	"vor",		// before
	"während",	// while
//		"war",		// was
	"waren",	// were
	"warst",	// wast
	"was",		// what
	"weg",		// away,
	"weil",		// because
	"weiter",	// further
	"welche",	// which
	"welchem",	// 
	"welchen",	// 
	"welcher",	// 
	"welches",	// 
	"wenn",		// when
	"werde",	// will
	"werden",	// will
	"wie",		// how
	"wieder",	// again
	"will",		// want
	"wir",		// we
	"wird",		// will
	"wirst",	// willst
	"wo",		// where
	"wollen",	// want
	"wollte",	// wanted
	"würde",	// would
	"würden",	// would
	"zu",		// to
	"zum",		// zu
	"zur",		// zu
	"zwar",		// indeed
	"zwischen",	// between
		
	// italian stop words
	"ad",		// a
	"al",		// a
	"allo",		// a
	"ai",		// a
	"agli",		// a
	"all",		// a
	"agl",		// a
	"alla",		// a
	"alle",		// a
	"con",		// with
	"col",		// con
	"coi",		// con
	"da",		// from
	"dal",		// da
	"dallo",	// da
	"dai",		// da
	"dagli",	// da
	"dall",		// da
	"dagl",		// da
	"dalla",	// da
	"dalle",	// da
	"di",		// of
	"del",		// di
	"dello",	// di
	"dei",		// di
	"degli",	// di
	//"dell",		// di
	"degl",		// di
	"della",	// di
	"delle",	// di
	"in",		// in
	"nel",		// in
	"nello",	// in
	"nei",		// in
	"negli",	// in
	"nell",		// in
	"negl",		// in
	"nella",	// in
	"nelle",	// in
	"su",		// on
	"sul",		// su
	"sullo",	// su
	"sui",		// su
	"sugli",	// su
	"sull",		// su
	"sugl",		// su
	"sulla",	// su
	"sulle",	// su
	"per",		// through,
	"tra",		// among
	"contro",	// against
	"io",		// I
	"tu",		// thou
	"lui",		// he
	"lei",		// she
	"noi",		// we
	"voi",		// you
	"loro",		// they
	"mio",		// my
	"mia",		// 
	"miei",		// 
	"mie",		// 
	"tuo",		// 
	"tua",		// 
	"tuoi",		// thy
	"tue",		// 
	"suo",		// 
	"sua",		// 
	"suoi",		// his,
	"sue",		// 
	"nostro",	// our
	"nostra",	// 
	"nostri",	// 
	"nostre",	// 
	"vostro",	// your
	"vostra",	// 
	"vostri",	// 
	"vostre",	// 
	"mi",		// me
	"ti",		// thee
	"ci",		// us,
	"vi",		// you,
	"lo",		// him,
	"la",		// her,
	"li",		// them
	"le",		// them,
	"gli",		// to
	"ne",		// from
	"il",		// the
	"un",		// a
	"uno",		// a
	"una",		// a
	"ma",		// but
	"ed",		// and
	"se",		// if
	"perché",	// why,
	"anche",	// also
//		"come",		// how
	"dov",		// where
	"dove",		// where
	"che",		// who,
	"chi",		// who
	"cui",		// whom
	"non",		// not
	"più",		// more
	"quale",	// who,
	"quanto",	// how
	"quanti",	// 
	"quanta",	// 
	"quante",	// 
	"quello",	// that
	"quelli",	// 
	"quella",	// 
	"quelle",	// 
	"questo",	// this
	"questi",	// 
	"questa",	// 
	"queste",	// 
	"si",		// yes
	"tutto",	// all
	"tutti",	// all
	"a",		// at
	"c",		// as
	"e",		// and
	"i",		// the
	"l",		// as
	"o",		// or
		
	// norwegian stop words
	"og",		// and
	"i",		// in
	"jeg",		// I
	"det",		// it/this/that
	"at",		// to
	"en",		// a
	"den",		// it/this/that
	"til",		// to
	"er",		// is
	"som",		// who/that
	"på",		// on
	"de",		// they
	"med",		// with
	"han",		// he
	"av",		// of
	"ikke",		// not
	"inte",		// not
	"der",		// there
	"så",		// so
	"var",		// was
	"meg",		// me
	"seg",		// you
	"men",		// but
	"ett",		// a
	"har",		// have
	"om",		// about
	"vi",		// we
	"min",		// my
	"mitt",		// my
	"ha",		// have
	"hade",		// had
	"hu",		// she
	"hun",		// she
	"nå",		// now
	"over",		// over
	"da",		// when/as
	"ved",		// by/know
	"fra",		// from
	"du",		// you
	"ut",		// out
	"sin",		// your
	"dem",		// them
	"oss",		// us
	"opp",		// up
	"man",		// you/one
	"kan",		// can
	"hans",		// his
	"hvor",		// where
	"eller",	// or
	"hva",		// what
	"skal",		// shall/must
	"selv",		// self
	"sjøl",		// self
	"her",		// here
	"alle",		// all
	"vil",		// will
	"bli",		// become
	"ble",		// became
	"blei",		// became
	"blitt",	// have
	"kunne",	// could
	"inn",		// in
	"når",		// when
	"være",		// be
	"kom",		// come
	"noen",		// some
	"noe",		// some
	"ville",	// would
	"dere",		// you
	"de",		// you
	"som",		// who/which/that
	"deres",	// their/theirs
	"kun",		// only/just
	"ja",		// yes
	"etter",	// after
	"ned",		// down
	"skulle",	// should
	"denne",	// this
	"for",		// for/because
	"deg",		// you
	"si",		// hers/his
	"sine",		// hers/his
	"sitt",		// hers/his
	"mot",		// against
	"å",		// to
	"meget",	// much
	"hvorfor",	// why
	"sia",		// since
	"sidan",	// since
	"dette",	// this
	"desse",	// these/those
	"disse",	// these/those
	"uden",		// uten
	"hvordan",	// how
	"ingen",	// noone
	"inga",		// noone
	"din",		// your
	"ditt",		// your
	"blir",		// become
	"samme",	// same
	"hvilken",	// which
	"hvilke",	// which
	"sånn",		// such
	"inni",		// inside/within
	"mellom",	// between
	"vår",		// our
	"hver",		// each
	"hvem",		// who
	"vors",		// us/ours
	"dere",		// their
	"deres",	// theirs
	"hvis",		// whose
	"både",		// both
	"båe",		// both
	"begge",	// both
	"siden",	// since
	"dykk",		// your
	"dykkar",	// yours
	"dei",		// they
	"deira",	// them
	"deires",	// theirs
	"deim",		// them
	"di",		// your
	"då",		// as/when
	"eg",		// I
	"ein",		// a/an
	"ei",		// a/an
	"eit",		// a/an
	"eitt",		// a/an
	"elles",	// or
	"honom",	// he
	"hjå",		// at
	"ho",		// she
	"hoe",		// she
	"henne",	// her
	"hennar",	// her/hers
	"hennes",	// hers
	"hoss",		// how
	"hossen",	// how
	"ikkje",	// not
	"ingi",		// noone
	"inkje",	// noone
	"korleis",	// how
	"korso",	// how
	"kva",		// what/which
	"kvar",		// where
	"kvarhelst",	// where
	"kven",		// who/whom
	"kvi",		// why
	"kvifor",	// why
	"me",		// we
	"medan",	// while
	"mi",		// my
	"mine",		// my
	"mykje",	// much
	"no",		// now
	"nokon",	// some
	"noka",		// some
	"nokor",	// some
	"noko",		// some
	"nokre",	// some
	"si",		// his/hers
	"sia",		// since
	"sidan",	// since
	"so",		// so
	"somt",		// some
	"somme",	// some
	"um",		// about*
	"upp",		// up
	"vere",		// be
	"er",		// am
	"var",		// was
	"vore",		// was
	"verte",	// become
	"vort",		// become
	"varte",	// became
	"vart",		// became
	"er",		// am
	"være",		// to
	"var",		// was
	"å",		// on


	// portuguese stop words
	"de",		// of,
	"a",		// the;
	"o",		// the;
	"que",		// who,
	"e",		// and
	"do",		// de
	"da",		// de
	"em",		// in
	"um",		// a
	"para",		// for
	//"com",		// with
	"não",		// not,
	"uma",		// a
	"os",		// the;
	"no",		// em
	"se",		// himself
	"na",		// em
	"por",		// for
	"mais",		// more
	"as",		// the;
	"dos",		// de
	"como",		// how,as
	"mas",		// but
	"ao",		// a
	"ele",		// he
	"das",		// de
	//"à",		// a
	"seu",		// his
	"sua",		// her
	"ou",		// or
	"quando",	// when
	"muito",	// much
	"nos",		// em
	"já",		// already,
	"eu",		// I
	"também",	// also
	"só",		// only,
	"pelo",		// per
	"pela",		// per
	"até",		// up
	"isso",		// that
	"ela",		// he
	"entre",	// between
	"depois",	// after
	"sem",		// without
	"mesmo",	// same
	"aos",		// a
	"seus",		// his
	"quem",		// whom
	"nas",		// em
	"me",		// me
	"esse",		// that
	"eles",		// they
	"você",		// you
	"essa",		// that
	"num",		// em
	"nem",		// nor
	"suas",		// her
	"meu",		// my
	"às",		// a
	"minha",	// my
	"numa",		// em
	"pelos",	// per
	"elas",		// they
	"qual",		// which
	"nós",		// we
	"lhe",		// to
	"deles",	// of them
	"essas",	// those
	"esses",	// those
	"pelas",	// per
	"este",		// this
	"dele",		// of
	"tu",		// thou
	"te",		// thee
	"vocês",	// you
	"vos",		// you
	"lhes",		// to
	"meus",		// my
	"minhas",	// 
	"teu",		// thy
	"tua",		// 
	"teus",		// 
	"tuas",		// 
	"nosso",	// our
	"nossa",	// 
	"nossos",	// 
	"nossas",	// 
	"dela",		// of
	"delas",	// of
	"esta",		// this
	"estes",	// these
	"estas",	// these
	"aquele",	// that
	"aquela",	// that
	"aqueles",	// those
	"aquelas",	// those
	"isto",		// this
	"aquilo",	// that
	"estou",	// 
	"está",		//
	"estamos",	// 
	"estão",	//
	"estive",	// 
	"esteve",	// 
	"estivemos",	// 
	"estiveram",	// 
	"estava",	// 
	"estАvamos",	// 
	"estavam",	// 
	"estivera",	// 
	"estivéramos",	//
	"esteja",	// 
	"estejamos",	// 
	"estejam",	// 
	"estivesse",	// 
	"estivéssemos",	//
	"estivessem",	// 
	"estiver",	// 
	"estivermos",	// 
	"estiverem",	// 

	// russian stop words
	"и",		// and
	"в",		// in/into
	"во",		// alternative
	"не",		// not
	"что",		// what/that
	"он",		// he
	"на",		// on/onto
	"я",		// i
	"с",		// from
	"со",		// alternative
	"как",		// how
	"а",		// milder
	"то",		// conjunction
	"все",		// all
	"она",		// she
	"так",		// so,
	"его",		// him
	"но",		// but
	"да",		// yes/and
	"ты",		// thou
	"к",		// towards,
	"у",		// around,
	"же",		// intensifier
	"вы",		// you
	"за",		// beyond,
	"бы",		// conditional/subj.
	"по",		// up
	"только",	// only
	"ее",		// her
	"мне",		// to
	"было",		// it
	"вот",		// here
	"от",		// away
	"меня",		// me
	"еще",		// still,
	"нет",		// no,
	"о",		// about
	"из",		// out
	"ему",		// to
	"теперь",	// now
	"когда",	// when
	"даже",		// even
	"ну",		// so,
	"вдруг",	// suddenly
	"ли",		// interrogative
	"если",		// if
	"уже",		// already,
	"или",		// or
	"ни",		// neither
	"быть",		// to
	"был",		// he
	"него",		// prepositional
	"до",		// up
	"вас",		// you
	"нибудь",	// indef.
	"опять",	// again
	"уж",		// already,
	"вам",		// to
	"сказал",	// he
	"ведь",		// particle
	"там",		// there
	"потом",	// then
	"себя",		// oneself
	"ничего",	// nothing
	"ей",		// to
	"может",	// usually
	"они",		// they
	"тут",		// here
	"где",		// where
	"есть",		// there
	"надо",		// got
	"ней",		// prepositional
	"для",		// for
	"мы",		// we
	"тебя",		// thee
	"их",		// them,
	"чем",		// than
	"была",		// she
	"сам",		// self
	"чтоб",		// in
	"без",		// without
	"будто",	// as
	"человек",	// man,
	"чего",		// genitive
	"раз",		// once
	"тоже",		// also
	"себе",		// to
	"под",		// beneath
	"жизнь",	// life
	"будет",	// will
	"ж",		// int16_t
	"тогда",	// then
	"кто",		// who
	"этот",		// this
	"говорил",	// was
	"того",		// genitive
	"потому",	// for
	"этого",	// genitive
	"какой",	// which
	"совсем",	// altogether
	"ним",		// prepositional
	"здесь",	// here
	"этом",		// prepositional
	"один",		// one
	"почти",	// almost
	"мой",		// my
	"тем",		// instrumental/dative
	"чтобы",	// full
	"нее",		// her
	"кажется",	// it
	"сейчас",	// now
	"были",		// they
	"куда",		// where
	"зачем",	// why
	"сказать",	// to
	"всех",		// all
	"никогда",	// never
	"сегодня",	// today
	"можно",	// possible,
	"при",		// by
	"наконец",	// finally
	"два",		// two
	"об",		// alternative
	"другой",	// another
	"хоть",		// even
	"после",	// after
	"над",		// above
	"больше",	// more
	"тот",		// that
	"через",	// across,
	"эти",		// these
	"нас",		// us
	"про",		// about
	"всего",	// in
	"них",		// prepositional
	"какая",	// which,
	"много",	// lots
	"разве",	// interrogative
	"сказала",	// she
	"три",		// three
	"эту",		// this,
	"моя",		// my,
	"впрочем",	// moreover,
	"хорошо",	// good
	"свою",		// ones
	"этой",		// oblique
	"перед",	// in
	"иногда",	// sometimes
	"лучше",	// better
	"чуть",		// a
	"том",		// preposn.
	"нельзя",	// one
	"такой",	// such
	"им",		// to
	"более",	// more
	"всегда",	// always
	"конечно",	// of
	"всю",		// acc.
	"между",	// between

	// spanish stop words
	"de",		// from,
	"la",		// the,
	"que",		// who,
	"el",		// the
	"en",		// in
	"y",		// and
	"a",		// to
	//"los",		// the,
	"del",		// de
	"se",		// himself,
	"las",		// the,
	"por",		// for,
	"un",		// a
	"para",		// for
	"con",		// with
	"no",		// no
	"una",		// a
	"su",		// his,
	"al",		// a
	"lo",		// him
	"como",		// how
	"más",		// more
	"pero",		// pero
	"sus",		// su
	"le",		// to
	"ya",		// already
	"o",		// or
	"este",		// this
	"sí",		// himself
	"porque",	// because
	"esta",		// this
	"entre",	// between
	"cuando",	// when
	"muy",		// very
	"sin",		// without
	"sobre",	// on
	"también",	// also
	"me",		// me
	"hasta",	// until
	"hay",		// there
	"donde",	// where
	"quien",	// whom,
	"desde",	// from
	"todo",		// all
	"nos",		// us
	"durante",	// during
	"todos",	// all
	"uno",		// a
	"les",		// to
	"ni",		// nor
	"contra",	// against
	"otros",	// other
	"ese",		// that
	"eso",		// that
	"ante",		// before
	"ellos",	// they
	"e",		// and
	"esto",		// this
	"mМ",		// me
	"antes",	// before
	"algunos",	// some
	"qué",		// what?
	"unos",		// a
	"yo",		// I
	"otro",		// other
	"otras",	// other
	"otra",		// other
	"él",		// he
	"tanto",	// so
	"esa",		// that
	"estos",	// these
	"mucho",	// much,
	"quienes",	// who
	"nada",		// nothing
	"muchos",	// many
	"cual",		// who
	"poco",		// few
	"ella",		// she
	"estar",	// to
	"estas",	// these
	"algunas",	// some
	"algo",		// something
	"nosotros",	// we
	"mi",		// me
	"mis",		// mi
	"tú",		// thou
	"te",		// thee
	"ti",		// thee
	"tu",		// thy
	"tus",		// tu
	"ellas",	// they
	"nosotras",	// we
	"vosostros",	// you
	"vosostras",	// you
	"os",		// you
	"mío",		// mine
	"mía",		//
	"míos",		//
	"mías",		//
	"tuyo",		// thine
	"tuya",		// 
	"tuyos",	// 
	"tuyas",	// 
	"suyo",		// his,
	"suya",		// 
	"suyos",	// 
	"suyas",	// 
	"nuestro",	// ours
	"nuestra",	// 
	"nuestros",	// 
	"nuestras",	// 
	"vuestro",	// yours
	"vuestra",	// 
	"vuestros",	// 
	"vuestras",	// 
	"esos",		// those
	"esas",		// those
	"estoy",	// 
	"estás",	//
	"está",		//
	"estamos",	// 
	"estáis",	//
	"están",	//
	"esté",		//
	"estés",	//
	"estemos",	// 
	"estéis",	//
	"estén",	//
	"estaré",	//
	"estarás",	//
	"estará",	//
	"estaremos",	// 
	"estaréis",	//
	"estarán",	//
	"estaría",	//
	"estarías",	//
	"estaríamos",	//
	"estaríais",	//
	"estarían",	//
	"estaba",	// 
	"estabas",	// 
	"estábamos",	//
	"estabais",	// 
	"estaban",	// 
	"estuve",	// 
	"estuviste",	// 
	"estuvo",	// 
	"estuvimos",	// 
	"estuvisteis",	// 
	"estuvieron",	// 
	"estuviera",	// 
	"estuvieras",	// 
	"estuviéramos",	//
	"estuvierais",	// 
	"estuvieran",	// 
	"estuviese",	// 
	"estuvieses",	// 
	"estuviésemos",	//
	"estuvieseis",	// 
	"estuviesen",	// 
	"estando",	// 
	"estado",	// 
	"estada",	// 
	"estados",	// 
	"estadas",	// 
	"estad",	// 

	// swedish stop words
	"och",		// and
	"det",		// it,
	"att",		// to
	"i",		// in,
	"en",		// a
	"jag",		// I
	"hon",		// she
	"som",		// who,
	"han",		// he
	"på",		// on
	"den",		// it,
	"med",		// with
	"var",		// where,
	"sig",		// him(self)
	//"för",		// for
	"så",		// so
	"till",		// to
	"är",		// is
	"men",		// but
	"ett",		// a
	"om",		// if;
	"hade",		// had
	"de",		// they,
	"av",		// of
	"icke",		// not,
	"mig",		// me
	"du",		// you
	"henne",	// her
	"då",		// then,
	"sin",		// his
	"nu",		// now
	"har",		// have
	"inte",		// inte
	"hans",		// his
	"honom",	// him
	"skulle",	// 'sake'
	"hennes",	// her
	"där",		// there
	"min",		// my
	"man",		// one
	"ej",		// nor
	"vid",		// at,
	"kunde",	// could
	"något",	// some
	"från",		// from,
	"ut",		// out
	"när",		// when
	"efter",	// after,
	"upp",		// up
	"vi",		// we
	"dem",		// them
	"vara",		// be
	"vad",		// what
	"över",		// over
	"än",		// than
	"dig",		// you
	"kan",		// can
	"sina",		// his
	"här",		// here
	"ha",		// have
	"mot",		// towards
	"alla",		// all
	"under",	// under
	"någon",	// some
	"eller",	// or
	"allt",		// all
	"mycket",	// much
	"sedan",	// since
	"ju",		// why
	"denna",	// this/that
	"själv",	// myself,
	"detta",	// this/that
	"åt",		// to
	"utan",		// without
	"varit",	// was
	"hur",		// how
	"ingen",	// no
	"mitt",		// my
	"ni",		// you
	"bli",		// to
	"blev",		// from
	"oss",		// us
	"din",		// thy
	"dessa",	// these/those
	"några",	// some
	"deras",	// their
	"blir",		// from
	"mina",		// my
	"samma",	// (the)
	"vilken",	// who,
	"er",		// you,
	"sådan",	// such
	"vår",		// our
	"blivit",	// from
	"dess",		// its
	"inom",		// within
	"mellan",	// between
	"sådant",	// such
	//"varför",	// why
	"varje",	// each
	"vilka",	// who,
	"ditt",		// thy
	"vem",		// who
	"vilket",	// who,
	"sitta",	// his
	"sådana",	// such
	"vart",		// each
	"dina",		// thy
	"vars",		// whose
	"vårt",		// our
	"våra",		// our
	"ert",		// your
	"era",		// your
	"vilkas",	// whose

	// internet stop words
	"www",
	//"com",

	// additional stop words
	//"san"           // like san francisco
};
// hash table behind isCommonWord(): maps the hash of each s_commonWords entry
// to the score that was stored for it when the table was built
static HashTableX s_commonWordTable;
// set to true by isCommonWord() once every s_commonWords entry has been added
static bool       s_commonWordsInitialized = false;
// held by isCommonWord() while it checks the flag above and builds the table
static GbMutex s_commonWordtableMutex;

// . called by Process.cpp::resetAll() on exit so all table memory is freed
// . resets the stop word table, every per-language query stop word table
//   (indexes 0 through MAXLANGID inclusive) and the common word table
void resetStopWordTables() {
	s_stopWordTable.reset();

	int langId = 0;
	while ( langId <= MAXLANGID ) {
		s_queryStopWordTables[langId].reset();
		langId++;
	}

	s_commonWordTable.reset();
}

// . used by Msg24.cpp for gigabits generation
// . h is the 64-bit hash of the word to look up; the table keys are produced
//   with hash64Lower_utf8(), so h must be hashed the same way to match
// . returns the score stored in s_commonWordTable for h (whatever getScore()
//   yields for a hash that is not in the table otherwise)
// . returns 0 if the table could not be allocated or filled; the initialized
//   flag stays false in that case so the next call tries again
// . the table is built on the first call, under s_commonWordtableMutex; the
//   lookup itself happens after the lock has been released
int32_t isCommonWord ( int64_t h ) {

	ScopedLock sl(s_commonWordtableMutex);
	// include a bunch of foreign prepositions so they don't get required
	// by the bitScores in IndexTable.cpp
	if ( ! s_commonWordsInitialized ) {
		// . number of entries in s_commonWords
		// . the array has no NULL terminator so it must be derived from sizeof
		const int32_t n = (int32_t)( sizeof(s_commonWords) / sizeof(s_commonWords[0]) );
		// . set up the hash table with two slots per word
		// . this used to pass sizeof(s_commonWords)*2, which is the size of
		//   the array in BYTES and reserved sizeof(char *) times too many slots
		if ( ! s_commonWordTable.set (8,4,n*2, NULL,0,false,"commonwrds") ) {
			log(LOG_INIT, "query: Could not init common words table.");
			return 0;
		}
		// now add in all the common words, score is the 1-based list position
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char *sw    = s_commonWords[i];
			int32_t  swlen = strlen ( sw );
			// use the same algo that Words.cpp computeWordIds does
			int64_t swh = hash64Lower_utf8 ( sw , swlen );
			if ( ! s_commonWordTable.addTerm(swh,i+1 ) )
				return 0;
			// note: only the utf8 form is added, accent-stripped variants
			// are not
		}
		s_commonWordsInitialized = true;
	}
	sl.unlock();

	// get from table
	return s_commonWordTable.getScore(h);
}
-												Added UTF-8 BOM to avoid crash in hash64Lower_utf8 when hashing stopwords with Danish letters

											
										
										
											2016-05-17 15:17:26 +02:00
+								// Matt Wells, copyright Jul 2001
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 								#include "StopWords.h"
-												#include cleanup of StopWords.h

											
										
										
											2016-08-11 16:43:25 +02:00
+								#include "gb-include.h"
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								#include "HashTableX.h"
 								#include "Speller.h"
 								#include "Loop.h"
-												#include cleanup of Hostdb.*

											
										
										
											2016-11-13 16:33:43 +01:00
+								#include "Lang.h"
-												query stop words now based on selected langid.

											
										
										
											2015-03-08 15:16:24 -07:00
+								#include "Posdb.h" // MAXLANGID
-												Make isCommonWord() thread-safe.

											
										
										
											2016-08-19 14:21:07 +02:00
+								#include "GbMutex.h"
 								#include "ScopedLock.h"
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 								// . h is the lower ascii 64bit hash of a word
 								// . this returns true if h is the hash of an ENGLISH stop word
 								// . list taken from www.superjournal.ac.uk/sj/application/demo/stopword.htm
 								// . stop words with "mdw" next to them are ones I added
 								// . i shrunk this list a lot
 								// . see backups for the hold list
-												constness on local static

											
										
										
											2016-08-19 14:18:06 +02:00
+								static const char * const s_stopWords[] = {
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"a",
 									"b",
 									"c",
-												Fix coverity warning on missing comma in a string array initialization

											
										
										
											2016-09-22 11:06:59 +02:00
+									"d",
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"e",
 									"f",
 									"g",
 									"h",
 									"i",
 									"j",
 									"k",
 									"l",
 									"m",
 									"n",
 									"o",
 									"p",
 									"q",
 									"r",
 									"s",
 									"t",
 									"u",
 									"v",
 									"w",
 									"x",
 									"y",
 									"z",
 									"0",
 									"1",
 									"2",
 									"3",
 									"4",
 									"5",
 									"6",
 									"7",
 									"8",
 									"9",
 									"an",
 									"as",
 									"at",
 									"be",
 									"by",
 									"of",
 									"on",
 									"or",
 									"do",
 									"he",
 									"if",
 									"is",
 									"it",
 									"in",
 									"me",
 									"my",
 									"re",
 									"so",
 									"to",
 									"us",
 									"vs",
 									"we",
 									"the",
 									"and",
 									"are",
 									"can",
 									"did",
 									"per",
 									"for",
 								//		"get",
 									"had",
 									"has",
 									"her",
 									"him",
 									"its",
 								//              "may",	  // like the month
 									// wikipedia has this in lower case in the title so we need
 									// not to be a stopword
 									"not", // fix 'to be or not to be'... no, revert
 									"our",
 									"she",
 									"you",
 									"also",
 									"been",
 									"from",
 									"have",
 									"here",
 									"hers",
 								//	        "mine",   // land mine
 									"ours",
 									"that",
 									"them",
 									"then",
 									"they",
 									"this",
 									"were",
 									"will",
 									"with",
 									"your",
 									"about",
 									"above",
 									//"ain",   // ain't
 									"could",
 									//"isn",   // isn't
 									"their",
 									"there",
 									"these",
 									"those",
 									"through", // fix ceder.net "Mainstream thru A1 Dance" event title
 									"thru",    // fix ceder.net "Mainstream thru A1 Dance" event title
 									"until", // fix event title for blackbirdbuvette.com
 									"under", // fix title for http://www.harwoodmuseum.org/press_detail.php?ID=44
 									"would",
 									"yours",
-												fixed langid based query stop words.

											
										
										
											2015-03-08 15:44:23 -07:00
+									"theirs",
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									//"aren",  // aren't
 									//"hadn",  // hadn't
 									//"didn",  // didn't
 									//"hasn",  // hasn'y
 									//"ll",    // they'll this'll that'll you'll
 									//"ve",    // would've should've
 									//"should",
 									//"shouldn", // shouldn't
-												fixed langid based query stop words.

											
										
										
											2015-03-08 15:44:23 -07:00
+									NULL
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								};
 								static HashTableX s_stopWordTable;
 								static bool       s_stopWordsInitialized = false;
-												Sync lock changes from nomerge2 to master

											
										
										
											2017-03-24 14:01:04 +01:00
+								static GbMutex    s_stopWordTableMutex;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
-												Make local functions static

											
										
										
											2016-11-19 15:20:00 +01:00
+								static bool initWordTable(HashTableX *table, const char * const words[], const char *label) {
-												fixed langid based query stop words.

											
										
										
											2015-03-08 15:44:23 -07:00
+									// count them
 									int32_t count; for ( count = 0 ; words[count] ; count++ );
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									// set up the hash table
-												Remove niceness from HashTableX

											
										
										
											2016-09-01 18:18:30 +02:00
+									if ( ! table->set ( 8,4,count * 2,NULL,0,false,label ) ) {
-												Log function will now return void instead of a boolean

											
										
										
											2016-08-01 15:29:03 +02:00
+										log(LOG_INIT, "build: Could not init stop words table.");
 										return false;
 									}
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									// now add in all the stop words
-												fixed langid based query stop words.

											
										
										
											2015-03-08 15:44:23 -07:00
+									int32_t n = count;//(int32_t)size/ sizeof(char *);
-												now it compiles with -m32

											
										
										
											2014-11-10 14:45:11 -08:00
+									for ( int32_t i = 0 ; i < n ; i++ ) {
-												Fix conversion from string literal to 'char *' for StopWords

											
										
										
											2016-05-30 14:38:32 +02:00
+										const char      *sw    = words[i];
-												fixed langid based query stop words.

											
										
										
											2015-03-08 15:44:23 -07:00
+										if ( ! sw ) break;
-												Removed gbstrlen()

gbstrlen() just checked for NULL and called gbshutdownAbort(). Dereferncing NULL on mordern platforms cases a SIGSEGV which is cought by our signal handler and .... gbshutdownAbort() is called. So gbstrlen() was superfluous and complicated static analysis.

											
										
										
											2016-07-28 17:04:35 +02:00
+										int32_t       swlen = strlen ( sw );
-												replace long long with int64_t

											
										
										
											2014-10-30 13:36:39 -06:00
+										int64_t  swh   = hash64Lower_utf8 ( sw , swlen );
-												Replace INT32/INT64 and likes with PRId32 and likes. Add space before definition.

											
										
										
											2016-05-20 09:18:32 +02:00
+										//log("ii: #%" PRId32"  %s",i,sw);
-												Changed HashTableX::addTerm() to take the key by-value instead of by-pointer

											
										
										
											2016-10-27 11:53:53 +02:00
+										if ( ! table->addTerm(swh,i+1) ) return false;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									}
 									return true;
 								}
-												more constness

											
										
										
											2016-05-13 17:07:07 +02:00
+								bool isStopWord ( const char *s , int32_t len , int64_t h ) {
-												Sync lock changes from nomerge2 to master

											
										
										
											2017-03-24 14:01:04 +01:00
+									ScopedLock sl(s_stopWordTableMutex);
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									if ( ! s_stopWordsInitialized ) {
 										s_stopWordsInitialized =
 											initWordTable(&s_stopWordTable, s_stopWords,
-												fixed langid based query stop words.

											
										
										
											2015-03-08 15:44:23 -07:00
+												      //sizeof(s_stopWords),
 												      "stopwords");
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+										if (!s_stopWordsInitialized) return false;
 									}
-												Sync lock changes from nomerge2 to master

											
										
										
											2017-03-24 14:01:04 +01:00
+									sl.unlock();
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									// . all 1 char letter words are stop words
 									// . good for initials and some contractions
 									if ( len == 1 && is_alpha_a(*s) ) return true;
 									// get from table
-												Changed HashTableX::getScore() to take the key by-value instead of by-pointer

											
										
										
											2016-10-27 11:57:44 +02:00
+									return s_stopWordTable.getScore(h);
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								}
-												query-stop-words: use extern files if present, fallback to builtin defaults if not

y Please enter the commit message for your changes. Lines starting

											
										
										
											2017-12-22 16:49:18 +01:00
 								#include "query_stop_words_list.inc"
-												query stop words now based on selected langid.

											
										
										
											2015-03-08 15:16:24 -07:00
 								static HashTableX s_queryStopWordTables[MAXLANGID+1];
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								static bool       s_queryStopWordsInitialized = false;
-												Sync lock changes from nomerge2 to master

											
										
										
											2017-03-24 14:01:04 +01:00
+								static GbMutex    s_queryStopWordsMutex;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
-												query stop words now based on selected langid.

											
										
										
											2015-03-08 15:16:24 -07:00
-												query-stop-words: use extern files if present, fallback to builtin defaults if not

y Please enter the commit message for your changes. Lines starting

											
										
										
											2017-12-22 16:49:18 +01:00
+								static void initializeQueryStopWords(lang_t lang) {
 									HashTableX &table = s_queryStopWordTables[lang];
 									if(!table.set(8,4,50,NULL,0,false,"qrystops") ) {
 										log(LOG_INIT, "build: Could not init stop words table.");
 										return;
 									}
 									const char *abbrev = getLanguageAbbr(lang);
 									char filename[1024];
 									sprintf(filename, "%s/query_stop_words.%s.txt", g_hostdb.getMyHost()->m_dir, abbrev);
 									FILE *fp=fopen(filename,"r");
 									if(fp) {
 										//use what is in the file, and only that
 										log(LOG_DEBUG,"Loading query-stop-words for '%s' from %s", abbrev, filename);
 										char line[128];
 										while(fgets(line,sizeof(line),fp)) {
 											if(char *p = strchr(line,'/'))
 												*p = '\0';
 											if(char *p = strchr(line,' '))
 												*p = '\0';
 											if(char *p = strchr(line,'\n'))
 												*p = '\0';
 											if(line[0]=='\0')
 												continue;
 											int word_len = strlen(line);
 											int64_t  word_hash = hash64Lower_utf8(line,word_len);
 											table.addTerm(word_hash,1);
 										}
 										fclose(fp);
 										log(LOG_DEBUG,"Loaded query-stop-words for '%s' from %s", abbrev, filename);
 									} else {
 										//fall back to build-in default, if any
 										//log(LOG_DEBUG,"Using builtin-default query-stop-words for '%s'", abbrev); //too noisy log upon startup
 										const char * const *words = NULL;
 										for(size_t i=0; i<sizeof(s_query_stop_words_lang)/sizeof(s_query_stop_words_lang[0]); i++) {
 											if(s_query_stop_words_lang[i].lang==lang) {
 												words = s_query_stop_words_lang[i].words;
 												break;
 											}
 										}
 										if(!words)
 											words = s_query_stop_words_xx; //use default table
 										for(;
 										    *words;
 										    words++)
 										{
 											int word_len = strlen(*words);
 											int64_t word_hash = hash64Lower_utf8(*words,word_len);
 											table.addTerm(word_hash,1);
 										}
 									}
 								}
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
-												query-stop-words: use extern files if present, fallback to builtin defaults if not

y Please enter the commit message for your changes. Lines starting

											
										
										
											2017-12-22 16:49:18 +01:00
 								static void initializeQueryStopWords() {
-												Sync lock changes from nomerge2 to master

											
										
										
											2017-03-24 14:01:04 +01:00
+									ScopedLock sl(s_queryStopWordsMutex);
-												query-stop-words: use extern files if present, fallback to builtin defaults if not

y Please enter the commit message for your changes. Lines starting

											
										
										
											2017-12-22 16:49:18 +01:00
+									if(s_queryStopWordsInitialized)
 										return;
 									for(int lang=langUnknown; lang<langUnwanted; lang++)
 										initializeQueryStopWords((lang_t)lang);
 									s_queryStopWordsInitialized = true;
 								}
 								bool isQueryStopWord ( const char *s , int32_t len , int64_t h , int32_t langId ) {
 									initializeQueryStopWords();
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									// . all 1 char letter words are stop words
 									// . good for initials and some contractions
 									// . fix for 'j. w. eagan' .. return FALSE now
 									// . let 'a' remain a query stop word i guess... (mdw 7/16/12)
 									//if ( len == 1 && is_alpha_a(*s) ) return false;
-												query stop words now based on selected langid.

											
										
										
											2015-03-08 15:16:24 -07:00
+									if ( langId < 0 ) langId = langUnknown;
 									if ( langId > MAXLANGID ) langId = langUnknown;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									// get from table
-												Changed HashTableX::getScore() to take the key by-value instead of by-pointer

											
										
										
											2016-10-27 11:57:44 +02:00
+									return s_queryStopWordTables[langId].getScore(h);
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								}
 								// is it a stop word?
 								// . these have the stop words above plus some foreign stop words
 								// . these aren't
 								// . i shrunk this list a lot
 								// . see backups for the hold list
 								// . i shrunk this list a lot
 								// . see backups for the hold list
-												Fix conversion from string literal to 'char *' for StopWords

											
										
										
											2016-05-30 14:38:32 +02:00
+								static const char      *s_commonWords[] = {
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"to",   // score = 1
 									"and",  // score = 2
 									"of",   // score = 3
 									"the",  // score = 4
 									"this", // score = 5
 									"between",
 									"onto",
 									"now",
 									"during",
 									"after",
 									"before",
 									"since",
 									"his",
 									"more",
 									"all",
 									"most",
 									"each",
 									"other",
 									"others",
 									"same",
 									"throughout",
 									"through",
 									"part",
 									"being",
 									"any",
 									"many",
 									"than",
 									"within",
 									"without",
 									"since",
 									"because",
 									"whether",
 									"both",
 									"able",
 									"prior",
 									"under",
 									"beneath",
 									"shall",
 									"around",
 									"while",
 									"must",
 									"inside",
 									"just",
 									"until",
 									"behind",
 									"my",
 									"inc",  // incorporated
 									"one",
 									"two",
 									"three",
 									"four",
 									"1",
 									"2",
 									"3",
 									"4",
 									"et",
 									"est",
 									"against",
 									"mr",
 									"mrs",
 									"miss",
 									"out",
 									"outside",
 									"well",
 									"only",
 									"some",
 									"even",
 									"may",
 									"still",
 									"such",
 									"much",
 									"ever",
 									"every",
 									"become",
-												good checkpoint. quite a few fixes.

											
										
										
											2014-11-17 18:13:36 -08:00
+									"along",
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									"tion", // broken words
 									"ture", // broken words
 									"use",
 									"used",
 									"using",
 									"following", // the following
 									"home"     ,
 									"copyright",
 									"tm",          // trademark
 									"information",
 									"info",
 									"number",    // number of
 									"welcome",
 									"online",
 									//"contact",
 									"today",
 									"said",
 									"says",
 									"say",
 									"told",
 									"became",
 									"again",
 									"later",
 									"began",
 									"gotta",
 									"yet",
 									"maybe",
 									"someone",
 									"something",
 									"oh",
 									"thanks",
 									"co.uk",
 									"first",
 									"takes",
 									"rest",
 									"might",
 									"never",
 									"ever",
 									"ok",
 									"himself",
 									"herself",
 									"southern",
 									"northern",
 									"beyond",
 									"saw",
 									"truly",
 									"turns",
 									"tonight",
 									"took",
 									"came",
 									"seeing",
 									"expect",
 									"arrives",
 									"arrive",
 									"starts",
 									"recently",
 									"land",
 									"born",
 									"ah",
 									"attack",
 									"kill",
 									"states",
 									"down",
 									"up",
 									"shit",
 									"fuck",
 									"damn",
 									"wait",
 									"leave",
 									"exit",
 									"sleep",
 									"anymore",
 									"presents",
 									"shares",
 									"wrote",
 									"pleasure",
 									"mention",
 									"gets",
 									"get",
 									"feels",
 									"feeling",
 									"across",
 									"entirely",
 									"really",
 									// until we add the rule to allow month/day names only
 									// if adjacent to an alpha word with only a space between
 									// let's get this out of there
 									"jan",
 									"feb",
 									"mar",
 									"apr",
 									"may",
 									"jun",
 									"jul",
 									"aug",
 									"sep",
 									"oct",
 									"nov",
 									"dec",
 									"january",
 									"february",
 									"march",
 									"april",
 									"may",
 									"june",
 									"july",
 									"august",
 									"september",
 									"october",
 									"november",
 									"december",
 									"sun",
 									"mon",
 									"tue",
 									"wed",
 									"thu",
 									"fri",
 									"sat",
 									"sunday",
 									"monday",
 									"tuesday",
 									"wednesday",
 									"thursday",
 									"friday",
 									"saturday",
 									// unfortunately com is portuguese for with.
 									//"org",
 									//"com",
 									//"central",
 									//"click", chops off pay per click
 									//"website",
 									//"site",
 									//"place",
 									//"web",
 									"best",
 									"does",
 									"see",
 									"2003",
 									"2004",
 									"2005",
 									"2006",
 									"2007",
 									"2008",
 									"2009",
 									"2010",
 									"2011",
 									"2012",
 									"2013",
 									"2014",
 									"2015",
 									"at",
 									"be",
 									"by",
 									"on",
 									"or",
 									"do",
 									"he",
 									"if",
 									"is",
 									"it",
 									"it's",
 									"don't",
 									"doesn't",
 									"can't",
 									"won't,"
 									"shouldn't",
 									"wouldn't",
 									"couldn't",
 									"should've",
 									"would've",
 									"could've",
 									"wasn't",
 									"hasn't",
-												update common word list

											
										
										
											2013-12-01 15:19:33 -07:00
+									"hadn't",
 									"like", // in too many gigabits
 									"know", // in too many gigabits
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"you'd",
 									"we'd",
 									"i'd",
 									"haven't",
 									"he'd",
 									"she'd",
 									"they'd",
 									"dont",
 									"won't",
 									"you're",
 									"very",
 									"seem",
 									"seems",
 									"thats",
 									"aren't",
 									"arent",
 									"let's",
 									"let",
 									"you've",
 									"they're",
 									"you'll",
 									"didn't",
 									"i've",
 									"we've",
 									"they've",
 									"we'll",
 									"they'll",
 									"i'll",
 									"he'll",
 									"she'll",
 									"he's",
 									"she's",
 									"we're",
 									"i'm",
 									"though",
 									"isn't",
 									"in",
 									"into",
 									"me",
 									"my",
 									"re",
 									"so",
 									"us",
 									"vs",
 									"we",
 									"are",
 									"but",
 									"over",
 									"can",
 									"did",
 									"per",
 									"for",
 									"get",
 									"had",
 									"has",
 									"her",
 									"him",
 									"its",
 									"may",
 									//	"not",
 									"our",
 									"she",
 									"you",
 									"also",
 									"been",
 									"from",
 									"have",
 									"here",
 									"here's",
 									"there's",
 									"that's",
 									"hers",
 									//"mine",
 									"ours",
 									"that",
 									"them",
 									"then",
 									"they",
 									"were",
 									"will",
 									"with",
 									"your",
 									"about",
 									"above",
 									"ain",   // ain't
 									"could",
 									"isn",   // isn't
 									"their",
 									"there",
 									"these",
 									"those",
 									"would",
 									"yours",
 									"theirs",
 									"aren",  // aren't
 									"hadn",  // hadn't
 									"didn",  // didn't
 									"hasn",  // hasn'y
 									"ll",    // they'll this'll that'll you'll
 									"ve",    // would've should've
 									"should",
 									"shouldn", // shouldn't
 									// . additional english stop words for queries
 									// . we don't want to require any of these words
 									// . 'second hand smoke and how it affects children'
 									//   should essentially reduce to 5 words instead of 8
 									"i",		// subject,
 									"it",		// subject
 									"what",		//
 									"what's",
 									"which",	//
 									"who",		// common word
 									"that",		//
 									"is",		// -s
 									"are",		// present
 									"was",		// 1st
 									"be",		// infinitive
 									"will",	        //
 									"a",		//
 									"an",		//
 									"or",		//
 									"as",		//
 									"at",		//
 									"by",		//
 									"for",		//
 									"with",		//
 									"about",	//
 									"from",		//
 									"in",		//
 									"on",		//
 									"when",		//
 									"where",	//
 									"why",		// common word
 									"how",		//
 									"finally",
 									"own",
 									// danish stop words (in project/stopwords)
 									// cat danish.txt | awk '{print "\t\t\""$1"\",\t\t// "$3}'
 									"i",		// in
 									"jeg",		// I
 									"det",		// that
 									"at",		// that
 									"en",		// a/an
 									"den",		// it
 									"til",		// to/at/for/until/against/by/of/into,
 									"er",		// present
 									"som",		// who,
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"på",		// on/upon/in/on/at/to/after/of/with/for,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"de",		// they
 									"med",		// with/by/in,
 									"han",		// he
 									"af",		// of/by/from/off/for/in/with/on,
 									"for",		// at/for/to/from/by/of/ago,
 									"ikke",		// not
 									"der",		// who/which,
 									"var",		// past
 									"mig",		// me/myself
 									"sig",		// oneself/himself/herself/itself/themselves
 									"men",		// but
 									"et",		// a/an/one,
 									"har",		// present
 									"om",		// round/about/for/in/a,
 									"vi",		// we
 									"min",		// my
 									"havde",	// past
 									"ham",		// him
 									"hun",		// she
 									"nu",		// now
 									"over",		// over/above/across/by/beyond/past/on/about,
 									"da",		// then,
 									"fra",		// from/off/since,
 									"du",		// you
 									"ud",		// out
 									"sin",		// his/her/its/one's
 									"dem",		// them
 									"os",		// us/ourselves
 									"op",		// up
 									"man",		// you/one
 									"hans",		// his
 									"hvor",		// where
 									"eller",	// or
 									"hvad",		// what
 									"skal",		// must/shall
 									"selv",		// myself/youself/herself/ourselves
 									"her",		// here
 									"alle",		// all/everyone/everybody
 									"vil",		// will
 									"blev",		// past
 									"kunne",	// could
 									"ind",		// in
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"når",	// when
 									"være",	// present
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"dog",	// however/yet/after
 									"noget",	// something
 									"ville",	// would
 									"jo",		// you
 									"deres",	// their/theirs
 									"efter",	// after/behind/according
 									"ned",	// down
 									"skulle",	// should
 									"denne",	// this
 									"end",	// than
 									"dette",	// this
 									"mit",	// my/mine
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"også",		// also
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"ogsa",		// also
 									"under",	// under/beneath/below/during,
 									"have",		// have
 									"dig",	// you
 									"anden",	// other
 									"hende",	// her
 									"mine",		// my
 									"alt",	// everything
 									"meget",	// much/very,
 									"sit",	// his,
 									"sine",	// his,
 									"vor",		// our
 									"mod",	// against
 									"disse",	// these
 									"hvis",	// if
 									"din",		// your/yours
 									"nogle",	// some
 									"hos",		// by/at
 									"blive",	// be/become
 									"mange",	// many
 									"ad",		// by/through
 									"bliver",	// present
 									"hendes",	// her/hers
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"været",	// be
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"vaeret",	// be
 									"thi",		// for
 									"jer",		// you
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"sådan",	// such,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									// dutch stop words
 									"de",		// the
 									"en",		// and
 									"van",		// of,
 									"ik",		// I,
 									"te",		// (1)
 									"dat",		// that,
 									"die",		// that,
 									"in",		// in,
 									"een",		// a,
 									"hij",		// he
 									"het",		// the,
 									"niet",		// not,
 									"zijn",		// (1)
 									"is",		// is
 									"was",		// (1)
 									"op",		// on,
 									"aan",		// on,
 									"met",		// with,
 									"als",		// like,
 									"voor",		// (1)
 									"had",		// had,
 									"er",		// there
 									"maar",		// but,
 									"om",		// round,
 									"hem",		// him
 									"dan",		// then
 									"zou",		// should/would,
 									"wat",		// what,
 									"mijn",		// possessive
 									"men",		// people,
 									"dit",		// this
 									"zo",		// so,
 									"door",		// through
 									"over",		// over,
 									"ze",		// she,
 									"zich",		// oneself
 									"bij",		// (1)
 									"ook",		// also,
 									"tot",		// till,
 									"je",		// you
 									"mij",		// me
 									"uit",		// out
 									"der",		// Old
 									"daar",		// (1)
 									"haar",		// (1)
 									"naar",		// (1)
 									"heb",		// present
 									"hoe",		// how,
 									"heeft",	// present
 									"hebben",	// 'to
 									"deze",		// this
 									"u",		// you
 									"want",		// (1)
 									"nog",		// yet,
 									"zal",		// 'shall',
 									"me",		// me
 									"zij",		// she,
 									"nu",		// now
 									"ge",		// 'thou',
 									"geen",		// none
 									"omdat",	// because
 									"iets",		// something,
 									"worden",	// to
 									"toch",		// yet,
 									"al",		// all,
 									"waren",	// (1)
 									"veel",		// much,
 									"meer",		// (1)
 									"doen",		// to
 									"toen",		// then,
 									"moet",		// noun
 									"ben",		// (1)
 									"zonder",	// without
 									"kan",		// noun
 									"hun",		// their,
 									"dus",		// so,
 									"alles",	// all,
 									"onder",	// under,
 									"ja",		// yes,
 									"eens",		// once,
 									"hier",		// here
 									"wie",		// who
 									"werd",		// imperfect
 									"altijd",	// always
 									"doch",		// yet,
 									"wordt",	// present
 									"wezen",	// (1)
 									"kunnen",	// to
 									"ons",		// us/our
 									"zelf",		// self
 									"tegen",	// against,
 									"na",		// after,
 									"reeds",	// already
 									"wil",		// (1)
 									"kon",		// could;
 									"niets",	// nothing
 									"uw",		// your
 									"iemand",	// somebody
 									"geweest",	// been;
 									"andere",	// other
 									// french stop words
 									"au",		// a
 									"aux",		// a
 									"avec",		// with
 									"ce",		// this
 									"ces",		// these
 									"dans",		// with
 									"de",		// of
 									"des",		// de
 									"du",		// de
 									"elle",		// she
 									"en",		// `of
 									"et",		// and
 									"eux",		// them
 									"il",		// he
 									"je",		// I
 									"la",		// the
 									"le",		// the
 									"leur",		// their
 									"lui",		// him
 									"ma",		// my
 									"mais",		// but
 									"me",		// me
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"même",		// same;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"mes",		// me
 									"moi",		// me
 									"mon",		// my
 									"ne",		// not
 									"nos",		// our
 									"notre",	// our
 									"nous",		// we
 									"on",		// one
 									"ou",		// where
 									"par",		// by
 									"pas",		// not
 									"pour",		// for
 									"qu",		// que
 									"que",		// that
 									"qui",		// who
 									"sa",		// his,
 									"se",		// oneself
 									"ses",		// his
 									"son",		// his,
 									"sur",		// on
 									"ta",		// thy
 									"te",		// thee
 									"tes",		// thy
 									"toi",		// thee
 									"ton",		// thy
 									"tu",		// thou
 									"un",		// a
 									"une",		// a
 									"vos",		// your
 									"votre",	// your
 									"vous",		// you
 									// german stop words
 									"aber",		// but
 									"alle",		// all
 									"allem",	//
 									"allen",	//
 									"aller",	//
 									"alles",	//
 									"als",		// than,
 									"also",		// so
 									"am",		// an
 									"an",		// at
 									"ander",	// other
 									"andere",	//
 									"anderem",	//
 									"anderen",	//
 									"anderer",	//
 									"anderes",	//
 									"anderm",	//
 									"andern",	//
 									"anderr",	//
 									"anders",	//
 									"auch",		// also
 									"auf",		// on
 									"aus",		// out
 									"bei",		// by
 									"bin",		// am
 									"bis",		// until
 									"bist",		// art
 									"da",		// there
 									"damit",	// with
 									"dann",		// then
 									"der",		// the
 									"den",		//
 									"des",		//
 									"dem",		//
 									"die",		//
 									"das",		//
-												Fix encoding (was tored in a mix of koi8-r, iso8859-1. Not only utf8 is used

											
										
										
											2016-05-17 15:06:40 +02:00
+									"daß",		// that
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"derselbe",	// the
 									"derselben",	//
 									"denselben",	//
 									"desselben",	//
 									"demselben",	//
 									"dieselbe",	//
 									"dieselben",	//
 									"dasselbe",	//
 									"dazu",		// to
 									"dein",		// thy
 									"deine",	//
 									"deinem",	//
 									"deinen",	//
 									"deiner",	//
 									"deines",	//
 									"denn",		// because
 									"derer",	// of
 									"dessen",	// of
 									"dich",		// thee
 									"dir",		// to
 									"du",		// thou
 									"dies",		// this
 									"diese",	//
 									"diesem",	//
 									"diesen",	//
 									"dieser",	//
 									"dieses",	//
 									"doch",		// (several
 									"dort",		// (over)
 									"durch",	// through
 									"ein",		// a
 									"eine",		//
 									"einem",	//
 									"einen",	//
 									"einer",	//
 									"eines",	//
 									"einig",	// some
 									"einige",	//
 									"einigem",	//
 									"einigen",	//
 									"einiger",	//
 									"einiges",	//
 									"einmal",	// once
 									"er",		// he
 									"ihn",		// him
 									"ihm",		// to
 									"es",		// it
 									"etwas",	// something
 									"euer",		// your
 									"eure",		//
 									"eurem",	//
 									"euren",	//
 									"eurer",	//
 									"eures",	//
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"für",		// for
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"gegen",	// towards
 									"gewesen",	// p.p.
 									"hab",		// have
 									"habe",		// have
 									"haben",	// have
 									"hat",		// has
 									"hatte",	// had
 									"hatten",	// had
 									"hier",		// here
 									"hin",		// there
 									"hinter",	// behind
 									"ich",		// I
 									"mich",		// me
 									"mir",		// to
 									"ihr",		// you,
 									"ihre",		//
 									"ihrem",	//
 									"ihren",	//
 									"ihrer",	//
 									"ihres",	//
 									"euch",		// to
 									"im",		// in
 									"in",		// in
 									"indem",	// while
 									"ins",		// in
 									"ist",		// is
 									"jede",		// each,
 									"jedem",	//
 									"jeden",	//
 									"jeder",	//
 									"jedes",	//
 									"jene",		// that
 									"jenem",	//
 									"jenen",	//
 									"jener",	//
 									"jenes",	//
 									"jetzt",	// now
 									"kann",		// can
 									"kein",		// no
 									"keine",	//
 									"keinem",	//
 									"keinen",	//
 									"keiner",	//
 									"keines",	//
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"können",	// can
 									"könnte",	// could
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"machen",	// do
 									"man",		// one
 									"manche",	// some,
 									"manchem",	//
 									"manchen",	//
 									"mancher",	//
 									"manches",	//
 									"mein",		// my
 									"meine",	//
 									"meinem",	//
 									"meinen",	//
 									"meiner",	//
 									"meines",	//
 									"mit",		// with
 									"muss",		// must
 									"musste",	// had
 									"nach",		// to(wards)
 									"nicht",	// not
 									"nichts",	// nothing
 									"noch",		// still,
 									"nun",		// now
 									"nur",		// only
 									"ob",		// whether
 									"oder",		// or
 									"ohne",		// without
 									"sehr",		// very
 									"sein",		// his
 									"seine",	//
 									"seinem",	//
 									"seinen",	//
 									"seiner",	//
 									"seines",	//
 									"selbst",	// self
 									"sich",		// herself
 									"sie",		// they,
 									"ihnen",	// to
 									"sind",		// are
 									"so",		// so
 									"solche",	// such
 									"solchem",	//
 									"solchen",	//
 									"solcher",	//
 									"solches",	//
 									"soll",		// shall
 									"sollte",	// should
 									"sondern",	// but
 									"sonst",	// else
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"über",		// over
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"um",		// about,
 									"und",		// and
 									"uns",		// us
 									"unse",		//
 									"unsem",	//
 									"unsen",	//
 									"unser",	//
 									"unses",	//
 									"unter",	// under
 									"viel",		// much
 									"vom",		// von
 									"von",		// from
 									"vor",		// before
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"während",	// while
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								//		"war",		// was
 									"waren",	// were
 									"warst",	// wast
 									"was",		// what
 									"weg",		// away,
 									"weil",		// because
 									"weiter",	// further
 									"welche",	// which
 									"welchem",	//
 									"welchen",	//
 									"welcher",	//
 									"welches",	//
 									"wenn",		// when
 									"werde",	// will
 									"werden",	// will
 									"wie",		// how
 									"wieder",	// again
 									"will",		// want
 									"wir",		// we
 									"wird",		// will
 									"wirst",	// willst
 									"wo",		// where
 									"wollen",	// want
 									"wollte",	// wanted
-												encoding fix in StopWords.cpp (da/de). More bugs remain

											
										
										
											2017-02-10 17:28:46 +01:00
+									"würde",	// would
 									"würden",	// would
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"zu",		// to
 									"zum",		// zu
 									"zur",		// zu
 									"zwar",		// indeed
 									"zwischen",	// between
 									// italian stop words
 									"ad",		// a
 									"al",		// a
 									"allo",		// a
 									"ai",		// a
 									"agli",		// a
 									"all",		// a
 									"agl",		// a
 									"alla",		// a
 									"alle",		// a
 									"con",		// with
 									"col",		// con
 									"coi",		// con
 									"da",		// from
 									"dal",		// da
 									"dallo",	// da
 									"dai",		// da
 									"dagli",	// da
 									"dall",		// da
 									"dagl",		// da
 									"dalla",	// da
 									"dalle",	// da
 									"di",		// of
 									"del",		// di
 									"dello",	// di
 									"dei",		// di
 									"degli",	// di
 									//"dell",		// di
 									"degl",		// di
 									"della",	// di
 									"delle",	// di
 									"in",		// in
 									"nel",		// in
 									"nello",	// in
 									"nei",		// in
 									"negli",	// in
 									"nell",		// in
 									"negl",		// in
 									"nella",	// in
 									"nelle",	// in
 									"su",		// on
 									"sul",		// su
 									"sullo",	// su
 									"sui",		// su
 									"sugli",	// su
 									"sull",		// su
 									"sugl",		// su
 									"sulla",	// su
 									"sulle",	// su
 									"per",		// through,
 									"tra",		// among
 									"contro",	// against
 									"io",		// I
 									"tu",		// thou
 									"lui",		// he
 									"lei",		// she
 									"noi",		// we
 									"voi",		// you
 									"loro",		// they
 									"mio",		// my
 									"mia",		//
 									"miei",		//
 									"mie",		//
 									"tuo",		//
 									"tua",		//
 									"tuoi",		// thy
 									"tue",		//
 									"suo",		//
 									"sua",		//
 									"suoi",		// his,
 									"sue",		//
 									"nostro",	// our
 									"nostra",	//
 									"nostri",	//
 									"nostre",	//
 									"vostro",	// your
 									"vostra",	//
 									"vostri",	//
 									"vostre",	//
 									"mi",		// me
 									"ti",		// thee
 									"ci",		// us,
 									"vi",		// you,
 									"lo",		// him,
 									"la",		// her,
 									"li",		// them
 									"le",		// them,
 									"gli",		// to
 									"ne",		// from
 									"il",		// the
 									"un",		// a
 									"uno",		// a
 									"una",		// a
 									"ma",		// but
 									"ed",		// and
 									"se",		// if
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"perché",	// why,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"anche",	// also
 								//		"come",		// how
 									"dov",		// where
 									"dove",		// where
 									"che",		// who,
 									"chi",		// who
 									"cui",		// whom
 									"non",		// not
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"più",		// more
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"quale",	// who,
 									"quanto",	// how
 									"quanti",	//
 									"quanta",	//
 									"quante",	//
 									"quello",	// that
 									"quelli",	//
 									"quella",	//
 									"quelle",	//
 									"questo",	// this
 									"questi",	//
 									"questa",	//
 									"queste",	//
 									"si",		// yes
 									"tutto",	// all
 									"tutti",	// all
 									"a",		// at
 									"c",		// as
 									"e",		// and
 									"i",		// the
 									"l",		// as
 									"o",		// or
 									// norwegian stop words
 									"og",		// and
 									"i",		// in
 									"jeg",		// I
 									"det",		// it/this/that
 									"at",		// to
 									"en",		// a
 									"den",		// it/this/that
 									"til",		// to
 									"er",		// is
 									"som",		// who/that
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"på",		// on
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"de",		// they
 									"med",		// with
 									"han",		// he
 									"av",		// of
 									"ikke",		// not
 									"inte",		// not
 									"der",		// there
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"så",		// so
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"var",		// was
 									"meg",		// me
 									"seg",		// you
 									"men",		// but
 									"ett",		// a
 									"har",		// have
 									"om",		// about
 									"vi",		// we
 									"min",		// my
 									"mitt",		// my
 									"ha",		// have
 									"hade",		// had
 									"hu",		// she
 									"hun",		// she
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"nå",		// now
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"over",		// over
 									"da",		// when/as
 									"ved",		// by/know
 									"fra",		// from
 									"du",		// you
 									"ut",		// out
 									"sin",		// your
 									"dem",		// them
 									"oss",		// us
 									"opp",		// up
 									"man",		// you/one
 									"kan",		// can
 									"hans",		// his
 									"hvor",		// where
 									"eller",	// or
 									"hva",		// what
 									"skal",		// shall/must
 									"selv",		// self
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"sjøl",		// self
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"her",		// here
 									"alle",		// all
 									"vil",		// will
 									"bli",		// become
 									"ble",		// became
 									"blei",		// became
 									"blitt",	// have
 									"kunne",	// could
 									"inn",		// in
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"når",		// when
 									"være",		// be
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"kom",		// come
 									"noen",		// some
 									"noe",		// some
 									"ville",	// would
 									"dere",		// you
 									"de",		// you
 									"som",		// who/which/that
 									"deres",	// their/theirs
 									"kun",		// only/just
 									"ja",		// yes
 									"etter",	// after
 									"ned",		// down
 									"skulle",	// should
 									"denne",	// this
 									"for",		// for/because
 									"deg",		// you
 									"si",		// hers/his
 									"sine",		// hers/his
 									"sitt",		// hers/his
 									"mot",		// against
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"å",		// to
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"meget",	// much
 									"hvorfor",	// why
 									"sia",		// since
 									"sidan",	// since
 									"dette",	// this
 									"desse",	// these/those
 									"disse",	// these/those
 									"uden",		// uten
 									"hvordan",	// how
 									"ingen",	// noone
 									"inga",		// noone
 									"din",		// your
 									"ditt",		// your
 									"blir",		// become
 									"samme",	// same
 									"hvilken",	// which
 									"hvilke",	// which
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"sånn",		// such
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"inni",		// inside/within
 									"mellom",	// between
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"vår",		// our
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"hver",		// each
 									"hvem",		// who
 									"vors",		// us/ours
 									"dere",		// their
 									"deres",	// theirs
 									"hvis",		// whose
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"både",		// both
 									"båe",		// both
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"begge",	// both
 									"siden",	// since
 									"dykk",		// your
 									"dykkar",	// yours
 									"dei",		// they
 									"deira",	// them
 									"deires",	// theirs
 									"deim",		// them
 									"di",		// your
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"då",		// as/when
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"eg",		// I
 									"ein",		// a/an
 									"ei",		// a/an
 									"eit",		// a/an
 									"eitt",		// a/an
 									"elles",	// or
 									"honom",	// he
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"hjå",		// at
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"ho",		// she
 									"hoe",		// she
 									"henne",	// her
 									"hennar",	// her/hers
 									"hennes",	// hers
 									"hoss",		// how
 									"hossen",	// how
 									"ikkje",	// not
 									"ingi",		// noone
 									"inkje",	// noone
 									"korleis",	// how
 									"korso",	// how
 									"kva",		// what/which
 									"kvar",		// where
 									"kvarhelst",	// where
 									"kven",		// who/whom
 									"kvi",		// why
 									"kvifor",	// why
 									"me",		// we
 									"medan",	// while
 									"mi",		// my
 									"mine",		// my
 									"mykje",	// much
 									"no",		// now
 									"nokon",	// some
 									"noka",		// some
 									"nokor",	// some
 									"noko",		// some
 									"nokre",	// some
 									"si",		// his/hers
 									"sia",		// since
 									"sidan",	// since
 									"so",		// so
 									"somt",		// some
 									"somme",	// some
 									"um",		// about*
 									"upp",		// up
 									"vere",		// be
 									"er",		// am
 									"var",		// was
 									"vore",		// was
 									"verte",	// become
 									"vort",		// become
 									"varte",	// became
 									"vart",		// became
 									"er",		// am
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"være",		// to
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"var",		// was
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+									"å",		// on
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									// portuguese stop words
 									"de",		// of,
 									"a",		// the;
 									"o",		// the;
 									"que",		// who,
 									"e",		// and
 									"do",		// de
 									"da",		// de
 									"em",		// in
 									"um",		// a
 									"para",		// for
 									//"com",		// with
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"não",		// not,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"uma",		// a
 									"os",		// the;
 									"no",		// em
 									"se",		// himself
 									"na",		// em
 									"por",		// for
 									"mais",		// more
 									"as",		// the;
 									"dos",		// de
 									"como",		// how,as
 									"mas",		// but
 									"ao",		// a
 									"ele",		// he
 									"das",		// de
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									//"à",		// a
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"seu",		// his
 									"sua",		// her
 									"ou",		// or
 									"quando",	// when
 									"muito",	// much
 									"nos",		// em
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"já",		// already,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"eu",		// I
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"também",	// also
 									"só",		// only,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"pelo",		// per
 									"pela",		// per
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"até",		// up
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"isso",		// that
 									"ela",		// he
 									"entre",	// between
 									"depois",	// after
 									"sem",		// without
 									"mesmo",	// same
 									"aos",		// a
 									"seus",		// his
 									"quem",		// whom
 									"nas",		// em
 									"me",		// me
 									"esse",		// that
 									"eles",		// they
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"você",		// you
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"essa",		// that
 									"num",		// em
 									"nem",		// nor
 									"suas",		// her
 									"meu",		// my
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"às",		// a
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"minha",	// my
 									"numa",		// em
 									"pelos",	// per
 									"elas",		// they
 									"qual",		// which
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"nós",		// we
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"lhe",		// to
 									"deles",	// of them
 									"essas",	// those
 									"esses",	// those
 									"pelas",	// per
 									"este",		// this
 									"dele",		// of
 									"tu",		// thou
 									"te",		// thee
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"vocês",	// you
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"vos",		// you
 									"lhes",		// to
 									"meus",		// my
 									"minhas",	//
 									"teu",		// thy
 									"tua",		//
 									"teus",		//
 									"tuas",		//
 									"nosso",	// our
 									"nossa",	//
 									"nossos",	//
 									"nossas",	//
 									"dela",		// of
 									"delas",	// of
 									"esta",		// this
 									"estes",	// these
 									"estas",	// these
 									"aquele",	// that
 									"aquela",	// that
 									"aqueles",	// those
 									"aquelas",	// those
 									"isto",		// this
 									"aquilo",	// that
 									"estou",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"está",		//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estamos",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estão",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estive",	//
 									"esteve",	//
 									"estivemos",	//
 									"estiveram",	//
 									"estava",	//
-												Fix encoding (was tored in a mix of koi8-r, iso8859-1. Not only utf8 is used

											
										
										
											2016-05-17 15:06:40 +02:00
+									"estávamos",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estavam",	//
 									"estivera",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estivéramos",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"esteja",	//
 									"estejamos",	//
 									"estejam",	//
 									"estivesse",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estivéssemos",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estivessem",	//
 									"estiver",	//
 									"estivermos",	//
 									"estiverem",	//
 									// russian stop words
-												Fix encoding (was tored in a mix of koi8-r, iso8859-1. Not only utf8 is used

											
										
										
											2016-05-17 15:06:40 +02:00
+									"и",		// and
 									"в",		// in/into
 									"во",		// alternative
 									"не",		// not
 									"что",		// what/that
 									"он",		// he
 									"на",		// on/onto
 									"я",		// i
 									"с",		// from
 									"со",		// alternative
 									"как",		// how
 									"а",		// milder
 									"то",		// conjunction
 									"все",		// all
 									"она",		// she
 									"так",		// so,
 									"его",		// him
 									"но",		// but
 									"да",		// yes/and
 									"ты",		// thou
 									"к",		// towards,
 									"у",		// around,
 									"же",		// intensifier
 									"вы",		// you
 									"за",		// beyond,
 									"бы",		// conditional/subj.
 									"по",		// up
 									"только",	// only
 									"ее",		// her
 									"мне",		// to
 									"было",		// it
 									"вот",		// here
 									"от",		// away
 									"меня",		// me
 									"еще",		// still,
 									"нет",		// no,
 									"о",		// about
 									"из",		// out
 									"ему",		// to
 									"теперь",	// now
 									"когда",	// when
 									"даже",		// even
 									"ну",		// so,
 									"вдруг",	// suddenly
 									"ли",		// interrogative
 									"если",		// if
 									"уже",		// already,
 									"или",		// or
 									"ни",		// neither
 									"быть",		// to
 									"был",		// he
 									"него",		// prepositional
 									"до",		// up
 									"вас",		// you
 									"нибудь",	// indef.
 									"опять",	// again
 									"уж",		// already,
 									"вам",		// to
 									"сказал",	// he
 									"ведь",		// particle
 									"там",		// there
 									"потом",	// then
 									"себя",		// oneself
 									"ничего",	// nothing
 									"ей",		// to
 									"может",	// usually
 									"они",		// they
 									"тут",		// here
 									"где",		// where
 									"есть",		// there
 									"надо",		// got
 									"ней",		// prepositional
 									"для",		// for
 									"мы",		// we
 									"тебя",		// thee
 									"их",		// them,
 									"чем",		// than
 									"была",		// she
 									"сам",		// self
 									"чтоб",		// in
 									"без",		// without
 									"будто",	// as
 									"человек",	// man,
 									"чего",		// genitive
 									"раз",		// once
 									"тоже",		// also
 									"себе",		// to
 									"под",		// beneath
 									"жизнь",	// life
 									"будет",	// will
 									"ж",		// int16_t
 									"тогда",	// then
 									"кто",		// who
 									"этот",		// this
 									"говорил",	// was
 									"того",		// genitive
 									"потому",	// for
 									"этого",	// genitive
 									"какой",	// which
 									"совсем",	// altogether
 									"ним",		// prepositional
 									"здесь",	// here
 									"этом",		// prepositional
 									"один",		// one
 									"почти",	// almost
 									"мой",		// my
 									"тем",		// instrumental/dative
 									"чтобы",	// full
 									"нее",		// her
 									"кажется",	// it
 									"сейчас",	// now
 									"были",		// they
 									"куда",		// where
 									"зачем",	// why
 									"сказать",	// to
 									"всех",		// all
 									"никогда",	// never
 									"сегодня",	// today
 									"можно",	// possible,
 									"при",		// by
 									"наконец",	// finally
 									"два",		// two
 									"об",		// alternative
 									"другой",	// another
 									"хоть",		// even
 									"после",	// after
 									"над",		// above
 									"больше",	// more
 									"тот",		// that
 									"через",	// across,
 									"эти",		// these
 									"нас",		// us
 									"про",		// about
 									"всего",	// in
 									"них",		// prepositional
 									"какая",	// which,
 									"много",	// lots
 									"разве",	// interrogative
 									"сказала",	// she
 									"три",		// three
 									"эту",		// this,
 									"моя",		// my,
 									"впрочем",	// moreover,
 									"хорошо",	// good
 									"свою",		// ones
 									"этой",		// oblique
 									"перед",	// in
 									"иногда",	// sometimes
 									"лучше",	// better
 									"чуть",		// a
 									"том",		// preposn.
 									"нельзя",	// one
 									"такой",	// such
 									"им",		// to
 									"более",	// more
 									"всегда",	// always
 									"конечно",	// of
 									"всю",		// acc.
 									"между",	// between
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									// spanish stop words
 									"de",		// from,
 									"la",		// the,
 									"que",		// who,
 									"el",		// the
 									"en",		// in
 									"y",		// and
 									"a",		// to
 									//"los",		// the,
 									"del",		// de
 									"se",		// himself,
 									"las",		// the,
 									"por",		// for,
 									"un",		// a
 									"para",		// for
 									"con",		// with
 									"no",		// no
 									"una",		// a
 									"su",		// his,
 									"al",		// a
 									"lo",		// him
 									"como",		// how
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"más",		// more
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"pero",		// pero
 									"sus",		// su
 									"le",		// to
 									"ya",		// already
 									"o",		// or
 									"este",		// this
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"sí",		// himself
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"porque",	// because
 									"esta",		// this
 									"entre",	// between
 									"cuando",	// when
 									"muy",		// very
 									"sin",		// without
 									"sobre",	// on
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"también",	// also
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"me",		// me
 									"hasta",	// until
 									"hay",		// there
 									"donde",	// where
 									"quien",	// whom,
 									"desde",	// from
 									"todo",		// all
 									"nos",		// us
 									"durante",	// during
 									"todos",	// all
 									"uno",		// a
 									"les",		// to
 									"ni",		// nor
 									"contra",	// against
 									"otros",	// other
 									"ese",		// that
 									"eso",		// that
 									"ante",		// before
 									"ellos",	// they
 									"e",		// and
 									"esto",		// this
-												Fix encoding (was tored in a mix of koi8-r, iso8859-1. Not only utf8 is used

											
										
										
											2016-05-17 15:06:40 +02:00
+									"mМ",		// me
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"antes",	// before
 									"algunos",	// some
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"qué",		// what?
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"unos",		// a
 									"yo",		// I
 									"otro",		// other
 									"otras",	// other
 									"otra",		// other
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"él",		// he
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"tanto",	// so
 									"esa",		// that
 									"estos",	// these
 									"mucho",	// much,
 									"quienes",	// who
 									"nada",		// nothing
 									"muchos",	// many
 									"cual",		// who
 									"poco",		// few
 									"ella",		// she
 									"estar",	// to
 									"estas",	// these
 									"algunas",	// some
 									"algo",		// something
 									"nosotros",	// we
 									"mi",		// me
 									"mis",		// mi
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"tú",		// thou
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"te",		// thee
 									"ti",		// thee
 									"tu",		// thy
 									"tus",		// tu
 									"ellas",	// they
 									"nosotras",	// we
 									"vosostros",	// you
 									"vosostras",	// you
 									"os",		// you
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"mío",		// mine
 									"mía",		//
 									"míos",		//
 									"mías",		//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"tuyo",		// thine
 									"tuya",		//
 									"tuyos",	//
 									"tuyas",	//
 									"suyo",		// his,
 									"suya",		//
 									"suyos",	//
 									"suyas",	//
 									"nuestro",	// ours
 									"nuestra",	//
 									"nuestros",	//
 									"nuestras",	//
 									"vuestro",	// yours
 									"vuestra",	//
 									"vuestros",	//
 									"vuestras",	//
 									"esos",		// those
 									"esas",		// those
 									"estoy",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estás",	//
 									"está",		//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estamos",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estáis",	//
 									"están",	//
 									"esté",		//
 									"estés",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estemos",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estéis",	//
 									"estén",	//
 									"estaré",	//
 									"estarás",	//
 									"estará",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estaremos",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estaréis",	//
 									"estarán",	//
 									"estaría",	//
 									"estarías",	//
 									"estaríamos",	//
 									"estaríais",	//
 									"estarían",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estaba",	//
 									"estabas",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estábamos",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estabais",	//
 									"estaban",	//
 									"estuve",	//
 									"estuviste",	//
 									"estuvo",	//
 									"estuvimos",	//
 									"estuvisteis",	//
 									"estuvieron",	//
 									"estuviera",	//
 									"estuvieras",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estuviéramos",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estuvierais",	//
 									"estuvieran",	//
 									"estuviese",	//
 									"estuvieses",	//
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"estuviésemos",	//
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"estuvieseis",	//
 									"estuviesen",	//
 									"estando",	//
 									"estado",	//
 									"estada",	//
 									"estados",	//
 									"estadas",	//
 									"estad",	//
 									// swedish stop words
 									"och",		// and
 									"det",		// it,
 									"att",		// to
 									"i",		// in,
 									"en",		// a
 									"jag",		// I
 									"hon",		// she
 									"som",		// who,
 									"han",		// he
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"på",		// on
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"den",		// it,
 									"med",		// with
 									"var",		// where,
 									"sig",		// him(self)
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									//"för",		// for
 									"så",		// so
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"till",		// to
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"är",		// is
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"men",		// but
 									"ett",		// a
 									"om",		// if;
 									"hade",		// had
 									"de",		// they,
 									"av",		// of
 									"icke",		// not,
 									"mig",		// me
 									"du",		// you
 									"henne",	// her
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"då",		// then,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"sin",		// his
 									"nu",		// now
 									"har",		// have
 									"inte",		// inte
 									"hans",		// his
 									"honom",	// him
 									"skulle",	// 'sake'
 									"hennes",	// her
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"där",		// there
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"min",		// my
 									"man",		// one
 									"ej",		// nor
 									"vid",		// at,
 									"kunde",	// could
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"något",	// some
 									"från",		// from,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"ut",		// out
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"när",		// when
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"efter",	// after,
 									"upp",		// up
 									"vi",		// we
 									"dem",		// them
 									"vara",		// be
 									"vad",		// what
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"över",		// over
 									"än",		// than
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"dig",		// you
 									"kan",		// can
 									"sina",		// his
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"här",		// here
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"ha",		// have
 									"mot",		// towards
 									"alla",		// all
 									"under",	// under
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"någon",	// some
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"eller",	// or
 									"allt",		// all
 									"mycket",	// much
 									"sedan",	// since
 									"ju",		// why
 									"denna",	// this/that
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"själv",	// myself,
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"detta",	// this/that
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"åt",		// to
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"utan",		// without
 									"varit",	// was
 									"hur",		// how
 									"ingen",	// no
 									"mitt",		// my
 									"ni",		// you
 									"bli",		// to
 									"blev",		// from
 									"oss",		// us
 									"din",		// thy
 									"dessa",	// these/those
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"några",	// some
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"deras",	// their
 									"blir",		// from
 									"mina",		// my
 									"samma",	// (the)
 									"vilken",	// who,
 									"er",		// you,
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"sådan",	// such
 									"vår",		// our
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"blivit",	// from
 									"dess",		// its
 									"inom",		// within
 									"mellan",	// between
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"sådant",	// such
 									//"varför",	// why
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"varje",	// each
 									"vilka",	// who,
 									"ditt",		// thy
 									"vem",		// who
 									"vilket",	// who,
 									"sitta",	// his
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"sådana",	// such
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"vart",		// each
 									"dina",		// thy
 									"vars",		// whose
-												Fix more is8859-1 -> utf8 encoding errors in StopWords.cpp

											
										
										
											2017-12-28 12:45:15 +01:00
+									"vårt",		// our
 									"våra",		// our
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									"ert",		// your
 									"era",		// your
 									"vilkas",	// whose
 									// internet stop words
 									"www",
 									//"com",
 									// additional stop words
 									//"san"           // like san francisco
 								};
 								static HashTableX s_commonWordTable;
 								static bool       s_commonWordsInitialized = false;
-												Make isCommonWord() thread-safe.

											
										
										
											2016-08-19 14:21:07 +02:00
+								static GbMutex s_commonWordtableMutex;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 								// for Process.cpp::resetAll() to call when exiting to free all mem
 								void resetStopWordTables() {
 									s_stopWordTable.reset();
-												query stop words now based on selected langid.

											
										
										
											2015-03-08 15:16:24 -07:00
+									for ( int i = 0 ; i <= MAXLANGID ; i++ )
 										s_queryStopWordTables[i].reset();
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									s_commonWordTable.reset();
 								}
 								// used by Msg24.cpp for gigabits generation
-												now it compiles with -m32

											
										
										
											2014-11-10 14:45:11 -08:00
+								int32_t isCommonWord ( int64_t h ) {
-												Make isCommonWord() thread-safe.

											
										
										
											2016-08-19 14:21:07 +02:00
 									ScopedLock sl(s_commonWordtableMutex);
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+									// include a bunch of foreign prepositions so they don't get required
 									// by the bitScores in IndexTable.cpp
 									if ( ! s_commonWordsInitialized ) {
 										// set up the hash table
-												Remove niceness from HashTableX

											
										
										
											2016-09-01 18:18:30 +02:00
+										if ( ! s_commonWordTable.set (8,4,sizeof(s_commonWords)*2, NULL,0,false,"commonwrds") ) {
-												Log function will now return void instead of a boolean

											
										
										
											2016-08-01 15:29:03 +02:00
+											log(LOG_INIT, "query: Could not init common words table.");
-												stop returning false from a function returning int32_t in StopWords

											
										
										
											2016-10-22 23:15:27 +02:00
+											return 0;
-												Log function will now return void instead of a boolean

											
										
										
											2016-08-01 15:29:03 +02:00
+										}
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+										// now add in all the stop words
-												now it compiles with -m32

											
										
										
											2014-11-10 14:45:11 -08:00
+										int32_t n = (int32_t)sizeof(s_commonWords)/ sizeof(char *);
 										for ( int32_t i = 0 ; i < n ; i++ ) {
-												Fix conversion from string literal to 'char *' for StopWords

											
										
										
											2016-05-30 14:38:32 +02:00
+											const char *sw    = s_commonWords[i];
-												Removed gbstrlen()

gbstrlen() just checked for NULL and called gbshutdownAbort(). Dereferncing NULL on mordern platforms cases a SIGSEGV which is cought by our signal handler and .... gbshutdownAbort() is called. So gbstrlen() was superfluous and complicated static analysis.

											
										
										
											2016-07-28 17:04:35 +02:00
+											int32_t  swlen = strlen ( sw );
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+											// use the same algo that Words.cpp computeWordIds does
-												replace long long with int64_t

											
										
										
											2014-10-30 13:36:39 -06:00
+											int64_t swh = hash64Lower_utf8 ( sw , swlen );
-												Changed HashTableX::addTerm() to take the key by-value instead of by-pointer

											
										
										
											2016-10-27 11:53:53 +02:00
+											if ( ! s_commonWordTable.addTerm(swh,i+1 ) )
-												stop returning false from a function returning int32_t in StopWords

											
										
										
											2016-10-22 23:15:27 +02:00
+												return 0;
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+											// . add w/o accent marks too!
-												More encoding fixes for query-stopwords (german/italian/norwegian)

											
										
										
											2017-02-23 13:53:56 +01:00
+											// . skip "für" though because fur is an eng. word
 											//if ( *sw=='f' && *(sw+1)=='ü' &&
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+											//     *(sw+2)=='r' && swlen == 3 ) continue;
 											//swh   = hash64AsciiLower ( sw , swlen );
-												Changed HashTableX::addTerm() to take the key by-value instead of by-pointer

											
										
										
											2016-10-27 11:53:53 +02:00
+											//s_commonWordTable.addTerm(swh,i+1,i+1,true);
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+										}
 										s_commonWordsInitialized = true;
 									}
-												Make isCommonWord() thread-safe.

											
										
										
											2016-08-19 14:21:07 +02:00
+									sl.unlock();
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
 									// . all 1 char letter words are stop words
 									// . good for initials and some contractions
 									//if ( len == 1 && is_alpha_a(*s) ) return true;
 									// get from table
-												Changed HashTableX::getScore() to take the key by-value instead of by-pointer

											
										
										
											2016-10-27 11:57:44 +02:00
+									return s_commonWordTable.getScore(h);
-												Initial file population.

											
										
										
											2013-08-02 13:12:24 -07:00
+								}