mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-02-02 03:38:43 -05:00
267 lines
4.4 KiB
C++
267 lines
4.4 KiB
C++
// Matt Wells, copyright Jul 2001
|
|
|
|
|
|
#include "HashTableX.h"
|
|
#include "Process.h"
|
|
#include "GbMutex.h"
|
|
#include "ScopedLock.h"
|
|
|
|
|
|
// One row of the abbreviation table below.
class Abbr {
public:
	// the abbreviation text, written without its trailing period
	const char *m_str;

	// non-zero if the abbreviation is only expected with a word following
	// it (e.g. "Mr", "vs"); zero otherwise
	char m_hasWordAfter;
};
|
|
|
|
// . i shrunk this list a lot
|
|
// . see backups for the hold list
|
|
static const class Abbr s_abbrs99[] = {
|
|
{"hghway",0},//highway
|
|
{"hway",0},//highway
|
|
{"hwy",0},//highway
|
|
{"ln",0}, // lane
|
|
{"mil",0}, // military
|
|
{"pkway",0}, // parkway
|
|
{"pkwy",0}, // parkway
|
|
{"lp",0}, // Loop
|
|
{"phd",0}, // Loop
|
|
{"demon",0}, // demonstration
|
|
{"alz",0}, // alzheimer's
|
|
|
|
{"lang",0}, // language
|
|
{"gr",0}, // grade(s) "xmas concert gr. 1-5"
|
|
{"vars",0}, // varsity
|
|
{"avg",0}, // average
|
|
{"amer",0}, // america
|
|
|
|
{"bet",0}, // between 18th and 19th for piratecatradio.com
|
|
{"nr",0}, // near 6th street = nr. 6th street
|
|
{"appt",0},
|
|
{"tel",1},
|
|
{"intl",0},
|
|
{"div",1}, // div. II
|
|
|
|
{"int",1}, // Intermediate Dance
|
|
{"beg",1}, // Beginner Dance
|
|
{"adv",1}, // Advanced Dance
|
|
|
|
{"feat",1}, // featuring.
|
|
{"tdlr",0}, // toddler
|
|
{"schl",0}, // pre-schl
|
|
|
|
// times
|
|
{"am",0}, // unm.edu url puts {"7 am. - 9 am.{" time ranges!
|
|
{"pm",0},
|
|
{"mon",0},
|
|
{"tue",0},
|
|
{"tues",0},
|
|
{"wed",0},
|
|
{"wednes",0},
|
|
{"thu",0},
|
|
{"thur",0},
|
|
{"thurs",0},
|
|
{"fri",0},
|
|
{"sat",0},
|
|
{"sun",0},
|
|
|
|
{"Ala",0},
|
|
{"Ariz",0},
|
|
{"Assn",0},
|
|
{"Assoc",0},
|
|
{"asst",0}, // assistant
|
|
{"Atty",0},
|
|
{"Attn",1},
|
|
{"Aug",0},
|
|
{"Ave",0},
|
|
{"Bldg",0},
|
|
{"Bros",0}, // brothers
|
|
{"Blvd",0},
|
|
{"Calif",0},
|
|
{"Capt",1},
|
|
{"Cf",0},
|
|
{"Ch",0},
|
|
{"Co",0},
|
|
{"Col",0},
|
|
{"Colo",0},
|
|
{"Conn",0},
|
|
{"Mfg",0},
|
|
{"Corp",0},
|
|
{"DR",0},
|
|
{"Dec",0},
|
|
{"Dept",0},
|
|
{"Dist",0},
|
|
{"Dr",0},
|
|
{"Drs",0},
|
|
{"Ed",0},
|
|
{"Eq",0},
|
|
{"ext",0}, // extension
|
|
{"FEB",0},
|
|
{"Feb",0},
|
|
{"Fig",0},
|
|
{"Figs",0},
|
|
{"Fla",0},
|
|
{"Ft",1}, // ft. worth texas or feet
|
|
{"Ga",0},
|
|
{"Gen",0},
|
|
{"Gov",0},
|
|
{"HON",0},
|
|
{"Ill",0},
|
|
{"Inc",0},
|
|
{"JR",0},
|
|
{"Jan",0},
|
|
{"Jr",0},
|
|
{"Kan",0},
|
|
//{"Ky",0},
|
|
{"La",0},
|
|
{"Lt",0},
|
|
{"Ltd",0},
|
|
{"MR",1},
|
|
{"MRS",1},
|
|
{"Mar",0},
|
|
{"Mass",0},
|
|
{"Md",0},
|
|
{"Messrs",1},
|
|
{"Mich",0},
|
|
{"Minn",0},
|
|
{"Miss",0},
|
|
{"Mmes",0},
|
|
//{"Mo",0}, no more 2-letter state abbreviations
|
|
{"Mr",1},
|
|
{"Mrs",1},
|
|
{"Ms",1},
|
|
{"Msgr",1},
|
|
{"Mt",1},
|
|
{"NO",0},
|
|
{"No",0},
|
|
{"Nov",0},
|
|
{"Oct",0},
|
|
{"Okla",0},
|
|
{"Op",0},
|
|
{"Ore",0},
|
|
//{"Pa",0},
|
|
{"Pp",0},
|
|
{"Prof",1},
|
|
{"Prop",0},
|
|
{"Rd",0},
|
|
{"Ref",0},
|
|
{"Rep",0},
|
|
{"Reps",0},
|
|
{"Rev",0},
|
|
{"Rte",0},
|
|
{"Sen",0},
|
|
{"Sept",0},
|
|
{"Sr",0},
|
|
{"St",0},
|
|
{"ste",0},
|
|
{"Stat",0},
|
|
{"Supt",0},
|
|
{"Tech",0},
|
|
{"Tex",0},
|
|
{"Va",0},
|
|
{"Vol",0},
|
|
{"Wash",0},
|
|
//{"al",0},
|
|
{"av",0},
|
|
{"ave",0},
|
|
{"ca",0},
|
|
{"cc",0},
|
|
{"chap",0},
|
|
{"cm",0},
|
|
{"cu",0},
|
|
{"dia",0},
|
|
{"dr",0},
|
|
{"eqn",0},
|
|
{"etc",0},
|
|
{"fig",1},
|
|
{"figs",1},
|
|
{"ft",0}, // fort or feet or featuring
|
|
//{"gm",0},
|
|
{"hr",0},
|
|
//{"in",0},
|
|
//{"kc",0},
|
|
{"lb",0},
|
|
{"lbs",0},
|
|
{"mg",0},
|
|
{"ml",0},
|
|
{"mm",0},
|
|
{"mv",0},
|
|
//{"nw",0},
|
|
{"oz",0},
|
|
{"pl",0},
|
|
{"pp",0},
|
|
{"sec",0},
|
|
{"sq",0},
|
|
{"st",0},
|
|
{"vs",1},
|
|
{"yr",0},
|
|
{"yrs",0}, // 3 yrs old
|
|
// middle initials
|
|
{"a",0},
|
|
{"b",0},
|
|
{"c",0},
|
|
{"d",0},
|
|
{"e",0},
|
|
{"f",0},
|
|
{"g",0},
|
|
{"h",0},
|
|
{"i",0},
|
|
{"j",0},
|
|
{"k",0},
|
|
{"l",0},
|
|
{"m",0},
|
|
{"n",0},
|
|
{"o",0},
|
|
{"p",0},
|
|
{"q",0},
|
|
{"r",0},
|
|
{"s",0},
|
|
{"t",0},
|
|
{"u",0},
|
|
{"v",1}, // versus
|
|
{"w",0},
|
|
{"x",0},
|
|
{"y",0},
|
|
{"z",0}
|
|
};
|
|
|
|
static HashTableX s_abbrTable;
|
|
static bool s_abbrInitialized = false;
|
|
static GbMutex s_mtx;
|
|
|
|
bool isAbbr ( int64_t h , bool *hasWordAfter ) {
|
|
ScopedLock sl(s_mtx);
|
|
if ( ! s_abbrInitialized ) {
|
|
// set up the hash table
|
|
int32_t n = ((int32_t)sizeof(s_abbrs99))/ ((int32_t)sizeof(Abbr));
|
|
if ( ! s_abbrTable.set ( 8,4,n*4, NULL,0,false,"abbrtbl")) {
|
|
log( LOG_ERROR, "build: Could not init abbrev table." );
|
|
return false;
|
|
}
|
|
// now add in all the stop words
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
const char *sw = s_abbrs99[i].m_str;
|
|
int64_t swh = hash64Lower_utf8 ( sw );
|
|
int32_t val = i + 1;
|
|
if ( ! s_abbrTable.addKey (&swh,&val) ) return false;
|
|
}
|
|
s_abbrInitialized = true;
|
|
// test it
|
|
int64_t h = hash64Lower_utf8("St");
|
|
if ( ! s_abbrTable.isInTable(&h) ) { g_process.shutdownAbort(true); }
|
|
int32_t sc = s_abbrTable.getScore(h);
|
|
if ( sc >= n ) { g_process.shutdownAbort(true); }
|
|
}
|
|
// get from table
|
|
int32_t sc = s_abbrTable.getScore(h);
|
|
if ( sc <= 0 ) return false;
|
|
if ( hasWordAfter ) *hasWordAfter = s_abbrs99[sc-1].m_hasWordAfter;
|
|
return true;
|
|
}
|
|
|
|
|
|
void resetAbbrTable ( ) {
|
|
s_abbrTable.reset();
|
|
}
|
|
|